升级

2025-11-05 13:14:31 +08:00
parent d35a634608
commit 1d9971373a
1 changed files with 428 additions and 108 deletions
--- a/application/common/ArticleParserService.php
+++ b/application/common/ArticleParserService.php
@@ -18,7 +18,7 @@ class ArticleParserService
    public function __construct($filePath = '')
    {
        if (!file_exists($filePath)) {
-            throw new Exception("文档不存在：{$filePath}");
+            return json_encode(['status' => 5, 'msg' => '"文档不存在：{$filePath}"']);
        }
        try {
            // 关键配置：关闭“仅读数据”，保留完整节结构
@@ -32,9 +32,8 @@ class ArticleParserService
            // $this->log("✅ 文档直接加载成功，节数量：{$sectionCount}");
            $this->phpWord = $reader->load($filePath);
            $this->sections = $this->phpWord->getSections();
        } catch (\Exception $e) {
-            return json(['status' => 'error', 'msg' => $e->getMessage()]);
+            return json_encode(['status' => 5, 'msg' => $e->getMessage()]);
        }
    }
@@ -260,6 +259,168 @@ class ArticleParserService
 //           var_dump($aAuthorData);exit;
 //        return ['author' => $aAuthorData,'report' => array_unique($aReport)];
 //    }
    // 提取作者
    /**
     * Split a raw author line into structured author records.
     *
     * The line is normalised (encoding, full-width digits, " and " separators,
     * repeated spaces) and then scanned character by character. Letters, spaces,
     * dots, hyphens and apostrophes build the name; ASCII digits, commas and the
     * markers # * † ‡ § build the superscript that follows a name. A comma
     * followed by a space, or a name character appearing after a superscript,
     * ends the current author. Any other character is ignored.
     *
     * Name characters are recognised with the Unicode letter/mark classes, so
     * names such as "Müller" or CJK names are kept intact. Records whose name is
     * empty after trimming are dropped.
     *
     * @param string $str Raw author line, e.g. "John Smith1,2*, Jane Doe2#".
     * @return array List of records, each with the keys:
     *               - name        (string) trimmed author name
     *               - superscript (string) raw superscript, e.g. "1,2*"
     *               - company_id  (int[])  distinct numbers found in the superscript
     *               - is_super    (int)    1 when the superscript contains "#", else 0
     *               - is_report   (int)    1 when the superscript contains "*" or "†", else 0
     */
    private function parseAuthorsWithoutRegex($str = '') {
        if (!is_string($str) || trim($str) === '') {
            return [];
        }

        // Only re-encode input that is not already valid UTF-8; GBK is the same
        // fallback the other methods of this class use.
        if (!mb_check_encoding($str, 'UTF-8')) {
            $str = mb_convert_encoding($str, 'UTF-8', 'GBK');
        }

        // Replace non-breaking spaces, known mojibake sequences and the full-width
        // comma with a plain space, and map full-width digits to ASCII digits.
        // 'ï¼Œ' (mis-decoded full-width comma) is listed before its prefix 'ï¼'
        // so that its trailing letter is not mistaken for part of a name.
        $str = str_replace(
            ["\xC2\xA0", 'ï¼Œ', 'ï¼', 'ï¿½', '，', '１', '２', '３', '４', '５', '６', '７', '８', '９', '０'],
            [' ', ' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'],
            $str
        );

        // " and " between two authors is just another separator.
        $str = str_replace([' and ', ' AND ', ' And '], ', ', $str);

        // Collapse runs of spaces first so the two merges below also catch
        // inputs such as "1  *" or "2,  3".
        $str = trim(preg_replace('/ {2,}/', ' ', $str));

        // Inside a superscript, "2, 3" means two affiliation numbers, not two authors.
        $str = preg_replace('/(?<=[0-9]), (?=[0-9])/', ',', $str);

        // Attach a marker to the number in front of it: "1 *" -> "1*".
        $merged = preg_replace('/(?<=[0-9]) (?=[#*†‡§])/u', '', $str);
        if ($merged !== null) {
            $str = $merged;
        }

        // Split into characters once instead of calling mb_substr() per index.
        $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            return [];
        }
        $total = count($chars);

        $namePunctuation = [' ', '.', '-', "'", '’'];
        $superscriptSymbols = ['#', '*', '†', ',', '‡', '§'];

        $rawAuthors = [];   // list of [name, superscript] pairs, not yet cleaned
        $name = '';
        $superscript = '';
        $inName = true;

        for ($i = 0; $i < $total; $i++) {
            $char = $chars[$i];

            // "comma + space" separates two authors.
            if ($char === ',' && $i + 1 < $total && $chars[$i + 1] === ' ') {
                $rawAuthors[] = [$name, $superscript];
                $name = '';
                $superscript = '';
                $inName = true;
                $i++; // skip the space that belongs to the separator
                continue;
            }

            $isNameChar = in_array($char, $namePunctuation, true)
                || preg_match('/^[\p{L}\p{M}]$/u', $char) === 1;

            if ($isNameChar) {
                if (!$inName) {
                    // A name character right after a superscript starts the next author.
                    $rawAuthors[] = [$name, $superscript];
                    $name = '';
                    $superscript = '';
                    $inName = true;
                }
                $name .= $char;
            } elseif (ctype_digit($char) || in_array($char, $superscriptSymbols, true)) {
                $inName = false;
                $superscript .= $char;
            }
            // Every other character is ignored.
        }
        $rawAuthors[] = [$name, $superscript];

        $authors = [];
        foreach ($rawAuthors as $pair) {
            $cleanName = trim($pair[0]);
            if ($cleanName === '') {
                // Stray separators or orphan superscripts do not form an author.
                continue;
            }
            $cleanSuperscript = trim(rtrim($pair[1], ','));

            // Affiliation numbers: every run of digits, de-duplicated, in first-seen order.
            preg_match_all('/[0-9]+/', $cleanSuperscript, $numberMatches);
            $institutionIds = array_values(array_unique(array_map('intval', $numberMatches[0])));

            $authors[] = [
                'name' => $cleanName,
                'superscript' => $cleanSuperscript,
                'company_id' => $institutionIds,
                // "#" sets is_super; "*" or "†" sets is_report.
                'is_super' => strpos($cleanSuperscript, '#') !== false ? 1 : 0,
                'is_report' => (strpos($cleanSuperscript, '*') !== false
                    || strpos($cleanSuperscript, '†') !== false) ? 1 : 0,
            ];
        }

        return $authors;
    }
    private function getAuthors($aParam = []) {
        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = $this->getNextParagraphAfterText($title);
@@ -291,95 +452,192 @@ class ArticleParserService
        $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
        $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
        $sAuthorContent = trim($sAuthorContent);
-
+        $aAuthor = $this->parseAuthorsWithoutRegex($sAuthorContent);
-        // 处理作者
+        if(empty($aAuthor)){
-        $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
+            return ['author' => [],'report' => []];
        $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
        $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
        $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式（防止被拆分）
        //标记上标内的逗号+空格（多编号）
        $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
        // 原有步骤2：正则匹配（扩展上标符号支持，保持原有逻辑）
        $pattern = '/
            ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格）
            \s*                         # 姓名与上标间空格
            (                           # 上标组（扩展符号支持）
                \d+                     # 起始数字
                (?:[†#*,]|<SEP>\d+)*    # 允许：†#*符号、逗号、<SEP>+数字（兼容1,†、1,*等）
            )
            \s*,?                       # 作者间逗号（可选）
            (?=\s|$)                    # 确保后面是空格或结尾
        /ux';
        preg_match_all($pattern, $tempStr, $matches);
        $authorList = [];
        if(!empty($matches[1])){
            foreach ($matches[1] as $i => $name) {
                $name = trim($name);
                $superscript = trim($matches[2][$i]);
                $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
                $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
                // 修复符号与数字间的空格（如原始"1 *"被误处理为"1*"的情况，保持原样）
                $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
                if (!empty($name)) {
                    $authorList[] = [
                        'name' => $name,
                        'superscript' => $superscript
                    ];
                }
            }
        }else {
            // 按“两个或多个连续空格”拆分（姓名之间的分隔）
            $authorList = array_filter(
                array_map('trim', 
                    preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
                )
            );
        }
        $aReport = $aAuthorData = [];
-
+        foreach ($aAuthor as $key => $value) {
-        // //处理作者
+            if(empty($value['name']) && empty($value['superscript'])){
-        $aAuthorData = [];
+                continue;
        $aReport = [];
        $namePattern = '/
            (?:[A-Za-z\s·\-\']+|                # 英文姓名（支持空格、连字符）
             [\x{4e00}-\x{9fa5}]+|             # 中文姓名
             [\x{1800}-\x{18AF}]+|             # 蒙古文姓名
             [A-Z]\.)                           # 单字母缩写（如 J.）
        /ux';
        foreach ($authorList as $authorStr){
            if (empty($authorStr)) continue;
            //获取下标
            $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript'];
            $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name'];
            $companyId = [];
            $isSuper = 0;
            $isReport = 0;
            if (!empty($superscript)) {
                // 提取机构编号（忽略上标中的逗号，如1,† → 提取1）
                preg_match_all('/\d+/', $superscript, $numMatch);
                // 识别特殊符号（#为超级作者，*†为通讯作者）
                $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
                $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
            }
-            if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
+            if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){
-                $nameStr = trim($match[1]);
+                $aReport[] = $value['name'];
            }
            $aAuthorData[] = [
                'name' => $nameStr,
                'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
                'is_super' => $isSuper,
                'is_report' => $isReport
            ];
            if ($isReport) {
                $aReport[] = $nameStr;
            }
            $aAuthorData[] = $value;
        }
        return ['author' => $aAuthorData,'report' => array_unique($aReport)];
    }
 //     private function getAuthors($aParam = []) {
 //         $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
 //         $sAuthorContent = $this->getNextParagraphAfterText($title);
 //         if (empty($sAuthorContent)) {
 //             return ['author' => [], 'report' => []];
 //         }
 //         //编码修复
 //         $possibleEncodings = [
 //             'Windows-1252', 'UTF-8', 'GBK', 'GB2312', 
 //             'Latin-1', 'ISO-8859-1', 'CP1252'
 //         ];
 //         $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
 //         $sAuthorContent = $encodedContent ?: $sAuthorContent;
 //         //清理不可见字符
 //         $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
 //         //修复特殊符号乱码
 //         $symbolMap = [
 //             'â€ ' => '†', 'â  ' => '†', 'â' => '†', '?†' => '†',
 //             'ï¼š' => ':', 'ï¼Œ' => ',', 'â€”' => '-',
 //             '啊' => '' // 针对性移除异常字符“啊”（若为固定乱码）
 //         ];
 //         $sAuthorContent = strtr($sAuthorContent, $symbolMap);
 //         //格式标准化
 //         $sAuthorContent = str_replace(['，', ';', '；', '、'], ',', $sAuthorContent); // 统一分隔符
 //         $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
 //         $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
 //         $sAuthorContent = trim($sAuthorContent);
 // var_dump($this->parseAuthorsWithoutRegex($sAuthorContent));exit;
 //         // 关键预处理：兼容"and"分隔符、清理乱码、统一空格
 // $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto');
 // $content = str_replace(["\xC2\xA0", 'ï¼', 'ï¿½', '，'], ' ', $content); // 清理乱码和全角符号
 // $content = preg_replace('/\band\b/i', ',', $content); // 将 "and" 转为逗号（统一分隔符）
 // $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并数字与符号间的空格（如"1 *"→"1*"）
 // $content = trim(preg_replace('/\s+/', ' ', $content)); // 合并连续空格
 // // 标记上标内的逗号（多编号处理）
 // $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
 // // 核心正则（保持原有结构，扩展符号支持）
 // $pattern = '/
 //     ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格、连字符）
 //     \s*                         # 姓名与上标间的空格（允许0或多个）
 //     (                           # 上标组（扩展兼容所有符号）
 //         \d+                     # 起始数字（至少1个数字）
 //         (?:[†#*,]|<SEP>\d+)*    # 允许：符号（†#*）、逗号、<SEP>+数字（多编号）
 //     )
 //     \s*,?                       # 作者间的逗号（可选，允许逗号前有空格）
 //     (?=\s|$)                    # 确保后面是空格或字符串结尾（避免跨作者匹配）
 // /ux';
 // preg_match_all($pattern, $tempStr, $matches);
 // // 解析结果并格式化
 // $authorList = [];
 // if (!empty($matches[1])) {
 //     foreach ($matches[1] as $i => $name) {
 //         $name = trim($name);
 //         $superscript = trim($matches[2][$i]);
 //         $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
 //         $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾多余逗号
 //         if (!empty($name)) {
 //             $authorList[] = [
 //                 'name' => $name,
 //                 'superscript' => $superscript
 //             ];
 //         }
 //     }
 // }
 // // 输出结果
 // echo "<pre>";
 // print_r($authorList);
 // echo "</pre>";
 // exit;
 //         // 处理作者
 //         $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
 //         $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
 //         $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
 //         $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式（防止被拆分）
 //         //标记上标内的逗号+空格（多编号）
 //         $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
 //         // 原有步骤2：正则匹配（扩展上标符号支持，保持原有逻辑）
 //         $pattern = '/
 //             ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格）
 //             \s*                         # 姓名与上标间空格
 //             (                           # 上标组（扩展符号支持）
 //                 \d+                     # 起始数字
 //                 (?:[†#*,]|<SEP>\d+)*    # 允许：†#*符号、逗号、<SEP>+数字（兼容1,†、1,*等）
 //             )
 //             \s*,?                       # 作者间逗号（可选）
 //             (?=\s|$)                    # 确保后面是空格或结尾
 //         /ux';
 //         preg_match_all($pattern, $tempStr, $matches);
 //         var_dump($matches);exit;
 //         $authorList = [];
 //         if(!empty($matches[1])){
 //             foreach ($matches[1] as $i => $name) {
 //                 $name = trim($name);
 //                 $superscript = trim($matches[2][$i]);
 //                 $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
 //                 $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
 //                 // 修复符号与数字间的空格（如原始"1 *"被误处理为"1*"的情况，保持原样）
 //                 $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
 //                 if (!empty($name)) {
 //                     $authorList[] = [
 //                         'name' => $name,
 //                         'superscript' => $superscript
 //                     ];
 //                 }
 //             }
 //         }else {
 //             // 按“两个或多个连续空格”拆分（姓名之间的分隔）
 //             $authorList = array_filter(
 //                 array_map('trim', 
 //                     preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
 //                 )
 //             );
 //         }
 //         // //处理作者
 //         $aAuthorData = [];
 //         $aReport = [];
 //         $namePattern = '/
 //             (?:[A-Za-z\s·\-\']+|                # 英文姓名（支持空格、连字符）
 //              [\x{4e00}-\x{9fa5}]+|             # 中文姓名
 //              [\x{1800}-\x{18AF}]+|             # 蒙古文姓名
 //              [A-Z]\.)                           # 单字母缩写（如 J.）
 //         /ux';
 //         foreach ($authorList as $authorStr){
 //             if (empty($authorStr)) continue;
 //             //获取下标
 //             $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript'];
 //             $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name'];
 //             $companyId = [];
 //             $isSuper = 0;
 //             $isReport = 0;
 //             if (!empty($superscript)) {
 //                 // 提取机构编号（忽略上标中的逗号，如1,† → 提取1）
 //                 preg_match_all('/\d+/', $superscript, $numMatch);
 //                 // 识别特殊符号（#为超级作者，*†为通讯作者）
 //                 $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
 //                 $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
 //             }
 //             if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
 //                 $nameStr = trim($match[1]);
 //             }
 //             $aAuthorData[] = [
 //                 'name' => $nameStr,
 //                 'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
 //                 'is_super' => $isSuper,
 //                 'is_report' => $isReport
 //             ];
 //             if ($isReport) {
 //                 $aReport[] = $nameStr;
 //             }
 //         }
 //         return ['author' => $aAuthorData,'report' => array_unique($aReport)];
 //     }
    // 获取机构
    private function getCompany($aParam = []){
@@ -388,32 +646,68 @@ class ArticleParserService
        //获取标题下的作者
        $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];
        //获取作者结构
-        $sCompany = $this->getContentAfterText($sAuthorContent);
+        $allLines = $this->getContentAfterText($sAuthorContent,1);
-        if(empty($sCompany)){
+        if(empty($allLines)){
            return [];
        }
-        //编码修复
+        // 2. 按序号分组，合并同一序号的多行内容
        $grouped = [];
        $currentNumber = null; // 当前序号
        foreach ($allLines as $line) {
            $line = trim($line);
            if (empty($line)) continue;
            // 判断是否是新条目的开头：行首为数字（后续可接任意字符或直接接内容）
            $number = '';
            $i = 0;
            $lineLen = strlen($line);
            // 提取行首的连续数字（作为序号）
            while ($i < $lineLen && ctype_digit($line[$i])) {
                $number .= $line[$i];
                $i++;
            }
            // 若行首有数字，则视为新条目
            if (!empty($number)) {
                $currentNumber = $number;
                // 提取序号后的内容（跳过数字后的符号/空格，保留核心内容）
                // 从数字后的位置开始，跳过可能的符号（./*）或空格
                while ($i < $lineLen && (in_array($line[$i], ['.', '*', ' ']))) {
                    $i++;
                }
                $content = trim(substr($line, $i)); // 序号后的内容
                $grouped[$currentNumber] = $content;
                continue;
            }
            // 非新条目，合并到当前序号的内容中
            if ($currentNumber !== null) {
                $grouped[$currentNumber] .= ' ' . $line;
            }
        }
        //清理结果
        $possibleEncodings = [
            'Windows-1252', 'UTF-8', 'GBK', 'GB2312', 
            'Latin-1', 'ISO-8859-1', 'CP1252'
        ];
        $encodedContent = @mb_convert_encoding($sCompany, 'UTF-8', implode(',', $possibleEncodings));
        $sCompany = $encodedContent ?: $sCompany;
        //按行拆分，保留数字开头的行
        $sCompany = str_replace(["\r\n", "\r"], "\n", $sCompany);
        $aCompanyLines = explode("\n", $sCompany);
        $aCompanyLines = array_filter(array_map('trim', $aCompanyLines), function($line) {
            return preg_match('/^\d+/', $line); // 仅保留数字开头的行
        });
        $aCompany = [];
-        foreach ($aCompanyLines as $line) {
+        foreach ($grouped as $number => $institution) {
-            if (preg_match('/^(\d+)\s*(.+)$/', $line, $match)) {
+            $encodedContent = @mb_convert_encoding($institution, 'UTF-8', implode(',', $possibleEncodings));
-                if(empty($match[1]) || empty($match[2])){
+            $sCompany = $encodedContent ?: $sCompany;
-                    continue;
+            $institution = preg_replace('/\s+/', ' ', $institution); // 合并多余空格
            $institution = rtrim($institution, '.');
            $institution = preg_replace('/^\d+\s+/', '', $institution);
            $institution = trim($institution); // 清理首尾空格
            preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);;
            $institution = trim($institutionmatches[1] ?? $institution);
            if(!mb_check_encoding($institution, 'UTF-8')){
                $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
            }
-                $aCompany[$match[1]] = ltrim(trim(ltrim($match[2]),'.'),' ');
+            if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) {
                $institution = trim($matches[1]); // trim() 去除内容前后多余空格
            }
            $aCompany[$number] = $institution;
        }
        return $aCompany;
    }
@@ -451,7 +745,6 @@ class ArticleParserService
        $corrText = str_replace(['：', '＠'], [':', '@'], $corrText);
        $corrText = preg_replace('/\s+/', ' ', $corrText); // 统一空格
        $corrText = str_replace('  ', ' ', $corrText); // 去除多余空格
        //按"*"分割通讯作者
        $corrBlocks = preg_split('/\s*\*\s*/', $corrText);
        $corrBlocks = array_filter(array_map('trim', $corrBlocks));
@@ -466,7 +759,6 @@ class ArticleParserService
            preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
            preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
            preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
            $aCorresponding[] = [
                'name' => $sName,
                'email' => isset($email[2]) ? trim($email[2]) : '',
@@ -474,6 +766,24 @@ class ArticleParserService
                'tel' => isset($tel[2]) ? trim($tel[2]) : ''
            ];
        }
        if(empty($aCorresponding)){
            $pattern = '/Corresponding Authors: (.*?)(?=$|;)/s';
            preg_match($pattern, $corrText, $match);
            if (!empty($match[1])) {
                $corrContent = $match[1];
                // 提取每个作者的名称和邮箱（优化正则，支持更多字符）
                $authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/';
                preg_match_all($authorPattern, $corrContent, $authors);
                if(!empty($authors[1])){
                    for ($i = 0; $i < count($authors[1]); $i++) {
                        $aCorresponding[] = [
                            'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]),
                            'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i])
                        ];
                    }
                }
            }
        }
        return $aCorresponding;
    }
@@ -518,7 +828,7 @@ class ArticleParserService
    }
    // 获取目标文本后的所有内容
-    private function getContentAfterText($targetText){
+    private function getContentAfterText($targetText,$return_type = 2){
        $found = false;
        $content = [];
        $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract'];
@@ -559,7 +869,14 @@ class ArticleParserService
            }
            if (count($content) >= $maxLines || (isset($shouldStop) && $shouldStop)) break;
        }
-        return implode("\n", $content);
+        if($return_type == 1){
            return $content;
        }
        $content = implode("\n", $content);
        if(!mb_check_encoding($content, 'UTF-8')){
            $content = mb_convert_encoding($content, 'UTF-8', 'GBK');
        }
        return $content;
    }
    // 统一提取元素文本
@@ -676,6 +993,9 @@ class ArticleParserService
                $sContent .= "\n";
            }
        }
        if(!mb_check_encoding($sContent, 'UTF-8')){
            $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
        }
        // 2. 基础文本清理（合并多余空格，保留有效换行）
        $textContent = preg_replace('/(\S)\s+/', '$1 ', $sContent);
        $textContent = trim($textContent);