升级

2025-11-05 13:14:31 +08:00
parent d35a634608
commit 1d9971373a
1 changed files with 428 additions and 108 deletions
--- a/application/common/ArticleParserService.php
+++ b/application/common/ArticleParserService.php
@@ -18,7 +18,7 @@ class ArticleParserService
    public function __construct($filePath = '')
    {
        if (!file_exists($filePath)) {
-            throw new Exception("文档不存在：{$filePath}");
+            return json_encode(['status' => 5, 'msg' => '"文档不存在：{$filePath}"']);
        }
        try {
            // 关键配置：关闭“仅读数据”，保留完整节结构
@@ -32,9 +32,8 @@ class ArticleParserService
            // $this->log("✅ 文档直接加载成功，节数量：{$sectionCount}");
            $this->phpWord = $reader->load($filePath);
            $this->sections = $this->phpWord->getSections();
-
        } catch (\Exception $e) {
-            return json(['status' => 'error', 'msg' => $e->getMessage()]);
+            return json_encode(['status' => 5, 'msg' => $e->getMessage()]);
        }
    }

@@ -260,6 +259,168 @@ class ArticleParserService
 //           var_dump($aAuthorData);exit;
 //        return ['author' => $aAuthorData,'report' => array_unique($aReport)];
 //    }
+
+    /**
+     * Parse an author byline into structured author records.
+     *
+     * Expected input looks like "Zhang San1,2*, Li Si2# and Wang Wu3†": every name may be
+     * followed by a superscript made of affiliation numbers, commas and marker symbols.
+     * The text is normalised first and then tokenised with a manual character scan.
+     *
+     * Changes relative to the previous implementation:
+     *  - letters are detected with a Unicode property test instead of the byte-based
+     *    ctype_alpha(), so "Müller" or "王伟" are no longer truncated or dropped;
+     *  - apostrophes and the middle dot are accepted inside names ("O'Brien");
+     *  - runs of spaces are collapsed before "2, 3" / "1 *" are merged, so the merge also
+     *    applies when the source contains repeated spaces;
+     *  - records whose name is empty or whitespace-only are never emitted;
+     *  - stray commas are stripped from both ends of the superscript;
+     *  - affiliation number 0 is kept (empty("0") used to discard it).
+     *
+     * @param string $str Raw author line.
+     * @return array List of authors; each item has the keys name (string), superscript (string),
+     *               company_id (int[]), is_super (1 when "#" is present, else 0) and
+     *               is_report (1 when "*" or "†" is present, else 0).
+     */
+    private function parseAuthorsWithoutRegex($str = '') {
+        if (empty($str)) {
+            return [];
+        }
+
+        // Normalise the encoding, then map the non-breaking space, two mojibake fragments and
+        // the full-width comma to a space, and full-width digits to ASCII digits.
+        $search = [
+            "\xC2\xA0", 'ï¼', 'ï¿½', '，',
+            '１', '２', '３', '４', '５', '６', '７', '８', '９', '０',
+        ];
+        $replace = [
+            ' ', ' ', ' ', ' ',
+            '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
+        ];
+        $str = mb_convert_encoding($str, 'UTF-8', 'auto');
+        $str = str_replace($search, $replace, $str);
+        // A standalone "and" between authors is treated like a comma separator.
+        $str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str));
+
+        // Split into UTF-8 characters once instead of calling mb_substr() per position.
+        $rawChars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
+        if ($rawChars === false) {
+            // Invalid UTF-8 survived the conversion: degrade to bytes rather than fail.
+            $rawChars = str_split($str);
+        }
+
+        // Collapse runs of spaces first so the merge rules below see at most one space.
+        $chars = [];
+        foreach ($rawChars as $ch) {
+            if ($ch === ' ' && end($chars) === ' ') {
+                continue;
+            }
+            $chars[] = $ch;
+        }
+        $total = count($chars);
+
+        // Drop the single space inside "2, 3" (affiliation list) and "1 *" (number + marker)
+        // so that the space is not mistaken for the start of the next author.
+        $markers = ['#', '*', '†', '‡', '§'];
+        $tokens = [];
+        for ($i = 0; $i < $total; $i++) {
+            $ch = $chars[$i];
+            if ($ch === ' ' && $i >= 1 && $i + 1 < $total) {
+                $prev = $chars[$i - 1];
+                $next = $chars[$i + 1];
+                $afterNumber = ctype_digit($prev) && in_array($next, $markers, true);
+                $insideList = $prev === ',' && $i >= 2
+                    && ctype_digit($chars[$i - 2]) && ctype_digit($next);
+                if ($afterNumber || $insideList) {
+                    continue;
+                }
+            }
+            $tokens[] = $ch;
+        }
+        $total = count($tokens);
+
+        // A name may contain any Unicode letter or combining mark plus these symbols.
+        $nameSymbols = [' ', '.', '-', "'", '’', '·'];
+        $isNameChar = function ($ch) use ($nameSymbols) {
+            if (in_array($ch, $nameSymbols, true)) {
+                return true;
+            }
+            return preg_match('/^[\p{L}\p{M}]$/u', $ch) === 1;
+        };
+
+        $authors = [];
+        $name = '';
+        $marks = '';
+        // Emit the pending author (only when it has a real name) and reset the accumulators.
+        $flush = function () use (&$authors, &$name, &$marks) {
+            $cleanName = trim($name);
+            if ($cleanName !== '') {
+                $authors[] = [
+                    'name' => $cleanName,
+                    'superscript' => trim($marks, ','),
+                ];
+            }
+            $name = '';
+            $marks = '';
+        };
+
+        // Scan: $inName tells whether characters currently extend the name or the superscript.
+        $inName = true;
+        for ($i = 0; $i < $total; $i++) {
+            $ch = $tokens[$i];
+
+            // "comma + space" separates two authors; consume both characters.
+            if ($ch === ',' && $i + 1 < $total && $tokens[$i + 1] === ' ') {
+                $flush();
+                $inName = true;
+                $i++;
+                continue;
+            }
+
+            if ($isNameChar($ch)) {
+                // A name character right after a superscript starts the next author.
+                if (!$inName) {
+                    $flush();
+                    $inName = true;
+                }
+                $name .= $ch;
+            } elseif (ctype_digit($ch) || $ch === ',' || in_array($ch, $markers, true)) {
+                // Digits, commas and marker symbols belong to the superscript.
+                $inName = false;
+                $marks .= $ch;
+            }
+            // Any other character is ignored.
+        }
+        $flush();
+
+        // Derive affiliation ids and the "#" / "*" / "†" flags from each superscript.
+        foreach ($authors as &$author) {
+            $superscript = $author['superscript'];
+
+            // Collect every run of ASCII digits as one id. A byte-wise scan is safe here
+            // because no byte of a multi-byte UTF-8 character is an ASCII digit.
+            $ids = [];
+            $length = strlen($superscript);
+            $pos = 0;
+            while ($pos < $length) {
+                $run = strspn($superscript, '0123456789', $pos);
+                if ($run > 0) {
+                    $ids[] = (int)substr($superscript, $pos, $run);
+                    $pos += $run;
+                } else {
+                    $pos++;
+                }
+            }
+            $author['company_id'] = array_values(array_unique($ids));
+
+            $hasStar = strpos($superscript, '*') !== false;
+            $hasDagger = strpos($superscript, '†') !== false;
+            $author['is_super'] = strpos($superscript, '#') !== false ? 1 : 0;
+            $author['is_report'] = ($hasStar || $hasDagger) ? 1 : 0;
+        }
+        unset($author); // drop the dangling reference left by the by-reference foreach
+        return $authors;
+    }
    private function getAuthors($aParam = []) {
        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = $this->getNextParagraphAfterText($title);
@@ -291,95 +452,192 @@ class ArticleParserService
        $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
        $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
        $sAuthorContent = trim($sAuthorContent);
-
-        // 处理作者
-        $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
-        $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
-        $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
-        $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式（防止被拆分）
-        //标记上标内的逗号+空格（多编号）
-        $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
-        // 原有步骤2：正则匹配（扩展上标符号支持，保持原有逻辑）
-        $pattern = '/
-            ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格）
-            \s*                         # 姓名与上标间空格
-            (                           # 上标组（扩展符号支持）
-                \d+                     # 起始数字
-                (?:[†#*,]|<SEP>\d+)*    # 允许：†#*符号、逗号、<SEP>+数字（兼容1,†、1,*等）
-            )
-            \s*,?                       # 作者间逗号（可选）
-            (?=\s|$)                    # 确保后面是空格或结尾
-        /ux';
-
-        preg_match_all($pattern, $tempStr, $matches);
-        $authorList = [];
-        if(!empty($matches[1])){
-            foreach ($matches[1] as $i => $name) {
-                $name = trim($name);
-                $superscript = trim($matches[2][$i]);
-                $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
-                $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
-                // 修复符号与数字间的空格（如原始"1 *"被误处理为"1*"的情况，保持原样）
-                $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
-                if (!empty($name)) {
-                    $authorList[] = [
-                        'name' => $name,
-                        'superscript' => $superscript
-                    ];
-                }
-            }
-        }else {
-            // 按“两个或多个连续空格”拆分（姓名之间的分隔）
-            $authorList = array_filter(
-                array_map('trim', 
-                    preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
-                )
-            );
+        $aAuthor = $this->parseAuthorsWithoutRegex($sAuthorContent);
+        if(empty($aAuthor)){
+            return ['author' => [],'report' => []];
        }
-        
+        $aReport = $aAuthorData = [];

-        // //处理作者
-        $aAuthorData = [];
-        $aReport = [];
-        $namePattern = '/
-            (?:[A-Za-z\s·\-\']+|                # 英文姓名（支持空格、连字符）
-             [\x{4e00}-\x{9fa5}]+|             # 中文姓名
-             [\x{1800}-\x{18AF}]+|             # 蒙古文姓名
-             [A-Z]\.)                           # 单字母缩写（如 J.）
-        /ux';
- 
-        foreach ($authorList as $authorStr){
-            if (empty($authorStr)) continue;
-            
-            //获取下标
-            $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript'];
-            $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name'];
-
-            $companyId = [];
-            $isSuper = 0;
-            $isReport = 0;
-            if (!empty($superscript)) {
-                // 提取机构编号（忽略上标中的逗号，如1,† → 提取1）
-                preg_match_all('/\d+/', $superscript, $numMatch);
-                // 识别特殊符号（#为超级作者，*†为通讯作者）
-                $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
-                $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
+        foreach ($aAuthor as $key => $value) {
+            if(empty($value['name']) && empty($value['superscript'])){
+                continue;
            }
-            if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
-                $nameStr = trim($match[1]);
-            }
-            $aAuthorData[] = [
-                'name' => $nameStr,
-                'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
-                'is_super' => $isSuper,
-                'is_report' => $isReport
-            ];
-            if ($isReport) {
-                $aReport[] = $nameStr;
+            if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){
+                $aReport[] = $value['name'];
            }
+            $aAuthorData[] = $value;
        }
        return ['author' => $aAuthorData,'report' => array_unique($aReport)];
    }
+//     private function getAuthors($aParam = []) {
+//         $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
+//         $sAuthorContent = $this->getNextParagraphAfterText($title);
+//         if (empty($sAuthorContent)) {
+//             return ['author' => [], 'report' => []];
+//         }
+
+//         //编码修复
+//         $possibleEncodings = [
+//             'Windows-1252', 'UTF-8', 'GBK', 'GB2312', 
+//             'Latin-1', 'ISO-8859-1', 'CP1252'
+//         ];
+//         $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
+//         $sAuthorContent = $encodedContent ?: $sAuthorContent;
+
+//         //清理不可见字符
+//         $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
+
+//         //修复特殊符号乱码
+//         $symbolMap = [
+//             'â€ ' => '†', 'â  ' => '†', 'â' => '†', '?†' => '†',
+//             'ï¼š' => ':', 'ï¼Œ' => ',', 'â€”' => '-',
+//             '啊' => '' // 针对性移除异常字符“啊”（若为固定乱码）
+//         ];
+//         $sAuthorContent = strtr($sAuthorContent, $symbolMap);
+
+//         //格式标准化
+//         $sAuthorContent = str_replace(['，', ';', '；', '、'], ',', $sAuthorContent); // 统一分隔符
+//         $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
+//         $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
+//         $sAuthorContent = trim($sAuthorContent);
+// var_dump($this->parseAuthorsWithoutRegex($sAuthorContent));exit;
+//         // 关键预处理：兼容"and"分隔符、清理乱码、统一空格
+// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto');
+// $content = str_replace(["\xC2\xA0", 'ï¼', 'ï¿½', '，'], ' ', $content); // 清理乱码和全角符号
+// $content = preg_replace('/\band\b/i', ',', $content); // 将 "and" 转为逗号（统一分隔符）
+// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并数字与符号间的空格（如"1 *"→"1*"）
+// $content = trim(preg_replace('/\s+/', ' ', $content)); // 合并连续空格
+
+// // 标记上标内的逗号（多编号处理）
+// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
+
+// // 核心正则（保持原有结构，扩展符号支持）
+// $pattern = '/
+//     ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格、连字符）
+//     \s*                         # 姓名与上标间的空格（允许0或多个）
+//     (                           # 上标组（扩展兼容所有符号）
+//         \d+                     # 起始数字（至少1个数字）
+//         (?:[†#*,]|<SEP>\d+)*    # 允许：符号（†#*）、逗号、<SEP>+数字（多编号）
+//     )
+//     \s*,?                       # 作者间的逗号（可选，允许逗号前有空格）
+//     (?=\s|$)                    # 确保后面是空格或字符串结尾（避免跨作者匹配）
+// /ux';
+
+// preg_match_all($pattern, $tempStr, $matches);
+
+// // 解析结果并格式化
+// $authorList = [];
+// if (!empty($matches[1])) {
+//     foreach ($matches[1] as $i => $name) {
+//         $name = trim($name);
+//         $superscript = trim($matches[2][$i]);
+//         $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
+//         $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾多余逗号
+//         if (!empty($name)) {
+//             $authorList[] = [
+//                 'name' => $name,
+//                 'superscript' => $superscript
+//             ];
+//         }
+//     }
+// }
+
+// // 输出结果
+// echo "<pre>";
+// print_r($authorList);
+// echo "</pre>";
+// exit;
+
+//         // 处理作者
+//         $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
+//         $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
+//         $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
+//         $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式（防止被拆分）
+
+//         //标记上标内的逗号+空格（多编号）
+//         $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
+//         // 原有步骤2：正则匹配（扩展上标符号支持，保持原有逻辑）
+//         $pattern = '/
+//             ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格）
+//             \s*                         # 姓名与上标间空格
+//             (                           # 上标组（扩展符号支持）
+//                 \d+                     # 起始数字
+//                 (?:[†#*,]|<SEP>\d+)*    # 允许：†#*符号、逗号、<SEP>+数字（兼容1,†、1,*等）
+//             )
+//             \s*,?                       # 作者间逗号（可选）
+//             (?=\s|$)                    # 确保后面是空格或结尾
+//         /ux';
+     
+//         preg_match_all($pattern, $tempStr, $matches);
+//         var_dump($matches);exit;
+//         $authorList = [];
+//         if(!empty($matches[1])){
+//             foreach ($matches[1] as $i => $name) {
+//                 $name = trim($name);
+//                 $superscript = trim($matches[2][$i]);
+//                 $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
+//                 $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
+//                 // 修复符号与数字间的空格（如原始"1 *"被误处理为"1*"的情况，保持原样）
+//                 $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
+//                 if (!empty($name)) {
+//                     $authorList[] = [
+//                         'name' => $name,
+//                         'superscript' => $superscript
+//                     ];
+//                 }
+//             }
+//         }else {
+//             // 按“两个或多个连续空格”拆分（姓名之间的分隔）
+//             $authorList = array_filter(
+//                 array_map('trim', 
+//                     preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
+//                 )
+//             );
+//         }
+        
+
+//         // //处理作者
+//         $aAuthorData = [];
+//         $aReport = [];
+//         $namePattern = '/
+//             (?:[A-Za-z\s·\-\']+|                # 英文姓名（支持空格、连字符）
+//              [\x{4e00}-\x{9fa5}]+|             # 中文姓名
+//              [\x{1800}-\x{18AF}]+|             # 蒙古文姓名
+//              [A-Z]\.)                           # 单字母缩写（如 J.）
+//         /ux';
+ 
+//         foreach ($authorList as $authorStr){
+//             if (empty($authorStr)) continue;
+            
+//             //获取下标
+//             $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript'];
+//             $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name'];
+
+//             $companyId = [];
+//             $isSuper = 0;
+//             $isReport = 0;
+//             if (!empty($superscript)) {
+//                 // 提取机构编号（忽略上标中的逗号，如1,† → 提取1）
+//                 preg_match_all('/\d+/', $superscript, $numMatch);
+//                 // 识别特殊符号（#为超级作者，*†为通讯作者）
+//                 $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
+//                 $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
+//             }
+//             if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
+//                 $nameStr = trim($match[1]);
+//             }
+//             $aAuthorData[] = [
+//                 'name' => $nameStr,
+//                 'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
+//                 'is_super' => $isSuper,
+//                 'is_report' => $isReport
+//             ];
+//             if ($isReport) {
+//                 $aReport[] = $nameStr;
+//             }
+//         }
+//         return ['author' => $aAuthorData,'report' => array_unique($aReport)];
+//     }

    // 获取机构
    private function getCompany($aParam = []){
@@ -388,32 +646,68 @@ class ArticleParserService
        //获取标题下的作者
        $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];
        //获取作者结构
-        $sCompany = $this->getContentAfterText($sAuthorContent);
-        if(empty($sCompany)){
+        $allLines = $this->getContentAfterText($sAuthorContent,1);
+        if(empty($allLines)){
            return [];
        }
-        //编码修复
+        // 2. 按序号分组，合并同一序号的多行内容
+        $grouped = [];
+        $currentNumber = null; // 当前序号
+        foreach ($allLines as $line) {
+            $line = trim($line);
+            if (empty($line)) continue;
+
+            // 判断是否是新条目的开头：行首为数字（后续可接任意字符或直接接内容）
+            $number = '';
+            $i = 0;
+            $lineLen = strlen($line);
+            // 提取行首的连续数字（作为序号）
+            while ($i < $lineLen && ctype_digit($line[$i])) {
+                $number .= $line[$i];
+                $i++;
+            }
+
+            // 若行首有数字，则视为新条目
+            if (!empty($number)) {
+                $currentNumber = $number;
+                // 提取序号后的内容（跳过数字后的符号/空格，保留核心内容）
+                // 从数字后的位置开始，跳过可能的符号（./*）或空格
+                while ($i < $lineLen && (in_array($line[$i], ['.', '*', ' ']))) {
+                    $i++;
+                }
+                $content = trim(substr($line, $i)); // 序号后的内容
+                $grouped[$currentNumber] = $content;
+                continue;
+            }
+
+            // 非新条目，合并到当前序号的内容中
+            if ($currentNumber !== null) {
+                $grouped[$currentNumber] .= ' ' . $line;
+            }
+        }
+
+        //清理结果
        $possibleEncodings = [
            'Windows-1252', 'UTF-8', 'GBK', 'GB2312', 
            'Latin-1', 'ISO-8859-1', 'CP1252'
        ];
-        $encodedContent = @mb_convert_encoding($sCompany, 'UTF-8', implode(',', $possibleEncodings));
-        $sCompany = $encodedContent ?: $sCompany;
-        //按行拆分，保留数字开头的行
-        $sCompany = str_replace(["\r\n", "\r"], "\n", $sCompany);
-        $aCompanyLines = explode("\n", $sCompany);
-        $aCompanyLines = array_filter(array_map('trim', $aCompanyLines), function($line) {
-            return preg_match('/^\d+/', $line); // 仅保留数字开头的行
-        });
-
        $aCompany = [];
-        foreach ($aCompanyLines as $line) {
-            if (preg_match('/^(\d+)\s*(.+)$/', $line, $match)) {
-                if(empty($match[1]) || empty($match[2])){
-                    continue;
-                }
-                $aCompany[$match[1]] = ltrim(trim(ltrim($match[2]),'.'),' ');
+        foreach ($grouped as $number => $institution) {
+            $encodedContent = @mb_convert_encoding($institution, 'UTF-8', implode(',', $possibleEncodings));
+            $sCompany = $encodedContent ?: $sCompany;
+            $institution = preg_replace('/\s+/', ' ', $institution); // 合并多余空格
+            $institution = rtrim($institution, '.');
+            $institution = preg_replace('/^\d+\s+/', '', $institution);
+            $institution = trim($institution); // 清理首尾空格
+            preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);;
+            $institution = trim($institutionmatches[1] ?? $institution);
+            if(!mb_check_encoding($institution, 'UTF-8')){
+                $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
            }
+            if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) {
+                $institution = trim($matches[1]); // trim() 去除内容前后多余空格
+            }
+            $aCompany[$number] = $institution;
        }
        return $aCompany;
    }
@@ -451,11 +745,10 @@ class ArticleParserService
        $corrText = str_replace(['：', '＠'], [':', '@'], $corrText);
        $corrText = preg_replace('/\s+/', ' ', $corrText); // 统一空格
        $corrText = str_replace('  ', ' ', $corrText); // 去除多余空格
-
        //按"*"分割通讯作者
        $corrBlocks = preg_split('/\s*\*\s*/', $corrText);
        $corrBlocks = array_filter(array_map('trim', $corrBlocks));
-
+       
        $aCorresponding = [];
        foreach ($corrBlocks as $block) {
            //匹配通讯作者姓名
@@ -466,7 +759,6 @@ class ArticleParserService
            preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
            preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
            preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
-
            $aCorresponding[] = [
                'name' => $sName,
                'email' => isset($email[2]) ? trim($email[2]) : '',
@@ -474,6 +766,24 @@ class ArticleParserService
                'tel' => isset($tel[2]) ? trim($tel[2]) : ''
            ];
        }
+        if(empty($aCorresponding)){
+            $pattern = '/Corresponding Authors: (.*?)(?=$|;)/s';
+            preg_match($pattern, $corrText, $match);
+            if (!empty($match[1])) {
+                $corrContent = $match[1];
+                // 提取每个作者的名称和邮箱（优化正则，支持更多字符）
+                $authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/';
+                preg_match_all($authorPattern, $corrContent, $authors);
+                if(!empty($authors[1])){
+                    for ($i = 0; $i < count($authors[1]); $i++) {
+                        $aCorresponding[] = [
+                            'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]),
+                            'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i])
+                        ];
+                    }
+                }
+            }
+        }
        return $aCorresponding;
    }

@@ -518,7 +828,7 @@ class ArticleParserService
    }

    // 获取目标文本后的所有内容
-    private function getContentAfterText($targetText){
+    private function getContentAfterText($targetText,$return_type = 2){
        $found = false;
        $content = [];
        $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract'];
@@ -559,7 +869,14 @@ class ArticleParserService
            }
            if (count($content) >= $maxLines || (isset($shouldStop) && $shouldStop)) break;
        }
-        return implode("\n", $content);
+        if($return_type == 1){
+            return $content;
+        }
+        $content = implode("\n", $content);
+        if(!mb_check_encoding($content, 'UTF-8')){
+            $content = mb_convert_encoding($content, 'UTF-8', 'GBK');
+        }
+        return $content;
    }

    // 统一提取元素文本
@@ -676,6 +993,9 @@ class ArticleParserService
                $sContent .= "\n";
            }
        }
+        if(!mb_check_encoding($sContent, 'UTF-8')){
+            $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
+        }
        // 2. 基础文本清理（合并多余空格，保留有效换行）
        $textContent = preg_replace('/(\S)\s+/', '$1 ', $sContent);
        $textContent = trim($textContent);