测试问题修改

2025-12-01 09:16:50 +08:00
parent ab0ee4d6b4
commit 5daf18608b
1 changed files with 323 additions and 440 deletions
--- a/application/common/ArticleParserService.php
+++ b/application/common/ArticleParserService.php
@@ -225,6 +225,10 @@ class ArticleParserService
        $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam);
        //keywords 和 摘要
        $aContent = $oDealFile->extractFromWord();
+        if(!mb_check_encoding($sTitle, 'UTF-8')){
+            $sTitle = mb_convert_encoding($sTitle, 'UTF-8', 'GBK');
+        }
+        $aParam['title'] = $oDealFile->fullDecode($aParam['title']);
        $aParam += empty($aContent['data']) ? [] : $aContent['data'];
        return json_encode(['status' => 1,'msg' => 'success','data' => $aParam]);
    }
@@ -240,190 +244,25 @@ class ArticleParserService
            foreach ($section->getElements() as $element) {
                $text = $this->getTextFromElement($element);
                $length = mb_strlen(trim($text));
-                if ($length > $maxLength && $length > 10) { // 标题通常较长
+                if ($length > $maxLength && $length > 3) { // 标题通常较长
                    $title = trim($text);
                    $maxLength = $length;
                    break 2; // 取第一个最长段落作为标题
                }
            }
        }
-        if(!empty($title) && !mb_check_encoding($title, 'UTF-8')){
-            $title = mb_convert_encoding($title, 'UTF-8', 'GBK');
-        }
        return $title;
    }
-    // 提取作者
- //    private function getAuthors($aParam = []) {
- //        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
- //        $sAuthorContent = $this->getNextParagraphAfterText($title);
- //        if (empty($sAuthorContent)) {
- //            return ['author' => [], 'report' => []];
- //        }
-
- //        //编码修复
- //        $possibleEncodings = [
- //            'Windows-1252', 'UTF-8', 'GBK', 'GB2312', 
- //            'Latin-1', 'ISO-8859-1', 'CP1252'
- //        ];
- //        $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
- //        $sAuthorContent = $encodedContent ?: $sAuthorContent;
-
- //        //清理不可见字符
- //        $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
-
- //        //修复特殊符号乱码
- //        $symbolMap = [
- //            'â€ ' => '†', 'â  ' => '†', 'â' => '†', '?†' => '†',
- //            'ï¼š' => ':', 'ï¼Œ' => ',', 'â€”' => '-',
- //            '啊' => '' // 针对性移除异常字符“啊”（若为固定乱码）
- //        ];
- //        $sAuthorContent = strtr($sAuthorContent, $symbolMap);
-
- //        //格式标准化
- //        $sAuthorContent = str_replace(['，', ';', '；', '、'], ',', $sAuthorContent); // 统一分隔符
- //        $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
- //        $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
- //        $sAuthorContent = trim($sAuthorContent);
-
- //        // 处理作者
- //        $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
- //        $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
- //        $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
- //        $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式（防止被拆分）
- //        //标记上标内的逗号+空格（多编号）
- //        $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
- //        // 原有步骤2：正则匹配（扩展上标符号支持，保持原有逻辑）
- //        $pattern = '/
- //            ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格）
- //            \s*                         # 姓名与上标间空格
- //            (                           # 上标组（扩展符号支持）
- //                \d+                     # 起始数字
- //                (?:[†#*,]|<SEP>\d+)*    # 允许：†#*符号、逗号、<SEP>+数字（兼容1,†、1,*等）
- //            )
- //            \s*,?                       # 作者间逗号（可选）
- //            (?=\s|$)                    # 确保后面是空格或结尾
- //        /ux';
-
- //        preg_match_all($pattern, $tempStr, $matches);
- //        $authorList = [];
- //        if(!empty($matches[1])){
- //            foreach ($matches[1] as $i => $name) {
- //                $name = trim($name);
- //                $superscript = trim($matches[2][$i]);
- //                $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
- //                $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
- //                // 修复符号与数字间的空格（如原始"1 *"被误处理为"1*"的情况，保持原样）
- //                $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
- //                if (!empty($name)) {
- //                    $authorList[] = [
- //                        'name' => $name,
- //                        'superscript' => $superscript
- //                    ];
- //                }
- //            }
- //        }else {
- //            // 按“两个或多个连续空格”拆分（姓名之间的分隔）
- //            $authorList = array_filter(
- //                array_map('trim', 
- //                    preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
- //                )
- //            );
- //        }
-        
-
- //        // //处理作者
- //        // $authorList = [];
- //        // // 新正则：匹配“姓名+上标”整体，允许上标含逗号（如1,†）
- //        // // 逻辑：姓名以字母/中文开头，上标以数字开头、以符号/数字结尾
- //        // // if (preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*([\d,†#*]+)/u', $sAuthorContent, $matches)) {
- //        // if(preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*(\d[\d,†#\s*]*)/u', $sAuthorContent, $matches)){
- //        //     for ($i = 0; $i < count($matches[1]); $i++) {
- //        //         $authorList[] = trim($matches[1][$i] . $matches[2][$i]);
- //        //     }
- //        // } else {
- //        //     // 按“两个或多个连续空格”拆分（姓名之间的分隔）
- //        //     $authorList = array_filter(
- //        //         array_map('trim', 
- //        //             preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
- //        //         )
- //        //     );
- //        // }
- //        $aAuthorData = [];
- //        $aReport = [];
- //        $namePattern = '/
- //            (?:[A-Za-z\s·\-\']+|                # 英文姓名（支持空格、连字符）
- //             [\x{4e00}-\x{9fa5}]+|             # 中文姓名
- //             [\x{1800}-\x{18AF}]+|             # 蒙古文姓名
- //             [A-Z]\.)                           # 单字母缩写（如 J.）
- //        /ux';
- // var_dump($authorList);exit;
- //        foreach ($authorList as $authorStr) {
- //            if (empty($authorStr)) continue;
- //            var_dump($authorList);exit;
- //            //分离姓名与上标（支持上标含逗号，如1,†）
- //            $superscript = '';
- //            // 新正则：匹配以数字开头、含逗号/符号的完整上标（如1,†、2*#）
- //            $authorStr = trim(trim($authorStr,','),' ');
- //            // if (preg_match('/([\d,†#*]+)$/u', $authorStr, $supMatch)) {
- //            // if(preg_match('/\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)){
- //            // if (preg_match('/.*?\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)) {
- //            // if (preg_match('/.*?\s*([\d,\x{2020}#* ]+?)\s*$/u', $authorStr, $supMatch)) {
- //            // if (preg_match('/^(.+?)\D*?(\d[\d,#*†,\s]*)$/u', $authorStr, $supMatch)) {
- //            //     $superscript = $supMatch[1];
- //            //     // 移除上标，保留纯姓名（避免残留符号）
- //            //     $nameStr = trim(preg_replace('/' . preg_quote($superscript, '/') . '$/', '', $authorStr));
- //            // } else {
- //            //     $nameStr = $authorStr;
- //            // }
- //            $pattern = '/^(.+?)\s*(\d[\d,#*†\s]*?)\s*$/u';
- //            if (preg_match($pattern, $authorStr, $supMatch)) {
- //                $nameStr = empty($supMatch[1]) ? '' : trim($supMatch[1]); // 姓名部分："Liguo Zhang"
- //                $superscript = empty($supMatch[2]) ? $nameStr : $nameStr.trim($supMatch[2]); // 上标部分："1 
- //                // echo "姓名: $nameStr, 上标: $superscript\n";
- //            } else {
- //                $nameStr = $authorStr;
- //            }
- //            //验证姓名合法性（过滤无效内容）
- //            if (!preg_match($namePattern, $nameStr)) {
- //                continue;
- //            }
- //            //解析上标信息（正确识别1,†中的机构编号和符号）
- //            $companyId = '';
- //            $isSuper = 0;
- //            $isReport = 0;
- //            if (!empty($superscript)) {
- //                // 提取机构编号（忽略上标中的逗号，如1,† → 提取1）
- //                if (preg_match('/(\d+)/', $superscript, $numMatch)) {
- //                    $companyId = $numMatch[1];
- //                }
- //                // 识别特殊符号（#为超级作者，*†为通讯作者）
- //                $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
- //                $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
- //            }
- //            if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
- //                $nameStr = trim($match[1]);
- //            }
- //            $aAuthorData[] = [
- //                'name' => $nameStr,
- //                'company_id' => $companyId,
- //                'is_super' => $isSuper,
- //                'is_report' => $isReport
- //            ];
- //            if ($isReport) {
- //                $aReport[] = $nameStr;
- //            }
- //        }
- //           var_dump($aAuthorData);exit;
- //        return ['author' => $aAuthorData,'report' => array_unique($aReport)];
- //    }

    // 提取作者
    private function parseAuthorsWithoutRegex($str = '') {
        if (empty($str)) {
            return [];
        }
-        // 清理乱码和特殊字符（扩展全角数字处理）
-        $str = mb_convert_encoding($str, 'UTF-8', 'auto');
+        if(!mb_check_encoding($str, 'UTF-8')){
+            $str = mb_convert_encoding($str, 'UTF-8', 'GBK');
+        }
+        $str = $this->fullDecode($str);
        $str = str_replace(["\xC2\xA0", 'ï¼', 'ï¿½', '，', '１', '２', '３', '４', '５', '６', '７', '８', '９', '０'], 
                          [' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], $str);
        $str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str));
@@ -584,15 +423,10 @@ class ArticleParserService
        if (empty($sAuthorContent)) {
            return ['author' => [], 'report' => []];
        }
-
-        //编码修复
-        $possibleEncodings = [
-            'Windows-1252', 'UTF-8', 'GBK', 'GB2312', 
-            'Latin-1', 'ISO-8859-1', 'CP1252'
-        ];
-        $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
-        $sAuthorContent = $encodedContent ?: $sAuthorContent;
-
+        if(!mb_check_encoding($sAuthorContent, 'UTF-8')){
+            $sAuthorContent = mb_convert_encoding($sAuthorContent, 'UTF-8', 'GBK');
+        }
+        $sAuthorContent = $this->fullDecode($sAuthorContent);
        //清理不可见字符
        $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);

@@ -614,14 +448,10 @@ class ArticleParserService
            return ['author' => [],'report' => []];
        }
        $aReport = $aAuthorData = [];
-
        foreach ($aAuthor as $key => $value) {
            if(empty($value['name']) && empty($value['superscript'])){
                continue;
            }
-            if(!mb_check_encoding($value['name'], 'UTF-8')){
-                $value['name'] = mb_convert_encoding($value['name'], 'UTF-8', 'GBK');
-            }
            if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){
                $aReport[] = $value['name'];
            }
@@ -629,175 +459,6 @@ class ArticleParserService
        }
        return ['author' => $aAuthorData,'report' => array_unique($aReport)];
    }
-//     private function getAuthors($aParam = []) {
-//         $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
-//         $sAuthorContent = $this->getNextParagraphAfterText($title);
-//         if (empty($sAuthorContent)) {
-//             return ['author' => [], 'report' => []];
-//         }
-
-//         //编码修复
-//         $possibleEncodings = [
-//             'Windows-1252', 'UTF-8', 'GBK', 'GB2312', 
-//             'Latin-1', 'ISO-8859-1', 'CP1252'
-//         ];
-//         $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
-//         $sAuthorContent = $encodedContent ?: $sAuthorContent;
-
-//         //清理不可见字符
-//         $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
-
-//         //修复特殊符号乱码
-//         $symbolMap = [
-//             'â€ ' => '†', 'â  ' => '†', 'â' => '†', '?†' => '†',
-//             'ï¼š' => ':', 'ï¼Œ' => ',', 'â€”' => '-',
-//             '啊' => '' // 针对性移除异常字符“啊”（若为固定乱码）
-//         ];
-//         $sAuthorContent = strtr($sAuthorContent, $symbolMap);
-
-//         //格式标准化
-//         $sAuthorContent = str_replace(['，', ';', '；', '、'], ',', $sAuthorContent); // 统一分隔符
-//         $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
-//         $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
-//         $sAuthorContent = trim($sAuthorContent);
-// var_dump($this->parseAuthorsWithoutRegex($sAuthorContent));exit;
-//         // 关键预处理：兼容"and"分隔符、清理乱码、统一空格
-// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto');
-// $content = str_replace(["\xC2\xA0", 'ï¼', 'ï¿½', '，'], ' ', $content); // 清理乱码和全角符号
-// $content = preg_replace('/\band\b/i', ',', $content); // 将 "and" 转为逗号（统一分隔符）
-// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并数字与符号间的空格（如"1 *"→"1*"）
-// $content = trim(preg_replace('/\s+/', ' ', $content)); // 合并连续空格
-
-// // 标记上标内的逗号（多编号处理）
-// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
-
-// // 核心正则（保持原有结构，扩展符号支持）
-// $pattern = '/
-//     ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格、连字符）
-//     \s*                         # 姓名与上标间的空格（允许0或多个）
-//     (                           # 上标组（扩展兼容所有符号）
-//         \d+                     # 起始数字（至少1个数字）
-//         (?:[†#*,]|<SEP>\d+)*    # 允许：符号（†#*）、逗号、<SEP>+数字（多编号）
-//     )
-//     \s*,?                       # 作者间的逗号（可选，允许逗号前有空格）
-//     (?=\s|$)                    # 确保后面是空格或字符串结尾（避免跨作者匹配）
-// /ux';
-
-// preg_match_all($pattern, $tempStr, $matches);
-
-// // 解析结果并格式化
-// $authorList = [];
-// if (!empty($matches[1])) {
-//     foreach ($matches[1] as $i => $name) {
-//         $name = trim($name);
-//         $superscript = trim($matches[2][$i]);
-//         $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
-//         $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾多余逗号
-//         if (!empty($name)) {
-//             $authorList[] = [
-//                 'name' => $name,
-//                 'superscript' => $superscript
-//             ];
-//         }
-//     }
-// }
-
-// // 输出结果
-// echo "<pre>";
-// print_r($authorList);
-// echo "</pre>";
-// exit;
-
-//         // 处理作者
-//         $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
-//         $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
-//         $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
-//         $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式（防止被拆分）
-
-//         //标记上标内的逗号+空格（多编号）
-//         $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
-//         // 原有步骤2：正则匹配（扩展上标符号支持，保持原有逻辑）
-//         $pattern = '/
-//             ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格）
-//             \s*                         # 姓名与上标间空格
-//             (                           # 上标组（扩展符号支持）
-//                 \d+                     # 起始数字
-//                 (?:[†#*,]|<SEP>\d+)*    # 允许：†#*符号、逗号、<SEP>+数字（兼容1,†、1,*等）
-//             )
-//             \s*,?                       # 作者间逗号（可选）
-//             (?=\s|$)                    # 确保后面是空格或结尾
-//         /ux';
-     
-//         preg_match_all($pattern, $tempStr, $matches);
-//         var_dump($matches);exit;
-//         $authorList = [];
-//         if(!empty($matches[1])){
-//             foreach ($matches[1] as $i => $name) {
-//                 $name = trim($name);
-//                 $superscript = trim($matches[2][$i]);
-//                 $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
-//                 $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
-//                 // 修复符号与数字间的空格（如原始"1 *"被误处理为"1*"的情况，保持原样）
-//                 $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
-//                 if (!empty($name)) {
-//                     $authorList[] = [
-//                         'name' => $name,
-//                         'superscript' => $superscript
-//                     ];
-//                 }
-//             }
-//         }else {
-//             // 按“两个或多个连续空格”拆分（姓名之间的分隔）
-//             $authorList = array_filter(
-//                 array_map('trim', 
-//                     preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
-//                 )
-//             );
-//         }
-        
-
-//         // //处理作者
-//         $aAuthorData = [];
-//         $aReport = [];
-//         $namePattern = '/
-//             (?:[A-Za-z\s·\-\']+|                # 英文姓名（支持空格、连字符）
-//              [\x{4e00}-\x{9fa5}]+|             # 中文姓名
-//              [\x{1800}-\x{18AF}]+|             # 蒙古文姓名
-//              [A-Z]\.)                           # 单字母缩写（如 J.）
-//         /ux';
- 
-//         foreach ($authorList as $authorStr){
-//             if (empty($authorStr)) continue;
-            
-//             //获取下标
-//             $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript'];
-//             $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name'];
-
-//             $companyId = [];
-//             $isSuper = 0;
-//             $isReport = 0;
-//             if (!empty($superscript)) {
-//                 // 提取机构编号（忽略上标中的逗号，如1,† → 提取1）
-//                 preg_match_all('/\d+/', $superscript, $numMatch);
-//                 // 识别特殊符号（#为超级作者，*†为通讯作者）
-//                 $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
-//                 $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
-//             }
-//             if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
-//                 $nameStr = trim($match[1]);
-//             }
-//             $aAuthorData[] = [
-//                 'name' => $nameStr,
-//                 'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
-//                 'is_super' => $isSuper,
-//                 'is_report' => $isReport
-//             ];
-//             if ($isReport) {
-//                 $aReport[] = $nameStr;
-//             }
-//         }
-//         return ['author' => $aAuthorData,'report' => array_unique($aReport)];
-//     }

    // 获取机构
    private function getCompany($aParam = []){
@@ -815,16 +476,39 @@ class ArticleParserService
        $currentNumber = null; // 当前序号
        foreach ($allLines as $line) {
            $line = trim($line);
-            if (empty($line)) continue;
-
-            // 判断是否是新条目的开头：行首为数字（后续可接任意字符或直接接内容）
+            if (empty($line)) {
+                continue;
+            }
+            if(!mb_check_encoding($line, 'UTF-8')){
+                $line = mb_convert_encoding($line, 'UTF-8', 'GBK');
+            }
+            $line = $this->fullDecode($line);
            $number = '';
            $i = 0;
            $lineLen = strlen($line);
            // 提取行首的连续数字（作为序号）
-            while ($i < $lineLen && ctype_digit($line[$i])) {
-                $number .= $line[$i];
-                $i++;
+            $hasFirstChar = false;
+            while ($i < $lineLen) {
+                $currentChar = $line[$i];
+                // 首字符处理：允许 26个字母（大小写）或数字
+                if (!$hasFirstChar) {
+                    if (ctype_digit($currentChar) || ctype_alpha($currentChar)) {
+                        $number .= $currentChar;
+                        $hasFirstChar = true;
+                        $i++;
+                    } else {
+                        // 首字符不符合（非字母/数字），终止循环
+                        break;
+                    }
+                } else {
+                    // 后续字符必须是数字（保持原逻辑）
+                    if (ctype_digit($currentChar)) {
+                        $number .= $currentChar;
+                        $i++;
+                    } else {
+                        break;
+                    }
+                }
            }

            // 若行首有数字，则视为新条目
@@ -840,31 +524,33 @@ class ArticleParserService
                continue;
            }

-            // 非新条目，合并到当前序号的内容中
-            if ($currentNumber !== null) {
-                $grouped[$currentNumber] .= ' ' . $line;
-            }
+            // // 非新条目，合并到当前序号的内容中
+            // if ($currentNumber !== null) {
+            //     $grouped[$currentNumber] .= ' ' . $line;
+            // }
        }

-        //清理结果
-        $possibleEncodings = [
-            'Windows-1252', 'UTF-8', 'GBK', 'GB2312', 
-            'Latin-1', 'ISO-8859-1', 'CP1252'
-        ];
        $aCompany = [];
        foreach ($grouped as $number => $institution) {
-            $encodedContent = @mb_convert_encoding($institution, 'UTF-8', implode(',', $possibleEncodings));
-            $sCompany = $encodedContent ?: $sCompany;
+            $institution = $this->fullDecode($institution);
+            // 原有基础清理逻辑不变
            $institution = preg_replace('/\s+/', ' ', $institution); // 合并多余空格
-            $institution = rtrim($institution, '.');
-            $institution = preg_replace('/^\d+\s+/', '', $institution);
+            $institution = rtrim($institution, '.'); // 去除末尾句号
+            $institution = preg_replace('/^\d+\s+/', '', $institution); // 去除开头数字
            $institution = trim($institution); // 清理首尾空格
-            preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);;
-            $institution = trim($institutionmatches[1] ?? $institution);
-            if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) {
-                $institution = trim($matches[1]); // trim() 去除内容前后多余空格
+
+            // 增强地址提取：匹配"机构名, 城市 邮编, 国家"格式（兼容更多变体）
+            // 允许地址中包含多个逗号（如子机构、街道信息），最终以"城市 邮编, 国家"结尾
+            // preg_match('/(.*?, [A-Za-z\s]+ \d+, [A-Za-z\s]+)/', $institution, $institutionmatches);
+            // $institution = trim($institutionmatches[1] ?? $institution);
+            // 强化冗余信息过滤：去除"*"及之后的内容（包括通讯作者、邮箱等）
+            // 新增对"#"、"†"等标记的过滤，兼容更多期刊格式
+            if (preg_match('/^(.*?)(?=\s*[\*#†]|(?i)\s*Email)/', $institution, $matches)) {
+                $institution = trim($matches[1]);
            }
-            if(!empty($institution) && !mb_check_encoding($institution, 'UTF-8')){
+
+            // 编码校验不变
+            if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) {
                $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
            }
            $aCompany[$number] = $institution;
@@ -891,13 +577,10 @@ class ArticleParserService

        // 获取机构后的完整内容
        $corrText = $this->getContentAfterText($sCompany);
-         //编码修复
-        $possibleEncodings = [
-            'Windows-1252', 'UTF-8', 'GBK', 'GB2312', 
-            'Latin-1', 'ISO-8859-1', 'CP1252'
-        ];
-        $encodedContent = @mb_convert_encoding($corrText, 'UTF-8', implode(',', $possibleEncodings));
-        $corrText = $encodedContent ?: $corrText;
+        if(!mb_check_encoding($corrText, 'UTF-8')){
+            $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
+        }
+        $corrText = $this->fullDecode($corrText);
        // // 调试
        // file_put_contents(ROOT_PATH . 'runtime/corr_text_raw.log', $corrText);

@@ -927,7 +610,8 @@ class ArticleParserService
            ];
        }
        if(empty($aCorresponding)){
-            $pattern = '/Corresponding Authors: (.*?)(?=$|;)/s';
+            $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s';
+            $corrText = trim($corrText,'*');
            preg_match($pattern, $corrText, $match);
            if (!empty($match[1])) {
                $corrContent = $match[1];
@@ -942,6 +626,16 @@ class ArticleParserService
                        ];
                    }
                }
+                if(empty($authors[1])){
+                    $authorPattern = '/([A-Za-z0-9\s]+?),\s*([\w@\.\-]+)(?=\.?)/';
+                    preg_match_all($authorPattern, $corrContent, $authors);
+                    for ($i = 0; $i < count($authors[1]); $i++) {
+                        $aCorresponding[] = [
+                            'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]),
+                            'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i])
+                        ];
+                    }
+                }
            }
        }
        return $aCorresponding;
@@ -1122,24 +816,88 @@ class ArticleParserService
     * 从 Word 文档提取摘要和关键词
     * @return array 提取结果
     */
+    function extractContentIntervals($str, $markers = []) {
+        // Cuts two spans out of $str: the text between the abstract and keywords markers,
+        // and the text between the keywords marker and the end-span sentinel.
+        // $markers may override the 'abstract', 'keywords' and 'end_span' defaults below.
+        // The returned array always has the same keys; when a marker is missing,
+        // 'is_valid' stays false and 'error' says which marker could not be found.
+        $str = (string) $str;
+        $markers = array_merge([
+            'abstract' => 'abstract',
+            'keywords' => 'keywords',
+            'end_span' => '===========end-span'
+        ], $markers);
+        // Read the markers by key instead of extract(): a caller-supplied key such as
+        // 'str' would otherwise silently overwrite a local variable.
+        $abstract = (string) $markers['abstract'];
+        $keywords = (string) $markers['keywords'];
+        $end_span = (string) $markers['end_span'];
+
+        $result = [
+            'abstract_to_keywords' => '',
+            'keywords_to_end' => '',
+            'positions' => ['abstract' => -1, 'keywords' => -1, 'end_span' => -1], // -1 = not found
+            'is_valid' => false,
+            'error' => ''
+        ];
+
+        // Abstract marker: case-insensitive, first occurrence.
+        $absPos = stripos($str, $abstract);
+        if ($absPos === false) {
+            $result['error'] = "未找到标记: {$abstract}";
+            return $result;
+        }
+        $result['positions']['abstract'] = $absPos;
+        $absEndPos = $absPos + strlen($abstract);
+
+        // Keywords marker: case-insensitive, searched only after the abstract marker.
+        $keyPos = stripos($str, $keywords, $absEndPos);
+        if ($keyPos === false) {
+            $result['error'] = "未找到 {$keywords} 或在 {$abstract} 之前";
+            return $result;
+        }
+        $result['positions']['keywords'] = $keyPos;
+        $keyEndPos = $keyPos + strlen($keywords);
+
+        // End sentinel: case-sensitive, searched only after the keywords marker.
+        $endPos = strpos($str, $end_span, $keyEndPos);
+        if ($endPos === false) {
+            $result['error'] = "未找到 {$end_span} 或在 {$keywords} 之前";
+            return $result;
+        }
+        $result['positions']['end_span'] = $endPos;
+
+        // Drop the separator that follows a marker ("Abstract: ...", "Keywords — ...").
+        // ltrim() with a multi-byte "—" in its mask strips single bytes and can cut the
+        // first UTF-8 character of the text in half, so match whole characters instead.
+        $clean = function ($part) {
+            $part = trim($part);
+            $stripped = preg_replace('/^(?:[: \-]|—)+/u', '', $part);
+            // preg_replace() returns null on invalid UTF-8; fall back to the ASCII separators.
+            return trim($stripped === null ? ltrim($part, ': -') : $stripped);
+        };
+
+        // Offsets are byte offsets, so substr() (not mb_substr()) is the matching call.
+        $result['abstract_to_keywords'] = $clean(substr($str, $absEndPos, $keyPos - $absEndPos));
+        $result['keywords_to_end'] = $clean(substr($str, $keyEndPos, $endPos - $keyEndPos));
+
+        $result['is_valid'] = true;
+        return $result;
+    }
    public function extractFromWord() {
        $sContent = '';
        //文本处理
        $sFundContent = '';
+        $aContent = [];
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $textContent = $this->getTextFromElement($element);
                if(empty($textContent)){
                    continue;
                }
-                //编码修复
-                $possibleEncodings = [
-                    'Windows-1252', 'UTF-8', 'GBK', 'GB2312', 
-                    'Latin-1', 'ISO-8859-1', 'CP1252'
-                ];
-                $sContent .= @mb_convert_encoding($textContent, 'UTF-8', implode(',', $possibleEncodings));
-                if(stripos($textContent, 'Keywords:') !== false){
-                    $sContent .= "Keywords-End-Flag";
+                if(!empty($textContent) && !mb_check_encoding($textContent, 'UTF-8')){
+                    $textContent = mb_convert_encoding($textContent, 'UTF-8', 'GBK');
                }
                if(empty($sFundContent)){
                    $aFund = $this->getMatchedFundPhrases($sContent);
@@ -1152,69 +910,194 @@ class ArticleParserService
                        }
                    }
                }
-                $sContent .= "\n";
+                $sContent .= $textContent."===========end-span";
            }
        }
-
        if(!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')){
            $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
        }
-        // 2. 基础文本清理（合并多余空格，保留有效换行）
-        $textContent = preg_replace('/(\S)\s+/', '$1 ', $sContent);
-        $textContent = trim($textContent);
-
+        $result = $this->extractContentIntervals($sContent);
        // 3. 提取摘要
-        $abstract = '';
-        $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords|$)/i';
-        if (preg_match($abstractPattern, $textContent, $abstractMatches)) {
-            $abstract = trim($abstractMatches[1]);
-            $abstract = preg_replace('/\n+/', ' ', $abstract);
+        $abstract = empty($result['abstract_to_keywords']) ? '' : $result['abstract_to_keywords'];
+        if(!empty($abstract) && !mb_check_encoding($abstract, 'UTF-8')){
+            $abstract =  mb_convert_encoding($abstract, 'UTF-8', 'GBK');
        }
-        // 4. 提取关键词（核心：仅保留两种强制匹配逻辑）
-        $keywords = [];
-        // $keywordPattern = '/Keywords:\s*([\s\S]*?)(?=\s*\d+\.|[;,]\s*[\r\n]+\s*[\r\n]+|(?i)\bintroduction|abbreviations\b|$)/i';
-        $keywordPattern = '/Keywords\s*(.*?)\s*Keywords-End-Flag/s';
-
-        if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
-            $keywordStr = trim($keywordMatches[1]);
-            
-            // 清理关键词列表格式（去除换行、末尾多余符号）
-            $keywordStr = preg_replace('/\n+/', ' ', $keywordStr);
-            $keywordStr = rtrim($keywordStr, ';,. '); // 去除末尾分号、逗号等
-            $keywordStr = trim($keywordStr);
-
-            // 分割并过滤有效关键词
-            $keywords = preg_split('/[,;]\s*/', $keywordStr);
-            $keywords = array_filter(array_map('trim', $keywords), function($item) {
-                return !empty($item) && !ctype_space($item);
-            });
+        $keywords = empty($result['keywords_to_end']) ? '' : $result['keywords_to_end'];
+        if(!empty($keywords) && !mb_check_encoding($keywords, 'UTF-8')){
+            $keywords = mb_convert_encoding($keywords, 'UTF-8', 'GBK');
        }
-        if(empty($keywords)){
-            $keywordPattern = '/Keywords\s*([\s\S]*?)(?=Introduction|$)/i';
-            if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
-                $keywordStr = trim($keywordMatches[1]);
-                // 清理关键词列表格式（去除换行、末尾多余符号）
-                $keywordStr = preg_replace('/\n+/', ' ', $keywordStr);
-                $keywordStr = rtrim($keywordStr, ';,. '); // 去除末尾分号、逗号等
-                $keywordStr = trim($keywordStr);
-
-                // 分割并过滤有效关键词
-                $keywords = preg_split('/[,;]\s*/', $keywordStr);
-                $keywords = array_filter(array_map('trim', $keywords), function($item) {
-                    return !empty($item) && !ctype_space($item);
-                });
-            }
+        if(!empty($sFundContent) && !mb_check_encoding($sFundContent, 'UTF-8')){
+            $sFundContent = mb_convert_encoding($sFundContent, 'UTF-8', 'GBK');
        }
+
        return [
            'status' => 1,
            'msg' => '提取成功',
            'data' => [
-                'abstrart' => $abstract,
-                'keywords' => $keywords,
-                'fund' => $sFundContent
+                'abstrart' => empty($abstract) ? '' : $this->fullDecode(str_replace('===========end-span', '',$abstract)),
+                'keywords' => empty($keywords) ? '' : $this->fullDecode(str_replace('===========end-span', '',$keywords)),
+                'fund' => empty($sFundContent) ? '' : $this->fullDecode(str_replace('===========end-span', '',$sFundContent))
            ]
        ];
    }
+    /**
+     * Decode escaped and garbled symbols in extracted text, repeating until the
+     * text stops changing or $maxDepth passes have run.
+     *
+     * Each pass decodes \uXXXX escapes, decodes HTML entities, turns non-breaking
+     * spaces into plain spaces, and then applies heuristic rules that restore
+     * comparison symbols (≥, ≤, ≠) from "?" placeholders.
+     *
+     * @param mixed $str      Text to decode; empty values are returned unchanged.
+     * @param int   $maxDepth Maximum number of passes; values <= 0 disable decoding.
+     * @return mixed The decoded text with leading/trailing ":" removed, or $str
+     *               itself when it is empty or $maxDepth <= 0.
+     */
+    private function fullDecode($str, $maxDepth = 5) {
+        // Nothing to decode, or decoding disabled by a non-positive depth.
+        if (empty($str) || $maxDepth <= 0) {
+            return $str;
+        }
+
+        // Entities for the comparison symbols, replaced before the generic entity decode.
+        $htmlEntityMap = [
+            '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤',
+            '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥',
+            '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠',
+        ];
+
+        // Ordered symbol-repair rules (pattern => replacement), applied top to bottom.
+        // NOTE(review): the generic '\?(\d+)' rule runs before the mixed-symbol and
+        // LE/NE rules, so those later rules can no longer find a "?" followed by
+        // digits and never match. The order is kept as-is to preserve current output;
+        // confirm the intended precedence before reordering.
+        $repairRules = [
+            // Dedicated "≥" cases.
+            '/0B\s*\?0/' => '0B≥30',
+            '/DL\s*\?\.18/' => 'DL≥0.18', // the dot is a literal decimal point
+            // Remove whitespace between ≠ / ≤ and the number that follows.
+            '/≠\s*(\d+)/' => '≠$1',
+            '/≤\s*(\d+)/' => '≤$1',
+            // Generic "≥" cases: "?30" -> "≥30", "?.18" -> "≥0.18".
+            '/\?(\d+)/' => '≥$1',
+            '/\?(\.\d+)/' => '≥0$1',
+            // Three placeholders separated by an ideographic or an ASCII comma.
+            '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/' => '≤、≥、≠$4',
+            '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/' => '≤、≥、≠$4',
+            // "LE" / "NE" markers followed by a placeholder.
+            '/LE\s*\?(\d+)/' => '≤$1',
+            '/NE\s*\?(\d+)/' => '≠$1',
+        ];
+
+        // Second-chance \uXXXX decoder: keeps the escape text when mb_chr() rejects the code.
+        $regUnicode = '/\\\\u([0-9a-fA-F]{4})/';
+        $unicodeCallback = function ($m) {
+            return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
+        };
+
+        for ($pass = 0; $pass < $maxDepth; $pass++) {
+            $before = $str;
+
+            // 1. Decode \uXXXX escapes (a null result signals a PCRE error; keep the text).
+            $decoded = $this->decodeUnicode($str);
+            if ($decoded !== null) {
+                $str = $decoded;
+            }
+
+            // 2. Decode HTML entities: the dedicated map first, then the generic decoder.
+            $str = strtr($str, $htmlEntityMap);
+            $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
+
+            // 3. Decode \uXXXX escapes that only became visible after entity decoding.
+            $decoded = preg_replace_callback($regUnicode, $unicodeCallback, $str);
+            if ($decoded !== null) {
+                $str = $decoded;
+            }
+
+            // 4. Non-breaking space -> plain space. In valid UTF-8 only the complete
+            //    C2 A0 sequence is replaced: a bare 0xA0 byte is also a continuation
+            //    byte of other characters ("≠" is E2 89 A0, "à" is C3 A0), so removing
+            //    it would corrupt them. The bare byte is only treated as a legacy
+            //    single-byte NBSP when the text is not valid UTF-8.
+            $str = str_replace("\xC2\xA0", ' ', $str);
+            if (!mb_check_encoding($str, 'UTF-8')) {
+                $str = str_replace("\xA0", ' ', $str);
+            }
+
+            // 5. Symbol-repair heuristics, in declaration order.
+            //    (The former 'd with ' -> 'd with ' replacement was an identity
+            //    replacement and is omitted.)
+            foreach ($repairRules as $pattern => $replacement) {
+                $repaired = preg_replace($pattern, $replacement, $str);
+                if ($repaired !== null) {
+                    $str = $repaired;
+                }
+            }
+
+            // Stop as soon as a full pass leaves the text untouched. Comparing the
+            // text itself is the reliable signal: match counts are non-zero even for
+            // replacements that rewrite a match to the same text.
+            if ($str === $before) {
+                break;
+            }
+        }
+
+        // Final cleanup: strip leading/trailing colons.
+        return trim($str, ':');
+    }
+    // private function fullDecode($str, $maxDepth = 5) {
+    //     if (empty($str) || $maxDepth <= 0) {
+    //         return $str;
+    //     }
+
+    //     $original = $str;
+    //     $depth = 0;
+
+    //     // 循环解码，直到无变化或达到最大次数
+    //     while (true) {
+    //         $depth++;
+    //         if ($depth > $maxDepth) {
+    //             break; // 防止过度解码导致死循环
+    //         }
+
+    //         // 1. 解码 Unicode 转义（\uXXXX 格式）
+    //         $str = $this->decodeUnicode($str);
+
+    //         // 2. 解码 HTML 实体（&amp;、&#039;、&lt; 等）
+    //         $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
+
+    //         $str = preg_replace_callback('/\\\\u([0-9a-fA-F]{4})/', function ($m) {
+    //             return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
+    //         }, $str);
+    //         $str = str_replace([chr(0xC2).chr(0xA0), chr(0xA0)], ' ', $str);
+
+    //         // 2. 核心：强制匹配所有可能的乱码格式，还原≥
+    //         // 匹配：0B?0、0B ?0、0B ?0（空格/制表符）→ 0B≥30
+    //         $str = preg_replace('/0B\s*\?0/', '0B≥30', $str);
+    //         // 匹配：DL?.18、DL ?.18、DL ?.18 → DL≥0.18
+    //         $str = preg_replace('/DL\s*\?.18/', 'DL≥0.18', $str);
+    //         // 通用匹配：数字前的?（如?30、?0.18）→ ≥30、≥0.18（防止其他变体）
+    //         $str = preg_replace('/\?(\d+)/', '≥$1', $str);
+    //         $str = preg_replace('/\?(\.\d+)/', '≥0$1', $str);
+
+    //         // 3. 修复前缀的"d with "可能的乱码（若有）
+    //         $str = str_replace('d with ', 'd with ', $str); // 若前缀也乱码，可同步替换
+
+    //         // 若解码后无变化，退出循环
+    //         if ($str === $original) {
+    //             break;
+    //         }
+
+    //         $original = $str;
+    //     }
+
+    //     return trim($str,':');
+    // }
+    /**
+     * Decode literal \uXXXX escape sequences into UTF-8 characters.
+     *
+     * A high surrogate immediately followed by a low surrogate is decoded as one
+     * code point above U+FFFF. A surrogate that is not part of such a pair cannot
+     * be represented in UTF-8 and is left as its original escape text.
+     *
+     * @param string $str Text that may contain \uXXXX escapes.
+     * @return string|null The decoded text, or null when the regex engine fails.
+     */
+    private function decodeUnicode($str) {
+        // Alternative 1: surrogate pair (groups 1 and 2). Alternative 2: any single escape (group 3).
+        $pattern = '/\\\\u(d[89ab][0-9a-f]{2})\\\\u(d[c-f][0-9a-f]{2})|\\\\u([0-9a-f]{4})/i';
+
+        return preg_replace_callback(
+            $pattern,
+            function ($matches) {
+                // Surrogate pair: the two 16-bit units together form one UTF-16BE character.
+                if (!empty($matches[1])) {
+                    return mb_convert_encoding(pack('H*', $matches[1] . $matches[2]), 'UTF-8', 'UTF-16BE');
+                }
+
+                // Unpaired surrogate: keep the escape text instead of emitting a substitute character.
+                $code = hexdec($matches[3]);
+                if ($code >= 0xD800 && $code <= 0xDFFF) {
+                    return $matches[0];
+                }
+
+                // Ordinary BMP code unit: convert the hexadecimal code to a UTF-8 character.
+                return mb_convert_encoding(pack('H*', $matches[3]), 'UTF-8', 'UCS-2BE');
+            },
+            $str
+        );
+    }
    private function getMatchedFundPhrases($content = '') {
        if (empty($content)) {
            return [];
@@ -1223,7 +1106,7 @@ class ArticleParserService
        // 基金支持词组列表
        $fundPhrases = [
            'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
-            'Funding was provided by', 'Funded in part by'
+            'Funding was provided by', 'Funded in part by','FUNDING:'
        ];

        // 1. 转义词组中的特殊字符，使用 # 作为分隔符