代码修改

2025-12-02 13:17:23 +08:00
parent 93f9e705cb
commit 705dce5e94
1 changed files with 405 additions and 202 deletions
--- a/application/common/ArticleParserService.php
+++ b/application/common/ArticleParserService.php
@@ -553,7 +553,7 @@ class ArticleParserService
            if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) {
                $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
            }
-            $aCompany[$number] = $institution;
+            $aCompany[$number] = empty($institution) ? '' : trim(trim($institution),'.');
        }
        return $aCompany;
    }
@@ -581,6 +581,7 @@ class ArticleParserService
            $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
        }
        $corrText = $this->fullDecode($corrText);
+
        // // 调试
        // file_put_contents(ROOT_PATH . 'runtime/corr_text_raw.log', $corrText);

@@ -605,24 +606,25 @@ class ArticleParserService
            $aCorresponding[] = [
                'name' => $sName,
                'email' => isset($email[2]) ? trim($email[2]) : '',
-                'postal_address' => isset($address[2]) ? trim($address[2]) : '',
+                'postal_address' => isset($address[2]) ? trim(trim($address[2]),'.') : '',
                'tel' => isset($tel[2]) ? trim($tel[2]) : ''
            ];
        }
        if(empty($aCorresponding)){
-            $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s';
+            // $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s';
+            $pattern = '/(Corresponding Authors|Correspondence to|Correspondence)\s*:\s*([\s\S]+?)(?=\n\s*\n|$|;)/is';
            $corrText = trim($corrText,'*');
            preg_match($pattern, $corrText, $match);
-            if (!empty($match[1])) {
-                $corrContent = $match[1];
+            if (!empty($match[2])) {
+                $corrContent = $match[2];
                // 提取每个作者的名称和邮箱（优化正则，支持更多字符）
                $authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/';
                preg_match_all($authorPattern, $corrContent, $authors);
                if(!empty($authors[1])){
                    for ($i = 0; $i < count($authors[1]); $i++) {
                        $aCorresponding[] = [
-                            'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]),
-                            'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i])
+                            'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'),
+                            'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.')
                        ];
                    }
                }
@@ -631,8 +633,8 @@ class ArticleParserService
                    preg_match_all($authorPattern, $corrContent, $authors);
                    for ($i = 0; $i < count($authors[1]); $i++) {
                        $aCorresponding[] = [
-                            'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]),
-                            'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i])
+                            'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'),
+                            'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.')
                        ];
                    }
                }
@@ -734,88 +736,293 @@ class ArticleParserService
    }

    // 统一提取元素文本
-    private function getTextFromElement($element,$lineNumber = 0){
+    private function getTextFromElement(\PhpOffice\PhpWord\Element\AbstractElement $element, int $lineNumber = 0){
         $text = '';
-        if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
-           $this->iNum++;
-            $text .= $this->iNum;
+
+        // Typographic quotes mapped to ASCII quotes; `static` so the array is initialised once and reused across calls.
+        static $specialQuotesMap = [
+            '’' => "'",  // right single quotation mark (U+2019) -> apostrophe (U+0027)
+            '‘' => "'",  // left single quotation mark (U+2018) -> apostrophe (U+0027)
+            '“' => '"',  // left double quotation mark (U+201C) -> quotation mark (U+0022)
+            '”' => '"',  // right double quotation mark (U+201D) -> quotation mark (U+0022)
+            '„' => '"',  // double low-9 quotation mark (U+201E) -> quotation mark (U+0022)
+            '‟' => '"',  // double high-reversed-9 quotation mark (U+201F) -> quotation mark (U+0022)
+        ];
+
+        // Title elements: getText() is either a TextRun (recursed into) or a value that is cast to string.
+        if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
+            $titleContent = $element->getText();
+            $titleText = '';
+
+            if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
+                $titleText = $this->getTextFromElement($titleContent);
+            } else {
+                $titleText = strtr((string)$titleContent, $specialQuotesMap);
+            }
+
+            $text .= $titleText . ' ';
+            return $this->cleanText($text);
         }
-        // 处理PreserveText元素
+
+        // List items: bump the running counter $this->iNum (reset to 0 when unset or non-numeric) and prefix it to the text.
+        if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
+            $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
+            $this->iNum++;
+            $text .= $this->iNum . ' ';
+        }
+
+        // PreserveText: read the non-public `text` property via reflection; HYPERLINK parts contribute only their mailto address.
         if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
-            // 通过反射获取私有属性 text
-            $reflection = new \ReflectionClass($element);
-            $property = $reflection->getProperty('text');
-            $property->setAccessible(true);
-            $textParts = $property->getValue($element);
+            try {
+                $reflection = new \ReflectionClass($element);
+                // Return what has been collected so far when the class has no `text` property.
+                if (!$reflection->hasProperty('text')) {
+                    return $this->cleanText($text);
+                }
+                $property = $reflection->getProperty('text');
+                $property->setAccessible(true);
+                $textParts = $property->getValue($element) ?? [];
+            } catch (\ReflectionException $e) {
+                return $this->cleanText($text);
+            }
+
             foreach ($textParts as $part) {
+                $part = (string)$part;
                 if (strpos($part, 'HYPERLINK') !== false) {
-                    // 解码 HTML 实体（&quot; -> "）
-                    $decoded = html_entity_decode($part);
-                    // 提取 mailto: 后的邮箱
-                    if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
+                    $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
+                    // Capture the address that follows "mailto:"; the top-level domain is limited to 2-10 ASCII letters.
+                    if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
                         $text .= $match[1] . ' ';
                     }
                 } else {
-                    // 普通文本直接拼接
+                    $part = strtr($part, $specialQuotesMap);
                     $text .= $part;
                 }
             }
-            return $text;
+            return $this->cleanText($text);
         }
-        // 处理表格和单元格（E-mail可能在表格中）
+
+        // Tables: concatenate the text of every cell, each followed by a space; cleanText() collapses the repeats.
         if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
             foreach ($element->getRows() as $row) {
                 foreach ($row->getCells() as $cell) {
-                    $text .= $this->getTextFromElement($cell);
+                    $text .= $this->getTextFromElement($cell) . ' ';
                 }
+                // No extra separator per row: every cell text already ends with a space.
             }
-            return $text;
+            return $this->cleanText($text);
         }
+
+        // Cells: recurse into each child element.
         if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
             foreach ($element->getElements() as $child) {
                 $text .= $this->getTextFromElement($child);
             }
-            return $text;
+            return $this->cleanText($text);
         }

-        //处理嵌套元素（递归提取所有子元素）
-        if (method_exists($element, 'getElements')) {
+        // Any other container exposing getElements(): recurse into the children that are AbstractElement instances.
+        if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
             foreach ($element->getElements() as $child) {
-                $text .= $this->getTextFromElement($child);
+                if ($child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
+                    $text .= $this->getTextFromElement($child);
+                }
             }
         }

-        //处理文本元素（包括带格式的文本）
+        // Plain text elements: append the text with typographic quotes normalised.
         if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
-            $text .= $element->getText();
+            $textPart = (string)$element->getText(); // cast guards against a non-string return value
+            $textPart = strtr($textPart, $specialQuotesMap);
+            $text .= $textPart;
         }

-        //处理超链接（优先提取链接目标，可能是邮箱）
+        // Links: emit the mailto address first (when the target is one), then the visible link text.
         if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
-            $target = $element->getTarget();
+            $target = (string)$element->getTarget();
             if (strpos($target, 'mailto:') === 0) {
-                $text .= str_replace('mailto:', '', $target) . ' '; // 剥离mailto:前缀
+                $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
             }
-            $text .= $element->getText() . ' ';
+            $linkText = strtr((string)$element->getText(), $specialQuotesMap);
+            $text .= $linkText . ' ';
         }

-        //处理字段和注释（可能包含隐藏邮箱）
+        // Fields and notes: append getContent() cast to string. NOTE(review): confirm the installed PhpWord exposes Field/Note::getContent().
         if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
-            $text .= $element->getContent() . ' ';
+            $text .= (string)$element->getContent() . ' ';
         }
         if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
-            $text .= $element->getContent() . ' ';
+            $text .= (string)$element->getContent() . ' ';
         }
-        //清理所有不可见字符（关键：移除格式干扰）
-        $text = preg_replace('/[\x00-\x1F\x7F-\x9F]/', ' ', $text); // 移除控制字符
-        $text = str_replace(["\t", "\r", "\n"], ' ', $text); // 统一空白字符
-        $text = preg_replace('/\s+/', ' ', $text); // 合并多个空格
-        if(!empty($text) && !mb_check_encoding($text, 'UTF-8')){
-            $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
-        }
-        return $text;
+
+        return $this->cleanText($text);
     }

+    /**
+     * Normalises text extracted from a Word element into single-spaced UTF-8.
+     *
+     * Steps, in order: convert input that is not valid UTF-8 via mb_convert_encoding();
+     * replace control characters (U+0000-U+001F, U+007F-U+009F) with a space;
+     * replace tab/CR/LF, NBSP, the ideographic space and the narrow NBSP with a space;
+     * collapse every whitespace run into one space. The result is not trimmed.
+     *
+     * @param string $text raw text to clean
+     * @return string cleaned text
+     */
+    private function cleanText(string $text){
+        // Candidate source encodings offered to mb_convert_encoding() for non-UTF-8 input.
+        $fallbackEncodings = 'GBK,GB2312,GB18030,Big5,ISO-8859-1,CP1252,UTF-16,UTF-32';
+
+        $normalized = mb_check_encoding($text, 'UTF-8')
+            ? $text
+            : mb_convert_encoding($text, 'UTF-8', $fallbackEncodings);
+
+        // Control characters become spaces instead of being dropped.
+        $normalized = preg_replace('/[\x00-\x1F\x7F-\x9F]/u', ' ', $normalized);
+
+        // Tab, CR, LF, NBSP (U+00A0), ideographic space (U+3000), narrow NBSP (U+202F).
+        $spaceVariants = ["\t", "\r", "\n", "\xC2\xA0", '　', "\xE2\x80\xAF"];
+        $normalized = str_replace($spaceVariants, ' ', $normalized);
+
+        // Squeeze each whitespace run down to a single space.
+        $normalized = preg_replace('/\s+/u', ' ', $normalized);
+
+        return $normalized;
+    }
+    // private function getTextFromElement($element, $lineNumber = 0){
+    //     // 初始化默认空字符串（保持原有逻辑）
+    //     $text = '';
+
+    //     // 1. 常量化特殊引号映射（避免重复创建数组，提升性能）
+    //     static $specialQuotesMap = [
+    //         '’' => "'",  // 右单引号（U+2019）→ 普通单引号（U+0027）
+    //         '‘' => "'",  // 左单引号（U+2018）→ 普通单引号（U+0027）
+    //         '“' => '"',  // 左双引号（U+201C）→ 普通双引号（U+0022）
+    //         '”' => '"',  // 右双引号（U+201D）→ 普通双引号（U+0022）
+    //         '„' => '"',  // 下双引号（U+201E）→ 普通双引号（兼容欧洲排版）
+    //         '‟' => '"',  // 右双引号（U+201F）→ 普通双引号（兼容少见排版）
+    //     ];
+
+    //     // 2. 提前校验元素合法性（避免后续 instanceof 无效判断，减少报错）
+    //     if (!is_object($element) || !$element instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
+    //         return $text;
+    //     }
+
+    //     // 支持H1标题格式（逻辑不变，优化变量命名可读性）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
+    //         $titleContent = $element->getText();
+    //         $titleText = '';
+
+    //         // 关键修复：判断返回类型，递归提取文本（逻辑不变）
+    //         if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
+    //             $titleText = $this->getTextFromElement($titleContent);
+    //         } else {
+    //             $titleText = strtr((string)$titleContent, $specialQuotesMap);
+    //         }
+
+    //         $text .= $titleText . ' ';
+    //         return $text;
+    //     }
+
+    //     // 项目编号（逻辑不变，优化空值判断为严格判断）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
+    //         $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
+    //         $this->iNum++;
+    //         $text .= $this->iNum . ' ';
+    //     }
+
+    //     // 处理PreserveText元素（核心逻辑不变，增强容错性）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
+    //         try {
+    //             $reflection = new \ReflectionClass($element);
+    //             $property = $reflection->getProperty('text');
+    //             $property->setAccessible(true);
+    //             // 空值兜底，避免遍历非数组报错
+    //             $textParts = $property->getValue($element) ?? [];
+    //         } catch (\ReflectionException $e) {
+    //             // 反射失败时返回已拼接文本，不中断流程
+    //             return $text;
+    //         }
+
+    //         foreach ($textParts as $part) {
+    //             $part = (string)$part; // 强制转字符串，避免类型错误
+    //             if (strpos($part, 'HYPERLINK') !== false) {
+    //                 $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
+    //                 // 邮箱正则不变，保持原有匹配逻辑
+    //                 if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
+    //                     $text .= $match[1] . ' ';
+    //                 }
+    //             } else {
+    //                 $text .= $part;
+    //             }
+    //         }
+    //         return $text;
+    //     }
+
+    //     // 处理表格和单元格（逻辑不变，优化循环变量命名）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
+    //         foreach ($element->getRows() as $row) {
+    //             foreach ($row->getCells() as $cell) {
+    //                 $text .= $this->getTextFromElement($cell);
+    //             }
+    //         }
+    //         return $text;
+    //     }
+
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
+    //         foreach ($element->getElements() as $child) {
+    //             $text .= $this->getTextFromElement($child);
+    //         }
+    //         return $text;
+    //     }
+
+    //     // 处理嵌套元素（逻辑不变，增强方法存在性校验）
+    //     if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
+    //         foreach ($element->getElements() as $child) {
+    //             // 双重校验，避免非元素对象传入
+    //             if (is_object($child) && $child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
+    //                 $textPart = $this->getTextFromElement($child);
+    //                 $text .= $textPart;
+    //             }
+    //         }
+    //     }
+
+    //     // 处理文本元素（逻辑不变，保持特殊引号替换）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
+    //         $textPart = (string)$element->getText(); // 强制转字符串，避免空值
+    //         $textPart = strtr($textPart, $specialQuotesMap);
+    //         $text .= $textPart;
+    //     }
+
+    //     // 处理超链接（逻辑不变，优化变量类型转换）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
+    //         $target = (string)$element->getTarget();
+    //         if (strpos($target, 'mailto:') === 0) {
+    //             $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
+    //         }
+    //         $linkText = strtr((string)$element->getText(), $specialQuotesMap);
+    //         $text .= $linkText . ' ';
+    //     }
+
+    //     // 处理字段和注释（逻辑不变，增加类型转换，避免非字符串拼接）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
+    //         $text .= (string)$element->getContent() . ' ';
+    //     }
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
+    //         $text .= (string)$element->getContent() . ' ';
+    //     }
+
+    //     // 清理文本（逻辑不变，优化编码校验顺序，提升性能）
+    //     $text = str_replace(["\t", "\r", "\n"], ' ', $text);
+    //     $text = preg_replace('/\s+/', ' ', $text);
+    //     // 先trim再判断，避免空白字符导致的无效编码转换
+    //     $textTrimmed = trim($text);
+    //     if (!empty($textTrimmed) && !mb_check_encoding($textTrimmed, 'UTF-8')) {
+    //         $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
+    //     }
+
+    //     return $text;
+    // }
    /**
     * 从 Word 文档提取摘要和关键词
     * @return array 提取结果
@@ -950,221 +1157,217 @@ class ArticleParserService
     * @param int $maxDepth 最大解析深度
     * @return string
     */
-    private function fullDecode($str, $maxDepth = 2)
-    {
-        // 空值/深度为0，直接返回（提前终止，避免无效操作）
-        if (empty($str) || $maxDepth <= 0) {
-            return $str;
+   private function fullDecode(?string $str, int $maxDepth = 2){
+        // Null, whitespace-only input or a non-positive depth: nothing to decode, return the trimmed input ('' for null).
+        if ($str === null || trim((string)$str) === '' || $maxDepth <= 0) {
+            return $str === null ? '' : trim((string)$str);
         }
+
+        // $str cannot be null past the guard above; the cast only makes the string type explicit.
+        $str = (string)$str;
+        // Run decodeUnicode() (defined elsewhere in this class) before any of the patterns below are applied.
         $str = $this->decodeUnicode($str);
-        // ========== 预编译所有正则（合并同类型，避免循环内重复解析） ==========
+
+        // ========== Patterns, built once before the decode loop ==========
         $regexps = [
-            // 原有专属场景正则
-            'ob0' => '/0B\s*\?0/',
-            'dl18' => '/DL\s*\?.18/',
-            // 原有通用场景正则
-            'qMarkNum' => '/\?(\d+)/',
-            'qMarkDotNum' => '/\?(\.\d+)/',
-            // ≤、≠空格修复正则
-            'neNum' => '/≠\s*(\d+)/',
-            'leNum' => '/≤\s*(\d+)/',
-            // 混合符号乱码正则（合并中英文顿号/逗号）
-            'mixSymbol' => '/(\?)\s*(、|,)\s*(\?)\s*(、|,)\s*(\?)(\d+)/',
-            // ≤、≠专属标识正则（合并LE/NE）
-            'leNeMark' => '/(LE|NE)\s*\?(\d+)/',
-            // Unicode转义正则
-            'unicode' => '/\\\\u([0-9a-fA-F]{4})/',
-            // Word二进制乱码（合并≤≥≠）
-            'wordBin' => '/(\\xE2\\x89\\x86|\\xE2 0x89 0x86|e28986|\\xE2\\x89\\x87|\\xE2 0x89 0x87|e28987|\\xE2\\x89\\x80|\\xE2 0x89 0x80|e28980)/i',
-            // Word XML实体异常（合并≤≥≠）
-            'wordEntity' => '/&#\s*(\x|X)?\s*(2264|2265|2260)\s*;?/i',
-            // 不可见控制字符
-            'controlChar' => '/[\x00-\x1F\x7F]/',
-            // 重复符号去重（合并≤≥≠）
-            'repeatSymbol' => '/(≤{2,}|≥{2,}|≠{2,})/',
-            // GBK编码乱码（合并≤≥≠）
-            'gbkSymbol' => '/(\xA1\xF2|\xA1\xF3|\xA1\xF0)/'
+            // Project-specific cases: optional whitespace, then a literal question mark
+            'ob0' => '/0B\s*\\?0/',          // 0B?0, 0B  ?0, ...
+            'dl18' => '/DL\s*\\?\.18/',      // DL?.18 exactly (the dot is literal)
+            // Generic cases: a literal question mark in front of a number
+            'qMarkNum' => '/\\?(\d+)/',       // ?123, ?45, ...
+            'qMarkDotNum' => '/\\?(\.\d+)/',  // ?.18, ?.25, ...
+            // Whitespace between ≠ / ≤ and the digits that follow
+            'neNum' => '/≠\s*(\d+)/u',
+            'leNum' => '/≤\s*(\d+)/u',
+            // Three question marks separated by 、 or , with digits after the last one
+            'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
+            // LE / NE markers (case-insensitive) in front of "?digits"
+            'leNeMark' => '/(LE|NE)\s*\\?(\d+)/i',
+            // \uXXXX or \UXXXX escapes with exactly four hex digits
+            'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/',
+            // Textual byte dumps "\xE2\x89\x8N", "\xE2 0x89 0x8N" or "e2898N", N in 0/6/7; the backslashes are matched literally
+            'wordBin' => '/\\\\xE2(?:\\\\x89\\\\x8|\s*0x89\s*0x8)[067]|e2898[067]/i',
+            // Damaged numeric entities such as "&# x 2264" (optional x, optional semicolon)
+            'wordEntity' => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i',
+            // ASCII control characters and DEL
+            'controlChar' => '/[\x00-\x1F\x7F]/u',
+            // Runs of the same symbol (≤, ≥ or ≠), matched through a back-reference
+            'repeatSymbol' => '/(≤|≥|≠)\1+/u',
+            // Raw byte pairs A1 F2 / A1 F3 / A1 F0
+            'gbkSymbol' => '/\xA1\xF2|\xA1\xF3|\xA1\xF0/'
         ];

-        // ========== 预定义所有替换映射（避免循环内重复创建） ==========
+        // ========== Replacement tables, built once before the decode loop ==========
         $maps = [
-            // HTML实体映射（扩展Word实体）
+            // HTML entities for ≤ ≥ ≠, including forms that lack the trailing semicolon
             'htmlEntity' => [
-                '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤',
-                '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥',
-                '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠',
-                '&le' => '≤', '&ge' => '≥', '&ne' => '≠',
-                '&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠',
-                '&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠',
-                '&#X2264' => '≤', '&#X2265' => '≥', '&#X2260' => '≠',
-                '&#60;' => '≤', '&#62;' => '≥'
+                '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', '&#X2264;' => '≤',
+                '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', '&#X2265;' => '≥',
+                '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '&#X2260;' => '≠',
+                '&le' => '≤', '&ge' => '≥', '&ne' => '≠',  // named entities without semicolon
+                '&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠',  // "&#" + hex digits, no x and no semicolon
+                '&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠',  // hex entities without semicolon
+                '&#60;' => '≤', '&#62;' => '≥',  // project-specific: &#60; / &#62; (numerically < and >) are mapped to ≤ / ≥
             ],
-            // 空格替换数组（扩展Word中的各种空格）
+            // Space variants that are folded to a plain ASCII space
             'nbsp' => [
-                chr(0xC2) . chr(0xA0), // UTF-8不间断空格
-                chr(0xA0), // 拉丁1不间断空格
-                '　', // 全角空格
-                chr(0x2002), // 方头空格
-                chr(0x2003), // 全角空格
-                chr(0x2004)  // 三分之一全角空格
+                // Written as "\u{...}" strings: chr() yields one byte (argument mod 256), so chr(0x202F) would be "/".
+                // A bare 0xA0 byte is not listed: it is part of many multi-byte UTF-8 characters and must not be replaced.
+                "\u{00A0}",  // no-break space (&nbsp;)
+                "\u{3000}",  // ideographic (full-width) space
+                "\u{2002}",  // en space
+                "\u{2003}",  // em space
+                "\u{2004}",  // three-per-em space
+                "\u{2005}",  // four-per-em space
+                "\u{202F}",  // narrow no-break space
             ],
-            // 二进制乱码映射
+            // Normalised dump token => symbol (literal backslashes). NOTE(review): UTF-8 for ≤/≥/≠ is E2 89 A4/A5/A0 — confirm hex.
             'wordBin' => [
-                'e28986' => '≤', '\\xe2\\x89\\x86' => '≤', '\\xe2 0x89 0x86' => '≤',
-                'e28987' => '≥', '\\xe2\\x89\\x87' => '≥', '\\xe2 0x89 0x87' => '≥',
-                'e28980' => '≠', '\\xe2\\x89\\x80' => '≠', '\\xe2 0x89 0x80' => '≠'
+                'e28986' => '≤',
+                '\xe2\x89\x86' => '≤',
+                '\xe20x890x86' => '≤',  // the "\xE2 0x89 0x86" form once its whitespace is removed
+                'e28987' => '≥',
+                '\xe2\x89\x87' => '≥',
+                '\xe20x890x87' => '≥',
+                'e28980' => '≠',
+                '\xe2\x89\x80' => '≠',
+                '\xe20x890x80' => '≠',
             ],
-            // XML实体编码映射
+            // Hex code captured by the wordEntity pattern => symbol
             'wordEntity' => [
                 '2264' => '≤',
                 '2265' => '≥',
-                '2260' => '≠'
+                '2260' => '≠',
             ],
-            // GBK编码映射
+            // Raw byte pairs => symbol (double quotes, so \xA1 etc. are real bytes)
             'gbkSymbol' => [
-                '\xA1\xF2' => '≤',
-                '\xA1\xF3' => '≥',
-                '\xA1\xF0' => '≠'
-            ]
+                "\xA1\xF2" => '≤',  // bytes A1 F2
+                "\xA1\xF3" => '≥',
+                "\xA1\xF0" => '≠',
+            ],
         ];

-        // 预定义回调函数（仅创建一次，避免循环内重复实例化）
+        // Callback for the unicode pattern, created once before the loop
         $unicodeCallback = function ($m) {
-            return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
+            $code = hexdec($m[1]);
+            // Control characters (< 0x20) stay escaped; mb_chr() returns false for surrogates (D800-DFFF), keep those too.
+            $char = $code >= 0x20 ? mb_chr($code, 'UTF-8') : false;
+            return $char === false ? $m[0] : $char;
         };

         $depth = 0;
         $hasChange = false;
-        $original = $str;
+        $currentStr = $str;

-        // 循环解码：仅在有变化且未达最大深度时执行
+        // Repeat while the previous pass changed the string and the depth limit has not been reached
         do {
             $depth++;
             $hasChange = false;
-            $prevStr = $str;
+            $prevStr = $currentStr;

-            // ========== 前置处理（惰性执行，避免无意义操作） ==========
-            $countCtrl = 0;
+            // ========== Preparation (each step runs only when it applies) ==========
             // 1. 过滤不可见控制字符（仅当包含时执行）
-            if (preg_match($regexps['controlChar'], $str)) {
-                $str = preg_replace($regexps['controlChar'], '', $str, -1, $countCtrl);
+            if (preg_match($regexps['controlChar'], $currentStr)) {
+                $currentStr = preg_replace($regexps['controlChar'], '', $currentStr);
             }

-            // 2. GBK/GB2312编码转UTF-8（仅当非UTF-8时执行）
-            if (!mb_check_encoding($str, 'UTF-8')) {
-                $str = mb_convert_encoding($str, 'UTF-8', 'GBK,GB2312,ISO-8859-1');
+            // 2. Re-encode to UTF-8 only when the current bytes are not valid UTF-8
+            if (!mb_check_encoding($currentStr, 'UTF-8')) {
+                $converted = mb_convert_encoding($currentStr, 'UTF-8', 'GBK,GB2312,ISO-8859-1,CP1252');
+                // Keep the original bytes when the conversion result is still not valid UTF-8
+                $currentStr = mb_check_encoding($converted, 'UTF-8') ? $converted : $currentStr;
             }

-            // ========== 核心解码逻辑 ==========
-            // 1. 解码Unicode转义
-            $str = preg_replace_callback($regexps['unicode'], $unicodeCallback, $str);
+            // ========== Core decoding ==========
+            // 1. \uXXXX escapes first, so the decoded characters are visible to the steps below
+            $currentStr = preg_replace_callback($regexps['unicode'], $unicodeCallback, $currentStr);

-            // 2. 解码HTML实体（高性能strtr替换）
-            $str = strtr($str, $maps['htmlEntity']);
-            $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8');
+            // 2. HTML entities: the explicit table first, html_entity_decode() for everything else
+            $currentStr = strtr($currentStr, $maps['htmlEntity']);
+            $currentStr = html_entity_decode(
+                $currentStr,
+                ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
+                'UTF-8'
+            );

-            // 3. 替换各种空格为普通空格
-            $str = str_replace($maps['nbsp'], ' ', $str);
+            // 3. Fold every listed space variant into a plain space
+            $currentStr = str_replace($maps['nbsp'], ' ', $currentStr);

-            // ========== Word特殊符号乱码修复（合并+惰性） ==========
-            $countBin = $countEnt = $countGbk = $countRepeat = 0;
-
-            // 1. 二进制乱码还原（合并正则+回调）
-            if (preg_match($regexps['wordBin'], $str)) {
-                $str = preg_replace_callback($regexps['wordBin'], function ($m) use ($maps) {
-                    $key = strtolower(str_replace(' ', '', $m[0]));
-                    return $maps['wordBin'][$key] ?? $m[0];
-                }, $str, -1, $countBin);
+            // ========== Word symbol repairs (each step runs only when its pattern is present) ==========
+            // 1. Byte dumps: only the matched token is normalised (whitespace removed, lower-cased); other spaces are kept
+            if (preg_match($regexps['wordBin'], $currentStr)) {
+                $currentStr = preg_replace_callback($regexps['wordBin'], function ($m) use ($maps) {
+                    return $maps['wordBin'][strtolower(preg_replace('/\s+/', '', $m[0]))] ?? $m[0];
+                }, $currentStr);
             }

-            // 2. XML实体异常修复（合并正则+回调）
-            if (preg_match($regexps['wordEntity'], $str)) {
-                $str = preg_replace_callback($regexps['wordEntity'], function ($m) use ($maps) {
-                    return $maps['wordEntity'][$m[2]] ?? $m[0];
-                }, $str, -1, $countEnt);
+            // 2. Damaged numeric entities: look up the captured hex code
+            if (preg_match($regexps['wordEntity'], $currentStr)) {
+                $currentStr = preg_replace_callback($regexps['wordEntity'], function ($m) use ($maps) {
+                    return $maps['wordEntity'][$m[1]] ?? $m[0];
+                }, $currentStr);
             }

-            // 3. GBK编码乱码修复（合并正则+回调）
-            if (preg_match($regexps['gbkSymbol'], $str)) {
-                $str = preg_replace_callback($regexps['gbkSymbol'], function ($m) use ($maps) {
-                    return $maps['gbkSymbol'][$m[0]] ?? $m[0];
-                }, $str, -1, $countGbk);
+            // 3. Raw byte pairs, only while the string is still not valid UTF-8: inside valid UTF-8 the pairs A1 F0/F2/F3
+            //    can only straddle two adjacent characters, and replacing them would corrupt both.
+            if (!mb_check_encoding($currentStr, 'UTF-8') && preg_match($regexps['gbkSymbol'], $currentStr)) {
+                $currentStr = strtr($currentStr, $maps['gbkSymbol']);
             }

-            // 4. 重复符号去重（合并正则+极简回调）
-            if (preg_match($regexps['repeatSymbol'], $str)) {
-                $str = preg_replace_callback($regexps['repeatSymbol'], function ($m) {
-                    return $m[0][0]; // 取第一个字符实现去重
-                }, $str, -1, $countRepeat);
+            // 4. Collapse runs of the same symbol into one occurrence
+            if (preg_match($regexps['repeatSymbol'], $currentStr)) {
+                $currentStr = preg_replace($regexps['repeatSymbol'], '$1', $currentStr);
             }

-            // ========== 原有核心替换逻辑（合并+惰性） ==========
-            $count1 = $count2 = $count3 = $count4 = $count5 = $count6 = 0;
-            $count7 = $count8 = $count9 = 0;
-
-            // 1. 专属场景替换（惰性执行）
-            if (strpos($str, '0B?0') !== false) {
-                $str = preg_replace($regexps['ob0'], '0B≥30', $str, -1, $count1);
+            // ========== Project-specific replacements ==========
+            // 1. 0B?0 -> 0B≥30 and DL?.18 -> DL≥0.18
+            if (strpos($currentStr, '0B') !== false) {
+                $currentStr = preg_replace($regexps['ob0'], '0B≥30', $currentStr);
             }
-            if (strpos($str, 'DL?.18') !== false) {
-                $str = preg_replace($regexps['dl18'], 'DL≥0.18', $str, -1, $count2);
+            if (strpos($currentStr, 'DL') !== false) {
+                $currentStr = preg_replace($regexps['dl18'], 'DL≥0.18', $currentStr);
             }

-            // 2. ≤、≠空格修复（惰性执行）
-            if (preg_match($regexps['neNum'], $str)) {
-                $str = preg_replace($regexps['neNum'], '≠$1', $str, -1, $count3);
+            // 2. Remove whitespace between ≠ / ≤ and the number that follows
+            if (preg_match($regexps['neNum'], $currentStr)) {
+                $currentStr = preg_replace($regexps['neNum'], '≠$1', $currentStr);
             }
-            if (preg_match($regexps['leNum'], $str)) {
-                $str = preg_replace($regexps['leNum'], '≤$1', $str, -1, $count4);
+            if (preg_match($regexps['leNum'], $currentStr)) {
+                $currentStr = preg_replace($regexps['leNum'], '≤$1', $currentStr);
             }

-            // 3. 通用场景替换（惰性执行）
-            if (preg_match($regexps['qMarkNum'], $str)) {
-                $str = preg_replace($regexps['qMarkNum'], '≥$1', $str, -1, $count5);
+            // 3. Generic case: a question mark in front of a number becomes ≥
+            if (preg_match($regexps['qMarkNum'], $currentStr)) {
+                $currentStr = preg_replace($regexps['qMarkNum'], '≥$1', $currentStr);
             }
-            if (preg_match($regexps['qMarkDotNum'], $str)) {
-                $str = preg_replace($regexps['qMarkDotNum'], '≥0$1', $str, -1, $count6);
+            if (preg_match($regexps['qMarkDotNum'], $currentStr)) {
+                $currentStr = preg_replace($regexps['qMarkDotNum'], '≥0$1', $currentStr);
             }

-            // 4. 混合符号乱码还原（合并中英文，惰性执行）
-            if (preg_match($regexps['mixSymbol'], $str)) {
-                $str = preg_replace($regexps['mixSymbol'], '≤$2≥$4≠$6', $str, -1, $count7);
+            // 4. "?、?、?123" style runs -> "≤≥≠123"
+            if (preg_match($regexps['mixSymbol'], $currentStr)) {
+                $currentStr = preg_replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr);
             }

-            // 5. ≤、≠专属标识还原（合并正则，惰性执行）
-            if (preg_match($regexps['leNeMark'], $str)) {
-                $str = preg_replace_callback($regexps['leNeMark'], function ($m) {
-                    return $m[1] === 'LE' ? '≤' . $m[2] : '≠' . $m[2];
-                }, $str, -1, $count8);
+            // 5. LE?123 -> ≤123, NE?456 -> ≠456 (marker compared case-insensitively)
+            if (preg_match($regexps['leNeMark'], $currentStr)) {
+                $currentStr = preg_replace_callback($regexps['leNeMark'], function ($m) {
+                    return strtoupper($m[1]) === 'LE' ? '≤' . $m[2] : '≠' . $m[2];
+                }, $currentStr);
             }

-            // 6. 修复前缀"d with "乱码（惰性执行）
-            if (strpos($str, 'd with ') !== false) {
-                $str = str_replace('d with ', 'd with ', $str, $count9);
-            }
+            // NOTE(review): step 3 rewrites every "?digits" before steps 4 and 5 run, so their patterns cannot match — confirm the intended order.

-            // ========== 变化判断（合并计数，减少运算） ==========
-            $totalCount = $countCtrl + $countBin + $countEnt + $countGbk + $countRepeat +
-                          $count1 + $count2 + $count3 + $count4 + $count5 + $count6 +
-                          $count7 + $count8 + $count9;
+            // ========== Change detection: compare against the string this pass started with ==========
+            $hasChange = ($currentStr !== $prevStr);

-            if ($totalCount > 0 || $str !== $prevStr) {
-                $hasChange = true;
-                $original = $str;
-            }
+        } while ($depth < $maxDepth && $hasChange);

-            // 提前终止：无变化则退出循环
-            if (!$hasChange) {
-                break;
-            }
+        // Final clean-up: strip leading/trailing colons, then apply the entity table once more
+        $currentStr = trim($currentStr, ':');
+        $currentStr = strtr($currentStr, $maps['htmlEntity']);

-        } while ($depth < $maxDepth);
-
-        // 最终清理+兜底替换
-        $str = trim($str, ':');
-        $str = strtr($str, $maps['htmlEntity']);
-
-        return $str;
+        return $currentStr;
     }

    // private function fullDecode($str, $maxDepth = 5) {