测试问题修改

2025-12-02 15:20:51 +08:00
parent 5daf18608b
commit 90884273e0
1 changed files with 497 additions and 132 deletions
--- a/application/common/ArticleParserService.php
+++ b/application/common/ArticleParserService.php
@@ -14,7 +14,7 @@ class ArticleParserService
 {
    private $phpWord;
    private $sections;
-
+    private $iNum = 0;
    public function __construct($filePath = '')
    {
        if (!file_exists($filePath)) {
@@ -553,7 +553,7 @@ class ArticleParserService
            if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) {
                $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
            }
-            $aCompany[$number] = $institution;
+            $aCompany[$number] = empty($institution) ? '' : trim(trim($institution),'.');
        }
        return $aCompany;
    }
@@ -581,6 +581,7 @@ class ArticleParserService
            $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
        }
        $corrText = $this->fullDecode($corrText);
+
        // // 调试
        // file_put_contents(ROOT_PATH . 'runtime/corr_text_raw.log', $corrText);

@@ -605,24 +606,25 @@ class ArticleParserService
            $aCorresponding[] = [
                'name' => $sName,
                'email' => isset($email[2]) ? trim($email[2]) : '',
-                'postal_address' => isset($address[2]) ? trim($address[2]) : '',
+                'postal_address' => isset($address[2]) ? trim(trim($address[2]),'.') : '',
                'tel' => isset($tel[2]) ? trim($tel[2]) : ''
            ];
        }
        if(empty($aCorresponding)){
-            $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s';
+            // $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s';
+            $pattern = '/(Corresponding Authors|Correspondence to|Correspondence)\s*:\s*([\s\S]+?)(?=\n\s*\n|$|;)/is';
            $corrText = trim($corrText,'*');
            preg_match($pattern, $corrText, $match);
-            if (!empty($match[1])) {
-                $corrContent = $match[1];
+            if (!empty($match[2])) {
+                $corrContent = $match[2];
                // 提取每个作者的名称和邮箱（优化正则，支持更多字符）
                $authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/';
                preg_match_all($authorPattern, $corrContent, $authors);
                if(!empty($authors[1])){
                    for ($i = 0; $i < count($authors[1]); $i++) {
                        $aCorresponding[] = [
-                            'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]),
-                            'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i])
+                            'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'),
+                            'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.')
                        ];
                    }
                }
@@ -631,8 +633,8 @@ class ArticleParserService
                    preg_match_all($authorPattern, $corrContent, $authors);
                    for ($i = 0; $i < count($authors[1]); $i++) {
                        $aCorresponding[] = [
-                            'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]),
-                            'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i])
+                            'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'),
+                            'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.')
                        ];
                    }
                }
@@ -734,84 +736,293 @@ class ArticleParserService
    }

    // 统一提取元素文本
-    private function getTextFromElement($element,$lineNumber = 0){
+    private function getTextFromElement(\PhpOffice\PhpWord\Element\AbstractElement $element, int $lineNumber = 0){
         $text = '';
-        // 处理PreserveText元素
+
+        // Static map of typographic quotes to their ASCII equivalents (initialised once, reused across calls).
+        static $specialQuotesMap = [
+            '’' => "'",  // right single quotation mark (U+2019) -> apostrophe (U+0027)
+            '‘' => "'",  // left single quotation mark (U+2018) -> apostrophe (U+0027)
+            '“' => '"',  // left double quotation mark (U+201C) -> quotation mark (U+0022)
+            '”' => '"',  // right double quotation mark (U+201D) -> quotation mark (U+0022)
+            '„' => '"',  // double low-9 quotation mark (U+201E) -> quotation mark (U+0022)
+            '‟' => '"',  // double high-reversed-9 quotation mark (U+201F) -> quotation mark (U+0022)
+        ];
+
+        // Title elements: getText() is handled both as a TextRun (recursed into) and as a plain value (cast to string).
+        if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
+            $titleContent = $element->getText();
+            $titleText = '';
+
+            if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
+                $titleText = $this->getTextFromElement($titleContent);
+            } else {
+                $titleText = strtr((string)$titleContent, $specialQuotesMap);
+            }
+
+            $text .= $titleText . ' ';
+            return $this->cleanText($text);
+        }
+
+        // List items: prefix the text with a running counter kept in $this->iNum (it is never reset inside this method).
+        if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
+            $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
+            $this->iNum++;
+            $text .= $this->iNum . ' ';
+        }
+
+        // PreserveText: read the "text" property through reflection and pull e-mail addresses out of HYPERLINK parts.
         if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
-            // 通过反射获取私有属性 text
-            $reflection = new \ReflectionClass($element);
-            $property = $reflection->getProperty('text');
-            $property->setAccessible(true);
-            $textParts = $property->getValue($element);
+            try {
+                $reflection = new \ReflectionClass($element);
+                // Return whatever text was collected so far if the class has no "text" property.
+                if (!$reflection->hasProperty('text')) {
+                    return $this->cleanText($text);
+                }
+                $property = $reflection->getProperty('text');
+                $property->setAccessible(true);
+                $textParts = $property->getValue($element) ?? [];
+            } catch (\ReflectionException $e) {
+                return $this->cleanText($text);
+            }
+
             foreach ($textParts as $part) {
+                $part = (string)$part;
                 if (strpos($part, 'HYPERLINK') !== false) {
-                    // 解码 HTML 实体（&quot; -> "）
-                    $decoded = html_entity_decode($part);
-                    // 提取 mailto: 后的邮箱
-                    if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
+                    $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
+                    // Capture the address after "mailto:"; the final domain label is limited to 2-10 ASCII letters.
+                    if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
                         $text .= $match[1] . ' ';
                     }
                 } else {
-                    // 普通文本直接拼接
+                    $part = strtr($part, $specialQuotesMap);
                     $text .= $part;
                 }
             }
-            return $text;
+            return $this->cleanText($text);
         }
-        // 处理表格和单元格（E-mail可能在表格中）
+
+        // Tables: concatenate the text of every cell, each followed by a space; cleanText() collapses repeated whitespace.
         if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
             foreach ($element->getRows() as $row) {
                 foreach ($row->getCells() as $cell) {
-                    $text .= $this->getTextFromElement($cell);
+                    $text .= $this->getTextFromElement($cell) . ' ';
                 }
+                // No explicit row separator is added here; every cell already ends with a space.
             }
-            return $text;
+            return $this->cleanText($text);
         }
+
+        // Cells: recurse into every child element.
         if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
             foreach ($element->getElements() as $child) {
                 $text .= $this->getTextFromElement($child);
             }
-            return $text;
+            return $this->cleanText($text);
         }

-        //处理嵌套元素（递归提取所有子元素）
-        if (method_exists($element, 'getElements')) {
+        // Container elements: recurse into children, skipping anything that is not an AbstractElement.
+        if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
             foreach ($element->getElements() as $child) {
-                $text .= $this->getTextFromElement($child);
+                if ($child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
+                    $text .= $this->getTextFromElement($child);
+                }
             }
         }

-        //处理文本元素（包括带格式的文本）
+        // Text elements: append the text with typographic quotes normalised.
         if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
-            $text .= $element->getText();
+            $textPart = (string)$element->getText(); // explicit cast so strtr() always receives a string
+            $textPart = strtr($textPart, $specialQuotesMap);
+            $text .= $textPart;
         }

-        //处理超链接（优先提取链接目标，可能是邮箱）
+        // Links: emit the mailto: address first (when the target starts with it), then the visible link text.
         if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
-            $target = $element->getTarget();
+            $target = (string)$element->getTarget();
             if (strpos($target, 'mailto:') === 0) {
-                $text .= str_replace('mailto:', '', $target) . ' '; // 剥离mailto:前缀
+                $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
             }
-            $text .= $element->getText() . ' ';
+            $linkText = strtr((string)$element->getText(), $specialQuotesMap);
+            $text .= $linkText . ' ';
         }

-        //处理字段和注释（可能包含隐藏邮箱）
+        // Fields and notes: append getContent() cast to string, followed by a space.
         if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
-            $text .= $element->getContent() . ' ';
+            $text .= (string)$element->getContent() . ' ';
         }
         if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
-            $text .= $element->getContent() . ' ';
+            $text .= (string)$element->getContent() . ' ';
         }
-        //清理所有不可见字符（关键：移除格式干扰）
-        $text = preg_replace('/[\x00-\x1F\x7F-\x9F]/', ' ', $text); // 移除控制字符
-        $text = str_replace(["\t", "\r", "\n"], ' ', $text); // 统一空白字符
-        $text = preg_replace('/\s+/', ' ', $text); // 合并多个空格
-        if(!empty($text) && !mb_check_encoding($text, 'UTF-8')){
-            $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
-        }
-        return $text;
+
+        return $this->cleanText($text);
     }

+    /**
+     * Normalise extracted text: ensure UTF-8, blank out control characters and collapse whitespace.
+     * @param string $text Text to clean
+     * @return string Cleaned text (leading and trailing spaces are not trimmed)
+     */
+    private function cleanText(string $text){
+
+        // Convert to UTF-8 only when the input is not already valid UTF-8.
+        if (!mb_check_encoding($text, 'UTF-8')) {
+            $text = mb_convert_encoding(
+                $text,
+                'UTF-8',
+                'GBK,GB2312,GB18030,Big5,ISO-8859-1,CP1252,UTF-16,UTF-32' // candidate source encodings handed to mb_convert_encoding()
+            );
+        }
+        // Replace C0/C1 control characters with a space. NOTE(review): preg_replace() returns null on a PCRE error (e.g. invalid UTF-8 under /u); that is not handled here.
+        $text = preg_replace('/[\x00-\x1F\x7F-\x9F]/u', ' ', $text);
+        
+        // Map special space characters to a plain space (tab/CR/LF are already covered by the control-character pass).
+        $text = str_replace([
+            "\t", "\r", "\n",
+            chr(0xC2) . chr(0xA0), // no-break space (U+00A0) as UTF-8 bytes
+            '　', // ideographic (full-width) space (U+3000)
+            chr(0xE2) . chr(0x80) . chr(0xAF), // narrow no-break space (U+202F) as UTF-8 bytes
+        ], ' ', $text);
+        
+        // Collapse runs of whitespace into a single space.
+        $text = preg_replace('/\s+/u', ' ', $text);
+        
+        return $text;
+    }
+    // private function getTextFromElement($element, $lineNumber = 0){
+    //     // 初始化默认空字符串（保持原有逻辑）
+    //     $text = '';
+
+    //     // 1. 常量化特殊引号映射（避免重复创建数组，提升性能）
+    //     static $specialQuotesMap = [
+    //         '’' => "'",  // 右单引号（U+2019）→ 普通单引号（U+0027）
+    //         '‘' => "'",  // 左单引号（U+2018）→ 普通单引号（U+0027）
+    //         '“' => '"',  // 左双引号（U+201C）→ 普通双引号（U+0022）
+    //         '”' => '"',  // 右双引号（U+201D）→ 普通双引号（U+0022）
+    //         '„' => '"',  // 下双引号（U+201E）→ 普通双引号（兼容欧洲排版）
+    //         '‟' => '"',  // 右双引号（U+201F）→ 普通双引号（兼容少见排版）
+    //     ];
+
+    //     // 2. 提前校验元素合法性（避免后续 instanceof 无效判断，减少报错）
+    //     if (!is_object($element) || !$element instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
+    //         return $text;
+    //     }
+
+    //     // 支持H1标题格式（逻辑不变，优化变量命名可读性）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
+    //         $titleContent = $element->getText();
+    //         $titleText = '';
+
+    //         // 关键修复：判断返回类型，递归提取文本（逻辑不变）
+    //         if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
+    //             $titleText = $this->getTextFromElement($titleContent);
+    //         } else {
+    //             $titleText = strtr((string)$titleContent, $specialQuotesMap);
+    //         }
+
+    //         $text .= $titleText . ' ';
+    //         return $text;
+    //     }
+
+    //     // 项目编号（逻辑不变，优化空值判断为严格判断）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
+    //         $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
+    //         $this->iNum++;
+    //         $text .= $this->iNum . ' ';
+    //     }
+
+    //     // 处理PreserveText元素（核心逻辑不变，增强容错性）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
+    //         try {
+    //             $reflection = new \ReflectionClass($element);
+    //             $property = $reflection->getProperty('text');
+    //             $property->setAccessible(true);
+    //             // 空值兜底，避免遍历非数组报错
+    //             $textParts = $property->getValue($element) ?? [];
+    //         } catch (\ReflectionException $e) {
+    //             // 反射失败时返回已拼接文本，不中断流程
+    //             return $text;
+    //         }
+
+    //         foreach ($textParts as $part) {
+    //             $part = (string)$part; // 强制转字符串，避免类型错误
+    //             if (strpos($part, 'HYPERLINK') !== false) {
+    //                 $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
+    //                 // 邮箱正则不变，保持原有匹配逻辑
+    //                 if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
+    //                     $text .= $match[1] . ' ';
+    //                 }
+    //             } else {
+    //                 $text .= $part;
+    //             }
+    //         }
+    //         return $text;
+    //     }
+
+    //     // 处理表格和单元格（逻辑不变，优化循环变量命名）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
+    //         foreach ($element->getRows() as $row) {
+    //             foreach ($row->getCells() as $cell) {
+    //                 $text .= $this->getTextFromElement($cell);
+    //             }
+    //         }
+    //         return $text;
+    //     }
+
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
+    //         foreach ($element->getElements() as $child) {
+    //             $text .= $this->getTextFromElement($child);
+    //         }
+    //         return $text;
+    //     }
+
+    //     // 处理嵌套元素（逻辑不变，增强方法存在性校验）
+    //     if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
+    //         foreach ($element->getElements() as $child) {
+    //             // 双重校验，避免非元素对象传入
+    //             if (is_object($child) && $child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
+    //                 $textPart = $this->getTextFromElement($child);
+    //                 $text .= $textPart;
+    //             }
+    //         }
+    //     }
+
+    //     // 处理文本元素（逻辑不变，保持特殊引号替换）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
+    //         $textPart = (string)$element->getText(); // 强制转字符串，避免空值
+    //         $textPart = strtr($textPart, $specialQuotesMap);
+    //         $text .= $textPart;
+    //     }
+
+    //     // 处理超链接（逻辑不变，优化变量类型转换）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
+    //         $target = (string)$element->getTarget();
+    //         if (strpos($target, 'mailto:') === 0) {
+    //             $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
+    //         }
+    //         $linkText = strtr((string)$element->getText(), $specialQuotesMap);
+    //         $text .= $linkText . ' ';
+    //     }
+
+    //     // 处理字段和注释（逻辑不变，增加类型转换，避免非字符串拼接）
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
+    //         $text .= (string)$element->getContent() . ' ';
+    //     }
+    //     if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
+    //         $text .= (string)$element->getContent() . ' ';
+    //     }
+
+    //     // 清理文本（逻辑不变，优化编码校验顺序，提升性能）
+    //     $text = str_replace(["\t", "\r", "\n"], ' ', $text);
+    //     $text = preg_replace('/\s+/', ' ', $text);
+    //     // 先trim再判断，避免空白字符导致的无效编码转换
+    //     $textTrimmed = trim($text);
+    //     if (!empty($textTrimmed) && !mb_check_encoding($textTrimmed, 'UTF-8')) {
+    //         $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
+    //     }
+
+    //     return $text;
+    // }
    /**
     * 从 Word 文档提取摘要和关键词
     * @return array 提取结果
@@ -940,106 +1151,260 @@ class ArticleParserService
            ]
        ];
    }
-    private function fullDecode($str, $maxDepth = 5) {
-        // 空值/深度为0，直接返回（提前终止，避免无效操作）
-        if (empty($str) || $maxDepth <= 0) {
-            return $str;
-        }
+    /**
+     * Decode \uXXXX escapes and HTML entities, then apply symbol repairs for ≤ ≥ ≠, for at most $maxDepth passes.
+     * @param string $str String to decode (null yields '', a blank string or $maxDepth <= 0 yields the trimmed input)
+     * @param int $maxDepth Maximum number of decode passes
+     * @return string Decoded string; if any Throwable is raised the trimmed original input is returned
+     */
+    private function fullDecode($str = '', int $maxDepth = 2){
+        try {
+            if ($str === null || trim((string)$str) === '' || $maxDepth <= 0) {
+                return $str === null ? '' : trim((string)$str);
+            }

-        // 【性能优化1：预编译所有正则表达式】避免每次循环重新解析正则
-        // 预编译：≥专属场景正则
-        $regOb0 = '/0B\s*\?0/';
-        $regDl18 = '/DL\s*\?.18/';
-        // 预编译：≥通用场景正则
-        $regQMarkNum = '/\?(\d+)/';
-        $regQMarkDotNum = '/\?(\.\d+)/';
-        // 预编译：≤、≠空格修复正则
-        $regNeNum = '/≠\s*(\d+)/';
-        $regLeNum = '/≤\s*(\d+)/';
-        // 预编译：混合符号乱码正则（中文顿号/英文逗号）
-        $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/';
-        $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/';
-        // 预编译：≤、≠专属标识正则
-        $regLeMark = '/LE\s*\?(\d+)/';
-        $regNeMark = '/NE\s*\?(\d+)/';
-        // 预编译：Unicode转义正则（提取到外部，避免闭包重复创建）
-        $regUnicode = '/\\\\u([0-9a-fA-F]{4})/';
+            $str = (string)$str;

-        // 【性能优化2：预定义常量/映射】避免循环内重复创建数组/字符串
-        // HTML实体映射（一次性定义，避免循环内重复赋值）
-        $htmlEntityMap = [
-            '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤',
-            '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥',
-            '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠',
-        ];
-        // 不间断空格替换数组
-        $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)];
-        // Unicode回调函数（预定义，避免循环内重复创建闭包）
-        $unicodeCallback = function ($m) {
-            return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
-        };
+            // First pass over \uXXXX escapes: use decodeUnicode() when this class defines it, otherwise the inline fallback.
+            if (method_exists($this, 'decodeUnicode')) {
+                $str = $this->decodeUnicode($str);
+            } else {
+                $str = preg_replace_callback(
+                    '/\\\\[uU]([0-9a-fA-F]{4})/',
+                    function ($m) {
+                        $code = hexdec($m[1]);
+                        return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0];
+                    },
+                    $str
+                );
+            }

-        $original = $str;
-        $depth = 0;
-        $hasChange = false; // 标记是否有变化，提前终止循环
+            // Regex patterns used below. NOTE(review): 'ob0' and 'dl18' are defined but never referenced in this method.
+            $regexps = [
+                'ob0' => '/0B\s*\\?0/',
+                'dl18' => '/DL\s*\\?\.18/',
+                'qMarkNum' => '/\\?(\d+)/',
+                'qMarkDotNum' => '/\\?(\.\d+)/',
+                'neNum' => '/≠\s*(\d+)/u',
+                'leNum' => '/≤\s*(\d+)/u',
+                'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
+                'leNeMark' => '/(LE|NE)\s*\\?(\d+)/i',
+                'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/',
+                'wordBin' => '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i',
+                'wordEntity' => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i',
+                'repeatSymbol' => '/(≤|≥|≠)\1+/u',
+                'gbkSymbol' => '/\xA1\xF2|\xA1\xF3|\xA1\xF0/'
+            ];

-        // 循环解码：仅在有变化且未达最大深度时执行
-        do {
-            $depth++;
+            // Replacement maps used below.
+            $maps = [
+                'htmlEntity' => [
+                    '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', '&#X2264;' => '≤',
+                    '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', '&#X2265;' => '≥',
+                    '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '&#X2260;' => '≠',
+                    '&le' => '≤', '&ge' => '≥', '&ne' => '≠',
+                    '&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠',
+                    '&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠',
+                    '&#60;' => '≤', '&#62;' => '≥', // NOTE(review): &#60; and &#62; are the numeric entities for "<" and ">" — confirm this mapping is intended
+                ],
+                'wordBin' => [
+                    "\xE2\x89\x86" => '≤', "\xE2\x89\x87" => '≥', "\xE2\x89\x80" => '≠',
+                    "\xe2\x89\x86" => '≤', "\xe2\x89\x87" => '≥', "\xe2\x89\x80" => '≠',
+                    'e28986' => '≤', '\xe2\x89\x86' => '≤', '\xe20x890x86' => '≤',
+                    'e28987' => '≥', '\xe2\x89\x87' => '≥', '\xe20x890x87' => '≥',
+                    'e28980' => '≠', '\xe2\x89\x80' => '≠', '\xe20x890x80' => '≠',
+                ],
+                'wordEntity' => ['2264' => '≤', '2265' => '≥', '2260' => '≠'],
+                'gbkSymbol' => ["\xA1\xF2" => '≤', "\xA1\xF3" => '≥', "\xA1\xF0" => '≠'],
+            ];
+
+            $unicodeCallback = function ($m) {
+                $code = hexdec($m[1]);
+                return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0];
+            };
+
+            $depth = 0;
             $hasChange = false;
-            $prevStr = $str; // 保存当前状态，用于判断变化
+            $currentStr = $str;

-            // 1. 解码Unicode转义（\uXXXX格式）
-            $str = $this->decodeUnicode($str);
+            // Repeat the decode pass until a pass changes nothing or $maxDepth passes have run.
+            do {
+                $depth++;
+                $hasChange = false;
+                $prevStr = $currentStr;

-            // 2. 解码HTML实体（先替换专属实体，再执行通用解码）
-            $str = strtr($str, $htmlEntityMap); // 高性能替换（strtr比str_replace快）
-            $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
+                // Decode \uXXXX / \UXXXX escapes.
+                $currentStr = preg_replace_callback($regexps['unicode'], $unicodeCallback, $currentStr);

-            // 3. 再次处理遗漏的Unicode转义（使用预编译正则+预定义回调）
-            $str = preg_replace_callback($regUnicode, $unicodeCallback, $str);
+                // Replace the mapped entities first, then decode any remaining HTML entities.
+                $currentStr = strtr($currentStr, $maps['htmlEntity']);
+                $currentStr = html_entity_decode(
+                    $currentStr,
+                    ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
+                    'UTF-8'
+                );

-            // 4. 替换不间断空格为普通空格（strtr比str_replace更高效）
-            $str = str_replace($nbspReplace, ' ', $str);
+                // Symbol repairs. NOTE(review): when 'wordBin' matches, ALL spaces are stripped from the whole string before replacing.
+                if (preg_match($regexps['wordBin'], $currentStr)) {
+                    $tempStr = str_replace(' ', '', $currentStr);
+                    $currentStr = str_ireplace(array_keys($maps['wordBin']), $maps['wordBin'], $tempStr);
+                }
+                if (preg_match($regexps['wordEntity'], $currentStr)) {
+                    $currentStr = preg_replace_callback(
+                        $regexps['wordEntity'],
+                        function ($m) use ($maps) {
+                            return $maps['wordEntity'][$m[1]] ?? $m[0];
+                        },
+                        $currentStr
+                    );
+                }
+                if (preg_match($regexps['gbkSymbol'], $currentStr)) {
+                    $currentStr = strtr($currentStr, $maps['gbkSymbol']);
+                }
+                if (preg_match($regexps['repeatSymbol'], $currentStr)) {
+                    $currentStr = preg_replace($regexps['repeatSymbol'], '$1', $currentStr);
+                }

-            // 5. 核心替换逻辑（优化执行顺序，避免覆盖）
-            // 5.1 原有≥专属场景（保留）
-            $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1);
-            $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2);
-            // 5.2 ≤、≠空格修复（保留）
-            $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3);
-            $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4);
-            // 5.3 原有≥通用场景（保留）
-            $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5);
-            $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6);
-            // 5.4 混合符号乱码还原（保留）
-            $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7);
-            $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8);
-            // 5.5 ≤、≠专属标识还原（保留）
-            $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9);
-            $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10);
+                // Pattern repairs: drop whitespace after ≠/≤, and rewrite "?<digits>" as "≥<digits>" and "?.<digits>" as "≥0.<digits>".
+                if (preg_match($regexps['neNum'], $currentStr)) {
+                    $currentStr = preg_replace($regexps['neNum'], '≠$1', $currentStr);
+                }
+                if (preg_match($regexps['leNum'], $currentStr)) {
+                    $currentStr = preg_replace($regexps['leNum'], '≤$1', $currentStr);
+                }
+                if (preg_match($regexps['qMarkNum'], $currentStr)) {
+                    $currentStr = preg_replace($regexps['qMarkNum'], '≥$1', $currentStr);
+                }
+                if (preg_match($regexps['qMarkDotNum'], $currentStr)) {
+                    $currentStr = preg_replace($regexps['qMarkDotNum'], '≥0$1', $currentStr);
+                }
+                if (preg_match($regexps['mixSymbol'], $currentStr)) {
+                    $currentStr = preg_replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr);
+                }
+                if (preg_match($regexps['leNeMark'], $currentStr)) {
+                    $currentStr = preg_replace_callback(
+                        $regexps['leNeMark'],
+                        function ($m) {
+                            return strtoupper($m[1]) === 'LE' ? '≤' . $m[2] : '≠' . $m[2];
+                        },
+                        $currentStr
+                    );
+                }

-            // 5.6 修复前缀"d with "乱码（保留）
-            $str = str_replace('d with ', 'd with ', $str, $count11);
+                $hasChange = ($currentStr !== $prevStr);
+            } while ($depth < $maxDepth && $hasChange);

-            // 【性能优化3：统计所有替换次数，判断是否有变化】
-            $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 +
-                          $count7 + $count8 + $count9 + $count10 + $count11;
-            if ($totalCount > 0 || $str !== $prevStr) {
-                $hasChange = true;
-                $original = $str;
-            }
+            // Final clean-up: strip leading/trailing colons and re-apply the entity map.
+            $currentStr = trim($currentStr, ':');
+            $currentStr = strtr($currentStr, $maps['htmlEntity']);

-            // 【性能优化4：提前终止】单次循环无变化，直接退出
-            if (!$hasChange) {
-                break;
-            }
+            return $currentStr;

-        } while ($depth < $maxDepth); // 改用do-while，减少循环判断次数
-
-        // 最终清理：仅执行一次trim
-        return trim($str, ':');
+        } catch (\Throwable $e) {
+            return trim((string)$str); // any error: fall back to the trimmed input
+        }
     }
+
+    // private function fullDecode($str, $maxDepth = 5) {
+    //     // 空值/深度为0，直接返回（提前终止，避免无效操作）
+    //     if (empty($str) || $maxDepth <= 0) {
+    //         return $str;
+    //     }
+
+    //     // 【性能优化1：预编译所有正则表达式】避免每次循环重新解析正则
+    //     // 预编译：≥专属场景正则
+    //     $regOb0 = '/0B\s*\?0/';
+    //     $regDl18 = '/DL\s*\?.18/';
+    //     // 预编译：≥通用场景正则
+    //     $regQMarkNum = '/\?(\d+)/';
+    //     $regQMarkDotNum = '/\?(\.\d+)/';
+    //     // 预编译：≤、≠空格修复正则
+    //     $regNeNum = '/≠\s*(\d+)/';
+    //     $regLeNum = '/≤\s*(\d+)/';
+    //     // 预编译：混合符号乱码正则（中文顿号/英文逗号）
+    //     $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/';
+    //     $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/';
+    //     // 预编译：≤、≠专属标识正则
+    //     $regLeMark = '/LE\s*\?(\d+)/';
+    //     $regNeMark = '/NE\s*\?(\d+)/';
+    //     // 预编译：Unicode转义正则（提取到外部，避免闭包重复创建）
+    //     $regUnicode = '/\\\\u([0-9a-fA-F]{4})/';
+
+    //     // 【性能优化2：预定义常量/映射】避免循环内重复创建数组/字符串
+    //     // HTML实体映射（一次性定义，避免循环内重复赋值）
+    //     $htmlEntityMap = [
+    //         '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤',
+    //         '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥',
+    //         '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠',
+    //     ];
+    //     // 不间断空格替换数组
+    //     $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)];
+    //     // Unicode回调函数（预定义，避免循环内重复创建闭包）
+    //     $unicodeCallback = function ($m) {
+    //         return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
+    //     };
+
+    //     $original = $str;
+    //     $depth = 0;
+    //     $hasChange = false; // 标记是否有变化，提前终止循环
+
+    //     // 循环解码：仅在有变化且未达最大深度时执行
+    //     do {
+    //         $depth++;
+    //         $hasChange = false;
+    //         $prevStr = $str; // 保存当前状态，用于判断变化
+
+    //         // 1. 解码Unicode转义（\uXXXX格式）
+    //         $str = $this->decodeUnicode($str);
+
+    //         // 2. 解码HTML实体（先替换专属实体，再执行通用解码）
+    //         $str = strtr($str, $htmlEntityMap); // 高性能替换（strtr比str_replace快）
+    //         $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
+
+    //         // 3. 再次处理遗漏的Unicode转义（使用预编译正则+预定义回调）
+    //         $str = preg_replace_callback($regUnicode, $unicodeCallback, $str);
+
+    //         // 4. 替换不间断空格为普通空格（strtr比str_replace更高效）
+    //         $str = str_replace($nbspReplace, ' ', $str);
+
+    //         // 5. 核心替换逻辑（优化执行顺序，避免覆盖）
+    //         // 5.1 原有≥专属场景（保留）
+    //         $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1);
+    //         $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2);
+    //         // 5.2 ≤、≠空格修复（保留）
+    //         $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3);
+    //         $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4);
+    //         // 5.3 原有≥通用场景（保留）
+    //         $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5);
+    //         $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6);
+    //         // 5.4 混合符号乱码还原（保留）
+    //         $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7);
+    //         $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8);
+    //         // 5.5 ≤、≠专属标识还原（保留）
+    //         $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9);
+    //         $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10);
+
+    //         // 5.6 修复前缀"d with "乱码（保留）
+    //         $str = str_replace('d with ', 'd with ', $str, $count11);
+
+    //         // 【性能优化3：统计所有替换次数，判断是否有变化】
+    //         $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 +
+    //                       $count7 + $count8 + $count9 + $count10 + $count11;
+    //         if ($totalCount > 0 || $str !== $prevStr) {
+    //             $hasChange = true;
+    //             $original = $str;
+    //         }
+
+    //         // 【性能优化4：提前终止】单次循环无变化，直接退出
+    //         if (!$hasChange) {
+    //             break;
+    //         }
+
+    //     } while ($depth < $maxDepth); // 改用do-while，减少循环判断次数
+
+    //     // 最终清理：仅执行一次trim
+    //     return trim($str, ':');
+    // }
    // private function fullDecode($str, $maxDepth = 5) {
    //     if (empty($str) || $maxDepth <= 0) {
    //         return $str;