代码修改

2025-12-02 14:26:53 +08:00
parent 705dce5e94
commit f15d072b2e
1 changed files with 138 additions and 203 deletions
--- a/application/common/ArticleParserService.php
+++ b/application/common/ArticleParserService.php
@@ -1152,222 +1152,157 @@ class ArticleParserService
        ];
    }
    /**
-     * Core decode method (no static cache, high-performance version)
+     * Core decode method
     * @param string $str string to decode
     * @param int $maxDepth maximum decode depth (upper bound on decode-loop passes)
     * @return string
     */
-   private function fullDecode(?string $str, int $maxDepth = 2){
+    private function fullDecode($str = '', int $maxDepth = 2){
-        // Null / whitespace-only input or non-positive depth: return immediately without decoding
+        try {
-        if ($str === null || trim((string)$str) === '' || $maxDepth <= 0) {
+            if ($str === null || trim((string)$str) === '' || $maxDepth <= 0) {
-            return $str === null ? '' : trim((string)$str);
+                return $str === null ? '' : trim((string)$str);
        }
        // Force the input to a string (tolerates non-string input)
        $str = (string)$str;
        // Decode Unicode escapes first so they do not interfere with later matching
        $str = $this->decodeUnicode($str);
        // ========== Regex pattern table (defined once, ahead of the loop) ==========
        $regexps = [
            // Case-specific patterns: \s* allows any whitespace; the question mark is escaped so it is literal
            'ob0' => '/0B\s*\\?0/',          // matches 0B?0, 0B  ?0 and similar
            'dl18' => '/DL\s*\\?\.18/',      // matches DL?.18 exactly (does not match DL?x.18)
            // Generic patterns: the question mark is escaped so only a literal "?" matches
            'qMarkNum' => '/\\?(\d+)/',       // matches ?123, ?45, etc.
            'qMarkDotNum' => '/\\?(\.\d+)/',  // matches ?.18, ?.25, etc.
            // ≤ / ≠ spacing repair: allows any whitespace between the symbol and the digits
            'neNum' => '/≠\s*(\d+)/u',
            'leNum' => '/≤\s*(\d+)/u',
            // Mixed-symbol garble: non-capturing groups for the separators, one capture for the digits
            'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
            // LE / NE markers: case-insensitive (covers LE/le/NE/ne)
            'leNeMark' => '/(LE|NE)\s*\\?(\d+)/i',
            // Unicode escapes: accepts both \u and \U prefixes followed by four hex digits
            'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/',
            // Word binary garble: a single non-capturing alternation
            'wordBin' => '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i',
            // Malformed Word XML entities: tolerates a missing semicolon and embedded whitespace
            'wordEntity' => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i',
            // Invisible control characters (u modifier set); NOTE(review): the class 0x00-0x1F also covers tab, LF and CR
            'controlChar' => '/[\x00-\x1F\x7F]/u',
            // Repeated-symbol collapse via a backreference (≤, ≥, ≠)
            'repeatSymbol' => '/(≤|≥|≠)\1+/u',
            // GBK-encoded garble: matches the raw bytes
            'gbkSymbol' => '/\xA1\xF2|\xA1\xF3|\xA1\xF0/'
        ];
        // ========== Replacement maps ==========
        $maps = [
            // HTML entity map, including entities whose trailing semicolon is missing
            'htmlEntity' => [
                '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', '&#X2264;' => '≤',
                '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', '&#X2265;' => '≥',
                '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '&#X2260;' => '≠',
                '&le' => '≤', '&ge' => '≥', '&ne' => '≠',  // entities without semicolon
                '&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠',  // numeric entities without semicolon
                '&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠',  // hex entities without semicolon
                '&#60;' => '≤', '&#62;' => '≥',  // business-specific mapping (kept)
            ],
            // Space variants that are normalised to a plain space
            'nbsp' => [
                chr(0xC2) . chr(0xA0),  // UTF-8 non-breaking space (&nbsp;)
                chr(0xA0),              // Latin-1 non-breaking space
                '　',                    // full-width space (U+3000)
                chr(0x2002),            // meant as U+2002; NOTE(review): chr() reduces its argument modulo 256, so this is byte 0x02
                chr(0x2003),            // meant as U+2003; NOTE(review): actually byte 0x03 for the same reason
                chr(0x2004),            // meant as U+2004; NOTE(review): actually byte 0x04 for the same reason
                chr(0x2005),            // meant as U+2005; NOTE(review): actually byte 0x05 for the same reason
                chr(0x202F),            // meant as U+202F (narrow no-break space); NOTE(review): actually byte 0x2F, i.e. "/"
            ],
            // Binary-garble map: keys are written without spaces
            'wordBin' => [
                'e28986' => '≤',
                '\xe2\x89\x86' => '≤',
                '\xe20x890x86' => '≤',  // key form after spaces have been removed
                'e28987' => '≥',
                '\xe2\x89\x87' => '≥',
                '\xe20x890x87' => '≥',
                'e28980' => '≠',
                '\xe2\x89\x80' => '≠',
                '\xe20x890x80' => '≠',
            ],
            // XML entity code map: only the core numbers
            'wordEntity' => [
                '2264' => '≤',
                '2265' => '≥',
                '2260' => '≠',
            ],
            // GBK byte map: double-quoted so the \x escapes become raw bytes
            'gbkSymbol' => [
                "\xA1\xF2" => '≤',  // raw GBK bytes (the double quotes are what make them raw)
                "\xA1\xF3" => '≥',
                "\xA1\xF0" => '≠',
            ],
        ];
        // Callback for Unicode escapes, created once before the loop
        $unicodeCallback = function ($m) {
            $code = hexdec($m[1]);
            // Code points outside 0x20..0x10FFFF are left as the original matched text
            return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0];
        };
        $depth = 0;
        $hasChange = false;
        $currentStr = $str;
        // Decode loop: repeats only while the text changed and the depth limit has not been reached
        do {
            $depth++;
            $hasChange = false;
            $prevStr = $currentStr;
            // ========== Pre-processing (run only when needed) ==========
            // 1. Strip invisible control characters (only when present)
            if (preg_match($regexps['controlChar'], $currentStr)) {
                $currentStr = preg_replace($regexps['controlChar'], '', $currentStr);
            }
-            // 2. Encoding correction (convert only when the text is not valid UTF-8)
+            $str = (string)$str;
-            if (!mb_check_encoding($currentStr, 'UTF-8')) {
+
-                $converted = mb_convert_encoding(
+            // Unicode decoding
-                    $currentStr,
+            if (method_exists($this, 'decodeUnicode')) {
-                    'UTF-8',
+                $str = $this->decodeUnicode($str);
-                    'GBK,GB2312,ISO-8859-1,CP1252'  // includes CP1252 (Windows Western encoding)
+            } else {
                $str = preg_replace_callback(
                    '/\\\\[uU]([0-9a-fA-F]{4})/',
                    function ($m) {
                        $code = hexdec($m[1]);
                        return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0];
                    },
                    $str
                );
                // Fallback: keep the original text when the converted result is not valid UTF-8
                $currentStr = mb_check_encoding($converted, 'UTF-8') ? $converted : $currentStr;
            }
-            // ========== Core decoding steps (run in priority order) ==========
+            // Regex pattern table
-            // 1. Unicode escape decoding (done first so escapes do not interfere with later matching)
+            $regexps = [
-            $currentStr = preg_replace_callback($regexps['unicode'], $unicodeCallback, $currentStr);
+                'ob0' => '/0B\s*\\?0/',
                'dl18' => '/DL\s*\\?\.18/',
                'qMarkNum' => '/\\?(\d+)/',
                'qMarkDotNum' => '/\\?(\.\d+)/',
                'neNum' => '/≠\s*(\d+)/u',
                'leNum' => '/≤\s*(\d+)/u',
                'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
                'leNeMark' => '/(LE|NE)\s*\\?(\d+)/i',
                'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/',
                'wordBin' => '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i',
                'wordEntity' => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i',
                'repeatSymbol' => '/(≤|≥|≠)\1+/u',
                'gbkSymbol' => '/\xA1\xF2|\xA1\xF3|\xA1\xF0/'
            ];
-            // 2. HTML entity replacement (explicit map first, then decode the remaining entities)
+            // Replacement maps
            $maps = [
                'htmlEntity' => [
                    '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', '&#X2264;' => '≤',
                    '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', '&#X2265;' => '≥',
                    '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '&#X2260;' => '≠',
                    '&le' => '≤', '&ge' => '≥', '&ne' => '≠',
                    '&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠',
                    '&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠',
                    '&#60;' => '≤', '&#62;' => '≥',
                ],
                'wordBin' => [
                    "\xE2\x89\x86" => '≤', "\xE2\x89\x87" => '≥', "\xE2\x89\x80" => '≠',
                    "\xe2\x89\x86" => '≤', "\xe2\x89\x87" => '≥', "\xe2\x89\x80" => '≠',
                    'e28986' => '≤', '\xe2\x89\x86' => '≤', '\xe20x890x86' => '≤',
                    'e28987' => '≥', '\xe2\x89\x87' => '≥', '\xe20x890x87' => '≥',
                    'e28980' => '≠', '\xe2\x89\x80' => '≠', '\xe20x890x80' => '≠',
                ],
                'wordEntity' => ['2264' => '≤', '2265' => '≥', '2260' => '≠'],
                'gbkSymbol' => ["\xA1\xF2" => '≤', "\xA1\xF3" => '≥', "\xA1\xF0" => '≠'],
            ];
            $unicodeCallback = function ($m) {
                $code = hexdec($m[1]);
                return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0];
            };
            $depth = 0;
            $hasChange = false;
            $currentStr = $str;
            // Decode loop
            do {
                $depth++;
                $hasChange = false;
                $prevStr = $currentStr;
                // Unicode escape decoding
                $currentStr = preg_replace_callback($regexps['unicode'], $unicodeCallback, $currentStr);
                //HTML entity replacement
                $currentStr = strtr($currentStr, $maps['htmlEntity']);
                $currentStr = html_entity_decode(
                    $currentStr,
                    ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
                    'UTF-8'
                );
                //  Word special-symbol garble repair
                if (preg_match($regexps['wordBin'], $currentStr)) {
                    $tempStr = str_replace(' ', '', $currentStr);
                    $currentStr = str_ireplace(array_keys($maps['wordBin']), $maps['wordBin'], $tempStr);
                }
                if (preg_match($regexps['wordEntity'], $currentStr)) {
                    $currentStr = preg_replace_callback(
                        $regexps['wordEntity'],
                        function ($m) use ($maps) {
                            return $maps['wordEntity'][$m[1]] ?? $m[0];
                        },
                        $currentStr
                    );
                }
                if (preg_match($regexps['gbkSymbol'], $currentStr)) {
                    $currentStr = strtr($currentStr, $maps['gbkSymbol']);
                }
                if (preg_match($regexps['repeatSymbol'], $currentStr)) {
                    $currentStr = preg_replace($regexps['repeatSymbol'], '$1', $currentStr);
                }
                //Business-specific replacements
                if (preg_match($regexps['neNum'], $currentStr)) {
                    $currentStr = preg_replace($regexps['neNum'], '≠$1', $currentStr);
                }
                if (preg_match($regexps['leNum'], $currentStr)) {
                    $currentStr = preg_replace($regexps['leNum'], '≤$1', $currentStr);
                }
                if (preg_match($regexps['qMarkNum'], $currentStr)) {
                    $currentStr = preg_replace($regexps['qMarkNum'], '≥$1', $currentStr);
                }
                if (preg_match($regexps['qMarkDotNum'], $currentStr)) {
                    $currentStr = preg_replace($regexps['qMarkDotNum'], '≥0$1', $currentStr);
                }
                if (preg_match($regexps['mixSymbol'], $currentStr)) {
                    $currentStr = preg_replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr);
                }
                if (preg_match($regexps['leNeMark'], $currentStr)) {
                    $currentStr = preg_replace_callback(
                        $regexps['leNeMark'],
                        function ($m) {
                            return strtoupper($m[1]) === 'LE' ? '≤' . $m[2] : '≠' . $m[2];
                        },
                        $currentStr
                    );
                }
                $hasChange = ($currentStr !== $prevStr);
            } while ($depth < $maxDepth && $hasChange);
            // Final cleanup
            $currentStr = trim($currentStr, ':');
            $currentStr = strtr($currentStr, $maps['htmlEntity']);
            $currentStr = html_entity_decode(
                $currentStr,
                ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
                'UTF-8'
            );
-            // 3. Normalise every space variant to a plain space
+            return $currentStr;
            $currentStr = str_replace($maps['nbsp'], ' ', $currentStr);
-            // ========== Word special-symbol garble repair (run only when matched) ==========
+        } catch (\Throwable $e) {
-            // 1. Binary-garble restoration (strip spaces first, then replace)
+            return trim((string)$str);
-            if (preg_match($regexps['wordBin'], $currentStr)) {
+        }
                $tempStr = str_replace(' ', '', $currentStr);  // strips spaces to line up with the map keys; NOTE(review): this removes every space in the text
                $currentStr = str_ireplace(array_keys($maps['wordBin']), $maps['wordBin'], $tempStr);
            }
            // 2. Malformed XML entity repair
            if (preg_match($regexps['wordEntity'], $currentStr)) {
                $currentStr = preg_replace_callback($regexps['wordEntity'], function ($m) use ($maps) {
                    return $maps['wordEntity'][$m[1]] ?? $m[0];
                }, $currentStr);
            }
            // 3. GBK garble repair (strtr with the byte map)
            if (preg_match($regexps['gbkSymbol'], $currentStr)) {
                $currentStr = strtr($currentStr, $maps['gbkSymbol']);
            }
            // 4. Collapse repeated symbols (plain preg_replace, no callback)
            if (preg_match($regexps['repeatSymbol'], $currentStr)) {
                $currentStr = preg_replace($regexps['repeatSymbol'], '$1', $currentStr);
            }
            // ========== Business-specific replacements (run only when matched) ==========
            // 1. Case-specific replacements (0B?0 → 0B≥30, DL?.18 → DL≥0.18)
            if (strpos($currentStr, '0B') !== false) {
                $currentStr = preg_replace($regexps['ob0'], '0B≥30', $currentStr);
            }
            if (strpos($currentStr, 'DL') !== false) {
                $currentStr = preg_replace($regexps['dl18'], 'DL≥0.18', $currentStr);
            }
            // 2. ≤ / ≠ spacing repair (removes whitespace between the symbol and the digits)
            if (preg_match($regexps['neNum'], $currentStr)) {
                $currentStr = preg_replace($regexps['neNum'], '≠$1', $currentStr);
            }
            if (preg_match($regexps['leNum'], $currentStr)) {
                $currentStr = preg_replace($regexps['leNum'], '≤$1', $currentStr);
            }
            // 3. Generic replacement (question mark → ≥)
            if (preg_match($regexps['qMarkNum'], $currentStr)) {
                $currentStr = preg_replace($regexps['qMarkNum'], '≥$1', $currentStr);
            }
            if (preg_match($regexps['qMarkDotNum'], $currentStr)) {
                $currentStr = preg_replace($regexps['qMarkDotNum'], '≥0$1', $currentStr);
            }
            // 4. Mixed-symbol garble restoration (e.g. ?、?、?123 → ≤≥≠123)
            if (preg_match($regexps['mixSymbol'], $currentStr)) {
                $currentStr = preg_replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr);
            }
            // 5. LE / NE marker restoration (LE?123 → ≤123, NE?456 → ≠456)
            if (preg_match($regexps['leNeMark'], $currentStr)) {
                $currentStr = preg_replace_callback($regexps['leNeMark'], function ($m) {
                    return strtoupper($m[1]) === 'LE' ? '≤' . $m[2] : '≠' . $m[2];
                }, $currentStr);
            }
            // 6. Redundant code removed (the earlier "d with " replacement was meaningless and was dropped)
            // ========== Change detection ==========
            $hasChange = ($currentStr !== $prevStr);
        } while ($depth < $maxDepth && $hasChange);
        // Final cleanup: trim leading/trailing colons, then apply the entity map once more
        $currentStr = trim($currentStr, ':');
        $currentStr = strtr($currentStr, $maps['htmlEntity']);
        return $currentStr;
    }
    // private function fullDecode($str, $maxDepth = 5) {