代码修改

2025-12-02 14:26:53 +08:00
parent 705dce5e94
commit f15d072b2e
1 changed files with 138 additions and 203 deletions
--- a/application/common/ArticleParserService.php
+++ b/application/common/ArticleParserService.php
@@ -1152,222 +1152,157 @@ class ArticleParserService
        ];
    }
    /**
-     * 核心解码方法（无静态缓存，高性能版）
+     * Core decoder: repeatedly unescapes \uXXXX sequences and HTML entities and rewrites garbled spellings of ≤, ≥ and ≠ in a text fragment.
     * @param string|null $str Text to decode; null yields '', any other value is cast to string
     * @param int $maxDepth Maximum number of decode passes; when <= 0 the input is returned trimmed and undecoded
     * @return string Decoded text, or trim((string)$str) if a Throwable is caught
     */
-   private function fullDecode(?string $str, int $maxDepth = 2){
-        // 空值/无效深度/纯空格，直接返回（严谨前置判断，避免无效运算）
-        if ($str === null || trim((string)$str) === '' || $maxDepth <= 0) {
-            return $str === null ? '' : trim((string)$str);
-        }
-
-        // 确保输入是字符串（兼容非字符串输入场景）
-        $str = (string)$str;
-        // 前置Unicode解码（避免转义字符干扰后续匹配）
-        $str = $this->decodeUnicode($str);
-
-        // ========== 预编译正则（优化匹配精度、避免歧义，仅编译一次） ==========
-        $regexps = [
-            // 专属场景正则：优化空格匹配（任意空白字符）+ 问号转义（避免正则歧义）
-            'ob0' => '/0B\s*\\?0/',          // 匹配 0B?0、0B  ?0 等场景
-            'dl18' => '/DL\s*\\?\.18/',      // 精准匹配 DL?.18（避免误匹配 DL?x.18）
-            // 通用场景正则：问号转义，确保仅匹配字面问号
-            'qMarkNum' => '/\\?(\d+)/',       // 匹配 ?123、?45 等（问号转义）
-            'qMarkDotNum' => '/\\?(\.\d+)/',  // 匹配 ?.18、?.25 等（问号转义）
-            // ≤、≠空格修复：支持任意空白字符（含全角空格）
-            'neNum' => '/≠\s*(\d+)/u',
-            'leNum' => '/≤\s*(\d+)/u',
-            // 混合符号乱码：用非捕获组减少开销，优化分组逻辑
-            'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
-            // ≤、≠专属标识：支持大小写不敏感（覆盖 LE/le/NE/ne）
-            'leNeMark' => '/(LE|NE)\s*\\?(\d+)/i',
-            // Unicode转义：支持 \u/\U 前缀，覆盖更多转义格式
-            'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/',
-            // Word二进制乱码：优化正则结构（非捕获组），避免重复分组
-            'wordBin' => '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i',
-            // Word XML实体异常：优化匹配（支持无分号、空格间隔）
-            'wordEntity' => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i',
-            // 不可见控制字符：添加UTF-8修饰符，避免匹配多字节字符异常
-            'controlChar' => '/[\x00-\x1F\x7F]/u',
-            // 重复符号去重：用反向引用优化，匹配更高效（支持≤≥≠）
-            'repeatSymbol' => '/(≤|≥|≠)\1+/u',
-            // GBK编码乱码：优化正则（无冗余分组），确保匹配原生字节
-            'gbkSymbol' => '/\xA1\xF2|\xA1\xF3|\xA1\xF0/'
-        ];
-
-        // ========== 预定义替换映射（扩展场景、去冗余、修复转义问题） ==========
-        $maps = [
-            // HTML实体映射：补充更多Word常见实体，覆盖不完整实体场景
-            'htmlEntity' => [
-                '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', '&#X2264;' => '≤',
-                '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', '&#X2265;' => '≥',
-                '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '&#X2260;' => '≠',
-                '&le' => '≤', '&ge' => '≥', '&ne' => '≠',  // 无分号实体
-                '&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠',  // 无分号数字实体
-                '&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠',  // 无分号十六进制实体
-                '&#60;' => '≤', '&#62;' => '≥',  // 业务专属映射（保留）
-            ],
-            // 空格替换数组：补充Word中常见的特殊空格，覆盖更多场景
-            'nbsp' => [
-                chr(0xC2) . chr(0xA0),  // UTF-8不间断空格（&nbsp;）
-                chr(0xA0),              // 拉丁1不间断空格
-                '　',                    // 全角空格（U+3000）
-                chr(0x2002),            // 半角空格（U+2002）
-                chr(0x2003),            // 全角空格（U+2003）
-                chr(0x2004),            // 三分之一全角空格（U+2004）
-                chr(0x2005),            // 四分之一全角空格（U+2005）
-                chr(0x202F),            // 窄无中断空格（U+202F，Word常用）
-            ],
-            // 二进制乱码映射：统一键名格式（去除空格），避免重复匹配
-            'wordBin' => [
-                'e28986' => '≤',
-                '\xe2\x89\x86' => '≤',
-                '\xe20x890x86' => '≤',  // 去除空格后的统一键名
-                'e28987' => '≥',
-                '\xe2\x89\x87' => '≥',
-                '\xe20x890x87' => '≥',
-                'e28980' => '≠',
-                '\xe2\x89\x80' => '≠',
-                '\xe20x890x80' => '≠',
-            ],
-            // XML实体编码映射：保持简洁，仅映射核心数字
-            'wordEntity' => [
-                '2264' => '≤',
-                '2265' => '≥',
-                '2260' => '≠',
-            ],
-            // GBK编码映射：修复转义问题（用双引号包裹原生字节，避免匹配失败）
-            'gbkSymbol' => [
-                "\xA1\xF2" => '≤',  // 原生GBK字节，无需转义（双引号关键）
-                "\xA1\xF3" => '≥',
-                "\xA1\xF0" => '≠',
-            ],
-        ];
-
-        // 预定义回调函数（仅创建一次，提升性能，增加容错）
-        $unicodeCallback = function ($m) {
-            $code = hexdec($m[1]);
-            // 容错：十六进制转换失败/无效Unicode码点，返回原始值
-            return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0];
-        };
-
-        $depth = 0;
-        $hasChange = false;
-        $currentStr = $str;
-
-        // 循环解码：仅在有变化且未达最大深度时执行（避免无限循环）
-        do {
-            $depth++;
-            $hasChange = false;
-            $prevStr = $currentStr;
-
-            // ========== 前置处理（惰性执行，仅在需要时触发） ==========
-            // 1. 过滤不可见控制字符（仅当包含时执行）
-            if (preg_match($regexps['controlChar'], $currentStr)) {
-                $currentStr = preg_replace($regexps['controlChar'], '', $currentStr);
+    private function fullDecode($str = '', int $maxDepth = 2){
+        try { // The whole body is guarded; any Throwable is handled by the catch at the end of the method
+            if ($str === null || trim((string)$str) === '' || $maxDepth <= 0) { // Nothing to decode: null, whitespace-only input, or a non-positive pass budget
+                return $str === null ? '' : trim((string)$str); // null becomes ''; anything else is returned trimmed but undecoded
            }

-            // 2. 编码校正（非UTF-8时才转换，增加容错机制）
-            if (!mb_check_encoding($currentStr, 'UTF-8')) {
-                $converted = mb_convert_encoding(
-                    $currentStr,
-                    'UTF-8',
-                    'GBK,GB2312,ISO-8859-1,CP1252'  // 补充CP1252（Windows西文编码）
+            $str = (string)$str;
+
+            // Decode \uXXXX / \UXXXX escapes before anything else: use $this->decodeUnicode() when the class has it, otherwise the inline regex fallback below.
+            if (method_exists($this, 'decodeUnicode')) {
+                $str = $this->decodeUnicode($str);
+            } else {
+                $str = preg_replace_callback(
+                    '/\\\\[uU]([0-9a-fA-F]{4})/',
+                    function ($m) {
+                        $code = hexdec($m[1]);
+                        return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0]; // Escapes for code points below 0x20 are left as written
+                    },
+                    $str
                );
-                // 容错：转换失败时保留原文本，避免乱码加剧
-                $currentStr = mb_check_encoding($converted, 'UTF-8') ? $converted : $currentStr;
            }

-            // ========== 核心解码逻辑（按优先级执行，避免冲突） ==========
-            // 1. Unicode转义解码（优先处理，避免转义字符干扰后续匹配）
-            $currentStr = preg_replace_callback($regexps['unicode'], $unicodeCallback, $currentStr);
+            // Regex patterns used below, keyed by purpose. The strings are single-quoted, so '\\?' reaches PCRE as \? (a literal question mark).
+            $regexps = [
+                'ob0' => '/0B\s*\\?0/', // NOTE(review): not referenced by the code below
+                'dl18' => '/DL\s*\\?\.18/', // NOTE(review): not referenced by the code below
+                'qMarkNum' => '/\\?(\d+)/', // literal "?" followed by digits
+                'qMarkDotNum' => '/\\?(\.\d+)/', // literal "?" followed by ".digits"
+                'neNum' => '/≠\s*(\d+)/u', // ≠, optional whitespace, digits
+                'leNum' => '/≤\s*(\d+)/u', // ≤, optional whitespace, digits
+                'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u', // "?" sep "?" sep "?digits", where sep is 、 or ,
+                'leNeMark' => '/(LE|NE)\s*\\?(\d+)/i', // LE/NE marker (case-insensitive), optional whitespace, "?digits"
+                'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/', // backslash, u or U, exactly four hex digits
+                'wordBin' => '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i', // NOTE(review): '\\xE2' reaches PCRE as \xE2, i.e. the raw byte 0xE2 rather than the text "\xE2"; bytes E2 89 86/87/80 are UTF-8 for U+2246/U+2247/U+2240, not ≤/≥/≠ — confirm intent
+                'wordEntity' => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i', // "&#", optional x/X, 2264/2265/2260, optional ";" — the hex marker is optional, so the digits are treated as hex either way
+                'repeatSymbol' => '/(≤|≥|≠)\1+/u', // a run of two or more identical ≤, ≥ or ≠
+                'gbkSymbol' => '/\xA1\xF2|\xA1\xF3|\xA1\xF0/' // byte pairs A1 F2 / A1 F3 / A1 F0, matched without the u flag. NOTE(review): such pairs could also straddle two valid UTF-8 characters — verify against real input
+            ];

-            // 2. HTML实体替换（先精准映射，再解码剩余实体）
+            // Replacement tables consumed by strtr(), str_ireplace() and the callback lookups below.
+            $maps = [
+                'htmlEntity' => [
+                    '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', '&#X2264;' => '≤',
+                    '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', '&#X2265;' => '≥',
+                    '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '&#X2260;' => '≠',
+                    '&le' => '≤', '&ge' => '≥', '&ne' => '≠', // entity names without the trailing semicolon. NOTE(review): these also match as prefixes of longer entity names (e.g. &nearr;) — confirm acceptable
+                    '&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠', // semicolon-less "&#2264"-style references: the digits are the hex code points of ≤/≥/≠ written without the x marker
+                    '&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠', // semicolon-less hex references
+                    '&#60;' => '≤', '&#62;' => '≥', // NOTE(review): &#60; / &#62; are the standard references for "<" and ">"; mapping them to ≤/≥ looks like a deliberate business rule — confirm
+                ],
+                'wordBin' => [
+                    "\xE2\x89\x86" => '≤', "\xE2\x89\x87" => '≥', "\xE2\x89\x80" => '≠', // double-quoted: real three-byte sequences
+                    "\xe2\x89\x86" => '≤', "\xe2\x89\x87" => '≥', "\xe2\x89\x80" => '≠', // NOTE(review): same byte strings as the line above (hex escapes are case-insensitive), so these keys are duplicates
+                    'e28986' => '≤', '\xe2\x89\x86' => '≤', '\xe20x890x86' => '≤', // single-quoted: bare hex text, the literal text "\xe2\x89\x86", and a space-stripped "\xe2 0x89 0x86" spelling
+                    'e28987' => '≥', '\xe2\x89\x87' => '≥', '\xe20x890x87' => '≥',
+                    'e28980' => '≠', '\xe2\x89\x80' => '≠', '\xe20x890x80' => '≠',
+                ],
+                'wordEntity' => ['2264' => '≤', '2265' => '≥', '2260' => '≠'], // keyed by the digits captured by the wordEntity pattern
+                'gbkSymbol' => ["\xA1\xF2" => '≤', "\xA1\xF3" => '≥', "\xA1\xF0" => '≠'], // double-quoted raw byte pairs. NOTE(review): verify these pairs against a GBK code chart — TODO confirm
+            ];
+
+            $unicodeCallback = function ($m) { // Same decoding rule as the inline fallback above; reused on every loop pass
+                $code = hexdec($m[1]);
+                return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0]; // NOTE(review): surrogate halves (\uD800-\uDFFF) are not paired up — verify what mb_chr() returns for them
+            };
+
+            $depth = 0;
+            $hasChange = false; // overwritten at the top of every pass
+            $currentStr = $str;
+
+            // Repeat the decode pass until a pass changes nothing or $maxDepth passes have run (the first pass always runs).
+            do {
+                $depth++;
+                $hasChange = false;
+                $prevStr = $currentStr;
+
+                // Decode any \uXXXX / \UXXXX escapes still present (the input was already decoded once before the loop).
+                $currentStr = preg_replace_callback($regexps['unicode'], $unicodeCallback, $currentStr);
+
+                // HTML entities: apply the custom table first (strtr prefers the longest matching key), then let html_entity_decode() handle the rest.
+                $currentStr = strtr($currentStr, $maps['htmlEntity']);
+                $currentStr = html_entity_decode(
+                    $currentStr,
+                    ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
+                    'UTF-8'
+                );
+
+                // Repair garbled ≤/≥/≠ spellings: byte/hex-text forms, numeric-entity forms, the gbkSymbol byte pairs, then collapse repeated symbols.
+                if (preg_match($regexps['wordBin'], $currentStr)) {
+                    $tempStr = str_replace(' ', '', $currentStr); // NOTE(review): strips EVERY ASCII space from the whole string, not only inside the matched sequence
+                    $currentStr = str_ireplace(array_keys($maps['wordBin']), $maps['wordBin'], $tempStr); // case-insensitive replacement of every table key
+                }
+                if (preg_match($regexps['wordEntity'], $currentStr)) {
+                    $currentStr = preg_replace_callback(
+                        $regexps['wordEntity'],
+                        function ($m) use ($maps) {
+                            return $maps['wordEntity'][$m[1]] ?? $m[0]; // the ?? fallback is defensive: the pattern only captures keys present in the table
+                        },
+                        $currentStr
+                    );
+                }
+                if (preg_match($regexps['gbkSymbol'], $currentStr)) {
+                    $currentStr = strtr($currentStr, $maps['gbkSymbol']);
+                }
+                if (preg_match($regexps['repeatSymbol'], $currentStr)) {
+                    $currentStr = preg_replace($regexps['repeatSymbol'], '$1', $currentStr); // collapse the run to a single symbol
+                }
+
+                // Business-specific fixes: drop whitespace between ≠/≤ and digits, then rewrite "?digits" forms as ≥/≤/≠ forms.
+                if (preg_match($regexps['neNum'], $currentStr)) {
+                    $currentStr = preg_replace($regexps['neNum'], '≠$1', $currentStr);
+                }
+                if (preg_match($regexps['leNum'], $currentStr)) {
+                    $currentStr = preg_replace($regexps['leNum'], '≤$1', $currentStr);
+                }
+                if (preg_match($regexps['qMarkNum'], $currentStr)) {
+                    $currentStr = preg_replace($regexps['qMarkNum'], '≥$1', $currentStr); // NOTE(review): rewrites every "?123", including ones in URLs or ordinary questions — confirm inputs
+                }
+                if (preg_match($regexps['qMarkDotNum'], $currentStr)) {
+                    $currentStr = preg_replace($regexps['qMarkDotNum'], '≥0$1', $currentStr); // "?.18" becomes "≥0.18"
+                }
+                if (preg_match($regexps['mixSymbol'], $currentStr)) { // NOTE(review): qMarkNum above has already rewritten every "?digits", so this pattern (and leNeMark below) appears unable to match — confirm ordering
+                    $currentStr = preg_replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr);
+                }
+                if (preg_match($regexps['leNeMark'], $currentStr)) {
+                    $currentStr = preg_replace_callback(
+                        $regexps['leNeMark'],
+                        function ($m) {
+                            return strtoupper($m[1]) === 'LE' ? '≤' . $m[2] : '≠' . $m[2]; // the LE/NE marker itself is dropped from the output
+                        },
+                        $currentStr
+                    );
+                }
+
+                $hasChange = ($currentStr !== $prevStr);
+            } while ($depth < $maxDepth && $hasChange);
+
+            // Final cleanup: strip leading/trailing colons only (no whitespace trim), then apply the custom entity table one more time.
+            $currentStr = trim($currentStr, ':');
            $currentStr = strtr($currentStr, $maps['htmlEntity']);
-            $currentStr = html_entity_decode(
-                $currentStr,
-                ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
-                'UTF-8'
-            );

-            // 3. 统一所有空格为普通空格（避免空格类型导致的匹配失败）
-            $currentStr = str_replace($maps['nbsp'], ' ', $currentStr);
+            return $currentStr;

-            // ========== Word特殊符号乱码修复（惰性执行，优化效率） ==========
-            // 1. 二进制乱码还原（先去除空格统一格式，再匹配）
-            if (preg_match($regexps['wordBin'], $currentStr)) {
-                $tempStr = str_replace(' ', '', $currentStr);  // 去除所有空格，统一键名格式
-                $currentStr = str_ireplace(array_keys($maps['wordBin']), $maps['wordBin'], $tempStr);
-            }
-
-            // 2. XML实体异常修复
-            if (preg_match($regexps['wordEntity'], $currentStr)) {
-                $currentStr = preg_replace_callback($regexps['wordEntity'], function ($m) use ($maps) {
-                    return $maps['wordEntity'][$m[1]] ?? $m[0];
-                }, $currentStr);
-            }
-
-            // 3. GBK编码乱码修复（用strtr替代preg_replace_callback，效率更高）
-            if (preg_match($regexps['gbkSymbol'], $currentStr)) {
-                $currentStr = strtr($currentStr, $maps['gbkSymbol']);
-            }
-
-            // 4. 重复符号去重（用preg_replace简化，无需回调）
-            if (preg_match($regexps['repeatSymbol'], $currentStr)) {
-                $currentStr = preg_replace($regexps['repeatSymbol'], '$1', $currentStr);
-            }
-
-            // ========== 业务场景专属替换（惰性执行，精准匹配） ==========
-            // 1. 专属场景替换（0B?0 → 0B≥30，DL?.18 → DL≥0.18）
-            if (strpos($currentStr, '0B') !== false) {
-                $currentStr = preg_replace($regexps['ob0'], '0B≥30', $currentStr);
-            }
-            if (strpos($currentStr, 'DL') !== false) {
-                $currentStr = preg_replace($regexps['dl18'], 'DL≥0.18', $currentStr);
-            }
-
-            // 2. ≤、≠空格修复（去除符号与数字间的空格）
-            if (preg_match($regexps['neNum'], $currentStr)) {
-                $currentStr = preg_replace($regexps['neNum'], '≠$1', $currentStr);
-            }
-            if (preg_match($regexps['leNum'], $currentStr)) {
-                $currentStr = preg_replace($regexps['leNum'], '≤$1', $currentStr);
-            }
-
-            // 3. 通用场景替换（问号 → ≥）
-            if (preg_match($regexps['qMarkNum'], $currentStr)) {
-                $currentStr = preg_replace($regexps['qMarkNum'], '≥$1', $currentStr);
-            }
-            if (preg_match($regexps['qMarkDotNum'], $currentStr)) {
-                $currentStr = preg_replace($regexps['qMarkDotNum'], '≥0$1', $currentStr);
-            }
-
-            // 4. 混合符号乱码还原（?、，?、，?123 → ≤≥≠123）
-            if (preg_match($regexps['mixSymbol'], $currentStr)) {
-                $currentStr = preg_replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr);
-            }
-
-            // 5. ≤、≠专属标识还原（LE?123 → ≤123，NE?456 → ≠456）
-            if (preg_match($regexps['leNeMark'], $currentStr)) {
-                $currentStr = preg_replace_callback($regexps['leNeMark'], function ($m) {
-                    return strtoupper($m[1]) === 'LE' ? '≤' . $m[2] : '≠' . $m[2];
-                }, $currentStr);
-            }
-
-            // 6. 移除冗余代码（原代码"d with "替换无意义，直接删除）
-
-            // ========== 变化判断（简化逻辑，避免无效计数） ==========
-            $hasChange = ($currentStr !== $prevStr);
-
-        } while ($depth < $maxDepth && $hasChange);
-
-        // 最终清理（去除首尾冒号+二次实体替换，确保无遗漏）
-        $currentStr = trim($currentStr, ':');
-        $currentStr = strtr($currentStr, $maps['htmlEntity']);
-
-        return $currentStr;
+        } catch (\Throwable $e) { // NOTE(review): every error is swallowed silently ($e is unused) — consider logging; preg_* failures return null instead of throwing, so they never reach this handler
+            return trim((string)$str); // best-effort fallback: the current value of $str, trimmed
+        }
    }

    // private function fullDecode($str, $maxDepth = 5) {