/**
 * Core decoder: repairs garbled comparison symbols (≤, ≥, ≠) in a text fragment.
 *
 * Every pass decodes \uXXXX escapes, HTML entities and a set of known broken
 * encodings of the three symbols, then applies the business rules that turn
 * "?<number>" placeholders back into symbols. Passes repeat until the text
 * stops changing or $maxDepth passes have run. Finally leading/trailing ASCII
 * colons are stripped.
 *
 * The method is best-effort: if anything throws while decoding, the trimmed
 * input is returned instead of propagating the error.
 *
 * @param mixed $str      Text to decode. null, non-scalar and non-stringable
 *                        values yield an empty string.
 * @param int   $maxDepth Maximum number of decoding passes; values <= 0 only
 *                        trim the input.
 * @return string Decoded text.
 */
private function fullDecode($str = '', int $maxDepth = 2): string
{
    if ($str === null) {
        return '';
    }
    // Arrays, resources and objects without __toString() cannot be cast to a
    // string safely, so they are rejected before any cast is attempted.
    if (!is_scalar($str) && !(is_object($str) && method_exists($str, '__toString'))) {
        return '';
    }

    $original = (string)$str;
    $trimmed = trim($original);
    if ($trimmed === '' || $maxDepth <= 0) {
        return $trimmed;
    }

    try {
        $regexps = [
            'qMarkNum' => '/\\?(\d+)/',
            'qMarkDotNum' => '/\\?(\.\d+)/',
            'neNum' => '/≠\s*(\d+)/u',
            'leNum' => '/≤\s*(\d+)/u',
            'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
            // The lookbehind keeps the marker from matching the tail of an
            // ordinary word ("title?5" must not become "tit≤5").
            'leNeMark' => '/(?<![A-Za-z])(LE|NE)\s*\\?(\d+)/i',
            'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/',
            'surrogatePair' => '/\\\\[uU]([dD][89abAB][0-9a-fA-F]{2})\\\\[uU]([dD][c-fC-F][0-9a-fA-F]{2})/',
            'wordBin' => '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i',
            'wordEntity' => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i',
            'repeatSymbol' => '/(≤|≥|≠)\1+/u',
        ];

        $maps = [
            // Applied with strtr(), which always prefers the longest key, so
            // "&le;" wins over "&le" when both could match.
            'htmlEntity' => [
                '&le;' => '≤', '&leq;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', '&#X2264;' => '≤',
                '&ge;' => '≥', '&geq;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', '&#X2265;' => '≥',
                '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '&#X2260;' => '≠',
                // Entities written without the terminating semicolon.
                '&le' => '≤', '&ge' => '≥', '&ne' => '≠',
                // Hex code point written as a decimal reference, no semicolon.
                '&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠',
                '&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠',
                // Business-specific rule: an escaped < / > stands for ≤ / ≥.
                '&lt;' => '≤', '&gt;' => '≥',
            ],
            'wordBin' => [
                "\xE2\x89\x86" => '≤', "\xE2\x89\x87" => '≥', "\xE2\x89\x80" => '≠',
                'e28986' => '≤', '\xe2\x89\x86' => '≤', '\xe20x890x86' => '≤',
                'e28987' => '≥', '\xe2\x89\x87' => '≥', '\xe20x890x87' => '≥',
                'e28980' => '≠', '\xe2\x89\x80' => '≠', '\xe20x890x80' => '≠',
            ],
            'wordEntity' => ['2264' => '≤', '2265' => '≥', '2260' => '≠'],
            'gbkSymbol' => ["\xA1\xF2" => '≤', "\xA1\xF3" => '≥', "\xA1\xF0" => '≠'],
        ];

        // preg_replace()/preg_replace_callback() return null on a PCRE error
        // (for example a /u pattern applied to invalid UTF-8). In that case the
        // subject is kept unchanged instead of letting null flow onwards.
        $replace = function (string $pattern, $replacement, string $subject): string {
            $result = $replacement instanceof \Closure
                ? preg_replace_callback($pattern, $replacement, $subject)
                : preg_replace($pattern, $replacement, $subject);

            return $result === null ? $subject : $result;
        };

        // Combines an escaped UTF-16 surrogate pair into one code point.
        $surrogateCallback = function (array $m) {
            $code = 0x10000 + ((hexdec($m[1]) - 0xD800) << 10) + (hexdec($m[2]) - 0xDC00);
            $char = mb_chr($code, 'UTF-8');

            return $char === false ? $m[0] : $char;
        };

        // Decodes a single \uXXXX escape. Control characters and code points
        // mb_chr() rejects (lone surrogates) are left exactly as written.
        $unicodeCallback = function (array $m) {
            $code = hexdec($m[1]);
            if ($code < 0x20) {
                return $m[0];
            }
            $char = mb_chr($code, 'UTF-8');

            return $char === false ? $m[0] : $char;
        };

        $wordEntityCallback = function (array $m) use ($maps) {
            return $maps['wordEntity'][$m[1]] ?? $m[0];
        };

        // Removes whitespace inside one matched garbled token only.
        $compactTokenCallback = function (array $m) use ($replace) {
            return $replace('/\s+/', '', $m[0]);
        };

        $leNeCallback = function (array $m) {
            return (strtoupper($m[1]) === 'LE' ? '≤' : '≠') . $m[2];
        };

        $currentStr = $original;
        if (method_exists($this, 'decodeUnicode')) {
            $currentStr = (string)$this->decodeUnicode($currentStr);
        }

        $depth = 0;
        do {
            $depth++;
            $prevStr = $currentStr;

            // 1. Unicode escapes: surrogate pairs first, then single escapes.
            $currentStr = $replace($regexps['surrogatePair'], $surrogateCallback, $currentStr);
            $currentStr = $replace($regexps['unicode'], $unicodeCallback, $currentStr);

            // 2. HTML entities. The numeric repair runs before the generic
            //    decoding so that "&#2264;" becomes "≤" rather than "≤;" (map
            //    key without semicolon) or U+08D8 (literal decimal decoding).
            //    It runs again afterwards to catch references that only appear
            //    once an outer "&amp;" has been decoded.
            $currentStr = $replace($regexps['wordEntity'], $wordEntityCallback, $currentStr);
            $currentStr = strtr($currentStr, $maps['htmlEntity']);
            $currentStr = html_entity_decode(
                $currentStr,
                ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
                'UTF-8'
            );
            $currentStr = $replace($regexps['wordEntity'], $wordEntityCallback, $currentStr);

            // 3. Garbled byte/hex renderings of the symbols. Only whitespace
            //    inside a matched token is collapsed; the rest of the text
            //    keeps its spaces.
            if (preg_match($regexps['wordBin'], $currentStr) === 1) {
                $currentStr = $replace($regexps['wordBin'], $compactTokenCallback, $currentStr);
                $currentStr = str_ireplace(
                    array_keys($maps['wordBin']),
                    array_values($maps['wordBin']),
                    $currentStr
                );
            }

            // 4. Raw GBK bytes. In valid UTF-8 the same byte pairs occur inside
            //    ordinary text (a character ending in 0xA1 followed by a
            //    four-byte character), so the byte map is applied only to
            //    strings that are not valid UTF-8.
            if (!mb_check_encoding($currentStr, 'UTF-8')) {
                $currentStr = strtr($currentStr, $maps['gbkSymbol']);
            }

            // 5. Collapse runs of the same symbol.
            $currentStr = $replace($regexps['repeatSymbol'], '$1', $currentStr);

            // 6. Business rules. Drop whitespace between a symbol and its number.
            $currentStr = $replace($regexps['neNum'], '≠$1', $currentStr);
            $currentStr = $replace($regexps['leNum'], '≤$1', $currentStr);

            // Specific "?" patterns must run before the generic "?<number>"
            // rules below, which would otherwise consume the question mark
            // these patterns rely on.
            $currentStr = $replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr);
            $currentStr = $replace($regexps['leNeMark'], $leNeCallback, $currentStr);

            $currentStr = $replace($regexps['qMarkNum'], '≥$1', $currentStr);
            $currentStr = $replace($regexps['qMarkDotNum'], '≥0$1', $currentStr);
        } while ($depth < $maxDepth && $currentStr !== $prevStr);

        // Final clean-up: strip surrounding colons and map any entity that is
        // still present after the last pass.
        $currentStr = trim($currentStr, ':');

        return strtr($currentStr, $maps['htmlEntity']);
    } catch (\Throwable $e) {
        // Deliberate best-effort: decoding must never break article parsing.
        return $trimmed;
    }
}