代码修改

This commit is contained in:
chengxl
2025-12-02 14:26:53 +08:00
parent 705dce5e94
commit f15d072b2e

View File

@@ -1152,222 +1152,157 @@ class ArticleParserService
];
}
/**
 * Best-effort repair of comparison symbols (≤, ≥, ≠) in text whose encoding was mangled.
 *
 * Each pass decodes \uXXXX escapes and HTML entities, restores known byte/hex/entity
 * garbage forms of the three symbols, collapses repeated symbols, and applies the
 * "question mark in front of a number" heuristics. Passes repeat until the text stops
 * changing or $maxDepth passes have run.
 *
 * @param mixed $str      Text to decode; null yields '', other values are cast to string.
 * @param int   $maxDepth Maximum number of decode passes; when <= 0 the trimmed input is returned.
 * @return string Decoded text, or the trimmed input if decoding throws.
 */
private function fullDecode($str = '', int $maxDepth = 2): string
{
    try {
        if ($str === null) {
            return '';
        }
        $str = (string)$str;
        if ($maxDepth <= 0 || trim($str) === '') {
            return trim($str);
        }

        // Turns one \uXXXX escape into its character. Control characters (< 0x20) and
        // code points mb_chr() rejects (e.g. lone surrogates, for which it returns false)
        // are kept verbatim instead of being silently deleted.
        $unicodeCallback = static function (array $m): string {
            $code = hexdec($m[1]);
            if ($code < 0x20 || $code > 0x10FFFF) {
                return $m[0];
            }
            $char = mb_chr($code, 'UTF-8');
            return $char === false ? $m[0] : $char;
        };

        $regexps = [
            'qMarkNum'     => '/\\?(\d+)/',
            'qMarkDotNum'  => '/\\?(\.\d+)/',
            'neNum'        => '/≠\s*(\d+)/u',
            'leNum'        => '/≤\s*(\d+)/u',
            'mixSymbol'    => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
            'leNeMark'     => '/(LE|NE)\s*\\?(\d+)/i',
            'unicode'      => '/\\\\[uU]([0-9a-fA-F]{4})/',
            'wordBin'      => '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i',
            'wordEntity'   => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i',
            'repeatSymbol' => '/(≤|≥|≠)\1+/u',
            'gbkSymbol'    => '/\xA1\xF2|\xA1\xF3|\xA1\xF0/',
        ];

        $maps = [
            'htmlEntity' => [
                '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', '&#X2264;' => '≤',
                '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', '&#X2265;' => '≥',
                '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '&#X2260;' => '≠',
                // Entities that lost their trailing semicolon.
                '&le' => '≤', '&ge' => '≥', '&ne' => '≠',
                '&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠',
                '&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠',
                // Kept from the original: the numeric entities for '<' and '>' map to ≤ / ≥.
                '&#60;' => '≤', '&#62;' => '≥',
            ],
            'wordBin' => [
                // Raw byte sequences (double-quoted, so these are real bytes).
                "\xE2\x89\x86" => '≤', "\xE2\x89\x87" => '≥', "\xE2\x89\x80" => '≠',
                // Hex dumps and literal escape text (single-quoted, so these are plain characters).
                'e28986' => '≤', '\xe2\x89\x86' => '≤', '\xe20x890x86' => '≤',
                'e28987' => '≥', '\xe2\x89\x87' => '≥', '\xe20x890x87' => '≥',
                'e28980' => '≠', '\xe2\x89\x80' => '≠', '\xe20x890x80' => '≠',
            ],
            'wordEntity' => ['2264' => '≤', '2265' => '≥', '2260' => '≠'],
            'gbkSymbol'  => ["\xA1\xF2" => '≤', "\xA1\xF3" => '≥', "\xA1\xF0" => '≠'],
        ];

        // Initial Unicode-escape decoding: use the class helper when it exists,
        // otherwise fall back to the local callback.
        if (method_exists($this, 'decodeUnicode')) {
            $str = $this->decodeUnicode($str);
        } else {
            $str = preg_replace_callback($regexps['unicode'], $unicodeCallback, $str) ?? $str;
        }

        $currentStr = $str;
        $depth = 0;
        do {
            $depth++;
            $prevStr = $currentStr;

            // preg_* functions return null on PCRE errors (e.g. invalid UTF-8 with the
            // /u modifier); "?? $currentStr" keeps the text unchanged in that case.
            $currentStr = preg_replace_callback($regexps['unicode'], $unicodeCallback, $currentStr) ?? $currentStr;

            // Explicit symbol map first, then decode whatever entities remain.
            $currentStr = strtr($currentStr, $maps['htmlEntity']);
            $currentStr = html_entity_decode(
                $currentStr,
                ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
                'UTF-8'
            );

            // Byte/hex garbage: strip whitespace only inside the matched tokens so they
            // line up with the map keys. Removing every space from the whole string, as
            // the previous code did, glued unrelated words together.
            if (preg_match($regexps['wordBin'], $currentStr)) {
                $normalized = preg_replace_callback(
                    $regexps['wordBin'],
                    static function (array $m): string {
                        return preg_replace('/\s+/', '', $m[0]) ?? $m[0];
                    },
                    $currentStr
                ) ?? $currentStr;
                $currentStr = str_ireplace(array_keys($maps['wordBin']), $maps['wordBin'], $normalized);
            }

            $currentStr = preg_replace_callback(
                $regexps['wordEntity'],
                static function (array $m) use ($maps): string {
                    return $maps['wordEntity'][$m[1]] ?? $m[0];
                },
                $currentStr
            ) ?? $currentStr;

            $currentStr = strtr($currentStr, $maps['gbkSymbol']);
            $currentStr = preg_replace($regexps['repeatSymbol'], '$1', $currentStr) ?? $currentStr;

            // Drop whitespace between ≠ / ≤ and the number that follows.
            $currentStr = preg_replace($regexps['neNum'], '≠$1', $currentStr) ?? $currentStr;
            $currentStr = preg_replace($regexps['leNum'], '≤$1', $currentStr) ?? $currentStr;

            // The specific "?"-based rules must run BEFORE the generic "?<digits>" rule:
            // once qMarkNum has rewritten "?123" to "≥123" neither of them can match.
            $currentStr = preg_replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr) ?? $currentStr;
            $currentStr = preg_replace_callback(
                $regexps['leNeMark'],
                static function (array $m): string {
                    return (strtoupper($m[1]) === 'LE' ? '≤' : '≠') . $m[2];
                },
                $currentStr
            ) ?? $currentStr;

            // Generic fallback: a question mark directly in front of a number becomes ≥.
            $currentStr = preg_replace($regexps['qMarkNum'], '≥$1', $currentStr) ?? $currentStr;
            $currentStr = preg_replace($regexps['qMarkDotNum'], '≥0$1', $currentStr) ?? $currentStr;
        } while ($depth < $maxDepth && $currentStr !== $prevStr);

        // Final cleanup: strip leading/trailing colons and decode entities once more.
        $currentStr = trim($currentStr, ':');
        $currentStr = strtr($currentStr, $maps['htmlEntity']);

        return html_entity_decode(
            $currentStr,
            ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
            'UTF-8'
        );
    } catch (\Throwable $e) {
        // Deliberate best effort: decoding problems must not break the caller.
        // Only scalars are cast here, so the fallback cannot throw again on the
        // same non-stringable input that caused the failure.
        return is_scalar($str) ? trim((string)$str) : '';
    }
}
// private function fullDecode($str, $maxDepth = 5) {