From 93f9e705cb34763e71f2b3e24df54a1f84f6bbe9 Mon Sep 17 00:00:00 2001 From: chengxl Date: Mon, 1 Dec 2025 11:53:41 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E9=97=AE=E9=A2=98=E4=BF=AE?= =?UTF-8?q?=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- application/common/ArticleParserService.php | 363 ++++++++++++++++---- 1 file changed, 295 insertions(+), 68 deletions(-) diff --git a/application/common/ArticleParserService.php b/application/common/ArticleParserService.php index 526ac52a..05881dee 100644 --- a/application/common/ArticleParserService.php +++ b/application/common/ArticleParserService.php @@ -14,7 +14,7 @@ class ArticleParserService { private $phpWord; private $sections; - + private $iNum = 0; public function __construct($filePath = '') { if (!file_exists($filePath)) { @@ -736,6 +736,10 @@ class ArticleParserService // 统一提取元素文本 private function getTextFromElement($element,$lineNumber = 0){ $text = ''; + if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) { + $this->iNum++; + $text .= $this->iNum; + } // 处理PreserveText元素 if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) { // 通过反射获取私有属性 text @@ -940,106 +944,329 @@ class ArticleParserService ] ]; } - private function fullDecode($str, $maxDepth = 5) { + /** + * 核心解码方法(无静态缓存,高性能版) + * @param string $str 待解码字符串 + * @param int $maxDepth 最大解析深度 + * @return string + */ + private function fullDecode($str, $maxDepth = 2) + { // 空值/深度为0,直接返回(提前终止,避免无效操作) if (empty($str) || $maxDepth <= 0) { return $str; } - - // 【性能优化1:预编译所有正则表达式】避免每次循环重新解析正则 - // 预编译:≥专属场景正则 - $regOb0 = '/0B\s*\?0/'; - $regDl18 = '/DL\s*\?.18/'; - // 预编译:≥通用场景正则 - $regQMarkNum = '/\?(\d+)/'; - $regQMarkDotNum = '/\?(\.\d+)/'; - // 预编译:≤、≠空格修复正则 - $regNeNum = '/≠\s*(\d+)/'; - $regLeNum = '/≤\s*(\d+)/'; - // 预编译:混合符号乱码正则(中文顿号/英文逗号) - $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/'; - $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/'; - // 预编译:≤、≠专属标识正则 - $regLeMark = '/LE\s*\?(\d+)/'; - $regNeMark = '/NE\s*\?(\d+)/'; - // 预编译:Unicode转义正则(提取到外部,避免闭包重复创建) - $regUnicode = '/\\\\u([0-9a-fA-F]{4})/'; - - // 【性能优化2:预定义常量/映射】避免循环内重复创建数组/字符串 - // HTML实体映射(一次性定义,避免循环内重复赋值) - $htmlEntityMap = [ - '≤' => '≤', '≤' => '≤', '≤' => '≤', - '≥' => '≥', '≥' => '≥', '≥' => '≥', - '≠' => '≠', '≠' => '≠', '≠' => '≠', + $str = $this->decodeUnicode($str); + // ========== 预编译所有正则(合并同类型,避免循环内重复解析) ========== + $regexps = [ + // 原有专属场景正则 + 'ob0' => '/0B\s*\?0/', + 'dl18' => '/DL\s*\?.18/', + // 原有通用场景正则 + 'qMarkNum' => '/\?(\d+)/', + 'qMarkDotNum' => '/\?(\.\d+)/', + // ≤、≠空格修复正则 + 'neNum' => '/≠\s*(\d+)/', + 'leNum' => '/≤\s*(\d+)/', + // 混合符号乱码正则(合并中英文顿号/逗号) + 'mixSymbol' => '/(\?)\s*(、|,)\s*(\?)\s*(、|,)\s*(\?)(\d+)/', + // ≤、≠专属标识正则(合并LE/NE) + 'leNeMark' => '/(LE|NE)\s*\?(\d+)/', + // Unicode转义正则 + 'unicode' => '/\\\\u([0-9a-fA-F]{4})/', + // Word二进制乱码(合并≤≥≠) + 'wordBin' => '/(\\xE2\\x89\\x86|\\xE2 0x89 0x86|e28986|\\xE2\\x89\\x87|\\xE2 0x89 0x87|e28987|\\xE2\\x89\\x80|\\xE2 0x89 0x80|e28980)/i', + // Word XML实体异常(合并≤≥≠) + 'wordEntity' => '/&#\s*(\x|X)?\s*(2264|2265|2260)\s*;?/i', + // 不可见控制字符 + 'controlChar' => '/[\x00-\x1F\x7F]/', + // 重复符号去重(合并≤≥≠) + 'repeatSymbol' => '/(≤{2,}|≥{2,}|≠{2,})/', + // GBK编码乱码(合并≤≥≠) + 'gbkSymbol' => '/(\xA1\xF2|\xA1\xF3|\xA1\xF0)/' ]; - // 不间断空格替换数组 - $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)]; - // Unicode回调函数(预定义,避免循环内重复创建闭包) + + // ========== 预定义所有替换映射(避免循环内重复创建) ========== + $maps = [ + // HTML实体映射(扩展Word实体) + 'htmlEntity' => [ + '≤' => '≤', '≤' => '≤', '≤' => '≤', + '≥' => '≥', '≥' => '≥', '≥' => '≥', + '≠' => '≠', '≠' => '≠', '≠' => '≠', + '&le' => '≤', '&ge' => '≥', '&ne' => '≠', + 'ࣘ' => '≤', 'ࣙ' => '≥', 'ࣔ' => '≠', + '≤' => '≤', '≥' => '≥', '≠' => '≠', + '≤' => '≤', '≥' => '≥', '≠' => '≠', + '<' => '≤', '>' => '≥' + ], + // 空格替换数组(扩展Word中的各种空格) + 'nbsp' => [ + chr(0xC2) . chr(0xA0), // UTF-8不间断空格 + chr(0xA0), // 拉丁1不间断空格 + ' ', // 全角空格 + chr(0x2002), // 方头空格 + chr(0x2003), // 全角空格 + chr(0x2004) // 三分之一全角空格 + ], + // 二进制乱码映射 + 'wordBin' => [ + 'e28986' => '≤', '\\xe2\\x89\\x86' => '≤', '\\xe2 0x89 0x86' => '≤', + 'e28987' => '≥', '\\xe2\\x89\\x87' => '≥', '\\xe2 0x89 0x87' => '≥', + 'e28980' => '≠', '\\xe2\\x89\\x80' => '≠', '\\xe2 0x89 0x80' => '≠' + ], + // XML实体编码映射 + 'wordEntity' => [ + '2264' => '≤', + '2265' => '≥', + '2260' => '≠' + ], + // GBK编码映射 + 'gbkSymbol' => [ + '\xA1\xF2' => '≤', + '\xA1\xF3' => '≥', + '\xA1\xF0' => '≠' + ] + ]; + + // 预定义回调函数(仅创建一次,避免循环内重复实例化) $unicodeCallback = function ($m) { return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0]; }; - $original = $str; $depth = 0; - $hasChange = false; // 标记是否有变化,提前终止循环 + $hasChange = false; + $original = $str; // 循环解码:仅在有变化且未达最大深度时执行 do { $depth++; $hasChange = false; - $prevStr = $str; // 保存当前状态,用于判断变化 + $prevStr = $str; - // 1. 解码Unicode转义(\uXXXX格式) - $str = $this->decodeUnicode($str); + // ========== 前置处理(惰性执行,避免无意义操作) ========== + $countCtrl = 0; + // 1. 过滤不可见控制字符(仅当包含时执行) + if (preg_match($regexps['controlChar'], $str)) { + $str = preg_replace($regexps['controlChar'], '', $str, -1, $countCtrl); + } - // 2. 解码HTML实体(先替换专属实体,再执行通用解码) - $str = strtr($str, $htmlEntityMap); // 高性能替换(strtr比str_replace快) - $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8'); + // 2. GBK/GB2312编码转UTF-8(仅当非UTF-8时执行) + if (!mb_check_encoding($str, 'UTF-8')) { + $str = mb_convert_encoding($str, 'UTF-8', 'GBK,GB2312,ISO-8859-1'); + } - // 3. 再次处理遗漏的Unicode转义(使用预编译正则+预定义回调) - $str = preg_replace_callback($regUnicode, $unicodeCallback, $str); + // ========== 核心解码逻辑 ========== + // 1. 解码Unicode转义 + $str = preg_replace_callback($regexps['unicode'], $unicodeCallback, $str); - // 4. 替换不间断空格为普通空格(strtr比str_replace更高效) - $str = str_replace($nbspReplace, ' ', $str); + // 2. 解码HTML实体(高性能strtr替换) + $str = strtr($str, $maps['htmlEntity']); + $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8'); - // 5. 核心替换逻辑(优化执行顺序,避免覆盖) - // 5.1 原有≥专属场景(保留) - $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1); - $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2); - // 5.2 ≤、≠空格修复(保留) - $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3); - $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4); - // 5.3 原有≥通用场景(保留) - $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5); - $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6); - // 5.4 混合符号乱码还原(保留) - $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7); - $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8); - // 5.5 ≤、≠专属标识还原(保留) - $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9); - $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10); + // 3. 替换各种空格为普通空格 + $str = str_replace($maps['nbsp'], ' ', $str); - // 5.6 修复前缀"d with "乱码(保留) - $str = str_replace('d with ', 'd with ', $str, $count11); + // ========== Word特殊符号乱码修复(合并+惰性) ========== + $countBin = $countEnt = $countGbk = $countRepeat = 0; + + // 1. 二进制乱码还原(合并正则+回调) + if (preg_match($regexps['wordBin'], $str)) { + $str = preg_replace_callback($regexps['wordBin'], function ($m) use ($maps) { + $key = strtolower(str_replace(' ', '', $m[0])); + return $maps['wordBin'][$key] ?? $m[0]; + }, $str, -1, $countBin); + } + + // 2. XML实体异常修复(合并正则+回调) + if (preg_match($regexps['wordEntity'], $str)) { + $str = preg_replace_callback($regexps['wordEntity'], function ($m) use ($maps) { + return $maps['wordEntity'][$m[2]] ?? $m[0]; + }, $str, -1, $countEnt); + } + + // 3. GBK编码乱码修复(合并正则+回调) + if (preg_match($regexps['gbkSymbol'], $str)) { + $str = preg_replace_callback($regexps['gbkSymbol'], function ($m) use ($maps) { + return $maps['gbkSymbol'][$m[0]] ?? $m[0]; + }, $str, -1, $countGbk); + } + + // 4. 重复符号去重(合并正则+极简回调) + if (preg_match($regexps['repeatSymbol'], $str)) { + $str = preg_replace_callback($regexps['repeatSymbol'], function ($m) { + return $m[0][0]; // 取第一个字符实现去重 + }, $str, -1, $countRepeat); + } + + // ========== 原有核心替换逻辑(合并+惰性) ========== + $count1 = $count2 = $count3 = $count4 = $count5 = $count6 = 0; + $count7 = $count8 = $count9 = 0; + + // 1. 专属场景替换(惰性执行) + if (strpos($str, '0B?0') !== false) { + $str = preg_replace($regexps['ob0'], '0B≥30', $str, -1, $count1); + } + if (strpos($str, 'DL?.18') !== false) { + $str = preg_replace($regexps['dl18'], 'DL≥0.18', $str, -1, $count2); + } + + // 2. ≤、≠空格修复(惰性执行) + if (preg_match($regexps['neNum'], $str)) { + $str = preg_replace($regexps['neNum'], '≠$1', $str, -1, $count3); + } + if (preg_match($regexps['leNum'], $str)) { + $str = preg_replace($regexps['leNum'], '≤$1', $str, -1, $count4); + } + + // 3. 通用场景替换(惰性执行) + if (preg_match($regexps['qMarkNum'], $str)) { + $str = preg_replace($regexps['qMarkNum'], '≥$1', $str, -1, $count5); + } + if (preg_match($regexps['qMarkDotNum'], $str)) { + $str = preg_replace($regexps['qMarkDotNum'], '≥0$1', $str, -1, $count6); + } + + // 4. 混合符号乱码还原(合并中英文,惰性执行) + if (preg_match($regexps['mixSymbol'], $str)) { + $str = preg_replace($regexps['mixSymbol'], '≤$2≥$4≠$6', $str, -1, $count7); + } + + // 5. ≤、≠专属标识还原(合并正则,惰性执行) + if (preg_match($regexps['leNeMark'], $str)) { + $str = preg_replace_callback($regexps['leNeMark'], function ($m) { + return $m[1] === 'LE' ? '≤' . $m[2] : '≠' . $m[2]; + }, $str, -1, $count8); + } + + // 6. 修复前缀"d with "乱码(惰性执行) + if (strpos($str, 'd with ') !== false) { + $str = str_replace('d with ', 'd with ', $str, $count9); + } + + // ========== 变化判断(合并计数,减少运算) ========== + $totalCount = $countCtrl + $countBin + $countEnt + $countGbk + $countRepeat + + $count1 + $count2 + $count3 + $count4 + $count5 + $count6 + + $count7 + $count8 + $count9; - // 【性能优化3:统计所有替换次数,判断是否有变化】 - $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 + - $count7 + $count8 + $count9 + $count10 + $count11; if ($totalCount > 0 || $str !== $prevStr) { $hasChange = true; $original = $str; } - // 【性能优化4:提前终止】单次循环无变化,直接退出 + // 提前终止:无变化则退出循环 if (!$hasChange) { break; } - } while ($depth < $maxDepth); // 改用do-while,减少循环判断次数 + } while ($depth < $maxDepth); - // 最终清理:仅执行一次trim - return trim($str, ':'); + // 最终清理+兜底替换 + $str = trim($str, ':'); + $str = strtr($str, $maps['htmlEntity']); + + return $str; } + + // private function fullDecode($str, $maxDepth = 5) { + // // 空值/深度为0,直接返回(提前终止,避免无效操作) + // if (empty($str) || $maxDepth <= 0) { + // return $str; + // } + + // // 【性能优化1:预编译所有正则表达式】避免每次循环重新解析正则 + // // 预编译:≥专属场景正则 + // $regOb0 = '/0B\s*\?0/'; + // $regDl18 = '/DL\s*\?.18/'; + // // 预编译:≥通用场景正则 + // $regQMarkNum = '/\?(\d+)/'; + // $regQMarkDotNum = '/\?(\.\d+)/'; + // // 预编译:≤、≠空格修复正则 + // $regNeNum = '/≠\s*(\d+)/'; + // $regLeNum = '/≤\s*(\d+)/'; + // // 预编译:混合符号乱码正则(中文顿号/英文逗号) + // $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/'; + // $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/'; + // // 预编译:≤、≠专属标识正则 + // $regLeMark = '/LE\s*\?(\d+)/'; + // $regNeMark = '/NE\s*\?(\d+)/'; + // // 预编译:Unicode转义正则(提取到外部,避免闭包重复创建) + // $regUnicode = '/\\\\u([0-9a-fA-F]{4})/'; + + // // 【性能优化2:预定义常量/映射】避免循环内重复创建数组/字符串 + // // HTML实体映射(一次性定义,避免循环内重复赋值) + // $htmlEntityMap = [ + // '≤' => '≤', '≤' => '≤', '≤' => '≤', + // '≥' => '≥', '≥' => '≥', '≥' => '≥', + // '≠' => '≠', '≠' => '≠', '≠' => '≠', + // ]; + // // 不间断空格替换数组 + // $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)]; + // // Unicode回调函数(预定义,避免循环内重复创建闭包) + // $unicodeCallback = function ($m) { + // return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0]; + // }; + + // $original = $str; + // $depth = 0; + // $hasChange = false; // 标记是否有变化,提前终止循环 + + // // 循环解码:仅在有变化且未达最大深度时执行 + // do { + // $depth++; + // $hasChange = false; + // $prevStr = $str; // 保存当前状态,用于判断变化 + + // // 1. 解码Unicode转义(\uXXXX格式) + // $str = $this->decodeUnicode($str); + + // // 2. 解码HTML实体(先替换专属实体,再执行通用解码) + // $str = strtr($str, $htmlEntityMap); // 高性能替换(strtr比str_replace快) + // $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8'); + + // // 3. 再次处理遗漏的Unicode转义(使用预编译正则+预定义回调) + // $str = preg_replace_callback($regUnicode, $unicodeCallback, $str); + + // // 4. 替换不间断空格为普通空格(strtr比str_replace更高效) + // $str = str_replace($nbspReplace, ' ', $str); + + // // 5. 核心替换逻辑(优化执行顺序,避免覆盖) + // // 5.1 原有≥专属场景(保留) + // $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1); + // $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2); + // // 5.2 ≤、≠空格修复(保留) + // $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3); + // $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4); + // // 5.3 原有≥通用场景(保留) + // $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5); + // $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6); + // // 5.4 混合符号乱码还原(保留) + // $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7); + // $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8); + // // 5.5 ≤、≠专属标识还原(保留) + // $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9); + // $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10); + + // // 5.6 修复前缀"d with "乱码(保留) + // $str = str_replace('d with ', 'd with ', $str, $count11); + + // // 【性能优化3:统计所有替换次数,判断是否有变化】 + // $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 + + // $count7 + $count8 + $count9 + $count10 + $count11; + // if ($totalCount > 0 || $str !== $prevStr) { + // $hasChange = true; + // $original = $str; + // } + + // // 【性能优化4:提前终止】单次循环无变化,直接退出 + // if (!$hasChange) { + // break; + // } + + // } while ($depth < $maxDepth); // 改用do-while,减少循环判断次数 + + // // 最终清理:仅执行一次trim + // return trim($str, ':'); + // } // private function fullDecode($str, $maxDepth = 5) { // if (empty($str) || $maxDepth <= 0) { // return $str;