From 90884273e00bc4cdae25deb9b4404bf656e87d21 Mon Sep 17 00:00:00 2001 From: chengxl Date: Tue, 2 Dec 2025 15:20:51 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E9=97=AE=E9=A2=98=E4=BF=AE?= =?UTF-8?q?=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- application/common/ArticleParserService.php | 629 ++++++++++++++++---- 1 file changed, 497 insertions(+), 132 deletions(-) diff --git a/application/common/ArticleParserService.php b/application/common/ArticleParserService.php index 526ac52..5b0dd52 100644 --- a/application/common/ArticleParserService.php +++ b/application/common/ArticleParserService.php @@ -14,7 +14,7 @@ class ArticleParserService { private $phpWord; private $sections; - + private $iNum = 0; public function __construct($filePath = '') { if (!file_exists($filePath)) { @@ -553,7 +553,7 @@ class ArticleParserService if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) { $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK'); } - $aCompany[$number] = $institution; + $aCompany[$number] = empty($institution) ? '' : trim(trim($institution),'.'); } return $aCompany; } @@ -581,6 +581,7 @@ class ArticleParserService $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK'); } $corrText = $this->fullDecode($corrText); + // // 调试 // file_put_contents(ROOT_PATH . 'runtime/corr_text_raw.log', $corrText); @@ -605,24 +606,25 @@ class ArticleParserService $aCorresponding[] = [ 'name' => $sName, 'email' => isset($email[2]) ? trim($email[2]) : '', - 'postal_address' => isset($address[2]) ? trim($address[2]) : '', + 'postal_address' => isset($address[2]) ? trim(trim($address[2]),'.') : '', 'tel' => isset($tel[2]) ? trim($tel[2]) : '' ]; } if(empty($aCorresponding)){ - $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s'; + // $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s'; + $pattern = '/(Corresponding Authors|Correspondence to|Correspondence)\s*:\s*([\s\S]+?)(?=\n\s*\n|$|;)/is'; $corrText = trim($corrText,'*'); preg_match($pattern, $corrText, $match); - if (!empty($match[1])) { - $corrContent = $match[1]; + if (!empty($match[2])) { + $corrContent = $match[2]; // 提取每个作者的名称和邮箱(优化正则,支持更多字符) $authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/'; preg_match_all($authorPattern, $corrContent, $authors); if(!empty($authors[1])){ for ($i = 0; $i < count($authors[1]); $i++) { $aCorresponding[] = [ - 'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]), - 'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i]) + 'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'), + 'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.') ]; } } @@ -631,8 +633,8 @@ class ArticleParserService preg_match_all($authorPattern, $corrContent, $authors); for ($i = 0; $i < count($authors[1]); $i++) { $aCorresponding[] = [ - 'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]), - 'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i]) + 'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'), + 'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.') ]; } } @@ -734,84 +736,293 @@ class ArticleParserService } // 统一提取元素文本 - private function getTextFromElement($element,$lineNumber = 0){ + private function getTextFromElement(\PhpOffice\PhpWord\Element\AbstractElement $element, int $lineNumber = 0){ $text = ''; - // 处理PreserveText元素 + + // 1. 常量化特殊引号映射(避免每次调用重建数组,提升循环调用性能) + static $specialQuotesMap = [ + '’' => "'", // 右单引号(U+2019)→ 普通单引号(U+0027) + '‘' => "'", // 左单引号(U+2018)→ 普通单引号(U+0027) + '“' => '"', // 左双引号(U+201C)→ 普通双引号(U+0022) + '”' => '"', // 右双引号(U+201D)→ 普通双引号(U+0022) + '„' => '"', // 下双引号(U+201E)→ 普通双引号(兼容欧洲排版) + '‟' => '"', // 右双引号(U+201F)→ 普通双引号(兼容少见排版) + ]; + + // 支持H1-H9标题格式(优化:移除无用变量 $titleDepth,避免冗余) + if ($element instanceof \PhpOffice\PhpWord\Element\Title) { + $titleContent = $element->getText(); + $titleText = ''; + + if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) { + $titleText = $this->getTextFromElement($titleContent); + } else { + $titleText = strtr((string)$titleContent, $specialQuotesMap); + } + + $text .= $titleText . ' '; + return $this->cleanText($text); + } + + // 项目编号(优化:严格空值判断,避免 0 被 empty 误判) + if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) { + $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0; + $this->iNum++; + $text .= $this->iNum . ' '; + } + + // 处理PreserveText(含HYPERLINK邮箱提取,优化:反射前先判断属性存在) if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) { - // 通过反射获取私有属性 text - $reflection = new \ReflectionClass($element); - $property = $reflection->getProperty('text'); - $property->setAccessible(true); - $textParts = $property->getValue($element); + try { + $reflection = new \ReflectionClass($element); + // 先判断属性是否存在,避免反射不存在的属性报错(兼容极端版本) + if (!$reflection->hasProperty('text')) { + return $this->cleanText($text); + } + $property = $reflection->getProperty('text'); + $property->setAccessible(true); + $textParts = $property->getValue($element) ?? []; + } catch (\ReflectionException $e) { + return $this->cleanText($text); + } + foreach ($textParts as $part) { + $part = (string)$part; if (strpos($part, 'HYPERLINK') !== false) { - // 解码 HTML 实体(" -> ") - $decoded = html_entity_decode($part); - // 提取 mailto: 后的邮箱 - if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) { + $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5); + // 邮箱正则不变(已优化,兼容国际域名) + if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) { $text .= $match[1] . ' '; } } else { - // 普通文本直接拼接 + $part = strtr($part, $specialQuotesMap); $text .= $part; } } - return $text; + return $this->cleanText($text); } - // 处理表格和单元格(E-mail可能在表格中) + + // 处理表格(优化:避免行尾多余空格,通过 cleanText 自动合并) if ($element instanceof \PhpOffice\PhpWord\Element\Table) { foreach ($element->getRows() as $row) { foreach ($row->getCells() as $cell) { - $text .= $this->getTextFromElement($cell); + $text .= $this->getTextFromElement($cell) . ' '; } + // 移除行尾额外空格(cleanText 会合并连续空格,无需手动添加) } - return $text; + return $this->cleanText($text); } + + // 处理单元格(逻辑不变,保持递归提取) if ($element instanceof \PhpOffice\PhpWord\Element\Cell) { foreach ($element->getElements() as $child) { $text .= $this->getTextFromElement($child); } - return $text; + return $this->cleanText($text); } - //处理嵌套元素(递归提取所有子元素) - if (method_exists($element, 'getElements')) { + // 处理嵌套元素(逻辑不变,增强类型校验可读性) + if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) { foreach ($element->getElements() as $child) { - $text .= $this->getTextFromElement($child); + if ($child instanceof \PhpOffice\PhpWord\Element\AbstractElement) { + $text .= $this->getTextFromElement($child); + } } } - //处理文本元素(包括带格式的文本) + // 处理纯文本元素(逻辑不变,保持特殊引号替换) if ($element instanceof \PhpOffice\PhpWord\Element\Text) { - $text .= $element->getText(); + $textPart = (string)$element->getText(); // 显式强制转换,避免类型隐患 + $textPart = strtr($textPart, $specialQuotesMap); + $text .= $textPart; } - //处理超链接(优先提取链接目标,可能是邮箱) + // 处理超链接(逻辑不变,保持邮箱优先提取) if ($element instanceof \PhpOffice\PhpWord\Element\Link) { - $target = $element->getTarget(); + $target = (string)$element->getTarget(); if (strpos($target, 'mailto:') === 0) { - $text .= str_replace('mailto:', '', $target) . ' '; // 剥离mailto:前缀 + $text .= rtrim(str_replace('mailto:', '', $target)) . ' '; } - $text .= $element->getText() . ' '; + $linkText = strtr((string)$element->getText(), $specialQuotesMap); + $text .= $linkText . ' '; } - //处理字段和注释(可能包含隐藏邮箱) + // 处理字段和注释(优化:显式强制转换,避免非字符串拼接) if ($element instanceof \PhpOffice\PhpWord\Element\Field) { - $text .= $element->getContent() . ' '; + $text .= (string)$element->getContent() . ' '; } if ($element instanceof \PhpOffice\PhpWord\Element\Note) { - $text .= $element->getContent() . ' '; + $text .= (string)$element->getContent() . ' '; } - //清理所有不可见字符(关键:移除格式干扰) - $text = preg_replace('/[\x00-\x1F\x7F-\x9F]/', ' ', $text); // 移除控制字符 - $text = str_replace(["\t", "\r", "\n"], ' ', $text); // 统一空白字符 - $text = preg_replace('/\s+/', ' ', $text); // 合并多个空格 - if(!empty($text) && !mb_check_encoding($text, 'UTF-8')){ - $text = mb_convert_encoding($text, 'UTF-8', 'GBK'); - } - return $text; + + return $this->cleanText($text); } + /** + * 统一文本清理方法(稳健、高效、不破坏普通单引号) + * @param string $text 待清理文本 + * @return string 清理后的纯文本 + */ + private function cleanText(string $text){ + + //编码正确 + if (!mb_check_encoding($text, 'UTF-8')) { + $text = mb_convert_encoding( + $text, + 'UTF-8', + 'GBK,GB2312,GB18030,Big5,ISO-8859-1,CP1252,UTF-16,UTF-32' // 补充常见西文编码,兼容更多场景 + ); + } + //移除不可见控制字符 + $text = preg_replace('/[\x00-\x1F\x7F-\x9F]/u', ' ', $text); + + //统一空白字符 + $text = str_replace([ + "\t", "\r", "\n", + chr(0xC2) . chr(0xA0), // 不间断空格( ) + ' ', // 全角空格(U+3000) + chr(0xE2) . chr(0x80) . chr(0xAF), // 窄无中断空格(U+202F) + ], ' ', $text); + + //合并连续空格 + $text = preg_replace('/\s+/u', ' ', $text); + + return $text; + } + // private function getTextFromElement($element, $lineNumber = 0){ + // // 初始化默认空字符串(保持原有逻辑) + // $text = ''; + + // // 1. 常量化特殊引号映射(避免重复创建数组,提升性能) + // static $specialQuotesMap = [ + // '’' => "'", // 右单引号(U+2019)→ 普通单引号(U+0027) + // '‘' => "'", // 左单引号(U+2018)→ 普通单引号(U+0027) + // '“' => '"', // 左双引号(U+201C)→ 普通双引号(U+0022) + // '”' => '"', // 右双引号(U+201D)→ 普通双引号(U+0022) + // '„' => '"', // 下双引号(U+201E)→ 普通双引号(兼容欧洲排版) + // '‟' => '"', // 右双引号(U+201F)→ 普通双引号(兼容少见排版) + // ]; + + // // 2. 提前校验元素合法性(避免后续 instanceof 无效判断,减少报错) + // if (!is_object($element) || !$element instanceof \PhpOffice\PhpWord\Element\AbstractElement) { + // return $text; + // } + + // // 支持H1标题格式(逻辑不变,优化变量命名可读性) + // if ($element instanceof \PhpOffice\PhpWord\Element\Title) { + // $titleContent = $element->getText(); + // $titleText = ''; + + // // 关键修复:判断返回类型,递归提取文本(逻辑不变) + // if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) { + // $titleText = $this->getTextFromElement($titleContent); + // } else { + // $titleText = strtr((string)$titleContent, $specialQuotesMap); + // } + + // $text .= $titleText . ' '; + // return $text; + // } + + // // 项目编号(逻辑不变,优化空值判断为严格判断) + // if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) { + // $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0; + // $this->iNum++; + // $text .= $this->iNum . ' '; + // } + + // // 处理PreserveText元素(核心逻辑不变,增强容错性) + // if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) { + // try { + // $reflection = new \ReflectionClass($element); + // $property = $reflection->getProperty('text'); + // $property->setAccessible(true); + // // 空值兜底,避免遍历非数组报错 + // $textParts = $property->getValue($element) ?? []; + // } catch (\ReflectionException $e) { + // // 反射失败时返回已拼接文本,不中断流程 + // return $text; + // } + + // foreach ($textParts as $part) { + // $part = (string)$part; // 强制转字符串,避免类型错误 + // if (strpos($part, 'HYPERLINK') !== false) { + // $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5); + // // 邮箱正则不变,保持原有匹配逻辑 + // if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) { + // $text .= $match[1] . ' '; + // } + // } else { + // $text .= $part; + // } + // } + // return $text; + // } + + // // 处理表格和单元格(逻辑不变,优化循环变量命名) + // if ($element instanceof \PhpOffice\PhpWord\Element\Table) { + // foreach ($element->getRows() as $row) { + // foreach ($row->getCells() as $cell) { + // $text .= $this->getTextFromElement($cell); + // } + // } + // return $text; + // } + + // if ($element instanceof \PhpOffice\PhpWord\Element\Cell) { + // foreach ($element->getElements() as $child) { + // $text .= $this->getTextFromElement($child); + // } + // return $text; + // } + + // // 处理嵌套元素(逻辑不变,增强方法存在性校验) + // if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) { + // foreach ($element->getElements() as $child) { + // // 双重校验,避免非元素对象传入 + // if (is_object($child) && $child instanceof \PhpOffice\PhpWord\Element\AbstractElement) { + // $textPart = $this->getTextFromElement($child); + // $text .= $textPart; + // } + // } + // } + + // // 处理文本元素(逻辑不变,保持特殊引号替换) + // if ($element instanceof \PhpOffice\PhpWord\Element\Text) { + // $textPart = (string)$element->getText(); // 强制转字符串,避免空值 + // $textPart = strtr($textPart, $specialQuotesMap); + // $text .= $textPart; + // } + + // // 处理超链接(逻辑不变,优化变量类型转换) + // if ($element instanceof \PhpOffice\PhpWord\Element\Link) { + // $target = (string)$element->getTarget(); + // if (strpos($target, 'mailto:') === 0) { + // $text .= rtrim(str_replace('mailto:', '', $target)) . ' '; + // } + // $linkText = strtr((string)$element->getText(), $specialQuotesMap); + // $text .= $linkText . ' '; + // } + + // // 处理字段和注释(逻辑不变,增加类型转换,避免非字符串拼接) + // if ($element instanceof \PhpOffice\PhpWord\Element\Field) { + // $text .= (string)$element->getContent() . ' '; + // } + // if ($element instanceof \PhpOffice\PhpWord\Element\Note) { + // $text .= (string)$element->getContent() . ' '; + // } + + // // 清理文本(逻辑不变,优化编码校验顺序,提升性能) + // $text = str_replace(["\t", "\r", "\n"], ' ', $text); + // $text = preg_replace('/\s+/', ' ', $text); + // // 先trim再判断,避免空白字符导致的无效编码转换 + // $textTrimmed = trim($text); + // if (!empty($textTrimmed) && !mb_check_encoding($textTrimmed, 'UTF-8')) { + // $text = mb_convert_encoding($text, 'UTF-8', 'GBK'); + // } + + // return $text; + // } /** * 从 Word 文档提取摘要和关键词 * @return array 提取结果 @@ -940,106 +1151,260 @@ class ArticleParserService ] ]; } - private function fullDecode($str, $maxDepth = 5) { - // 空值/深度为0,直接返回(提前终止,避免无效操作) - if (empty($str) || $maxDepth <= 0) { - return $str; - } + /** + * 核心解码方法 + * @param string $str 待解码字符串 + * @param int $maxDepth 最大解析深度 + * @return string + */ + private function fullDecode($str = '', int $maxDepth = 2){ + try { + if ($str === null || trim((string)$str) === '' || $maxDepth <= 0) { + return $str === null ? '' : trim((string)$str); + } - // 【性能优化1:预编译所有正则表达式】避免每次循环重新解析正则 - // 预编译:≥专属场景正则 - $regOb0 = '/0B\s*\?0/'; - $regDl18 = '/DL\s*\?.18/'; - // 预编译:≥通用场景正则 - $regQMarkNum = '/\?(\d+)/'; - $regQMarkDotNum = '/\?(\.\d+)/'; - // 预编译:≤、≠空格修复正则 - $regNeNum = '/≠\s*(\d+)/'; - $regLeNum = '/≤\s*(\d+)/'; - // 预编译:混合符号乱码正则(中文顿号/英文逗号) - $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/'; - $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/'; - // 预编译:≤、≠专属标识正则 - $regLeMark = '/LE\s*\?(\d+)/'; - $regNeMark = '/NE\s*\?(\d+)/'; - // 预编译:Unicode转义正则(提取到外部,避免闭包重复创建) - $regUnicode = '/\\\\u([0-9a-fA-F]{4})/'; + $str = (string)$str; - // 【性能优化2:预定义常量/映射】避免循环内重复创建数组/字符串 - // HTML实体映射(一次性定义,避免循环内重复赋值) - $htmlEntityMap = [ - '≤' => '≤', '≤' => '≤', '≤' => '≤', - '≥' => '≥', '≥' => '≥', '≥' => '≥', - '≠' => '≠', '≠' => '≠', '≠' => '≠', - ]; - // 不间断空格替换数组 - $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)]; - // Unicode回调函数(预定义,避免循环内重复创建闭包) - $unicodeCallback = function ($m) { - return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0]; - }; + // Unicode解码 + if (method_exists($this, 'decodeUnicode')) { + $str = $this->decodeUnicode($str); + } else { + $str = preg_replace_callback( + '/\\\\[uU]([0-9a-fA-F]{4})/', + function ($m) { + $code = hexdec($m[1]); + return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0]; + }, + $str + ); + } - $original = $str; - $depth = 0; - $hasChange = false; // 标记是否有变化,提前终止循环 + // 预编译正则 + $regexps = [ + 'ob0' => '/0B\s*\\?0/', + 'dl18' => '/DL\s*\\?\.18/', + 'qMarkNum' => '/\\?(\d+)/', + 'qMarkDotNum' => '/\\?(\.\d+)/', + 'neNum' => '/≠\s*(\d+)/u', + 'leNum' => '/≤\s*(\d+)/u', + 'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u', + 'leNeMark' => '/(LE|NE)\s*\\?(\d+)/i', + 'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/', + 'wordBin' => '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i', + 'wordEntity' => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i', + 'repeatSymbol' => '/(≤|≥|≠)\1+/u', + 'gbkSymbol' => '/\xA1\xF2|\xA1\xF3|\xA1\xF0/' + ]; - // 循环解码:仅在有变化且未达最大深度时执行 - do { - $depth++; + // 预定义替换映射 + $maps = [ + 'htmlEntity' => [ + '≤' => '≤', '≤' => '≤', '≤' => '≤', '≤' => '≤', + '≥' => '≥', '≥' => '≥', '≥' => '≥', '≥' => '≥', + '≠' => '≠', '≠' => '≠', '≠' => '≠', '≠' => '≠', + '&le' => '≤', '&ge' => '≥', '&ne' => '≠', + 'ࣘ' => '≤', 'ࣙ' => '≥', 'ࣔ' => '≠', + '≤' => '≤', '≥' => '≥', '≠' => '≠', + '<' => '≤', '>' => '≥', + ], + 'wordBin' => [ + "\xE2\x89\x86" => '≤', "\xE2\x89\x87" => '≥', "\xE2\x89\x80" => '≠', + "\xe2\x89\x86" => '≤', "\xe2\x89\x87" => '≥', "\xe2\x89\x80" => '≠', + 'e28986' => '≤', '\xe2\x89\x86' => '≤', '\xe20x890x86' => '≤', + 'e28987' => '≥', '\xe2\x89\x87' => '≥', '\xe20x890x87' => '≥', + 'e28980' => '≠', '\xe2\x89\x80' => '≠', '\xe20x890x80' => '≠', + ], + 'wordEntity' => ['2264' => '≤', '2265' => '≥', '2260' => '≠'], + 'gbkSymbol' => ["\xA1\xF2" => '≤', "\xA1\xF3" => '≥', "\xA1\xF0" => '≠'], + ]; + + $unicodeCallback = function ($m) { + $code = hexdec($m[1]); + return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0]; + }; + + $depth = 0; $hasChange = false; - $prevStr = $str; // 保存当前状态,用于判断变化 + $currentStr = $str; - // 1. 解码Unicode转义(\uXXXX格式) - $str = $this->decodeUnicode($str); + // 循环解码 + do { + $depth++; + $hasChange = false; + $prevStr = $currentStr; - // 2. 解码HTML实体(先替换专属实体,再执行通用解码) - $str = strtr($str, $htmlEntityMap); // 高性能替换(strtr比str_replace快) - $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8'); + // Unicode转义解码 + $currentStr = preg_replace_callback($regexps['unicode'], $unicodeCallback, $currentStr); - // 3. 再次处理遗漏的Unicode转义(使用预编译正则+预定义回调) - $str = preg_replace_callback($regUnicode, $unicodeCallback, $str); + //HTML实体替换 + $currentStr = strtr($currentStr, $maps['htmlEntity']); + $currentStr = html_entity_decode( + $currentStr, + ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, + 'UTF-8' + ); - // 4. 替换不间断空格为普通空格(strtr比str_replace更高效) - $str = str_replace($nbspReplace, ' ', $str); + // Word特殊符号乱码修复 + if (preg_match($regexps['wordBin'], $currentStr)) { + $tempStr = str_replace(' ', '', $currentStr); + $currentStr = str_ireplace(array_keys($maps['wordBin']), $maps['wordBin'], $tempStr); + } + if (preg_match($regexps['wordEntity'], $currentStr)) { + $currentStr = preg_replace_callback( + $regexps['wordEntity'], + function ($m) use ($maps) { + return $maps['wordEntity'][$m[1]] ?? $m[0]; + }, + $currentStr + ); + } + if (preg_match($regexps['gbkSymbol'], $currentStr)) { + $currentStr = strtr($currentStr, $maps['gbkSymbol']); + } + if (preg_match($regexps['repeatSymbol'], $currentStr)) { + $currentStr = preg_replace($regexps['repeatSymbol'], '$1', $currentStr); + } - // 5. 核心替换逻辑(优化执行顺序,避免覆盖) - // 5.1 原有≥专属场景(保留) - $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1); - $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2); - // 5.2 ≤、≠空格修复(保留) - $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3); - $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4); - // 5.3 原有≥通用场景(保留) - $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5); - $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6); - // 5.4 混合符号乱码还原(保留) - $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7); - $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8); - // 5.5 ≤、≠专属标识还原(保留) - $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9); - $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10); + //业务场景专属替换 + if (preg_match($regexps['neNum'], $currentStr)) { + $currentStr = preg_replace($regexps['neNum'], '≠$1', $currentStr); + } + if (preg_match($regexps['leNum'], $currentStr)) { + $currentStr = preg_replace($regexps['leNum'], '≤$1', $currentStr); + } + if (preg_match($regexps['qMarkNum'], $currentStr)) { + $currentStr = preg_replace($regexps['qMarkNum'], '≥$1', $currentStr); + } + if (preg_match($regexps['qMarkDotNum'], $currentStr)) { + $currentStr = preg_replace($regexps['qMarkDotNum'], '≥0$1', $currentStr); + } + if (preg_match($regexps['mixSymbol'], $currentStr)) { + $currentStr = preg_replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr); + } + if (preg_match($regexps['leNeMark'], $currentStr)) { + $currentStr = preg_replace_callback( + $regexps['leNeMark'], + function ($m) { + return strtoupper($m[1]) === 'LE' ? '≤' . $m[2] : '≠' . $m[2]; + }, + $currentStr + ); + } - // 5.6 修复前缀"d with "乱码(保留) - $str = str_replace('d with ', 'd with ', $str, $count11); + $hasChange = ($currentStr !== $prevStr); + } while ($depth < $maxDepth && $hasChange); - // 【性能优化3:统计所有替换次数,判断是否有变化】 - $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 + - $count7 + $count8 + $count9 + $count10 + $count11; - if ($totalCount > 0 || $str !== $prevStr) { - $hasChange = true; - $original = $str; - } + // 最终清理 + $currentStr = trim($currentStr, ':'); + $currentStr = strtr($currentStr, $maps['htmlEntity']); - // 【性能优化4:提前终止】单次循环无变化,直接退出 - if (!$hasChange) { - break; - } + return $currentStr; - } while ($depth < $maxDepth); // 改用do-while,减少循环判断次数 - - // 最终清理:仅执行一次trim - return trim($str, ':'); + } catch (\Throwable $e) { + return trim((string)$str); + } } + + // private function fullDecode($str, $maxDepth = 5) { + // // 空值/深度为0,直接返回(提前终止,避免无效操作) + // if (empty($str) || $maxDepth <= 0) { + // return $str; + // } + + // // 【性能优化1:预编译所有正则表达式】避免每次循环重新解析正则 + // // 预编译:≥专属场景正则 + // $regOb0 = '/0B\s*\?0/'; + // $regDl18 = '/DL\s*\?.18/'; + // // 预编译:≥通用场景正则 + // $regQMarkNum = '/\?(\d+)/'; + // $regQMarkDotNum = '/\?(\.\d+)/'; + // // 预编译:≤、≠空格修复正则 + // $regNeNum = '/≠\s*(\d+)/'; + // $regLeNum = '/≤\s*(\d+)/'; + // // 预编译:混合符号乱码正则(中文顿号/英文逗号) + // $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/'; + // $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/'; + // // 预编译:≤、≠专属标识正则 + // $regLeMark = '/LE\s*\?(\d+)/'; + // $regNeMark = '/NE\s*\?(\d+)/'; + // // 预编译:Unicode转义正则(提取到外部,避免闭包重复创建) + // $regUnicode = '/\\\\u([0-9a-fA-F]{4})/'; + + // // 【性能优化2:预定义常量/映射】避免循环内重复创建数组/字符串 + // // HTML实体映射(一次性定义,避免循环内重复赋值) + // $htmlEntityMap = [ + // '≤' => '≤', '≤' => '≤', '≤' => '≤', + // '≥' => '≥', '≥' => '≥', '≥' => '≥', + // '≠' => '≠', '≠' => '≠', '≠' => '≠', + // ]; + // // 不间断空格替换数组 + // $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)]; + // // Unicode回调函数(预定义,避免循环内重复创建闭包) + // $unicodeCallback = function ($m) { + // return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0]; + // }; + + // $original = $str; + // $depth = 0; + // $hasChange = false; // 标记是否有变化,提前终止循环 + + // // 循环解码:仅在有变化且未达最大深度时执行 + // do { + // $depth++; + // $hasChange = false; + // $prevStr = $str; // 保存当前状态,用于判断变化 + + // // 1. 解码Unicode转义(\uXXXX格式) + // $str = $this->decodeUnicode($str); + + // // 2. 解码HTML实体(先替换专属实体,再执行通用解码) + // $str = strtr($str, $htmlEntityMap); // 高性能替换(strtr比str_replace快) + // $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8'); + + // // 3. 再次处理遗漏的Unicode转义(使用预编译正则+预定义回调) + // $str = preg_replace_callback($regUnicode, $unicodeCallback, $str); + + // // 4. 替换不间断空格为普通空格(strtr比str_replace更高效) + // $str = str_replace($nbspReplace, ' ', $str); + + // // 5. 核心替换逻辑(优化执行顺序,避免覆盖) + // // 5.1 原有≥专属场景(保留) + // $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1); + // $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2); + // // 5.2 ≤、≠空格修复(保留) + // $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3); + // $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4); + // // 5.3 原有≥通用场景(保留) + // $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5); + // $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6); + // // 5.4 混合符号乱码还原(保留) + // $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7); + // $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8); + // // 5.5 ≤、≠专属标识还原(保留) + // $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9); + // $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10); + + // // 5.6 修复前缀"d with "乱码(保留) + // $str = str_replace('d with ', 'd with ', $str, $count11); + + // // 【性能优化3:统计所有替换次数,判断是否有变化】 + // $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 + + // $count7 + $count8 + $count9 + $count10 + $count11; + // if ($totalCount > 0 || $str !== $prevStr) { + // $hasChange = true; + // $original = $str; + // } + + // // 【性能优化4:提前终止】单次循环无变化,直接退出 + // if (!$hasChange) { + // break; + // } + + // } while ($depth < $maxDepth); // 改用do-while,减少循环判断次数 + + // // 最终清理:仅执行一次trim + // return trim($str, ':'); + // } // private function fullDecode($str, $maxDepth = 5) { // if (empty($str) || $maxDepth <= 0) { // return $str;