测试问题修改

This commit is contained in:
chengxl
2025-12-02 15:20:51 +08:00
parent 5daf18608b
commit 90884273e0

View File

@@ -14,7 +14,7 @@ class ArticleParserService
{ {
private $phpWord; private $phpWord;
private $sections; private $sections;
private $iNum = 0;
public function __construct($filePath = '') public function __construct($filePath = '')
{ {
if (!file_exists($filePath)) { if (!file_exists($filePath)) {
@@ -553,7 +553,7 @@ class ArticleParserService
if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) { if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) {
$institution = mb_convert_encoding($institution, 'UTF-8', 'GBK'); $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
} }
$aCompany[$number] = $institution; $aCompany[$number] = empty($institution) ? '' : trim(trim($institution),'.');
} }
return $aCompany; return $aCompany;
} }
@@ -581,6 +581,7 @@ class ArticleParserService
$corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK'); $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
} }
$corrText = $this->fullDecode($corrText); $corrText = $this->fullDecode($corrText);
// // 调试 // // 调试
// file_put_contents(ROOT_PATH . 'runtime/corr_text_raw.log', $corrText); // file_put_contents(ROOT_PATH . 'runtime/corr_text_raw.log', $corrText);
@@ -605,24 +606,25 @@ class ArticleParserService
$aCorresponding[] = [ $aCorresponding[] = [
'name' => $sName, 'name' => $sName,
'email' => isset($email[2]) ? trim($email[2]) : '', 'email' => isset($email[2]) ? trim($email[2]) : '',
'postal_address' => isset($address[2]) ? trim($address[2]) : '', 'postal_address' => isset($address[2]) ? trim(trim($address[2]),'.') : '',
'tel' => isset($tel[2]) ? trim($tel[2]) : '' 'tel' => isset($tel[2]) ? trim($tel[2]) : ''
]; ];
} }
if(empty($aCorresponding)){ if(empty($aCorresponding)){
$pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s'; // $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s';
$pattern = '/(Corresponding Authors|Correspondence to|Correspondence)\s*:\s*([\s\S]+?)(?=\n\s*\n|$|;)/is';
$corrText = trim($corrText,'*'); $corrText = trim($corrText,'*');
preg_match($pattern, $corrText, $match); preg_match($pattern, $corrText, $match);
if (!empty($match[1])) { if (!empty($match[2])) {
$corrContent = $match[1]; $corrContent = $match[2];
// 提取每个作者的名称和邮箱(优化正则,支持更多字符) // 提取每个作者的名称和邮箱(优化正则,支持更多字符)
$authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/'; $authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/';
preg_match_all($authorPattern, $corrContent, $authors); preg_match_all($authorPattern, $corrContent, $authors);
if(!empty($authors[1])){ if(!empty($authors[1])){
for ($i = 0; $i < count($authors[1]); $i++) { for ($i = 0; $i < count($authors[1]); $i++) {
$aCorresponding[] = [ $aCorresponding[] = [
'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]), 'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'),
'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i]) 'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.')
]; ];
} }
} }
@@ -631,8 +633,8 @@ class ArticleParserService
preg_match_all($authorPattern, $corrContent, $authors); preg_match_all($authorPattern, $corrContent, $authors);
for ($i = 0; $i < count($authors[1]); $i++) { for ($i = 0; $i < count($authors[1]); $i++) {
$aCorresponding[] = [ $aCorresponding[] = [
'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]), 'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'),
'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i]) 'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.')
]; ];
} }
} }
@@ -734,84 +736,293 @@ class ArticleParserService
} }
// 统一提取元素文本 // 统一提取元素文本
private function getTextFromElement($element,$lineNumber = 0){ private function getTextFromElement(\PhpOffice\PhpWord\Element\AbstractElement $element, int $lineNumber = 0){
$text = ''; $text = '';
// 处理PreserveText元素
// 1. 常量化特殊引号映射(避免每次调用重建数组,提升循环调用性能)
static $specialQuotesMap = [
'' => "'", // 右单引号U+2019→ 普通单引号U+0027
'' => "'", // 左单引号U+2018→ 普通单引号U+0027
'“' => '"', // 左双引号U+201C→ 普通双引号U+0022
'”' => '"', // 右双引号U+201D→ 普通双引号U+0022
'„' => '"', // 下双引号U+201E→ 普通双引号(兼容欧洲排版)
'‟' => '"', // 右双引号U+201F→ 普通双引号(兼容少见排版)
];
// 支持H1-H9标题格式优化移除无用变量 $titleDepth避免冗余
if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
$titleContent = $element->getText();
$titleText = '';
if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
$titleText = $this->getTextFromElement($titleContent);
} else {
$titleText = strtr((string)$titleContent, $specialQuotesMap);
}
$text .= $titleText . ' ';
return $this->cleanText($text);
}
// 项目编号(优化:严格空值判断,避免 0 被 empty 误判)
if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
$this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
$this->iNum++;
$text .= $this->iNum . ' ';
}
// 处理PreserveText含HYPERLINK邮箱提取优化反射前先判断属性存在
if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) { if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
// 通过反射获取私有属性 text try {
$reflection = new \ReflectionClass($element); $reflection = new \ReflectionClass($element);
// 先判断属性是否存在,避免反射不存在的属性报错(兼容极端版本)
if (!$reflection->hasProperty('text')) {
return $this->cleanText($text);
}
$property = $reflection->getProperty('text'); $property = $reflection->getProperty('text');
$property->setAccessible(true); $property->setAccessible(true);
$textParts = $property->getValue($element); $textParts = $property->getValue($element) ?? [];
} catch (\ReflectionException $e) {
return $this->cleanText($text);
}
foreach ($textParts as $part) { foreach ($textParts as $part) {
$part = (string)$part;
if (strpos($part, 'HYPERLINK') !== false) { if (strpos($part, 'HYPERLINK') !== false) {
// 解码 HTML 实体(&quot; -> " $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
$decoded = html_entity_decode($part); // 邮箱正则不变(已优化,兼容国际域名)
// 提取 mailto: 后的邮箱 if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
$text .= $match[1] . ' '; $text .= $match[1] . ' ';
} }
} else { } else {
// 普通文本直接拼接 $part = strtr($part, $specialQuotesMap);
$text .= $part; $text .= $part;
} }
} }
return $text; return $this->cleanText($text);
} }
// 处理表格和单元格E-mail可能在表格中
// 处理表格(优化:避免行尾多余空格,通过 cleanText 自动合并)
if ($element instanceof \PhpOffice\PhpWord\Element\Table) { if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
foreach ($element->getRows() as $row) { foreach ($element->getRows() as $row) {
foreach ($row->getCells() as $cell) { foreach ($row->getCells() as $cell) {
$text .= $this->getTextFromElement($cell); $text .= $this->getTextFromElement($cell) . ' ';
} }
// 移除行尾额外空格cleanText 会合并连续空格,无需手动添加)
} }
return $text; return $this->cleanText($text);
} }
// 处理单元格(逻辑不变,保持递归提取)
if ($element instanceof \PhpOffice\PhpWord\Element\Cell) { if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
foreach ($element->getElements() as $child) { foreach ($element->getElements() as $child) {
$text .= $this->getTextFromElement($child); $text .= $this->getTextFromElement($child);
} }
return $text; return $this->cleanText($text);
} }
//处理嵌套元素(递归提取所有子元素 // 处理嵌套元素(逻辑不变,增强类型校验可读性
if (method_exists($element, 'getElements')) { if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
foreach ($element->getElements() as $child) { foreach ($element->getElements() as $child) {
if ($child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
$text .= $this->getTextFromElement($child); $text .= $this->getTextFromElement($child);
} }
} }
}
//处理文本元素(包括带格式的文本 // 处理文本元素(逻辑不变,保持特殊引号替换
if ($element instanceof \PhpOffice\PhpWord\Element\Text) { if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
$text .= $element->getText(); $textPart = (string)$element->getText(); // 显式强制转换,避免类型隐患
$textPart = strtr($textPart, $specialQuotesMap);
$text .= $textPart;
} }
//处理超链接(优先提取链接目标,可能是邮箱 // 处理超链接(逻辑不变,保持邮箱优先提取)
if ($element instanceof \PhpOffice\PhpWord\Element\Link) { if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
$target = $element->getTarget(); $target = (string)$element->getTarget();
if (strpos($target, 'mailto:') === 0) { if (strpos($target, 'mailto:') === 0) {
$text .= str_replace('mailto:', '', $target) . ' '; // 剥离mailto:前缀 $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
} }
$text .= $element->getText() . ' '; $linkText = strtr((string)$element->getText(), $specialQuotesMap);
$text .= $linkText . ' ';
} }
//处理字段和注释(可能包含隐藏邮箱 // 处理字段和注释(优化:显式强制转换,避免非字符串拼接
if ($element instanceof \PhpOffice\PhpWord\Element\Field) { if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
$text .= $element->getContent() . ' '; $text .= (string)$element->getContent() . ' ';
} }
if ($element instanceof \PhpOffice\PhpWord\Element\Note) { if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
$text .= $element->getContent() . ' '; $text .= (string)$element->getContent() . ' ';
}
//清理所有不可见字符(关键:移除格式干扰)
$text = preg_replace('/[\x00-\x1F\x7F-\x9F]/', ' ', $text); // 移除控制字符
$text = str_replace(["\t", "\r", "\n"], ' ', $text); // 统一空白字符
$text = preg_replace('/\s+/', ' ', $text); // 合并多个空格
if(!empty($text) && !mb_check_encoding($text, 'UTF-8')){
$text = mb_convert_encoding($text, 'UTF-8', 'GBK');
}
return $text;
} }
return $this->cleanText($text);
}
/**
 * Normalize extracted text into single-spaced UTF-8.
 *
 * Steps, in order:
 *  1. If the input is not valid UTF-8, re-encode it to UTF-8, letting mbstring
 *     guess the source encoding from a fixed candidate list.
 *  2. Replace C0/C1 control characters and DEL with a space.
 *  3. Replace tab, CR, LF, no-break space (U+00A0), ideographic space (U+3000)
 *     and narrow no-break space (U+202F) with a plain space.
 *  4. Collapse every run of whitespace into one space.
 *
 * The result is NOT trimmed: leading or trailing whitespace survives as a
 * single space. Ordinary ASCII quotes are left untouched.
 *
 * @param string $text Text to clean.
 * @return string Cleaned text. If a step fails internally, the text from the
 *                previous step is kept instead of being discarded.
 */
private function cleanText(string $text): string
{
    // Only re-encode when the bytes are not already valid UTF-8.
    if (!mb_check_encoding($text, 'UTF-8')) {
        $converted = mb_convert_encoding(
            $text,
            'UTF-8',
            'GBK,GB2312,GB18030,Big5,ISO-8859-1,CP1252,UTF-16,UTF-32'
        );
        // Keep the original bytes rather than propagating a non-string result.
        if (is_string($converted)) {
            $text = $converted;
        }
    }

    // Invisible control characters become spaces so adjacent words stay separated.
    // preg_replace() returns null on failure (e.g. malformed UTF-8 with the /u
    // modifier); in that case keep the text instead of wiping it.
    $withoutControls = preg_replace('/[\x00-\x1F\x7F-\x9F]/u', ' ', $text);
    if ($withoutControls !== null) {
        $text = $withoutControls;
    }

    // Unify whitespace variants. The non-ASCII spaces are written as explicit
    // code point escapes so they cannot be lost or confused with an ASCII
    // space when the file is re-saved or copied.
    $text = str_replace(
        [
            "\t", "\r", "\n",
            "\u{00A0}", // no-break space (&nbsp;)
            "\u{3000}", // ideographic (full-width) space
            "\u{202F}", // narrow no-break space
        ],
        ' ',
        $text
    );

    // Collapse consecutive whitespace into a single space.
    $collapsed = preg_replace('/\s+/u', ' ', $text);

    return $collapsed ?? $text;
}
// private function getTextFromElement($element, $lineNumber = 0){
// // 初始化默认空字符串(保持原有逻辑)
// $text = '';
// // 1. 常量化特殊引号映射(避免重复创建数组,提升性能)
// static $specialQuotesMap = [
// '' => "'", // 右单引号U+2019→ 普通单引号U+0027
// '' => "'", // 左单引号U+2018→ 普通单引号U+0027
// '“' => '"', // 左双引号U+201C→ 普通双引号U+0022
// '”' => '"', // 右双引号U+201D→ 普通双引号U+0022
// '„' => '"', // 下双引号U+201E→ 普通双引号(兼容欧洲排版)
// '‟' => '"', // 右双引号U+201F→ 普通双引号(兼容少见排版)
// ];
// // 2. 提前校验元素合法性(避免后续 instanceof 无效判断,减少报错)
// if (!is_object($element) || !$element instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
// return $text;
// }
// // 支持H1标题格式逻辑不变优化变量命名可读性
// if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
// $titleContent = $element->getText();
// $titleText = '';
// // 关键修复:判断返回类型,递归提取文本(逻辑不变)
// if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
// $titleText = $this->getTextFromElement($titleContent);
// } else {
// $titleText = strtr((string)$titleContent, $specialQuotesMap);
// }
// $text .= $titleText . ' ';
// return $text;
// }
// // 项目编号(逻辑不变,优化空值判断为严格判断)
// if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
// $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
// $this->iNum++;
// $text .= $this->iNum . ' ';
// }
// // 处理PreserveText元素核心逻辑不变增强容错性
// if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
// try {
// $reflection = new \ReflectionClass($element);
// $property = $reflection->getProperty('text');
// $property->setAccessible(true);
// // 空值兜底,避免遍历非数组报错
// $textParts = $property->getValue($element) ?? [];
// } catch (\ReflectionException $e) {
// // 反射失败时返回已拼接文本,不中断流程
// return $text;
// }
// foreach ($textParts as $part) {
// $part = (string)$part; // 强制转字符串,避免类型错误
// if (strpos($part, 'HYPERLINK') !== false) {
// $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
// // 邮箱正则不变,保持原有匹配逻辑
// if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
// $text .= $match[1] . ' ';
// }
// } else {
// $text .= $part;
// }
// }
// return $text;
// }
// // 处理表格和单元格(逻辑不变,优化循环变量命名)
// if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
// foreach ($element->getRows() as $row) {
// foreach ($row->getCells() as $cell) {
// $text .= $this->getTextFromElement($cell);
// }
// }
// return $text;
// }
// if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
// foreach ($element->getElements() as $child) {
// $text .= $this->getTextFromElement($child);
// }
// return $text;
// }
// // 处理嵌套元素(逻辑不变,增强方法存在性校验)
// if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
// foreach ($element->getElements() as $child) {
// // 双重校验,避免非元素对象传入
// if (is_object($child) && $child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
// $textPart = $this->getTextFromElement($child);
// $text .= $textPart;
// }
// }
// }
// // 处理文本元素(逻辑不变,保持特殊引号替换)
// if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
// $textPart = (string)$element->getText(); // 强制转字符串,避免空值
// $textPart = strtr($textPart, $specialQuotesMap);
// $text .= $textPart;
// }
// // 处理超链接(逻辑不变,优化变量类型转换)
// if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
// $target = (string)$element->getTarget();
// if (strpos($target, 'mailto:') === 0) {
// $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
// }
// $linkText = strtr((string)$element->getText(), $specialQuotesMap);
// $text .= $linkText . ' ';
// }
// // 处理字段和注释(逻辑不变,增加类型转换,避免非字符串拼接)
// if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
// $text .= (string)$element->getContent() . ' ';
// }
// if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
// $text .= (string)$element->getContent() . ' ';
// }
// // 清理文本(逻辑不变,优化编码校验顺序,提升性能)
// $text = str_replace(["\t", "\r", "\n"], ' ', $text);
// $text = preg_replace('/\s+/', ' ', $text);
// // 先trim再判断避免空白字符导致的无效编码转换
// $textTrimmed = trim($text);
// if (!empty($textTrimmed) && !mb_check_encoding($textTrimmed, 'UTF-8')) {
// $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
// }
// return $text;
// }
/** /**
* 从 Word 文档提取摘要和关键词 * 从 Word 文档提取摘要和关键词
* @return array 提取结果 * @return array 提取结果
@@ -940,106 +1151,260 @@ class ArticleParserService
] ]
]; ];
} }
private function fullDecode($str, $maxDepth = 5) { /**
// 空值/深度为0直接返回提前终止避免无效操作 * 核心解码方法
if (empty($str) || $maxDepth <= 0) { * @param string $str 待解码字符串
return $str; * @param int $maxDepth 最大解析深度
* @return string
*/
private function fullDecode($str = '', int $maxDepth = 2){
try {
if ($str === null || trim((string)$str) === '' || $maxDepth <= 0) {
return $str === null ? '' : trim((string)$str);
} }
// 【性能优化1预编译所有正则表达式】避免每次循环重新解析正则 $str = (string)$str;
// 预编译:≥专属场景正则
$regOb0 = '/0B\s*\?0/';
$regDl18 = '/DL\s*\?.18/';
// 预编译:≥通用场景正则
$regQMarkNum = '/\?(\d+)/';
$regQMarkDotNum = '/\?(\.\d+)/';
// 预编译:≤、≠空格修复正则
$regNeNum = '/≠\s*(\d+)/';
$regLeNum = '/≤\s*(\d+)/';
// 预编译:混合符号乱码正则(中文顿号/英文逗号)
$regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/';
$regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/';
// 预编译:≤、≠专属标识正则
$regLeMark = '/LE\s*\?(\d+)/';
$regNeMark = '/NE\s*\?(\d+)/';
// 预编译Unicode转义正则提取到外部避免闭包重复创建
$regUnicode = '/\\\\u([0-9a-fA-F]{4})/';
// 【性能优化2预定义常量/映射】避免循环内重复创建数组/字符串 // Unicode解码
// HTML实体映射一次性定义避免循环内重复赋值 if (method_exists($this, 'decodeUnicode')) {
$htmlEntityMap = [ $str = $this->decodeUnicode($str);
'&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', } else {
'&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', $str = preg_replace_callback(
'&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '/\\\\[uU]([0-9a-fA-F]{4})/',
function ($m) {
$code = hexdec($m[1]);
return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0];
},
$str
);
}
// 预编译正则
$regexps = [
'ob0' => '/0B\s*\\?0/',
'dl18' => '/DL\s*\\?\.18/',
'qMarkNum' => '/\\?(\d+)/',
'qMarkDotNum' => '/\\?(\.\d+)/',
'neNum' => '/≠\s*(\d+)/u',
'leNum' => '/≤\s*(\d+)/u',
'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
'leNeMark' => '/(LE|NE)\s*\\?(\d+)/i',
'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/',
'wordBin' => '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i',
'wordEntity' => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i',
'repeatSymbol' => '/(≤|≥|≠)\1+/u',
'gbkSymbol' => '/\xA1\xF2|\xA1\xF3|\xA1\xF0/'
]; ];
// 不间断空格替换数组
$nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)]; // 预定义替换映射
// Unicode回调函数预定义避免循环内重复创建闭包 $maps = [
'htmlEntity' => [
'&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', '&#X2264;' => '≤',
'&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', '&#X2265;' => '≥',
'&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '&#X2260;' => '≠',
'&le' => '≤', '&ge' => '≥', '&ne' => '≠',
'&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠',
'&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠',
'&#60;' => '≤', '&#62;' => '≥',
],
'wordBin' => [
"\xE2\x89\x86" => '≤', "\xE2\x89\x87" => '≥', "\xE2\x89\x80" => '≠',
"\xe2\x89\x86" => '≤', "\xe2\x89\x87" => '≥', "\xe2\x89\x80" => '≠',
'e28986' => '≤', '\xe2\x89\x86' => '≤', '\xe20x890x86' => '≤',
'e28987' => '≥', '\xe2\x89\x87' => '≥', '\xe20x890x87' => '≥',
'e28980' => '≠', '\xe2\x89\x80' => '≠', '\xe20x890x80' => '≠',
],
'wordEntity' => ['2264' => '≤', '2265' => '≥', '2260' => '≠'],
'gbkSymbol' => ["\xA1\xF2" => '≤', "\xA1\xF3" => '≥', "\xA1\xF0" => '≠'],
];
$unicodeCallback = function ($m) { $unicodeCallback = function ($m) {
return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0]; $code = hexdec($m[1]);
return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0];
}; };
$original = $str;
$depth = 0; $depth = 0;
$hasChange = false; // 标记是否有变化,提前终止循环 $hasChange = false;
$currentStr = $str;
// 循环解码:仅在有变化且未达最大深度时执行 // 循环解码
do { do {
$depth++; $depth++;
$hasChange = false; $hasChange = false;
$prevStr = $str; // 保存当前状态,用于判断变化 $prevStr = $currentStr;
// 1. 解码Unicode转义\uXXXX格式 // Unicode转义解码
$str = $this->decodeUnicode($str); $currentStr = preg_replace_callback($regexps['unicode'], $unicodeCallback, $currentStr);
// 2. 解码HTML实体先替换专属实体再执行通用解码 //HTML实体替换
$str = strtr($str, $htmlEntityMap); // 高性能替换strtr比str_replace快 $currentStr = strtr($currentStr, $maps['htmlEntity']);
$str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8'); $currentStr = html_entity_decode(
$currentStr,
ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
'UTF-8'
);
// 3. 再次处理遗漏的Unicode转义使用预编译正则+预定义回调) // Word特殊符号乱码修复
$str = preg_replace_callback($regUnicode, $unicodeCallback, $str); if (preg_match($regexps['wordBin'], $currentStr)) {
$tempStr = str_replace(' ', '', $currentStr);
// 4. 替换不间断空格为普通空格strtr比str_replace更高效) $currentStr = str_ireplace(array_keys($maps['wordBin']), $maps['wordBin'], $tempStr);
$str = str_replace($nbspReplace, ' ', $str); }
if (preg_match($regexps['wordEntity'], $currentStr)) {
// 5. 核心替换逻辑(优化执行顺序,避免覆盖) $currentStr = preg_replace_callback(
// 5.1 原有≥专属场景(保留) $regexps['wordEntity'],
$str = preg_replace($regOb0, '0B≥30', $str, -1, $count1); function ($m) use ($maps) {
$str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2); return $maps['wordEntity'][$m[1]] ?? $m[0];
// 5.2 ≤、≠空格修复(保留) },
$str = preg_replace($regNeNum, '≠$1', $str, -1, $count3); $currentStr
$str = preg_replace($regLeNum, '≤$1', $str, -1, $count4); );
// 5.3 原有≥通用场景(保留) }
$str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5); if (preg_match($regexps['gbkSymbol'], $currentStr)) {
$str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6); $currentStr = strtr($currentStr, $maps['gbkSymbol']);
// 5.4 混合符号乱码还原(保留) }
$str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7); if (preg_match($regexps['repeatSymbol'], $currentStr)) {
$str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8); $currentStr = preg_replace($regexps['repeatSymbol'], '$1', $currentStr);
// 5.5 ≤、≠专属标识还原(保留)
$str = preg_replace($regLeMark, '≤$1', $str, -1, $count9);
$str = preg_replace($regNeMark, '≠$1', $str, -1, $count10);
// 5.6 修复前缀"d with "乱码(保留)
$str = str_replace('d with ', 'd with ', $str, $count11);
// 【性能优化3统计所有替换次数判断是否有变化】
$totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 +
$count7 + $count8 + $count9 + $count10 + $count11;
if ($totalCount > 0 || $str !== $prevStr) {
$hasChange = true;
$original = $str;
} }
// 【性能优化4提前终止】单次循环无变化直接退出 //业务场景专属替换
if (!$hasChange) { if (preg_match($regexps['neNum'], $currentStr)) {
break; $currentStr = preg_replace($regexps['neNum'], '≠$1', $currentStr);
}
if (preg_match($regexps['leNum'], $currentStr)) {
$currentStr = preg_replace($regexps['leNum'], '≤$1', $currentStr);
}
if (preg_match($regexps['qMarkNum'], $currentStr)) {
$currentStr = preg_replace($regexps['qMarkNum'], '≥$1', $currentStr);
}
if (preg_match($regexps['qMarkDotNum'], $currentStr)) {
$currentStr = preg_replace($regexps['qMarkDotNum'], '≥0$1', $currentStr);
}
if (preg_match($regexps['mixSymbol'], $currentStr)) {
$currentStr = preg_replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr);
}
if (preg_match($regexps['leNeMark'], $currentStr)) {
$currentStr = preg_replace_callback(
$regexps['leNeMark'],
function ($m) {
return strtoupper($m[1]) === 'LE' ? '≤' . $m[2] : '≠' . $m[2];
},
$currentStr
);
} }
} while ($depth < $maxDepth); // 改用do-while减少循环判断次数 $hasChange = ($currentStr !== $prevStr);
} while ($depth < $maxDepth && $hasChange);
// 最终清理仅执行一次trim // 最终清理
return trim($str, ':'); $currentStr = trim($currentStr, ':');
$currentStr = strtr($currentStr, $maps['htmlEntity']);
return $currentStr;
} catch (\Throwable $e) {
return trim((string)$str);
} }
}
// private function fullDecode($str, $maxDepth = 5) {
// // 空值/深度为0直接返回提前终止避免无效操作
// if (empty($str) || $maxDepth <= 0) {
// return $str;
// }
// // 【性能优化1预编译所有正则表达式】避免每次循环重新解析正则
// // 预编译:≥专属场景正则
// $regOb0 = '/0B\s*\?0/';
// $regDl18 = '/DL\s*\?.18/';
// // 预编译:≥通用场景正则
// $regQMarkNum = '/\?(\d+)/';
// $regQMarkDotNum = '/\?(\.\d+)/';
// // 预编译:≤、≠空格修复正则
// $regNeNum = '/≠\s*(\d+)/';
// $regLeNum = '/≤\s*(\d+)/';
// // 预编译:混合符号乱码正则(中文顿号/英文逗号)
// $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/';
// $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/';
// // 预编译:≤、≠专属标识正则
// $regLeMark = '/LE\s*\?(\d+)/';
// $regNeMark = '/NE\s*\?(\d+)/';
// // 预编译Unicode转义正则提取到外部避免闭包重复创建
// $regUnicode = '/\\\\u([0-9a-fA-F]{4})/';
// // 【性能优化2预定义常量/映射】避免循环内重复创建数组/字符串
// // HTML实体映射一次性定义避免循环内重复赋值
// $htmlEntityMap = [
// '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤',
// '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥',
// '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠',
// ];
// // 不间断空格替换数组
// $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)];
// // Unicode回调函数预定义避免循环内重复创建闭包
// $unicodeCallback = function ($m) {
// return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
// };
// $original = $str;
// $depth = 0;
// $hasChange = false; // 标记是否有变化,提前终止循环
// // 循环解码:仅在有变化且未达最大深度时执行
// do {
// $depth++;
// $hasChange = false;
// $prevStr = $str; // 保存当前状态,用于判断变化
// // 1. 解码Unicode转义\uXXXX格式
// $str = $this->decodeUnicode($str);
// // 2. 解码HTML实体先替换专属实体再执行通用解码
// $str = strtr($str, $htmlEntityMap); // 高性能替换strtr比str_replace快
// $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
// // 3. 再次处理遗漏的Unicode转义使用预编译正则+预定义回调)
// $str = preg_replace_callback($regUnicode, $unicodeCallback, $str);
// // 4. 替换不间断空格为普通空格strtr比str_replace更高效
// $str = str_replace($nbspReplace, ' ', $str);
// // 5. 核心替换逻辑(优化执行顺序,避免覆盖)
// // 5.1 原有≥专属场景(保留)
// $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1);
// $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2);
// // 5.2 ≤、≠空格修复(保留)
// $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3);
// $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4);
// // 5.3 原有≥通用场景(保留)
// $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5);
// $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6);
// // 5.4 混合符号乱码还原(保留)
// $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7);
// $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8);
// // 5.5 ≤、≠专属标识还原(保留)
// $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9);
// $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10);
// // 5.6 修复前缀"d with "乱码(保留)
// $str = str_replace('d with ', 'd with ', $str, $count11);
// // 【性能优化3统计所有替换次数判断是否有变化】
// $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 +
// $count7 + $count8 + $count9 + $count10 + $count11;
// if ($totalCount > 0 || $str !== $prevStr) {
// $hasChange = true;
// $original = $str;
// }
// // 【性能优化4提前终止】单次循环无变化直接退出
// if (!$hasChange) {
// break;
// }
// } while ($depth < $maxDepth); // 改用do-while减少循环判断次数
// // 最终清理仅执行一次trim
// return trim($str, ':');
// }
// private function fullDecode($str, $maxDepth = 5) { // private function fullDecode($str, $maxDepth = 5) {
// if (empty($str) || $maxDepth <= 0) { // if (empty($str) || $maxDepth <= 0) {
// return $str; // return $str;