代码修改
This commit is contained in:
@@ -553,7 +553,7 @@ class ArticleParserService
|
||||
if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) {
|
||||
$institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
|
||||
}
|
||||
$aCompany[$number] = $institution;
|
||||
$aCompany[$number] = empty($institution) ? '' : trim(trim($institution),'.');
|
||||
}
|
||||
return $aCompany;
|
||||
}
|
||||
@@ -581,6 +581,7 @@ class ArticleParserService
|
||||
$corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
|
||||
}
|
||||
$corrText = $this->fullDecode($corrText);
|
||||
|
||||
// // 调试
|
||||
// file_put_contents(ROOT_PATH . 'runtime/corr_text_raw.log', $corrText);
|
||||
|
||||
@@ -605,24 +606,25 @@ class ArticleParserService
|
||||
$aCorresponding[] = [
|
||||
'name' => $sName,
|
||||
'email' => isset($email[2]) ? trim($email[2]) : '',
|
||||
'postal_address' => isset($address[2]) ? trim($address[2]) : '',
|
||||
'postal_address' => isset($address[2]) ? trim(trim($address[2]),'.') : '',
|
||||
'tel' => isset($tel[2]) ? trim($tel[2]) : ''
|
||||
];
|
||||
}
|
||||
if(empty($aCorresponding)){
|
||||
$pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s';
|
||||
// $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s';
|
||||
$pattern = '/(Corresponding Authors|Correspondence to|Correspondence)\s*:\s*([\s\S]+?)(?=\n\s*\n|$|;)/is';
|
||||
$corrText = trim($corrText,'*');
|
||||
preg_match($pattern, $corrText, $match);
|
||||
if (!empty($match[1])) {
|
||||
$corrContent = $match[1];
|
||||
if (!empty($match[2])) {
|
||||
$corrContent = $match[2];
|
||||
// 提取每个作者的名称和邮箱(优化正则,支持更多字符)
|
||||
$authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/';
|
||||
preg_match_all($authorPattern, $corrContent, $authors);
|
||||
if(!empty($authors[1])){
|
||||
for ($i = 0; $i < count($authors[1]); $i++) {
|
||||
$aCorresponding[] = [
|
||||
'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]),
|
||||
'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i])
|
||||
'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'),
|
||||
'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.')
|
||||
];
|
||||
}
|
||||
}
|
||||
@@ -631,8 +633,8 @@ class ArticleParserService
|
||||
preg_match_all($authorPattern, $corrContent, $authors);
|
||||
for ($i = 0; $i < count($authors[1]); $i++) {
|
||||
$aCorresponding[] = [
|
||||
'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]),
|
||||
'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i])
|
||||
'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'),
|
||||
'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.')
|
||||
];
|
||||
}
|
||||
}
|
||||
@@ -734,88 +736,293 @@ class ArticleParserService
|
||||
}
|
||||
|
||||
/**
 * Recursively extract the plain text of a PHPWord element.
 *
 * Branches are evaluated in this order: Title, ListItemRun (prefixes a running
 * number kept in $this->iNum), PreserveText (also pulls e-mail addresses out of
 * HYPERLINK "mailto:" instructions), Table, Cell, any element exposing
 * getElements(), Text, Link, Field and Note. Typographic quotes are mapped to
 * their ASCII equivalents, and every return path passes the result through
 * $this->cleanText().
 *
 * @param \PhpOffice\PhpWord\Element\AbstractElement $element    Element to read.
 * @param int                                        $lineNumber Not used by this method; kept so existing callers keep working.
 * @return string Text as returned by cleanText().
 */
private function getTextFromElement(\PhpOffice\PhpWord\Element\AbstractElement $element, int $lineNumber = 0)
{
    // Typographic quote -> ASCII quote. Code-point escapes are used instead of
    // literal glyphs so the mapping survives copy/paste and font look-alikes.
    static $specialQuotesMap = [
        "\u{2019}" => "'", // right single quotation mark
        "\u{2018}" => "'", // left single quotation mark
        "\u{201C}" => '"', // left double quotation mark
        "\u{201D}" => '"', // right double quotation mark
        "\u{201E}" => '"', // double low-9 quotation mark
        "\u{201F}" => '"', // double high-reversed-9 quotation mark
    ];

    // Headings: getText() yields either a TextRun (recurse) or something castable to string.
    if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
        $titleContent = $element->getText();
        $titleText = $titleContent instanceof \PhpOffice\PhpWord\Element\TextRun
            ? $this->getTextFromElement($titleContent)
            : strtr((string)$titleContent, $specialQuotesMap);

        return $this->cleanText($titleText . ' ');
    }

    $text = '';

    // Numbered list item: emit the running counter, then fall through so the
    // generic container branch below collects the item's children.
    if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
        $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
        $this->iNum++;
        $text .= $this->iNum . ' ';
    }

    // PreserveText: read the "text" property through reflection.
    if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
        try {
            $reflection = new \ReflectionClass($element);
            if (!$reflection->hasProperty('text')) {
                return $this->cleanText($text);
            }
            $property = $reflection->getProperty('text');
            $property->setAccessible(true);
            // Force an array so a null or scalar value cannot break the foreach.
            $textParts = (array)($property->getValue($element) ?? []);
        } catch (\ReflectionException $e) {
            // Best effort: reflection failure yields whatever was collected so far.
            return $this->cleanText($text);
        }

        foreach ($textParts as $part) {
            $part = (string)$part;

            if (strpos($part, 'HYPERLINK') === false) {
                // Ordinary text fragment.
                $text .= strtr($part, $specialQuotesMap);
                continue;
            }

            // Field instruction: decode entities, then keep only the mailto address.
            $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
            if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
                $text .= $match[1] . ' ';
            }
        }

        return $this->cleanText($text);
    }

    // Table: concatenate every cell, separated by a space.
    if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
        foreach ($element->getRows() as $row) {
            foreach ($row->getCells() as $cell) {
                $text .= $this->getTextFromElement($cell) . ' ';
            }
        }

        return $this->cleanText($text);
    }

    // Cell: concatenate the text of its children.
    if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }

        return $this->cleanText($text);
    }

    // Any other container: recurse into children that are real elements.
    if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
        foreach ($element->getElements() as $child) {
            if ($child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
                $text .= $this->getTextFromElement($child);
            }
        }
    }

    // Plain text run.
    if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
        $text .= strtr((string)$element->getText(), $specialQuotesMap);
    }

    // Hyperlink: a mailto target is emitted first (address only), then the link label.
    if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
        $target = (string)$element->getTarget();
        if (strpos($target, 'mailto:') === 0) {
            // Strip only the leading scheme; str_replace() would also eat later occurrences.
            $text .= rtrim(substr($target, strlen('mailto:'))) . ' ';
        }
        $text .= strtr((string)$element->getText(), $specialQuotesMap) . ' ';
    }

    // Fields and notes. The accessor is looked up before it is called so that an
    // element lacking getContent() degrades gracefully instead of raising a fatal error.
    // NOTE(review): PHPWord's Field appears to expose getText() rather than
    // getContent() - confirm against the installed library version.
    if ($element instanceof \PhpOffice\PhpWord\Element\Field
        || $element instanceof \PhpOffice\PhpWord\Element\Note
    ) {
        $getter = null;
        foreach (['getContent', 'getText'] as $candidate) {
            if (method_exists($element, $candidate)) {
                $getter = $candidate;
                break;
            }
        }

        if ($getter !== null) {
            $content = $element->{$getter}();
            if ($content instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
                $text .= $this->getTextFromElement($content) . ' ';
            } elseif ($content === null || is_scalar($content)) {
                $text .= (string)$content . ' ';
            }
        }
    }

    return $this->cleanText($text);
}
|
||||
|
||||
/**
 * Normalise extracted text.
 *
 * 1. If the input is not valid UTF-8 it is converted to UTF-8, letting mbstring
 *    pick the source encoding from a fixed candidate list.
 * 2. Every run of control characters (U+0000-U+001F, U+007F-U+009F) and
 *    whitespace - including no-break space, ideographic space and narrow
 *    no-break space - is replaced by a single ASCII space.
 *
 * Leading and trailing spaces are collapsed but not removed, and ordinary
 * ASCII quotes are left untouched.
 *
 * @param string $text Text to clean.
 * @return string Cleaned text; the input is returned as far as it could be
 *                processed if conversion or matching fails.
 */
private function cleanText(string $text): string
{
    if ($text === '') {
        return '';
    }

    if (!mb_check_encoding($text, 'UTF-8')) {
        $converted = mb_convert_encoding(
            $text,
            'UTF-8',
            'GBK,GB2312,GB18030,Big5,ISO-8859-1,CP1252,UTF-16,UTF-32'
        );
        // Keep the original bytes if the conversion failed rather than
        // carrying false/garbage into the regex step below.
        if (is_string($converted) && mb_check_encoding($converted, 'UTF-8')) {
            $text = $converted;
        }
    }

    // One pass: control characters (\p{Cc}), Unicode separators (\p{Z}) and
    // ASCII whitespace (\s) collapse to a single space. The separators are
    // named by class so no invisible literal characters are needed in the source.
    $normalised = preg_replace('/[\p{Cc}\p{Z}\s]+/u', ' ', $text);

    if ($normalised === null) {
        // The /u pattern rejects malformed UTF-8; fall back to a byte-level
        // clean-up of ASCII control characters and spaces instead of losing the text.
        $normalised = preg_replace('/[\x00-\x20\x7F]+/', ' ', $text);
    }

    return $normalised === null ? $text : $normalised;
}
|
||||
// private function getTextFromElement($element, $lineNumber = 0){
|
||||
// // 初始化默认空字符串(保持原有逻辑)
|
||||
// $text = '';
|
||||
|
||||
// // 1. 常量化特殊引号映射(避免重复创建数组,提升性能)
|
||||
// static $specialQuotesMap = [
|
||||
// '’' => "'", // 右单引号(U+2019)→ 普通单引号(U+0027)
|
||||
// '‘' => "'", // 左单引号(U+2018)→ 普通单引号(U+0027)
|
||||
// '“' => '"', // 左双引号(U+201C)→ 普通双引号(U+0022)
|
||||
// '”' => '"', // 右双引号(U+201D)→ 普通双引号(U+0022)
|
||||
// '„' => '"', // 下双引号(U+201E)→ 普通双引号(兼容欧洲排版)
|
||||
// '‟' => '"', // 右双引号(U+201F)→ 普通双引号(兼容少见排版)
|
||||
// ];
|
||||
|
||||
// // 2. 提前校验元素合法性(避免后续 instanceof 无效判断,减少报错)
|
||||
// if (!is_object($element) || !$element instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
|
||||
// return $text;
|
||||
// }
|
||||
|
||||
// // 支持H1标题格式(逻辑不变,优化变量命名可读性)
|
||||
// if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
|
||||
// $titleContent = $element->getText();
|
||||
// $titleText = '';
|
||||
|
||||
// // 关键修复:判断返回类型,递归提取文本(逻辑不变)
|
||||
// if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
|
||||
// $titleText = $this->getTextFromElement($titleContent);
|
||||
// } else {
|
||||
// $titleText = strtr((string)$titleContent, $specialQuotesMap);
|
||||
// }
|
||||
|
||||
// $text .= $titleText . ' ';
|
||||
// return $text;
|
||||
// }
|
||||
|
||||
// // 项目编号(逻辑不变,优化空值判断为严格判断)
|
||||
// if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
|
||||
// $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
|
||||
// $this->iNum++;
|
||||
// $text .= $this->iNum . ' ';
|
||||
// }
|
||||
|
||||
// // 处理PreserveText元素(核心逻辑不变,增强容错性)
|
||||
// if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
|
||||
// try {
|
||||
// $reflection = new \ReflectionClass($element);
|
||||
// $property = $reflection->getProperty('text');
|
||||
// $property->setAccessible(true);
|
||||
// // 空值兜底,避免遍历非数组报错
|
||||
// $textParts = $property->getValue($element) ?? [];
|
||||
// } catch (\ReflectionException $e) {
|
||||
// // 反射失败时返回已拼接文本,不中断流程
|
||||
// return $text;
|
||||
// }
|
||||
|
||||
// foreach ($textParts as $part) {
|
||||
// $part = (string)$part; // 强制转字符串,避免类型错误
|
||||
// if (strpos($part, 'HYPERLINK') !== false) {
|
||||
// $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
|
||||
// // 邮箱正则不变,保持原有匹配逻辑
|
||||
// if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
|
||||
// $text .= $match[1] . ' ';
|
||||
// }
|
||||
// } else {
|
||||
// $text .= $part;
|
||||
// }
|
||||
// }
|
||||
// return $text;
|
||||
// }
|
||||
|
||||
// // 处理表格和单元格(逻辑不变,优化循环变量命名)
|
||||
// if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
|
||||
// foreach ($element->getRows() as $row) {
|
||||
// foreach ($row->getCells() as $cell) {
|
||||
// $text .= $this->getTextFromElement($cell);
|
||||
// }
|
||||
// }
|
||||
// return $text;
|
||||
// }
|
||||
|
||||
// if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
|
||||
// foreach ($element->getElements() as $child) {
|
||||
// $text .= $this->getTextFromElement($child);
|
||||
// }
|
||||
// return $text;
|
||||
// }
|
||||
|
||||
// // 处理嵌套元素(逻辑不变,增强方法存在性校验)
|
||||
// if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
|
||||
// foreach ($element->getElements() as $child) {
|
||||
// // 双重校验,避免非元素对象传入
|
||||
// if (is_object($child) && $child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
|
||||
// $textPart = $this->getTextFromElement($child);
|
||||
// $text .= $textPart;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// // 处理文本元素(逻辑不变,保持特殊引号替换)
|
||||
// if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
|
||||
// $textPart = (string)$element->getText(); // 强制转字符串,避免空值
|
||||
// $textPart = strtr($textPart, $specialQuotesMap);
|
||||
// $text .= $textPart;
|
||||
// }
|
||||
|
||||
// // 处理超链接(逻辑不变,优化变量类型转换)
|
||||
// if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
|
||||
// $target = (string)$element->getTarget();
|
||||
// if (strpos($target, 'mailto:') === 0) {
|
||||
// $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
|
||||
// }
|
||||
// $linkText = strtr((string)$element->getText(), $specialQuotesMap);
|
||||
// $text .= $linkText . ' ';
|
||||
// }
|
||||
|
||||
// // 处理字段和注释(逻辑不变,增加类型转换,避免非字符串拼接)
|
||||
// if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
|
||||
// $text .= (string)$element->getContent() . ' ';
|
||||
// }
|
||||
// if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
|
||||
// $text .= (string)$element->getContent() . ' ';
|
||||
// }
|
||||
|
||||
// // 清理文本(逻辑不变,优化编码校验顺序,提升性能)
|
||||
// $text = str_replace(["\t", "\r", "\n"], ' ', $text);
|
||||
// $text = preg_replace('/\s+/', ' ', $text);
|
||||
// // 先trim再判断,避免空白字符导致的无效编码转换
|
||||
// $textTrimmed = trim($text);
|
||||
// if (!empty($textTrimmed) && !mb_check_encoding($textTrimmed, 'UTF-8')) {
|
||||
// $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
|
||||
// }
|
||||
|
||||
// return $text;
|
||||
// }
|
||||
/**
|
||||
* 从 Word 文档提取摘要和关键词
|
||||
* @return array 提取结果
|
||||
@@ -950,221 +1157,217 @@ class ArticleParserService
|
||||
* @param int $maxDepth 最大解析深度
|
||||
* @return string
|
||||
*/
|
||||
private function fullDecode(?string $str, int $maxDepth = 2)
{
    // Repairs text extracted from Word documents: decodes \uXXXX escapes and
    // HTML entities, unifies exotic spaces, and restores the comparison symbols
    // (≤ ≥ ≠) from several known mangled forms. The pipeline is repeated until
    // the text stops changing or $maxDepth passes have run.

    // Null, blank input or a non-positive depth: nothing to decode.
    if ($str === null || trim($str) === '' || $maxDepth <= 0) {
        return $str === null ? '' : trim($str);
    }

    // Up-front Unicode-escape decoding by the class's own helper
    // (defined outside this block; presumably returns a string).
    $currentStr = (string)$this->decodeUnicode($str);

    // ---------- patterns ----------
    $regexps = [
        // Business-specific fixes.
        'ob0'          => '/0B\s*\\?0/',
        'dl18'         => '/DL\s*\\?\.18/',
        // Generic "question mark in front of a number".
        'qMarkNum'     => '/\\?(\d+)/',
        'qMarkDotNum'  => '/\\?(\.\d+)/',
        // Whitespace between symbol and number.
        'neNum'        => '/≠\s*(\d+)/u',
        'leNum'        => '/≤\s*(\d+)/u',
        // "?、?、?123" style triple.
        'mixSymbol'    => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
        // "LE?123" / "NE?123" markers (case-insensitive).
        'leNeMark'     => '/(LE|NE)\s*\\?(\d+)/i',
        // \uXXXX or \UXXXX escapes.
        'unicode'      => '/\\\\[uU]([0-9a-fA-F]{4})/',
        // Byte/hex residue. NOTE(review): inside a single-quoted PHP string
        // "\\x" reaches PCRE as "\x", so these alternatives match raw bytes, not
        // the literal text "\xE2..."; kept as found - confirm the intent.
        'wordBin'      => '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i',
        // Broken numeric entities such as "&# x2264 ;".
        'wordEntity'   => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i',
        // ASCII control bytes. Deliberately without /u: this runs before the
        // encoding is corrected, and a /u pattern fails on non-UTF-8 input.
        'controlChar'  => '/[\x00-\x1F\x7F]/',
        // Runs of the same symbol.
        'repeatSymbol' => '/(≤|≥|≠)\1+/u',
        // Raw GBK byte pairs.
        'gbkSymbol'    => '/\xA1\xF2|\xA1\xF3|\xA1\xF0/',
    ];

    // ---------- replacement tables ----------
    $maps = [
        // NOTE(review): in the reviewed diff rendering these keys appeared
        // already entity-decoded, so the entity spellings below are
        // reconstructed - verify them against the repository.
        'htmlEntity' => [
            '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', '&#X2264;' => '≤',
            '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', '&#X2265;' => '≥',
            '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '&#X2260;' => '≠',
            // Entities that lost their semicolon.
            '&le' => '≤', '&ge' => '≥', '&ne' => '≠',
            '&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠',
            '&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠',
            // Business-specific mapping (retained from the original).
            '&lt;' => '≤', '&gt;' => '≥',
        ],
        // Special spaces, written as code-point escapes. chr() only produces a
        // single byte (its argument wraps modulo 256), so chr(0x202F) is "/" and
        // chr(0x2002) is a control byte - not the intended characters. A bare
        // 0xA0 byte is not listed either: in UTF-8 it is a continuation byte of
        // characters such as "à", and replacing it corrupts them.
        'nbsp' => [
            "\u{00A0}", // no-break space
            "\u{3000}", // ideographic space
            "\u{2002}", // en space
            "\u{2003}", // em space
            "\u{2004}", // three-per-em space
            "\u{2005}", // four-per-em space
            "\u{202F}", // narrow no-break space
        ],
        'wordBin' => [
            'e28986' => '≤',
            '\xe2\x89\x86' => '≤',
            '\xe20x890x86' => '≤',
            'e28987' => '≥',
            '\xe2\x89\x87' => '≥',
            '\xe20x890x87' => '≥',
            'e28980' => '≠',
            '\xe2\x89\x80' => '≠',
            '\xe20x890x80' => '≠',
        ],
        'wordEntity' => [
            '2264' => '≤',
            '2265' => '≥',
            '2260' => '≠',
        ],
        'gbkSymbol' => [
            "\xA1\xF2" => '≤',
            "\xA1\xF3" => '≥',
            "\xA1\xF0" => '≠',
        ],
    ];

    // preg_replace()/preg_replace_callback() return null on error; keep the
    // subject unchanged in that case instead of wiping the text.
    $replace = static function (string $pattern, $replacement, string $subject): string {
        $result = is_string($replacement)
            ? preg_replace($pattern, $replacement, $subject)
            : preg_replace_callback($pattern, $replacement, $subject);

        return $result === null ? $subject : $result;
    };

    // \uXXXX -> character. Control code points and values mb_chr() rejects
    // (e.g. lone surrogates) are left as the original escape text.
    $unicodeCallback = static function (array $m): string {
        $code = hexdec($m[1]);
        if ($code < 0x20) {
            return $m[0];
        }
        $char = mb_chr($code, 'UTF-8');

        return $char === false ? $m[0] : $char;
    };

    // Only the matched token is normalised (lower-cased, whitespace removed)
    // for the lookup; the rest of the string keeps its spaces.
    $wordBinCallback = static function (array $m) use ($maps): string {
        $key = strtolower((string)preg_replace('/\s+/', '', $m[0]));

        return $maps['wordBin'][$key] ?? $m[0];
    };

    $wordEntityCallback = static function (array $m) use ($maps): string {
        return $maps['wordEntity'][$m[1]] ?? $m[0];
    };

    $leNeCallback = static function (array $m): string {
        return (strtoupper($m[1]) === 'LE' ? '≤' : '≠') . $m[2];
    };

    $depth = 0;
    do {
        $depth++;
        $prevStr = $currentStr;

        // --- pre-processing ---
        // 1. Drop ASCII control bytes (this includes tab, CR and LF).
        $currentStr = $replace($regexps['controlChar'], '', $currentStr);

        // 2. Convert to UTF-8 when needed; keep the input if conversion fails.
        if (!mb_check_encoding($currentStr, 'UTF-8')) {
            $converted = mb_convert_encoding(
                $currentStr,
                'UTF-8',
                'GBK,GB2312,ISO-8859-1,CP1252'
            );
            if (is_string($converted) && mb_check_encoding($converted, 'UTF-8')) {
                $currentStr = $converted;
            }
        }

        // --- core decoding ---
        // 1. Unicode escapes.
        $currentStr = $replace($regexps['unicode'], $unicodeCallback, $currentStr);

        // 2. Explicit entity table first, then the generic decoder for the rest.
        $currentStr = strtr($currentStr, $maps['htmlEntity']);
        $currentStr = html_entity_decode(
            $currentStr,
            ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
            'UTF-8'
        );

        // 3. Unify special spaces.
        $currentStr = str_replace($maps['nbsp'], ' ', $currentStr);

        // --- Word symbol repairs ---
        $currentStr = $replace($regexps['wordBin'], $wordBinCallback, $currentStr);
        $currentStr = $replace($regexps['wordEntity'], $wordEntityCallback, $currentStr);

        // Raw GBK byte pairs are only meaningful while the text is not valid
        // UTF-8. In valid UTF-8 such a pair can only straddle two characters
        // (e.g. "á" followed by an emoji), and replacing it would corrupt both.
        if (!mb_check_encoding($currentStr, 'UTF-8')) {
            $currentStr = strtr($currentStr, $maps['gbkSymbol']);
        }

        $currentStr = $replace($regexps['repeatSymbol'], '$1', $currentStr);

        // --- business rules ---
        $currentStr = $replace($regexps['ob0'], '0B≥30', $currentStr);
        $currentStr = $replace($regexps['dl18'], 'DL≥0.18', $currentStr);
        $currentStr = $replace($regexps['neNum'], '≠$1', $currentStr);
        $currentStr = $replace($regexps['leNum'], '≤$1', $currentStr);

        // The specific rules must run before the generic "?N -> ≥N" rules;
        // otherwise the generic rule consumes the "?" and these never match.
        $currentStr = $replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr);
        $currentStr = $replace($regexps['leNeMark'], $leNeCallback, $currentStr);

        $currentStr = $replace($regexps['qMarkNum'], '≥$1', $currentStr);
        $currentStr = $replace($regexps['qMarkDotNum'], '≥0$1', $currentStr);

        $hasChange = ($currentStr !== $prevStr);
    } while ($hasChange && $depth < $maxDepth);

    // Final clean-up, always applied once after the loop: strip leading and
    // trailing colons and run the entity table a last time.
    $currentStr = trim($currentStr, ':');

    return strtr($currentStr, $maps['htmlEntity']);
}
|
||||
|
||||
// private function fullDecode($str, $maxDepth = 5) {
|
||||
|
||||
Reference in New Issue
Block a user