', ',', $superscript); // 恢复多编号逗号
-// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾多余逗号
-// if (!empty($name)) {
-// $authorList[] = [
-// 'name' => $name,
-// 'superscript' => $superscript
-// ];
-// }
-// }
-// }
-
-// // 输出结果
-// echo "";
-// print_r($authorList);
-// echo "
";
-// exit;
-
-// // 处理作者
-// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
-// $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
-// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
-// $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分)
-
-// //标记上标内的逗号+空格(多编号)
-// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content);
-// // 原有步骤2:正则匹配(扩展上标符号支持,保持原有逻辑)
-// $pattern = '/
-// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格)
-// \s* # 姓名与上标间空格
-// ( # 上标组(扩展符号支持)
-// \d+ # 起始数字
-// (?:[†#*,]|\d+)* # 允许:†#*符号、逗号、+数字(兼容1,†、1,*等)
-// )
-// \s*,? # 作者间逗号(可选)
-// (?=\s|$) # 确保后面是空格或结尾
-// /ux';
-
-// preg_match_all($pattern, $tempStr, $matches);
-// var_dump($matches);exit;
-// $authorList = [];
-// if(!empty($matches[1])){
-// foreach ($matches[1] as $i => $name) {
-// $name = trim($name);
-// $superscript = trim($matches[2][$i]);
-// $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号
-// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
-// // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样)
-// $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
-// if (!empty($name)) {
-// $authorList[] = [
-// 'name' => $name,
-// 'superscript' => $superscript
-// ];
-// }
-// }
-// }else {
-// // 按“两个或多个连续空格”拆分(姓名之间的分隔)
-// $authorList = array_filter(
-// array_map('trim',
-// preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
-// )
-// );
-// }
-
-
-// // //处理作者
-// $aAuthorData = [];
-// $aReport = [];
-// $namePattern = '/
-// (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符)
-// [\x{4e00}-\x{9fa5}]+| # 中文姓名
-// [\x{1800}-\x{18AF}]+| # 蒙古文姓名
-// [A-Z]\.) # 单字母缩写(如 J.)
-// /ux';
-
-// foreach ($authorList as $authorStr){
-// if (empty($authorStr)) continue;
-
-// //获取下标
-// $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript'];
-// $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name'];
-
-// $companyId = [];
-// $isSuper = 0;
-// $isReport = 0;
-// if (!empty($superscript)) {
-// // 提取机构编号(忽略上标中的逗号,如1,† → 提取1)
-// preg_match_all('/\d+/', $superscript, $numMatch);
-// // 识别特殊符号(#为超级作者,*†为通讯作者)
-// $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
-// $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
-// }
-// if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
-// $nameStr = trim($match[1]);
-// }
-// $aAuthorData[] = [
-// 'name' => $nameStr,
-// 'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
-// 'is_super' => $isSuper,
-// 'is_report' => $isReport
-// ];
-// if ($isReport) {
-// $aReport[] = $nameStr;
-// }
-// }
-// return ['author' => $aAuthorData,'report' => array_unique($aReport)];
-// }
// 获取机构
private function getCompany($aParam = []){
@@ -815,16 +476,39 @@ class ArticleParserService
$currentNumber = null; // 当前序号
foreach ($allLines as $line) {
$line = trim($line);
- if (empty($line)) continue;
-
- // 判断是否是新条目的开头:行首为数字(后续可接任意字符或直接接内容)
+ if (empty($line)) {
+ continue;
+ }
+ if(!mb_check_encoding($line, 'UTF-8')){
+ $line = mb_convert_encoding($line, 'UTF-8', 'GBK');
+ }
+ $line = $this->fullDecode($line);
$number = '';
$i = 0;
$lineLen = strlen($line);
// 提取行首的连续数字(作为序号)
- while ($i < $lineLen && ctype_digit($line[$i])) {
- $number .= $line[$i];
- $i++;
+ $hasFirstChar = false;
+ while ($i < $lineLen) {
+ $currentChar = $line[$i];
+ // 首字符处理:允许 26个字母(大小写)或数字
+ if (!$hasFirstChar) {
+ if (ctype_digit($currentChar) || ctype_alpha($currentChar)) {
+ $number .= $currentChar;
+ $hasFirstChar = true;
+ $i++;
+ } else {
+ // 首字符不符合(非字母/数字),终止循环
+ break;
+ }
+ } else {
+ // 后续字符必须是数字(保持原逻辑)
+ if (ctype_digit($currentChar)) {
+ $number .= $currentChar;
+ $i++;
+ } else {
+ break;
+ }
+ }
}
// 若行首有数字,则视为新条目
@@ -840,34 +524,36 @@ class ArticleParserService
continue;
}
- // 非新条目,合并到当前序号的内容中
- if ($currentNumber !== null) {
- $grouped[$currentNumber] .= ' ' . $line;
- }
+ // // 非新条目,合并到当前序号的内容中
+ // if ($currentNumber !== null) {
+ // $grouped[$currentNumber] .= ' ' . $line;
+ // }
}
- //清理结果
- $possibleEncodings = [
- 'Windows-1252', 'UTF-8', 'GBK', 'GB2312',
- 'Latin-1', 'ISO-8859-1', 'CP1252'
- ];
$aCompany = [];
foreach ($grouped as $number => $institution) {
- $encodedContent = @mb_convert_encoding($institution, 'UTF-8', implode(',', $possibleEncodings));
- $sCompany = $encodedContent ?: $sCompany;
+ $institution = $this->fullDecode($institution);
+ // 原有基础清理逻辑不变
$institution = preg_replace('/\s+/', ' ', $institution); // 合并多余空格
- $institution = rtrim($institution, '.');
- $institution = preg_replace('/^\d+\s+/', '', $institution);
+ $institution = rtrim($institution, '.'); // 去除末尾句号
+ $institution = preg_replace('/^\d+\s+/', '', $institution); // 去除开头数字
$institution = trim($institution); // 清理首尾空格
- preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);;
- $institution = trim($institutionmatches[1] ?? $institution);
- if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) {
- $institution = trim($matches[1]); // trim() 去除内容前后多余空格
+
+ // 增强地址提取:匹配"机构名, 城市 邮编, 国家"格式(兼容更多变体)
+ // 允许地址中包含多个逗号(如子机构、街道信息),最终以"城市 邮编, 国家"结尾
+ // preg_match('/(.*?, [A-Za-z\s]+ \d+, [A-Za-z\s]+)/', $institution, $institutionmatches);
+ // $institution = trim($institutionmatches[1] ?? $institution);
+ // 强化冗余信息过滤:去除"*"及之后的内容(包括通讯作者、邮箱等)
+ // 新增对"#"、"†"等标记的过滤,兼容更多期刊格式
+ if (preg_match('/^(.*?)(?=\s*[\*#†]|(?i)\s*Email)/', $institution, $matches)) {
+ $institution = trim($matches[1]);
}
- if(!empty($institution) && !mb_check_encoding($institution, 'UTF-8')){
+
+ // 编码校验不变
+ if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) {
$institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
}
- $aCompany[$number] = $institution;
+ $aCompany[$number] = empty($institution) ? '' : trim(trim($institution),'.');
}
return $aCompany;
}
@@ -891,13 +577,11 @@ class ArticleParserService
// 获取机构后的完整内容
$corrText = $this->getContentAfterText($sCompany);
- //编码修复
- $possibleEncodings = [
- 'Windows-1252', 'UTF-8', 'GBK', 'GB2312',
- 'Latin-1', 'ISO-8859-1', 'CP1252'
- ];
- $encodedContent = @mb_convert_encoding($corrText, 'UTF-8', implode(',', $possibleEncodings));
- $corrText = $encodedContent ?: $corrText;
+ if(!mb_check_encoding($corrText, 'UTF-8')){
+ $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
+ }
+ $corrText = $this->fullDecode($corrText);
+
// // 调试
// file_put_contents(ROOT_PATH . 'runtime/corr_text_raw.log', $corrText);
@@ -922,23 +606,35 @@ class ArticleParserService
$aCorresponding[] = [
'name' => $sName,
'email' => isset($email[2]) ? trim($email[2]) : '',
- 'postal_address' => isset($address[2]) ? trim($address[2]) : '',
+ 'postal_address' => isset($address[2]) ? trim(trim($address[2]),'.') : '',
'tel' => isset($tel[2]) ? trim($tel[2]) : ''
];
}
if(empty($aCorresponding)){
- $pattern = '/Corresponding Authors: (.*?)(?=$|;)/s';
+ // $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s';
+ $pattern = '/(Corresponding Authors|Correspondence to|Correspondence)\s*:\s*([\s\S]+?)(?=\n\s*\n|$|;)/is';
+ $corrText = trim($corrText,'*');
preg_match($pattern, $corrText, $match);
- if (!empty($match[1])) {
- $corrContent = $match[1];
+ if (!empty($match[2])) {
+ $corrContent = $match[2];
// 提取每个作者的名称和邮箱(优化正则,支持更多字符)
$authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/';
preg_match_all($authorPattern, $corrContent, $authors);
if(!empty($authors[1])){
for ($i = 0; $i < count($authors[1]); $i++) {
$aCorresponding[] = [
- 'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]),
- 'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i])
+ 'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'),
+ 'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.')
+ ];
+ }
+ }
+ if(empty($authors[1])){
+ $authorPattern = '/([A-Za-z0-9\s]+?),\s*([\w@\.\-]+)(?=\.?)/';
+ preg_match_all($authorPattern, $corrContent, $authors);
+ for ($i = 0; $i < count($authors[1]); $i++) {
+ $aCorresponding[] = [
+ 'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'),
+ 'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.')
];
}
}
@@ -1040,106 +736,379 @@ class ArticleParserService
}
// 统一提取元素文本
- private function getTextFromElement($element,$lineNumber = 0){
+ private function getTextFromElement(\PhpOffice\PhpWord\Element\AbstractElement $element, int $lineNumber = 0){
$text = '';
- // 处理PreserveText元素
+ // Collects the text of one PhpWord element, recursing into containers; every return path goes through cleanText().
+ // Typographic quotes and their ASCII replacements; declared static so the array is built once.
+ static $specialQuotesMap = [
+ '’' => "'", // right single quote (U+2019) -> apostrophe (U+0027)
+ '‘' => "'", // left single quote (U+2018) -> apostrophe (U+0027)
+ '“' => '"', // left double quote (U+201C) -> straight double quote (U+0022)
+ '”' => '"', // right double quote (U+201D) -> straight double quote (U+0022)
+ '„' => '"', // low double quote (U+201E) -> straight double quote
+ '‟' => '"', // reversed double quote (U+201F) -> straight double quote
+ ];
+ // NOTE(review): $lineNumber is not read anywhere in the visible body - confirm it is still needed.
+ // Title elements: getText() yields either a TextRun (handled recursively) or a value cast to string.
+ if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
+ $titleContent = $element->getText();
+ $titleText = '';
+
+ if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
+ $titleText = $this->getTextFromElement($titleContent);
+ } else {
+ $titleText = strtr((string)$titleContent, $specialQuotesMap);
+ }
+
+ $text .= $titleText . ' ';
+ return $this->cleanText($text);
+ }
+
+ // List items are prefixed with a running counter kept in $this->iNum (starts at 0 when unset or non-numeric).
+ if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
+ $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
+ $this->iNum++;
+ $text .= $this->iNum . ' ';
+ }
+
+ // PreserveText: the "text" property is read through reflection; HYPERLINK parts contribute only their mailto address.
if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
- // 通过反射获取私有属性 text
- $reflection = new \ReflectionClass($element);
- $property = $reflection->getProperty('text');
- $property->setAccessible(true);
- $textParts = $property->getValue($element);
+ try {
+ $reflection = new \ReflectionClass($element);
+ // Without a "text" property there is nothing to read: return what has been collected so far.
+ if (!$reflection->hasProperty('text')) {
+ return $this->cleanText($text);
+ }
+ $property = $reflection->getProperty('text');
+ $property->setAccessible(true);
+ $textParts = $property->getValue($element) ?? [];
+ } catch (\ReflectionException $e) {
+ return $this->cleanText($text);
+ }
+
foreach ($textParts as $part) {
+ $part = (string)$part;
if (strpos($part, 'HYPERLINK') !== false) {
- // 解码 HTML 实体(" -> ")
- $decoded = html_entity_decode($part);
- // 提取 mailto: 后的邮箱
- if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
+ $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
+ // Capture the address that follows "mailto:"; the top-level domain must be 2 to 10 letters.
+ if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
$text .= $match[1] . ' ';
}
} else {
- // 普通文本直接拼接
+ $part = strtr($part, $specialQuotesMap);
+ $text .= $part;
}
}
- return $text;
+ return $this->cleanText($text);
}
- // 处理表格和单元格(E-mail可能在表格中)
+
+ // Tables: append the text of every cell, each followed by a space.
if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
foreach ($element->getRows() as $row) {
foreach ($row->getCells() as $cell) {
- $text .= $this->getTextFromElement($cell);
+ $text .= $this->getTextFromElement($cell) . ' ';
}
+ // No separator is added per row; cleanText() merges the repeated spaces.
}
- return $text;
+ return $this->cleanText($text);
}
+
+ // Cells: recurse into every child element.
if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
foreach ($element->getElements() as $child) {
$text .= $this->getTextFromElement($child);
}
- return $text;
+ return $this->cleanText($text);
}
- //处理嵌套元素(递归提取所有子元素)
- if (method_exists($element, 'getElements')) {
+ // Any other element exposing getElements(): recurse into the children that are PhpWord elements.
+ if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
foreach ($element->getElements() as $child) {
- $text .= $this->getTextFromElement($child);
+ if ($child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
+ $text .= $this->getTextFromElement($child);
+ }
}
}
- //处理文本元素(包括带格式的文本)
+ // Text elements: append the text with typographic quotes replaced.
if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
- $text .= $element->getText();
+ $textPart = (string)$element->getText(); // cast to string before the quote replacement
+ $textPart = strtr($textPart, $specialQuotesMap);
+ $text .= $textPart;
}
- //处理超链接(优先提取链接目标,可能是邮箱)
+ // Links: a target starting with "mailto:" is appended (prefix removed) before the link text.
if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
- $target = $element->getTarget();
+ $target = (string)$element->getTarget();
if (strpos($target, 'mailto:') === 0) {
- $text .= str_replace('mailto:', '', $target) . ' '; // 剥离mailto:前缀
+ $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
}
- $text .= $element->getText() . ' ';
+ $linkText = strtr((string)$element->getText(), $specialQuotesMap);
+ $text .= $linkText . ' ';
}
- //处理字段和注释(可能包含隐藏邮箱)
+ // Fields and notes: append getContent() cast to string, followed by a space.
if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
- $text .= $element->getContent() . ' ';
+ $text .= (string)$element->getContent() . ' ';
}
if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
- $text .= $element->getContent() . ' ';
+ $text .= (string)$element->getContent() . ' ';
}
- //清理所有不可见字符(关键:移除格式干扰)
- $text = preg_replace('/[\x00-\x1F\x7F-\x9F]/', ' ', $text); // 移除控制字符
- $text = str_replace(["\t", "\r", "\n"], ' ', $text); // 统一空白字符
- $text = preg_replace('/\s+/', ' ', $text); // 合并多个空格
- if(!empty($text) && !mb_check_encoding($text, 'UTF-8')){
- $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
- }
- return $text;
+
+ return $this->cleanText($text);
+ }
+    /**
+     * Normalise extracted text: force UTF-8, blank out control characters and
+     * collapse every run of whitespace into a single ASCII space.
+     *
+     * @param string $text Raw text to clean.
+     * @return string Cleaned text (not trimmed: a leading or trailing run becomes one space).
+     */
+    private function cleanText(string $text): string
+    {
+        // Non-UTF-8 input is converted; mbstring picks the source encoding from this list.
+        if (!mb_check_encoding($text, 'UTF-8')) {
+            $converted = mb_convert_encoding(
+                $text,
+                'UTF-8',
+                'GBK,GB2312,GB18030,Big5,ISO-8859-1,CP1252,UTF-16,UTF-32'
+            );
+            $text = is_string($converted) ? $converted : $text; // keep the input if conversion fails
+        }
+        // C0/C1 control characters become spaces. preg_replace() returns null on a PCRE
+        // error (e.g. malformed UTF-8); fall back to the previous value instead of null.
+        $text = preg_replace('/[\x00-\x1F\x7F-\x9F]/u', ' ', $text) ?? $text;
+        // Exotic spaces are written as byte escapes so the list survives editors and
+        // re-encoding: NBSP (U+00A0), ideographic space (U+3000), narrow NBSP (U+202F).
+        $text = str_replace(
+            ["\t", "\r", "\n", "\xC2\xA0", "\xE3\x80\x80", "\xE2\x80\xAF"],
+            ' ',
+            $text
+        );
+        // Merge consecutive whitespace into one space.
+        return preg_replace('/\s+/u', ' ', $text) ?? $text;
+    }
+ // private function getTextFromElement($element, $lineNumber = 0){
+ // // 初始化默认空字符串(保持原有逻辑)
+ // $text = '';
+
+ // // 1. 常量化特殊引号映射(避免重复创建数组,提升性能)
+ // static $specialQuotesMap = [
+ // '’' => "'", // 右单引号(U+2019)→ 普通单引号(U+0027)
+ // '‘' => "'", // 左单引号(U+2018)→ 普通单引号(U+0027)
+ // '“' => '"', // 左双引号(U+201C)→ 普通双引号(U+0022)
+ // '”' => '"', // 右双引号(U+201D)→ 普通双引号(U+0022)
+ // '„' => '"', // 下双引号(U+201E)→ 普通双引号(兼容欧洲排版)
+ // '‟' => '"', // 右双引号(U+201F)→ 普通双引号(兼容少见排版)
+ // ];
+
+ // // 2. 提前校验元素合法性(避免后续 instanceof 无效判断,减少报错)
+ // if (!is_object($element) || !$element instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
+ // return $text;
+ // }
+
+ // // 支持H1标题格式(逻辑不变,优化变量命名可读性)
+ // if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
+ // $titleContent = $element->getText();
+ // $titleText = '';
+
+ // // 关键修复:判断返回类型,递归提取文本(逻辑不变)
+ // if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
+ // $titleText = $this->getTextFromElement($titleContent);
+ // } else {
+ // $titleText = strtr((string)$titleContent, $specialQuotesMap);
+ // }
+
+ // $text .= $titleText . ' ';
+ // return $text;
+ // }
+
+ // // 项目编号(逻辑不变,优化空值判断为严格判断)
+ // if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
+ // $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
+ // $this->iNum++;
+ // $text .= $this->iNum . ' ';
+ // }
+
+ // // 处理PreserveText元素(核心逻辑不变,增强容错性)
+ // if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
+ // try {
+ // $reflection = new \ReflectionClass($element);
+ // $property = $reflection->getProperty('text');
+ // $property->setAccessible(true);
+ // // 空值兜底,避免遍历非数组报错
+ // $textParts = $property->getValue($element) ?? [];
+ // } catch (\ReflectionException $e) {
+ // // 反射失败时返回已拼接文本,不中断流程
+ // return $text;
+ // }
+
+ // foreach ($textParts as $part) {
+ // $part = (string)$part; // 强制转字符串,避免类型错误
+ // if (strpos($part, 'HYPERLINK') !== false) {
+ // $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
+ // // 邮箱正则不变,保持原有匹配逻辑
+ // if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
+ // $text .= $match[1] . ' ';
+ // }
+ // } else {
+ // $text .= $part;
+ // }
+ // }
+ // return $text;
+ // }
+
+ // // 处理表格和单元格(逻辑不变,优化循环变量命名)
+ // if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
+ // foreach ($element->getRows() as $row) {
+ // foreach ($row->getCells() as $cell) {
+ // $text .= $this->getTextFromElement($cell);
+ // }
+ // }
+ // return $text;
+ // }
+
+ // if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
+ // foreach ($element->getElements() as $child) {
+ // $text .= $this->getTextFromElement($child);
+ // }
+ // return $text;
+ // }
+
+ // // 处理嵌套元素(逻辑不变,增强方法存在性校验)
+ // if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
+ // foreach ($element->getElements() as $child) {
+ // // 双重校验,避免非元素对象传入
+ // if (is_object($child) && $child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
+ // $textPart = $this->getTextFromElement($child);
+ // $text .= $textPart;
+ // }
+ // }
+ // }
+
+ // // 处理文本元素(逻辑不变,保持特殊引号替换)
+ // if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
+ // $textPart = (string)$element->getText(); // 强制转字符串,避免空值
+ // $textPart = strtr($textPart, $specialQuotesMap);
+ // $text .= $textPart;
+ // }
+
+ // // 处理超链接(逻辑不变,优化变量类型转换)
+ // if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
+ // $target = (string)$element->getTarget();
+ // if (strpos($target, 'mailto:') === 0) {
+ // $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
+ // }
+ // $linkText = strtr((string)$element->getText(), $specialQuotesMap);
+ // $text .= $linkText . ' ';
+ // }
+
+ // // 处理字段和注释(逻辑不变,增加类型转换,避免非字符串拼接)
+ // if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
+ // $text .= (string)$element->getContent() . ' ';
+ // }
+ // if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
+ // $text .= (string)$element->getContent() . ' ';
+ // }
+
+ // // 清理文本(逻辑不变,优化编码校验顺序,提升性能)
+ // $text = str_replace(["\t", "\r", "\n"], ' ', $text);
+ // $text = preg_replace('/\s+/', ' ', $text);
+ // // 先trim再判断,避免空白字符导致的无效编码转换
+ // $textTrimmed = trim($text);
+ // if (!empty($textTrimmed) && !mb_check_encoding($textTrimmed, 'UTF-8')) {
+ // $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
+ // }
+
+ // return $text;
+ // }
/**
* 从 Word 文档提取摘要和关键词
* @return array 提取结果
*/
+    /**
+     * Slice out the text between the abstract marker and the keywords marker, and
+     * between the keywords marker and the next end-of-element marker.
+     *
+     * @param string $str     Text to search.
+     * @param array  $markers Optional overrides for 'abstract', 'keywords', 'end_span'.
+     * @return array Keys: abstract_to_keywords, keywords_to_end, positions
+     *               (byte offsets, -1 when not found), is_valid, error.
+     */
+    public function extractContentIntervals($str, $markers = [])
+    {
+        // Read markers by key instead of extract(): extract() would let a stray key
+        // such as 'str' or 'result' overwrite this function's own variables.
+        $markers = array_merge([
+            'abstract' => 'abstract',
+            'keywords' => 'keywords',
+            'end_span' => '===========end-span',
+        ], $markers);
+        $abstract = (string)$markers['abstract'];
+        $keywords = (string)$markers['keywords'];
+        $end_span = (string)$markers['end_span'];
+
+        $result = [
+            'abstract_to_keywords' => '',
+            'keywords_to_end' => '',
+            'positions' => ['abstract' => -1, 'keywords' => -1, 'end_span' => -1],
+            'is_valid' => false,
+            'error' => '',
+        ];
+
+        // Abstract marker: first case-insensitive occurrence.
+        $absPos = stripos($str, $abstract);
+        if ($absPos === false) {
+            $result['error'] = "未找到标记: {$abstract}";
+            return $result;
+        }
+        $result['positions']['abstract'] = $absPos;
+        $absEndPos = $absPos + strlen($abstract);
+
+        // Keywords marker: searched only after the abstract marker, case-insensitive.
+        $keyPos = stripos($str, $keywords, $absEndPos);
+        if ($keyPos === false) {
+            $result['error'] = "未找到 {$keywords} 或在 {$abstract} 之前";
+            return $result;
+        }
+        $result['positions']['keywords'] = $keyPos;
+        $keyEndPos = $keyPos + strlen($keywords);
+
+        // End marker: searched only after the keywords marker, case-sensitive.
+        $endPos = strpos($str, $end_span, $keyEndPos);
+        if ($endPos === false) {
+            $result['error'] = "未找到 {$end_span} 或在 {$keywords} 之前";
+            return $result;
+        }
+        $result['positions']['end_span'] = $endPos;
+
+        // Strip the separator that follows a marker ("Abstract: ...", "Keywords - ...").
+        // A /u regex is used because ltrim() works on bytes: with a multi-byte em dash
+        // in its mask it could cut a following multi-byte character in half.
+        $stripLead = static function ($part) {
+            $cleaned = preg_replace('/^[: \-\x{2014}]+/u', '', trim($part));
+            return trim($cleaned ?? ltrim(trim($part), ': -'));
+        };
+        $result['abstract_to_keywords'] = $stripLead(substr($str, $absEndPos, $keyPos - $absEndPos));
+        $result['keywords_to_end'] = $stripLead(substr($str, $keyEndPos, $endPos - $keyEndPos));
+
+        $result['is_valid'] = true;
+        return $result;
+    }
public function extractFromWord() {
$sContent = '';
//文本处理
$sFundContent = '';
+ $aContent = [];
foreach ($this->sections as $section) {
foreach ($section->getElements() as $element) {
$textContent = $this->getTextFromElement($element);
if(empty($textContent)){
continue;
}
- //编码修复
- $possibleEncodings = [
- 'Windows-1252', 'UTF-8', 'GBK', 'GB2312',
- 'Latin-1', 'ISO-8859-1', 'CP1252'
- ];
- $sContent .= @mb_convert_encoding($textContent, 'UTF-8', implode(',', $possibleEncodings));
- if(stripos($textContent, 'Keywords:') !== false){
- $sContent .= "Keywords-End-Flag";
+ if(!empty($textContent) && !mb_check_encoding($textContent, 'UTF-8')){
+ $textContent = mb_convert_encoding($textContent, 'UTF-8', 'GBK');
}
if(empty($sFundContent)){
$aFund = $this->getMatchedFundPhrases($sContent);
@@ -1152,69 +1121,348 @@ class ArticleParserService
}
}
}
- $sContent .= "\n";
+ $sContent .= $textContent."===========end-span";
}
}
-
if(!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')){
$sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
}
- // 2. 基础文本清理(合并多余空格,保留有效换行)
- $textContent = preg_replace('/(\S)\s+/', '$1 ', $sContent);
- $textContent = trim($textContent);
-
+ $result = $this->extractContentIntervals($sContent);
// 3. 提取摘要
- $abstract = '';
- $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords|$)/i';
- if (preg_match($abstractPattern, $textContent, $abstractMatches)) {
- $abstract = trim($abstractMatches[1]);
- $abstract = preg_replace('/\n+/', ' ', $abstract);
+ $abstract = empty($result['abstract_to_keywords']) ? '' : $result['abstract_to_keywords'];
+ if(!empty($abstract) && !mb_check_encoding($abstract, 'UTF-8')){
+ $abstract = mb_convert_encoding($abstract, 'UTF-8', 'GBK');
}
- // 4. 提取关键词(核心:仅保留两种强制匹配逻辑)
- $keywords = [];
- // $keywordPattern = '/Keywords:\s*([\s\S]*?)(?=\s*\d+\.|[;,]\s*[\r\n]+\s*[\r\n]+|(?i)\bintroduction|abbreviations\b|$)/i';
- $keywordPattern = '/Keywords\s*(.*?)\s*Keywords-End-Flag/s';
-
- if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
- $keywordStr = trim($keywordMatches[1]);
-
- // 清理关键词列表格式(去除换行、末尾多余符号)
- $keywordStr = preg_replace('/\n+/', ' ', $keywordStr);
- $keywordStr = rtrim($keywordStr, ';,. '); // 去除末尾分号、逗号等
- $keywordStr = trim($keywordStr);
-
- // 分割并过滤有效关键词
- $keywords = preg_split('/[,;]\s*/', $keywordStr);
- $keywords = array_filter(array_map('trim', $keywords), function($item) {
- return !empty($item) && !ctype_space($item);
- });
+ $keywords = empty($result['keywords_to_end']) ? '' : $result['keywords_to_end'];
+ if(!empty($keywords) && !mb_check_encoding($keywords, 'UTF-8')){
+ $keywords = mb_convert_encoding($keywords, 'UTF-8', 'GBK');
}
- if(empty($keywords)){
- $keywordPattern = '/Keywords\s*([\s\S]*?)(?=Introduction|$)/i';
- if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
- $keywordStr = trim($keywordMatches[1]);
- // 清理关键词列表格式(去除换行、末尾多余符号)
- $keywordStr = preg_replace('/\n+/', ' ', $keywordStr);
- $keywordStr = rtrim($keywordStr, ';,. '); // 去除末尾分号、逗号等
- $keywordStr = trim($keywordStr);
-
- // 分割并过滤有效关键词
- $keywords = preg_split('/[,;]\s*/', $keywordStr);
- $keywords = array_filter(array_map('trim', $keywords), function($item) {
- return !empty($item) && !ctype_space($item);
- });
- }
+ if(!empty($sFundContent) && !mb_check_encoding($sFundContent, 'UTF-8')){
+ $sFundContent = mb_convert_encoding($sFundContent, 'UTF-8', 'GBK');
}
+
return [
'status' => 1,
'msg' => '提取成功',
'data' => [
- 'abstrart' => $abstract,
- 'keywords' => $keywords,
- 'fund' => $sFundContent
+ 'abstrart' => empty($abstract) ? '' : $this->fullDecode(str_replace('===========end-span', '',$abstract)),
+ 'keywords' => empty($keywords) ? '' : $this->fullDecode(str_replace('===========end-span', '',$keywords)),
+ 'fund' => empty($sFundContent) ? '' : $this->fullDecode(str_replace('===========end-span', '',$sFundContent))
]
];
}
+    /**
+     * Repair text extracted from Word/HTML: decode \uXXXX escapes and HTML entities,
+     * then restore comparison signs (≤ ≥ ≠) from the garbled forms handled below.
+     * The passes repeat until the text stops changing or $maxDepth passes have run.
+     *
+     * Each pass runs, in order:
+     *  1. remaining \uXXXX / \UXXXX escapes;
+     *  2. HTML entities (custom map, then html_entity_decode());
+     *  3. garbled Word byte sequences, malformed numeric references, GBK byte pairs;
+     *  4. duplicated signs and whitespace between a sign and its number;
+     *  5. "?"-for-sign rules, most specific first.
+     *
+     * @param string|null $str      Text to decode.
+     * @param int         $maxDepth Maximum number of decoding passes.
+     * @return string Decoded text. Blank input or $maxDepth <= 0 yields the trimmed input;
+     *                a step that throws yields the trimmed, otherwise unprocessed text.
+     */
+    private function fullDecode($str = '', int $maxDepth = 2){
+        try {
+            if ($str === null || trim((string)$str) === '' || $maxDepth <= 0) {
+                return $str === null ? '' : trim((string)$str);
+            }
+
+            // First pass over \uXXXX escapes; decodeUnicode() is a method of this class.
+            $str = (string)$this->decodeUnicode((string)$str);
+
+            // Patterns for the garbled forms. Those without /u are matched byte-wise.
+            $regexps = [
+                // "?30" and "?.18" are rewritten to "≥30" and "≥0.18".
+                'qMarkNum' => '/\\?(\d+)/',
+                'qMarkDotNum' => '/\\?(\.\d+)/',
+                // A sign separated from its number by whitespace.
+                'neNum' => '/≠\s*(\d+)/u',
+                'leNum' => '/≤\s*(\d+)/u',
+                // "?、?、?5" (or with commas) is rewritten to "≤≥≠5".
+                'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
+                // "LE ?5" / "NE ?5" are rewritten to "≤5" / "≠5".
+                'leNeMark' => '/(LE|NE)\s*\\?(\d+)/i',
+                // Escapes left over after decodeUnicode(), including the upper-case \U form.
+                'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/',
+                // Raw bytes E2 89 86 / 87 / 80, byte E2 followed by the text "0x89 0x86"
+                // (likewise 87 / 80), or the plain text "e28986" / "e28987" / "e28980".
+                'wordBin' => '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i',
+                // Numeric reference with stray spaces or no ';'. The "&#" prefix is
+                // required; without it any number containing 2264/2265/2260 matched.
+                'wordEntity' => '/&#\s*x?\s*(2264|2265|2260)\s*;?/i',
+                // The same sign repeated two or more times.
+                'repeatSymbol' => '/(≤|≥|≠)\1+/u',
+            ];
+
+            $maps = [
+                // Applied before html_entity_decode(). Keys with ';' sit next to the
+                // semicolon-less ones so strtr()'s longest-match rule never leaves a
+                // stray ';' behind. U+08D8/U+08D9/U+08D4 are what 2264/2265/2260 become
+                // when those hex code points are written as decimal references. Bare
+                // '<' and '>' are intentionally not mapped: that would turn
+                // "p < 0.05" into "p ≤ 0.05".
+                'htmlEntity' => [
+                    '&le;' => '≤', '&ge;' => '≥', '&ne;' => '≠',
+                    '&le' => '≤', '&ge' => '≥', '&ne' => '≠',
+                    "\u{08D8}" => '≤', "\u{08D9}" => '≥', "\u{08D4}" => '≠',
+                ],
+                // Replacements applied (case-insensitively) once 'wordBin' has matched.
+                'wordBin' => [
+                    "\xE2\x89\x86" => '≤', "\xE2\x89\x87" => '≥', "\xE2\x89\x80" => '≠',
+                    'e28986' => '≤', '\xe2\x89\x86' => '≤', '\xe20x890x86' => '≤',
+                    'e28987' => '≥', '\xe2\x89\x87' => '≥', '\xe20x890x87' => '≥',
+                    'e28980' => '≠', '\xe2\x89\x80' => '≠', '\xe20x890x80' => '≠',
+                ],
+                'wordEntity' => ['2264' => '≤', '2265' => '≥', '2260' => '≠'],
+                // NOTE(review): byte pairs carried over unchanged; confirm them against
+                // the GBK code chart.
+                'gbkSymbol' => ["\xA1\xF2" => '≤', "\xA1\xF3" => '≥', "\xA1\xF0" => '≠'],
+            ];
+
+            $unicodeCallback = static function ($m) {
+                $code = hexdec($m[1]);
+                // Control characters stay escaped. mb_chr() returns false for a surrogate
+                // half; the escape is kept then instead of being replaced by nothing.
+                $char = $code >= 0x20 ? mb_chr($code, 'UTF-8') : false;
+                return $char === false ? $m[0] : $char;
+            };
+
+            // Removes whitespace inside one matched Word byte sequence.
+            $squeezeMatch = static function ($m) {
+                return preg_replace('/\s+/', '', $m[0]);
+            };
+            $entityCallback = static function ($m) use ($maps) {
+                return $maps['wordEntity'][$m[1]] ?? $m[0];
+            };
+            $leNeCallback = static function ($m) {
+                return (strtoupper($m[1]) === 'LE' ? '≤' : '≠') . $m[2];
+            };
+
+            $depth = 0;
+            $currentStr = $str;
+            do {
+                $depth++;
+                $prevStr = $currentStr;
+
+                // 1. Escapes that are still present.
+                $currentStr = preg_replace_callback($regexps['unicode'], $unicodeCallback, $currentStr)
+                    ?? $currentStr;
+
+                // 2. HTML entities: the custom map first, then the generic decoder.
+                $currentStr = strtr($currentStr, $maps['htmlEntity']);
+                $currentStr = html_entity_decode(
+                    $currentStr,
+                    ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
+                    'UTF-8'
+                );
+
+                // 3. Word byte sequences. Whitespace is squeezed out of each match only;
+                //    removing every space from the whole text would glue all of its
+                //    words together.
+                if (preg_match($regexps['wordBin'], $currentStr)) {
+                    $currentStr = preg_replace_callback(
+                        $regexps['wordBin'],
+                        $squeezeMatch,
+                        $currentStr
+                    ) ?? $currentStr;
+                    $currentStr = str_ireplace(array_keys($maps['wordBin']), $maps['wordBin'], $currentStr);
+                }
+                $currentStr = preg_replace_callback(
+                    $regexps['wordEntity'],
+                    $entityCallback,
+                    $currentStr
+                ) ?? $currentStr;
+                $currentStr = strtr($currentStr, $maps['gbkSymbol']);
+
+                // 4. Sign clean-up. "?? $currentStr" keeps the text when a /u pattern
+                //    fails on malformed UTF-8 (preg_replace() returns null then).
+                $currentStr = preg_replace($regexps['repeatSymbol'], '$1', $currentStr) ?? $currentStr;
+                $currentStr = preg_replace($regexps['neNum'], '≠$1', $currentStr) ?? $currentStr;
+                $currentStr = preg_replace($regexps['leNum'], '≤$1', $currentStr) ?? $currentStr;
+
+                // 5. "?" rules, most specific first. The generic "?N" rule must run last:
+                //    it rewrites the "?" that the two specific rules need to see, so in
+                //    any other order those rules can never match.
+                $currentStr = preg_replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr) ?? $currentStr;
+                $currentStr = preg_replace_callback($regexps['leNeMark'], $leNeCallback, $currentStr) ?? $currentStr;
+                $currentStr = preg_replace($regexps['qMarkNum'], '≥$1', $currentStr) ?? $currentStr;
+                $currentStr = preg_replace($regexps['qMarkDotNum'], '≥0$1', $currentStr) ?? $currentStr;
+            } while ($depth < $maxDepth && $currentStr !== $prevStr);
+
+            // Final clean-up: drop surrounding colons, then map whatever the last
+            // html_entity_decode() call exposed.
+            return strtr(trim($currentStr, ':'), $maps['htmlEntity']);
+        } catch (\Throwable $e) {
+            // Best effort by design: a decoding problem must not abort the parse.
+            return trim((string)$str);
+        }
+    }
+
+ // private function fullDecode($str, $maxDepth = 5) {
+ // // 空值/深度为0,直接返回(提前终止,避免无效操作)
+ // if (empty($str) || $maxDepth <= 0) {
+ // return $str;
+ // }
+
+ // // 【性能优化1:预编译所有正则表达式】避免每次循环重新解析正则
+ // // 预编译:≥专属场景正则
+ // $regOb0 = '/0B\s*\?0/';
+ // $regDl18 = '/DL\s*\?.18/';
+ // // 预编译:≥通用场景正则
+ // $regQMarkNum = '/\?(\d+)/';
+ // $regQMarkDotNum = '/\?(\.\d+)/';
+ // // 预编译:≤、≠空格修复正则
+ // $regNeNum = '/≠\s*(\d+)/';
+ // $regLeNum = '/≤\s*(\d+)/';
+ // // 预编译:混合符号乱码正则(中文顿号/英文逗号)
+ // $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/';
+ // $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/';
+ // // 预编译:≤、≠专属标识正则
+ // $regLeMark = '/LE\s*\?(\d+)/';
+ // $regNeMark = '/NE\s*\?(\d+)/';
+ // // 预编译:Unicode转义正则(提取到外部,避免闭包重复创建)
+ // $regUnicode = '/\\\\u([0-9a-fA-F]{4})/';
+
+ // // 【性能优化2:预定义常量/映射】避免循环内重复创建数组/字符串
+ // // HTML实体映射(一次性定义,避免循环内重复赋值)
+ // $htmlEntityMap = [
+ // '≤' => '≤', '≤' => '≤', '≤' => '≤',
+ // '≥' => '≥', '≥' => '≥', '≥' => '≥',
+ // '≠' => '≠', '≠' => '≠', '≠' => '≠',
+ // ];
+ // // 不间断空格替换数组
+ // $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)];
+ // // Unicode回调函数(预定义,避免循环内重复创建闭包)
+ // $unicodeCallback = function ($m) {
+ // return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
+ // };
+
+ // $original = $str;
+ // $depth = 0;
+ // $hasChange = false; // 标记是否有变化,提前终止循环
+
+ // // 循环解码:仅在有变化且未达最大深度时执行
+ // do {
+ // $depth++;
+ // $hasChange = false;
+ // $prevStr = $str; // 保存当前状态,用于判断变化
+
+ // // 1. 解码Unicode转义(\uXXXX格式)
+ // $str = $this->decodeUnicode($str);
+
+ // // 2. 解码HTML实体(先替换专属实体,再执行通用解码)
+ // $str = strtr($str, $htmlEntityMap); // 高性能替换(strtr比str_replace快)
+ // $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
+
+ // // 3. 再次处理遗漏的Unicode转义(使用预编译正则+预定义回调)
+ // $str = preg_replace_callback($regUnicode, $unicodeCallback, $str);
+
+ // // 4. 替换不间断空格为普通空格(strtr比str_replace更高效)
+ // $str = str_replace($nbspReplace, ' ', $str);
+
+ // // 5. 核心替换逻辑(优化执行顺序,避免覆盖)
+ // // 5.1 原有≥专属场景(保留)
+ // $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1);
+ // $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2);
+ // // 5.2 ≤、≠空格修复(保留)
+ // $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3);
+ // $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4);
+ // // 5.3 原有≥通用场景(保留)
+ // $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5);
+ // $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6);
+ // // 5.4 混合符号乱码还原(保留)
+ // $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7);
+ // $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8);
+ // // 5.5 ≤、≠专属标识还原(保留)
+ // $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9);
+ // $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10);
+
+ // // 5.6 修复前缀"d with "乱码(保留)
+ // $str = str_replace('d with ', 'd with ', $str, $count11);
+
+ // // 【性能优化3:统计所有替换次数,判断是否有变化】
+ // $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 +
+ // $count7 + $count8 + $count9 + $count10 + $count11;
+ // if ($totalCount > 0 || $str !== $prevStr) {
+ // $hasChange = true;
+ // $original = $str;
+ // }
+
+ // // 【性能优化4:提前终止】单次循环无变化,直接退出
+ // if (!$hasChange) {
+ // break;
+ // }
+
+ // } while ($depth < $maxDepth); // 改用do-while,减少循环判断次数
+
+ // // 最终清理:仅执行一次trim
+ // return trim($str, ':');
+ // }
+ // private function fullDecode($str, $maxDepth = 5) {
+ // if (empty($str) || $maxDepth <= 0) {
+ // return $str;
+ // }
+
+ // $original = $str;
+ // $depth = 0;
+
+ // // 循环解码,直到无变化或达到最大次数
+ // while (true) {
+ // $depth++;
+ // if ($depth > $maxDepth) {
+ // break; // 防止过度解码导致死循环
+ // }
+
+ // // 1. 解码 Unicode 转义(\uXXXX 格式)
+ // $str = $this->decodeUnicode($str);
+
+ // // 2. 解码 HTML 实体(&、'、< 等)
+ // $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
+
+ // $str = preg_replace_callback('/\\\\u([0-9a-fA-F]{4})/', function ($m) {
+ // return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
+ // }, $str);
+ // $str = str_replace([chr(0xC2).chr(0xA0), chr(0xA0)], ' ', $str);
+
+ // // 2. 核心:强制匹配所有可能的乱码格式,还原≥
+ // // 匹配:0B?0、0B ?0、0B ?0(空格/制表符)→ 0B≥30
+ // $str = preg_replace('/0B\s*\?0/', '0B≥30', $str);
+ // // 匹配:DL?.18、DL ?.18、DL ?.18 → DL≥0.18
+ // $str = preg_replace('/DL\s*\?.18/', 'DL≥0.18', $str);
+ // // 通用匹配:数字前的?(如?30、?0.18)→ ≥30、≥0.18(防止其他变体)
+ // $str = preg_replace('/\?(\d+)/', '≥$1', $str);
+ // $str = preg_replace('/\?(\.\d+)/', '≥0$1', $str);
+
+ // // 3. 修复前缀的"d with "可能的乱码(若有)
+ // $str = str_replace('d with ', 'd with ', $str); // 若前缀也乱码,可同步替换
+
+ // // 若解码后无变化,退出循环
+ // if ($str === $original) {
+ // break;
+ // }
+
+ // $original = $str;
+ // }
+
+ // return trim($str,':');
+ // }
+    // Decode JSON-style \uXXXX escapes to UTF-8. A run of adjacent escapes is decoded in
+    // one go as UTF-16BE, so a surrogate pair (e.g. \uD83D\uDE00) forms one character
+    // instead of being converted one half at a time. Returns the input on a PCRE error.
+    private function decodeUnicode($str) {
+        return preg_replace_callback('/(?:\\\\u[0-9a-fA-F]{4})+/', static function ($matches) {
+            // Drop every "\u" prefix, leaving the hex code units back to back.
+            $hex = str_replace('\\u', '', $matches[0]);
+            return mb_convert_encoding(pack('H*', $hex), 'UTF-8', 'UTF-16BE');
+        }, $str) ?? $str;
+    }
private function getMatchedFundPhrases($content = '') {
if (empty($content)) {
return [];
@@ -1223,7 +1471,7 @@ class ArticleParserService
// 基金支持词组列表
$fundPhrases = [
'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
- 'Funding was provided by', 'Funded in part by'
+ 'Funding was provided by', 'Funded in part by','FUNDING:'
];
// 1. 转义词组中的特殊字符,使用 # 作为分隔符