Files
tougao/application/common/ArticleParserService.php
2025-12-02 15:20:51 +08:00

1512 lines
64 KiB
PHP
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use PhpOffice\PhpWord\IOFactory;
use think\Exception;
use ZipArchive;
use RecursiveIteratorIterator;
use RecursiveDirectoryIterator;
use PhpOffice\PhpWord\Settings;
use PhpOffice\PhpWord\Element\TextRun;
use DOMDocument;
use DOMXPath;
// use BadMethodCallException;
class ArticleParserService
{
private $phpWord;
private $sections;
private $iNum = 0;
/**
 * Load the Word document at $filePath and cache its sections.
 *
 * When the file does not exist the parser is left empty (no sections); the
 * read-only helpers that guard against empty sections then yield empty results.
 * If the first load attempt throws, the document is re-packed without EMF
 * images (see removeEmfFromDocx) and loaded again from that temporary copy.
 *
 * @param string $filePath Path of the .docx file to parse.
 * @throws \Exception When the EMF-stripped fallback cannot be built or loaded.
 */
public function __construct($filePath = '')
{
    // A constructor's return value is discarded by `new`, so there is nothing
    // useful to return here; callers are expected to validate the path first.
    if (!file_exists($filePath)) {
        return;
    }

    // One loader for both attempts: keep the full section structure
    // (read-data-only off) and enable output escaping, as configured originally.
    $load = function ($path) {
        $reader = IOFactory::createReader();
        $reader->setReadDataOnly(false);
        Settings::setCompatibility(false);
        Settings::setOutputEscapingEnabled(true);
        return $reader->load($path);
    };

    try {
        // Load once (the previous code parsed the same file twice).
        $this->phpWord = $load($filePath);
        $this->sections = $this->phpWord->getSections();
    } catch (\Exception $e) {
        // Fallback: strip EMF images from the archive and retry on the copy.
        $processedFilePath = $this->removeEmfFromDocx($filePath);
        try {
            $this->phpWord = $load($processedFilePath);
            $this->sections = $this->phpWord->getSections();
        } finally {
            // Always remove the temporary copy, even when the retry fails.
            if (is_file($processedFilePath)) {
                unlink($processedFilePath);
            }
        }
    }
}
/**
 * Build a copy of a DOCX archive with every EMF image file removed.
 *
 * The archive is extracted into a uniquely named "docx_temp_*" directory under
 * ROOT_PATH/runtime, all *.emf files are deleted, and the remaining files are
 * zipped again next to that directory. The extraction directory is removed on
 * both success and failure.
 *
 * @param string $docxPath Path of the original DOCX file.
 * @return string Path of the processed temporary DOCX; the caller deletes it.
 * @throws \Exception When the archive cannot be opened, extracted or rebuilt.
 */
private function removeEmfFromDocx($docxPath){
    $zip = new ZipArchive();
    if ($zip->open($docxPath) !== true) {
        throw new \Exception("无法打开 DOCX 文件:{$docxPath}");
    }

    // 1. Create the extraction directory.
    $tempDir = rtrim(ROOT_PATH,'/').'/runtime/'.uniqid('docx_temp_');
    if (!is_dir($tempDir) && !mkdir($tempDir, 0700, true)) {
        $zip->close();
        throw new \Exception("Unable to create temporary directory: {$tempDir}");
    }

    try {
        // 2. Extract the archive.
        $extracted = $zip->extractTo($tempDir);
        $zip->close();
        if (!$extracted) {
            throw new \Exception("Unable to extract DOCX file: {$docxPath}");
        }

        // 3. Delete every EMF file, at any depth.
        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($tempDir, RecursiveDirectoryIterator::SKIP_DOTS)
        );
        foreach ($iterator as $file) {
            if ($file->isFile() && strtolower($file->getExtension()) === 'emf') {
                unlink($file->getPathname());
            }
        }

        // 4. Re-pack the remaining files as a DOCX.
        $processedPath = $tempDir . '_processed.docx';
        $newZip = new ZipArchive();
        if ($newZip->open($processedPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
            throw new \Exception("无法创建处理后的 DOCX 文件");
        }
        $this->addFilesToZip($tempDir, $newZip);
        if (!$newZip->close()) {
            throw new \Exception("无法创建处理后的 DOCX 文件");
        }
    } finally {
        // 5. Never leave the extraction directory behind.
        $this->deleteDir($tempDir);
    }

    return $processedPath;
}
/**
 * Recursively add every file below $dir to an open ZipArchive.
 *
 * Entry names are always relative to the directory the top-level call was
 * made with, so a file at "<root>/word/media/a.png" is stored as
 * "word/media/a.png". (Previously the name was derived from the immediate
 * parent directory only, which produced a wrong archive layout.)
 *
 * @param string      $dir     Directory to add.
 * @param ZipArchive  $zip     Archive opened for writing.
 * @param string|null $baseDir Root used to compute entry names; defaults to
 *                             $dir and is only passed by the recursive calls.
 * @throws \Exception When a directory cannot be listed.
 */
private function addFilesToZip($dir, $zip, $baseDir = null)
{
    $dir = rtrim($dir, '/');
    $baseDir = ($baseDir === null) ? $dir : rtrim($baseDir, '/');
    // Length of "<baseDir>/" — everything after it is the entry name.
    $prefixLength = strlen($baseDir) + 1;

    $files = scandir($dir);
    if ($files === false) {
        throw new \Exception("Unable to list directory: {$dir}");
    }

    foreach ($files as $file) {
        if ($file === '.' || $file === '..') {
            continue;
        }
        $filePath = $dir . '/' . $file;
        if (is_dir($filePath)) {
            $this->addFilesToZip($filePath, $zip, $baseDir);
        } else {
            $zip->addFile($filePath, substr($filePath, $prefixLength));
        }
    }
}
/**
 * Recursively delete a temporary extraction directory.
 *
 * Only directories whose name starts with "docx_temp_" and whose real path
 * lies inside ROOT_PATH/runtime are accepted. These checks apply to the
 * requested directory only; its sub-directories (e.g. "word", "_rels") are
 * then removed unconditionally, which the previous implementation failed to
 * do because it re-applied the name check while recursing.
 *
 * @param string $dir Directory to delete.
 * @return bool True when the directory and everything in it was removed.
 */
private function deleteDir($dir){
    // 1. Must be a non-empty string naming an existing directory.
    if (!is_string($dir) || trim($dir) === '' || !is_dir($dir)) {
        return false;
    }
    // 2. Normalise: drop trailing separators before taking the basename.
    $dir = rtrim($dir, DIRECTORY_SEPARATOR);
    // 3. Only "docx_temp_*" directories may be removed.
    if (strpos(basename($dir), 'docx_temp_') !== 0) {
        return false;
    }
    // 4. The real path must be strictly inside the runtime directory. The
    //    separator is appended so that e.g. "runtime_other" does not pass.
    $realDir = realpath($dir);
    $realRuntimeDir = realpath(rtrim(ROOT_PATH, '/') . '/runtime');
    if ($realDir === false || $realRuntimeDir === false
        || strpos($realDir, $realRuntimeDir . DIRECTORY_SEPARATOR) !== 0) {
        return false;
    }
    // 5. Remove the validated tree.
    return $this->removeTempTree($realDir);
}

/**
 * Delete $path and everything below it (helper for deleteDir).
 *
 * Symbolic links are unlinked, never followed, so nothing outside the tree
 * is touched. Errors are suppressed and reported through the return value.
 *
 * @param string $path Directory already validated by deleteDir.
 * @return bool True when $path no longer exists afterwards.
 */
private function removeTempTree($path)
{
    $entries = @scandir($path);
    if ($entries === false) {
        return false;
    }
    $allRemoved = true;
    foreach ($entries as $entry) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }
        $child = $path . DIRECTORY_SEPARATOR . $entry;
        if (is_dir($child) && !is_link($child)) {
            if (!$this->removeTempTree($child)) {
                $allRemoved = false;
            }
        } elseif (!@unlink($child)) {
            $allRemoved = false;
        }
    }
    // rmdir() only succeeds on an empty directory, so a leftover child
    // automatically turns into a false result here.
    return @rmdir($path) && $allRemoved;
}
/**
 * Entry point: validate the uploaded file, parse it and return a JSON string.
 *
 * Status codes in the returned JSON: 1 success, 2 no file given, 3 file does
 * not exist, 4 file is not readable, 5 no title could be extracted. On
 * success "data" holds title, author, report, company, corresponding and the
 * fields produced by extractFromWord().
 *
 * @param string $sFileUrl Path of the uploaded document.
 * @return string JSON-encoded result.
 */
public static function uploadAndParse($sFileUrl){
    // Required-value check.
    if(empty($sFileUrl)){
        return json_encode(['status' => 2,'msg' => 'Please upload the submission file']);
    }
    // The file must exist and be readable.
    if (!file_exists($sFileUrl)) {
        return json_encode(['status' => 3, 'msg' => 'The uploaded file does not exist']);
    }
    if (!is_readable($sFileUrl)) {
        return json_encode(['status' => 4, 'msg' => 'The uploaded file is unreadable']);
    }
    // Parse the document.
    $oDealFile = new self($sFileUrl);
    // Title.
    $sTitle = $oDealFile->getTitle();
    if(empty($sTitle)){
        return json_encode(['status' => 5, 'msg' => 'Article title retrieval failed']);
    }
    // Authors (all authors, and the names flagged as corresponding authors).
    $aParam = ['title' => $sTitle];
    $aAuthor = $oDealFile->getAuthors($aParam);
    $aParam['author'] = empty($aAuthor['author']) ? [] : $aAuthor['author'];
    $aParam['report'] = empty($aAuthor['report']) ? [] : $aAuthor['report'];
    // Institutions.
    $aParam['company'] = $oDealFile->getCompany($aParam);
    // Corresponding-author contact details.
    $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam);
    // Abstract, keywords and funding text.
    $aContent = $oDealFile->extractFromWord();
    // Normalise the title encoding and decode it. The converted value is the
    // one that is returned (previously the conversion result was discarded).
    if(!mb_check_encoding($sTitle, 'UTF-8')){
        $sTitle = mb_convert_encoding($sTitle, 'UTF-8', 'GBK');
    }
    $aParam['title'] = $oDealFile->fullDecode($sTitle);
    // Array union: keys already present in $aParam are not overwritten.
    $aParam += empty($aContent['data']) ? [] : $aContent['data'];
    return json_encode(['status' => 1,'msg' => 'success','data' => $aParam]);
}
/**
 * Extract the article title.
 *
 * Walks the document in order and returns the trimmed text of the first
 * element whose trimmed text is longer than three characters.
 *
 * @return string The title, or '' when there are no sections or no match.
 */
private function getTitle(){
    $heading = '';
    if (!empty($this->sections)) {
        foreach ($this->sections as $sec) {
            foreach ($sec->getElements() as $node) {
                $candidate = trim($this->getTextFromElement($node));
                if (mb_strlen($candidate) > 3) {
                    // First sufficiently long paragraph wins.
                    $heading = $candidate;
                    break 2;
                }
            }
        }
    }
    return $heading;
}
/**
 * Parse a raw author line into structured author records using character
 * scans instead of regular expressions.
 *
 * The string is normalised in several passes (encoding, special characters,
 * " and " to ", ", spaces inside superscript markers, repeated spaces) and
 * then scanned character by character. Name characters accumulate into the
 * current name; digits and the markers # * † , ‡ § accumulate into the
 * current superscript; ", " closes an author.
 *
 * @param string $str Raw author line.
 * @return array List of arrays with keys: name, superscript, company_id
 *               (unique integers found in the superscript), is_super (1 when
 *               the superscript contains '#'), is_report (1 when it contains
 *               '*' or '†').
 */
private function parseAuthorsWithoutRegex($str = '') {
if (empty($str)) {
return [];
}
if(!mb_check_encoding($str, 'UTF-8')){
$str = mb_convert_encoding($str, 'UTF-8', 'GBK');
}
$str = $this->fullDecode($str);
// NOTE(review): several search strings below render as empty or garbled in
// this view; they presumably hold invisible/special characters that map to
// a space or to the digits 1-9,0 — verify against the raw file bytes.
$str = str_replace(["\xC2\xA0", 'ï¼', '�', '', '', '', '', '', '', '', '', '', '', ''],
[' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], $str);
$str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str));
// Pass 1: drop the space after a comma that sits between two digits
// inside a superscript (e.g. "2, 3" becomes "2,3").
$len = mb_strlen($str);
$processed = '';
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($str, $i, 1);
if ($char === ',' && $i - 1 >= 0 && $i + 2 < $len) {
$prevChar = mb_substr($str, $i - 1, 1);
$next1 = mb_substr($str, $i + 1, 1);
$next2 = mb_substr($str, $i + 2, 1);
// digit, comma, space, digit: keep the comma and skip the space
if ((ctype_digit($prevChar) || is_numeric($prevChar)) && $next1 === ' ' && (ctype_digit($next2) || is_numeric($next2))) {
$processed .= $char;
$i += 1;
continue;
}
}
$processed .= $char;
}
$str = $processed;
// Pass 2: drop the space between a digit and a following marker
// (e.g. "1 *" becomes "1*").
$len = mb_strlen($str);
$processed = '';
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($str, $i, 1);
if ((ctype_digit($char) || is_numeric($char)) && $i + 2 < $len) { // current character is a digit
$next1 = mb_substr($str, $i + 1, 1);
$next2 = mb_substr($str, $i + 2, 1);
if ($next1 === ' ' && in_array($next2, ['#', '*', '†', '‡', '§'])) { // supported marker symbols
$processed .= $char;
$i += 2;
$processed .= $next2;
continue;
}
}
$processed .= $char;
}
$str = $processed;
// Pass 3: collapse runs of spaces into a single space.
$len = mb_strlen($str);
$processed = '';
$prevSpace = false;
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($str, $i, 1);
if ($char === ' ') {
if (!$prevSpace) {
$processed .= $char;
$prevSpace = true;
}
} else {
$processed .= $char;
$prevSpace = false;
}
}
$str = trim($processed);
// Pass 4: split into authors.
$authors = [];
$currentName = '';
$currentSuperscript = '';
$inName = true;
$len = mb_strlen($str);
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($str, $i, 1);
// Author separator: a comma followed by a space.
if ($char === ',' && $i + 1 < $len) {
$nextChar = mb_substr($str, $i + 1, 1);
if ($nextChar === ' ') {
if (!empty($currentName)) {
$currentSuperscript = rtrim($currentSuperscript, ',');
$authors[] = [
'name' => trim($currentName),
'superscript' => trim($currentSuperscript)
];
}
$currentName = '';
$currentSuperscript = '';
$inName = true;
$i++;
continue;
}
}
// Name characters: letters accepted by ctype_alpha, space, dot, hyphen
// and the accented letters listed explicitly.
if (ctype_alpha($char) || in_array($char, [' ', '.', '-', 'à', 'á', 'â', 'ã', 'ä', 'ç', 'è', 'é', 'ê', 'ë'])) {
if ($inName) {
$currentName .= $char;
} else {
// A name character after a superscript starts the next author.
$currentSuperscript = rtrim($currentSuperscript, ',');
$authors[] = [
'name' => trim($currentName),
'superscript' => trim($currentSuperscript)
];
$currentName = $char;
$currentSuperscript = '';
$inName = true;
}
}
// Superscript characters: digits, comma and the markers # * † ‡ §.
elseif ((ctype_digit($char) || is_numeric($char)) || in_array($char, ['#', '*', '†', ',', '‡', '§'])) {
$inName = false;
$currentSuperscript .= $char;
}
// Any other character is ignored.
else {
continue;
}
}
// Flush the last author.
if (!empty($currentName)) {
$currentSuperscript = rtrim($currentSuperscript, ',');
$authors[] = [
'name' => trim($currentName),
'superscript' => trim($currentSuperscript)
];
}
// Derive institution numbers and the first-author / corresponding-author flags.
foreach ($authors as $index => &$author) {
// Collect each run of digits in the superscript as one integer.
$institutionIds = [];
$superscript = $author['superscript'];
$numStr = '';
for ($i = 0; $i < mb_strlen($superscript); $i++) {
$c = mb_substr($superscript, $i, 1);
if (ctype_digit($c) || is_numeric($c)) { // digit
$numStr .= $c;
} else {
if (!empty($numStr)) {
$institutionIds[] = (int)$numStr;
$numStr = '';
}
}
}
if (!empty($numStr)) {
$institutionIds[] = (int)$numStr;
}
$institutionIds = array_values(array_unique($institutionIds));
$author['company_id'] = $institutionIds;
// '#' sets is_super; '*' or '†' sets is_report.
$author['is_super'] = strpos($superscript, '#') !== false ? 1 : 0;
$author['is_report'] = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
}
unset($author); // break the reference left by the foreach
return $authors;
}
/**
 * Extract the author list from the paragraph that follows the title.
 *
 * The paragraph is normalised (encoding, invisible characters, symbol
 * repairs, separators, " and ") and handed to parseAuthorsWithoutRegex().
 *
 * @param array $aParam Optional 'title'; when empty, getTitle() is used.
 * @return array ['author' => list of author records,
 *                'report' => unique names whose is_report flag is 1].
 */
private function getAuthors($aParam = []) {
$title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
$sAuthorContent = $this->getNextParagraphAfterText($title);
if (empty($sAuthorContent)) {
return ['author' => [], 'report' => []];
}
if(!mb_check_encoding($sAuthorContent, 'UTF-8')){
$sAuthorContent = mb_convert_encoding($sAuthorContent, 'UTF-8', 'GBK');
}
$sAuthorContent = $this->fullDecode($sAuthorContent);
// Remove control characters and U+200B..U+200F.
$sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
// Repair mis-encoded symbols.
// NOTE(review): some keys below look identical to their replacement or
// garbled in this view; they presumably hold mojibake / full-width
// variants — verify against the raw file bytes.
$symbolMap = [
'†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†',
':' => ':', ',' => ',', '—' => '-',
'啊' => '' // removes this specific stray character (treated as fixed mojibake)
];
$sAuthorContent = strtr($sAuthorContent, $symbolMap);
// Normalise the format.
// NOTE(review): two search strings render as empty here — verify the raw bytes.
$sAuthorContent = str_replace(['', ';', '', '、'], ',', $sAuthorContent); // unify separators to ','
$sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // " and " becomes ", "
$sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // collapse whitespace
$sAuthorContent = trim($sAuthorContent);
$aAuthor = $this->parseAuthorsWithoutRegex($sAuthorContent);
if(empty($aAuthor)){
return ['author' => [],'report' => []];
}
$aReport = $aAuthorData = [];
foreach ($aAuthor as $key => $value) {
// Skip records that have neither a name nor a superscript.
if(empty($value['name']) && empty($value['superscript'])){
continue;
}
// Named authors flagged is_report are collected as corresponding authors.
if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){
$aReport[] = $value['name'];
}
$aAuthorData[] = $value;
}
return ['author' => $aAuthorData,'report' => array_unique($aReport)];
}
/**
 * Extract the institution list that follows the author paragraph.
 *
 * Every line after the author paragraph that starts with a letter or digit
 * (optionally followed by more digits) is taken as an entry keyed by that
 * leading marker; a later line with the same marker replaces the earlier
 * one. Each entry is then cleaned up (whitespace, trailing dots, leading
 * number, anything from a * # † marker or "Email" onwards).
 *
 * @param array $aParam Optional 'title' and 'authors' (author paragraph text).
 * @return array Map of marker => institution text ('' when nothing is left).
 */
private function getCompany($aParam = []){
    // Locate the author paragraph, then collect the lines that follow it.
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];
    $allLines = $this->getContentAfterText($sAuthorContent,1);
    if(empty($allLines)){
        return [];
    }

    // Step 1: pick the lines that begin with a marker and key them by it.
    $entriesByMarker = [];
    foreach ($allLines as $rawLine) {
        $rawLine = trim($rawLine);
        if (empty($rawLine)) {
            continue;
        }
        if(!mb_check_encoding($rawLine, 'UTF-8')){
            $rawLine = mb_convert_encoding($rawLine, 'UTF-8', 'GBK');
        }
        $rawLine = $this->fullDecode($rawLine);
        $byteCount = strlen($rawLine);

        // Marker = one leading letter or digit plus any digits directly after it.
        $marker = '';
        $cursor = 0;
        if ($byteCount > 0 && (ctype_digit($rawLine[0]) || ctype_alpha($rawLine[0]))) {
            $marker = $rawLine[0];
            for ($cursor = 1; $cursor < $byteCount && ctype_digit($rawLine[$cursor]); $cursor++) {
                $marker .= $rawLine[$cursor];
            }
        }
        if (empty($marker)) {
            // Lines without a marker are not merged into the previous entry.
            continue;
        }

        // Skip any '.', '*' or space between the marker and the content.
        $cursor += strspn($rawLine, '.* ', $cursor);
        $entriesByMarker[$marker] = trim(substr($rawLine, $cursor));
    }

    // Step 2: clean each entry.
    $aCompany = [];
    foreach ($entriesByMarker as $marker => $name) {
        $name = $this->fullDecode($name);
        $name = rtrim(preg_replace('/\s+/', ' ', $name), '.');   // collapse whitespace, drop trailing dots
        $name = trim(preg_replace('/^\d+\s+/', '', $name));      // drop a leading number
        // Cut at the first * # † marker or at "Email" (case-insensitive).
        if (preg_match('/^(.*?)(?=\s*[\*#†]|(?i)\s*Email)/', $name, $matches)) {
            $name = trim($matches[1]);
        }
        if (!empty($name) && !mb_check_encoding($name, 'UTF-8')) {
            $name = mb_convert_encoding($name, 'UTF-8', 'GBK');
        }
        $aCompany[$marker] = empty($name) ? '' : trim(trim($name),'.');
    }
    return $aCompany;
}
/**
 * Extract contact details (e-mail, postal address, telephone) for the
 * corresponding authors.
 *
 * The text after the institution paragraph is split on '*'; each block that
 * mentions one of the known corresponding-author names yields one record.
 * When no block matches, a "Corresponding Authors / Correspondence to /
 * Correspondence:" section is searched for "name, e-mail" pairs instead.
 *
 * @param array $aParam 'report' (corresponding-author names) and optional 'title'.
 * @return array List of records: name, email, postal_address, tel from the
 *               first pass; name and email only from the fallback pass.
 */
private function getCorrespondingAuthors($aParam = []){
$aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
if(empty($aCorrespondingAuthor)){
return [];
}
// Title, then the author paragraph after it, then the paragraph after that.
$title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
$sAuthorContent = $this->getNextParagraphAfterText($title);
$sCompany = $this->getNextParagraphAfterText($sAuthorContent); // raw text of the paragraph after the authors
if (empty($sCompany)) {
// Fallback: join the parsed institution names instead.
$aCompany = $this->getCompany($aParam);
$sCompany = implode(' ', array_values($aCompany));
}
// Everything that follows the institution text.
$corrText = $this->getContentAfterText($sCompany);
if(!mb_check_encoding($corrText, 'UTF-8')){
$corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
}
$corrText = $this->fullDecode($corrText);
// // debug
// file_put_contents(ROOT_PATH . 'runtime/corr_text_raw.log', $corrText);
// Clean the text.
// NOTE(review): the two search strings below render as empty here; they
// presumably hold full-width ':' and '@' variants — verify the raw bytes.
$corrText = str_replace(['', ''], [':', '@'], $corrText);
$corrText = preg_replace('/\s+/', ' ', $corrText); // unify whitespace
$corrText = str_replace(' ', ' ', $corrText); // remove extra spaces (search string may differ from what is rendered here)
// Split into blocks on '*'.
$corrBlocks = preg_split('/\s*\*\s*/', $corrText);
$corrBlocks = array_filter(array_map('trim', $corrBlocks));
$aCorresponding = [];
foreach ($corrBlocks as $block) {
// Keep only blocks that mention a known corresponding-author name.
$sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
if (empty($sName)) {
continue;
}
preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
$aCorresponding[] = [
'name' => $sName,
'email' => isset($email[2]) ? trim($email[2]) : '',
'postal_address' => isset($address[2]) ? trim(trim($address[2]),'.') : '',
'tel' => isset($tel[2]) ? trim($tel[2]) : ''
];
}
// Fallback: look for an explicit correspondence section.
if(empty($aCorresponding)){
// $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s';
$pattern = '/(Corresponding Authors|Correspondence to|Correspondence)\s*:\s*([\s\S]+?)(?=\n\s*\n|$|;)/is';
$corrText = trim($corrText,'*');
preg_match($pattern, $corrText, $match);
if (!empty($match[2])) {
$corrContent = $match[2];
// First try "Name, E-mail: address" pairs.
$authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/';
preg_match_all($authorPattern, $corrContent, $authors);
if(!empty($authors[1])){
for ($i = 0; $i < count($authors[1]); $i++) {
$aCorresponding[] = [
'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'),
'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.')
];
}
}
// Otherwise accept "Name, address" pairs without the "E-mail:" label.
if(empty($authors[1])){
$authorPattern = '/([A-Za-z0-9\s]+?),\s*([\w@\.\-]+)(?=\.?)/';
preg_match_all($authorPattern, $corrContent, $authors);
for ($i = 0; $i < count($authors[1]); $i++) {
$aCorresponding[] = [
'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'),
'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.')
];
}
}
}
}
return $aCorresponding;
}
/**
 * Find which known corresponding-author name is mentioned in a text block.
 *
 * Matching is case-insensitive and Unicode-aware (mb_strtolower), so names
 * with accented letters match regardless of case. A name with two or more
 * words is also tried with its words in reverse order ("Given Family" vs
 * "Family Given").
 *
 * @param string $block     Text block to search.
 * @param array  $corrNames Candidate author names.
 * @return string The first matching name exactly as given in $corrNames,
 *                or '' when none matches.
 */
private function matchCorrespondingName($block, $corrNames)
{
    $haystack = mb_strtolower((string)$block, 'UTF-8');
    foreach ($corrNames as $name) {
        $needle = mb_strtolower((string)$name, 'UTF-8');
        if ($needle === '') {
            // An empty needle would match anything (or raise a warning).
            continue;
        }
        if (strpos($haystack, $needle) !== false) {
            return $name;
        }
        $nameParts = explode(' ', $needle);
        if (count($nameParts) >= 2) {
            $reversedName = implode(' ', array_reverse($nameParts));
            if (strpos($haystack, $reversedName) !== false) {
                return $name;
            }
        }
    }
    return '';
}
/**
 * Return the first non-empty paragraph that follows the paragraph
 * containing $targetText (case-insensitive substring match).
 *
 * @param string $targetText Text to look for.
 * @return string Text of the following non-empty element, or '' when the
 *                target is empty, the document has no sections, the target
 *                is not found, or nothing follows it.
 */
private function getNextParagraphAfterText($targetText){
    $targetText = (string)$targetText;
    // An empty needle has no meaningful position (and stripos() treats it
    // differently across PHP versions); an unloaded document has no sections.
    if ($targetText === '' || empty($this->sections)) {
        return '';
    }
    $found = false;
    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $text = $this->getTextFromElement($element);
            if(empty($text)){
                continue;
            }
            if ($found) {
                // First non-empty element after the match.
                return $text;
            }
            if (stripos($text, $targetText) !== false) {
                $found = true;
            }
        }
    }
    return '';
}
/**
 * Collect the non-empty paragraphs that follow the paragraph matching
 * $targetText.
 *
 * The anchor paragraph is found by fuzzy comparison: both texts are
 * lower-cased and reduced to [a-z0-9]; the paragraph matches when
 * similar_text() covers more than half of the reduced target. Collection
 * stops at the first paragraph containing a stop keyword or after 200
 * collected paragraphs.
 *
 * @param string $targetText  Anchor text.
 * @param int    $return_type 1 returns the list of paragraphs; any other
 *                            value returns them joined with "\n".
 * @return array|string
 */
private function getContentAfterText($targetText,$return_type = 2){
    $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract','ABSTRACT'];
    $maxLines = 200;
    $collected = [];
    $anchorSeen = false;
    $position = 0;
    $reducedTarget = null; // computed on first use

    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $position++;
            if (count($collected) >= $maxLines) {
                break 2;
            }
            $text = trim($this->getTextFromElement($element,$position));
            if (empty($text)) {
                continue;
            }

            if (!$anchorSeen) {
                // Compare on letters and digits only.
                if ($reducedTarget === null) {
                    $reducedTarget = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($targetText));
                }
                $reducedText = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($text));
                $targetLength = strlen($reducedTarget);
                if ($targetLength > 0 && similar_text($reducedText, $reducedTarget) / $targetLength > 0.5) {
                    $anchorSeen = true;
                }
                continue;
            }

            // A stop keyword ends the whole scan.
            foreach ($stopKeywords as $kw) {
                if (stripos($text, $kw) !== false) {
                    break 3;
                }
            }
            $collected[] = $text;
        }
    }

    if($return_type == 1){
        return $collected;
    }
    $joined = implode("\n", $collected);
    if(!empty($joined) && !mb_check_encoding($joined, 'UTF-8')){
        $joined = mb_convert_encoding($joined, 'UTF-8', 'GBK');
    }
    return $joined;
}
/**
 * Extract the plain text of a PhpWord element, recursing into containers.
 *
 * Typographic quotes are mapped to ASCII quotes, mailto: targets are
 * emitted as plain e-mail addresses, list items are prefixed with a running
 * number, and the result is passed through cleanText().
 *
 * @param \PhpOffice\PhpWord\Element\AbstractElement $element    Element to read.
 * @param int                                        $lineNumber Accepted for caller compatibility; not used.
 * @return string Cleaned text of the element.
 */
private function getTextFromElement(\PhpOffice\PhpWord\Element\AbstractElement $element, int $lineNumber = 0){
    $text = '';
    // Typographic quote -> ASCII quote. Keys are written as code-point
    // escapes so they survive editors/viewers that hide or strip them.
    static $specialQuotesMap = [
        "\u{2019}" => "'", // right single quotation mark
        "\u{2018}" => "'", // left single quotation mark
        "\u{201C}" => '"', // left double quotation mark
        "\u{201D}" => '"', // right double quotation mark
        "\u{201E}" => '"', // double low-9 quotation mark
        "\u{201F}" => '"', // double high-reversed-9 quotation mark
    ];
    // Headings: getText() may be a string or a TextRun.
    if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
        $titleContent = $element->getText();
        if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
            $titleText = $this->getTextFromElement($titleContent);
        } else {
            $titleText = strtr((string)$titleContent, $specialQuotesMap);
        }
        $text .= $titleText . ' ';
        return $this->cleanText($text);
    }
    // List items: prefix with a running counter kept on the instance.
    if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
        $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
        $this->iNum++;
        $text .= $this->iNum . ' ';
    }
    // PreserveText: read the private 'text' parts via reflection and pull
    // e-mail addresses out of HYPERLINK field instructions.
    if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
        try {
            $reflection = new \ReflectionClass($element);
            if (!$reflection->hasProperty('text')) {
                return $this->cleanText($text);
            }
            $property = $reflection->getProperty('text');
            $property->setAccessible(true);
            $textParts = $property->getValue($element) ?? [];
        } catch (\ReflectionException $e) {
            return $this->cleanText($text);
        }
        foreach ($textParts as $part) {
            $part = (string)$part;
            if (strpos($part, 'HYPERLINK') !== false) {
                $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
                if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
                    $text .= $match[1] . ' ';
                }
            } else {
                $text .= strtr($part, $specialQuotesMap);
            }
        }
        return $this->cleanText($text);
    }
    // Tables: cell texts separated by a space.
    if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
        foreach ($element->getRows() as $row) {
            foreach ($row->getCells() as $cell) {
                $text .= $this->getTextFromElement($cell) . ' ';
            }
        }
        return $this->cleanText($text);
    }
    // Cells: concatenate the children.
    if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }
        return $this->cleanText($text);
    }
    // Any other container: recurse into its child elements.
    if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
        foreach ($element->getElements() as $child) {
            if ($child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
                $text .= $this->getTextFromElement($child);
            }
        }
    }
    // Plain text runs.
    if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
        $text .= strtr((string)$element->getText(), $specialQuotesMap);
    }
    // Links: the mailto address first (if any), then the visible text.
    if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
        $target = (string)$element->getTarget();
        if (strpos($target, 'mailto:') === 0) {
            $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
        }
        $text .= strtr((string)$element->getText(), $specialQuotesMap) . ' ';
    }
    // Fields and notes. NOTE(review): getContent() may not exist on every
    // PhpWord version/element type, so the accessor is probed first and
    // getText() is used as the alternative — confirm against the installed
    // PhpWord version.
    if ($element instanceof \PhpOffice\PhpWord\Element\Field
        || $element instanceof \PhpOffice\PhpWord\Element\Note) {
        $content = '';
        if (method_exists($element, 'getContent')) {
            $content = $element->getContent();
        } elseif (method_exists($element, 'getText')) {
            $content = $element->getText();
        }
        if ($content instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
            $text .= $this->getTextFromElement($content) . ' ';
        } else {
            $text .= (string)$content . ' ';
        }
    }
    return $this->cleanText($text);
}
/**
 * Normalise extracted text: ensure UTF-8, turn control characters and the
 * various space characters into plain spaces, and collapse whitespace runs.
 * The result is not trimmed.
 *
 * @param string $text Text to clean.
 * @return string Cleaned text.
 */
private function cleanText(string $text){
    // Make sure the text is valid UTF-8.
    if (!mb_check_encoding($text, 'UTF-8')) {
        $text = (string)mb_convert_encoding(
            $text,
            'UTF-8',
            'GBK,GB2312,GB18030,Big5,ISO-8859-1,CP1252,UTF-16,UTF-32'
        );
    }
    // Control characters (C0, DEL, C1) become spaces. preg_replace() returns
    // null on failure; keep the text in that case instead of losing it.
    $withoutControls = preg_replace('/[\x00-\x1F\x7F-\x9F]/u', ' ', $text);
    if ($withoutControls !== null) {
        $text = $withoutControls;
    }
    // Unify space-like characters. Written as byte escapes so the literals
    // cannot be silently altered by editors.
    $text = str_replace([
        "\t", "\r", "\n",
        "\xC2\xA0",     // U+00A0 no-break space
        "\xE3\x80\x80", // U+3000 ideographic (full-width) space
        "\xE2\x80\xAF", // U+202F narrow no-break space
    ], ' ', $text);
    // Collapse runs of whitespace into one space.
    $collapsed = preg_replace('/\s+/u', ' ', $text);
    return $collapsed === null ? $text : $collapsed;
}
// private function getTextFromElement($element, $lineNumber = 0){
// // 初始化默认空字符串(保持原有逻辑)
// $text = '';
// // 1. 常量化特殊引号映射(避免重复创建数组,提升性能)
// static $specialQuotesMap = [
// '' => "'", // 右单引号U+2019→ 普通单引号U+0027
// '' => "'", // 左单引号U+2018→ 普通单引号U+0027
// '“' => '"', // 左双引号U+201C→ 普通双引号U+0022
// '”' => '"', // 右双引号U+201D→ 普通双引号U+0022
// '„' => '"', // 下双引号U+201E→ 普通双引号(兼容欧洲排版)
// '‟' => '"', // 右双引号U+201F→ 普通双引号(兼容少见排版)
// ];
// // 2. 提前校验元素合法性(避免后续 instanceof 无效判断,减少报错)
// if (!is_object($element) || !$element instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
// return $text;
// }
// // 支持H1标题格式逻辑不变优化变量命名可读性
// if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
// $titleContent = $element->getText();
// $titleText = '';
// // 关键修复:判断返回类型,递归提取文本(逻辑不变)
// if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
// $titleText = $this->getTextFromElement($titleContent);
// } else {
// $titleText = strtr((string)$titleContent, $specialQuotesMap);
// }
// $text .= $titleText . ' ';
// return $text;
// }
// // 项目编号(逻辑不变,优化空值判断为严格判断)
// if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
// $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
// $this->iNum++;
// $text .= $this->iNum . ' ';
// }
// // 处理PreserveText元素核心逻辑不变增强容错性
// if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
// try {
// $reflection = new \ReflectionClass($element);
// $property = $reflection->getProperty('text');
// $property->setAccessible(true);
// // 空值兜底,避免遍历非数组报错
// $textParts = $property->getValue($element) ?? [];
// } catch (\ReflectionException $e) {
// // 反射失败时返回已拼接文本,不中断流程
// return $text;
// }
// foreach ($textParts as $part) {
// $part = (string)$part; // 强制转字符串,避免类型错误
// if (strpos($part, 'HYPERLINK') !== false) {
// $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
// // 邮箱正则不变,保持原有匹配逻辑
// if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
// $text .= $match[1] . ' ';
// }
// } else {
// $text .= $part;
// }
// }
// return $text;
// }
// // 处理表格和单元格(逻辑不变,优化循环变量命名)
// if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
// foreach ($element->getRows() as $row) {
// foreach ($row->getCells() as $cell) {
// $text .= $this->getTextFromElement($cell);
// }
// }
// return $text;
// }
// if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
// foreach ($element->getElements() as $child) {
// $text .= $this->getTextFromElement($child);
// }
// return $text;
// }
// // 处理嵌套元素(逻辑不变,增强方法存在性校验)
// if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
// foreach ($element->getElements() as $child) {
// // 双重校验,避免非元素对象传入
// if (is_object($child) && $child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
// $textPart = $this->getTextFromElement($child);
// $text .= $textPart;
// }
// }
// }
// // 处理文本元素(逻辑不变,保持特殊引号替换)
// if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
// $textPart = (string)$element->getText(); // 强制转字符串,避免空值
// $textPart = strtr($textPart, $specialQuotesMap);
// $text .= $textPart;
// }
// // 处理超链接(逻辑不变,优化变量类型转换)
// if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
// $target = (string)$element->getTarget();
// if (strpos($target, 'mailto:') === 0) {
// $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
// }
// $linkText = strtr((string)$element->getText(), $specialQuotesMap);
// $text .= $linkText . ' ';
// }
// // 处理字段和注释(逻辑不变,增加类型转换,避免非字符串拼接)
// if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
// $text .= (string)$element->getContent() . ' ';
// }
// if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
// $text .= (string)$element->getContent() . ' ';
// }
// // 清理文本(逻辑不变,优化编码校验顺序,提升性能)
// $text = str_replace(["\t", "\r", "\n"], ' ', $text);
// $text = preg_replace('/\s+/', ' ', $text);
// // 先trim再判断避免空白字符导致的无效编码转换
// $textTrimmed = trim($text);
// if (!empty($textTrimmed) && !mb_check_encoding($textTrimmed, 'UTF-8')) {
// $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
// }
// return $text;
// }
/**
 * Cut two intervals out of a text using three markers, searched in order:
 * the text between the "abstract" and "keywords" markers, and the text
 * between the "keywords" marker and the next "end_span" marker.
 *
 * The first two markers are matched case-insensitively, the end marker
 * exactly. Leading ':', ' ', '-' and em-dash characters after a marker are
 * removed from each interval.
 *
 * @param string $str     Text to search.
 * @param array  $markers Optional overrides for the keys 'abstract',
 *                        'keywords' and 'end_span'; other keys are ignored.
 * @return array Keys: abstract_to_keywords, keywords_to_end, positions
 *               (byte offsets, -1 when not found), is_valid, error.
 */
public function extractContentIntervals($str, $markers = []) {
    // 1. Resolve the markers explicitly. (extract() was used before, which
    //    let an unexpected key such as 'str' overwrite local variables.)
    $markers = array_merge([
        'abstract' => 'abstract',
        'keywords' => 'keywords',
        'end_span' => '===========end-span'
    ], $markers);
    $abstract = $markers['abstract'];
    $keywords = $markers['keywords'];
    $end_span = $markers['end_span'];

    // 2. Result skeleton.
    $result = [
        'abstract_to_keywords' => '',
        'keywords_to_end' => '',
        'positions' => [
            'abstract' => -1,
            'keywords' => -1,
            'end_span' => -1
        ],
        'is_valid' => false,
        'error' => ''
    ];

    // 3. Abstract marker (case-insensitive).
    $absPos = stripos($str, $abstract);
    if ($absPos === false) {
        $result['error'] = "未找到标记: {$abstract}";
        return $result;
    }
    $result['positions']['abstract'] = $absPos;
    $absEndPos = $absPos + strlen($abstract);

    // 4. Keywords marker, searched after the abstract marker.
    $keyPos = stripos($str, $keywords, $absEndPos);
    if ($keyPos === false) {
        $result['error'] = "未找到 {$keywords} 或在 {$abstract} 之前";
        return $result;
    }
    $result['positions']['keywords'] = $keyPos;
    $keyEndPos = $keyPos + strlen($keywords);

    // 5. End marker, searched after the keywords marker (exact match).
    $endPos = strpos($str, $end_span, $keyEndPos);
    if ($endPos === false) {
        $result['error'] = "未找到 {$end_span} 或在 {$keywords} 之前";
        return $result;
    }
    $result['positions']['end_span'] = $endPos;

    // 6. Strip the punctuation that directly follows a marker. This works on
    //    whole characters: ltrim() with a multi-byte character in its list
    //    strips single bytes and could cut a leading UTF-8 character in half.
    $stripLeading = function ($part) {
        $part = trim($part);
        $clean = preg_replace('/^[: \-\x{2014}]+/u', '', $part);
        if ($clean === null) {
            // Not valid UTF-8: strip the ASCII characters only.
            $clean = ltrim($part, ': -');
        }
        return trim($clean);
    };
    $result['abstract_to_keywords'] = $stripLeading(substr($str, $absEndPos, $keyPos - $absEndPos));
    $result['keywords_to_end'] = $stripLeading(substr($str, $keyEndPos, $endPos - $keyEndPos));

    // 7. Both intervals were found.
    $result['is_valid'] = true;
    return $result;
}
/**
 * Extract the abstract, the keywords and the funding text from the document.
 *
 * All non-empty element texts are concatenated, each followed by the
 * "===========end-span" marker, and the abstract/keywords intervals are cut
 * out with extractContentIntervals(). While concatenating, the text gathered
 * so far is checked with getMatchedFundPhrases() until funding text has been
 * captured.
 *
 * @return array ['status' => 1, 'msg' => ..., 'data' => ['abstrart' => ...,
 *               'keywords' => ..., 'fund' => ...]]
 */
public function extractFromWord() {
    $endMarker = "===========end-span";
    $sContent = '';
    $sFundContent = '';

    // Converts a non-empty, non-UTF-8 string from GBK; returns anything else as is.
    $ensureUtf8 = function ($value) {
        if (!empty($value) && !mb_check_encoding($value, 'UTF-8')) {
            return mb_convert_encoding($value, 'UTF-8', 'GBK');
        }
        return $value;
    };

    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $textContent = $this->getTextFromElement($element);
            if (empty($textContent)) {
                continue;
            }
            $textContent = $ensureUtf8($textContent);

            // Funding text: looked up in the content gathered before this element.
            if (empty($sFundContent)) {
                $aFund = $this->getMatchedFundPhrases($sContent);
                if (!empty($aFund[0])) {
                    $position = stripos($sContent, $aFund[0]);
                    $sFundContent = trim(str_ireplace($aFund[0], '', substr($sContent, $position)));
                    // Keep only what precedes "Peer review", if present.
                    if (preg_match('/^(.*?)Peer review/', $sFundContent, $matches)) {
                        $sFundContent = $matches[1];
                    }
                }
            }

            $sContent .= $textContent . $endMarker;
        }
    }

    $sContent = $ensureUtf8($sContent);
    $result = $this->extractContentIntervals($sContent);

    $abstract = $ensureUtf8(empty($result['abstract_to_keywords']) ? '' : $result['abstract_to_keywords']);
    $keywords = $ensureUtf8(empty($result['keywords_to_end']) ? '' : $result['keywords_to_end']);
    $sFundContent = $ensureUtf8($sFundContent);

    // Remove the span markers and decode; empty values stay ''.
    $finish = function ($value) {
        return empty($value) ? '' : $this->fullDecode(str_replace('===========end-span', '', $value));
    };

    return [
        'status' => 1,
        'msg' => '提取成功',
        'data' => [
            'abstrart' => $finish($abstract),
            'keywords' => $finish($keywords),
            'fund' => $finish($sFundContent)
        ]
    ];
}
/**
 * Core decoding routine: repairs escaped or garbled comparison symbols (≤, ≥, ≠) in text.
 *
 * Each pass, in order:
 *   1. decodes \uXXXX / \UXXXX escapes;
 *   2. replaces the entities listed in the local map, then runs html_entity_decode();
 *   3. replaces hex/byte dumps of the byte sequences E2 89 86/87/80 with ≤/≥/≠;
 *   4. replaces "&#2264"-style leftovers, and GBK symbol bytes when the text is not valid UTF-8;
 *   5. collapses repeated symbols and applies the "?<digits>" heuristics.
 * Passes repeat until nothing changes or $maxDepth passes have run.
 *
 * @param string $str      String to decode (null is treated as an empty string)
 * @param int    $maxDepth Maximum number of decoding passes
 * @return string Decoded text; if anything throws, the trimmed input is returned instead
 */
private function fullDecode($str = '', int $maxDepth = 2){
    try {
        if ($str === null || trim((string)$str) === '' || $maxDepth <= 0) {
            return $str === null ? '' : trim((string)$str);
        }
        $str = (string)$str;

        // Turns one \uXXXX match into its UTF-8 character. Control codes (< 0x20) and
        // code points mb_chr() rejects (lone surrogates) are left as the original escape
        // instead of being silently deleted.
        $unicodeCallback = static function ($m) {
            $code = hexdec($m[1]);
            if ($code < 0x20) {
                return $m[0];
            }
            $char = mb_chr($code, 'UTF-8');
            return $char === false ? $m[0] : $char;
        };

        // preg_replace()/preg_replace_callback() return null on PCRE errors (for example a
        // /u pattern on invalid UTF-8); keep the subject unchanged in that case.
        $safeReplace = static function (string $pattern, $replacement, string $subject): string {
            $result = $replacement instanceof \Closure
                ? preg_replace_callback($pattern, $replacement, $subject)
                : preg_replace($pattern, $replacement, $subject);
            return $result ?? $subject;
        };

        // Initial Unicode escape decoding
        if (method_exists($this, 'decodeUnicode')) {
            $str = $this->decodeUnicode($str) ?? $str;
        } else {
            $str = $safeReplace('/\\\\[uU]([0-9a-fA-F]{4})/', $unicodeCallback, $str);
        }

        // Regular expressions used inside the loop
        $regexps = [
            'qMarkNum' => '/\\?(\d+)/',
            'qMarkDotNum' => '/\\?(\.\d+)/',
            'neNum' => '/≠\s*(\d+)/u',
            'leNum' => '/≤\s*(\d+)/u',
            'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
            'leNeMark' => '/(LE|NE)\s*\\?(\d+)/i',
            'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/',
            // Literal text such as "\xE2 0x89 0x86" (backslash included), with optional spaces
            'wordBinSpaced' => '/(\\\\xE2)\s*(0x89)\s*(0x8[067])/i',
            'wordEntity' => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i',
            'repeatSymbol' => '/(≤|≥|≠)\1+/u',
        ];

        // Replacement maps
        $maps = [
            'htmlEntity' => [
                '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', '&#X2264;' => '≤',
                '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', '&#X2265;' => '≥',
                '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '&#X2260;' => '≠',
                '&le' => '≤', '&ge' => '≥', '&ne' => '≠',
                '&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠',
                '&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠',
                '&#60;' => '≤', '&#62;' => '≥',
            ],
            'wordBin' => [
                "\xE2\x89\x86" => '≤', "\xE2\x89\x87" => '≥', "\xE2\x89\x80" => '≠',
                'e28986' => '≤', '\xe2\x89\x86' => '≤', '\xe20x890x86' => '≤',
                'e28987' => '≥', '\xe2\x89\x87' => '≥', '\xe20x890x87' => '≥',
                'e28980' => '≠', '\xe2\x89\x80' => '≠', '\xe20x890x80' => '≠',
            ],
            'wordEntity' => ['2264' => '≤', '2265' => '≥', '2260' => '≠'],
            'gbkSymbol' => ["\xA1\xF2" => '≤', "\xA1\xF3" => '≥', "\xA1\xF0" => '≠'],
        ];

        $wordEntityCallback = static function ($m) use ($maps) {
            return $maps['wordEntity'][$m[1]] ?? $m[0];
        };
        $leNeCallback = static function ($m) {
            return strtoupper($m[1]) === 'LE' ? '≤' . $m[2] : '≠' . $m[2];
        };

        $depth = 0;
        $currentStr = $str;
        // Decoding loop: stop after $maxDepth passes or as soon as a pass changes nothing
        do {
            $depth++;
            $prevStr = $currentStr;

            // Unicode escape decoding
            $currentStr = $safeReplace($regexps['unicode'], $unicodeCallback, $currentStr);

            // HTML entity replacement: custom map first, then the generic decoder
            $currentStr = strtr($currentStr, $maps['htmlEntity']);
            $currentStr = html_entity_decode(
                $currentStr,
                ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
                'UTF-8'
            );

            // Word symbol dump repair. Whitespace is collapsed only inside a spaced dump
            // ("\xE2 0x89 0x86"); the rest of the text keeps its spaces.
            $currentStr = $safeReplace($regexps['wordBinSpaced'], '$1$2$3', $currentStr);
            $currentStr = str_ireplace(
                array_keys($maps['wordBin']),
                array_values($maps['wordBin']),
                $currentStr
            );

            $currentStr = $safeReplace($regexps['wordEntity'], $wordEntityCallback, $currentStr);

            // In valid UTF-8 the pairs A1 F0/F2/F3 can only be the last byte of one character
            // followed by the lead byte of the next, so replacing them would corrupt both.
            // Apply the GBK map only when the text is not valid UTF-8.
            if (!mb_check_encoding($currentStr, 'UTF-8')) {
                $currentStr = strtr($currentStr, $maps['gbkSymbol']);
            }

            $currentStr = $safeReplace($regexps['repeatSymbol'], '$1', $currentStr);

            // Business-specific replacements
            $currentStr = $safeReplace($regexps['neNum'], '≠$1', $currentStr);
            $currentStr = $safeReplace($regexps['leNum'], '≤$1', $currentStr);
            // The specific "?" rules must run before the generic "?<digits>" rule below,
            // otherwise the generic rule consumes the "?" and they can never match.
            $currentStr = $safeReplace($regexps['mixSymbol'], '≤≥≠$1', $currentStr);
            $currentStr = $safeReplace($regexps['leNeMark'], $leNeCallback, $currentStr);
            $currentStr = $safeReplace($regexps['qMarkNum'], '≥$1', $currentStr);
            $currentStr = $safeReplace($regexps['qMarkDotNum'], '≥0$1', $currentStr);
        } while ($depth < $maxDepth && $currentStr !== $prevStr);

        // Final cleanup: strip surrounding colons and map any remaining listed entities
        $currentStr = trim($currentStr, ':');
        $currentStr = strtr($currentStr, $maps['htmlEntity']);
        return $currentStr;
    } catch (\Throwable $e) {
        // Best effort: decoding must never break extraction, so fall back to the input
        return trim((string)$str);
    }
}
// private function fullDecode($str, $maxDepth = 5) {
// // 空值/深度为0直接返回提前终止避免无效操作
// if (empty($str) || $maxDepth <= 0) {
// return $str;
// }
// // 【性能优化1预编译所有正则表达式】避免每次循环重新解析正则
// // 预编译:≥专属场景正则
// $regOb0 = '/0B\s*\?0/';
// $regDl18 = '/DL\s*\?.18/';
// // 预编译:≥通用场景正则
// $regQMarkNum = '/\?(\d+)/';
// $regQMarkDotNum = '/\?(\.\d+)/';
// // 预编译:≤、≠空格修复正则
// $regNeNum = '/≠\s*(\d+)/';
// $regLeNum = '/≤\s*(\d+)/';
// // 预编译:混合符号乱码正则(中文顿号/英文逗号)
// $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/';
// $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/';
// // 预编译:≤、≠专属标识正则
// $regLeMark = '/LE\s*\?(\d+)/';
// $regNeMark = '/NE\s*\?(\d+)/';
// // 预编译Unicode转义正则提取到外部避免闭包重复创建
// $regUnicode = '/\\\\u([0-9a-fA-F]{4})/';
// // 【性能优化2预定义常量/映射】避免循环内重复创建数组/字符串
// // HTML实体映射一次性定义避免循环内重复赋值
// $htmlEntityMap = [
// '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤',
// '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥',
// '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠',
// ];
// // 不间断空格替换数组
// $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)];
// // Unicode回调函数预定义避免循环内重复创建闭包
// $unicodeCallback = function ($m) {
// return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
// };
// $original = $str;
// $depth = 0;
// $hasChange = false; // 标记是否有变化,提前终止循环
// // 循环解码:仅在有变化且未达最大深度时执行
// do {
// $depth++;
// $hasChange = false;
// $prevStr = $str; // 保存当前状态,用于判断变化
// // 1. 解码Unicode转义\uXXXX格式
// $str = $this->decodeUnicode($str);
// // 2. 解码HTML实体先替换专属实体再执行通用解码
// $str = strtr($str, $htmlEntityMap); // 高性能替换strtr比str_replace快
// $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
// // 3. 再次处理遗漏的Unicode转义使用预编译正则+预定义回调)
// $str = preg_replace_callback($regUnicode, $unicodeCallback, $str);
// // 4. 替换不间断空格为普通空格strtr比str_replace更高效
// $str = str_replace($nbspReplace, ' ', $str);
// // 5. 核心替换逻辑(优化执行顺序,避免覆盖)
// // 5.1 原有≥专属场景(保留)
// $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1);
// $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2);
// // 5.2 ≤、≠空格修复(保留)
// $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3);
// $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4);
// // 5.3 原有≥通用场景(保留)
// $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5);
// $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6);
// // 5.4 混合符号乱码还原(保留)
// $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7);
// $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8);
// // 5.5 ≤、≠专属标识还原(保留)
// $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9);
// $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10);
// // 5.6 修复前缀"d with "乱码(保留)
// $str = str_replace('d with ', 'd with ', $str, $count11);
// // 【性能优化3统计所有替换次数判断是否有变化】
// $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 +
// $count7 + $count8 + $count9 + $count10 + $count11;
// if ($totalCount > 0 || $str !== $prevStr) {
// $hasChange = true;
// $original = $str;
// }
// // 【性能优化4提前终止】单次循环无变化直接退出
// if (!$hasChange) {
// break;
// }
// } while ($depth < $maxDepth); // 改用do-while减少循环判断次数
// // 最终清理仅执行一次trim
// return trim($str, ':');
// }
// private function fullDecode($str, $maxDepth = 5) {
// if (empty($str) || $maxDepth <= 0) {
// return $str;
// }
// $original = $str;
// $depth = 0;
// // 循环解码,直到无变化或达到最大次数
// while (true) {
// $depth++;
// if ($depth > $maxDepth) {
// break; // 防止过度解码导致死循环
// }
// // 1. 解码 Unicode 转义(\uXXXX 格式)
// $str = $this->decodeUnicode($str);
// // 2. 解码 HTML 实体(&amp;、&#039;、&lt; 等)
// $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
// $str = preg_replace_callback('/\\\\u([0-9a-fA-F]{4})/', function ($m) {
// return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
// }, $str);
// $str = str_replace([chr(0xC2).chr(0xA0), chr(0xA0)], ' ', $str);
// // 2. 核心:强制匹配所有可能的乱码格式,还原≥
// // 匹配0B?0、0B ?0、0B ?0空格/制表符)→ 0B≥30
// $str = preg_replace('/0B\s*\?0/', '0B≥30', $str);
// // 匹配DL?.18、DL ?.18、DL ?.18 → DL≥0.18
// $str = preg_replace('/DL\s*\?.18/', 'DL≥0.18', $str);
// // 通用匹配:数字前的?(如?30、?0.18)→ ≥30、≥0.18(防止其他变体)
// $str = preg_replace('/\?(\d+)/', '≥$1', $str);
// $str = preg_replace('/\?(\.\d+)/', '≥0$1', $str);
// // 3. 修复前缀的"d with "可能的乱码(若有)
// $str = str_replace('d with ', 'd with ', $str); // 若前缀也乱码,可同步替换
// // 若解码后无变化,退出循环
// if ($str === $original) {
// break;
// }
// $original = $str;
// }
// return trim($str,':');
// }
/**
 * Decode lowercase-"u" Unicode escapes (\uXXXX) in a string into UTF-8 characters.
 *
 * A high surrogate escape immediately followed by a low surrogate escape
 * (e.g. \ud83d\ude00) is decoded together as one UTF-16BE pair, because UCS-2
 * cannot represent characters outside the Basic Multilingual Plane. Any other
 * \uXXXX escape is decoded on its own from UCS-2BE.
 *
 * @param string $str Text that may contain \uXXXX escapes
 * @return string|null Decoded text, or null if preg_replace_callback() fails
 */
private function decodeUnicode($str) {
    return preg_replace_callback(
        '/\\\\u([dD][89abAB][0-9a-fA-F]{2})\\\\u([dD][c-fC-F][0-9a-fA-F]{2})|\\\\u([0-9a-fA-F]{4})/',
        static function ($matches) {
            // Group 3 is set only for the single-escape alternative
            if (isset($matches[3]) && $matches[3] !== '') {
                // Convert one hexadecimal code unit to a UTF-8 character
                return mb_convert_encoding(pack('H*', $matches[3]), 'UTF-8', 'UCS-2BE');
            }
            // Surrogate pair: decode both 16-bit units as a single UTF-16BE character
            return mb_convert_encoding(pack('H*', $matches[1] . $matches[2]), 'UTF-8', 'UTF-16BE');
        },
        $str
    );
}
/**
 * Report which known funding lead-in phrases occur in the given text.
 *
 * Matching is case-insensitive. Each hit is mapped back to its canonical
 * spelling from the phrase list; the result holds every canonical phrase once,
 * ordered by first appearance in the text.
 *
 * @param string $content Text to scan
 * @return array List of canonical phrases found (empty when none match or the text is empty)
 */
private function getMatchedFundPhrases($content = '') {
    if (empty($content)) {
        return [];
    }

    // Canonical funding phrases
    $fundPhrases = [
        'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
        'Funding was provided by', 'Funded in part by','FUNDING:'
    ];

    // Build one alternation of the escaped phrases, using "#" as the delimiter.
    // The group is capturing so the text of each hit can be read back.
    $alternatives = [];
    foreach ($fundPhrases as $phrase) {
        $alternatives[] = preg_quote($phrase, '#');
    }
    $pattern = '#(' . implode('|', $alternatives) . ')#i';

    preg_match_all($pattern, $content, $hits);
    if (empty($hits[1])) {
        return [];
    }

    // Keys act as an insertion-ordered set: a phrase seen again keeps its first position.
    $found = [];
    foreach ($hits[1] as $hit) {
        foreach ($fundPhrases as $canonical) {
            if (strcasecmp($hit, $canonical) !== 0) {
                continue;
            }
            $found[$canonical] = true;
            break; // a hit corresponds to at most one canonical phrase
        }
    }

    return array_keys($found);
}
/**
 * Log printing hook. Currently a no-op: the only statement in the body is
 * commented out, so calling this method has no effect.
 *
 * @param mixed $msg Message to log (unused while the echo below stays disabled)
 */
private function log($msg){
// echo date('[Y-m-d H:i:s] ') . $msg . "\n";
}
}