Files
tougao/application/common/ArticleParserService.php
2025-12-02 13:17:23 +08:00

1577 lines
68 KiB
PHP
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use PhpOffice\PhpWord\IOFactory;
use think\Exception;
use ZipArchive;
use RecursiveIteratorIterator;
use RecursiveDirectoryIterator;
use PhpOffice\PhpWord\Settings;
use PhpOffice\PhpWord\Element\TextRun;
use DOMDocument;
use DOMXPath;
// use BadMethodCallException;
class ArticleParserService
{
private $phpWord;
private $sections;
private $iNum = 0;
/**
 * Load the Word document at $filePath and cache its sections.
 *
 * When the file does not exist the parser is left with an empty section
 * list (a constructor cannot return a value, so no status is reported here).
 * When the first load fails, the document is re-packed without EMF images
 * via removeEmfFromDocx() and loaded again; the temporary copy is always
 * removed afterwards, even if the second load throws.
 *
 * @param string $filePath Path of the document to parse.
 * @throws \Exception When the document cannot be loaded even after EMF removal.
 */
public function __construct($filePath = '')
{
    // Start from an iterable state so later passes never iterate over null.
    $this->sections = [];
    if (!file_exists($filePath)) {
        return;
    }
    // Shared loader: "read data only" is disabled to keep the full section structure.
    $load = function ($path) {
        $reader = IOFactory::createReader();
        $reader->setReadDataOnly(false);
        Settings::setCompatibility(false);
        Settings::setOutputEscapingEnabled(true); // avoid XML escaping conflicts
        return $reader->load($path);
    };
    try {
        // Parse the document once (the original parsed it twice).
        $this->phpWord = $load($filePath);
    } catch (\Exception $e) {
        // Fallback: strip EMF images from the archive and retry on the copy.
        $processedFilePath = $this->removeEmfFromDocx($filePath);
        try {
            $this->phpWord = $load($processedFilePath);
        } finally {
            // Never leave the temporary copy behind, even when the retry fails.
            if (is_file($processedFilePath)) {
                unlink($processedFilePath);
            }
        }
    }
    $this->sections = $this->phpWord->getSections();
}
/**
 * Build a copy of a DOCX archive with every EMF image file removed.
 *
 * The archive is extracted into a fresh "docx_temp_*" directory under
 * ROOT_PATH/runtime, files with an .emf extension are deleted, and the
 * remaining files are re-packed into "<tempDir>_processed.docx". The
 * extraction directory is removed on every exit path.
 *
 * @param string $docxPath Path of the original DOCX file.
 * @return string Path of the processed temporary DOCX (the caller deletes it).
 * @throws \Exception When the archive cannot be opened, extracted or rebuilt.
 */
private function removeEmfFromDocx($docxPath){
    $zip = new ZipArchive();
    if ($zip->open($docxPath) !== true) {
        throw new \Exception("无法打开 DOCX 文件:{$docxPath}");
    }
    // 1. Create the temporary extraction directory.
    $tempDir = rtrim(ROOT_PATH,'/').'/runtime/'.uniqid('docx_temp_');
    if (!mkdir($tempDir, 0700, true) && !is_dir($tempDir)) {
        $zip->close();
        throw new \Exception("Unable to create temporary directory: {$tempDir}");
    }
    $processedPath = $tempDir . '_processed.docx';
    $done = false;
    try {
        // 2. Extract the DOCX into the temporary directory.
        $extracted = $zip->extractTo($tempDir);
        $zip->close();
        if ($extracted !== true) {
            throw new \Exception("Unable to extract DOCX file: {$docxPath}");
        }
        // 3. Collect every EMF file first, then delete, so the directory
        //    is not modified while it is being iterated.
        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($tempDir, \FilesystemIterator::SKIP_DOTS)
        );
        $emfFiles = [];
        foreach ($iterator as $file) {
            if ($file->isFile() && strtolower($file->getExtension()) === 'emf') {
                $emfFiles[] = $file->getPathname();
            }
        }
        foreach ($emfFiles as $emfFile) {
            unlink($emfFile);
        }
        // 4. Re-pack the remaining files as a DOCX.
        $newZip = new ZipArchive();
        if ($newZip->open($processedPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
            throw new \Exception("无法创建处理后的 DOCX 文件");
        }
        $this->addFilesToZip($tempDir, $newZip);
        // ZipArchive writes the file on close(); a false result means no usable archive.
        if ($newZip->close() !== true) {
            throw new \Exception("Unable to write processed DOCX file: {$processedPath}");
        }
        $done = true;
    } finally {
        // 5. Always remove the extraction directory; drop a half-written archive on failure.
        $this->deleteDir($tempDir);
        if (!$done && is_file($processedPath)) {
            unlink($processedPath);
        }
    }
    return $processedPath;
}
/**
 * Recursively add every file below $dir to a ZipArchive.
 *
 * Entry names are computed relative to $baseDir (the directory of the first,
 * outermost call), so the archive reproduces the directory layout exactly:
 * "<base>/word/media/a.png" is stored as "word/media/a.png". The previous
 * implementation stripped dirname() of the *current* directory, which
 * prefixed root files with the temp directory name and dropped the parent
 * directories of deeper files.
 *
 * @param string      $dir     Directory whose files are added.
 * @param ZipArchive  $zip     Open archive that receives the files.
 * @param string|null $baseDir Archive root; defaults to $dir on the outermost call.
 */
private function addFilesToZip($dir, $zip, $baseDir = null)
{
    $dir = rtrim($dir, '/\\');
    $baseDir = $baseDir === null ? $dir : rtrim($baseDir, '/\\');
    $files = scandir($dir);
    if ($files === false) {
        // Unreadable directory: nothing can be added from it.
        return;
    }
    foreach ($files as $file) {
        if ($file === '.' || $file === '..') {
            continue;
        }
        $filePath = $dir . '/' . $file;
        if (is_dir($filePath)) {
            $this->addFilesToZip($filePath, $zip, $baseDir);
            continue;
        }
        // Paths below $baseDir are always joined with '/', so dropping the
        // base prefix plus one separator yields the zip entry name.
        $relativePath = substr($filePath, strlen($baseDir) + 1);
        $zip->addFile($filePath, $relativePath);
    }
}
/**
 * Delete a temporary "docx_temp_*" directory and everything inside it.
 *
 * Safety checks are applied to the top-level directory only: it must exist,
 * its name must start with "docx_temp_", and its real path must lie inside
 * ROOT_PATH/runtime. The contents are then removed by purgeDocxTempTree().
 * The previous implementation re-ran the name-prefix check on every
 * subdirectory, so folders such as "word" or "_rels" were never deleted.
 *
 * @param string $dir Directory to delete.
 * @return bool True only when the directory and all of its contents were removed.
 */
private function deleteDir($dir){
    // 1. Basic validation: non-empty path that is an existing directory.
    if (trim($dir) === '' || !is_dir($dir)) {
        return false;
    }
    // 2. Normalise the path (drop trailing separators of either style).
    $dir = rtrim($dir, '/\\');
    // 3. Only directories created by this class (docx_temp_ prefix) may be deleted.
    if (strpos(basename($dir), 'docx_temp_') !== 0) {
        return false;
    }
    // 4. The directory must resolve to a location inside the runtime directory.
    //    The separator is appended so that a sibling such as "runtime_x" does not pass.
    $realDir = realpath($dir);
    $realRuntimeDir = realpath(rtrim(ROOT_PATH, '/') . '/runtime');
    if ($realDir === false || $realRuntimeDir === false
        || strpos($realDir, $realRuntimeDir . DIRECTORY_SEPARATOR) !== 0) {
        return false;
    }
    // 5. Remove the validated tree.
    return $this->purgeDocxTempTree($realDir);
}
/**
 * Recursively remove $dir, which deleteDir() has already validated.
 *
 * Symbolic links are unlinked, never followed, so nothing outside the tree
 * is touched.
 *
 * @param string $dir Directory to remove.
 * @return bool True when every entry and the directory itself were removed.
 */
private function purgeDocxTempTree($dir)
{
    $entries = @scandir($dir);
    if ($entries === false) {
        return false;
    }
    $allRemoved = true;
    foreach ($entries as $entry) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }
        $path = $dir . DIRECTORY_SEPARATOR . $entry;
        if (is_link($path) || !is_dir($path)) {
            if (!@unlink($path)) {
                $allRemoved = false;
            }
            continue;
        }
        if (!$this->purgeDocxTempTree($path)) {
            $allRemoved = false;
        }
    }
    // rmdir() only succeeds on an empty directory, so a failed child keeps the parent.
    return @rmdir($dir) && $allRemoved;
}
/**
 * Entry point: validate the uploaded file, parse it and return the result as JSON.
 *
 * Status codes in the returned JSON: 1 success, 2 missing path, 3 file not
 * found, 4 file unreadable, 5 parsing failed (document could not be loaded,
 * no title found, or the result could not be encoded).
 *
 * @param string $sFileUrl Path of the uploaded document.
 * @return string JSON object with "status", "msg" and, on success, "data".
 */
public static function uploadAndParse($sFileUrl){
    // Required value check.
    if(empty($sFileUrl)){
        return json_encode(['status' => 2,'msg' => 'Please upload the submission file']);
    }
    // The file must exist and be readable.
    if (!file_exists($sFileUrl)) {
        return json_encode(['status' => 3, 'msg' => 'The uploaded file does not exist']);
    }
    if (!is_readable($sFileUrl)) {
        return json_encode(['status' => 4, 'msg' => 'The uploaded file is unreadable']);
    }
    // Parse the document; a load failure is reported instead of escaping as an exception.
    try {
        $oDealFile = new self($sFileUrl);
    } catch (\Exception $e) {
        return json_encode(['status' => 5, 'msg' => $e->getMessage()]);
    }
    // Title.
    $sTitle = $oDealFile->getTitle();
    if(empty($sTitle)){
        return json_encode(['status' => 5, 'msg' => 'Article title retrieval failed']);
    }
    // Authors (the raw title is used to locate the author paragraph).
    $aParam = ['title' => $sTitle];
    $aAuthor = $oDealFile->getAuthors($aParam);
    $aParam['author'] = empty($aAuthor['author']) ? [] : $aAuthor['author']; // all authors
    $aParam['report'] = empty($aAuthor['report']) ? [] : $aAuthor['report']; // corresponding author names
    // Institutions.
    $aParam['company'] = $oDealFile->getCompany($aParam);
    // Corresponding author details.
    $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam);
    // Abstract, keywords and funding.
    $aContent = $oDealFile->extractFromWord();
    // Normalise the title that is actually returned (the original converted
    // a local copy that was never used afterwards).
    if(!mb_check_encoding($sTitle, 'UTF-8')){
        $sTitle = mb_convert_encoding($sTitle, 'UTF-8', 'GBK');
    }
    $aParam['title'] = $oDealFile->fullDecode($sTitle);
    $aParam += empty($aContent['data']) ? [] : $aContent['data'];
    $sJson = json_encode(['status' => 1,'msg' => 'success','data' => $aParam]);
    if ($sJson === false) {
        // json_encode() returns false e.g. on malformed UTF-8; report it rather than returning false.
        return json_encode(['status' => 5, 'msg' => 'Failed to encode the parsed result: ' . json_last_error_msg()]);
    }
    return $sJson;
}
/**
 * Extract the article title.
 *
 * The title is taken to be the first element, in document order, whose
 * trimmed text is longer than three characters.
 *
 * @return string The trimmed title, or '' when no section or no such element exists.
 */
private function getTitle(){
    if (empty($this->sections)) {
        return '';
    }
    foreach ($this->sections as $oSection) {
        foreach ($oSection->getElements() as $oElement) {
            $sCandidate = trim($this->getTextFromElement($oElement));
            // The first sufficiently long paragraph wins; later ones are never inspected.
            if (mb_strlen($sCandidate) > 3) {
                return $sCandidate;
            }
        }
    }
    return '';
}
/**
 * Parse an author line into a list of authors with their superscript markers.
 *
 * Steps, in order: convert to UTF-8 (from GBK) when needed, run fullDecode(),
 * replace the character sequences listed below, turn " and " into ", ",
 * remove the space in "digit, digit" and "digit marker" sequences, collapse
 * repeated spaces, then walk the string character by character splitting it
 * into name / superscript pairs.
 *
 * Each returned entry has the keys:
 *  - name        trimmed author name
 *  - superscript trimmed marker string (digits, commas, #, *, †, ‡, §)
 *  - company_id  unique integers found in the superscript
 *  - is_super    1 when the superscript contains '#', otherwise 0
 *  - is_report   1 when the superscript contains '*' or '†', otherwise 0
 *
 * @param string $str Raw author line.
 * @return array List of author entries; [] for empty input.
 */
private function parseAuthorsWithoutRegex($str = '') {
if (empty($str)) {
return [];
}
if(!mb_check_encoding($str, 'UTF-8')){
$str = mb_convert_encoding($str, 'UTF-8', 'GBK');
}
$str = $this->fullDecode($str);
// Replace special sequences with a space or an ASCII digit.
// NOTE(review): several search strings appear empty here; they may hold invisible characters — confirm against the raw file.
$str = str_replace(["\xC2\xA0", 'ï¼', '�', '', '', '', '', '', '', '', '', '', '', ''],
[' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], $str);
$str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str));
// Remove the space between a comma and the next digit inside a superscript ("2, 3" -> "2,3"),
// so that the comma+space author separator below does not split it.
$len = mb_strlen($str);
$processed = '';
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($str, $i, 1);
if ($char === ',' && $i - 1 >= 0 && $i + 2 < $len) {
$prevChar = mb_substr($str, $i - 1, 1);
$next1 = mb_substr($str, $i + 1, 1);
$next2 = mb_substr($str, $i + 2, 1);
// Digit before the comma, then a space, then a digit: keep the comma and skip the space.
if ((ctype_digit($prevChar) || is_numeric($prevChar)) && $next1 === ' ' && (ctype_digit($next2) || is_numeric($next2))) {
$processed .= $char;
$i += 1;
continue;
}
}
$processed .= $char;
}
$str = $processed;
// Remove the space between a digit and a following marker symbol ("1 *" -> "1*").
$len = mb_strlen($str);
$processed = '';
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($str, $i, 1);
if ((ctype_digit($char) || is_numeric($char)) && $i + 2 < $len) { // current character is a digit
$next1 = mb_substr($str, $i + 1, 1);
$next2 = mb_substr($str, $i + 2, 1);
if ($next1 === ' ' && in_array($next2, ['#', '*', '†', '‡', '§'])) { // supported marker symbols
$processed .= $char;
$i += 2;
$processed .= $next2;
continue;
}
}
$processed .= $char;
}
$str = $processed;
// Collapse runs of spaces into a single space.
$len = mb_strlen($str);
$processed = '';
$prevSpace = false;
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($str, $i, 1);
if ($char === ' ') {
if (!$prevSpace) {
$processed .= $char;
$prevSpace = true;
}
} else {
$processed .= $char;
$prevSpace = false;
}
}
$str = trim($processed);
// Split into authors: $inName is true while name characters are being collected
// and false once superscript characters have started.
$authors = [];
$currentName = '';
$currentSuperscript = '';
$inName = true;
$len = mb_strlen($str);
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($str, $i, 1);
// Author separator: a comma followed by a space closes the current author.
if ($char === ',' && $i + 1 < $len) {
$nextChar = mb_substr($str, $i + 1, 1);
if ($nextChar === ' ') {
if (!empty($currentName)) {
$currentSuperscript = rtrim($currentSuperscript, ',');
$authors[] = [
'name' => trim($currentName),
'superscript' => trim($currentSuperscript)
];
}
$currentName = '';
$currentSuperscript = '';
$inName = true;
$i++;
continue;
}
}
// Name characters: letters, space, dot, hyphen and the accented letters listed here.
// NOTE(review): ctype_alpha() is byte/locale based, so other multibyte letters presumably fall into the "ignored" branch — confirm.
if (ctype_alpha($char) || in_array($char, [' ', '.', '-', 'à', 'á', 'â', 'ã', 'ä', 'ç', 'è', 'é', 'ê', 'ë'])) {
if ($inName) {
$currentName .= $char;
} else {
// A name character right after a superscript starts the next author.
$currentSuperscript = rtrim($currentSuperscript, ',');
$authors[] = [
'name' => trim($currentName),
'superscript' => trim($currentSuperscript)
];
$currentName = $char;
$currentSuperscript = '';
$inName = true;
}
}
// Superscript characters: digits, comma, #, *, †, ‡, §.
elseif ((ctype_digit($char) || is_numeric($char)) || in_array($char, ['#', '*', '†', ',', '‡', '§'])) {
$inName = false;
$currentSuperscript .= $char;
}
// Every other character is ignored.
else {
continue;
}
}
// Flush the last author.
if (!empty($currentName)) {
$currentSuperscript = rtrim($currentSuperscript, ',');
$authors[] = [
'name' => trim($currentName),
'superscript' => trim($currentSuperscript)
];
}
// Derive institution ids and the first-author / corresponding-author flags from each superscript.
foreach ($authors as $index => &$author) {
// Collect each run of digits in the superscript as one integer id.
$institutionIds = [];
$superscript = $author['superscript'];
$numStr = '';
for ($i = 0; $i < mb_strlen($superscript); $i++) {
$c = mb_substr($superscript, $i, 1);
if (ctype_digit($c) || is_numeric($c)) { // digit: extend the current number
$numStr .= $c;
} else {
if (!empty($numStr)) {
$institutionIds[] = (int)$numStr;
$numStr = '';
}
}
}
if (!empty($numStr)) {
$institutionIds[] = (int)$numStr;
}
$institutionIds = array_values(array_unique($institutionIds));
$author['company_id'] = $institutionIds;
// '#' sets is_super; '*' or '†' sets is_report.
$author['is_super'] = strpos($superscript, '#') !== false ? 1 : 0;
$author['is_report'] = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
}
unset($author); // release the by-reference loop variable
return $authors;
}
/**
 * Locate the author paragraph (the paragraph following the title), normalise
 * it and parse it into author entries.
 *
 * @param array $aParam Optional 'title' key; when empty the title is looked up via getTitle().
 * @return array ['author' => list of parsed author entries,
 *                'report' => unique names of authors flagged is_report == 1]
 */
private function getAuthors($aParam = []) {
    $sTitleText = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sRaw = $this->getNextParagraphAfterText($sTitleText);
    if (empty($sRaw)) {
        return ['author' => [], 'report' => []];
    }
    if (!mb_check_encoding($sRaw, 'UTF-8')) {
        $sRaw = mb_convert_encoding($sRaw, 'UTF-8', 'GBK');
    }
    $sRaw = $this->fullDecode($sRaw);
    // Drop control characters and the U+200B..U+200F range.
    $sRaw = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sRaw);
    // Repair known garbled symbol sequences.
    $sRaw = strtr($sRaw, [
'†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†',
':' => ':', ',' => ',', '—' => '-',
'啊' => '' // removes this specific character, treated by the original code as a recurring artefact
    ]);
    // Unify separators to a comma.
    $sRaw = str_replace(['', ';', '', '、'], ',', $sRaw);
    // " and " becomes a comma separator, then whitespace runs are collapsed.
    $sRaw = preg_replace('/\s+and\s+/i', ', ', $sRaw);
    $sRaw = preg_replace('/\s+/', ' ', $sRaw);
    $aParsed = $this->parseAuthorsWithoutRegex(trim($sRaw));
    if (empty($aParsed)) {
        return ['author' => [],'report' => []];
    }
    $aKept = [];
    $aCorrNames = [];
    foreach ($aParsed as $aOne) {
        // Skip entries that carry neither a name nor a superscript.
        if (empty($aOne['name']) && empty($aOne['superscript'])) {
            continue;
        }
        $aKept[] = $aOne;
        $bNamed = !empty($aOne['name']);
        $bFlagged = !empty($aOne['is_report']) && $aOne['is_report'] == 1;
        if ($bNamed && $bFlagged) {
            $aCorrNames[] = $aOne['name'];
        }
    }
    return ['author' => $aKept,'report' => array_unique($aCorrNames)];
}
/**
 * Extract the institution list that follows the author paragraph.
 *
 * Every line returned by getContentAfterText() that starts with a label
 * (one letter or digit, followed by any number of digits) is stored under
 * that label; a later line with the same label overwrites the earlier one.
 * Lines without such a label are skipped.
 * NOTE(review): because a single leading letter counts as a label, an
 * unlabelled line such as "Department of X" is stored under "D" with the
 * first letter removed — confirm this is intended for "aDepartment" style input.
 *
 * @param array $aParam Optional 'title' (string) and 'authors' (raw author line) keys.
 * @return array Map of label => cleaned institution text.
 */
private function getCompany($aParam = []){
    // Title, then the author line below it, then everything after the author line.
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];
    $allLines = $this->getContentAfterText($sAuthorContent,1);
    if(empty($allLines)){
        return [];
    }
    // Group the lines by their leading label.
    $grouped = [];
    foreach ($allLines as $line) {
        $line = trim($line);
        if (empty($line)) {
            continue;
        }
        if(!mb_check_encoding($line, 'UTF-8')){
            $line = mb_convert_encoding($line, 'UTF-8', 'GBK');
        }
        $line = $this->fullDecode($line);
        $lineLen = strlen($line);
        // Label = first byte if it is an ASCII letter or digit, plus the digits that follow.
        $number = '';
        $i = 0;
        if ($lineLen > 0 && (ctype_digit($line[0]) || ctype_alpha($line[0]))) {
            $number = $line[0];
            $i = 1;
            while ($i < $lineLen && ctype_digit($line[$i])) {
                $number .= $line[$i];
                $i++;
            }
        }
        if (empty($number)) {
            continue;
        }
        // Skip '.', '*' and spaces between the label and the content.
        while ($i < $lineLen && (in_array($line[$i], ['.', '*', ' ']))) {
            $i++;
        }
        $grouped[$number] = trim(substr($line, $i));
    }
    $aCompany = [];
    foreach ($grouped as $number => $institution) {
        $institution = $this->fullDecode($institution);
        $institution = preg_replace('/\s+/', ' ', $institution); // collapse whitespace
        $institution = rtrim($institution, '.'); // drop trailing full stops
        $institution = preg_replace('/^\d+\s+/', '', $institution); // drop a leading number
        $institution = trim($institution);
        // Cut at the first '*', '#', dagger (U+2020) or "Email" (corresponding-author notes, addresses).
        // The /u modifier is required: without it the multibyte dagger inside the
        // character class matches single bytes and can cut a UTF-8 character in half.
        if (preg_match('/^(.*?)(?=\s*[\*#\x{2020}]|(?i)\s*Email)/u', $institution, $matches)) {
            $institution = trim($matches[1]);
        }
        if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) {
            $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
        }
        $aCompany[$number] = empty($institution) ? '' : trim(trim($institution),'.');
    }
    return $aCompany;
}
/**
 * Extract contact details (e-mail, postal address, telephone) of the corresponding authors.
 *
 * The text after the institution paragraph is split on '*' and every block
 * that mentions one of the names in $aParam['report'] yields one entry. When
 * that finds nothing, a "Corresponding Author(s): / Correspondence (to):"
 * sentence is searched for "name, e-mail" pairs instead.
 *
 * @param array $aParam Needs 'report' (corresponding author names); 'title' is optional.
 * @return array List of ['name', 'email', 'postal_address', 'tel'] entries; [] when
 *               there are no corresponding author names.
 */
private function getCorrespondingAuthors($aParam = []){
    $aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
    if(empty($aCorrespondingAuthor)){
        return [];
    }
    // Title -> author line -> raw text of the paragraph after the author line.
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    $sCompany = $this->getNextParagraphAfterText($sAuthorContent);
    if (empty($sCompany)) {
        // Fallback: join the parsed institution names.
        $aCompany = $this->getCompany($aParam);
        $sCompany = implode(' ', array_values($aCompany));
    }
    // Everything after the institution paragraph.
    $corrText = $this->getContentAfterText($sCompany);
    if(!mb_check_encoding($corrText, 'UTF-8')){
        $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
    }
    $corrText = $this->fullDecode($corrText);
    // Normalise the text.
    // NOTE(review): the search strings below appear empty/blank here; they may hold invisible characters — confirm against the raw file.
$corrText = str_replace(['', ''], [':', '@'], $corrText);
    $corrText = preg_replace('/\s+/', ' ', $corrText);
$corrText = str_replace(' ', ' ', $corrText);
    // One block per '*' marker.
    $corrBlocks = preg_split('/\s*\*\s*/', $corrText);
    $corrBlocks = array_filter(array_map('trim', $corrBlocks));
    $aCorresponding = [];
    foreach ($corrBlocks as $block) {
        $sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
        if (empty($sName)) {
            continue;
        }
preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
        $aCorresponding[] = [
            'name' => $sName,
            'email' => isset($email[2]) ? trim($email[2]) : '',
            'postal_address' => isset($address[2]) ? trim(trim($address[2]),'.') : '',
            'tel' => isset($tel[2]) ? trim($tel[2]) : ''
        ];
    }
    if (!empty($aCorresponding)) {
        return $aCorresponding;
    }
    // Fallback: "Corresponding Author(s): ..." / "Correspondence (to): ..." sentence.
    // The singular "Corresponding Author" is accepted as well.
    $pattern = '/(Corresponding Authors?|Correspondence to|Correspondence)\s*:\s*([\s\S]+?)(?=\n\s*\n|$|;)/is';
    $corrText = trim($corrText,'*');
    preg_match($pattern, $corrText, $match);
    if (empty($match[2])) {
        return $aCorresponding;
    }
    $corrContent = $match[2];
    // First try explicitly labelled pairs: "Name, E-mail: address".
    preg_match_all('/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/', $corrContent, $authors);
    $bLabelled = !empty($authors[1]);
    if (!$bLabelled) {
        // Then unlabelled pairs: "Name, address".
        preg_match_all('/([A-Za-z0-9\s]+?),\s*([\w@\.\-]+)(?=\.?)/', $corrContent, $authors);
    }
    foreach ($authors[1] as $i => $sRawName) {
        $sEmail = empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.');
        // The unlabelled pattern matches any "word, word" pair, so require an '@'
        // before treating the second part as an e-mail address.
        if (!$bLabelled && strpos($sEmail, '@') === false) {
            continue;
        }
        // Same keys as the primary path so callers always get one entry shape.
        $aCorresponding[] = [
            'name' => empty($sRawName) ? '' : trim(trim($sRawName),'.'),
            'email' => $sEmail,
            'postal_address' => '',
            'tel' => ''
        ];
    }
    return $aCorresponding;
}
/**
 * Find which corresponding-author name is mentioned in a text block.
 *
 * Matching is case-insensitive (Unicode-aware via mb_strtolower) and also
 * accepts the name with its space-separated parts in reverse order
 * ("Smith John" for "John Smith"). Empty names are skipped, since an empty
 * needle would otherwise match every block.
 *
 * @param string $block     Text block to search.
 * @param array  $corrNames Candidate author names.
 * @return string The first candidate found (as given in $corrNames), or '' when none matches.
 */
private function matchCorrespondingName($block, $corrNames)
{
    $blockLower = mb_strtolower((string)$block, 'UTF-8');
    foreach ($corrNames as $name) {
        $nameLower = mb_strtolower(trim((string)$name), 'UTF-8');
        if ($nameLower === '') {
            continue;
        }
        if (strpos($blockLower, $nameLower) !== false) {
            return $name;
        }
        $nameParts = explode(' ', $nameLower);
        if (count($nameParts) >= 2) {
            $reversedName = implode(' ', array_reverse($nameParts));
            if (strpos($blockLower, $reversedName) !== false) {
                return $name;
            }
        }
    }
    return '';
}
/**
 * Return the text of the first non-empty element that follows the element
 * containing $targetText (case-insensitive containment).
 *
 * An empty target returns '' immediately: with an empty needle stripos()
 * warns on PHP 7 and matches every paragraph on PHP 8.
 *
 * @param string $targetText Text to look for.
 * @return string Text of the following element, or '' when the target is empty,
 *                not found, or is the last non-empty element.
 */
private function getNextParagraphAfterText($targetText){
    $targetText = (string)$targetText;
    if (trim($targetText) === '' || empty($this->sections)) {
        return '';
    }
    $found = false;
    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $text = $this->getTextFromElement($element);
            if (empty($text)) {
                continue;
            }
            if ($found) {
                return $text;
            }
            if (stripos($text, $targetText) !== false) {
                $found = true;
            }
        }
    }
    return '';
}
/**
 * Collect the text of the elements that follow the element matching $targetText.
 *
 * Matching is fuzzy: both texts are lower-cased and reduced to ASCII letters
 * and digits, and an element matches when similar_text() covers more than
 * 50% of the reduced target. When the target contains no ASCII letters or
 * digits at all (e.g. a CJK-only line) the reduced target is empty, so a
 * case-insensitive containment test on the raw text is used instead.
 * Collection stops at the first element containing a stop keyword or after
 * 200 collected lines.
 *
 * @param string $targetText  Text marking the start position (exclusive).
 * @param int    $return_type 1 returns the list of lines; any other value returns
 *                            the lines joined with "\n".
 * @return array|string
 */
private function getContentAfterText($targetText,$return_type = 2){
    $found = false;
    $content = [];
    $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract','ABSTRACT'];
    $maxLines = 200;
    $lineNumber = 0;
    $shouldStop = false;
    // The target does not change during the scan, so normalise it once.
    $rawTarget = trim((string)$targetText);
    $cleanTarget = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($rawTarget));
    $targetLen = strlen($cleanTarget);
    $sections = empty($this->sections) ? [] : $this->sections;
    foreach ($sections as $section) {
        foreach ($section->getElements() as $element) {
            $lineNumber++;
            if (count($content) >= $maxLines) break;
            $text = $this->getTextFromElement($element,$lineNumber);
            $text = trim($text);
            if (empty($text)) continue;
            if (!$found) {
                if ($targetLen > 0) {
                    $cleanText = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($text));
                    // More than 50% of the reduced target must be covered.
                    $found = similar_text($cleanText, $cleanTarget) / $targetLen > 0.5;
                } elseif ($rawTarget !== '') {
                    $found = mb_stripos($text, $rawTarget, 0, 'UTF-8') !== false;
                }
                // The matching element itself is never part of the result.
                continue;
            }
            // Stop at the first element that contains a stop keyword.
            foreach ($stopKeywords as $kw) {
                if (stripos($text, $kw) !== false) {
                    $shouldStop = true;
                    break;
                }
            }
            if ($shouldStop) break;
            $content[] = $text;
        }
        if (count($content) >= $maxLines || $shouldStop) break;
    }
    if($return_type == 1){
        return $content;
    }
    $content = implode("\n", $content);
    if(!empty($content) && !mb_check_encoding($content, 'UTF-8')){
        $content = mb_convert_encoding($content, 'UTF-8', 'GBK');
    }
    return $content;
}
/**
 * Extract the plain text of a PhpWord element, recursing into containers.
 *
 * Handled element kinds: Title, ListItemRun (prefixed with a running number),
 * PreserveText (e-mail addresses are pulled out of HYPERLINK mailto fields),
 * Table, Cell, any element exposing getElements(), Text, Link, Field and Note.
 * Typographic quotes are replaced by ASCII quotes and the result is passed
 * through cleanText().
 * NOTE(review): the list counter $this->iNum is never reset in this method,
 * so list numbers keep growing across repeated passes over the document — confirm
 * that callers do not rely on stable numbers.
 *
 * @param \PhpOffice\PhpWord\Element\AbstractElement $element    Element to read.
 * @param int                                        $lineNumber Not used by this method.
 * @return string Cleaned text ('' when the element carries no text).
 */
private function getTextFromElement(\PhpOffice\PhpWord\Element\AbstractElement $element, int $lineNumber = 0){
    $text = '';
    // Typographic quote -> ASCII quote. Keys are written as escapes so that no
    // invisible or ambiguous character can hide in the source; an empty key
    // would make strtr() return false on PHP < 8.
    static $specialQuotesMap = [
        "\u{2019}" => "'", // right single quotation mark
        "\u{2018}" => "'", // left single quotation mark
        "\u{201C}" => '"', // left double quotation mark
        "\u{201D}" => '"', // right double quotation mark
        "\u{201E}" => '"', // double low-9 quotation mark
        "\u{201F}" => '"', // double high-reversed-9 quotation mark
    ];
    // Headings: getText() yields either a string or a TextRun.
    if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
        $titleContent = $element->getText();
        if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
            $titleText = $this->getTextFromElement($titleContent);
        } else {
            $titleText = strtr((string)$titleContent, $specialQuotesMap);
        }
        return $this->cleanText($titleText . ' ');
    }
    // List items: prefix with the running item number; the item's own text is
    // collected by the generic container branch below.
    if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
        $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
        $this->iNum++;
        $text .= $this->iNum . ' ';
    }
    // PreserveText: read the internal "text" parts via reflection.
    if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
        try {
            $reflection = new \ReflectionClass($element);
            if (!$reflection->hasProperty('text')) {
                return $this->cleanText($text);
            }
            $property = $reflection->getProperty('text');
            $property->setAccessible(true);
            $textParts = $property->getValue($element) ?? [];
        } catch (\ReflectionException $e) {
            return $this->cleanText($text);
        }
        foreach ($textParts as $part) {
            $part = (string)$part;
            if (strpos($part, 'HYPERLINK') !== false) {
                // Field instruction: keep only the mailto address, if any.
                $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
                if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
                    $text .= $match[1] . ' ';
                }
            } else {
                $text .= strtr($part, $specialQuotesMap);
            }
        }
        return $this->cleanText($text);
    }
    // Tables: cell texts separated by a space.
    if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
        foreach ($element->getRows() as $row) {
            foreach ($row->getCells() as $cell) {
                $text .= $this->getTextFromElement($cell) . ' ';
            }
        }
        return $this->cleanText($text);
    }
    // Cells: concatenate the text of the child elements.
    if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }
        return $this->cleanText($text);
    }
    // Generic containers.
    if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
        foreach ($element->getElements() as $child) {
            if ($child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
                $text .= $this->getTextFromElement($child);
            }
        }
    }
    // Plain text.
    if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
        $text .= strtr((string)$element->getText(), $specialQuotesMap);
    }
    // Links: a mailto target is emitted before the link text.
    if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
        $target = (string)$element->getTarget();
        if (strpos($target, 'mailto:') === 0) {
            $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
        }
        $text .= strtr((string)$element->getText(), $specialQuotesMap) . ' ';
    }
    // Fields and notes.
    // NOTE(review): PhpWord's Field element appears to expose getText() rather
    // than getContent(); the accessor is looked up first so that an undefined
    // method can never raise a fatal error. getContent() stays preferred when present.
    if ($element instanceof \PhpOffice\PhpWord\Element\Field
        || $element instanceof \PhpOffice\PhpWord\Element\Note) {
        $inner = '';
        if (method_exists($element, 'getContent')) {
            $inner = $element->getContent();
        } elseif (method_exists($element, 'getText')) {
            $inner = $element->getText();
        }
        if ($inner instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
            $text .= $this->getTextFromElement($inner) . ' ';
        } else {
            $text .= (string)$inner . ' ';
        }
    }
    return $this->cleanText($text);
}
/**
 * Normalise extracted text: ensure UTF-8, replace control characters and
 * special spaces with a plain space, and collapse whitespace runs.
 *
 * The text is not trimmed; a leading or trailing run of whitespace is kept
 * as a single space.
 *
 * @param string $text Text to clean.
 * @return string Cleaned text.
 */
private function cleanText(string $text){
    // Convert to UTF-8 when the input is not valid UTF-8.
    if (!mb_check_encoding($text, 'UTF-8')) {
        $converted = mb_convert_encoding(
            $text,
            'UTF-8',
            'GBK,GB2312,GB18030,Big5,ISO-8859-1,CP1252,UTF-16,UTF-32'
        );
        if (is_string($converted)) {
            $text = $converted;
        }
    }
    // Control characters (C0, DEL, C1) become spaces. preg_replace() returns
    // null on failure; keep the previous text in that case instead of losing it.
    $stripped = preg_replace('/[\x00-\x1F\x7F-\x9F]/u', ' ', $text);
    if ($stripped !== null) {
        $text = $stripped;
    }
    // Special spaces, written as escapes so they stay visible in the source.
    $text = str_replace([
        "\t", "\r", "\n",
        "\u{00A0}", // no-break space
        "\u{3000}", // ideographic (full-width) space
        "\u{202F}", // narrow no-break space
    ], ' ', $text);
    // Collapse whitespace runs.
    $collapsed = preg_replace('/\s+/u', ' ', $text);
    return $collapsed === null ? $text : $collapsed;
}
// private function getTextFromElement($element, $lineNumber = 0){
// // 初始化默认空字符串(保持原有逻辑)
// $text = '';
// // 1. 常量化特殊引号映射(避免重复创建数组,提升性能)
// static $specialQuotesMap = [
// '' => "'", // 右单引号U+2019→ 普通单引号U+0027
// '' => "'", // 左单引号U+2018→ 普通单引号U+0027
// '“' => '"', // 左双引号U+201C→ 普通双引号U+0022
// '”' => '"', // 右双引号U+201D→ 普通双引号U+0022
// '„' => '"', // 下双引号U+201E→ 普通双引号(兼容欧洲排版)
// '‟' => '"', // 右双引号U+201F→ 普通双引号(兼容少见排版)
// ];
// // 2. 提前校验元素合法性(避免后续 instanceof 无效判断,减少报错)
// if (!is_object($element) || !$element instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
// return $text;
// }
// // 支持H1标题格式逻辑不变优化变量命名可读性
// if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
// $titleContent = $element->getText();
// $titleText = '';
// // 关键修复:判断返回类型,递归提取文本(逻辑不变)
// if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
// $titleText = $this->getTextFromElement($titleContent);
// } else {
// $titleText = strtr((string)$titleContent, $specialQuotesMap);
// }
// $text .= $titleText . ' ';
// return $text;
// }
// // 项目编号(逻辑不变,优化空值判断为严格判断)
// if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
// $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
// $this->iNum++;
// $text .= $this->iNum . ' ';
// }
// // 处理PreserveText元素核心逻辑不变增强容错性
// if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
// try {
// $reflection = new \ReflectionClass($element);
// $property = $reflection->getProperty('text');
// $property->setAccessible(true);
// // 空值兜底,避免遍历非数组报错
// $textParts = $property->getValue($element) ?? [];
// } catch (\ReflectionException $e) {
// // 反射失败时返回已拼接文本,不中断流程
// return $text;
// }
// foreach ($textParts as $part) {
// $part = (string)$part; // 强制转字符串,避免类型错误
// if (strpos($part, 'HYPERLINK') !== false) {
// $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
// // 邮箱正则不变,保持原有匹配逻辑
// if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
// $text .= $match[1] . ' ';
// }
// } else {
// $text .= $part;
// }
// }
// return $text;
// }
// // 处理表格和单元格(逻辑不变,优化循环变量命名)
// if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
// foreach ($element->getRows() as $row) {
// foreach ($row->getCells() as $cell) {
// $text .= $this->getTextFromElement($cell);
// }
// }
// return $text;
// }
// if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
// foreach ($element->getElements() as $child) {
// $text .= $this->getTextFromElement($child);
// }
// return $text;
// }
// // 处理嵌套元素(逻辑不变,增强方法存在性校验)
// if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
// foreach ($element->getElements() as $child) {
// // 双重校验,避免非元素对象传入
// if (is_object($child) && $child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
// $textPart = $this->getTextFromElement($child);
// $text .= $textPart;
// }
// }
// }
// // 处理文本元素(逻辑不变,保持特殊引号替换)
// if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
// $textPart = (string)$element->getText(); // 强制转字符串,避免空值
// $textPart = strtr($textPart, $specialQuotesMap);
// $text .= $textPart;
// }
// // 处理超链接(逻辑不变,优化变量类型转换)
// if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
// $target = (string)$element->getTarget();
// if (strpos($target, 'mailto:') === 0) {
// $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
// }
// $linkText = strtr((string)$element->getText(), $specialQuotesMap);
// $text .= $linkText . ' ';
// }
// // 处理字段和注释(逻辑不变,增加类型转换,避免非字符串拼接)
// if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
// $text .= (string)$element->getContent() . ' ';
// }
// if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
// $text .= (string)$element->getContent() . ' ';
// }
// // 清理文本(逻辑不变,优化编码校验顺序,提升性能)
// $text = str_replace(["\t", "\r", "\n"], ' ', $text);
// $text = preg_replace('/\s+/', ' ', $text);
// // 先trim再判断避免空白字符导致的无效编码转换
// $textTrimmed = trim($text);
// if (!empty($textTrimmed) && !mb_check_encoding($textTrimmed, 'UTF-8')) {
// $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
// }
// return $text;
// }
/**
 * Cut the abstract and keyword sections out of a marker-delimited text.
 *
 * The text is searched for, in this order: the abstract marker
 * (case-insensitive), the keywords marker after it (case-insensitive; when
 * the default "keywords" marker is in use the spelling "key words" is
 * accepted too) and the end marker after that (case-sensitive).
 *
 * @param string $str     Text to search.
 * @param array  $markers Optional overrides for 'abstract', 'keywords' and 'end_span'.
 * @return array ['abstract_to_keywords' => text between the abstract and keywords markers,
 *                'keywords_to_end'      => text between the keywords marker and the end marker,
 *                'positions'            => byte offsets of the three markers (-1 = not found),
 *                'is_valid'             => true when all three markers were found in order,
 *                'error'                => message naming the first missing marker]
 */
function extractContentIntervals($str, $markers = []) {
    // 1. Resolve the markers explicitly (no extract(), so caller-supplied keys
    //    can never overwrite local variables).
    $markers = array_merge([
        'abstract' => 'abstract',
        'keywords' => 'keywords',
        'end_span' => '===========end-span'
    ], $markers);
    $abstract = (string)$markers['abstract'];
    $keywords = (string)$markers['keywords'];
    $end_span = (string)$markers['end_span'];
    $str = (string)$str;
    // 2. Result skeleton.
    $result = [
        'abstract_to_keywords' => '',
        'keywords_to_end' => '',
        'positions' => [
            'abstract' => -1,
            'keywords' => -1,
            'end_span' => -1
        ],
        'is_valid' => false,
        'error' => ''
    ];
    // 3. Abstract marker.
    $absPos = stripos($str, $abstract);
    if ($absPos === false) {
        $result['error'] = "未找到标记: {$abstract}";
        return $result;
    }
    $result['positions']['abstract'] = $absPos;
    $absEndPos = $absPos + strlen($abstract);
    // 4. Keywords marker, searched after the abstract marker.
    $keyPos = stripos($str, $keywords, $absEndPos);
    $keyLen = strlen($keywords);
    if ($keyPos === false && strcasecmp($keywords, 'keywords') === 0) {
        // Documents frequently spell the heading "Key words".
        $keyPos = stripos($str, 'key words', $absEndPos);
        $keyLen = strlen('key words');
    }
    if ($keyPos === false) {
        $result['error'] = "未找到 {$keywords} 或在 {$abstract} 之前";
        return $result;
    }
    $result['positions']['keywords'] = $keyPos;
    $keyEndPos = $keyPos + $keyLen;
    // 5. End marker, searched after the keywords marker.
    $endPos = strpos($str, $end_span, $keyEndPos);
    if ($endPos === false) {
        $result['error'] = "未找到 {$end_span} 或在 {$keywords} 之前";
        return $result;
    }
    $result['positions']['end_span'] = $endPos;
    // 6. Strip the punctuation that directly follows a marker ("Abstract: ...").
    //    This is done per character with a /u regex: ltrim() with a multibyte
    //    em dash in its mask strips single bytes and could cut a leading
    //    multibyte character (e.g. a curly quote) in half.
    $stripLead = static function ($part) {
        $part = trim($part);
        $cleaned = preg_replace('/^[:\s\-\x{2014}]+/u', '', $part);
        if ($cleaned === null) {
            // Not valid UTF-8: fall back to the ASCII-only part of the mask.
            $cleaned = ltrim($part, ': -');
        }
        return trim($cleaned);
    };
    $result['abstract_to_keywords'] = $stripLead(substr($str, $absEndPos, $keyPos - $absEndPos));
    $result['keywords_to_end'] = $stripLead(substr($str, $keyEndPos, $endPos - $keyEndPos));
    // 7. All markers found in order.
    $result['is_valid'] = true;
    return $result;
}
/**
 * Extract the abstract, the keywords and the funding statement from the document.
 *
 * The text of every non-empty element is concatenated with an end marker
 * after each element; extractContentIntervals() then cuts out the abstract
 * and keywords. The funding text starts after the first phrase reported by
 * getMatchedFundPhrases() and, when "Peer review" follows, ends before it.
 * NOTE(review): the result key is spelled 'abstrart'; it is kept as is
 * because callers presumably read that key — confirm before renaming.
 *
 * @return array ['status' => 1, 'msg' => ..., 'data' => ['abstrart', 'keywords', 'fund']]
 */
public function extractFromWord() {
    $sEndSpan = '===========end-span';
    $sContent = '';
    $sFundContent = '';
    $aSections = empty($this->sections) ? [] : $this->sections;
    foreach ($aSections as $section) {
        foreach ($section->getElements() as $element) {
            $textContent = $this->getTextFromElement($element);
            if(empty($textContent)){
                continue;
            }
            if(!mb_check_encoding($textContent, 'UTF-8')){
                $textContent = mb_convert_encoding($textContent, 'UTF-8', 'GBK');
            }
            $sContent .= $textContent . $sEndSpan;
            // Look for the funding phrase only after the current element has been
            // appended; checking before the append skipped the last element.
            if(empty($sFundContent)){
                $aFund = $this->getMatchedFundPhrases($sContent);
                if(!empty($aFund[0])){
                    $position = stripos($sContent, $aFund[0]);
                    if ($position !== false) {
                        $sFundContent = substr($sContent, $position);
                        $sFundContent = trim(str_ireplace($aFund[0], '', $sFundContent));
                        if (preg_match('/^(.*?)Peer review/', $sFundContent, $matches)) {
                            $sFundContent = $matches[1]; // keep only the part before "Peer review"
                        }
                    }
                }
            }
        }
    }
    if(!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')){
        $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
    }
    // Pass the end marker explicitly so both methods always use the same one.
    $result = $this->extractContentIntervals($sContent, ['end_span' => $sEndSpan]);
    $abstract = empty($result['abstract_to_keywords']) ? '' : $result['abstract_to_keywords'];
    $keywords = empty($result['keywords_to_end']) ? '' : $result['keywords_to_end'];
    // Final clean-up shared by the three fields: ensure UTF-8, remove the
    // element markers, decode entities.
    $finish = function ($sValue) use ($sEndSpan) {
        if (empty($sValue)) {
            return '';
        }
        if (!mb_check_encoding($sValue, 'UTF-8')) {
            $sValue = mb_convert_encoding($sValue, 'UTF-8', 'GBK');
        }
        if (empty($sValue)) {
            return '';
        }
        return $this->fullDecode(str_replace($sEndSpan, '', $sValue));
    };
    return [
        'status' => 1,
        'msg' => '提取成功',
        'data' => [
            'abstrart' => $finish($abstract),
            'keywords' => $finish($keywords),
            'fund' => $finish($sFundContent)
        ]
    ];
}
/**
 * Normalise text extracted from a Word document.
 *
 * Decodes \uXXXX escapes and HTML entities, converts non-UTF-8 input to UTF-8,
 * unifies special space characters to a plain space and repairs the known
 * garbled renderings of the ≤, ≥ and ≠ symbols. The decoding pass is repeated
 * while it still changes the text, up to $maxDepth passes.
 *
 * @param string|null $str      Text to decode; null yields an empty string.
 * @param int         $maxDepth Maximum number of decoding passes; a value <= 0 skips decoding.
 * @return string Decoded text with leading/trailing ':' removed. Blank input (or a
 *                non-positive depth) is returned whitespace-trimmed without decoding.
 */
private function fullDecode(?string $str, int $maxDepth = 2){
    // Blank input or a non-positive depth: nothing to decode.
    if ($str === null || trim($str) === '' || $maxDepth <= 0) {
        return $str === null ? '' : trim($str);
    }
    // Decode \uXXXX escapes first so they cannot interfere with the patterns below.
    $str = $this->decodeUnicode($str);

    // ========== Patterns ==========
    $regexps = [
        // Business-specific garbled forms: "0B?0", "0B ?0", "DL?.18", ...
        'ob0' => '/0B\s*\\?0/',
        'dl18' => '/DL\s*\\?\.18/',
        // Generic garbled ≥: a literal question mark directly before a number.
        'qMarkNum' => '/\\?(\d+)/',
        'qMarkDotNum' => '/\\?(\.\d+)/',
        // Whitespace between ≠ / ≤ and the following number.
        'neNum' => '/≠\s*(\d+)/u',
        'leNum' => '/≤\s*(\d+)/u',
        // Three garbled symbols in a row: "?、?、?123" or "?,?,?123".
        'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
        // "LE?123" / "NE?456" markers. The word boundary keeps the rule from firing
        // inside ordinary words such as "table?5" or "one?5".
        'leNeMark' => '/\b(LE|NE)\s*\\?(\d+)/i',
        // \uXXXX and \UXXXX escapes.
        'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/',
        // Textual hex dumps of the garbled symbols: "e28986", "\xe2\x89\x86",
        // "\xe2 0x89 0x86" (and the ...87 / ...80 variants). Only the textual forms
        // are matched; the real characters encoded by those bytes are left alone.
        'wordBin' => '/e289(?:86|87|80)|\\\\xe2\s*(?:\\\\x|0x)89\s*(?:\\\\x|0x)(?:86|87|80)/i',
        // Malformed numeric entities: optional "x", optional spaces, optional ";".
        'wordEntity' => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i',
        // Invisible ASCII control characters (this includes tab, CR and LF).
        'controlChar' => '/[\x00-\x1F\x7F]/u',
        // Runs of the same comparison symbol.
        'repeatSymbol' => '/(≤|≥|≠)\1+/u',
        // Raw two-byte sequences treated as GBK-encoded symbols.
        // NOTE(review): matched byte-wise on text that is UTF-8 by this point - confirm
        // these sequences can really survive the encoding correction step.
        'gbkSymbol' => '/\xA1\xF2|\xA1\xF3|\xA1\xF0/'
    ];

    // ========== Replacement maps ==========
    $maps = [
        // Entity → symbol. strtr() prefers the longest key, so the ";" variants win
        // over the semicolon-less ones and no stray ";" is left behind.
        'htmlEntity' => [
            '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤', '&#X2264;' => '≤',
            '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥', '&#X2265;' => '≥',
            '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠', '&#X2260;' => '≠',
            '&le' => '≤', '&ge' => '≥', '&ne' => '≠',
            '&#2264;' => '≤', '&#2265;' => '≥', '&#2260;' => '≠',
            '&#2264' => '≤', '&#2265' => '≥', '&#2260' => '≠',
            '&#x2264' => '≤', '&#x2265' => '≥', '&#x2260' => '≠',
            '&#60;' => '≤', '&#62;' => '≥', // business-specific mapping, kept as is
        ],
        // Special spaces, written as code point escapes. chr() only produces a single
        // byte (its argument is reduced to 0-255), so chr(0x202F) was "/" and every
        // slash in the text was turned into a space. A bare 0xA0 byte is deliberately
        // not listed: it is a continuation byte of many UTF-8 characters (including
        // "≠", E2 89 A0) and replacing it corrupts them.
        'nbsp' => [
            "\u{00A0}", // no-break space
            "\u{3000}", // ideographic (full-width) space
            "\u{2002}", // en space
            "\u{2003}", // em space
            "\u{2004}", // three-per-em space
            "\u{2005}", // four-per-em space
            "\u{202F}", // narrow no-break space
        ],
        // Last hex byte of a textual hex dump → symbol.
        'wordBin' => [
            '86' => '≤',
            '87' => '≥',
            '80' => '≠',
        ],
        // Number found in a malformed numeric entity → symbol.
        'wordEntity' => [
            '2264' => '≤',
            '2265' => '≥',
            '2260' => '≠',
        ],
        // Raw byte pairs → symbol (double quotes so the escapes become real bytes).
        'gbkSymbol' => [
            "\xA1\xF2" => '≤',
            "\xA1\xF3" => '≥',
            "\xA1\xF0" => '≠',
        ],
    ];

    // preg_replace*() return null on a PCRE failure (for example invalid UTF-8 with
    // the /u modifier); in that case the subject is kept unchanged.
    $safeReplace = static function (string $pattern, $replacement, string $subject): string {
        $result = $replacement instanceof \Closure
            ? preg_replace_callback($pattern, $replacement, $subject)
            : preg_replace($pattern, $replacement, $subject);
        return $result === null ? $subject : $result;
    };
    // Escapes below U+0020 and code points that cannot be encoded (surrogates, for
    // which mb_chr() returns false) are left as written instead of being dropped.
    $unicodeCallback = static function (array $m): string {
        $code = (int)hexdec($m[1]);
        if ($code < 0x20) {
            return $m[0];
        }
        $char = mb_chr($code, 'UTF-8');
        return $char === false ? $m[0] : $char;
    };
    // The last two hex digits identify the symbol whatever notation was used.
    $wordBinCallback = static function (array $m) use ($maps): string {
        return $maps['wordBin'][strtolower(substr($m[0], -2))] ?? $m[0];
    };
    $wordEntityCallback = static function (array $m) use ($maps): string {
        return $maps['wordEntity'][$m[1]] ?? $m[0];
    };
    $leNeCallback = static function (array $m): string {
        return (strtoupper($m[1]) === 'LE' ? '≤' : '≠') . $m[2];
    };

    $depth = 0;
    $currentStr = $str;
    // Repeat the pass while it still changes the text and the depth limit allows.
    do {
        $depth++;
        $prevStr = $currentStr;

        // ----- Pre-processing -----
        $currentStr = $safeReplace($regexps['controlChar'], '', $currentStr);
        if (!mb_check_encoding($currentStr, 'UTF-8')) {
            $converted = mb_convert_encoding(
                $currentStr,
                'UTF-8',
                'GBK,GB2312,ISO-8859-1,CP1252'
            );
            // Keep the original text when the conversion did not produce valid UTF-8.
            $currentStr = mb_check_encoding($converted, 'UTF-8') ? $converted : $currentStr;
        }

        // ----- Generic decoding -----
        $currentStr = $safeReplace($regexps['unicode'], $unicodeCallback, $currentStr);
        // Explicit entity map first, then whatever standard entities remain.
        $currentStr = strtr($currentStr, $maps['htmlEntity']);
        $currentStr = html_entity_decode(
            $currentStr,
            ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
            'UTF-8'
        );
        $currentStr = str_replace($maps['nbsp'], ' ', $currentStr);

        // ----- Word-specific symbol repairs -----
        // Only the matched hex dump is rewritten; the surrounding text (and its
        // spaces) is left untouched.
        $currentStr = $safeReplace($regexps['wordBin'], $wordBinCallback, $currentStr);
        $currentStr = $safeReplace($regexps['wordEntity'], $wordEntityCallback, $currentStr);
        $currentStr = strtr($currentStr, $maps['gbkSymbol']);
        $currentStr = $safeReplace($regexps['repeatSymbol'], '$1', $currentStr);

        // ----- Business-specific repairs: specific patterns before the generic one -----
        // "0B?0" → "0B≥30", "DL?.18" → "DL≥0.18"
        $currentStr = $safeReplace($regexps['ob0'], '0B≥30', $currentStr);
        $currentStr = $safeReplace($regexps['dl18'], 'DL≥0.18', $currentStr);
        // Drop whitespace between ≠ / ≤ and the number that follows.
        $currentStr = $safeReplace($regexps['neNum'], '≠$1', $currentStr);
        $currentStr = $safeReplace($regexps['leNum'], '≤$1', $currentStr);
        // "?、?、?123" → "≤≥≠123" and "LE?123" → "≤123" / "NE?456" → "≠456".
        // These must run before the generic "?digits" rule, which would otherwise
        // consume the question mark and make them unreachable.
        $currentStr = $safeReplace($regexps['mixSymbol'], '≤≥≠$1', $currentStr);
        $currentStr = $safeReplace($regexps['leNeMark'], $leNeCallback, $currentStr);
        // Generic rule: "?30" → "≥30", "?.18" → "≥0.18".
        $currentStr = $safeReplace($regexps['qMarkNum'], '≥$1', $currentStr);
        $currentStr = $safeReplace($regexps['qMarkDotNum'], '≥0$1', $currentStr);
    } while ($depth < $maxDepth && $currentStr !== $prevStr);

    // Final clean-up: strip leading/trailing colons and map any entity still present.
    $currentStr = trim($currentStr, ':');
    $currentStr = strtr($currentStr, $maps['htmlEntity']);
    return $currentStr;
}
// private function fullDecode($str, $maxDepth = 5) {
// // 空值/深度为0直接返回提前终止避免无效操作
// if (empty($str) || $maxDepth <= 0) {
// return $str;
// }
// // 【性能优化1预编译所有正则表达式】避免每次循环重新解析正则
// // 预编译:≥专属场景正则
// $regOb0 = '/0B\s*\?0/';
// $regDl18 = '/DL\s*\?.18/';
// // 预编译:≥通用场景正则
// $regQMarkNum = '/\?(\d+)/';
// $regQMarkDotNum = '/\?(\.\d+)/';
// // 预编译:≤、≠空格修复正则
// $regNeNum = '/≠\s*(\d+)/';
// $regLeNum = '/≤\s*(\d+)/';
// // 预编译:混合符号乱码正则(中文顿号/英文逗号)
// $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/';
// $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/';
// // 预编译:≤、≠专属标识正则
// $regLeMark = '/LE\s*\?(\d+)/';
// $regNeMark = '/NE\s*\?(\d+)/';
// // 预编译Unicode转义正则提取到外部避免闭包重复创建
// $regUnicode = '/\\\\u([0-9a-fA-F]{4})/';
// // 【性能优化2预定义常量/映射】避免循环内重复创建数组/字符串
// // HTML实体映射一次性定义避免循环内重复赋值
// $htmlEntityMap = [
// '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤',
// '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥',
// '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠',
// ];
// // 不间断空格替换数组
// $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)];
// // Unicode回调函数预定义避免循环内重复创建闭包
// $unicodeCallback = function ($m) {
// return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
// };
// $original = $str;
// $depth = 0;
// $hasChange = false; // 标记是否有变化,提前终止循环
// // 循环解码:仅在有变化且未达最大深度时执行
// do {
// $depth++;
// $hasChange = false;
// $prevStr = $str; // 保存当前状态,用于判断变化
// // 1. 解码Unicode转义\uXXXX格式
// $str = $this->decodeUnicode($str);
// // 2. 解码HTML实体先替换专属实体再执行通用解码
// $str = strtr($str, $htmlEntityMap); // 高性能替换strtr比str_replace快
// $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
// // 3. 再次处理遗漏的Unicode转义使用预编译正则+预定义回调)
// $str = preg_replace_callback($regUnicode, $unicodeCallback, $str);
// // 4. 替换不间断空格为普通空格strtr比str_replace更高效
// $str = str_replace($nbspReplace, ' ', $str);
// // 5. 核心替换逻辑(优化执行顺序,避免覆盖)
// // 5.1 原有≥专属场景(保留)
// $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1);
// $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2);
// // 5.2 ≤、≠空格修复(保留)
// $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3);
// $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4);
// // 5.3 原有≥通用场景(保留)
// $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5);
// $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6);
// // 5.4 混合符号乱码还原(保留)
// $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7);
// $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8);
// // 5.5 ≤、≠专属标识还原(保留)
// $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9);
// $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10);
// // 5.6 修复前缀"d with "乱码(保留)
// $str = str_replace('d with ', 'd with ', $str, $count11);
// // 【性能优化3统计所有替换次数判断是否有变化】
// $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 +
// $count7 + $count8 + $count9 + $count10 + $count11;
// if ($totalCount > 0 || $str !== $prevStr) {
// $hasChange = true;
// $original = $str;
// }
// // 【性能优化4提前终止】单次循环无变化直接退出
// if (!$hasChange) {
// break;
// }
// } while ($depth < $maxDepth); // 改用do-while减少循环判断次数
// // 最终清理仅执行一次trim
// return trim($str, ':');
// }
// private function fullDecode($str, $maxDepth = 5) {
// if (empty($str) || $maxDepth <= 0) {
// return $str;
// }
// $original = $str;
// $depth = 0;
// // 循环解码,直到无变化或达到最大次数
// while (true) {
// $depth++;
// if ($depth > $maxDepth) {
// break; // 防止过度解码导致死循环
// }
// // 1. 解码 Unicode 转义(\uXXXX 格式)
// $str = $this->decodeUnicode($str);
// // 2. 解码 HTML 实体(&amp;、&#039;、&lt; 等)
// $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
// $str = preg_replace_callback('/\\\\u([0-9a-fA-F]{4})/', function ($m) {
// return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
// }, $str);
// $str = str_replace([chr(0xC2).chr(0xA0), chr(0xA0)], ' ', $str);
// // 2. 核心:强制匹配所有可能的乱码格式,还原≥
// // 匹配0B?0、0B ?0、0B ?0空格/制表符)→ 0B≥30
// $str = preg_replace('/0B\s*\?0/', '0B≥30', $str);
// // 匹配DL?.18、DL ?.18、DL ?.18 → DL≥0.18
// $str = preg_replace('/DL\s*\?.18/', 'DL≥0.18', $str);
// // 通用匹配:数字前的?(如?30、?0.18)→ ≥30、≥0.18(防止其他变体)
// $str = preg_replace('/\?(\d+)/', '≥$1', $str);
// $str = preg_replace('/\?(\.\d+)/', '≥0$1', $str);
// // 3. 修复前缀的"d with "可能的乱码(若有)
// $str = str_replace('d with ', 'd with ', $str); // 若前缀也乱码,可同步替换
// // 若解码后无变化,退出循环
// if ($str === $original) {
// break;
// }
// $original = $str;
// }
// return trim($str,':');
// }
/**
 * Decode JSON-style \uXXXX escapes into UTF-8 characters.
 *
 * A high surrogate escape immediately followed by a low surrogate escape is decoded
 * as one UTF-16 pair, so characters outside the Basic Multilingual Plane are restored
 * instead of being mangled. Every other escape is decoded on its own.
 *
 * @param string $str Text that may contain \uXXXX escapes.
 * @return string|null Text with the escapes decoded (null if the PCRE call fails).
 */
private function decodeUnicode($str) {
    // First alternative: surrogate pair (groups 1 and 2). Second: single unit (group 3).
    $pattern = '/\\\\u([dD][89abAB][0-9a-fA-F]{2})\\\\u([dD][c-fC-F][0-9a-fA-F]{2})'
        . '|\\\\u([0-9a-fA-F]{4})/';
    return preg_replace_callback(
        $pattern,
        static function ($matches) {
            // Group 3 is only present when the single-unit alternative matched.
            if (isset($matches[3])) {
                return mb_convert_encoding(pack('H*', $matches[3]), 'UTF-8', 'UCS-2BE');
            }
            // Both halves of the pair form one UTF-16BE code point.
            return mb_convert_encoding(pack('H*', $matches[1] . $matches[2]), 'UTF-8', 'UTF-16BE');
        },
        $str
    );
}
/**
 * Find which funding lead-in phrases occur in the given text.
 *
 * Matching ignores case. Each phrase is reported once, spelled as in the phrase
 * list, in the order of its first occurrence in the text.
 *
 * @param string $content Text to scan.
 * @return array List of matched phrases; empty when nothing matches or $content is empty.
 */
private function getMatchedFundPhrases($content = '') {
    if (empty($content)) {
        return [];
    }
    // Known funding lead-in phrases.
    $fundPhrases = [
        'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
        'Funding was provided by', 'Funded in part by','FUNDING:'
    ];
    // Build one case-insensitive alternation, quoting each phrase for the "#" delimiter.
    $alternatives = [];
    foreach ($fundPhrases as $phrase) {
        $alternatives[] = preg_quote($phrase, '#');
    }
    $pattern = '#(' . implode('|', $alternatives) . ')#i';
    preg_match_all($pattern, $content, $hits);
    if (empty($hits[1])) {
        return [];
    }
    // Map every hit back to its canonical spelling, keeping first occurrences only.
    $found = [];
    foreach ($hits[1] as $hit) {
        foreach ($fundPhrases as $phrase) {
            if (strcasecmp($hit, $phrase) !== 0) {
                continue;
            }
            if (!in_array($phrase, $found, true)) {
                $found[] = $phrase;
            }
            break;
        }
    }
    return $found;
}
/**
 * Logging hook. Output is currently disabled, so calling it has no effect.
 *
 * @param string $msg Message that would be logged.
 */
private function log($msg){
    // Intentionally a no-op; the timestamped echo below is kept for ad-hoc debugging.
    // echo date('[Y-m-d H:i:s] ') . $msg . "\n";
}
}