Files
tougao/application/common/ArticleParserService.php
2025-12-01 09:16:50 +08:00

1147 lines
47 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use PhpOffice\PhpWord\IOFactory;
use think\Exception;
use ZipArchive;
use RecursiveIteratorIterator;
use RecursiveDirectoryIterator;
use PhpOffice\PhpWord\Settings;
use PhpOffice\PhpWord\Element\TextRun;
use DOMDocument;
use DOMXPath;
// use BadMethodCallException;
class ArticleParserService
{
private $phpWord;
private $sections;
/**
 * Load a Word document and cache its sections for the extraction methods.
 *
 * The document is read once with the default PhpWord reader. If that read
 * throws, a copy with every EMF image removed is built by removeEmfFromDocx()
 * and loaded instead; the temporary copy is deleted whether or not the second
 * load succeeds.
 *
 * A constructor cannot return a value, so a missing file is not reported here:
 * the instance is left with an empty section list.
 *
 * @param string $filePath Path of the document to parse.
 * @throws \Exception When the fallback (EMF-stripped) load fails as well.
 */
public function __construct($filePath = '')
{
    // Always start from a defined state so later foreach loops are safe.
    $this->sections = [];

    if (!file_exists($filePath)) {
        return;
    }

    try {
        $this->phpWord = $this->loadDocument($filePath);
    } catch (\Exception $e) {
        // Fallback: strip EMF images from the archive and try again.
        $processedFilePath = $this->removeEmfFromDocx($filePath);
        try {
            $this->phpWord = $this->loadDocument($processedFilePath);
        } finally {
            // The processed copy is only needed for the duration of the load.
            if (is_file($processedFilePath)) {
                unlink($processedFilePath);
            }
        }
    }

    $this->sections = $this->phpWord->getSections();
}

/**
 * Read one document with the reader settings this service relies on.
 *
 * @param string $path Document path.
 * @return mixed The object returned by the PhpWord reader's load().
 */
private function loadDocument($path)
{
    $reader = IOFactory::createReader();
    // Keep the full section structure instead of a data-only read.
    $reader->setReadDataOnly(false);
    Settings::setCompatibility(false);
    Settings::setOutputEscapingEnabled(true);

    return $reader->load($path);
}
/**
 * Build a copy of a DOCX archive with every *.emf file removed.
 *
 * The archive is unpacked into a unique directory under ROOT_PATH/runtime,
 * EMF files are deleted, and the remaining tree is re-zipped next to that
 * directory. The unpack directory is removed on success and on failure.
 *
 * @param string $docxPath Path of the original DOCX file.
 * @return string Path of the processed temporary DOCX (caller deletes it).
 * @throws \Exception When the archive cannot be opened, unpacked or rebuilt.
 */
private function removeEmfFromDocx($docxPath)
{
    $zip = new ZipArchive();
    if ($zip->open($docxPath) !== true) {
        throw new \Exception("无法打开 DOCX 文件:{$docxPath}");
    }

    // 1. Unique scratch directory for the unpacked archive.
    $tempDir = rtrim(ROOT_PATH, '/') . '/runtime/' . uniqid('docx_temp_');
    if (!mkdir($tempDir, 0700, true) && !is_dir($tempDir)) {
        $zip->close();
        throw new \Exception("无法创建临时目录:{$tempDir}");
    }

    $processedPath = $tempDir . '_processed.docx';

    try {
        // 2. Unpack the DOCX.
        $extracted = $zip->extractTo($tempDir);
        $zip->close();
        if ($extracted !== true) {
            throw new \Exception("无法解压 DOCX 文件:{$docxPath}");
        }

        // 3. Collect the EMF files first, then delete them, so the directory
        //    is not modified while it is being iterated.
        $emfFiles = [];
        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($tempDir, RecursiveDirectoryIterator::SKIP_DOTS)
        );
        foreach ($iterator as $file) {
            if ($file->isFile() && strtolower($file->getExtension()) === 'emf') {
                $emfFiles[] = $file->getPathname();
            }
        }
        foreach ($emfFiles as $emfFile) {
            unlink($emfFile);
        }

        // 4. Re-pack the remaining files as a DOCX.
        $newZip = new ZipArchive();
        if ($newZip->open($processedPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
            throw new \Exception("无法创建处理后的 DOCX 文件");
        }
        $this->addFilesToZip($tempDir, $newZip);
        // ZipArchive reads the added files when the archive is closed, so the
        // scratch directory must still exist at this point.
        if ($newZip->close() !== true) {
            throw new \Exception("无法创建处理后的 DOCX 文件");
        }
    } finally {
        // 5. Remove the scratch directory whatever happened above.
        $this->deleteDir($tempDir);
    }

    return $processedPath;
}
/**
 * Recursively add every file below a directory to a ZipArchive.
 *
 * Entry names are always relative to the directory of the outermost call, so
 * "<root>/word/media/a.png" is stored as "word/media/a.png". The previous
 * implementation derived the name from the parent of the directory currently
 * being scanned, which prefixed root-level files with the scratch directory
 * name and dropped parent folders of deeper files.
 *
 * @param string      $dir     Directory to scan.
 * @param ZipArchive  $zip     Open archive that receives the files.
 * @param string|null $baseDir Archive root; defaults to $dir on the first call.
 * @return void
 */
private function addFilesToZip($dir, $zip, $baseDir = null)
{
    $dir = rtrim($dir, '/\\');
    $baseDir = ($baseDir === null) ? $dir : rtrim($baseDir, '/\\');

    $entries = scandir($dir);
    if ($entries === false) {
        // Unreadable directory: nothing can be added from here.
        return;
    }

    foreach ($entries as $entry) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }
        $path = $dir . '/' . $entry;
        if (is_dir($path)) {
            $this->addFilesToZip($path, $zip, $baseDir);
            continue;
        }
        // Strip the archive root (plus its separator) to get the entry name.
        $relativePath = ltrim(substr($path, strlen($baseDir)), '/');
        $zip->addFile($path, $relativePath);
    }
}
/**
 * Recursively delete one of this service's scratch directories.
 *
 * The safety checks apply to the directory passed in: its name must start
 * with "docx_temp_" and its real path must lie strictly inside
 * ROOT_PATH/runtime. The contents are then removed by removeTree(). The
 * previous implementation re-applied the name-prefix check to every
 * sub-directory, so folders such as "word" were never deleted.
 *
 * @param string $dir Directory to delete.
 * @return bool True when the directory and everything in it was removed.
 */
private function deleteDir($dir)
{
    // 1. Must be a non-blank path of an existing directory.
    if (trim($dir) === '' || !is_dir($dir)) {
        return false;
    }

    // 2. Only directories created by this service (docx_temp_*) qualify.
    $dir = rtrim($dir, DIRECTORY_SEPARATOR);
    if (strpos(basename($dir), 'docx_temp_') !== 0) {
        return false;
    }

    // 3. The resolved path must be inside the runtime directory. The trailing
    //    separator prevents a sibling such as "runtime_old" from passing.
    $realDir = realpath($dir);
    $realRuntimeDir = realpath(rtrim(ROOT_PATH, '/') . '/runtime');
    if ($realDir === false || $realRuntimeDir === false) {
        return false;
    }
    if (strpos($realDir, $realRuntimeDir . DIRECTORY_SEPARATOR) !== 0) {
        return false;
    }

    return $this->removeTree($realDir);
}

/**
 * Delete a directory tree that deleteDir() has already validated.
 *
 * Symbolic links are removed as links and never followed.
 *
 * @param string $realDir Resolved directory path.
 * @return bool True when every entry and the directory itself were removed.
 */
private function removeTree($realDir)
{
    // Best effort, like the original: an unreadable directory is a failure,
    // not a warning.
    $entries = @scandir($realDir);
    if ($entries === false) {
        return false;
    }

    $allRemoved = true;
    foreach ($entries as $entry) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }
        $path = $realDir . DIRECTORY_SEPARATOR . $entry;
        if (is_dir($path) && !is_link($path)) {
            $removed = $this->removeTree($path);
        } else {
            $removed = @unlink($path);
        }
        if (!$removed) {
            $allRemoved = false;
        }
    }

    // rmdir only succeeds on an empty directory, so a partial clean-up
    // surfaces as false here.
    return @rmdir($realDir) && $allRemoved;
}
/**
 * Entry point: validate a document path, parse it and return the result as JSON.
 *
 * Status codes in the returned JSON:
 *   1 success ("data" holds title, author, report, company, corresponding and
 *     the keys produced by extractFromWord()),
 *   2 empty path, 3 file missing, 4 file unreadable, 5 no title found.
 *
 * @param string $sFileUrl Path of the uploaded document.
 * @return string JSON-encoded result.
 */
public static function uploadAndParse($sFileUrl)
{
    // Required-value check.
    if (empty($sFileUrl)) {
        return json_encode(['status' => 2, 'msg' => 'Please upload the submission file']);
    }
    // The file must exist and be readable.
    if (!file_exists($sFileUrl)) {
        return json_encode(['status' => 3, 'msg' => 'The uploaded file does not exist']);
    }
    if (!is_readable($sFileUrl)) {
        return json_encode(['status' => 4, 'msg' => 'The uploaded file is unreadable']);
    }

    // Parse the document.
    $oDealFile = new self($sFileUrl);

    // Title.
    $sTitle = $oDealFile->getTitle();
    if (empty($sTitle)) {
        return json_encode(['status' => 5, 'msg' => 'Article title retrieval failed']);
    }

    // Authors: the raw title is used to locate the paragraph that follows it.
    $aParam = ['title' => $sTitle];
    $aAuthor = $oDealFile->getAuthors($aParam);
    $aParam['author'] = empty($aAuthor['author']) ? [] : $aAuthor['author']; // all authors
    $aParam['report'] = empty($aAuthor['report']) ? [] : $aAuthor['report']; // corresponding-author names

    // Affiliations.
    $aParam['company'] = $oDealFile->getCompany($aParam);

    // Corresponding-author contact details.
    $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam);

    // Abstract, keywords and funding.
    $aContent = $oDealFile->extractFromWord();

    // Normalise the title that is actually returned. Previously the encoding
    // conversion was applied to a local copy that was never used again.
    $sFinalTitle = $aParam['title'];
    if (!mb_check_encoding($sFinalTitle, 'UTF-8')) {
        $sFinalTitle = mb_convert_encoding($sFinalTitle, 'UTF-8', 'GBK');
    }
    $aParam['title'] = $oDealFile->fullDecode($sFinalTitle);

    // Array union: keys already present in $aParam are not overwritten.
    $aParam += empty($aContent['data']) ? [] : $aContent['data'];

    return json_encode(['status' => 1, 'msg' => 'success', 'data' => $aParam]);
}
/**
 * Return the article title: the first element, in document order, whose
 * trimmed text is longer than three characters.
 *
 * @return string Trimmed title text, or '' when no element qualifies.
 */
private function getTitle()
{
    if (empty($this->sections)) {
        return '';
    }

    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $candidate = trim($this->getTextFromElement($element));
            // Very short fragments are not treated as a title.
            if (mb_strlen($candidate) > 3) {
                return $candidate;
            }
        }
    }

    return '';
}
/**
 * Parse an author line into a list of author records without using regular
 * expressions.
 *
 * Each returned record has the keys:
 *   name        - author name text,
 *   superscript - the raw marker string that followed the name,
 *   company_id  - list of integers found in the marker string,
 *   is_super    - 1 when the marker string contains '#', else 0,
 *   is_report   - 1 when the marker string contains '*' or '†', else 0.
 *
 * @param string $str Raw author line.
 * @return array List of author records; empty when $str is empty.
 */
private function parseAuthorsWithoutRegex($str = '') {
if (empty($str)) {
return [];
}
if(!mb_check_encoding($str, 'UTF-8')){
$str = mb_convert_encoding($str, 'UTF-8', 'GBK');
}
$str = $this->fullDecode($str);
// Map special spaces / mis-encoded separators to a space and special digit glyphs to ASCII digits.
// NOTE(review): several search entries show up as empty strings in this view of the file;
// they are presumably special characters that were lost in transit - confirm against the
// original file, because an empty search string replaces nothing.
$str = str_replace(["\xC2\xA0", 'ï¼', '�', '', '', '', '', '', '', '', '', '', '', ''],
[' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], $str);
// Treat the word "and" between names as a comma separator.
$str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str));
// Drop the space after a comma that sits between two superscript digits (e.g. "2, 3" -> "2,3").
$len = mb_strlen($str);
$processed = '';
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($str, $i, 1);
if ($char === ',' && $i - 1 >= 0 && $i + 2 < $len) {
$prevChar = mb_substr($str, $i - 1, 1);
$next1 = mb_substr($str, $i + 1, 1);
$next2 = mb_substr($str, $i + 2, 1);
// Digit on both sides of ", " means the comma belongs to a superscript list.
if ((ctype_digit($prevChar) || is_numeric($prevChar)) && $next1 === ' ' && (ctype_digit($next2) || is_numeric($next2))) {
$processed .= $char;
$i += 1;
continue;
}
}
$processed .= $char;
}
$str = $processed;
// Drop the space between a digit and a following marker symbol (e.g. "1 *" -> "1*").
$len = mb_strlen($str);
$processed = '';
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($str, $i, 1);
if ((ctype_digit($char) || is_numeric($char)) && $i + 2 < $len) { // current character is a digit
$next1 = mb_substr($str, $i + 1, 1);
$next2 = mb_substr($str, $i + 2, 1);
if ($next1 === ' ' && in_array($next2, ['#', '*', '†', '‡', '§'])) { // recognised marker symbols
$processed .= $char;
$i += 2;
$processed .= $next2;
continue;
}
}
$processed .= $char;
}
$str = $processed;
// Collapse runs of spaces into a single space.
$len = mb_strlen($str);
$processed = '';
$prevSpace = false;
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($str, $i, 1);
if ($char === ' ') {
if (!$prevSpace) {
$processed .= $char;
$prevSpace = true;
}
} else {
$processed .= $char;
$prevSpace = false;
}
}
$str = trim($processed);
// Split the line into authors with a small state machine:
// $inName is true while name characters are collected and false once the
// superscript (marker) part of the current author has started.
$authors = [];
$currentName = '';
$currentSuperscript = '';
$inName = true;
$len = mb_strlen($str);
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($str, $i, 1);
// Author separator: a comma followed by a space.
if ($char === ',' && $i + 1 < $len) {
$nextChar = mb_substr($str, $i + 1, 1);
if ($nextChar === ' ') {
if (!empty($currentName)) {
$currentSuperscript = rtrim($currentSuperscript, ',');
$authors[] = [
'name' => trim($currentName),
'superscript' => trim($currentSuperscript)
];
}
$currentName = '';
$currentSuperscript = '';
$inName = true;
$i++;
continue;
}
}
// Name characters: ASCII letters plus space, dot, hyphen and a fixed list of accented letters.
if (ctype_alpha($char) || in_array($char, [' ', '.', '-', 'à', 'á', 'â', 'ã', 'ä', 'ç', 'è', 'é', 'ê', 'ë'])) {
if ($inName) {
$currentName .= $char;
} else {
// A name character right after a superscript starts the next author
// even though no ", " separator was seen.
$currentSuperscript = rtrim($currentSuperscript, ',');
$authors[] = [
'name' => trim($currentName),
'superscript' => trim($currentSuperscript)
];
$currentName = $char;
$currentSuperscript = '';
$inName = true;
}
}
// Superscript characters: digits, comma and the marker symbols # * † ‡ §.
elseif ((ctype_digit($char) || is_numeric($char)) || in_array($char, ['#', '*', '†', ',', '‡', '§'])) {
$inName = false;
$currentSuperscript .= $char;
}
// Every other character is ignored.
else {
continue;
}
}
// Flush the last author.
if (!empty($currentName)) {
$currentSuperscript = rtrim($currentSuperscript, ',');
$authors[] = [
'name' => trim($currentName),
'superscript' => trim($currentSuperscript)
];
}
// Derive the affiliation id list and the '#' / '*' / '†' flags for every author.
foreach ($authors as $index => &$author) {
// Each run of consecutive digits in the superscript becomes one integer id.
$institutionIds = [];
$superscript = $author['superscript'];
$numStr = '';
for ($i = 0; $i < mb_strlen($superscript); $i++) {
$c = mb_substr($superscript, $i, 1);
if (ctype_digit($c) || is_numeric($c)) { // digit: extend the current number
$numStr .= $c;
} else {
if (!empty($numStr)) {
$institutionIds[] = (int)$numStr;
$numStr = '';
}
}
}
if (!empty($numStr)) {
$institutionIds[] = (int)$numStr;
}
$institutionIds = array_values(array_unique($institutionIds));
$author['company_id'] = $institutionIds;
// '#' sets is_super (first-author marker); '*' or '†' sets is_report (corresponding-author marker).
$author['is_super'] = strpos($superscript, '#') !== false ? 1 : 0;
$author['is_report'] = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
}
unset($author); // drop the foreach reference
return $authors;
}
/**
 * Extract the author list from the paragraph that follows the title.
 *
 * @param array $aParam Optional 'title'; when absent the title is detected
 *                      with getTitle().
 * @return array ['author' => list of author records as produced by
 *               parseAuthorsWithoutRegex(), 'report' => list of unique names
 *               of the authors flagged is_report].
 */
private function getAuthors($aParam = [])
{
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    if (empty($sAuthorContent)) {
        return ['author' => [], 'report' => []];
    }
    if (!mb_check_encoding($sAuthorContent, 'UTF-8')) {
        $sAuthorContent = mb_convert_encoding($sAuthorContent, 'UTF-8', 'GBK');
    }
    $sAuthorContent = $this->fullDecode($sAuthorContent);

    // Remove control characters and zero-width marks. preg_replace returns
    // null on a PCRE error (e.g. invalid UTF-8); keep the text in that case.
    $stripped = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
    if ($stripped !== null) {
        $sAuthorContent = $stripped;
    }

    // Repair known mis-encoded symbols.
    // NOTE(review): some pairs look like identity mappings in this view of
    // the file; the keys were presumably full-width characters - confirm.
    $symbolMap = [
        '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†',
        ':' => ':', ',' => ',', '—' => '-',
        '啊' => '' // drops this character, which the original author treated as a recurring artefact
    ];
    $sAuthorContent = strtr($sAuthorContent, $symbolMap);

    // Normalise separators, "and", and whitespace.
    $sAuthorContent = str_replace(['', ';', '', '、'], ',', $sAuthorContent);
    $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent);
    $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent);
    $sAuthorContent = trim($sAuthorContent);

    $aAuthor = $this->parseAuthorsWithoutRegex($sAuthorContent);
    if (empty($aAuthor)) {
        return ['author' => [], 'report' => []];
    }

    $aReport = $aAuthorData = [];
    foreach ($aAuthor as $value) {
        if (empty($value['name']) && empty($value['superscript'])) {
            continue;
        }
        if (!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1) {
            $aReport[] = $value['name'];
        }
        $aAuthorData[] = $value;
    }

    // array_unique keeps the original keys; re-index so the result is always
    // a list (and therefore a JSON array, never a JSON object).
    return ['author' => $aAuthorData, 'report' => array_values(array_unique($aReport))];
}
/**
 * Extract the affiliation list that follows the author paragraph.
 *
 * A line counts as an affiliation entry when it starts with a marker made of
 * one ASCII letter or digit followed by any number of digits. The marker is
 * the array key and the rest of the line (after '.', '*' and spaces) is the
 * affiliation text. Lines without such a marker are ignored.
 * NOTE(review): because a single leading letter is accepted as a marker, an
 * unnumbered line such as "Department ..." is keyed by its first letter -
 * confirm this is intended.
 *
 * @param array $aParam Optional 'title' and 'authors' (author paragraph text).
 * @return array Map of marker => cleaned affiliation text.
 */
private function getCompany($aParam = [])
{
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    // Author paragraph: supplied by the caller or taken from below the title.
    $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];
    // Every paragraph after the author line, up to the abstract/keywords.
    $allLines = $this->getContentAfterText($sAuthorContent, 1);
    if (empty($allLines)) {
        return [];
    }

    // 1. Pick the lines that start with a marker and key them by it.
    $grouped = [];
    foreach ($allLines as $line) {
        $line = trim($line);
        if (empty($line)) {
            continue;
        }
        if (!mb_check_encoding($line, 'UTF-8')) {
            $line = mb_convert_encoding($line, 'UTF-8', 'GBK');
        }
        $line = $this->fullDecode($line);

        // Marker, then any run of '.', '*' or spaces separating it from the text.
        if (!preg_match('/^([A-Za-z0-9][0-9]*)[.* ]*/', $line, $marker)) {
            continue;
        }
        $number = $marker[1];
        if (empty($number)) {
            // A lone "0" is not accepted as a marker (unchanged behaviour).
            continue;
        }
        $grouped[$number] = trim(substr($line, strlen($marker[0])));
    }

    // 2. Clean every affiliation text.
    $aCompany = [];
    foreach ($grouped as $number => $institution) {
        $institution = $this->fullDecode($institution);
        $institution = preg_replace('/\s+/', ' ', $institution);     // collapse whitespace
        $institution = rtrim($institution, '.');                      // drop a trailing full stop
        $institution = preg_replace('/^\d+\s+/', '', $institution);  // drop a leading number
        $institution = trim($institution);

        // Cut everything from the first '*', '#', '†' or "Email" onwards
        // (corresponding-author notes, e-mail addresses ...). The 'u' modifier
        // is essential: without it the class matched the single bytes of '†',
        // so any character containing byte E2/80/A0 truncated the name.
        if (preg_match('/^(.*?)(?=\s*[*#†]|\s*(?i:Email))/u', $institution, $matches)) {
            $institution = trim($matches[1]);
        }

        if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) {
            $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
        }
        $aCompany[$number] = $institution;
    }

    return $aCompany;
}
/**
 * Extract contact details (e-mail, postal address, telephone) of the
 * corresponding authors from the text that follows the affiliation paragraph.
 *
 * Primary strategy: split that text on '*' and, for every block that mentions
 * one of the names in $aParam['report'], read the labelled fields.
 * Fallback (only when the primary strategy finds nothing): take the text
 * after a "Corresponding Authors" / "Correspondence to" / "Correspondence:"
 * label and read "name, E-mail: address" pairs from it.
 *
 * @param array $aParam Needs 'report' (corresponding-author names); optional 'title'.
 * @return array List of ['name', 'email', 'postal_address', 'tel'] records.
 */
private function getCorrespondingAuthors($aParam = [])
{
    $aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
    if (empty($aCorrespondingAuthor)) {
        return [];
    }

    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    // Raw text of the paragraph right after the author line (the affiliations).
    $sCompany = $this->getNextParagraphAfterText($sAuthorContent);
    if (empty($sCompany)) {
        // Fallback anchor: the parsed affiliation names joined together.
        $aCompany = $this->getCompany($aParam);
        $sCompany = implode(' ', array_values($aCompany));
    }

    // Everything after the affiliation paragraph.
    $corrText = $this->getContentAfterText($sCompany);
    if (!mb_check_encoding($corrText, 'UTF-8')) {
        $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
    }
    $corrText = $this->fullDecode($corrText);

    // Normalise the text.
    // NOTE(review): the search strings below appear empty / identical to their
    // replacements in this view of the file; they were presumably full-width
    // ':' and '@' and a special space - confirm against the original file.
    $corrText = str_replace(['', ''], [':', '@'], $corrText);
    $corrText = preg_replace('/\s+/', ' ', $corrText);
    $corrText = str_replace(' ', ' ', $corrText);

    // Primary strategy: one block per '*'.
    $corrBlocks = preg_split('/\s*\*\s*/', $corrText);
    $corrBlocks = array_filter(array_map('trim', $corrBlocks));
    $aCorresponding = [];
    foreach ($corrBlocks as $block) {
        $sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
        if (empty($sName)) {
            continue;
        }
        preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
        preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
        preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
        $aCorresponding[] = [
            'name' => $sName,
            'email' => isset($email[2]) ? trim($email[2]) : '',
            'postal_address' => isset($address[2]) ? trim($address[2]) : '',
            'tel' => isset($tel[2]) ? trim($tel[2]) : ''
        ];
    }

    if (empty($aCorresponding)) {
        // The labels are grouped so the capture applies to all of them.
        // Previously the group belonged to the "Correspondence: " alternative
        // only, so the other two labels never produced a capture.
        $pattern = '/(?:Corresponding Authors|Correspondence to|Correspondence:)[\s:]*(.*?)(?=$|;)/s';
        $corrText = trim($corrText, '*');
        if (preg_match($pattern, $corrText, $match) && !empty($match[1])) {
            $corrContent = $match[1];
            // "Name, E-mail: address" pairs first ...
            preg_match_all('/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/', $corrContent, $authors);
            if (empty($authors[1])) {
                // ... otherwise bare "Name, address" pairs.
                preg_match_all('/([A-Za-z0-9\s]+?),\s*([\w@\.\-]+)(?=\.?)/', $corrContent, $authors);
            }
            foreach ($authors[1] as $i => $name) {
                $aCorresponding[] = [
                    'name' => empty($name) ? '' : trim($name),
                    'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i]),
                    // Same record shape as the primary strategy.
                    'postal_address' => '',
                    'tel' => ''
                ];
            }
        }
    }

    return $aCorresponding;
}
/**
 * Find which corresponding-author name is mentioned in a text block.
 *
 * The comparison is case-insensitive and Unicode-aware, and also tries the
 * name with its space-separated parts in reverse order ("Given Family" vs
 * "Family Given").
 *
 * @param string $block     Text block to search.
 * @param array  $corrNames Candidate names.
 * @return string The first candidate found in the block, or '' when none is.
 */
private function matchCorrespondingName($block, $corrNames)
{
    // mb_strtolower folds accented letters too; strtolower is byte-wise.
    $blockLower = mb_strtolower((string) $block, 'UTF-8');

    foreach ($corrNames as $name) {
        $nameLower = mb_strtolower(trim((string) $name), 'UTF-8');
        if ($nameLower === '') {
            // An empty needle would "match" every block.
            continue;
        }
        if (mb_strpos($blockLower, $nameLower, 0, 'UTF-8') !== false) {
            return $name;
        }

        $nameParts = explode(' ', $nameLower);
        if (count($nameParts) >= 2) {
            $reversedName = implode(' ', array_reverse($nameParts));
            if (mb_strpos($blockLower, $reversedName, 0, 'UTF-8') !== false) {
                return $name;
            }
        }
    }

    return '';
}
/**
 * Return the first non-empty element text that follows the element whose
 * text contains $targetText (case-insensitive).
 *
 * @param string $targetText Text to look for.
 * @return string Text of the following element, or '' when the target is
 *                empty, is not found, or is the last non-empty element.
 */
private function getNextParagraphAfterText($targetText)
{
    // An empty needle is rejected up front: PHP 8 treats it as matching at
    // offset 0 (so the first paragraph would always "match"), while PHP 7
    // raises a warning.
    if ($targetText === null || $targetText === '' || empty($this->sections)) {
        return '';
    }

    $found = false;
    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $text = $this->getTextFromElement($element);
            if (empty($text)) {
                continue;
            }
            if ($found) {
                return $text;
            }
            if (stripos($text, $targetText) !== false) {
                $found = true;
            }
        }
    }

    return '';
}
/**
 * Collect the element texts that follow a target paragraph.
 *
 * The target is located fuzzily: both strings are lower-cased and reduced to
 * ASCII letters and digits, and an element matches when similar_text() covers
 * more than half of the reduced target. Collection stops at the first element
 * containing a stop keyword, or after 200 collected lines.
 *
 * @param string $targetText  Paragraph text to search for.
 * @param int    $return_type 1 returns the lines as an array; any other value
 *                            returns them joined with "\n".
 * @return array|string Collected lines.
 */
private function getContentAfterText($targetText, $return_type = 2)
{
    $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract','ABSTRACT'];
    $maxLines = 200;

    $collected = [];
    $anchorSeen = false;
    $finished = false;
    $lineNumber = 0;
    $reducedTarget = null; // computed on first use

    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $lineNumber++;
            if (count($collected) >= $maxLines) {
                $finished = true;
                break;
            }

            $text = trim($this->getTextFromElement($element, $lineNumber));
            if (empty($text)) {
                continue;
            }

            if (!$anchorSeen) {
                // Still looking for the target paragraph.
                if ($reducedTarget === null) {
                    $reducedTarget = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($targetText));
                }
                $reducedText = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($text));
                $targetLength = strlen($reducedTarget);
                if ($targetLength > 0 && similar_text($reducedText, $reducedTarget) / $targetLength > 0.5) {
                    $anchorSeen = true;
                }
                continue;
            }

            // Past the target: stop at the first keyword, otherwise collect.
            foreach ($stopKeywords as $keyword) {
                if (stripos($text, $keyword) !== false) {
                    $finished = true;
                    break;
                }
            }
            if ($finished) {
                break;
            }
            $collected[] = $text;
        }

        if ($finished || count($collected) >= $maxLines) {
            break;
        }
    }

    if ($return_type == 1) {
        return $collected;
    }

    $joined = implode("\n", $collected);
    if (!empty($joined) && !mb_check_encoding($joined, 'UTF-8')) {
        $joined = mb_convert_encoding($joined, 'UTF-8', 'GBK');
    }

    return $joined;
}
/**
 * Extract the plain text of a PhpWord element, recursing into containers.
 *
 * Handled explicitly: PreserveText (mailto targets of HYPERLINK parts are
 * kept), Table / Cell, any element exposing getElements(), Text, Link, Field
 * and Note. Control characters are replaced by spaces and runs of whitespace
 * are collapsed.
 *
 * The control-character clean-up runs in UTF-8 mode. The earlier byte-wise
 * pattern also matched the bytes 0x80-0x9F, which are continuation bytes of
 * ordinary UTF-8 characters, and so corrupted them.
 *
 * @param object $element    PhpWord element.
 * @param int    $lineNumber Not used by this method; kept for callers.
 * @return string Extracted text.
 */
private function getTextFromElement($element, $lineNumber = 0)
{
    $text = '';

    // PreserveText keeps its parts in a non-public "text" property.
    if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
        $reflection = new \ReflectionClass($element);
        $property = $reflection->getProperty('text');
        $property->setAccessible(true);
        $textParts = $property->getValue($element);
        foreach ((array) $textParts as $part) {
            if (strpos($part, 'HYPERLINK') !== false) {
                // Decode HTML entities, then keep only the mailto address.
                $decoded = html_entity_decode($part);
                if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
                    $text .= $match[1] . ' ';
                }
            } else {
                $text .= $part;
            }
        }
        return $text;
    }

    // Tables and cells: concatenate the text of everything inside.
    if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
        foreach ($element->getRows() as $row) {
            foreach ($row->getCells() as $cell) {
                $text .= $this->getTextFromElement($cell);
            }
        }
        return $text;
    }
    if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }
        return $text;
    }

    // Generic containers.
    if (method_exists($element, 'getElements')) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }
    }

    // Plain text runs.
    if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
        $text .= $element->getText();
    }

    // Hyperlinks: keep a mailto target as well as the visible text.
    if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
        $target = $element->getTarget();
        if (strpos($target, 'mailto:') === 0) {
            $text .= str_replace('mailto:', '', $target) . ' ';
        }
        $text .= $element->getText() . ' ';
    }

    // Fields and notes. The accessor is looked up at run time so that an
    // element without getContent() no longer causes a fatal error.
    // NOTE(review): PhpWord's Field appears to expose getText() rather than
    // getContent() - confirm against the installed PhpWord version.
    if ($element instanceof \PhpOffice\PhpWord\Element\Field
        || $element instanceof \PhpOffice\PhpWord\Element\Note) {
        foreach (['getContent', 'getText'] as $getter) {
            if (!method_exists($element, $getter)) {
                continue;
            }
            $value = $element->$getter();
            if (is_object($value)) {
                $text .= $this->getTextFromElement($value) . ' ';
            } elseif ($value !== null) {
                $text .= $value . ' ';
            }
            break;
        }
    }

    // Make sure the text is valid UTF-8 before any UTF-8-mode regex runs.
    if ($text !== '' && !mb_check_encoding($text, 'UTF-8')) {
        $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
    }

    // Replace control characters (U+0000-001F, U+007F-009F) by spaces.
    $cleaned = preg_replace('/[\x00-\x1F\x7F-\x9F]/u', ' ', $text);
    if ($cleaned !== null) {
        $text = $cleaned;
    }
    $text = str_replace(["\t", "\r", "\n"], ' ', $text);
    $text = preg_replace('/\s+/', ' ', $text);

    return $text;
}
/**
 * Cut two intervals out of a string using three markers.
 *
 * Interval 1 lies between the first "abstract" marker and the next
 * "keywords" marker (both case-insensitive). Interval 2 lies between that
 * "keywords" marker and the next "end_span" marker (exact match). Leading
 * ':', ' ', '-' and '—' characters are removed from both intervals.
 *
 * @param string $str     Text to search.
 * @param array  $markers Optional overrides for the keys 'abstract',
 *                        'keywords' and 'end_span'; other keys are ignored.
 * @return array Keys: 'abstract_to_keywords', 'keywords_to_end', 'positions'
 *               (byte offset of each marker, -1 when not found), 'is_valid'
 *               and 'error'.
 */
public function extractContentIntervals($str, $markers = [])
{
    // 1. Markers. They are read by key: extract() would let an unexpected
    //    key such as 'str' overwrite the local variables of this method.
    $markers = array_merge([
        'abstract' => 'abstract',
        'keywords' => 'keywords',
        'end_span' => '===========end-span'
    ], $markers);
    $abstract = $markers['abstract'];
    $keywords = $markers['keywords'];
    $end_span = $markers['end_span'];

    // 2. Result skeleton.
    $result = [
        'abstract_to_keywords' => '',
        'keywords_to_end' => '',
        'positions' => [
            'abstract' => -1,
            'keywords' => -1,
            'end_span' => -1
        ],
        'is_valid' => false,
        'error' => ''
    ];

    // 3. "abstract" marker.
    $absPos = stripos($str, $abstract);
    if ($absPos === false) {
        $result['error'] = "未找到标记: {$abstract}";
        return $result;
    }
    $result['positions']['abstract'] = $absPos;
    $absEndPos = $absPos + strlen($abstract);

    // 4. "keywords" marker, searched after the abstract marker.
    $keyPos = stripos($str, $keywords, $absEndPos);
    if ($keyPos === false) {
        $result['error'] = "未找到 {$keywords} 或在 {$abstract} 之前";
        return $result;
    }
    $result['positions']['keywords'] = $keyPos;
    $keyEndPos = $keyPos + strlen($keywords);

    // 5. End marker, searched after the keywords marker.
    $endPos = strpos($str, $end_span, $keyEndPos);
    if ($endPos === false) {
        $result['error'] = "未找到 {$end_span} 或在 {$keywords} 之前";
        return $result;
    }
    $result['positions']['end_span'] = $endPos;

    // 6. Cut and clean both intervals.
    $result['abstract_to_keywords'] = $this->stripLeadingSeparators(
        substr($str, $absEndPos, $keyPos - $absEndPos)
    );
    $result['keywords_to_end'] = $this->stripLeadingSeparators(
        substr($str, $keyEndPos, $endPos - $keyEndPos)
    );

    // 7. All three markers were found in order.
    $result['is_valid'] = true;
    return $result;
}

/**
 * Trim a fragment and remove leading ':', ' ', '-' and '—' characters.
 *
 * Works on characters, not bytes: a byte-wise ltrim() with '—' in its list
 * also strips the first bytes of other characters sharing them (for example
 * an opening curly quote) and leaves invalid UTF-8 behind.
 *
 * @param string $fragment Raw interval text.
 * @return string Cleaned text.
 */
private function stripLeadingSeparators($fragment)
{
    $fragment = trim((string) $fragment);
    $stripped = preg_replace('/^[: \-—]+/u', '', $fragment);
    if ($stripped === null) {
        // Not valid UTF-8: fall back to the ASCII separators only.
        $stripped = ltrim($fragment, ': -');
    }
    return trim($stripped);
}
/**
 * Extract the abstract, the keywords and the funding statement.
 *
 * All element texts are concatenated with an "===========end-span" separator
 * after each one. extractContentIntervals() then cuts the abstract and
 * keywords out of that string, and the funding text is taken from the first
 * recognised funding phrase onwards.
 *
 * @return array ['status' => 1, 'msg' => ..., 'data' => ['abstrart' => ...,
 *               'keywords' => ..., 'fund' => ...]]. The key 'abstrart' is
 *               spelled that way on purpose: it is part of the existing
 *               output format.
 */
public function extractFromWord()
{
    $sContent = '';
    $sFundContent = '';
    $sections = empty($this->sections) ? [] : $this->sections;

    foreach ($sections as $section) {
        foreach ($section->getElements() as $element) {
            $textContent = $this->getTextFromElement($element);
            if (empty($textContent)) {
                continue;
            }
            if (!mb_check_encoding($textContent, 'UTF-8')) {
                $textContent = mb_convert_encoding($textContent, 'UTF-8', 'GBK');
            }
            // Look for funding text in what has been accumulated so far, i.e.
            // up to the end of the previous paragraph. The capture therefore
            // ends with the paragraph in which the phrase first appeared.
            if (empty($sFundContent)) {
                $sFundContent = $this->findFundText($sContent);
            }
            $sContent .= $textContent . "===========end-span";
        }
    }

    // The loop checks one paragraph behind, so the last paragraph has to be
    // checked here; otherwise a funding statement at the very end is missed.
    if (empty($sFundContent)) {
        $sFundContent = $this->findFundText($sContent);
    }

    if (!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')) {
        $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
    }

    $result = $this->extractContentIntervals($sContent);

    $abstract = empty($result['abstract_to_keywords']) ? '' : $result['abstract_to_keywords'];
    if (!empty($abstract) && !mb_check_encoding($abstract, 'UTF-8')) {
        $abstract = mb_convert_encoding($abstract, 'UTF-8', 'GBK');
    }
    $keywords = empty($result['keywords_to_end']) ? '' : $result['keywords_to_end'];
    if (!empty($keywords) && !mb_check_encoding($keywords, 'UTF-8')) {
        $keywords = mb_convert_encoding($keywords, 'UTF-8', 'GBK');
    }
    if (!empty($sFundContent) && !mb_check_encoding($sFundContent, 'UTF-8')) {
        $sFundContent = mb_convert_encoding($sFundContent, 'UTF-8', 'GBK');
    }

    return [
        'status' => 1,
        'msg' => '提取成功',
        'data' => [
            'abstrart' => empty($abstract) ? '' : $this->fullDecode(str_replace('===========end-span', '', $abstract)),
            'keywords' => empty($keywords) ? '' : $this->fullDecode(str_replace('===========end-span', '', $keywords)),
            'fund' => empty($sFundContent) ? '' : $this->fullDecode(str_replace('===========end-span', '', $sFundContent))
        ]
    ];
}

/**
 * Return the text that follows the first recognised funding phrase.
 *
 * The phrase itself is removed, and anything from "Peer review" onwards is
 * cut off.
 *
 * @param string $content Accumulated document text.
 * @return string Funding text, or '' when no funding phrase is present.
 */
private function findFundText($content)
{
    $aFund = $this->getMatchedFundPhrases($content);
    if (empty($aFund[0])) {
        return '';
    }
    $position = stripos($content, $aFund[0]);
    $fund = trim(str_ireplace($aFund[0], '', substr($content, $position)));
    if (preg_match('/^(.*?)Peer review/', $fund, $matches)) {
        $fund = $matches[1];
    }
    return $fund;
}
/**
 * Repeatedly decode a string until it stops changing or $maxDepth passes
 * have run.
 *
 * Each pass decodes \uXXXX escapes and HTML entities, turns non-breaking
 * spaces into ordinary spaces, and applies a fixed list of repairs that turn
 * "?" placeholders in front of numbers back into ≥ / ≤ / ≠. Finally ':' is
 * trimmed from both ends.
 *
 * The non-breaking-space step only replaces the UTF-8 sequence C2 A0. The
 * bare byte A0 is replaced only when the string is not valid UTF-8, because
 * in valid UTF-8 that byte is part of ordinary characters such as 'à' or '†'.
 *
 * @param string $str      Text to decode.
 * @param int    $maxDepth Maximum number of passes.
 * @return string Decoded text ($str unchanged when it is empty or $maxDepth <= 0).
 */
private function fullDecode($str, $maxDepth = 5)
{
    if (empty($str) || $maxDepth <= 0) {
        return $str;
    }

    // Entities for the three comparison signs, replaced before the generic decode.
    $htmlEntityMap = [
        '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤',
        '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥',
        '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠',
    ];

    // Repairs, applied strictly in this order (later rules see the output of
    // earlier ones).
    $repairRules = [
        ['/0B\s*\?0/', '0B≥30'],
        ['/DL\s*\?.18/', 'DL≥0.18'],
        ['/≠\s*(\d+)/', '≠$1'],
        ['/≤\s*(\d+)/', '≤$1'],
        ['/\?(\d+)/', '≥$1'],
        ['/\?(\.\d+)/', '≥0$1'],
        ['/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/', '≤、≥、≠$4'],
        ['/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/', '≤、≥、≠$4'],
        ['/LE\s*\?(\d+)/', '≤$1'],
        ['/NE\s*\?(\d+)/', '≠$1'],
    ];

    $regUnicode = '/\\\\u([0-9a-fA-F]{4})/';
    $unicodeCallback = function ($m) {
        return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
    };

    for ($pass = 0; $pass < $maxDepth; $pass++) {
        $before = $str;
        $replacements = 0;

        // 1. \uXXXX escapes.
        $str = $this->decodeUnicode($str);

        // 2. HTML entities: the comparison signs first, then everything else.
        $str = strtr($str, $htmlEntityMap);
        $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // 3. \uXXXX escapes that only appeared after entity decoding.
        $decoded = preg_replace_callback($regUnicode, $unicodeCallback, $str);
        if ($decoded !== null) {
            $str = $decoded;
        }

        // 4. Non-breaking spaces.
        $str = str_replace("\xC2\xA0", ' ', $str);
        if (!mb_check_encoding($str, 'UTF-8')) {
            $str = str_replace("\xA0", ' ', $str);
        }

        // 5. Placeholder repairs.
        foreach ($repairRules as $rule) {
            $repaired = preg_replace($rule[0], $rule[1], $str, -1, $count);
            if ($repaired !== null) {
                $str = $repaired;
                $replacements += $count;
            }
        }

        // 6. Prefix repair kept from the original implementation.
        // NOTE(review): search and replacement look identical in this view of
        // the file; the search string presumably held a special character.
        $str = str_replace('d with ', 'd with ', $str, $count);
        $replacements += $count;

        // Stop as soon as a pass changed nothing.
        if ($replacements === 0 && $str === $before) {
            break;
        }
    }

    return trim($str, ':');
}
// private function fullDecode($str, $maxDepth = 5) {
// if (empty($str) || $maxDepth <= 0) {
// return $str;
// }
// $original = $str;
// $depth = 0;
// // 循环解码,直到无变化或达到最大次数
// while (true) {
// $depth++;
// if ($depth > $maxDepth) {
// break; // 防止过度解码导致死循环
// }
// // 1. 解码 Unicode 转义(\uXXXX 格式)
// $str = $this->decodeUnicode($str);
// // 2. 解码 HTML 实体(&amp;、&#039;、&lt; 等)
// $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
// $str = preg_replace_callback('/\\\\u([0-9a-fA-F]{4})/', function ($m) {
// return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
// }, $str);
// $str = str_replace([chr(0xC2).chr(0xA0), chr(0xA0)], ' ', $str);
// // 2. 核心:强制匹配所有可能的乱码格式,还原≥
// // 匹配0B?0、0B ?0、0B ?0空格/制表符)→ 0B≥30
// $str = preg_replace('/0B\s*\?0/', '0B≥30', $str);
// // 匹配DL?.18、DL ?.18、DL ?.18 → DL≥0.18
// $str = preg_replace('/DL\s*\?.18/', 'DL≥0.18', $str);
// // 通用匹配:数字前的?(如?30、?0.18)→ ≥30、≥0.18(防止其他变体)
// $str = preg_replace('/\?(\d+)/', '≥$1', $str);
// $str = preg_replace('/\?(\.\d+)/', '≥0$1', $str);
// // 3. 修复前缀的"d with "可能的乱码(若有)
// $str = str_replace('d with ', 'd with ', $str); // 若前缀也乱码,可同步替换
// // 若解码后无变化,退出循环
// if ($str === $original) {
// break;
// }
// $original = $str;
// }
// return trim($str,':');
// }
/**
 * Decode literal \uXXXX escape sequences into UTF-8 characters.
 *
 * A high surrogate immediately followed by a low surrogate (for example
 * \ud83d\ude00) is decoded as one character; previously each half was
 * converted on its own, which cannot yield a valid character.
 *
 * @param string $str Text that may contain \uXXXX escapes.
 * @return string|null Decoded text (null when preg_replace_callback fails).
 */
private function decodeUnicode($str)
{
    return preg_replace_callback(
        '/\\\\u([dD][89abAB][0-9a-fA-F]{2})\\\\u([dD][c-fC-F][0-9a-fA-F]{2})|\\\\u([0-9a-fA-F]{4})/',
        function ($matches) {
            // Group 3 is set for a single escape, groups 1+2 for a surrogate pair.
            $hex = (isset($matches[3]) && $matches[3] !== '')
                ? $matches[3]
                : $matches[1] . $matches[2];
            // The hex digits are UTF-16 code units in big-endian order.
            return mb_convert_encoding(pack('H*', $hex), 'UTF-8', 'UTF-16BE');
        },
        $str
    );
}
/**
 * List the funding phrases that occur in a text.
 *
 * Matching is case-insensitive. Each hit is reported in the spelling of the
 * built-in phrase list, once only, in order of first occurrence.
 *
 * @param string $content Text to scan.
 * @return array List of matched phrases; empty when nothing matches.
 */
private function getMatchedFundPhrases($content = '')
{
    if (empty($content)) {
        return [];
    }

    $fundPhrases = [
        'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
        'Funding was provided by', 'Funded in part by','FUNDING:'
    ];

    // Lower-cased phrase => canonical spelling, plus the quoted alternatives
    // for the pattern ('#' is the delimiter).
    $canonical = [];
    $alternatives = [];
    foreach ($fundPhrases as $phrase) {
        $canonical[strtolower($phrase)] = $phrase;
        $alternatives[] = preg_quote($phrase, '#');
    }

    $pattern = '#(' . implode('|', $alternatives) . ')#i';
    if (!preg_match_all($pattern, $content, $hits) || empty($hits[1])) {
        return [];
    }

    // Array keys give de-duplication while keeping first-seen order.
    $seen = [];
    foreach ($hits[1] as $hit) {
        $key = strtolower($hit);
        if (isset($canonical[$key])) {
            $seen[$canonical[$key]] = true;
        }
    }

    return array_keys($seen);
}
/**
 * Logging hook. Currently a no-op: the only statement in the body is
 * commented out, so $msg is ignored.
 *
 * @param string $msg Message to log.
 */
private function log($msg){
// echo date('[Y-m-d H:i:s] ') . $msg . "\n";
}
}