Files
tougao/application/common/ArticleParserService.php
2025-11-06 20:35:34 +08:00

1264 lines
54 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use PhpOffice\PhpWord\IOFactory;
use think\Exception;
use ZipArchive;
use RecursiveIteratorIterator;
use RecursiveDirectoryIterator;
use PhpOffice\PhpWord\Settings;
use PhpOffice\PhpWord\Element\TextRun;
use DOMDocument;
use DOMXPath;
// use BadMethodCallException;
class ArticleParserService
{
private $phpWord;
private $sections;
/**
 * Load a Word document and cache its sections for later parsing.
 *
 * A constructor cannot hand a value back to `new`, so a missing file simply
 * leaves the parser with an empty section list; the extraction methods then
 * yield empty results. When the first load attempt throws, the document is
 * re-packed without EMF images and loaded once more.
 *
 * @param string $filePath Path of the DOCX file to parse.
 * @throws \Exception When the EMF-stripped copy cannot be produced or loaded.
 */
public function __construct($filePath = '')
{
    // Always start from a defined, iterable state.
    $this->sections = [];
    if (!is_string($filePath) || $filePath === '' || !file_exists($filePath)) {
        return;
    }
    try {
        $this->loadDocument($filePath);
    } catch (\Exception $e) {
        // Fallback: strip EMF images from the package, then retry on the copy.
        $processedFilePath = $this->removeEmfFromDocx($filePath);
        try {
            $this->loadDocument($processedFilePath);
        } finally {
            // The processed copy is only needed while loading; never leave it behind.
            if (is_file($processedFilePath)) {
                @unlink($processedFilePath);
            }
        }
    }
}
/**
 * Read the document once and store both the PhpWord object and its sections.
 *
 * @param string $path Path of the DOCX file to load.
 */
private function loadDocument($path)
{
    $reader = IOFactory::createReader();
    // Keep the complete section structure instead of data-only reading.
    $reader->setReadDataOnly(false);
    Settings::setCompatibility(false);
    Settings::setOutputEscapingEnabled(true); // avoid XML escaping conflicts
    $this->phpWord = $reader->load($path);
    $this->sections = $this->phpWord->getSections();
}
/**
 * Build a copy of a DOCX package with every EMF image removed.
 *
 * The package is unpacked into a temporary directory under runtime/, all
 * "*.emf" files are deleted, and the remaining files are zipped again. The
 * temporary directory is removed on success and on failure alike.
 *
 * @param string $docxPath Path of the original DOCX file.
 * @return string Path of the processed temporary DOCX file (caller deletes it).
 * @throws \Exception When the package cannot be opened, unpacked or re-packed.
 */
private function removeEmfFromDocx($docxPath){
    $zip = new ZipArchive();
    if ($zip->open($docxPath) !== true) {
        throw new \Exception("无法打开 DOCX 文件:{$docxPath}");
    }
    // 1. Create the temporary extraction directory.
    $tempDir = rtrim(ROOT_PATH,'/').'/runtime/'.uniqid('docx_temp_');
    if (!@mkdir($tempDir, 0700, true) && !is_dir($tempDir)) {
        $zip->close();
        throw new \Exception("Unable to create temporary directory: {$tempDir}");
    }
    try {
        // 2. Unpack the DOCX into the temporary directory.
        $extracted = $zip->extractTo($tempDir);
        $zip->close();
        if (!$extracted) {
            throw new \Exception("Unable to extract DOCX file: {$docxPath}");
        }
        // 3. Collect the EMF files first, then delete them, so the directory
        //    is not modified while it is being iterated.
        $emfFiles = [];
        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($tempDir, \FilesystemIterator::SKIP_DOTS)
        );
        foreach ($iterator as $file) {
            if ($file->isFile() && strtolower(pathinfo($file->getPathname(), PATHINFO_EXTENSION)) === 'emf') {
                $emfFiles[] = $file->getPathname();
            }
        }
        foreach ($emfFiles as $emfFile) {
            unlink($emfFile);
        }
        // 4. Re-pack the remaining files as a DOCX.
        $processedPath = $tempDir . '_processed.docx';
        $newZip = new ZipArchive();
        if ($newZip->open($processedPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
            throw new \Exception("无法创建处理后的 DOCX 文件");
        }
        $this->addFilesToZip($tempDir, $newZip);
        // Files queued with addFile() are read when the archive is closed, so
        // the close must succeed before the temporary directory is removed.
        if (!$newZip->close()) {
            throw new \Exception("Unable to write processed DOCX file: {$processedPath}");
        }
        return $processedPath;
    } finally {
        // 5. Remove the extraction directory whatever happened above.
        $this->deleteDir($tempDir);
    }
}
/**
 * Recursively add every file below a directory to a ZipArchive.
 *
 * Entry names are always computed relative to the directory passed to the
 * first (outermost) call, so "<root>/word/media/a.png" is stored as
 * "word/media/a.png" at any nesting depth and top-level files carry no
 * directory prefix.
 *
 * @param string      $dir     Directory to add.
 * @param ZipArchive  $zip     Open archive that receives the files.
 * @param string|null $baseDir Root that entry names are relative to; defaults
 *                             to $dir and is passed down on recursion.
 */
private function addFilesToZip($dir, $zip, $baseDir = null)
{
    $dir = rtrim($dir, '/\\');
    $baseDir = ($baseDir === null) ? $dir : rtrim($baseDir, '/\\');
    $files = scandir($dir);
    if ($files === false) {
        return;
    }
    foreach ($files as $file) {
        if ($file === '.' || $file === '..') {
            continue;
        }
        $filePath = $dir . '/' . $file;
        if (is_dir($filePath)) {
            $this->addFilesToZip($filePath, $zip, $baseDir);
            continue;
        }
        // Strip the root prefix and normalise separators: ZIP entry names use "/".
        $relativePath = ltrim(str_replace('\\', '/', substr($filePath, strlen($baseDir))), '/');
        $zip->addFile($filePath, $relativePath);
    }
}
/**
 * Recursively delete a temporary extraction directory.
 *
 * Safety checks are applied once, to the top-level directory only: its name
 * must start with "docx_temp_", it must not be a symbolic link, and it must
 * resolve to a location inside ROOT_PATH/runtime. Its contents (for example
 * "word" or "_rels") are then removed without repeating the name check.
 *
 * @param string $dir Directory to delete.
 * @return bool True when the directory and everything in it was removed.
 */
private function deleteDir($dir){
    // 1. Basic validation: non-empty string naming an existing directory.
    if (!is_string($dir) || trim($dir) === '' || !is_dir($dir)) {
        return false;
    }
    // 2. Normalise the path (drop trailing separators of either kind).
    $dir = rtrim($dir, '/\\');
    // 3. Only directories created by this class may be deleted.
    if (strpos(basename($dir), 'docx_temp_') !== 0 || is_link($dir)) {
        return false;
    }
    // 4. The directory must live strictly inside the runtime directory.
    $realDir = realpath($dir);
    $realRuntimeDir = realpath(rtrim(ROOT_PATH, '/') . '/runtime');
    if ($realDir === false || $realRuntimeDir === false) {
        return false;
    }
    // The trailing separator prevents "runtime2/..." from passing a prefix test.
    if (strpos($realDir, $realRuntimeDir . DIRECTORY_SEPARATOR) !== 0) {
        return false;
    }
    // 5. Remove the validated tree.
    return $this->removeDirTree($realDir);
}
/**
 * Delete a directory tree that has already been validated by deleteDir().
 *
 * @param string $path Absolute path of the directory to remove.
 * @return bool True when every entry and the directory itself was removed.
 */
private function removeDirTree($path){
    $entries = @scandir($path);
    if ($entries === false) {
        return false;
    }
    $allRemoved = true;
    foreach ($entries as $entry) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }
        $child = $path . DIRECTORY_SEPARATOR . $entry;
        if (is_link($child) || !is_dir($child)) {
            // Symbolic links are unlinked themselves and never followed.
            if (!@unlink($child)) {
                $allRemoved = false;
            }
        } elseif (!$this->removeDirTree($child)) {
            $allRemoved = false;
        }
    }
    // rmdir() only succeeds on an empty directory.
    return @rmdir($path) && $allRemoved;
}
/**
 * Entry point: validate the uploaded file, parse it and return the result as JSON.
 *
 * Status codes in the returned JSON: 1 success, 2 no file given, 3 file
 * missing, 4 file unreadable, 5 title could not be determined.
 *
 * @param string $sFileUrl Path of the uploaded document.
 * @return string JSON-encoded result.
 */
public static function uploadAndParse($sFileUrl){
    $fail = static function ($status, $message) {
        return json_encode(['status' => $status, 'msg' => $message]);
    };
    // Required value check.
    if (empty($sFileUrl)) {
        return $fail(2, 'Please upload the submission file');
    }
    // The file must exist and be readable.
    if (!file_exists($sFileUrl)) {
        return $fail(3, 'The uploaded file does not exist');
    }
    if (!is_readable($sFileUrl)) {
        return $fail(4, 'The uploaded file is unreadable');
    }
    // Parse the document.
    $parser = new self($sFileUrl);
    // Title.
    $title = $parser->getTitle();
    if (empty($title)) {
        return $fail(5, 'Article title retrieval failed');
    }
    $result = ['title' => $title];
    // Authors: the full list plus the names flagged as corresponding authors.
    $authorInfo = $parser->getAuthors($result);
    $result['author'] = empty($authorInfo['author']) ? [] : $authorInfo['author'];
    $result['report'] = empty($authorInfo['report']) ? [] : $authorInfo['report'];
    // Institutions.
    $result['company'] = $parser->getCompany($result);
    // Corresponding-author contact details.
    $result['corresponding'] = $parser->getCorrespondingAuthors($result);
    // Keywords and abstract; keys already present in $result are kept.
    $extracted = $parser->extractFromWord();
    if (!empty($extracted['data'])) {
        $result += $extracted['data'];
    }
    return json_encode(['status' => 1,'msg' => 'success','data' => $result]);
}
/**
 * Extract the article title.
 *
 * The title is taken to be the first element, in document order, whose
 * trimmed text is longer than 10 characters.
 *
 * @return string The title, or '' when no section or no such element exists.
 */
private function getTitle(){
    $title = '';
    if (!empty($this->sections)) {
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $candidate = trim($this->getTextFromElement($element));
                if (mb_strlen($candidate) > 10) {
                    // First sufficiently long paragraph wins; stop scanning.
                    $title = $candidate;
                    break 2;
                }
            }
        }
    }
    // Text that is not valid UTF-8 is assumed to be GBK and converted.
    if ($title !== '' && !mb_check_encoding($title, 'UTF-8')) {
        $title = mb_convert_encoding($title, 'UTF-8', 'GBK');
    }
    return $title;
}
// 提取作者
// private function getAuthors($aParam = []) {
// $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
// $sAuthorContent = $this->getNextParagraphAfterText($title);
// if (empty($sAuthorContent)) {
// return ['author' => [], 'report' => []];
// }
// //编码修复
// $possibleEncodings = [
// 'Windows-1252', 'UTF-8', 'GBK', 'GB2312',
// 'Latin-1', 'ISO-8859-1', 'CP1252'
// ];
// $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
// $sAuthorContent = $encodedContent ?: $sAuthorContent;
// //清理不可见字符
// $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
// //修复特殊符号乱码
// $symbolMap = [
// '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†',
// ':' => ':', ',' => ',', '—' => '-',
// '啊' => '' // 针对性移除异常字符“啊”(若为固定乱码)
// ];
// $sAuthorContent = strtr($sAuthorContent, $symbolMap);
// //格式标准化
// $sAuthorContent = str_replace(['', ';', '', '、'], ',', $sAuthorContent); // 统一分隔符
// $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
// $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
// $sAuthorContent = trim($sAuthorContent);
// // 处理作者
// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
// $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
// $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分)
// //标记上标内的逗号+空格(多编号)
// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
// // 原有步骤2正则匹配扩展上标符号支持保持原有逻辑
// $pattern = '/
// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格)
// \s* # 姓名与上标间空格
// ( # 上标组(扩展符号支持)
// \d+ # 起始数字
// (?:[†#*,]|<SEP>\d+)* # 允许:†#*符号、逗号、<SEP>+数字兼容1,†、1,*等)
// )
// \s*,? # 作者间逗号(可选)
// (?=\s|$) # 确保后面是空格或结尾
// /ux';
// preg_match_all($pattern, $tempStr, $matches);
// $authorList = [];
// if(!empty($matches[1])){
// foreach ($matches[1] as $i => $name) {
// $name = trim($name);
// $superscript = trim($matches[2][$i]);
// $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
// // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样)
// $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
// if (!empty($name)) {
// $authorList[] = [
// 'name' => $name,
// 'superscript' => $superscript
// ];
// }
// }
// }else {
// // 按“两个或多个连续空格”拆分(姓名之间的分隔)
// $authorList = array_filter(
// array_map('trim',
// preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
// )
// );
// }
// // //处理作者
// // $authorList = [];
// // // 新正则:匹配“姓名+上标”整体允许上标含逗号如1,†)
// // // 逻辑:姓名以字母/中文开头,上标以数字开头、以符号/数字结尾
// // // if (preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*([\d,†#*]+)/u', $sAuthorContent, $matches)) {
// // if(preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*(\d[\d,†#\s*]*)/u', $sAuthorContent, $matches)){
// // for ($i = 0; $i < count($matches[1]); $i++) {
// // $authorList[] = trim($matches[1][$i] . $matches[2][$i]);
// // }
// // } else {
// // // 按“两个或多个连续空格”拆分(姓名之间的分隔)
// // $authorList = array_filter(
// // array_map('trim',
// // preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
// // )
// // );
// // }
// $aAuthorData = [];
// $aReport = [];
// $namePattern = '/
// (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符)
// [\x{4e00}-\x{9fa5}]+| # 中文姓名
// [\x{1800}-\x{18AF}]+| # 蒙古文姓名
// [A-Z]\.) # 单字母缩写(如 J.
// /ux';
// var_dump($authorList);exit;
// foreach ($authorList as $authorStr) {
// if (empty($authorStr)) continue;
// var_dump($authorList);exit;
// //分离姓名与上标支持上标含逗号如1,†)
// $superscript = '';
// // 新正则:匹配以数字开头、含逗号/符号的完整上标如1,†、2*#
// $authorStr = trim(trim($authorStr,','),' ');
// // if (preg_match('/([\d,†#*]+)$/u', $authorStr, $supMatch)) {
// // if(preg_match('/\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)){
// // if (preg_match('/.*?\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)) {
// // if (preg_match('/.*?\s*([\d,\x{2020}#* ]+?)\s*$/u', $authorStr, $supMatch)) {
// // if (preg_match('/^(.+?)\D*?(\d[\d,#*†,\s]*)$/u', $authorStr, $supMatch)) {
// // $superscript = $supMatch[1];
// // // 移除上标,保留纯姓名(避免残留符号)
// // $nameStr = trim(preg_replace('/' . preg_quote($superscript, '/') . '$/', '', $authorStr));
// // } else {
// // $nameStr = $authorStr;
// // }
// $pattern = '/^(.+?)\s*(\d[\d,#*†\s]*?)\s*$/u';
// if (preg_match($pattern, $authorStr, $supMatch)) {
// $nameStr = empty($supMatch[1]) ? '' : trim($supMatch[1]); // 姓名部分:"Liguo Zhang"
// $superscript = empty($supMatch[2]) ? $nameStr : $nameStr.trim($supMatch[2]); // 上标部分:"1
// // echo "姓名: $nameStr, 上标: $superscript\n";
// } else {
// $nameStr = $authorStr;
// }
// //验证姓名合法性(过滤无效内容)
// if (!preg_match($namePattern, $nameStr)) {
// continue;
// }
// //解析上标信息正确识别1,†中的机构编号和符号)
// $companyId = '';
// $isSuper = 0;
// $isReport = 0;
// if (!empty($superscript)) {
// // 提取机构编号忽略上标中的逗号如1,† → 提取1
// if (preg_match('/(\d+)/', $superscript, $numMatch)) {
// $companyId = $numMatch[1];
// }
// // 识别特殊符号(#为超级作者,*†为通讯作者)
// $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
// $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
// }
// if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
// $nameStr = trim($match[1]);
// }
// $aAuthorData[] = [
// 'name' => $nameStr,
// 'company_id' => $companyId,
// 'is_super' => $isSuper,
// 'is_report' => $isReport
// ];
// if ($isReport) {
// $aReport[] = $nameStr;
// }
// }
// var_dump($aAuthorData);exit;
// return ['author' => $aAuthorData,'report' => array_unique($aReport)];
// }
/**
 * Parse an author line such as "John Smith1,2*, Jane Doe2#" into records.
 *
 * Each record holds the name, the raw superscript, the institution numbers
 * found in the superscript, and two flags: is_super ("#" present) and
 * is_report ("*" or "†" present).
 *
 * Name characters are any Unicode letter or combining mark plus space, dot,
 * hyphen and apostrophe, so names such as "Muñoz", "Müller" or "O'Brien" are
 * kept intact instead of losing characters.
 *
 * @param string $str Author line.
 * @return array List of ['name', 'superscript', 'company_id', 'is_super', 'is_report'].
 */
private function parseAuthorsWithoutRegex($str = '') {
    if (empty($str)) {
        return [];
    }
    // Clean up stray characters.
    // NOTE(review): several search entries below are empty strings in this copy
    // of the file (str_replace skips empty needles) and two look mis-encoded;
    // presumably the original characters were lost — confirm against the
    // canonical source before relying on them.
    $str = mb_convert_encoding($str, 'UTF-8', 'auto');
    $str = str_replace(["\xC2\xA0", 'ï¼', '�', '', '', '', '', '', '', '', '', '', '', ''],
        [' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], $str);
    $str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str));
    $markers = ['#', '*', '†', '‡', '§'];
    // Pass 1: drop the space inside multi-number superscripts ("2, 3" -> "2,3").
    $len = mb_strlen($str);
    $processed = '';
    for ($i = 0; $i < $len; $i++) {
        $char = mb_substr($str, $i, 1);
        if ($char === ',' && $i - 1 >= 0 && $i + 2 < $len) {
            $prevChar = mb_substr($str, $i - 1, 1);
            $next1 = mb_substr($str, $i + 1, 1);
            $next2 = mb_substr($str, $i + 2, 1);
            if (ctype_digit($prevChar) && $next1 === ' ' && ctype_digit($next2)) {
                $processed .= $char;
                $i += 1; // skip the space that follows the comma
                continue;
            }
        }
        $processed .= $char;
    }
    $str = $processed;
    // Pass 2: drop the space between a number and a marker ("1 *" -> "1*").
    $len = mb_strlen($str);
    $processed = '';
    for ($i = 0; $i < $len; $i++) {
        $char = mb_substr($str, $i, 1);
        if (ctype_digit($char) && $i + 2 < $len) {
            $next1 = mb_substr($str, $i + 1, 1);
            $next2 = mb_substr($str, $i + 2, 1);
            if ($next1 === ' ' && in_array($next2, $markers, true)) {
                $processed .= $char . $next2;
                $i += 2;
                continue;
            }
        }
        $processed .= $char;
    }
    $str = $processed;
    // Pass 3: collapse runs of spaces.
    $len = mb_strlen($str);
    $processed = '';
    $prevSpace = false;
    for ($i = 0; $i < $len; $i++) {
        $char = mb_substr($str, $i, 1);
        if ($char === ' ') {
            if (!$prevSpace) {
                $processed .= $char;
                $prevSpace = true;
            }
        } else {
            $processed .= $char;
            $prevSpace = false;
        }
    }
    $str = trim($processed);
    // A character belongs to a name when it is a letter/combining mark or one
    // of the punctuation characters that occur inside names.
    $namePunctuation = [' ', '.', '-', "'", "\xE2\x80\x99"];
    $isNameChar = static function ($ch) use ($namePunctuation) {
        if (in_array($ch, $namePunctuation, true)) {
            return true;
        }
        return preg_match('/^[\p{L}\p{M}]$/u', $ch) === 1;
    };
    $superscriptSymbols = ['#', '*', '†', ',', '‡', '§'];
    // Split the line into authors.
    $authors = [];
    $currentName = '';
    $currentSuperscript = '';
    $inName = true;
    $len = mb_strlen($str);
    for ($i = 0; $i < $len; $i++) {
        $char = mb_substr($str, $i, 1);
        // "comma + space" separates two authors.
        if ($char === ',' && $i + 1 < $len) {
            $nextChar = mb_substr($str, $i + 1, 1);
            if ($nextChar === ' ') {
                if (!empty($currentName)) {
                    $currentSuperscript = rtrim($currentSuperscript, ',');
                    $authors[] = [
                        'name' => trim($currentName),
                        'superscript' => trim($currentSuperscript)
                    ];
                }
                $currentName = '';
                $currentSuperscript = '';
                $inName = true;
                $i++;
                continue;
            }
        }
        if ($isNameChar($char)) {
            if ($inName) {
                $currentName .= $char;
            } else {
                // A name character after a superscript starts the next author.
                $currentSuperscript = rtrim($currentSuperscript, ',');
                $authors[] = [
                    'name' => trim($currentName),
                    'superscript' => trim($currentSuperscript)
                ];
                $currentName = $char;
                $currentSuperscript = '';
                $inName = true;
            }
        } elseif (ctype_digit($char) || in_array($char, $superscriptSymbols, true)) {
            // Superscript: digits, commas and marker symbols.
            $inName = false;
            $currentSuperscript .= $char;
        }
        // Any other character is ignored.
    }
    // Flush the last author.
    if (!empty($currentName)) {
        $currentSuperscript = rtrim($currentSuperscript, ',');
        $authors[] = [
            'name' => trim($currentName),
            'superscript' => trim($currentSuperscript)
        ];
    }
    // Derive institution numbers and the two flags from each superscript.
    foreach ($authors as $index => &$author) {
        $institutionIds = [];
        $superscript = $author['superscript'];
        $numStr = '';
        $superLen = mb_strlen($superscript);
        for ($i = 0; $i < $superLen; $i++) {
            $c = mb_substr($superscript, $i, 1);
            if (ctype_digit($c)) {
                $numStr .= $c;
            } elseif (!empty($numStr)) {
                $institutionIds[] = (int)$numStr;
                $numStr = '';
            }
        }
        if (!empty($numStr)) {
            $institutionIds[] = (int)$numStr;
        }
        $author['company_id'] = array_values(array_unique($institutionIds));
        // "#" sets is_super; "*" or "†" marks a corresponding author.
        $author['is_super'] = strpos($superscript, '#') !== false ? 1 : 0;
        $author['is_report'] = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
    }
    unset($author); // release the reference
    return $authors;
}
/**
 * Extract the author list from the paragraph that follows the title.
 *
 * @param array $aParam Optional 'title' key; the title is detected when absent.
 * @return array ['author' => list of author records, 'report' => names of
 *               authors flagged as corresponding authors].
 */
private function getAuthors($aParam = []) {
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    if (empty($sAuthorContent)) {
        return ['author' => [], 'report' => []];
    }
    // Encoding repair: convert only text that is NOT already valid UTF-8.
    // Guessing among several source encodings with Windows-1252 first can
    // re-encode valid UTF-8, and an unrecognised encoding name in the list
    // raises a ValueError on PHP 8.
    if (!mb_check_encoding($sAuthorContent, 'UTF-8')) {
        $sAuthorContent = mb_convert_encoding($sAuthorContent, 'UTF-8', 'GBK');
    }
    // Remove control and zero-width characters.
    $cleaned = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
    if ($cleaned !== null) {
        $sAuthorContent = $cleaned;
    }
    // Repair known garbled symbols.
    // NOTE(review): these keys are kept exactly as found; some look mis-encoded
    // in this copy of the file — confirm against the canonical source.
    $symbolMap = [
        '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†',
        ':' => ':', ',' => ',', '—' => '-',
        '啊' => '' // targeted removal of a stray character seen in some inputs
    ];
    $sAuthorContent = strtr($sAuthorContent, $symbolMap);
    // Normalise separators and whitespace.
    $sAuthorContent = str_replace(['', ';', '', '、'], ',', $sAuthorContent); // unify separators
    $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // "and" -> comma
    $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // collapse whitespace
    $sAuthorContent = trim($sAuthorContent);
    $aAuthor = $this->parseAuthorsWithoutRegex($sAuthorContent);
    if(empty($aAuthor)){
        return ['author' => [],'report' => []];
    }
    $aReport = $aAuthorData = [];
    foreach ($aAuthor as $key => $value) {
        // Skip records that carry neither a name nor a superscript.
        if(empty($value['name']) && empty($value['superscript'])){
            continue;
        }
        if(!mb_check_encoding($value['name'], 'UTF-8')){
            $value['name'] = mb_convert_encoding($value['name'], 'UTF-8', 'GBK');
        }
        if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){
            $aReport[] = $value['name'];
        }
        $aAuthorData[] = $value;
    }
    return ['author' => $aAuthorData,'report' => array_unique($aReport)];
}
// private function getAuthors($aParam = []) {
// $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
// $sAuthorContent = $this->getNextParagraphAfterText($title);
// if (empty($sAuthorContent)) {
// return ['author' => [], 'report' => []];
// }
// //编码修复
// $possibleEncodings = [
// 'Windows-1252', 'UTF-8', 'GBK', 'GB2312',
// 'Latin-1', 'ISO-8859-1', 'CP1252'
// ];
// $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
// $sAuthorContent = $encodedContent ?: $sAuthorContent;
// //清理不可见字符
// $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
// //修复特殊符号乱码
// $symbolMap = [
// '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†',
// ':' => ':', ',' => ',', '—' => '-',
// '啊' => '' // 针对性移除异常字符“啊”(若为固定乱码)
// ];
// $sAuthorContent = strtr($sAuthorContent, $symbolMap);
// //格式标准化
// $sAuthorContent = str_replace(['', ';', '', '、'], ',', $sAuthorContent); // 统一分隔符
// $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
// $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
// $sAuthorContent = trim($sAuthorContent);
// var_dump($this->parseAuthorsWithoutRegex($sAuthorContent));exit;
// // 关键预处理:兼容"and"分隔符、清理乱码、统一空格
// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto');
// $content = str_replace(["\xC2\xA0", 'ï¼', '�', ''], ' ', $content); // 清理乱码和全角符号
// $content = preg_replace('/\band\b/i', ',', $content); // 将 "and" 转为逗号(统一分隔符)
// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并数字与符号间的空格(如"1 *"→"1*"
// $content = trim(preg_replace('/\s+/', ' ', $content)); // 合并连续空格
// // 标记上标内的逗号(多编号处理)
// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
// // 核心正则(保持原有结构,扩展符号支持)
// $pattern = '/
// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格、连字符)
// \s* # 姓名与上标间的空格允许0或多个
// ( # 上标组(扩展兼容所有符号)
// \d+ # 起始数字至少1个数字
// (?:[†#*,]|<SEP>\d+)* # 允许:符号(†#*)、逗号、<SEP>+数字(多编号)
// )
// \s*,? # 作者间的逗号(可选,允许逗号前有空格)
// (?=\s|$) # 确保后面是空格或字符串结尾(避免跨作者匹配)
// /ux';
// preg_match_all($pattern, $tempStr, $matches);
// // 解析结果并格式化
// $authorList = [];
// if (!empty($matches[1])) {
// foreach ($matches[1] as $i => $name) {
// $name = trim($name);
// $superscript = trim($matches[2][$i]);
// $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾多余逗号
// if (!empty($name)) {
// $authorList[] = [
// 'name' => $name,
// 'superscript' => $superscript
// ];
// }
// }
// }
// // 输出结果
// echo "<pre>";
// print_r($authorList);
// echo "</pre>";
// exit;
// // 处理作者
// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
// $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
// $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分)
// //标记上标内的逗号+空格(多编号)
// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
// // 原有步骤2正则匹配扩展上标符号支持保持原有逻辑
// $pattern = '/
// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格)
// \s* # 姓名与上标间空格
// ( # 上标组(扩展符号支持)
// \d+ # 起始数字
// (?:[†#*,]|<SEP>\d+)* # 允许:†#*符号、逗号、<SEP>+数字兼容1,†、1,*等)
// )
// \s*,? # 作者间逗号(可选)
// (?=\s|$) # 确保后面是空格或结尾
// /ux';
// preg_match_all($pattern, $tempStr, $matches);
// var_dump($matches);exit;
// $authorList = [];
// if(!empty($matches[1])){
// foreach ($matches[1] as $i => $name) {
// $name = trim($name);
// $superscript = trim($matches[2][$i]);
// $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
// // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样)
// $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
// if (!empty($name)) {
// $authorList[] = [
// 'name' => $name,
// 'superscript' => $superscript
// ];
// }
// }
// }else {
// // 按“两个或多个连续空格”拆分(姓名之间的分隔)
// $authorList = array_filter(
// array_map('trim',
// preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
// )
// );
// }
// // //处理作者
// $aAuthorData = [];
// $aReport = [];
// $namePattern = '/
// (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符)
// [\x{4e00}-\x{9fa5}]+| # 中文姓名
// [\x{1800}-\x{18AF}]+| # 蒙古文姓名
// [A-Z]\.) # 单字母缩写(如 J.
// /ux';
// foreach ($authorList as $authorStr){
// if (empty($authorStr)) continue;
// //获取下标
// $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript'];
// $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name'];
// $companyId = [];
// $isSuper = 0;
// $isReport = 0;
// if (!empty($superscript)) {
// // 提取机构编号忽略上标中的逗号如1,† → 提取1
// preg_match_all('/\d+/', $superscript, $numMatch);
// // 识别特殊符号(#为超级作者,*†为通讯作者)
// $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
// $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
// }
// if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
// $nameStr = trim($match[1]);
// }
// $aAuthorData[] = [
// 'name' => $nameStr,
// 'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
// 'is_super' => $isSuper,
// 'is_report' => $isReport
// ];
// if ($isReport) {
// $aReport[] = $nameStr;
// }
// }
// return ['author' => $aAuthorData,'report' => array_unique($aReport)];
// }
/**
 * Extract the numbered institution list that follows the author paragraph.
 *
 * Lines starting with digits open a new entry keyed by that number; lines
 * without a leading number are appended to the current entry.
 *
 * @param array $aParam Optional 'title' and 'authors' (raw author paragraph) keys.
 * @return array Map of institution number => cleaned institution text.
 */
private function getCompany($aParam = []){
    // Title.
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    // Author paragraph below the title.
    $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];
    // Everything after the author paragraph, one entry per element.
    $allLines = $this->getContentAfterText($sAuthorContent,1);
    if(empty($allLines)){
        return [];
    }
    // Group by leading number, merging continuation lines.
    $grouped = [];
    $currentNumber = null;
    foreach ($allLines as $line) {
        $line = trim($line);
        if (empty($line)) continue;
        // Collect the run of ASCII digits at the start of the line.
        $number = '';
        $i = 0;
        $lineLen = strlen($line);
        while ($i < $lineLen && ctype_digit($line[$i])) {
            $number .= $line[$i];
            $i++;
        }
        if ($number !== '') {
            $currentNumber = $number;
            // Skip the punctuation/space that may follow the number.
            while ($i < $lineLen && (in_array($line[$i], ['.', '*', ' '], true))) {
                $i++;
            }
            $grouped[$currentNumber] = trim(substr($line, $i));
            continue;
        }
        // Continuation of the current entry.
        if ($currentNumber !== null) {
            $grouped[$currentNumber] .= ' ' . $line;
        }
    }
    // Clean every entry. The former multi-encoding conversion is removed: its
    // result was stored in an undefined, unused variable, and its encoding list
    // can raise a ValueError on PHP 8.
    $aCompany = [];
    foreach ($grouped as $number => $institution) {
        $institution = preg_replace('/\s+/', ' ', $institution); // collapse whitespace
        $institution = rtrim($institution, '.');
        $institution = preg_replace('/^\d+\s+/', '', $institution);
        $institution = trim($institution);
        // Keep only the part up to "<city> <postcode>, <country>" when present.
        preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);
        $institution = trim($institutionmatches[1] ?? $institution);
        // Drop a trailing "*Email ..." block.
        if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) {
            $institution = trim($matches[1]);
        }
        if(!empty($institution) && !mb_check_encoding($institution, 'UTF-8')){
            $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
        }
        $aCompany[$number] = $institution;
    }
    return $aCompany;
}
/**
 * Extract contact details (e-mail, postal address, telephone) of the
 * corresponding authors.
 *
 * @param array $aParam Needs 'report' (corresponding author names); 'title' is optional.
 * @return array List of ['name', 'email', 'postal_address', 'tel'] records.
 */
private function getCorrespondingAuthors($aParam = []){
    $aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
    if(empty($aCorrespondingAuthor)){
        return [];
    }
    // Locate the title, the author paragraph and the institution paragraph.
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    $sCompany = $this->getNextParagraphAfterText($sAuthorContent); // raw institution paragraph
    if (empty($sCompany)) {
        // Fallback: join the parsed institution names.
        $aCompany = $this->getCompany($aParam);
        $sCompany = implode(' ', array_values($aCompany));
    }
    // Everything after the institution paragraph.
    $corrText = $this->getContentAfterText($sCompany);
    if (!is_string($corrText)) {
        $corrText = '';
    }
    // Encoding repair: only convert text that is not valid UTF-8. Guessing
    // among several encodings can re-encode valid UTF-8, and an unrecognised
    // encoding name in the list raises a ValueError on PHP 8.
    if ($corrText !== '' && !mb_check_encoding($corrText, 'UTF-8')) {
        $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
    }
    // Normalise the text.
    // NOTE(review): the search strings of the next str_replace calls are kept
    // exactly as found; in this copy they are empty / plain spaces, presumably
    // because the original characters were lost — confirm against the source.
    $corrText = str_replace(['', ''], [':', '@'], $corrText);
    $corrText = preg_replace('/\s+/', ' ', $corrText); // unify whitespace
    $corrText = str_replace(' ', ' ', $corrText);
    // Split into blocks at "*".
    $corrBlocks = preg_split('/\s*\*\s*/', $corrText);
    $corrBlocks = array_filter(array_map('trim', $corrBlocks));
    $aCorresponding = [];
    foreach ($corrBlocks as $block) {
        // Only blocks that mention a known corresponding author are used.
        $sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
        if (empty($sName)) {
            continue;
        }
        preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
        preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
        preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
        $aCorresponding[] = [
            'name' => $sName,
            'email' => isset($email[2]) ? trim($email[2]) : '',
            'postal_address' => isset($address[2]) ? trim($address[2]) : '',
            'tel' => isset($tel[2]) ? trim($tel[2]) : ''
        ];
    }
    if(empty($aCorresponding)){
        // Fallback format: "Corresponding Authors: Name, E-mail: x@y; ...".
        $pattern = '/Corresponding Authors: (.*?)(?=$|;)/s';
        preg_match($pattern, $corrText, $match);
        if (!empty($match[1])) {
            $corrContent = $match[1];
            $authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/';
            preg_match_all($authorPattern, $corrContent, $authors);
            if(!empty($authors[1])){
                $total = count($authors[1]);
                for ($i = 0; $i < $total; $i++) {
                    // Same record shape as the primary path above.
                    $aCorresponding[] = [
                        'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]),
                        'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i]),
                        'postal_address' => '',
                        'tel' => ''
                    ];
                }
            }
        }
    }
    return $aCorresponding;
}
/**
 * Find which corresponding-author name is mentioned in a text block.
 *
 * Matching is case-insensitive and multibyte-aware, and also tries the name
 * with its space-separated parts in reverse order ("Liguo Zhang" also
 * matches "Zhang Liguo").
 *
 * @param string $block     Text block to search.
 * @param array  $corrNames Candidate author names.
 * @return string The first candidate found in the block, or '' when none matches.
 */
private function matchCorrespondingName($block, $corrNames)
{
    $block = (string) $block;
    foreach ($corrNames as $name) {
        // An empty needle would match everywhere; skip it.
        if (!is_string($name) || $name === '') {
            continue;
        }
        if (mb_stripos($block, $name, 0, 'UTF-8') !== false) {
            return $name;
        }
        $nameParts = explode(' ', $name);
        if (count($nameParts) >= 2) {
            $reversedName = implode(' ', array_reverse($nameParts));
            if (mb_stripos($block, $reversedName, 0, 'UTF-8') !== false) {
                return $name;
            }
        }
    }
    return '';
}
/**
 * Return the text of the first non-empty element that follows the element
 * containing the target text (case-insensitive substring match).
 *
 * @param string $targetText Text to look for.
 * @return string Text of the following element, or '' when the target is
 *                empty, is not found, or nothing follows it.
 */
private function getNextParagraphAfterText($targetText){
    // An empty needle matches every paragraph on PHP 8 and warns on PHP 7;
    // treat it as "not found" instead.
    if (!is_string($targetText) || $targetText === '' || empty($this->sections)) {
        return '';
    }
    $found = false;
    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $text = $this->getTextFromElement($element);
            if(empty($text)){
                continue;
            }
            if ($found) {
                return $text;
            }
            if (stripos($text, $targetText) !== false) {
                $found = true;
            }
        }
    }
    return '';
}
/**
 * Collect the text of the elements that follow the target text.
 *
 * The target is located by fuzzy comparison: both strings are lower-cased and
 * reduced to ASCII letters and digits, and an element counts as the target
 * when similar_text() covers more than half of the reduced target. Collection
 * stops at the first element containing a stop keyword or after 200 entries.
 *
 * @param string $targetText  Text marking the start position.
 * @param int    $return_type 1 returns the collected entries as an array; any
 *                            other value returns them joined by "\n".
 * @return array|string
 */
private function getContentAfterText($targetText,$return_type = 2){
    $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract','ABSTRACT'];
    $maxLines = 200;
    $collected = [];
    $started = false;
    $lineNumber = 0;
    // The reduced target never changes, so compute it once.
    $needle = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($targetText));
    $needleLength = strlen($needle);
    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $lineNumber++;
            if (count($collected) >= $maxLines) {
                break 2;
            }
            $text = trim($this->getTextFromElement($element,$lineNumber));
            if (empty($text)) {
                continue;
            }
            if (!$started) {
                // Still searching for the element that holds the target text.
                $candidate = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($text));
                if ($needleLength > 0 && similar_text($candidate, $needle) / $needleLength > 0.5) {
                    $started = true;
                }
                continue;
            }
            // A stop keyword ends the collection for the whole document.
            foreach ($stopKeywords as $kw) {
                if (stripos($text, $kw) !== false) {
                    break 3;
                }
            }
            $collected[] = $text;
        }
    }
    if($return_type == 1){
        return $collected;
    }
    $joined = implode("\n", $collected);
    if(!empty($joined) && !mb_check_encoding($joined, 'UTF-8')){
        $joined = mb_convert_encoding($joined, 'UTF-8', 'GBK');
    }
    return $joined;
}
/**
 * Extract the plain text of a PHPWord element, descending into containers.
 *
 * Control characters are replaced by spaces and runs of spaces collapsed.
 * The control-character pattern runs in UTF-8 mode: without the "u" modifier
 * the range \x7F-\x9F matches individual bytes and overwrites continuation
 * bytes of multi-byte characters (e.g. "†" or many CJK characters).
 *
 * @param mixed $element    PHPWord element (or container) to read.
 * @param int   $lineNumber Unused; kept for backward compatibility.
 * @return string Extracted text.
 */
private function getTextFromElement($element,$lineNumber = 0){
    $text = '';
    // PreserveText: field codes and text fragments.
    if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
        // Read the private "text" property through reflection.
        $reflection = new \ReflectionClass($element);
        $property = $reflection->getProperty('text');
        $property->setAccessible(true);
        $textParts = $property->getValue($element);
        foreach ((array) $textParts as $part) {
            if (strpos($part, 'HYPERLINK') !== false) {
                // Decode HTML entities (&quot; -> ") and keep the mailto address.
                $decoded = html_entity_decode($part);
                if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
                    $text .= $match[1] . ' ';
                }
            } else {
                // Plain fragment.
                $text .= $part;
            }
        }
        return $text;
    }
    // Tables and cells (e-mail addresses may sit inside a table).
    if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
        foreach ($element->getRows() as $row) {
            foreach ($row->getCells() as $cell) {
                $text .= $this->getTextFromElement($cell);
            }
        }
        return $text;
    }
    if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }
        return $text;
    }
    // Generic containers: concatenate the text of all children.
    if (method_exists($element, 'getElements')) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }
    }
    // Text runs.
    if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
        $text .= $element->getText();
    }
    // Hyperlinks: emit the mailto address first, then the link text.
    if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
        // NOTE(review): getSource() is presumably the current PHPWord accessor and
        // getTarget() the legacy one — confirm against the installed version.
        $target = '';
        if (method_exists($element, 'getSource')) {
            $target = (string) $element->getSource();
        } elseif (method_exists($element, 'getTarget')) {
            $target = (string) $element->getTarget();
        }
        if (strpos($target, 'mailto:') === 0) {
            $text .= str_replace('mailto:', '', $target) . ' '; // strip the mailto: prefix
        }
        $text .= $element->getText() . ' ';
    }
    // Fields and notes (may carry hidden e-mail addresses).
    // NOTE(review): getContent() may not exist on every PHPWord version; it is
    // called only when present so an unknown API cannot abort parsing.
    if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
        if (method_exists($element, 'getContent')) {
            $text .= $element->getContent() . ' ';
        } elseif (method_exists($element, 'getText') && is_string($element->getText())) {
            $text .= $element->getText() . ' ';
        }
    }
    if ($element instanceof \PhpOffice\PhpWord\Element\Note && method_exists($element, 'getContent')) {
        $text .= $element->getContent() . ' ';
    }
    if ($text === '') {
        return $text;
    }
    // Make sure the text is valid UTF-8 BEFORE running UTF-8 mode patterns.
    if (!mb_check_encoding($text, 'UTF-8')) {
        $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
    }
    // Replace C0/C1 control characters (code points, not bytes) by spaces.
    $cleaned = preg_replace('/[\x00-\x1F\x7F-\x9F]/u', ' ', $text);
    if ($cleaned !== null) {
        $text = $cleaned;
    }
    // Tabs and line breaks are already spaces now; collapse runs of spaces.
    $text = preg_replace('/ +/', ' ', $text);
    return $text;
}
/**
 * Extract the abstract, the keywords and the funding statement from the document.
 *
 * @return array ['status' => 1, 'msg' => ..., 'data' => ['abstrart' => string,
 *               'keywords' => list of strings, 'fund' => string]]. The key
 *               'abstrart' is misspelled; it is kept because callers read it.
 */
public function extractFromWord() {
    $sContent = '';
    $sFundContent = '';
    $sections = empty($this->sections) ? [] : $this->sections;
    foreach ($sections as $section) {
        foreach ($section->getElements() as $element) {
            $textContent = $this->getTextFromElement($element);
            if(empty($textContent)){
                continue;
            }
            // Encoding repair: only convert text that is not valid UTF-8. Guessing
            // among several encodings can re-encode valid UTF-8, and an
            // unrecognised encoding name raises a ValueError on PHP 8.
            if (!mb_check_encoding($textContent, 'UTF-8')) {
                $textContent = mb_convert_encoding($textContent, 'UTF-8', 'GBK');
            }
            $sContent .= $textContent;
            // Mark the end of the keyword paragraph so it can be cut out later.
            if(stripos($textContent, 'Keywords:') !== false){
                $sContent .= "Keywords-End-Flag";
            }
            if(empty($sFundContent)){
                $aFund = $this->getMatchedFundPhrases($sContent);
                if(!empty($aFund[0])){
                    // Keep what follows the funding phrase, up to "Peer review".
                    $position = stripos($sContent, $aFund[0]);
                    $sFundContent = substr($sContent, $position);
                    $sFundContent = trim(str_ireplace($aFund[0], '', $sFundContent));
                    if (preg_match('/^(.*?)Peer review/', $sFundContent, $matches)) {
                        $sFundContent = $matches[1];
                    }
                }
            }
            $sContent .= "\n";
        }
    }
    if(!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')){
        $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
    }
    // Collapse every whitespace run (line breaks included) to a single space.
    $textContent = preg_replace('/(\S)\s+/', '$1 ', $sContent);
    $textContent = trim($textContent);
    // Abstract: from "Abstract" up to "Keywords" (or the end of the text).
    $abstract = '';
    $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords|$)/i';
    if (preg_match($abstractPattern, $textContent, $abstractMatches)) {
        $abstract = trim($abstractMatches[1]);
        $abstract = preg_replace('/\n+/', ' ', $abstract);
        // Drop the colon of an "Abstract: ..." heading (ASCII or full-width).
        $abstract = preg_replace('/^\s*(?::|\xEF\xBC\x9A)\s*/', '', $abstract);
    }
    // Keywords: first the span closed by the end flag, then a fallback that
    // runs up to "Introduction". Both skip the colon after the heading.
    $keywords = [];
    $keywordPattern = '/Keywords\s*(?::|\xEF\xBC\x9A)?\s*(.*?)\s*Keywords-End-Flag/s';
    if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
        $keywords = $this->splitKeywordList($keywordMatches[1]);
    }
    if(empty($keywords)){
        $keywordPattern = '/Keywords\s*(?::|\xEF\xBC\x9A)?\s*([\s\S]*?)(?=Introduction|$)/i';
        if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
            $keywords = $this->splitKeywordList($keywordMatches[1]);
        }
    }
    return [
        'status' => 1,
        'msg' => '提取成功',
        'data' => [
            'abstrart' => $abstract,
            'keywords' => $keywords,
            'fund' => $sFundContent
        ]
    ];
}
/**
 * Split a raw keyword string on commas/semicolons into a clean list.
 *
 * @param string $raw Text captured after the "Keywords" heading.
 * @return array Sequential list of non-empty, trimmed keywords.
 */
private function splitKeywordList($raw) {
    $keywordStr = preg_replace('/\n+/', ' ', trim($raw));
    $keywordStr = trim(rtrim($keywordStr, ';,. ')); // drop trailing separators
    $parts = array_map('trim', preg_split('/[,;]\s*/', $keywordStr));
    // array_values() keeps the result a list so it encodes as a JSON array.
    return array_values(array_filter($parts, function($item) {
        return !empty($item) && !ctype_space($item);
    }));
}
/**
 * List the funding phrases that occur in the given text.
 *
 * Matching ignores case; each phrase is reported once, in its canonical
 * spelling, ordered by first occurrence in the text.
 *
 * @param string $content Text to scan.
 * @return array Canonical phrases found (empty when none).
 */
private function getMatchedFundPhrases($content = '') {
    if (empty($content)) {
        return [];
    }
    // Canonical funding phrases.
    $fundPhrases = [
        'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
        'Funding was provided by', 'Funded in part by'
    ];
    // One alternation over all phrases, "#"-delimited, capturing the hit.
    $alternatives = [];
    foreach ($fundPhrases as $phrase) {
        $alternatives[] = preg_quote($phrase, '#');
    }
    $pattern = '#('.implode('|', $alternatives).')#i';
    preg_match_all($pattern, $content, $hits);
    if (empty($hits[1])) {
        return [];
    }
    $found = [];
    foreach ($hits[1] as $hit) {
        // Map the hit (any letter case) back to its canonical spelling.
        foreach ($fundPhrases as $canonical) {
            if (strcasecmp($hit, $canonical) !== 0) {
                continue;
            }
            if (!in_array($canonical, $found, true)) {
                $found[] = $canonical;
            }
            break;
        }
    }
    return $found;
}
/**
 * Logging hook. Currently a no-op: the echo statement below is commented out,
 * so calling this method produces no output.
 *
 * @param string $msg Message that would be logged.
 */
private function log($msg){
// echo date('[Y-m-d H:i:s] ') . $msg . "\n";
}
}