This commit is contained in:
chengxl
2025-11-03 09:37:14 +08:00
parent 65ab2379f8
commit c53ca3aa1f

View File

@@ -0,0 +1,764 @@
<?php
namespace app\common;
use PhpOffice\PhpWord\IOFactory;
use think\Exception;
use ZipArchive;
use RecursiveIteratorIterator;
use RecursiveDirectoryIterator;
use PhpOffice\PhpWord\Settings;
use PhpOffice\PhpWord\Element\TextRun;
use DOMDocument;
use DOMXPath;
// use BadMethodCallException;
/**
 * Extracts manuscript metadata from a Word document loaded through PhpOffice\PhpWord:
 * title, authors, affiliations, corresponding-author contact details, abstract,
 * keywords and funding text.
 *
 * Every heuristic works on the plain text of the top-level elements of each
 * section, visited in document order.
 */
class ArticleParserService
{
    /** @var object Document object returned by the PhpWord reader. */
    private $phpWord;

    /** @var array Sections of the loaded document, in document order. */
    private $sections;

    /**
     * Load a Word document and cache its sections for the extraction methods.
     *
     * @param string $filePath Path of the document to parse.
     *
     * @throws Exception When the file does not exist or the reader cannot load it.
     */
    public function __construct($filePath = '')
    {
        if (!file_exists($filePath)) {
            throw new Exception("文档不存在:{$filePath}");
        }
        try {
            // "Read data only" is switched off so the reader keeps the full section structure.
            $reader = IOFactory::createReader();
            $reader->setReadDataOnly(false);
            Settings::setCompatibility(false);
            Settings::setOutputEscapingEnabled(true); // avoid XML escaping conflicts
            // Parse the file exactly once and reuse the result.
            $this->phpWord = $reader->load($filePath);
            $this->sections = $this->phpWord->getSections();
        } catch (\Exception $e) {
            // A value returned from a constructor is discarded by `new`, so the failure
            // has to be reported by throwing; the original exception is kept as `previous`.
            throw new Exception('Failed to load document: ' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Entry point: validate the uploaded file, parse it and return every extracted field.
     *
     * Status codes in the returned payload: 1 success, 2 missing path, 3 file not found,
     * 4 file unreadable, 5 no title found, 6 document could not be loaded.
     *
     * @param string $sFileUrl Path of the uploaded document.
     *
     * @return string JSON-encoded array with `status`, `msg` and, on success, `data`.
     */
    public static function uploadAndParse($sFileUrl)
    {
        // Required-value check.
        if (empty($sFileUrl)) {
            return json_encode(['status' => 2, 'msg' => 'Please upload the submission file']);
        }
        // The file must exist and be readable.
        if (!file_exists($sFileUrl)) {
            return json_encode(['status' => 3, 'msg' => 'The uploaded file does not exist']);
        }
        if (!is_readable($sFileUrl)) {
            return json_encode(['status' => 4, 'msg' => 'The uploaded file is unreadable']);
        }

        // Load the document; a load failure is reported as JSON like every other error here.
        try {
            $oDealFile = new self($sFileUrl);
        } catch (\Exception $e) {
            return json_encode([
                'status' => 6,
                'msg' => 'The uploaded file could not be parsed: ' . $e->getMessage(),
            ]);
        }

        // Title.
        $sTitle = $oDealFile->getTitle();
        if (empty($sTitle)) {
            return json_encode(['status' => 5, 'msg' => 'Article title retrieval failed']);
        }

        // Authors (all authors + names flagged as corresponding authors).
        $aParam = ['title' => $sTitle];
        $aAuthor = $oDealFile->getAuthors($aParam);
        $aParam['author'] = empty($aAuthor['author']) ? [] : $aAuthor['author'];
        $aParam['report'] = empty($aAuthor['report']) ? [] : $aAuthor['report'];

        // Affiliations.
        $aParam['company'] = $oDealFile->getCompany($aParam);

        // Corresponding-author contact details.
        $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam);

        // Abstract, keywords and funding; `+=` never overwrites the keys set above.
        $aContent = $oDealFile->extractFromWord();
        $aParam += empty($aContent['data']) ? [] : $aContent['data'];

        return json_encode(['status' => 1, 'msg' => 'success', 'data' => $aParam]);
    }

    /**
     * Return the article title: the trimmed text of the first top-level element
     * that is longer than 10 characters.
     *
     * @return string The title, or '' when no element qualifies.
     */
    private function getTitle()
    {
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $candidate = trim($this->getTextFromElement($element));
                if (mb_strlen($candidate) > 10) {
                    return $candidate;
                }
            }
        }
        return '';
    }

    /**
     * Parse the author line, i.e. the first non-empty paragraph after the title.
     *
     * Each author is expected as "Name" followed by a superscript made of affiliation
     * numbers and markers: `#` sets `is_super`, `*` or `†` sets `is_report`.
     * When no "name + number" pair can be matched, the line is split on commas or
     * runs of two or more separator characters and each chunk is treated as one author.
     *
     * @param array $aParam Optional context; `title` is reused when present.
     *
     * @return array `author`: list of [name, company_id (list of digit strings), is_super, is_report];
     *               `report`: list of distinct names flagged as corresponding authors.
     */
    private function getAuthors($aParam = [])
    {
        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = $this->getNextParagraphAfterText($title);
        if (empty($sAuthorContent)) {
            return ['author' => [], 'report' => []];
        }

        $sAuthorContent = $this->ensureUtf8($sAuthorContent);

        // Strip control characters and zero-width / directional marks.
        $stripped = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
        if ($stripped !== null) {
            // null means the text was not valid UTF-8; keep it unchanged in that case.
            $sAuthorContent = $stripped;
        }

        // Symbol repair table, kept exactly as found.
        // NOTE(review): several pairs are identity mappings as written; the keys were
        // presumably mojibake / full-width characters originally - confirm against the repo.
        // NOTE(review): 'â' => '†' also rewrites a legitimate "â" inside a name.
        $symbolMap = [
            '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†',
            ':' => ':', ',' => ',', '—' => '-',
            '啊' => '' // drops a stray character the original author observed as fixed garbage
        ];
        $sAuthorContent = strtr($sAuthorContent, $symbolMap);

        // Normalise separators and whitespace.
        // NOTE(review): the empty search strings below never match; their intended
        // characters cannot be recovered from this file.
        $sAuthorContent = str_replace(['', ';', '', '、'], ',', $sAuthorContent);
        $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // "and" -> comma
        $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent);
        $sAuthorContent = trim($sAuthorContent);

        $content = str_replace("\xC2\xA0", ' ', $sAuthorContent); // non-breaking space -> space
        $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // "1 *" -> "1*"
        $content = preg_replace('/,†/', ',†', $content); // identity as written (see note above)

        // Protect commas that separate two affiliation numbers ("1,2") from the author split.
        $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);

        // Group 1: name (letters, whitespace, dots, hyphens).
        // Group 2: superscript (digits followed by any of † # * , or <SEP>digits).
        $pattern = '/
([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格)
\s* # 姓名与上标间空格
( # 上标组(扩展符号支持)
\d+ # 起始数字
(?:[†#*,]|<SEP>\d+)* # 允许:†#*符号、逗号、<SEP>+数字兼容1,†、1,*等)
)
\s*,? # 作者间逗号(可选)
(?=\s|$) # 确保后面是空格或结尾
/ux';
        preg_match_all($pattern, $tempStr, $matches);

        $authorList = [];
        if (!empty($matches[1])) {
            foreach ($matches[1] as $i => $name) {
                $name = trim($name);
                if (empty($name)) {
                    continue;
                }
                $superscript = str_replace('<SEP>', ',', trim($matches[2][$i])); // restore "1,2"
                $superscript = preg_replace('/,$/', '', $superscript);            // drop trailing comma
                $authorList[] = ['name' => $name, 'superscript' => $superscript];
            }
        } else {
            // Fallback: split on a comma or on two or more consecutive separator characters.
            $chunks = preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent);
            if ($chunks === false) {
                $chunks = [$sAuthorContent];
            }
            $authorList = array_filter(array_map('trim', $chunks));
        }

        $aAuthorData = [];
        $aReport = [];
        foreach ($authorList as $entry) {
            if (empty($entry)) {
                continue;
            }
            if (is_array($entry)) {
                $nameStr = $entry['name'];
                $superscript = $entry['superscript'];
            } else {
                // Fallback chunks carry name and markers in one string.
                $nameStr = $entry;
                $superscript = $entry;
            }

            // Affiliation numbers; recomputed per author so nothing leaks between iterations.
            preg_match_all('/\d+/', $superscript, $numMatch);
            // "#" marks a super author, "*" or "†" a corresponding author.
            $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
            $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;

            // Keep only the leading Latin-letter part of the name.
            if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
                $nameStr = trim($match[1]);
            }

            $aAuthorData[] = [
                'name' => $nameStr,
                'company_id' => $numMatch[0],
                'is_super' => $isSuper,
                'is_report' => $isReport
            ];
            if ($isReport) {
                $aReport[] = $nameStr;
            }
        }

        // array_values keeps `report` a JSON list even after duplicates are removed.
        return ['author' => $aAuthorData, 'report' => array_values(array_unique($aReport))];
    }

    /**
     * Parse the numbered affiliation lines that follow the author line.
     *
     * @param array $aParam Optional context: `title`, and `authors` (raw author line text);
     *                      missing values are re-derived from the document.
     *
     * @return array Map of affiliation number (digit string) => affiliation text.
     */
    private function getCompany($aParam = [])
    {
        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];

        $sCompany = $this->getContentAfterText($sAuthorContent);
        if (empty($sCompany)) {
            return [];
        }
        $sCompany = $this->ensureUtf8($sCompany);
        $sCompany = str_replace(["\r\n", "\r"], "\n", $sCompany);

        $aCompany = [];
        foreach (explode("\n", $sCompany) as $line) {
            // Only lines of the form "<number><text>" are affiliations.
            if (!preg_match('/^(\d+)\s*(.+)$/', trim($line), $match)) {
                continue;
            }
            if (empty($match[1]) || empty($match[2])) {
                continue;
            }
            // Remove surrounding dots and leading blanks left after the number.
            $aCompany[$match[1]] = ltrim(trim(ltrim($match[2]), '.'), ' ');
        }
        return $aCompany;
    }

    /**
     * Extract contact details (e-mail, postal address, telephone) for the
     * corresponding authors from the text that follows the affiliation block.
     *
     * @param array $aParam Context: `report` (corresponding-author names, required),
     *                      optional `title`.
     *
     * @return array List of [name, email, postal_address, tel]; fields not found are ''.
     */
    private function getCorrespondingAuthors($aParam = [])
    {
        $aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
        if (empty($aCorrespondingAuthor)) {
            return [];
        }

        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = $this->getNextParagraphAfterText($title);
        // Raw text of the paragraph right after the author line (first affiliation paragraph).
        $sCompany = $this->getNextParagraphAfterText($sAuthorContent);
        if (empty($sCompany)) {
            // Fallback: rebuild the anchor text from the parsed affiliations.
            $sCompany = implode(' ', array_values($this->getCompany($aParam)));
        }

        $corrText = $this->ensureUtf8($this->getContentAfterText($sCompany));

        // Text clean-up, kept exactly as found.
        // NOTE(review): the empty search strings and the space-for-space replacement are
        // no-ops as written; their intended characters cannot be recovered from this file.
        $corrText = str_replace(['', ''], [':', '@'], $corrText);
        $corrText = preg_replace('/\s+/', ' ', $corrText);
        $corrText = str_replace(' ', ' ', $corrText);

        // Each corresponding-author entry is introduced by "*".
        $corrBlocks = array_filter(array_map('trim', preg_split('/\s*\*\s*/', $corrText)));

        $aCorresponding = [];
        foreach ($corrBlocks as $block) {
            $sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
            if (empty($sName)) {
                continue;
            }
            preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
            preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
            preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
            $aCorresponding[] = [
                'name' => $sName,
                'email' => isset($email[2]) ? trim($email[2]) : '',
                'postal_address' => isset($address[2]) ? trim($address[2]) : '',
                'tel' => isset($tel[2]) ? trim($tel[2]) : ''
            ];
        }
        return $aCorresponding;
    }

    /**
     * Find which corresponding-author name occurs in a text block.
     *
     * The comparison is case-insensitive and also tries the name with its
     * space-separated words in reverse order ("John Smith" / "Smith John").
     *
     * @param string $block     Text to search.
     * @param array  $corrNames Candidate names.
     *
     * @return string The first candidate found (original spelling), or ''.
     */
    private function matchCorrespondingName($block, $corrNames)
    {
        $haystack = strtolower($block);
        foreach ($corrNames as $candidate) {
            $variants = [$candidate];
            $words = explode(' ', $candidate);
            if (count($words) >= 2) {
                $variants[] = implode(' ', array_reverse($words));
            }
            foreach ($variants as $variant) {
                if (strpos($haystack, strtolower($variant)) !== false) {
                    return $candidate;
                }
            }
        }
        return '';
    }

    /**
     * Return the text of the first non-empty element that follows the first
     * element containing $targetText (case-insensitive).
     *
     * @param string $targetText Text to look for.
     *
     * @return string The following paragraph's text, or '' when there is none.
     */
    private function getNextParagraphAfterText($targetText)
    {
        // An empty needle matches everything on PHP 8 and raises a warning on PHP 7;
        // there is nothing meaningful to anchor on either way.
        if ((string) $targetText === '') {
            return '';
        }
        $anchorSeen = false;
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $text = $this->getTextFromElement($element);
                if (empty($text)) {
                    continue;
                }
                if ($anchorSeen) {
                    return $text;
                }
                $anchorSeen = stripos($text, $targetText) !== false;
            }
        }
        return '';
    }

    /**
     * Collect the text of the elements that follow $targetText.
     *
     * The anchor is the first element whose alphanumeric content is more than 50%
     * similar (similar_text) to that of $targetText. Collection stops at the first
     * element containing a stop keyword, or after 200 collected entries.
     *
     * @param string $targetText Anchor text.
     *
     * @return string Collected element texts joined with "\n" ('' when the anchor is not found).
     */
    private function getContentAfterText($targetText)
    {
        $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract'];
        $maxLines = 200;

        // The normalised anchor does not change while scanning, so compute it once.
        $cleanTarget = preg_replace('/[^a-zA-Z0-9]/', '', strtolower((string) $targetText));
        $targetLength = strlen($cleanTarget);
        if ($targetLength === 0) {
            return ''; // nothing to anchor on, so nothing can follow it
        }

        $found = false;
        $content = [];
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                if (count($content) >= $maxLines) {
                    break 2;
                }
                $text = trim($this->getTextFromElement($element));
                if (empty($text)) {
                    continue;
                }
                if (!$found) {
                    // Compare on letters and digits only so punctuation differences are ignored.
                    $cleanText = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($text));
                    if (similar_text($cleanText, $cleanTarget) / $targetLength > 0.5) {
                        $found = true;
                    }
                    continue;
                }
                foreach ($stopKeywords as $kw) {
                    if (stripos($text, $kw) !== false) {
                        break 3; // stop keyword reached: leave all three loops
                    }
                }
                $content[] = $text;
            }
        }
        return implode("\n", $content);
    }

    /**
     * Flatten a document element to plain text.
     *
     * E-mail addresses are taken from `mailto:` targets of hyperlinks and HYPERLINK
     * fields. For ordinary elements control characters are replaced by blanks and
     * whitespace runs are collapsed; PreserveText, Table and Cell results are returned
     * as concatenated.
     *
     * @param object $element    Element to read.
     * @param int    $lineNumber Unused; kept for signature compatibility.
     *
     * @return string Extracted text.
     */
    private function getTextFromElement($element, $lineNumber = 0)
    {
        $text = '';

        // PreserveText: its text is either a string or a list of parts.
        if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
            foreach ((array) $element->getText() as $part) {
                if (strpos($part, 'HYPERLINK') !== false) {
                    // Decode HTML entities (&quot; -> ") before looking for the address.
                    $decoded = html_entity_decode($part);
                    if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
                        $text .= $match[1] . ' ';
                    }
                } else {
                    $text .= $part;
                }
            }
            return $text;
        }

        // Tables and cells (contact details may sit inside a table).
        if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
            foreach ($element->getRows() as $row) {
                foreach ($row->getCells() as $cell) {
                    $text .= $this->getTextFromElement($cell);
                }
            }
            return $text;
        }
        if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
            foreach ($element->getElements() as $child) {
                $text .= $this->getTextFromElement($child);
            }
            return $text;
        }

        // Any other container: recurse into its children.
        if (method_exists($element, 'getElements')) {
            foreach ($element->getElements() as $child) {
                $text .= $this->getTextFromElement($child);
            }
        }

        // Plain (possibly formatted) text.
        if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
            $text .= $element->getText();
        }

        // Hyperlinks: the target comes first because it may be an e-mail address.
        if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
            $target = $element->getTarget();
            if (strpos($target, 'mailto:') === 0) {
                $text .= str_replace('mailto:', '', $target) . ' ';
            }
            $text .= $element->getText() . ' ';
        }

        // Fields and notes may carry further text.
        // NOTE(review): getContent() could not be confirmed for these classes from this
        // file, so the call is guarded and getText() is used when it is missing.
        if ($element instanceof \PhpOffice\PhpWord\Element\Field
            || $element instanceof \PhpOffice\PhpWord\Element\Note) {
            $extra = '';
            if (method_exists($element, 'getContent')) {
                $extra = $element->getContent();
            } elseif (method_exists($element, 'getText')) {
                $extra = $element->getText();
            }
            if (is_object($extra)) {
                $extra = $this->getTextFromElement($extra);
            }
            $text .= $extra . ' ';
        }

        // Replace C0/C1 control characters with blanks. The /u modifier makes the class
        // match code points; matched byte-wise, 0x80-0x9F would also hit UTF-8
        // continuation bytes and break characters such as "–", "’" or "†".
        $cleaned = preg_replace('/[\x{00}-\x{1F}\x{7F}-\x{9F}]/u', ' ', $text);
        if ($cleaned === null) {
            // Not valid UTF-8: fall back to removing ASCII control bytes only.
            $cleaned = preg_replace('/[\x00-\x1F\x7F]/', ' ', $text);
        }
        return preg_replace('/\s+/', ' ', $cleaned); // collapse whitespace runs
    }

    /**
     * Extract abstract, keywords and funding statement from the document text.
     *
     * @return array ['status' => 1, 'msg' => string,
     *                'data' => ['abstrart' => string, 'keywords' => list of strings, 'fund' => string]].
     *               NOTE(review): the key `abstrart` is misspelt but is part of the
     *               payload callers receive, so it is kept.
     */
    public function extractFromWord()
    {
        $sContent = '';
        $sFundContent = '';
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $textContent = $this->getTextFromElement($element);
                if (empty($textContent)) {
                    continue;
                }
                $sContent .= $this->ensureUtf8($textContent);

                // Mark the end of the paragraph holding the keywords so they can be cut out below.
                if (stripos($textContent, 'Keywords:') !== false) {
                    $sContent .= "Keywords-End-Flag";
                }

                // Funding text: everything from the first funding phrase to the end of the
                // text gathered so far, minus the phrase itself and anything from "Peer review" on.
                if (empty($sFundContent)) {
                    $aFund = $this->getMatchedFundPhrases($sContent);
                    if (!empty($aFund[0])) {
                        $position = stripos($sContent, $aFund[0]);
                        $sFundContent = substr($sContent, $position);
                        $sFundContent = trim(str_ireplace($aFund[0], '', $sFundContent));
                        if (preg_match('/^(.*?)Peer review/', $sFundContent, $matches)) {
                            $sFundContent = $matches[1];
                        }
                    }
                }
                $sContent .= "\n";
            }
        }

        // Collapse every whitespace run that follows a non-blank character into one space.
        $textContent = trim(preg_replace('/(\S)\s+/', '$1 ', $sContent));

        // Abstract: text between "Abstract" and "Keywords:" (or the end of the text).
        $abstract = '';
        if (preg_match('/Abstract\s*([\s\S]*?)(?=Keywords:|$)/i', $textContent, $abstractMatches)) {
            $abstract = preg_replace('/\n+/', ' ', trim($abstractMatches[1]));
        }

        // Keywords: text between "Keywords:" and the marker inserted above. Matching is
        // case-insensitive, like the marker insertion and the abstract pattern.
        $keywords = [];
        if (preg_match('/Keywords:\s*(.*?)\s*Keywords-End-Flag/si', $textContent, $keywordMatches)) {
            $keywordStr = preg_replace('/\n+/', ' ', trim($keywordMatches[1]));
            $keywordStr = trim(rtrim($keywordStr, ';,. ')); // drop trailing separators
            $parts = array_map('trim', preg_split('/[,;]\s*/', $keywordStr));
            // array_values re-indexes after filtering so the JSON output stays a list.
            $keywords = array_values(array_filter($parts, function ($item) {
                return !empty($item) && !ctype_space($item);
            }));
        }

        return [
            'status' => 1,
            'msg' => '提取成功',
            'data' => [
                'abstrart' => $abstract,
                'keywords' => $keywords,
                'fund' => $sFundContent
            ]
        ];
    }

    /**
     * List the funding phrases that occur in the text (case-insensitive).
     *
     * @param string $content Text to scan.
     *
     * @return array Distinct phrases in their canonical spelling, ordered by first occurrence.
     */
    private function getMatchedFundPhrases($content = '')
    {
        if (empty($content)) {
            return [];
        }
        $fundPhrases = [
            'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
            'Funding was provided by', 'Funded in part by'
        ];

        // One alternation over all phrases, each quoted for the "#" delimiter.
        $quoted = [];
        foreach ($fundPhrases as $phrase) {
            $quoted[] = preg_quote($phrase, '#');
        }
        preg_match_all('#(' . implode('|', $quoted) . ')#i', $content, $matches);
        if (empty($matches[1])) {
            return [];
        }

        // Map every hit (whatever its letter case) back to the canonical phrase.
        $matched = [];
        foreach ($matches[1] as $hit) {
            foreach ($fundPhrases as $original) {
                if (strcasecmp($hit, $original) === 0) {
                    $matched[] = $original;
                    break;
                }
            }
        }
        return array_values(array_unique($matched));
    }

    /**
     * Return $text as UTF-8, converting only when it is not already valid UTF-8.
     *
     * Running a detection list headed by a single-byte encoding over valid UTF-8
     * re-encodes every multi-byte character into mojibake, so valid input is
     * returned untouched.
     *
     * @param string $text Text to normalise.
     *
     * @return string UTF-8 text, or the input unchanged when conversion fails.
     */
    private function ensureUtf8($text)
    {
        if ($text === null || $text === '' || mb_check_encoding($text, 'UTF-8')) {
            return $text;
        }
        // NOTE(review): 'Latin-1' from the original list is left out because it is
        // believed not to be a registered mbstring name (ISO-8859-1 covers it), and
        // 'CP1252' duplicates Windows-1252 - confirm.
        $converted = @mb_convert_encoding($text, 'UTF-8', 'Windows-1252,GBK,GB2312,ISO-8859-1');
        return $converted ?: $text;
    }

    /**
     * Debug logging hook; intentionally does nothing at present.
     *
     * @param string $msg Message to log.
     */
    private function log($msg)
    {
        // echo date('[Y-m-d H:i:s] ') . $msg . "\n";
    }
}