Files
tougao/application/common/ArticleParserService.php
chengxl c53ca3aa1f 升级
2025-11-03 09:37:14 +08:00

764 lines
33 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use PhpOffice\PhpWord\IOFactory;
use think\Exception;
use ZipArchive;
use RecursiveIteratorIterator;
use RecursiveDirectoryIterator;
use PhpOffice\PhpWord\Settings;
use PhpOffice\PhpWord\Element\TextRun;
use DOMDocument;
use DOMXPath;
// use BadMethodCallException;
class ArticleParserService
{
private $phpWord;
private $sections;
/**
 * Load the Word document at $filePath and cache its sections for the extractor methods.
 *
 * A document that exists but cannot be parsed leaves the instance with an empty section list, so the
 * extractors simply find nothing instead of iterating over null.
 *
 * @param string $filePath Path of the document to parse.
 * @throws Exception When $filePath does not exist.
 */
public function __construct($filePath = '')
{
    if (!file_exists($filePath)) {
        throw new Exception("文档不存在:{$filePath}");
    }

    // Defined state first: every extractor iterates $this->sections.
    $this->phpWord = null;
    $this->sections = [];

    try {
        // Key configuration: turn "read data only" off so the full section structure is kept.
        $reader = IOFactory::createReader();
        $reader->setReadDataOnly(false);
        Settings::setCompatibility(false);
        Settings::setOutputEscapingEnabled(true); // avoid XML escaping conflicts

        // Parse the file exactly once and keep the result.
        $this->phpWord = $reader->load($filePath);
        $this->sections = $this->phpWord->getSections();
    } catch (\Exception $e) {
        // A constructor cannot return a response to the caller; record the failure and keep the
        // empty section list so later calls degrade to "nothing extracted".
        $this->log('Document load failed: ' . $e->getMessage());
    }
}
// 上传并解析文档的入口方法
/**
 * Entry point: validate the uploaded file, parse it and return everything extracted as a JSON string.
 *
 * Status codes in the returned JSON: 1 success, 2 no path given, 3 file missing, 4 file unreadable,
 * 5 no title could be extracted.
 *
 * @param string $sFileUrl Path of the uploaded document.
 * @return string JSON object with 'status', 'msg' and, on success, 'data'.
 */
public static function uploadAndParse($sFileUrl)
{
    // Input validation: the first failing check decides the status code.
    $failure = null;
    if (empty($sFileUrl)) {
        $failure = ['status' => 2, 'msg' => 'Please upload the submission file'];
    } elseif (!file_exists($sFileUrl)) {
        $failure = ['status' => 3, 'msg' => 'The uploaded file does not exist'];
    } elseif (!is_readable($sFileUrl)) {
        $failure = ['status' => 4, 'msg' => 'The uploaded file is unreadable'];
    }
    if ($failure !== null) {
        return json_encode($failure);
    }

    $parser = new self($sFileUrl);

    // The title anchors every later lookup, so give up when it is missing.
    $articleTitle = $parser->getTitle();
    if (empty($articleTitle)) {
        return json_encode(['status' => 5, 'msg' => 'Article title retrieval failed']);
    }
    $result = ['title' => $articleTitle];

    // Authors: the full list plus the names flagged as corresponding authors.
    $authorInfo = $parser->getAuthors($result);
    $result['author'] = empty($authorInfo['author']) ? [] : $authorInfo['author'];
    $result['report'] = empty($authorInfo['report']) ? [] : $authorInfo['report'];

    // Affiliations, then contact details of the corresponding authors.
    $result['company'] = $parser->getCompany($result);
    $result['corresponding'] = $parser->getCorrespondingAuthors($result);

    // Abstract, keywords and funding; keys already present in $result are not overwritten.
    $extracted = $parser->extractFromWord();
    if (!empty($extracted['data'])) {
        $result += $extracted['data'];
    }

    return json_encode(['status' => 1, 'msg' => 'success', 'data' => $result]);
}
// 提取文章标题
/**
 * Return the article title: the first block of the document whose trimmed text is longer than
 * 10 characters.
 *
 * @return string The trimmed title, or '' when no block qualifies.
 */
private function getTitle()
{
    // Tolerate an instance whose document could not be loaded.
    $sections = empty($this->sections) ? [] : $this->sections;

    foreach ($sections as $section) {
        foreach ($section->getElements() as $element) {
            $text = trim($this->getTextFromElement($element));
            // Titles are assumed to be longer than 10 characters; the first such block wins.
            if (mb_strlen($text, 'UTF-8') > 10) {
                return $text;
            }
        }
    }

    return '';
}
// 提取作者
// private function getAuthors($aParam = []) {
// $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
// $sAuthorContent = $this->getNextParagraphAfterText($title);
// if (empty($sAuthorContent)) {
// return ['author' => [], 'report' => []];
// }
// //编码修复
// $possibleEncodings = [
// 'Windows-1252', 'UTF-8', 'GBK', 'GB2312',
// 'Latin-1', 'ISO-8859-1', 'CP1252'
// ];
// $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
// $sAuthorContent = $encodedContent ?: $sAuthorContent;
// //清理不可见字符
// $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
// //修复特殊符号乱码
// $symbolMap = [
// '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†',
// ':' => ':', ',' => ',', '—' => '-',
// '啊' => '' // 针对性移除异常字符“啊”(若为固定乱码)
// ];
// $sAuthorContent = strtr($sAuthorContent, $symbolMap);
// //格式标准化
// $sAuthorContent = str_replace(['', ';', '', '、'], ',', $sAuthorContent); // 统一分隔符
// $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
// $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
// $sAuthorContent = trim($sAuthorContent);
// // 处理作者
// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
// $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
// $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分)
// //标记上标内的逗号+空格(多编号)
// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
// // 原有步骤2正则匹配扩展上标符号支持保持原有逻辑
// $pattern = '/
// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格)
// \s* # 姓名与上标间空格
// ( # 上标组(扩展符号支持)
// \d+ # 起始数字
// (?:[†#*,]|<SEP>\d+)* # 允许:†#*符号、逗号、<SEP>+数字兼容1,†、1,*等)
// )
// \s*,? # 作者间逗号(可选)
// (?=\s|$) # 确保后面是空格或结尾
// /ux';
// preg_match_all($pattern, $tempStr, $matches);
// $authorList = [];
// if(!empty($matches[1])){
// foreach ($matches[1] as $i => $name) {
// $name = trim($name);
// $superscript = trim($matches[2][$i]);
// $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
// // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样)
// $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
// if (!empty($name)) {
// $authorList[] = [
// 'name' => $name,
// 'superscript' => $superscript
// ];
// }
// }
// }else {
// // 按“两个或多个连续空格”拆分(姓名之间的分隔)
// $authorList = array_filter(
// array_map('trim',
// preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
// )
// );
// }
// // //处理作者
// // $authorList = [];
// // // 新正则:匹配“姓名+上标”整体允许上标含逗号如1,†)
// // // 逻辑:姓名以字母/中文开头,上标以数字开头、以符号/数字结尾
// // // if (preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*([\d,†#*]+)/u', $sAuthorContent, $matches)) {
// // if(preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*(\d[\d,†#\s*]*)/u', $sAuthorContent, $matches)){
// // for ($i = 0; $i < count($matches[1]); $i++) {
// // $authorList[] = trim($matches[1][$i] . $matches[2][$i]);
// // }
// // } else {
// // // 按“两个或多个连续空格”拆分(姓名之间的分隔)
// // $authorList = array_filter(
// // array_map('trim',
// // preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
// // )
// // );
// // }
// $aAuthorData = [];
// $aReport = [];
// $namePattern = '/
// (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符)
// [\x{4e00}-\x{9fa5}]+| # 中文姓名
// [\x{1800}-\x{18AF}]+| # 蒙古文姓名
// [A-Z]\.) # 单字母缩写(如 J.
// /ux';
// var_dump($authorList);exit;
// foreach ($authorList as $authorStr) {
// if (empty($authorStr)) continue;
// var_dump($authorList);exit;
// //分离姓名与上标支持上标含逗号如1,†)
// $superscript = '';
// // 新正则:匹配以数字开头、含逗号/符号的完整上标如1,†、2*#
// $authorStr = trim(trim($authorStr,','),' ');
// // if (preg_match('/([\d,†#*]+)$/u', $authorStr, $supMatch)) {
// // if(preg_match('/\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)){
// // if (preg_match('/.*?\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)) {
// // if (preg_match('/.*?\s*([\d,\x{2020}#* ]+?)\s*$/u', $authorStr, $supMatch)) {
// // if (preg_match('/^(.+?)\D*?(\d[\d,#*†,\s]*)$/u', $authorStr, $supMatch)) {
// // $superscript = $supMatch[1];
// // // 移除上标,保留纯姓名(避免残留符号)
// // $nameStr = trim(preg_replace('/' . preg_quote($superscript, '/') . '$/', '', $authorStr));
// // } else {
// // $nameStr = $authorStr;
// // }
// $pattern = '/^(.+?)\s*(\d[\d,#*†\s]*?)\s*$/u';
// if (preg_match($pattern, $authorStr, $supMatch)) {
// $nameStr = empty($supMatch[1]) ? '' : trim($supMatch[1]); // 姓名部分:"Liguo Zhang"
// $superscript = empty($supMatch[2]) ? $nameStr : $nameStr.trim($supMatch[2]); // 上标部分:"1
// // echo "姓名: $nameStr, 上标: $superscript\n";
// } else {
// $nameStr = $authorStr;
// }
// //验证姓名合法性(过滤无效内容)
// if (!preg_match($namePattern, $nameStr)) {
// continue;
// }
// //解析上标信息正确识别1,†中的机构编号和符号)
// $companyId = '';
// $isSuper = 0;
// $isReport = 0;
// if (!empty($superscript)) {
// // 提取机构编号忽略上标中的逗号如1,† → 提取1
// if (preg_match('/(\d+)/', $superscript, $numMatch)) {
// $companyId = $numMatch[1];
// }
// // 识别特殊符号(#为超级作者,*†为通讯作者)
// $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
// $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
// }
// if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
// $nameStr = trim($match[1]);
// }
// $aAuthorData[] = [
// 'name' => $nameStr,
// 'company_id' => $companyId,
// 'is_super' => $isSuper,
// 'is_report' => $isReport
// ];
// if ($isReport) {
// $aReport[] = $nameStr;
// }
// }
// var_dump($aAuthorData);exit;
// return ['author' => $aAuthorData,'report' => array_unique($aReport)];
// }
/**
 * Parse the author line (the first non-empty block after the title) into structured records.
 *
 * An author is expected to look like "Name1,2*": a name followed by affiliation numbers and optional
 * markers, where "#" sets is_super and "*" or "†" sets is_report. When no "name + number" pair is
 * found, the line is split on commas / runs of separators and each piece is treated as one author.
 *
 * @param array $aParam Optional context; a non-empty 'title' is used instead of re-detecting the title.
 * @return array ['author' => list of ['name', 'company_id', 'is_super', 'is_report'],
 *                'report' => list of distinct names flagged is_report]
 */
private function getAuthors($aParam = [])
{
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    if (empty($sAuthorContent)) {
        return ['author' => [], 'report' => []];
    }

    // Re-decode only byte strings that are not valid UTF-8: converting valid UTF-8 from a guessed
    // legacy charset turns it into mojibake.
    if (!mb_check_encoding($sAuthorContent, 'UTF-8')) {
        $sSourceEncoding = mb_check_encoding($sAuthorContent, 'GBK') ? 'GBK' : 'Windows-1252';
        $sAuthorContent = mb_convert_encoding($sAuthorContent, 'UTF-8', $sSourceEncoding);
    }

    // Strip control characters and zero-width marks.
    $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);

    // Repair known garbled daggers and fold fullwidth punctuation to ASCII.
    $symbolMap = [
        'â ' => '†', 'â' => '†', '?†' => '†',
        "\xEF\xBC\x9A" => ':', // fullwidth colon
        "\xEF\xBC\x8C" => ',', // fullwidth comma
        '—' => '-',
        '啊' => '' // targeted removal of a stray character seen in garbled input
    ];
    $sAuthorContent = strtr($sAuthorContent, $symbolMap);

    // Normalise separators: fullwidth comma, semicolons, ideographic comma and " and " all become ",".
    $sAuthorContent = str_replace(["\xEF\xBC\x8C", ';', "\xEF\xBC\x9B", '、'], ',', $sAuthorContent);
    $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent);
    $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent);
    $sAuthorContent = trim($sAuthorContent);

    // Working copy for the "name + superscript" scan.
    $content = str_replace("\xC2\xA0", ' ', $sAuthorContent); // non-breaking space -> space
    $content = preg_replace('/(\d+)\s*([*#†])/u', '$1$2', $content); // "1 *" -> "1*"
    // Protect commas that sit between two affiliation numbers ("1, 2") from the author split.
    $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);

    // Group 1: name (letters of any script, spaces, dots, apostrophes, middle dots, hyphens).
    // Group 2: superscript = digits followed by markers, commas or protected "<SEP>n" numbers.
    $pattern = '/([\p{L}\p{M}\s.\'·\-]+?)\s*(\d+(?:[†#*,]|<SEP>\d+)*)\s*,?(?=\s|$)/u';
    preg_match_all($pattern, $tempStr, $matches);

    $authorList = [];
    if (!empty($matches[1])) {
        foreach ($matches[1] as $i => $name) {
            $name = trim($name);
            if ($name === '') {
                continue;
            }
            $superscript = str_replace('<SEP>', ',', trim($matches[2][$i])); // restore real commas
            $superscript = preg_replace('/,$/', '', $superscript); // drop the author separator
            $authorList[] = ['name' => $name, 'superscript' => $superscript];
        }
    } else {
        // No numbered authors: split on commas or on runs of two or more separator characters.
        $pieces = array_filter(
            array_map('trim', preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent))
        );
        foreach ($pieces as $piece) {
            // The whole piece doubles as the marker source, e.g. "John Smith*".
            $authorList[] = ['name' => $piece, 'superscript' => $piece];
        }
    }

    $aAuthorData = [];
    $aReport = [];
    foreach ($authorList as $author) {
        $superscript = $author['superscript'];
        $nameStr = $author['name'];

        // Every number in the superscript is an affiliation id; matched afresh for each author.
        preg_match_all('/\d+/', $superscript, $numMatch);
        $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
        $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;

        // Keep only the leading name characters, dropping trailing digits and markers.
        if (preg_match('/^([\p{L}\p{M}\s\'.·\-]+)/u', $nameStr, $match)) {
            $nameStr = trim($match[1]);
        }

        $aAuthorData[] = [
            'name' => $nameStr,
            'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
            'is_super' => $isSuper,
            'is_report' => $isReport
        ];
        if ($isReport) {
            $aReport[] = $nameStr;
        }
    }

    // array_unique() keeps the original keys; re-index so the list JSON-encodes as an array.
    return ['author' => $aAuthorData, 'report' => array_values(array_unique($aReport))];
}
// 获取机构
/**
 * Extract the numbered affiliation lines that follow the author line.
 *
 * Only lines that start with digits are kept; the leading number becomes the array key and the rest
 * of the line, minus surrounding dots and leading spaces, becomes the affiliation name.
 *
 * @param array $aParam Optional context: 'title' (article title) and 'authors' (raw author line).
 *                      Missing values are looked up in the document.
 * @return array Map of affiliation number => affiliation name; empty when nothing is found.
 */
private function getCompany($aParam = [])
{
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    // The author line is the anchor: affiliations are whatever follows it.
    $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];
    $sCompany = $this->getContentAfterText($sAuthorContent);
    if (empty($sCompany)) {
        return [];
    }

    // Re-decode only byte strings that are not valid UTF-8: converting valid UTF-8 from a guessed
    // legacy charset turns it into mojibake.
    if (!mb_check_encoding($sCompany, 'UTF-8')) {
        $sSourceEncoding = mb_check_encoding($sCompany, 'GBK') ? 'GBK' : 'Windows-1252';
        $sCompany = mb_convert_encoding($sCompany, 'UTF-8', $sSourceEncoding);
    }

    $aCompany = [];
    foreach (preg_split('/\r\n|\r|\n/', $sCompany) as $line) {
        // Keep "<number><name>" lines only; anything else is not an affiliation entry.
        if (!preg_match('/^(\d+)\s*(.+)$/', trim($line), $match)) {
            continue;
        }
        if (empty($match[1]) || empty($match[2])) {
            continue;
        }
        // "2. Institute ..." -> "Institute ...": strip dots at both ends, then leading spaces.
        $aCompany[$match[1]] = ltrim(trim(ltrim($match[2]), '.'), ' ');
    }

    return $aCompany;
}
// 提取通讯作者含E-mail、地址、电话
/**
 * Collect contact details (e-mail, postal address, telephone) of the corresponding authors.
 *
 * The text that follows the affiliation block is split on "*"; a piece is kept only when it mentions
 * one of the names in $aParam['report'].
 *
 * @param array $aParam Context: 'report' (corresponding-author names, required) and optionally 'title'.
 * @return array List of ['name', 'email', 'postal_address', 'tel']; fields not found are ''.
 */
private function getCorrespondingAuthors($aParam = [])
{
    $aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
    if (empty($aCorrespondingAuthor)) {
        return [];
    }

    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    // The block right after the author line is taken as the raw affiliation text.
    $sCompany = $this->getNextParagraphAfterText($sAuthorContent);
    if (empty($sCompany)) {
        // Fallback: rebuild an anchor from the parsed affiliation names.
        $aCompany = $this->getCompany($aParam);
        $sCompany = implode(' ', array_values($aCompany));
    }

    // Everything after the affiliations, up to the abstract / keywords.
    $corrText = $this->getContentAfterText($sCompany);

    // Re-decode only byte strings that are not valid UTF-8: converting valid UTF-8 from a guessed
    // legacy charset turns it into mojibake.
    if (!mb_check_encoding($corrText, 'UTF-8')) {
        $sSourceEncoding = mb_check_encoding($corrText, 'GBK') ? 'GBK' : 'Windows-1252';
        $corrText = mb_convert_encoding($corrText, 'UTF-8', $sSourceEncoding);
    }

    // Fold fullwidth ":" and "@" to ASCII so the label and e-mail patterns below can match them.
    $corrText = str_replace(["\xEF\xBC\x9A", "\xEF\xBC\xA0"], [':', '@'], $corrText);
    // Collapse whitespace, including non-breaking and ideographic spaces, to single spaces.
    $normalized = preg_replace('/[\s\x{00A0}\x{3000}]+/u', ' ', $corrText);
    $corrText = $normalized === null ? preg_replace('/\s+/', ' ', $corrText) : $normalized;

    // Each corresponding-author note is introduced by "*".
    $corrBlocks = array_filter(array_map('trim', preg_split('/\s*\*\s*/', $corrText)));

    $aCorresponding = [];
    foreach ($corrBlocks as $block) {
        $sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
        if (empty($sName)) {
            continue;
        }
        preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
        preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
        preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
        $aCorresponding[] = [
            'name' => $sName,
            // The address token runs to the next space, so drop punctuation that merely followed it.
            'email' => isset($email[2]) ? rtrim(trim($email[2]), '.,;') : '',
            'postal_address' => isset($address[2]) ? trim($address[2]) : '',
            'tel' => isset($tel[2]) ? trim($tel[2]) : ''
        ];
    }

    return $aCorresponding;
}
//匹配通讯作者姓名
/**
 * Find which corresponding-author name is mentioned in a block of text.
 *
 * Matching ignores case for every script, and a name is also tried with its space-separated parts
 * reversed ("John Smith" also matches "Smith John").
 *
 * @param string $block     Text to search.
 * @param array  $corrNames Candidate names, tried in order.
 * @return string The first candidate found, exactly as given in $corrNames, or '' when none matches.
 */
private function matchCorrespondingName($block, $corrNames)
{
    foreach ($corrNames as $name) {
        // A blank candidate would "match" anything (or trip an empty-needle warning); skip it.
        if (trim((string) $name) === '') {
            continue;
        }
        if (mb_stripos($block, $name, 0, 'UTF-8') !== false) {
            return $name;
        }
        $nameParts = explode(' ', $name);
        if (count($nameParts) >= 2) {
            $reversedName = implode(' ', array_reverse($nameParts));
            if (mb_stripos($block, $reversedName, 0, 'UTF-8') !== false) {
                return $name;
            }
        }
    }

    return '';
}
// 获取目标文本的下一个段落
/**
 * Return the first non-blank block that comes after the block containing $targetText.
 *
 * The search is case-insensitive and spans all sections in document order.
 *
 * @param string $targetText Text identifying the anchor block.
 * @return string Text of the following block, or '' when the anchor is empty, is not found, or is
 *                the last non-blank block.
 */
private function getNextParagraphAfterText($targetText)
{
    // An empty anchor cannot identify a block (and is not a valid needle for stripos()).
    if ((string) $targetText === '') {
        return '';
    }

    $found = false;
    // Tolerate an instance whose document could not be loaded.
    $sections = empty($this->sections) ? [] : $this->sections;
    foreach ($sections as $section) {
        foreach ($section->getElements() as $element) {
            $text = $this->getTextFromElement($element);
            // Skip blocks with no visible text, including whitespace-only ones.
            if (empty($text) || trim($text) === '') {
                continue;
            }
            if ($found) {
                return $text;
            }
            if (stripos($text, $targetText) !== false) {
                $found = true;
            }
        }
    }

    return '';
}
// 获取目标文本后的所有内容
/**
 * Collect the blocks that follow the block matching $targetText, one per line.
 *
 * The anchor is located fuzzily: both texts are reduced to lower-case ASCII letters and digits and
 * must share more than half of the target's characters (similar_text). A target that has no such
 * characters is located by exact containment instead. Collection stops at the first block containing
 * a stop keyword (关键词, Key words, 摘要, Abstract) or after 200 collected blocks.
 *
 * @param string $targetText Text of the anchor block.
 * @return string Collected blocks joined with "\n"; '' when the anchor is empty or not found.
 */
private function getContentAfterText($targetText)
{
    $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract'];
    $maxLines = 200;

    $sTarget = trim((string) $targetText);
    if ($sTarget === '') {
        return '';
    }
    // Normalise the target once instead of once per document element.
    $cleanTarget = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($sTarget));
    $targetLength = strlen($cleanTarget);

    $found = false;
    $content = [];
    // Tolerate an instance whose document could not be loaded.
    $sections = empty($this->sections) ? [] : $this->sections;
    foreach ($sections as $section) {
        foreach ($section->getElements() as $element) {
            if (count($content) >= $maxLines) {
                return implode("\n", $content);
            }
            $text = trim($this->getTextFromElement($element));
            if (empty($text)) {
                continue;
            }

            if (!$found) {
                if ($targetLength > 0) {
                    // Fuzzy match: more than 50% of the target's characters must be shared.
                    $cleanText = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($text));
                    $found = similar_text($cleanText, $cleanTarget) / $targetLength > 0.5;
                } else {
                    // Nothing to compare fuzzily (e.g. a non-Latin author line): require containment.
                    $found = strpos($text, $sTarget) !== false;
                }
                continue; // the anchor block itself is never part of the result
            }

            foreach ($stopKeywords as $kw) {
                if (stripos($text, $kw) !== false) {
                    return implode("\n", $content);
                }
            }
            $content[] = $text;
        }
    }

    return implode("\n", $content);
}
// 统一提取元素文本
/**
 * Extract the visible text of a document element, descending into containers.
 *
 * PreserveText, Table and Cell elements return their concatenated text as-is; every other element
 * has control characters replaced by spaces and whitespace runs collapsed to a single space.
 *
 * @param mixed $element    A PhpWord element (or any object exposing getElements()).
 * @param int   $lineNumber Unused; kept so existing callers stay valid.
 * @return string The extracted text, '' when the element carries none.
 */
private function getTextFromElement($element, $lineNumber = 0)
{
    $text = '';

    // PreserveText: a list of text parts, some of which are field instructions.
    if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
        foreach ((array) $element->getText() as $part) {
            if (strpos($part, 'HYPERLINK') !== false) {
                // Field instructions are HTML-escaped (&quot; etc.); decode before looking for mailto:.
                $decoded = html_entity_decode($part);
                if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
                    $text .= $match[1] . ' ';
                }
            } else {
                $text .= $part;
            }
        }
        return $text;
    }

    // Tables and cells: e-mail addresses may live inside a table.
    if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
        foreach ($element->getRows() as $row) {
            foreach ($row->getCells() as $cell) {
                $text .= $this->getTextFromElement($cell);
            }
        }
        return $text;
    }
    if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }
        return $text;
    }

    // Any other container: concatenate the text of all children.
    if (method_exists($element, 'getElements')) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }
    }

    // Plain (possibly formatted) text.
    if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
        $text .= $element->getText();
    }

    // Hyperlinks: emit the mailto: address first, then the link caption.
    if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
        $target = $element->getTarget();
        if (strpos($target, 'mailto:') === 0) {
            $text .= str_replace('mailto:', '', $target) . ' ';
        }
        $text .= $element->getText() . ' ';
    }

    // Fields and notes may carry a hidden address. The call is guarded so an element class without
    // getContent() is skipped instead of raising a fatal error.
    // NOTE(review): confirm which PhpWord element classes actually expose getContent().
    if ($element instanceof \PhpOffice\PhpWord\Element\Field
        || $element instanceof \PhpOffice\PhpWord\Element\Note) {
        if (method_exists($element, 'getContent')) {
            $text .= $element->getContent() . ' ';
        }
    }

    // Replace control characters (C0, DEL, C1) with spaces. The "u" modifier makes the class match
    // code points; matched byte-wise, 0x80-0x9F would hit the continuation bytes of ordinary UTF-8
    // characters and corrupt them.
    $cleaned = preg_replace('/[\x{00}-\x{1F}\x{7F}-\x{9F}]/u', ' ', $text);
    if ($cleaned === null) {
        // Not valid UTF-8: only touch ASCII control bytes and leave everything >= 0x80 alone.
        $cleaned = preg_replace('/[\x00-\x1F\x7F]/', ' ', $text);
    }

    // Collapse whitespace runs (tabs and line breaks are already spaces at this point).
    return preg_replace('/\s+/', ' ', $cleaned);
}
/**
* 从 Word 文档提取摘要和关键词
* @return array 提取结果
*/
public function extractFromWord()
{
    $sContent = '';
    $sFundContent = '';

    // Tolerate an instance whose document could not be loaded.
    $sections = empty($this->sections) ? [] : $this->sections;
    foreach ($sections as $section) {
        foreach ($section->getElements() as $element) {
            $textContent = $this->getTextFromElement($element);
            if (empty($textContent)) {
                continue;
            }

            // Re-decode only byte strings that are not valid UTF-8; the block's text is always
            // appended, so a conversion problem can no longer drop it.
            $sUtf8 = $textContent;
            if (!mb_check_encoding($sUtf8, 'UTF-8')) {
                $sSourceEncoding = mb_check_encoding($sUtf8, 'GBK') ? 'GBK' : 'Windows-1252';
                $sUtf8 = mb_convert_encoding($sUtf8, 'UTF-8', $sSourceEncoding);
            }
            $sContent .= $sUtf8;

            // Mark the end of the block that holds the keyword list so it can be cut out below.
            if (stripos($textContent, 'Keywords:') !== false) {
                $sContent .= "Keywords-End-Flag";
            }

            // Funding statement: text after the first funding phrase, up to "Peer review" if present.
            if (empty($sFundContent)) {
                $aFund = $this->getMatchedFundPhrases($sContent);
                if (!empty($aFund[0])) {
                    $position = stripos($sContent, $aFund[0]);
                    $sFundContent = substr($sContent, $position);
                    $sFundContent = trim(str_ireplace($aFund[0], '', $sFundContent));
                    if (preg_match('/^(.*?)Peer review/', $sFundContent, $matches)) {
                        $sFundContent = $matches[1];
                    }
                }
            }
            $sContent .= "\n";
        }
    }

    // Collapse every whitespace run (line breaks included) to one space.
    $textContent = trim(preg_replace('/(\S)\s+/', '$1 ', $sContent));

    // Abstract: everything between "Abstract" and "Keywords:" (or the end of the text).
    $abstract = '';
    if (preg_match('/Abstract\s*([\s\S]*?)(?=Keywords:|$)/i', $textContent, $abstractMatches)) {
        $abstract = trim($abstractMatches[1]);
    }

    // Keywords: everything between "Keywords:" and the end-of-block marker. Case-insensitive, like
    // the check that placed the marker.
    $keywords = [];
    if (preg_match('/Keywords:\s*(.*?)\s*Keywords-End-Flag/is', $textContent, $keywordMatches)) {
        $keywordStr = trim(rtrim(trim($keywordMatches[1]), ';,. '));
        $pieces = array_map('trim', preg_split('/[,;]\s*/', $keywordStr));
        // Drop empty pieces and re-index so the result JSON-encodes as an array.
        $keywords = array_values(array_filter($pieces, function ($item) {
            return !empty($item) && !ctype_space($item);
        }));
    }

    return [
        'status' => 1,
        'msg' => '提取成功',
        'data' => [
            'abstrart' => $abstract, // legacy misspelt key, kept for existing consumers
            'abstract' => $abstract,
            'keywords' => $keywords,
            'fund' => $sFundContent
        ]
    ];
}
/**
 * List the funding phrases that occur in $content.
 *
 * Matching ignores case; each phrase is reported once, in its canonical spelling, ordered by first
 * occurrence in the text.
 *
 * @param string $content Text to scan.
 * @return array List of canonical phrases; empty when $content is empty or contains none.
 */
private function getMatchedFundPhrases($content = '')
{
    if (empty($content)) {
        return [];
    }

    // Canonical funding phrases.
    $fundPhrases = [
        'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
        'Funding was provided by', 'Funded in part by'
    ];

    // One alternation of all phrases, quoted for the "#" delimiter; the group captures the hit.
    $alternatives = [];
    foreach ($fundPhrases as $phrase) {
        $alternatives[] = preg_quote($phrase, '#');
    }
    $pattern = '#(' . implode('|', $alternatives) . ')#i';

    preg_match_all($pattern, $content, $hits);
    if (empty($hits[1])) {
        return [];
    }

    // Map every hit (in whatever case it appeared) back to its canonical phrase. Using the phrase
    // as array key removes duplicates while keeping first-seen order.
    $seen = [];
    foreach ($hits[1] as $hit) {
        foreach ($fundPhrases as $phrase) {
            if (strcasecmp($hit, $phrase) === 0) {
                $seen[$phrase] = true;
                break;
            }
        }
    }

    return array_keys($seen);
}
//日志打印
/**
 * Logging hook. Currently a no-op: the only statement in the body is commented out, so $msg is
 * discarded.
 *
 * @param mixed $msg Message to log.
 */
private function log($msg){
// echo date('[Y-m-d H:i:s] ') . $msg . "\n";
}
}