1264 lines
54 KiB
PHP
1264 lines
54 KiB
PHP
<?php
|
||
namespace app\common;
|
||
use PhpOffice\PhpWord\IOFactory;
|
||
use think\Exception;
|
||
use ZipArchive;
|
||
use RecursiveIteratorIterator;
|
||
use RecursiveDirectoryIterator;
|
||
use PhpOffice\PhpWord\Settings;
|
||
use PhpOffice\PhpWord\Element\TextRun;
|
||
use DOMDocument;
|
||
use DOMXPath;
|
||
// use BadMethodCallException;
|
||
class ArticleParserService
|
||
{
|
||
private $phpWord;
|
||
private $sections;
|
||
|
||
/**
 * Load the Word document at $filePath and cache its sections.
 *
 * A missing file leaves the parser with an empty section list (getTitle()
 * then reports "no title"); a constructor cannot hand a value back to `new`,
 * so nothing is returned. When the first load throws, the document is loaded
 * a second time from the copy produced by removeEmfFromDocx(), and that
 * temporary copy is deleted whether or not the second load succeeds.
 *
 * @param string $filePath Path of the DOCX file to parse.
 *
 * @throws \Exception When the fallback copy cannot be created or loaded.
 */
public function __construct($filePath = '')
{
    $this->sections = [];
    if (!file_exists($filePath)) {
        return;
    }

    // Key configuration: keep "read data only" off so the full section
    // structure is preserved; output escaping avoids XML escaping conflicts.
    $load = static function ($path) {
        $reader = IOFactory::createReader();
        $reader->setReadDataOnly(false);
        Settings::setCompatibility(false);
        Settings::setOutputEscapingEnabled(true);

        return $reader->load($path);
    };

    try {
        // The document is loaded once; the original loaded it twice and
        // discarded the first result.
        $this->phpWord = $load($filePath);
    } catch (\Exception $e) {
        // Pre-process: strip EMF images from the package, then retry.
        $processedFilePath = $this->removeEmfFromDocx($filePath);
        try {
            $this->phpWord = $load($processedFilePath);
        } finally {
            // The processed copy is temporary; never leave it behind.
            if (is_file($processedFilePath)) {
                unlink($processedFilePath);
            }
        }
    }

    $this->sections = $this->phpWord->getSections();
}
|
||
/**
 * Create a copy of a DOCX package with every EMF image entry removed.
 *
 * The package is copied into the runtime directory and the `.emf` entries
 * are deleted inside the copy's ZIP directory, so nothing is extracted to
 * disk and all other entry names stay exactly as they were.
 *
 * @param string $docxPath Path of the original DOCX file.
 *
 * @return string Path of the temporary processed DOCX (the caller deletes it).
 *
 * @throws \Exception When the copy cannot be created, opened or written.
 */
private function removeEmfFromDocx($docxPath)
{
    $runtimeDir = rtrim(ROOT_PATH, '/') . '/runtime';
    if (!is_dir($runtimeDir) && !mkdir($runtimeDir, 0700, true) && !is_dir($runtimeDir)) {
        throw new \Exception("无法创建处理后的 DOCX 文件");
    }

    $processedPath = $runtimeDir . '/' . uniqid('docx_temp_') . '_processed.docx';
    if (!copy($docxPath, $processedPath)) {
        throw new \Exception("无法创建处理后的 DOCX 文件");
    }

    $zip = new ZipArchive();
    if ($zip->open($processedPath) !== true) {
        unlink($processedPath);
        throw new \Exception("无法打开 DOCX 文件:{$docxPath}");
    }

    // Entry indexes stay stable until close(), so deleting while walking
    // forward through the archive is safe.
    for ($index = 0; $index < $zip->numFiles; $index++) {
        $entryName = $zip->getNameIndex($index);
        if ($entryName === false) {
            continue;
        }
        if (strtolower(pathinfo($entryName, PATHINFO_EXTENSION)) === 'emf') {
            $zip->deleteIndex($index);
        }
    }

    if (!$zip->close()) {
        if (is_file($processedPath)) {
            unlink($processedPath);
        }
        throw new \Exception("无法创建处理后的 DOCX 文件");
    }

    return $processedPath;
}
|
||
|
||
/**
 * Recursively add every file below $dir to an open ZipArchive.
 *
 * Entry names are made relative to $baseDir, which stays fixed for the whole
 * recursion. The previous version used dirname() of the directory currently
 * being visited, which prefixed top-level files with the temp folder name and
 * dropped parent folders for files nested two or more levels deep.
 *
 * @param string      $dir     Directory whose contents are added.
 * @param ZipArchive  $zip     Archive opened for writing.
 * @param string|null $baseDir Root that entry names are relative to;
 *                             defaults to $dir.
 *
 * @throws \Exception When a directory cannot be read.
 */
private function addFilesToZip($dir, $zip, $baseDir = null)
{
    $dir = rtrim($dir, '/');
    $baseDir = $baseDir === null ? $dir : rtrim($baseDir, '/');
    $prefix = $baseDir . '/';

    $entries = scandir($dir);
    if ($entries === false) {
        throw new \Exception("Unable to read directory: {$dir}");
    }

    foreach ($entries as $entry) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }

        $filePath = $dir . '/' . $entry;
        if (is_dir($filePath)) {
            $this->addFilesToZip($filePath, $zip, $baseDir);
            continue;
        }

        // Path inside the archive: everything after the fixed base directory.
        $relativePath = strpos($filePath, $prefix) === 0
            ? substr($filePath, strlen($prefix))
            : ltrim($filePath, '/');
        $zip->addFile($filePath, $relativePath);
    }
}
|
||
|
||
/**
 * Recursively delete a temporary extraction directory.
 *
 * Only a directory whose own name starts with "docx_temp_" and whose real
 * path lies inside ROOT_PATH/runtime is accepted. That validation applies to
 * the requested directory only; the previous version repeated the name check
 * for every subdirectory, so no nested folder was ever removed and the tree
 * was always left behind.
 *
 * @param string $dir Directory to delete.
 *
 * @return bool True when the directory and everything below it was removed.
 */
private function deleteDir($dir)
{
    // 1. Basic validation: non-blank string naming an existing directory.
    if (trim($dir) === '' || !is_dir($dir)) {
        return false;
    }

    // 2. Name check: only directories prefixed with docx_temp_.
    $dir = rtrim($dir, DIRECTORY_SEPARATOR);
    if (strpos(basename($dir), 'docx_temp_') !== 0) {
        return false;
    }

    // 3. Location check. The separator is appended on both sides so that a
    //    sibling such as ".../runtime_backup" does not pass as ".../runtime".
    $realDir = realpath($dir);
    $realRuntimeDir = realpath(rtrim(ROOT_PATH, '/') . '/runtime');
    if ($realDir === false || $realRuntimeDir === false) {
        return false;
    }
    if (strpos($realDir . DIRECTORY_SEPARATOR, $realRuntimeDir . DIRECTORY_SEPARATOR) !== 0) {
        return false;
    }

    // 4. Remove the tree bottom-up. Filesystem warnings stay suppressed, as
    //    before: this is best-effort cleanup and failure is reported through
    //    the return value.
    $removeTree = function ($path) use (&$removeTree) {
        $entries = @scandir($path);
        if ($entries === false) {
            return false;
        }

        $allRemoved = true;
        foreach ($entries as $entry) {
            if ($entry === '.' || $entry === '..') {
                continue;
            }

            $childPath = $path . DIRECTORY_SEPARATOR . $entry;
            $realChild = realpath($childPath);

            // A child must resolve to a location below its parent; anything
            // else (for example a symlink pointing elsewhere) is left alone.
            if ($realChild === false || strpos($realChild, $path . DIRECTORY_SEPARATOR) !== 0) {
                $allRemoved = false;
                continue;
            }

            if (is_dir($realChild)) {
                if (!$removeTree($realChild)) {
                    $allRemoved = false;
                }
            } elseif (!@unlink($realChild)) {
                $allRemoved = false;
            }
        }

        // Only an emptied directory can be removed.
        if (!$allRemoved) {
            return false;
        }

        return @rmdir($path);
    };

    return $removeTree($realDir);
}
|
||
|
||
/**
 * Entry point: validate an uploaded manuscript and parse its front matter.
 *
 * @param string $sFileUrl Path of the uploaded document.
 *
 * @return string JSON with `status` and `msg`, plus `data` on success
 *                (status 1). Status 2 = no file given, 3 = file missing,
 *                4 = file unreadable, 5 = document could not be loaded or
 *                no title could be found.
 */
public static function uploadAndParse($sFileUrl)
{
    // Required value.
    if (empty($sFileUrl)) {
        return json_encode(['status' => 2, 'msg' => 'Please upload the submission file']);
    }

    // The file must exist and be readable.
    if (!file_exists($sFileUrl)) {
        return json_encode(['status' => 3, 'msg' => 'The uploaded file does not exist']);
    }
    if (!is_readable($sFileUrl)) {
        return json_encode(['status' => 4, 'msg' => 'The uploaded file is unreadable']);
    }

    // Parse the document. Loading can throw; report that through the same
    // JSON envelope as every other failure instead of letting it escape.
    try {
        $oDealFile = new self($sFileUrl);
    } catch (\Exception $e) {
        return json_encode(['status' => 5, 'msg' => $e->getMessage()]);
    }

    // Title.
    $sTitle = $oDealFile->getTitle();
    if (empty($sTitle)) {
        return json_encode(['status' => 5, 'msg' => 'Article title retrieval failed']);
    }

    // Authors: the full list and the names flagged as corresponding authors.
    $aParam = ['title' => $sTitle];
    $aAuthor = $oDealFile->getAuthors($aParam);
    $aParam['author'] = empty($aAuthor['author']) ? [] : $aAuthor['author'];
    $aParam['report'] = empty($aAuthor['report']) ? [] : $aAuthor['report'];

    // Affiliations.
    $aParam['company'] = $oDealFile->getCompany($aParam);

    // Corresponding-author contact details.
    $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam);

    // Keywords and abstract. `+=` keeps keys that are already set above.
    $aContent = $oDealFile->extractFromWord();
    $aParam += empty($aContent['data']) ? [] : $aContent['data'];

    return json_encode(['status' => 1, 'msg' => 'success', 'data' => $aParam]);
}
|
||
|
||
/**
 * Extract the article title.
 *
 * The title is taken to be the first element, in document order, whose
 * trimmed text is longer than 10 characters.
 *
 * @return string The title as UTF-8, or '' when there are no sections or no
 *                element is long enough.
 */
private function getTitle()
{
    if (empty($this->sections)) {
        return '';
    }

    $title = '';
    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $paragraph = trim($this->getTextFromElement($element));
            if (mb_strlen($paragraph) <= 10) {
                continue;
            }

            // First sufficiently long paragraph wins; stop scanning.
            $title = $paragraph;
            break 2;
        }
    }

    if ($title === '' || mb_check_encoding($title, 'UTF-8')) {
        return $title;
    }

    // Not valid UTF-8: treat the bytes as GBK.
    return mb_convert_encoding($title, 'UTF-8', 'GBK');
}
|
||
// 提取作者
|
||
// private function getAuthors($aParam = []) {
|
||
// $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
|
||
// $sAuthorContent = $this->getNextParagraphAfterText($title);
|
||
// if (empty($sAuthorContent)) {
|
||
// return ['author' => [], 'report' => []];
|
||
// }
|
||
|
||
// //编码修复
|
||
// $possibleEncodings = [
|
||
// 'Windows-1252', 'UTF-8', 'GBK', 'GB2312',
|
||
// 'Latin-1', 'ISO-8859-1', 'CP1252'
|
||
// ];
|
||
// $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
|
||
// $sAuthorContent = $encodedContent ?: $sAuthorContent;
|
||
|
||
// //清理不可见字符
|
||
// $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
|
||
|
||
// //修复特殊符号乱码
|
||
// $symbolMap = [
|
||
// '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†',
|
||
// ':' => ':', ',' => ',', '—' => '-',
|
||
// '啊' => '' // 针对性移除异常字符“啊”(若为固定乱码)
|
||
// ];
|
||
// $sAuthorContent = strtr($sAuthorContent, $symbolMap);
|
||
|
||
// //格式标准化
|
||
// $sAuthorContent = str_replace([',', ';', ';', '、'], ',', $sAuthorContent); // 统一分隔符
|
||
// $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
|
||
// $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
|
||
// $sAuthorContent = trim($sAuthorContent);
|
||
|
||
// // 处理作者
|
||
// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
|
||
// $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
|
||
// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
|
||
// $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分)
|
||
// //标记上标内的逗号+空格(多编号)
|
||
// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
|
||
// // 原有步骤2:正则匹配(扩展上标符号支持,保持原有逻辑)
|
||
// $pattern = '/
|
||
// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格)
|
||
// \s* # 姓名与上标间空格
|
||
// ( # 上标组(扩展符号支持)
|
||
// \d+ # 起始数字
|
||
// (?:[†#*,]|<SEP>\d+)* # 允许:†#*符号、逗号、<SEP>+数字(兼容1,†、1,*等)
|
||
// )
|
||
// \s*,? # 作者间逗号(可选)
|
||
// (?=\s|$) # 确保后面是空格或结尾
|
||
// /ux';
|
||
|
||
// preg_match_all($pattern, $tempStr, $matches);
|
||
// $authorList = [];
|
||
// if(!empty($matches[1])){
|
||
// foreach ($matches[1] as $i => $name) {
|
||
// $name = trim($name);
|
||
// $superscript = trim($matches[2][$i]);
|
||
// $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
|
||
// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
|
||
// // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样)
|
||
// $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
|
||
// if (!empty($name)) {
|
||
// $authorList[] = [
|
||
// 'name' => $name,
|
||
// 'superscript' => $superscript
|
||
// ];
|
||
// }
|
||
// }
|
||
// }else {
|
||
// // 按“两个或多个连续空格”拆分(姓名之间的分隔)
|
||
// $authorList = array_filter(
|
||
// array_map('trim',
|
||
// preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
|
||
// )
|
||
// );
|
||
// }
|
||
|
||
|
||
// // //处理作者
|
||
// // $authorList = [];
|
||
// // // 新正则:匹配“姓名+上标”整体,允许上标含逗号(如1,†)
|
||
// // // 逻辑:姓名以字母/中文开头,上标以数字开头、以符号/数字结尾
|
||
// // // if (preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*([\d,†#*]+)/u', $sAuthorContent, $matches)) {
|
||
// // if(preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*(\d[\d,†#\s*]*)/u', $sAuthorContent, $matches)){
|
||
// // for ($i = 0; $i < count($matches[1]); $i++) {
|
||
// // $authorList[] = trim($matches[1][$i] . $matches[2][$i]);
|
||
// // }
|
||
// // } else {
|
||
// // // 按“两个或多个连续空格”拆分(姓名之间的分隔)
|
||
// // $authorList = array_filter(
|
||
// // array_map('trim',
|
||
// // preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
|
||
// // )
|
||
// // );
|
||
// // }
|
||
// $aAuthorData = [];
|
||
// $aReport = [];
|
||
// $namePattern = '/
|
||
// (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符)
|
||
// [\x{4e00}-\x{9fa5}]+| # 中文姓名
|
||
// [\x{1800}-\x{18AF}]+| # 蒙古文姓名
|
||
// [A-Z]\.) # 单字母缩写(如 J.)
|
||
// /ux';
|
||
// var_dump($authorList);exit;
|
||
// foreach ($authorList as $authorStr) {
|
||
// if (empty($authorStr)) continue;
|
||
// var_dump($authorList);exit;
|
||
// //分离姓名与上标(支持上标含逗号,如1,†)
|
||
// $superscript = '';
|
||
// // 新正则:匹配以数字开头、含逗号/符号的完整上标(如1,†、2*#)
|
||
// $authorStr = trim(trim($authorStr,','),' ');
|
||
// // if (preg_match('/([\d,†#*]+)$/u', $authorStr, $supMatch)) {
|
||
// // if(preg_match('/\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)){
|
||
// // if (preg_match('/.*?\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)) {
|
||
// // if (preg_match('/.*?\s*([\d,\x{2020}#* ]+?)\s*$/u', $authorStr, $supMatch)) {
|
||
// // if (preg_match('/^(.+?)\D*?(\d[\d,#*†,\s]*)$/u', $authorStr, $supMatch)) {
|
||
// // $superscript = $supMatch[1];
|
||
// // // 移除上标,保留纯姓名(避免残留符号)
|
||
// // $nameStr = trim(preg_replace('/' . preg_quote($superscript, '/') . '$/', '', $authorStr));
|
||
// // } else {
|
||
// // $nameStr = $authorStr;
|
||
// // }
|
||
// $pattern = '/^(.+?)\s*(\d[\d,#*†\s]*?)\s*$/u';
|
||
// if (preg_match($pattern, $authorStr, $supMatch)) {
|
||
// $nameStr = empty($supMatch[1]) ? '' : trim($supMatch[1]); // 姓名部分:"Liguo Zhang"
|
||
// $superscript = empty($supMatch[2]) ? $nameStr : $nameStr.trim($supMatch[2]); // 上标部分:"1
|
||
// // echo "姓名: $nameStr, 上标: $superscript\n";
|
||
// } else {
|
||
// $nameStr = $authorStr;
|
||
// }
|
||
// //验证姓名合法性(过滤无效内容)
|
||
// if (!preg_match($namePattern, $nameStr)) {
|
||
// continue;
|
||
// }
|
||
// //解析上标信息(正确识别1,†中的机构编号和符号)
|
||
// $companyId = '';
|
||
// $isSuper = 0;
|
||
// $isReport = 0;
|
||
// if (!empty($superscript)) {
|
||
// // 提取机构编号(忽略上标中的逗号,如1,† → 提取1)
|
||
// if (preg_match('/(\d+)/', $superscript, $numMatch)) {
|
||
// $companyId = $numMatch[1];
|
||
// }
|
||
// // 识别特殊符号(#为超级作者,*†为通讯作者)
|
||
// $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
|
||
// $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
|
||
// }
|
||
// if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
|
||
// $nameStr = trim($match[1]);
|
||
// }
|
||
// $aAuthorData[] = [
|
||
// 'name' => $nameStr,
|
||
// 'company_id' => $companyId,
|
||
// 'is_super' => $isSuper,
|
||
// 'is_report' => $isReport
|
||
// ];
|
||
// if ($isReport) {
|
||
// $aReport[] = $nameStr;
|
||
// }
|
||
// }
|
||
// var_dump($aAuthorData);exit;
|
||
// return ['author' => $aAuthorData,'report' => array_unique($aReport)];
|
||
// }
|
||
|
||
/**
 * Split an author line into authors and their superscript markers.
 *
 * A name is a run of ASCII letters, spaces, dots, hyphens and the listed
 * accented letters. A superscript is the run of digits, commas and the
 * markers # * † ‡ § that follows a name. Authors are separated by
 * "comma + space", or implicitly where a name starts right after a
 * superscript. Any other character is ignored.
 * NOTE(review): letters outside the list (for example ö or ñ) are dropped
 * from names; confirm whether that filtering is intended.
 *
 * @param string $str Author line, e.g. "John Smith1,2*, Jane Doe2#".
 *
 * @return array List of authors, each with the keys `name`, `superscript`,
 *               `company_id` (list of int affiliation numbers), `is_super`
 *               (1 when marked #) and `is_report` (1 when marked * or †).
 */
private function parseAuthorsWithoutRegex($str = '')
{
    if (empty($str)) {
        return [];
    }

    // Re-encode only when the input is not valid UTF-8 already, and keep the
    // input as it is when the source encoding cannot be detected.
    if (!mb_check_encoding($str, 'UTF-8')) {
        $converted = mb_convert_encoding($str, 'UTF-8', 'auto');
        if (is_string($converted) && $converted !== '') {
            $str = $converted;
        }
    }

    // Clean up stray characters and fold full-width forms. The full-width
    // comma and digits are written as code-point escapes: with plain ASCII
    // literals in their place the comma replacement erased every author
    // separator and the digit replacements did nothing.
    $str = str_replace(
        [
            "\xC2\xA0", 'ï¼', '�', "\u{FF0C}",
            "\u{FF11}", "\u{FF12}", "\u{FF13}", "\u{FF14}", "\u{FF15}",
            "\u{FF16}", "\u{FF17}", "\u{FF18}", "\u{FF19}", "\u{FF10}",
        ],
        [' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'],
        $str
    );
    $str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str));

    $isDigit = static function ($char) {
        return strlen($char) === 1 && ctype_digit($char);
    };
    $markers = ['#', '*', '†', '‡', '§'];

    // Join multi-number superscripts: "2, 3" becomes "2,3".
    $len = mb_strlen($str);
    $processed = '';
    for ($i = 0; $i < $len; $i++) {
        $char = mb_substr($str, $i, 1);
        if ($char === ',' && $i >= 1 && $i + 2 < $len
            && $isDigit(mb_substr($str, $i - 1, 1))
            && mb_substr($str, $i + 1, 1) === ' '
            && $isDigit(mb_substr($str, $i + 2, 1))
        ) {
            $processed .= $char;
            $i++; // skip the space after the comma
            continue;
        }
        $processed .= $char;
    }
    $str = $processed;

    // Join a number and a marker separated by one space: "1 *" becomes "1*".
    $len = mb_strlen($str);
    $processed = '';
    for ($i = 0; $i < $len; $i++) {
        $char = mb_substr($str, $i, 1);
        if ($isDigit($char) && $i + 2 < $len
            && mb_substr($str, $i + 1, 1) === ' '
            && in_array(mb_substr($str, $i + 2, 1), $markers, true)
        ) {
            $processed .= $char . mb_substr($str, $i + 2, 1);
            $i += 2;
            continue;
        }
        $processed .= $char;
    }
    $str = $processed;

    // Collapse runs of spaces.
    while (strpos($str, '  ') !== false) {
        $str = str_replace('  ', ' ', $str);
    }
    $str = trim($str);

    // Walk the line once, switching between "name" and "superscript" state.
    $nameExtras = [' ', '.', '-', 'à', 'á', 'â', 'ã', 'ä', 'ç', 'è', 'é', 'ê', 'ë'];
    $authors = [];
    $currentName = '';
    $currentSuperscript = '';
    $inName = true;
    $len = mb_strlen($str);
    for ($i = 0; $i < $len; $i++) {
        $char = mb_substr($str, $i, 1);

        // Author separator: comma followed by a space.
        if ($char === ',' && $i + 1 < $len && mb_substr($str, $i + 1, 1) === ' ') {
            // A name made only of spaces is not an author.
            if (trim($currentName) !== '') {
                $authors[] = [
                    'name' => trim($currentName),
                    'superscript' => trim(rtrim($currentSuperscript, ',')),
                ];
            }
            $currentName = '';
            $currentSuperscript = '';
            $inName = true;
            $i++; // skip the space
            continue;
        }

        if (ctype_alpha($char) || in_array($char, $nameExtras, true)) {
            if ($inName) {
                $currentName .= $char;
            } else {
                // A name character right after a superscript starts the
                // next author.
                $authors[] = [
                    'name' => trim($currentName),
                    'superscript' => trim(rtrim($currentSuperscript, ',')),
                ];
                $currentName = $char;
                $currentSuperscript = '';
                $inName = true;
            }
        } elseif ($isDigit($char) || $char === ',' || in_array($char, $markers, true)) {
            $inName = false;
            $currentSuperscript .= $char;
        }
        // Every other character is ignored.
    }

    // Flush the last author.
    if (trim($currentName) !== '') {
        $authors[] = [
            'name' => trim($currentName),
            'superscript' => trim(rtrim($currentSuperscript, ',')),
        ];
    }

    // Derive affiliation numbers and the first/corresponding author flags.
    foreach ($authors as $index => $author) {
        $superscript = $author['superscript'];
        $institutionIds = [];
        $digits = '';
        $superscriptLen = mb_strlen($superscript);
        for ($j = 0; $j < $superscriptLen; $j++) {
            $c = mb_substr($superscript, $j, 1);
            if ($isDigit($c)) {
                $digits .= $c;
                continue;
            }
            // Compare against '' so that a lone "0" is still a number.
            if ($digits !== '') {
                $institutionIds[] = (int) $digits;
                $digits = '';
            }
        }
        if ($digits !== '') {
            $institutionIds[] = (int) $digits;
        }

        $authors[$index]['company_id'] = array_values(array_unique($institutionIds));
        $authors[$index]['is_super'] = strpos($superscript, '#') !== false ? 1 : 0;
        $authors[$index]['is_report'] =
            (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
    }

    return $authors;
}
|
||
/**
 * Extract the author list from the paragraph that follows the title.
 *
 * @param array $aParam Optional `title`; when absent the title is looked up
 *                      with getTitle().
 *
 * @return array `author` is the list returned by parseAuthorsWithoutRegex()
 *               without fully empty entries; `report` is the list of
 *               distinct names flagged as corresponding authors.
 */
private function getAuthors($aParam = [])
{
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    if (empty($sAuthorContent)) {
        return ['author' => [], 'report' => []];
    }

    // Encoding repair: convert only text that is not valid UTF-8 already,
    // using the same GBK fallback as getTitle(). Valid UTF-8 is never
    // re-encoded from a guessed source encoding.
    if (!mb_check_encoding($sAuthorContent, 'UTF-8')) {
        $sAuthorContent = mb_convert_encoding($sAuthorContent, 'UTF-8', 'GBK');
    }

    // Strip control and zero-width characters. preg_replace() returns null
    // on failure; keep the text in that case.
    $stripped = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
    if ($stripped !== null) {
        $sAuthorContent = $stripped;
    }

    // Repair garbled symbols and fold full-width punctuation to ASCII. The
    // full-width colon and comma are code-point escapes; as ASCII literals
    // they mapped each character onto itself.
    // NOTE(review): the bare 'â' entry also rewrites a genuine "â" inside a
    // name into a dagger - confirm it is still wanted.
    $symbolMap = [
        'â ' => '†',
        'â' => '†',
        '?†' => '†',
        "\u{FF1A}" => ':',
        "\u{FF0C}" => ',',
        '—' => '-',
        '啊' => '', // targeted removal of a stray character seen in practice
    ];
    $sAuthorContent = strtr($sAuthorContent, $symbolMap);

    // Normalise the format: one separator, "and" as a comma, single spaces.
    $sAuthorContent = str_replace(["\u{FF0C}", ';', "\u{FF1B}", '、'], ',', $sAuthorContent);
    $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent);
    $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent);
    $sAuthorContent = trim($sAuthorContent);

    $aAuthor = $this->parseAuthorsWithoutRegex($sAuthorContent);
    if (empty($aAuthor)) {
        return ['author' => [], 'report' => []];
    }

    $aReport = [];
    $aAuthorData = [];
    foreach ($aAuthor as $value) {
        if (empty($value['name']) && empty($value['superscript'])) {
            continue;
        }
        if (!mb_check_encoding($value['name'], 'UTF-8')) {
            $value['name'] = mb_convert_encoding($value['name'], 'UTF-8', 'GBK');
        }
        if (!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1) {
            $aReport[] = $value['name'];
        }
        $aAuthorData[] = $value;
    }

    // Re-index after de-duplication so the list keeps sequential keys and
    // still encodes as a JSON array.
    return ['author' => $aAuthorData, 'report' => array_values(array_unique($aReport))];
}
|
||
// private function getAuthors($aParam = []) {
|
||
// $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
|
||
// $sAuthorContent = $this->getNextParagraphAfterText($title);
|
||
// if (empty($sAuthorContent)) {
|
||
// return ['author' => [], 'report' => []];
|
||
// }
|
||
|
||
// //编码修复
|
||
// $possibleEncodings = [
|
||
// 'Windows-1252', 'UTF-8', 'GBK', 'GB2312',
|
||
// 'Latin-1', 'ISO-8859-1', 'CP1252'
|
||
// ];
|
||
// $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
|
||
// $sAuthorContent = $encodedContent ?: $sAuthorContent;
|
||
|
||
// //清理不可见字符
|
||
// $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
|
||
|
||
// //修复特殊符号乱码
|
||
// $symbolMap = [
|
||
// '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†',
|
||
// ':' => ':', ',' => ',', '—' => '-',
|
||
// '啊' => '' // 针对性移除异常字符“啊”(若为固定乱码)
|
||
// ];
|
||
// $sAuthorContent = strtr($sAuthorContent, $symbolMap);
|
||
|
||
// //格式标准化
|
||
// $sAuthorContent = str_replace([',', ';', ';', '、'], ',', $sAuthorContent); // 统一分隔符
|
||
// $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
|
||
// $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
|
||
// $sAuthorContent = trim($sAuthorContent);
|
||
// var_dump($this->parseAuthorsWithoutRegex($sAuthorContent));exit;
|
||
// // 关键预处理:兼容"and"分隔符、清理乱码、统一空格
|
||
// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto');
|
||
// $content = str_replace(["\xC2\xA0", 'ï¼', '�', ','], ' ', $content); // 清理乱码和全角符号
|
||
// $content = preg_replace('/\band\b/i', ',', $content); // 将 "and" 转为逗号(统一分隔符)
|
||
// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并数字与符号间的空格(如"1 *"→"1*")
|
||
// $content = trim(preg_replace('/\s+/', ' ', $content)); // 合并连续空格
|
||
|
||
// // 标记上标内的逗号(多编号处理)
|
||
// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
|
||
|
||
// // 核心正则(保持原有结构,扩展符号支持)
|
||
// $pattern = '/
|
||
// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格、连字符)
|
||
// \s* # 姓名与上标间的空格(允许0或多个)
|
||
// ( # 上标组(扩展兼容所有符号)
|
||
// \d+ # 起始数字(至少1个数字)
|
||
// (?:[†#*,]|<SEP>\d+)* # 允许:符号(†#*)、逗号、<SEP>+数字(多编号)
|
||
// )
|
||
// \s*,? # 作者间的逗号(可选,允许逗号前有空格)
|
||
// (?=\s|$) # 确保后面是空格或字符串结尾(避免跨作者匹配)
|
||
// /ux';
|
||
|
||
// preg_match_all($pattern, $tempStr, $matches);
|
||
|
||
// // 解析结果并格式化
|
||
// $authorList = [];
|
||
// if (!empty($matches[1])) {
|
||
// foreach ($matches[1] as $i => $name) {
|
||
// $name = trim($name);
|
||
// $superscript = trim($matches[2][$i]);
|
||
// $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
|
||
// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾多余逗号
|
||
// if (!empty($name)) {
|
||
// $authorList[] = [
|
||
// 'name' => $name,
|
||
// 'superscript' => $superscript
|
||
// ];
|
||
// }
|
||
// }
|
||
// }
|
||
|
||
// // 输出结果
|
||
// echo "<pre>";
|
||
// print_r($authorList);
|
||
// echo "</pre>";
|
||
// exit;
|
||
|
||
// // 处理作者
|
||
// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
|
||
// $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
|
||
// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
|
||
// $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分)
|
||
|
||
// //标记上标内的逗号+空格(多编号)
|
||
// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
|
||
// // 原有步骤2:正则匹配(扩展上标符号支持,保持原有逻辑)
|
||
// $pattern = '/
|
||
// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格)
|
||
// \s* # 姓名与上标间空格
|
||
// ( # 上标组(扩展符号支持)
|
||
// \d+ # 起始数字
|
||
// (?:[†#*,]|<SEP>\d+)* # 允许:†#*符号、逗号、<SEP>+数字(兼容1,†、1,*等)
|
||
// )
|
||
// \s*,? # 作者间逗号(可选)
|
||
// (?=\s|$) # 确保后面是空格或结尾
|
||
// /ux';
|
||
|
||
// preg_match_all($pattern, $tempStr, $matches);
|
||
// var_dump($matches);exit;
|
||
// $authorList = [];
|
||
// if(!empty($matches[1])){
|
||
// foreach ($matches[1] as $i => $name) {
|
||
// $name = trim($name);
|
||
// $superscript = trim($matches[2][$i]);
|
||
// $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
|
||
// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
|
||
// // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样)
|
||
// $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
|
||
// if (!empty($name)) {
|
||
// $authorList[] = [
|
||
// 'name' => $name,
|
||
// 'superscript' => $superscript
|
||
// ];
|
||
// }
|
||
// }
|
||
// }else {
|
||
// // 按“两个或多个连续空格”拆分(姓名之间的分隔)
|
||
// $authorList = array_filter(
|
||
// array_map('trim',
|
||
// preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
|
||
// )
|
||
// );
|
||
// }
|
||
|
||
|
||
// // //处理作者
|
||
// $aAuthorData = [];
|
||
// $aReport = [];
|
||
// $namePattern = '/
|
||
// (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符)
|
||
// [\x{4e00}-\x{9fa5}]+| # 中文姓名
|
||
// [\x{1800}-\x{18AF}]+| # 蒙古文姓名
|
||
// [A-Z]\.) # 单字母缩写(如 J.)
|
||
// /ux';
|
||
|
||
// foreach ($authorList as $authorStr){
|
||
// if (empty($authorStr)) continue;
|
||
|
||
// //获取下标
|
||
// $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript'];
|
||
// $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name'];
|
||
|
||
// $companyId = [];
|
||
// $isSuper = 0;
|
||
// $isReport = 0;
|
||
// if (!empty($superscript)) {
|
||
// // 提取机构编号(忽略上标中的逗号,如1,† → 提取1)
|
||
// preg_match_all('/\d+/', $superscript, $numMatch);
|
||
// // 识别特殊符号(#为超级作者,*†为通讯作者)
|
||
// $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
|
||
// $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
|
||
// }
|
||
// if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
|
||
// $nameStr = trim($match[1]);
|
||
// }
|
||
// $aAuthorData[] = [
|
||
// 'name' => $nameStr,
|
||
// 'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
|
||
// 'is_super' => $isSuper,
|
||
// 'is_report' => $isReport
|
||
// ];
|
||
// if ($isReport) {
|
||
// $aReport[] = $nameStr;
|
||
// }
|
||
// }
|
||
// return ['author' => $aAuthorData,'report' => array_unique($aReport)];
|
||
// }
|
||
|
||
/**
 * Extract the numbered affiliation list that follows the author line.
 *
 * A line starting with digits opens a new entry keyed by that number; lines
 * without a leading number are appended to the entry currently open.
 *
 * @param array $aParam Optional `title` and `authors` (raw author line).
 *
 * @return array Affiliation text keyed by its number.
 */
private function getCompany($aParam = [])
{
    // Title, then the author line below it.
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = empty($aParam['authors'])
        ? $this->getNextParagraphAfterText($title)
        : $aParam['authors'];

    // Lines following the author line.
    $allLines = $this->getContentAfterText($sAuthorContent, 1);
    if (empty($allLines)) {
        return [];
    }

    // Group by leading number, merging continuation lines.
    $grouped = [];
    $currentNumber = null;
    foreach ($allLines as $line) {
        $line = trim($line);
        if ($line === '' || $line === '0') {
            continue;
        }

        // Leading run of ASCII digits is the entry number.
        $digitCount = strspn($line, '0123456789');
        $number = substr($line, 0, $digitCount);
        if ($number !== '' && $number !== '0') {
            $currentNumber = $number;
            // Skip the '.', '*' and spaces that follow the number.
            $contentStart = $digitCount + strspn($line, '.* ', $digitCount);
            $grouped[$currentNumber] = trim(substr($line, $contentStart));
            continue;
        }

        // Continuation of the entry currently open.
        if ($currentNumber !== null) {
            $grouped[$currentNumber] .= ' ' . $line;
        }
    }

    // Clean every entry. The earlier blanket re-encoding step is gone: its
    // result was assigned from an undefined variable and never used. Text
    // that is not valid UTF-8 is still converted from GBK below.
    $aCompany = [];
    foreach ($grouped as $number => $institution) {
        $institution = preg_replace('/\s+/', ' ', $institution); // collapse whitespace
        $institution = rtrim($institution, '.');
        $institution = preg_replace('/^\d+\s+/', '', $institution);
        $institution = trim($institution);

        // Keep the part up to "..., <word> <digits>, <word>" when present.
        preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);
        $institution = trim($institutionmatches[1] ?? $institution);

        // Drop a trailing "*Email ..." fragment.
        if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) {
            $institution = trim($matches[1]);
        }

        if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) {
            $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
        }
        $aCompany[$number] = $institution;
    }

    return $aCompany;
}
|
||
|
||
/**
 * Extract contact details (e-mail, postal address, phone) of the
 * corresponding authors.
 *
 * @param array $aParam `report` holds the corresponding-author names; `title`
 *                      is optional.
 *
 * @return array List of records with the keys `name`, `email`,
 *               `postal_address` and `tel`; empty when nothing was found.
 */
private function getCorrespondingAuthors($aParam = [])
{
    $aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
    if (empty($aCorrespondingAuthor)) {
        return [];
    }

    // Title, author line, then the raw paragraph holding the affiliations.
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    $sCompany = $this->getNextParagraphAfterText($sAuthorContent);
    if (empty($sCompany)) {
        // Fallback: join the parsed affiliation names instead.
        $aCompany = $this->getCompany($aParam);
        $sCompany = implode(' ', array_values($aCompany));
    }

    // Everything after the affiliations.
    // NOTE(review): assumes getContentAfterText() returns a string in its
    // default mode - confirm against that method.
    $corrText = $this->getContentAfterText($sCompany);
    if (!is_string($corrText) || $corrText === '') {
        return [];
    }

    // Encoding repair: convert only text that is not valid UTF-8 already.
    if (!mb_check_encoding($corrText, 'UTF-8')) {
        $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
    }

    // Fold the full-width colon and at-sign and the no-break space to ASCII,
    // then collapse whitespace. The code points are written as escapes; as
    // ASCII literals these replacements changed nothing.
    $corrText = str_replace(["\u{FF1A}", "\u{FF20}", "\xC2\xA0"], [':', '@', ' '], $corrText);
    $corrText = preg_replace('/\s+/', ' ', $corrText);

    // One block per "*" marker.
    $corrBlocks = preg_split('/\s*\*\s*/', $corrText);
    $corrBlocks = array_filter(array_map('trim', $corrBlocks));

    $aCorresponding = [];
    foreach ($corrBlocks as $block) {
        // Only blocks that mention a known corresponding author are used.
        $sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
        if (empty($sName)) {
            continue;
        }
        preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
        preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
        preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
        $aCorresponding[] = [
            'name' => $sName,
            'email' => isset($email[2]) ? trim($email[2]) : '',
            'postal_address' => isset($address[2]) ? trim($address[2]) : '',
            'tel' => isset($tel[2]) ? trim($tel[2]) : '',
        ];
    }

    if (empty($aCorresponding)) {
        // Fallback layout: "Corresponding Authors: Name, E-mail: x@y; ..."
        preg_match('/Corresponding Authors: (.*?)(?=$|;)/s', $corrText, $match);
        if (!empty($match[1])) {
            // Name and e-mail of every author in that sentence.
            preg_match_all('/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/', $match[1], $authors);
            if (!empty($authors[1])) {
                foreach ($authors[1] as $i => $authorName) {
                    // Same record shape as above so consumers see one schema.
                    $aCorresponding[] = [
                        'name' => empty($authorName) ? '' : trim($authorName),
                        'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i]),
                        'postal_address' => '',
                        'tel' => '',
                    ];
                }
            }
        }
    }

    return $aCorresponding;
}
|
||
|
||
/**
 * Find which known corresponding-author name is mentioned in a text block.
 *
 * Every candidate is compared case-insensitively, first as written and then
 * with its space-separated parts in reverse order, so that "Wei Zhang" also
 * matches a block that spells the name "Zhang Wei".
 *
 * @param string $block     Text block to search in.
 * @param array  $corrNames Candidate author names.
 * @return string The first candidate found in the block, returned exactly as it
 *                appears in $corrNames, or '' when none matches.
 */
private function matchCorrespondingName($block, $corrNames)
{
    if (!is_array($corrNames) || !is_string($block) || $block === '') {
        return '';
    }

    // Multibyte-aware lowercasing so accented / non-ASCII names compare correctly.
    $blockLower = mb_strtolower($block, 'UTF-8');

    foreach ($corrNames as $name) {
        $needle = is_scalar($name) ? trim((string) $name) : '';
        if ($needle === '') {
            // An empty needle is "found" at offset 0 on PHP 8, which would end
            // the search with '' before the remaining names are tried.
            continue;
        }

        $needleLower = mb_strtolower($needle, 'UTF-8');
        if (mb_strpos($blockLower, $needleLower, 0, 'UTF-8') !== false) {
            return $name;
        }

        // Try the reversed word order; runs of blanks in the name are ignored.
        $parts = preg_split('/[ \t]+/', $needleLower, -1, PREG_SPLIT_NO_EMPTY);
        if (count($parts) >= 2) {
            $reversed = implode(' ', array_reverse($parts));
            if (mb_strpos($blockLower, $reversed, 0, 'UTF-8') !== false) {
                return $name;
            }
        }
    }

    return '';
}
|
||
|
||
/**
 * Return the text of the first non-blank element that follows the element
 * containing $targetText.
 *
 * Elements are visited section by section in document order. Elements whose
 * extracted text is empty or whitespace-only are ignored, both as match
 * candidates and as the returned "next paragraph".
 *
 * @param string $targetText Text to look for (case-insensitive substring match).
 * @return string Text of the following element, or '' when the target is blank,
 *                is not found, or nothing follows it.
 */
private function getNextParagraphAfterText($targetText)
{
    // A blank needle would match the very first element on PHP 8 and hand back
    // an unrelated paragraph, so it is treated as "not found".
    if (!is_scalar($targetText) || trim((string) $targetText) === '') {
        return '';
    }
    if (empty($this->sections)) {
        return '';
    }

    $needle = (string) $targetText;
    $targetSeen = false;

    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $text = $this->getTextFromElement($element);
            if (!is_string($text) || trim($text) === '') {
                continue;
            }
            if ($targetSeen) {
                return $text;
            }
            if (stripos($text, $needle) !== false) {
                $targetSeen = true;
            }
        }
    }

    return '';
}
|
||
|
||
/**
 * Collect the element texts that follow the element matching $targetText.
 *
 * The start element is located by a fuzzy comparison: both texts are
 * lower-cased and reduced to letters and digits, and an element matches when
 * similar_text() covers more than half of the normalised target. Collection
 * then continues until an element contains one of the stop keywords, 200 lines
 * have been gathered, or the document ends.
 *
 * @param string $targetText  Text of the element after which collection starts.
 * @param int    $return_type 1 returns the collected lines as an array; any
 *                            other value returns them joined with "\n".
 * @return array|string Collected lines (array) or joined text (string); empty
 *                      when the target is blank or never matched.
 */
private function getContentAfterText($targetText,$return_type = 2)
{
    $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract','ABSTRACT'];
    $maxLines = 200;
    $content = [];

    // Lower-case and keep only letters/digits. Unicode-aware so that non-ASCII
    // text (e.g. a Chinese institution line) does not normalise to '' and make
    // the target unmatchable; falls back to the ASCII-only form when the value
    // is not valid UTF-8.
    $normalize = function ($value) {
        $value = is_scalar($value) ? (string) $value : '';
        if (mb_check_encoding($value, 'UTF-8')) {
            $clean = preg_replace('/[^\p{L}\p{N}]+/u', '', mb_strtolower($value, 'UTF-8'));
            if (is_string($clean)) {
                return $clean;
            }
        }
        return preg_replace('/[^a-zA-Z0-9]/', '', strtolower($value));
    };

    // Loop-invariant: normalise the target once.
    $cleanTarget = $normalize($targetText);
    $targetLength = strlen($cleanTarget);
    $sections = ($targetLength > 0 && !empty($this->sections)) ? $this->sections : [];

    $found = false;
    foreach ($sections as $section) {
        foreach ($section->getElements() as $element) {
            if (count($content) >= $maxLines) {
                break 2;
            }

            $text = trim($this->getTextFromElement($element));
            if ($text === '') {
                continue;
            }

            if (!$found) {
                // More than 50% of the normalised target must be covered.
                $found = similar_text($normalize($text), $cleanTarget) / $targetLength > 0.5;
                continue;
            }

            $isStopLine = false;
            foreach ($stopKeywords as $keyword) {
                if (stripos($text, $keyword) !== false) {
                    $isStopLine = true;
                    break;
                }
            }
            if ($isStopLine) {
                break 2;
            }

            $content[] = $text;
        }
    }

    if ($return_type == 1) {
        return $content;
    }

    $joined = implode("\n", $content);
    if ($joined !== '' && !mb_check_encoding($joined, 'UTF-8')) {
        $joined = mb_convert_encoding($joined, 'UTF-8', 'GBK');
    }
    return $joined;
}
|
||
|
||
/**
 * Extract plain text from a PhpWord element, recursing into containers.
 *
 * Handles PreserveText (e-mail addresses are pulled out of HYPERLINK field
 * codes), tables and cells, any element exposing getElements(), Text, Link
 * (a mailto: target is emitted before the link text), Field and Note.
 * For everything except the PreserveText/Table/Cell branches the result has
 * control characters replaced by spaces and whitespace runs collapsed.
 *
 * @param object $element    Element to read.
 * @param int    $lineNumber Unused; kept so existing callers keep working.
 * @return string Extracted text, '' when the element yields none.
 */
private function getTextFromElement($element,$lineNumber = 0)
{
    $text = '';
    if (!is_object($element)) {
        return $text;
    }

    // PreserveText: read the parsed parts through the public accessor.
    if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
        foreach ((array) $element->getText() as $part) {
            if (strpos($part, 'HYPERLINK') !== false) {
                // Decode HTML entities, then take the address after "mailto:".
                $decoded = html_entity_decode($part);
                if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
                    $text .= $match[1] . ' ';
                }
            } else {
                $text .= $part;
            }
        }
        return $text;
    }

    // Tables and cells (an e-mail address may live inside a table).
    if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
        foreach ($element->getRows() as $row) {
            foreach ($row->getCells() as $cell) {
                $text .= $this->getTextFromElement($cell);
            }
        }
        return $text;
    }
    if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }
        return $text;
    }

    // Generic containers: concatenate the text of every child.
    if (method_exists($element, 'getElements')) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }
    }

    // Plain (possibly formatted) text runs.
    if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
        $text .= $element->getText();
    }

    // Hyperlinks: emit a mailto: address first, then the visible link text.
    if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
        $target = $element->getTarget();
        if (strpos($target, 'mailto:') === 0) {
            $text .= str_replace('mailto:', '', $target) . ' ';
        }
        $text .= $element->getText() . ' ';
    }

    // Fields and notes. NOTE(review): getContent() does not appear to exist on
    // Field in every PhpWord release — confirm; hence the method_exists() guards
    // with getText() as fallback instead of an unconditional call.
    if ($element instanceof \PhpOffice\PhpWord\Element\Field
        || $element instanceof \PhpOffice\PhpWord\Element\Note
    ) {
        $value = null;
        if (method_exists($element, 'getContent')) {
            $value = $element->getContent();
        } elseif (method_exists($element, 'getText')) {
            $value = $element->getText();
        }
        if (is_object($value)) {
            // The accessor may hand back a nested element rather than a string.
            $value = $this->getTextFromElement($value);
        }
        $text .= (is_scalar($value) ? $value : '') . ' ';
    }

    if ($text === '') {
        return $text;
    }

    if (mb_check_encoding($text, 'UTF-8')) {
        // The /u modifier makes the class match code points U+0000-U+001F and
        // U+007F-U+009F. In byte mode the same class also hits UTF-8
        // continuation bytes 0x80-0x9F and corrupts characters such as
        // "一", "–" or "Ö".
        $cleaned = preg_replace('/[\x{00}-\x{1F}\x{7F}-\x{9F}]/u', ' ', $text);
        if (is_string($cleaned)) {
            $text = $cleaned;
        }
        $collapsed = preg_replace('/\s+/u', ' ', $text);
        if (is_string($collapsed)) {
            $text = $collapsed;
        }
        return $text;
    }

    // Not valid UTF-8: strip ASCII control characters only (high bytes may be
    // part of multi-byte GBK characters), then convert from GBK.
    $text = preg_replace('/[\x00-\x1F\x7F]/', ' ', $text);
    $text = preg_replace('/ +/', ' ', $text);
    return mb_convert_encoding($text, 'UTF-8', 'GBK');
}
|
||
|
||
/**
 * Extract the abstract, keyword list and funding statement from the document.
 *
 * All element texts are concatenated (one per line). A marker is appended to
 * any paragraph containing "Keywords:" so the keyword list can be cut out
 * precisely; if that yields nothing, everything between "Keywords" and
 * "Introduction" (or the end of the text) is used instead. The funding
 * statement is the text following the first funding phrase reported by
 * getMatchedFundPhrases(), truncated before "Peer review" when present.
 *
 * @return array ['status' => 1, 'msg' => string, 'data' => ['abstrart' => string,
 *               'keywords' => string[], 'fund' => string]]. The misspelled
 *               'abstrart' key is kept because callers may rely on it.
 */
public function extractFromWord()
{
    $endFlag = 'Keywords-End-Flag';

    // Drop the ":" / full-width colon (UTF-8 bytes EF BC 9A) and blanks left
    // directly after a section label such as "Keywords" or "Abstract".
    $stripLabelSeparator = function ($value) {
        return preg_replace('/^(?:[\s:]|\xEF\xBC\x9A)+/', '', $value);
    };

    // Turn the raw text after the "Keywords" label into a clean list.
    $parseKeywords = function ($raw) use ($endFlag, $stripLabelSeparator) {
        $raw = str_replace($endFlag, ' ', $raw); // the marker must never leak into a keyword
        $raw = preg_replace('/\n+/', ' ', trim($raw));
        $raw = $stripLabelSeparator($raw);
        $raw = trim(rtrim($raw, ';,. ')); // trailing separators / full stop
        $items = array_map('trim', preg_split('/[,;]\s*/', $raw));
        // array_values(): re-index so the list has no gaps after filtering.
        return array_values(array_filter($items, function ($item) {
            return !empty($item) && !ctype_space($item);
        }));
    };

    $sContent = '';
    $sFundContent = '';
    $sections = empty($this->sections) ? [] : $this->sections;

    foreach ($sections as $section) {
        foreach ($section->getElements() as $element) {
            $textContent = $this->getTextFromElement($element);
            if (empty($textContent)) {
                continue;
            }

            // Only touch the encoding when the text is not valid UTF-8;
            // running valid UTF-8 through an encoding guess can re-encode it.
            if (!mb_check_encoding($textContent, 'UTF-8')) {
                $textContent = mb_convert_encoding($textContent, 'UTF-8', 'GBK');
            }
            $sContent .= $textContent;

            if (stripos($textContent, 'Keywords:') !== false) {
                $sContent .= $endFlag;
            }

            if (empty($sFundContent)) {
                // Scans the accumulated text, so a funding phrase that ends one
                // paragraph picks up its statement from the next one.
                $aFund = $this->getMatchedFundPhrases($sContent);
                if (!empty($aFund[0])) {
                    $position = stripos($sContent, $aFund[0]);
                    $sFundContent = substr($sContent, $position);
                    $sFundContent = str_replace($endFlag, '', $sFundContent);
                    $sFundContent = trim(str_ireplace($aFund[0], '', $sFundContent));
                    if (preg_match('/^(.*?)Peer review/', $sFundContent, $matches)) {
                        $sFundContent = $matches[1]; // keep only what precedes "Peer review"
                    }
                }
            }

            $sContent .= "\n";
        }
    }

    if (!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')) {
        $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
    }

    // Collapse every whitespace run (newlines included) that follows a
    // non-blank character into a single space.
    $textContent = trim(preg_replace('/(\S)\s+/', '$1 ', $sContent));

    // Abstract: everything between "Abstract" and "Keywords" (or end of text).
    $abstract = '';
    if (preg_match('/Abstract\s*([\s\S]*?)(?=Keywords|$)/i', $textContent, $abstractMatches)) {
        $abstract = preg_replace('/\n+/', ' ', trim($abstractMatches[1]));
        $abstract = $stripLabelSeparator($abstract);
    }

    // Keywords, attempt 1: text between "Keywords:" and the inserted marker.
    // Case-insensitive because the marker is inserted case-insensitively too.
    $keywords = [];
    $markerPattern = '/Keywords:\s*(.*?)\s*' . preg_quote($endFlag, '/') . '/is';
    if (preg_match($markerPattern, $textContent, $keywordMatches)) {
        $keywords = $parseKeywords($keywordMatches[1]);
    }

    // Keywords, attempt 2: text between "Keywords" and "Introduction" / end.
    if (empty($keywords)) {
        if (preg_match('/Keywords\s*([\s\S]*?)(?=Introduction|$)/i', $textContent, $keywordMatches)) {
            $keywords = $parseKeywords($keywordMatches[1]);
        }
    }

    return [
        'status' => 1,
        'msg' => '提取成功',
        'data' => [
            'abstrart' => $abstract,
            'keywords' => $keywords,
            'fund' => $sFundContent
        ]
    ];
}
|
||
/**
 * List the funding phrases that occur in a text.
 *
 * Matching is case-insensitive; each hit is reported in the canonical spelling
 * from the phrase list, once, in order of first appearance.
 *
 * @param string $content Text to scan.
 * @return array Canonical phrases found, [] when there are none or $content is empty.
 */
private function getMatchedFundPhrases($content = '')
{
    if (empty($content)) {
        return [];
    }

    // Known funding phrases in their canonical spelling.
    $phrases = [
        'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
        'Funding was provided by', 'Funded in part by'
    ];

    // One alternation with a capturing group, "#" as the delimiter.
    $alternatives = [];
    foreach ($phrases as $phrase) {
        $alternatives[] = preg_quote($phrase, '#');
    }
    $regex = '#(' . implode('|', $alternatives) . ')#i';

    preg_match_all($regex, $content, $hits);
    if (empty($hits[1])) {
        return [];
    }

    // Map every hit (any casing) back to its canonical phrase; using the phrase
    // as array key de-duplicates while keeping first-seen order.
    $seen = [];
    foreach ($hits[1] as $hit) {
        foreach ($phrases as $phrase) {
            if (strcasecmp($hit, $phrase) === 0) {
                $seen[$phrase] = true;
                break;
            }
        }
    }

    return array_keys($seen);
}
|
||
/**
 * Logging hook. Currently a no-op: the console output below is disabled.
 *
 * @param string $msg Message that would be logged.
 * @return void
 */
private function log($msg)
{
    // Disabled output: echo date('[Y-m-d H:i:s] ') . $msg . "\n";
}
|
||
} |