1512 lines
64 KiB
PHP
1512 lines
64 KiB
PHP
<?php
|
||
namespace app\common;
|
||
use PhpOffice\PhpWord\IOFactory;
|
||
use think\Exception;
|
||
use ZipArchive;
|
||
use RecursiveIteratorIterator;
|
||
use RecursiveDirectoryIterator;
|
||
use PhpOffice\PhpWord\Settings;
|
||
use PhpOffice\PhpWord\Element\TextRun;
|
||
use DOMDocument;
|
||
use DOMXPath;
|
||
// use BadMethodCallException;
|
||
class ArticleParserService
|
||
{
|
||
private $phpWord;
|
||
private $sections;
|
||
private $iNum = 0;
|
||
/**
 * Load a Word document and cache its sections for the extraction methods.
 *
 * The file is first read as-is. If the reader throws, a copy with every EMF
 * image removed is built by removeEmfFromDocx() and read instead; that
 * temporary copy is always deleted afterwards, whether or not it could be read.
 * When $filePath does not exist the instance is left with an empty section list.
 *
 * @param string $filePath Path of the document to parse.
 *
 * @throws \Exception When the EMF-stripped copy cannot be built or read either.
 */
public function __construct($filePath = '')
{
    // `new` discards whatever a constructor returns, so a missing file is
    // signalled through an empty (but iterable) section list.
    $this->sections = [];
    if (!file_exists($filePath)) {
        return;
    }

    // One place for the reader configuration used by both load attempts.
    $load = static function ($path) {
        $reader = IOFactory::createReader();
        // Read the full document rather than data only.
        $reader->setReadDataOnly(false);
        Settings::setCompatibility(false);
        Settings::setOutputEscapingEnabled(true);

        return $reader->load($path);
    };

    try {
        $this->phpWord = $load($filePath);
    } catch (\Exception $e) {
        // Fallback: retry on a copy of the document without EMF images.
        $processedFilePath = $this->removeEmfFromDocx($filePath);
        try {
            $this->phpWord = $load($processedFilePath);
        } finally {
            // Never leave the temporary copy behind, even if the retry failed.
            if (is_file($processedFilePath)) {
                unlink($processedFilePath);
            }
        }
    }

    $this->sections = $this->phpWord->getSections();
}
|
||
/**
 * Build a copy of a DOCX file with every EMF image removed.
 *
 * The archive is extracted into a fresh `docx_temp_*` directory under
 * ROOT_PATH/runtime, all `*.emf` files are deleted, and the remaining tree is
 * zipped again next to that directory. The extraction directory is removed on
 * every exit path.
 *
 * NOTE(review): only the image files are removed; relationship entries that
 * point at them are left in place — confirm the reader tolerates that.
 *
 * @param string $docxPath Path of the source DOCX file.
 *
 * @return string Path of the rewritten DOCX. The caller must delete it.
 *
 * @throws \Exception When the archive cannot be opened, extracted or rebuilt.
 */
private function removeEmfFromDocx($docxPath)
{
    $zip = new ZipArchive();
    if ($zip->open($docxPath) !== true) {
        throw new \Exception("无法打开 DOCX 文件:{$docxPath}");
    }

    // 1. Fresh extraction directory.
    $tempDir = rtrim(ROOT_PATH, '/') . '/runtime/' . uniqid('docx_temp_');
    if (!mkdir($tempDir, 0700, true) && !is_dir($tempDir)) {
        $zip->close();
        throw new \Exception("Unable to create temporary directory: {$tempDir}");
    }

    $processedPath = $tempDir . '_processed.docx';

    try {
        // 2. Extract the archive.
        $extracted = $zip->extractTo($tempDir);
        $zip->close();
        if ($extracted !== true) {
            throw new \Exception("Unable to extract DOCX file: {$docxPath}");
        }

        // 3. Delete every EMF file, at any depth.
        $files = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($tempDir));
        foreach ($files as $file) {
            if ($file->isFile() && strtolower($file->getExtension()) === 'emf') {
                unlink($file->getPathname());
            }
        }

        // 4. Re-pack what is left.
        $newZip = new ZipArchive();
        if ($newZip->open($processedPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
            throw new \Exception("无法创建处理后的 DOCX 文件");
        }
        $this->addFilesToZip($tempDir, $newZip);

        // Queued files are only read when the archive is closed, so the
        // extraction directory must still exist at this point.
        if ($newZip->close() !== true) {
            if (is_file($processedPath)) {
                unlink($processedPath);
            }
            throw new \Exception("无法创建处理后的 DOCX 文件");
        }
    } finally {
        // 5. Remove the extraction directory on success and on failure alike.
        $this->deleteDir($tempDir);
    }

    return $processedPath;
}
|
||
|
||
/**
 * Add every file below a directory to an open ZIP archive.
 *
 * Each entry is named by its path relative to $dir, so a file at
 * "$dir/word/media/a.png" is stored as "word/media/a.png" and a file directly
 * in $dir is stored under its bare name. Directories themselves are not added.
 *
 * @param string     $dir Directory whose contents are added.
 * @param ZipArchive $zip Archive opened for writing.
 *
 * @return void
 */
private function addFilesToZip($dir, $zip)
{
    $root = rtrim($dir, '/\\');
    // +1 skips the separator that follows the root in every child path.
    $prefixLength = strlen($root) + 1;

    $files = new RecursiveIteratorIterator(
        new RecursiveDirectoryIterator($root, RecursiveDirectoryIterator::SKIP_DOTS)
    );

    foreach ($files as $file) {
        if ($file->isDir()) {
            continue;
        }

        $path = $file->getPathname();
        $entryName = substr($path, $prefixLength);
        if (DIRECTORY_SEPARATOR === '\\') {
            // ZIP entry names always use forward slashes.
            $entryName = str_replace('\\', '/', $entryName);
        }

        $zip->addFile($path, $entryName);
    }
}
|
||
|
||
/**
 * Recursively delete a temporary extraction directory.
 *
 * Only directories whose own name starts with "docx_temp_" and whose resolved
 * path lies inside ROOT_PATH/runtime are accepted. The name check applies to
 * the directory passed in, not to the sub-directories found inside it.
 * Symbolic links found inside are removed as links; they are never followed.
 * Deletion is best-effort: individual failures do not abort the walk.
 *
 * @param string $dir Directory to delete.
 *
 * @return bool True when the directory and everything in it was removed.
 */
private function deleteDir($dir)
{
    // 1. Must be a non-blank path to an existing directory.
    if (!is_string($dir) || trim($dir) === '' || !is_dir($dir)) {
        return false;
    }

    // 2. Only directories created with the docx_temp_ prefix are eligible.
    $dir = rtrim($dir, DIRECTORY_SEPARATOR);
    if (strpos(basename($dir), 'docx_temp_') !== 0) {
        return false;
    }

    // 3. The resolved path must sit inside the runtime directory. The trailing
    //    separator stops a sibling such as ".../runtime_x" from passing.
    $realDir = realpath($dir);
    $realRuntimeDir = realpath(rtrim(ROOT_PATH, '/') . '/runtime');
    if ($realDir === false || $realRuntimeDir === false) {
        return false;
    }
    if (strpos($realDir . DIRECTORY_SEPARATOR, $realRuntimeDir . DIRECTORY_SEPARATOR) !== 0) {
        return false;
    }

    // 4. Walk the tree children-first so each directory is empty when reached.
    $isFullyDeleted = true;
    try {
        $entries = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($realDir, RecursiveDirectoryIterator::SKIP_DOTS),
            RecursiveIteratorIterator::CHILD_FIRST
        );

        foreach ($entries as $entry) {
            $path = $entry->getPathname();

            // Error suppression is deliberate: a failed removal is recorded in
            // the return value instead of raising a warning.
            if ($entry->isLink()) {
                $removed = @unlink($path) || @rmdir($path);
            } elseif ($entry->isDir()) {
                $removed = @rmdir($path);
            } else {
                $removed = @unlink($path);
            }

            if (!$removed) {
                $isFullyDeleted = false;
            }
        }
    } catch (\Exception $e) {
        // The directory, or one below it, could not be opened for reading.
        return false;
    }

    // 5. Remove the now-empty root; this fails if anything was left behind.
    return @rmdir($realDir) && $isFullyDeleted;
}
|
||
|
||
/**
 * Entry point: parse an uploaded manuscript and return its metadata as JSON.
 *
 * Status codes in the returned JSON:
 *   1 success, 2 no file given, 3 file missing, 4 file unreadable,
 *   5 the document could not be loaded or no title was found.
 *
 * @param string $sFileUrl Path of the uploaded document.
 *
 * @return string JSON-encoded array with "status", "msg" and, on success, "data".
 */
public static function uploadAndParse($sFileUrl)
{
    // Required-value check.
    if (empty($sFileUrl)) {
        return json_encode(['status' => 2, 'msg' => 'Please upload the submission file']);
    }

    // The file must exist and be readable.
    if (!file_exists($sFileUrl)) {
        return json_encode(['status' => 3, 'msg' => 'The uploaded file does not exist']);
    }
    if (!is_readable($sFileUrl)) {
        return json_encode(['status' => 4, 'msg' => 'The uploaded file is unreadable']);
    }

    // Load the document. A load failure is reported as status 5 so callers
    // always receive JSON from this method.
    try {
        $oDealFile = new self($sFileUrl);
    } catch (\Exception $e) {
        return json_encode(['status' => 5, 'msg' => $e->getMessage()]);
    }

    // Title.
    $sTitle = $oDealFile->getTitle();
    if (empty($sTitle)) {
        return json_encode(['status' => 5, 'msg' => 'Article title retrieval failed']);
    }

    // Authors; the raw title is used as the anchor for the lookups below.
    $aParam = ['title' => $sTitle];
    $aAuthor = $oDealFile->getAuthors($aParam);
    $aParam['author'] = empty($aAuthor['author']) ? [] : $aAuthor['author']; // all authors
    $aParam['report'] = empty($aAuthor['report']) ? [] : $aAuthor['report']; // corresponding authors

    // Institutions.
    $aParam['company'] = $oDealFile->getCompany($aParam);

    // Corresponding-author contact details.
    $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam);

    // Keywords and abstract.
    $aContent = $oDealFile->extractFromWord();

    // Normalise the title for output only now, after it has served as the
    // lookup anchor; the converted value is the one that is returned.
    if (!mb_check_encoding($sTitle, 'UTF-8')) {
        $sTitle = mb_convert_encoding($sTitle, 'UTF-8', 'GBK');
    }
    $aParam['title'] = $oDealFile->fullDecode($sTitle);

    // Keys already present in $aParam take precedence over the extracted ones.
    $aParam += empty($aContent['data']) ? [] : $aContent['data'];

    return json_encode(['status' => 1, 'msg' => 'success', 'data' => $aParam]);
}
|
||
|
||
/**
 * Extract the article title.
 *
 * The title is taken to be the first element of the document whose trimmed
 * text is longer than three characters.
 *
 * @return string The trimmed title, or '' when no section or no such element exists.
 */
private function getTitle()
{
    if (empty($this->sections)) {
        return '';
    }

    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $candidate = trim($this->getTextFromElement($element));
            // Very short elements are skipped; the first longer one wins.
            if (mb_strlen($candidate) > 3) {
                return $candidate;
            }
        }
    }

    return '';
}
|
||
|
||
/**
 * Parse an author line such as "Ann Lee1,2*, Bob Ray2# and Cy Poe3" into records.
 *
 * Each record has:
 *   - name        string     author name
 *   - superscript string     raw marker run that followed the name
 *   - company_id  list<int>  distinct numbers found in the marker run
 *   - is_super    int        1 when the run contains '#'
 *   - is_report   int        1 when the run contains '*' or '†'
 *
 * Letters of any script are accepted in names, as are space, '.', '-' and "'".
 *
 * NOTE(review): in the copy of this file under review the normalisation table
 * mapped ASCII digits to themselves and an ASCII comma to a space, which looks
 * like full-width characters lost in transit. The table below spells out the
 * full-width code points explicitly — confirm against the original file.
 *
 * @param string $str Raw author line.
 *
 * @return array<int, array<string, mixed>> Parsed authors in document order.
 */
private function parseAuthorsWithoutRegex($str = '')
{
    if (empty($str)) {
        return [];
    }
    if (!mb_check_encoding($str, 'UTF-8')) {
        $str = mb_convert_encoding($str, 'UTF-8', 'GBK');
    }
    $str = $this->fullDecode($str);

    // Normalise: NBSP, two mojibake fragments and the full-width comma become
    // spaces; full-width digits become ASCII digits.
    $str = str_replace(
        [
            "\xC2\xA0", "\u{00EF}\u{00BC}", "\u{FFFD}", "\u{FF0C}",
            "\u{FF11}", "\u{FF12}", "\u{FF13}", "\u{FF14}", "\u{FF15}",
            "\u{FF16}", "\u{FF17}", "\u{FF18}", "\u{FF19}", "\u{FF10}",
        ],
        [' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'],
        $str
    );
    $str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str));

    // Split a UTF-8 string into characters.
    $split = static function ($text) {
        $chars = preg_split('//u', (string) $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars !== false) {
            return $chars;
        }
        // Invalid UTF-8: fall back to mbstring, as the original loops did.
        $chars = [];
        $length = mb_strlen($text);
        for ($k = 0; $k < $length; $k++) {
            $chars[] = mb_substr($text, $k, 1);
        }
        return $chars;
    };
    $isDigit = static function ($char) {
        return ctype_digit($char);
    };
    // ctype_alpha() is byte-based and rejects every multibyte letter, so
    // letters are recognised through the Unicode letter/mark classes instead.
    $isNameChar = static function ($char) {
        return in_array($char, [' ', '.', '-', "'"], true)
            || preg_match('/^[\p{L}\p{M}]$/u', $char) === 1;
    };
    $markerSymbols = ['#', '*', '†', '‡', '§'];

    // Pass 1: "2, 3" inside a marker run becomes "2,3".
    $chars = $split($str);
    $count = count($chars);
    $processed = '';
    for ($i = 0; $i < $count; $i++) {
        $processed .= $chars[$i];
        if ($chars[$i] === ','
            && $i >= 1
            && $i + 2 < $count
            && $isDigit($chars[$i - 1])
            && $chars[$i + 1] === ' '
            && $isDigit($chars[$i + 2])
        ) {
            $i++; // skip the space after the comma
        }
    }
    $str = $processed;

    // Pass 2: "1 *" becomes "1*" (digit, space, marker symbol).
    $chars = $split($str);
    $count = count($chars);
    $processed = '';
    for ($i = 0; $i < $count; $i++) {
        $processed .= $chars[$i];
        if ($isDigit($chars[$i])
            && $i + 2 < $count
            && $chars[$i + 1] === ' '
            && in_array($chars[$i + 2], $markerSymbols, true)
        ) {
            $processed .= $chars[$i + 2];
            $i += 2; // skip the space and the symbol just appended
        }
    }
    $str = $processed;

    // Pass 3: collapse runs of spaces.
    $str = trim(preg_replace('/ +/', ' ', $str));

    // Pass 4: state machine — letters build a name, markers build the run that
    // follows it, and ", " or the next letter after a marker run closes the record.
    $authors = [];
    $currentName = '';
    $currentSuperscript = '';
    $inName = true;
    $superscriptChars = ['#', '*', '†', ',', '‡', '§'];

    $chars = $split($str);
    $count = count($chars);
    for ($i = 0; $i < $count; $i++) {
        $char = $chars[$i];

        // Author separator: comma followed by a space.
        if ($char === ',' && $i + 1 < $count && $chars[$i + 1] === ' ') {
            if ($currentName !== '') {
                $authors[] = [
                    'name' => trim($currentName),
                    'superscript' => trim(rtrim($currentSuperscript, ',')),
                ];
            }
            $currentName = '';
            $currentSuperscript = '';
            $inName = true;
            $i++;
            continue;
        }

        if ($isNameChar($char)) {
            if ($inName) {
                $currentName .= $char;
            } else {
                // A name character right after a marker run starts the next author.
                $authors[] = [
                    'name' => trim($currentName),
                    'superscript' => trim(rtrim($currentSuperscript, ',')),
                ];
                $currentName = $char;
                $currentSuperscript = '';
                $inName = true;
            }
        } elseif ($isDigit($char) || in_array($char, $superscriptChars, true)) {
            $inName = false;
            $currentSuperscript .= $char;
        }
        // Any other character is ignored.
    }

    // Last author.
    if ($currentName !== '') {
        $authors[] = [
            'name' => trim($currentName),
            'superscript' => trim(rtrim($currentSuperscript, ',')),
        ];
    }

    // Derive affiliation numbers and the first/corresponding author flags.
    foreach ($authors as &$author) {
        $superscript = $author['superscript'];

        // Every digit run is one affiliation number; "0" is kept as 0.
        preg_match_all('/[0-9]+/', $superscript, $numberMatches);
        $institutionIds = array_map('intval', $numberMatches[0]);
        $author['company_id'] = array_values(array_unique($institutionIds));

        $author['is_super'] = strpos($superscript, '#') !== false ? 1 : 0;
        $author['is_report'] = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
    }
    unset($author); // drop the dangling reference

    return $authors;
}
|
||
/**
 * Extract the author list from the paragraph that follows the title.
 *
 * @param array $aParam Optional context; "title" is used when present,
 *                      otherwise the title is looked up again.
 *
 * @return array{author: array, report: array} "author" holds every parsed
 *               author record, "report" the distinct names of authors flagged
 *               as corresponding, as a plain list.
 */
private function getAuthors($aParam = [])
{
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    if (empty($sAuthorContent)) {
        return ['author' => [], 'report' => []];
    }
    if (!mb_check_encoding($sAuthorContent, 'UTF-8')) {
        $sAuthorContent = mb_convert_encoding($sAuthorContent, 'UTF-8', 'GBK');
    }
    $sAuthorContent = $this->fullDecode($sAuthorContent);

    // Strip control and zero-width characters.
    $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);

    // Repair mis-decoded symbols and fold full-width punctuation to ASCII.
    // NOTE(review): 'â' => '†' also rewrites a genuine "â" inside a name, and
    // '啊' is removed wherever it occurs — confirm both are intended.
    // NOTE(review): the colon and comma keys were identity mappings in the copy
    // under review; they are written here as the full-width code points.
    $symbolMap = [
        '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†',
        "\u{FF1A}" => ':', "\u{FF0C}" => ',', '—' => '-',
        '啊' => '',
    ];
    $sAuthorContent = strtr($sAuthorContent, $symbolMap);

    // Normalise separators: full-width comma, both semicolons and the
    // ideographic comma all become ','; " and " becomes ", ".
    $sAuthorContent = str_replace(["\u{FF0C}", ';', "\u{FF1B}", "\u{3001}"], ',', $sAuthorContent);
    $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent);
    $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent);
    $sAuthorContent = trim($sAuthorContent);

    $aAuthor = $this->parseAuthorsWithoutRegex($sAuthorContent);
    if (empty($aAuthor)) {
        return ['author' => [], 'report' => []];
    }

    $aReport = $aAuthorData = [];
    foreach ($aAuthor as $value) {
        // Skip records that carry neither a name nor a marker run.
        if (empty($value['name']) && empty($value['superscript'])) {
            continue;
        }
        if (!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1) {
            $aReport[] = $value['name'];
        }
        $aAuthorData[] = $value;
    }

    // array_unique() keeps the original keys; re-index so the result is
    // JSON-encoded as a list even when duplicates were removed.
    return ['author' => $aAuthorData, 'report' => array_values(array_unique($aReport))];
}
|
||
|
||
/**
 * Extract the institution list that follows the author paragraph.
 *
 * Every line that starts with a marker — one ASCII letter or digit followed by
 * any further digits — becomes one entry keyed by that marker. Lines without a
 * marker are ignored.
 *
 * NOTE(review): because a single leading letter counts as a marker, a line
 * such as "Department of X" is stored under key "D" with the text
 * "epartment of X" — confirm this is acceptable for unnumbered affiliations.
 *
 * @param array $aParam Optional context: "title" and "authors" (the raw author
 *                      paragraph) are used when present.
 *
 * @return array<int|string, string> Institution text keyed by marker.
 */
private function getCompany($aParam = [])
{
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    // Raw author paragraph; it is the anchor after which institutions start.
    $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];
    $allLines = $this->getContentAfterText($sAuthorContent, 1);
    if (empty($allLines)) {
        return [];
    }

    // 1. Group the lines by their leading marker.
    $grouped = [];
    foreach ($allLines as $line) {
        $line = trim($line);
        if (empty($line)) {
            continue;
        }
        if (!mb_check_encoding($line, 'UTF-8')) {
            $line = mb_convert_encoding($line, 'UTF-8', 'GBK');
        }
        $line = $this->fullDecode($line);

        // Marker: [A-Za-z0-9] then digits, followed by any run of '.', '*' or ' '.
        if (!preg_match('/^([A-Za-z0-9][0-9]*)[.* ]*/', $line, $marker) || empty($marker[1])) {
            continue;
        }
        $grouped[$marker[1]] = trim(substr($line, strlen($marker[0])));
    }

    // 2. Clean each institution.
    $aCompany = [];
    foreach ($grouped as $number => $institution) {
        $institution = $this->fullDecode($institution);
        $institution = preg_replace('/\s+/', ' ', $institution);      // collapse whitespace
        $institution = rtrim($institution, '.');                      // trailing full stop
        $institution = preg_replace('/^\d+\s+/', '', $institution);   // leftover leading number
        $institution = trim($institution);

        // Cut everything from the first '*', '#', '†' or "Email" onwards
        // (corresponding-author notes, addresses). The /u modifier makes the
        // character class match whole characters; without it '†' is matched
        // byte by byte and can split an unrelated multibyte character.
        if (preg_match('/^(.*?)(?=\s*[\*#†]|(?i)\s*Email)/u', $institution, $matches)) {
            $institution = trim($matches[1]);
        }

        if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) {
            $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
        }
        $aCompany[$number] = empty($institution) ? '' : trim(trim($institution), '.');
    }

    return $aCompany;
}
|
||
|
||
/**
 * Extract contact details (e-mail, postal address, phone) of the corresponding authors.
 *
 * The text after the institution paragraph is split at '*' and each piece is
 * matched against the names in $aParam['report']. When that yields nothing, a
 * "Correspondence:" style sentence is parsed as "Name, E-mail: x" or
 * "Name, x" pairs instead.
 *
 * @param array $aParam Context: "report" (corresponding-author names) is
 *                      required; "title" is used when present.
 *
 * @return array<int, array<string, string>> One record per author. The first
 *               strategy yields name/email/postal_address/tel, the fallback
 *               yields name/email only.
 */
private function getCorrespondingAuthors($aParam = [])
{
    $aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
    if (empty($aCorrespondingAuthor)) {
        return [];
    }

    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    // Raw text of the paragraph where the institutions start.
    $sCompany = $this->getNextParagraphAfterText($sAuthorContent);
    if (empty($sCompany)) {
        // Fallback anchor: the parsed institution names joined together.
        $aCompany = $this->getCompany($aParam);
        $sCompany = implode(' ', array_values($aCompany));
    }

    // Everything after the institutions.
    $corrText = $this->getContentAfterText($sCompany);
    if (!mb_check_encoding($corrText, 'UTF-8')) {
        $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
    }
    $corrText = $this->fullDecode($corrText);

    // Fold the full-width colon and at-sign to ASCII, then collapse whitespace
    // (including NBSP and the ideographic space) to single spaces.
    // NOTE(review): both replacements were no-ops in the copy under review;
    // the full-width code points are assumed — confirm against the original.
    $corrText = str_replace(["\u{FF1A}", "\u{FF20}"], [':', '@'], $corrText);
    $collapsed = preg_replace('/[\s\x{00A0}\x{3000}]+/u', ' ', $corrText);
    $corrText = $collapsed === null ? preg_replace('/\s+/', ' ', $corrText) : $collapsed;

    // Strategy 1: one block per '*'.
    $corrBlocks = preg_split('/\s*\*\s*/', $corrText);
    $corrBlocks = array_filter(array_map('trim', $corrBlocks));

    $aCorresponding = [];
    foreach ($corrBlocks as $block) {
        $sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
        if (empty($sName)) {
            continue;
        }
        preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
        preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
        preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
        $aCorresponding[] = [
            'name' => $sName,
            // The capture runs to the next space, so sentence punctuation that
            // directly follows the address is stripped here.
            'email' => isset($email[2]) ? rtrim(trim($email[2]), '.,;') : '',
            'postal_address' => isset($address[2]) ? trim(trim($address[2]), '.') : '',
            'tel' => isset($tel[2]) ? trim($tel[2]) : '',
        ];
    }

    // Strategy 2: "Corresponding Authors: Name, E-mail: x; ..." sentence.
    if (empty($aCorresponding)) {
        $pattern = '/(Corresponding Authors|Correspondence to|Correspondence)\s*:\s*([\s\S]+?)(?=\n\s*\n|$|;)/is';
        $corrText = trim($corrText, '*');
        preg_match($pattern, $corrText, $match);
        if (!empty($match[2])) {
            $corrContent = $match[2];

            // Preferred form: "Name, E-mail: address".
            preg_match_all('/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/', $corrContent, $authors);
            if (empty($authors[1])) {
                // Looser form: "Name, address".
                preg_match_all('/([A-Za-z0-9\s]+?),\s*([\w@\.\-]+)(?=\.?)/', $corrContent, $authors);
            }

            foreach ($authors[1] as $index => $rawName) {
                $aCorresponding[] = [
                    'name' => empty($rawName) ? '' : trim(trim($rawName), '.'),
                    'email' => empty($authors[2][$index]) ? '' : trim(trim($authors[2][$index]), '.'),
                ];
            }
        }
    }

    return $aCorresponding;
}
|
||
|
||
/**
 * Find which corresponding-author name occurs in a block of text.
 *
 * A name matches when it occurs in the block ignoring case, either as given
 * or with its space-separated parts in reverse order ("John Smith" also
 * matches "Smith John"). The comparison is Unicode-aware, so "JOSÉ" matches
 * "José". Blank names never match.
 *
 * @param string   $block     Text to search.
 * @param string[] $corrNames Candidate names, checked in order.
 *
 * @return string The first candidate found, exactly as given, or '' when none matches.
 */
private function matchCorrespondingName($block, $corrNames)
{
    $block = (string) $block;

    foreach ($corrNames as $name) {
        $needle = trim((string) $name);
        if ($needle === '') {
            // An empty needle would match every block.
            continue;
        }

        if (mb_stripos($block, $needle, 0, 'UTF-8') !== false) {
            return $name;
        }

        $nameParts = explode(' ', $needle);
        if (count($nameParts) >= 2) {
            $reversedName = implode(' ', array_reverse($nameParts));
            if (mb_stripos($block, $reversedName, 0, 'UTF-8') !== false) {
                return $name;
            }
        }
    }

    return '';
}
|
||
|
||
/**
 * Return the first non-empty paragraph that follows the paragraph containing $targetText.
 *
 * The search is case-insensitive and walks every section from the start of
 * the document. The list-item counter is reset first, so the numbers that
 * getTextFromElement() puts in front of list items depend only on their
 * position in the document, not on earlier scans.
 *
 * @param string $targetText Text to look for.
 *
 * @return string Text of the following paragraph, or '' when the target is
 *                empty, is not found, or is the last paragraph.
 */
private function getNextParagraphAfterText($targetText)
{
    // An empty needle would match the very first paragraph.
    if ($targetText === null || $targetText === '' || empty($this->sections)) {
        return '';
    }

    $this->iNum = 0;
    $found = false;

    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $text = $this->getTextFromElement($element);
            if (empty($text)) {
                continue;
            }
            if ($found) {
                return $text;
            }
            if (stripos($text, $targetText) !== false) {
                $found = true;
            }
        }
    }

    return '';
}
|
||
|
||
/**
 * Collect the paragraphs between $targetText and the abstract/keywords heading.
 *
 * The anchor paragraph is located by fuzzy matching: both texts are reduced to
 * lower-case ASCII letters and digits, and a paragraph matches when
 * similar_text() covers more than half of the reduced target. Collection stops
 * at the first paragraph containing a stop keyword or after 200 paragraphs.
 * A target without ASCII letters or digits can never be located.
 *
 * The list-item counter is reset first, so the numbers that
 * getTextFromElement() puts in front of list items depend only on their
 * position in the document, not on earlier scans.
 *
 * @param string $targetText  Text of the anchor paragraph.
 * @param int    $return_type 1 returns the paragraphs as an array; any other
 *                            value returns them joined with "\n".
 *
 * @return array|string Collected paragraphs (anchor excluded).
 */
private function getContentAfterText($targetText, $return_type = 2)
{
    $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract', 'ABSTRACT'];
    $maxLines = 200;

    // The reduced target is the same for every paragraph; compute it once.
    $cleanTarget = preg_replace('/[^a-zA-Z0-9]/', '', strtolower((string) $targetText));
    $targetLength = strlen($cleanTarget);

    $this->iNum = 0;
    $content = [];
    $found = false;
    $lineNumber = 0;

    foreach ($this->sections ?: [] as $section) {
        foreach ($section->getElements() as $element) {
            $lineNumber++;
            if (count($content) >= $maxLines) {
                break 2;
            }

            $text = trim($this->getTextFromElement($element, $lineNumber));
            if (empty($text)) {
                continue;
            }

            if (!$found) {
                if ($targetLength > 0) {
                    $cleanText = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($text));
                    // More than 50% of the target must be covered.
                    $found = similar_text($cleanText, $cleanTarget) / $targetLength > 0.5;
                }
                // The anchor paragraph itself is never collected.
                continue;
            }

            foreach ($stopKeywords as $kw) {
                if (stripos($text, $kw) !== false) {
                    break 3; // leave the keyword, element and section loops
                }
            }

            $content[] = $text;
        }
    }

    if ($return_type == 1) {
        return $content;
    }

    $content = implode("\n", $content);
    if (!empty($content) && !mb_check_encoding($content, 'UTF-8')) {
        $content = mb_convert_encoding($content, 'UTF-8', 'GBK');
    }

    return $content;
}
|
||
|
||
/**
 * Extract the plain text of a PhpWord element, recursing into containers.
 *
 * Typographic quotes are replaced by ASCII quotes and the result is passed
 * through cleanText(). Titles, PreserveText fields, tables and cells return
 * early; every other element runs through the container, text, link, field
 * and note checks in that order. Each ListItemRun increments the running
 * list counter and is prefixed with its value.
 *
 * NOTE(review): the Field::getContent() call and the Element\Note class are
 * used exactly as in the original — confirm both exist in the installed
 * PhpWord version.
 *
 * @param \PhpOffice\PhpWord\Element\AbstractElement $element    Element to read.
 * @param int                                        $lineNumber Position of the element; accepted but not used.
 *
 * @return string Cleaned text; may be empty.
 */
private function getTextFromElement(\PhpOffice\PhpWord\Element\AbstractElement $element, int $lineNumber = 0)
{
    // Typographic quote => ASCII quote. Static so the table is built only once.
    static $specialQuotesMap = [
        '’' => "'", // right single quote (U+2019)
        '‘' => "'", // left single quote (U+2018)
        '“' => '"', // left double quote (U+201C)
        '”' => '"', // right double quote (U+201D)
        '„' => '"', // low double quote (U+201E)
        '‟' => '"', // reversed double quote (U+201F)
    ];

    // Headings: the text is either a plain string or a TextRun.
    if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
        $heading = $element->getText();
        $headingText = $heading instanceof \PhpOffice\PhpWord\Element\TextRun
            ? $this->getTextFromElement($heading)
            : strtr((string)$heading, $specialQuotesMap);

        return $this->cleanText($headingText . ' ');
    }

    $buffer = '';

    // List items get a running number in front of their text.
    if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
        if (!isset($this->iNum) || !is_numeric($this->iNum)) {
            $this->iNum = 0;
        }
        $this->iNum++;
        $buffer .= $this->iNum . ' ';
    }

    // PreserveText keeps its parts in a non-public "text" property, read here
    // through reflection; HYPERLINK parts contribute only a mailto address.
    if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
        try {
            $classInfo = new \ReflectionClass($element);
            if (!$classInfo->hasProperty('text')) {
                return $this->cleanText($buffer);
            }
            $textProperty = $classInfo->getProperty('text');
            $textProperty->setAccessible(true);
            $fragments = $textProperty->getValue($element) ?? [];
        } catch (\ReflectionException $e) {
            return $this->cleanText($buffer);
        }

        foreach ($fragments as $fragment) {
            $fragment = (string)$fragment;
            if (strpos($fragment, 'HYPERLINK') === false) {
                $buffer .= strtr($fragment, $specialQuotesMap);
                continue;
            }

            $decoded = html_entity_decode($fragment, ENT_QUOTES | ENT_HTML5);
            if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
                $buffer .= $match[1] . ' ';
            }
        }

        return $this->cleanText($buffer);
    }

    // Tables: every cell, row by row, separated by a space.
    if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
        foreach ($element->getRows() as $row) {
            foreach ($row->getCells() as $cell) {
                $buffer .= $this->getTextFromElement($cell) . ' ';
            }
        }

        return $this->cleanText($buffer);
    }

    // Cells: concatenate the text of their children.
    if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
        foreach ($element->getElements() as $child) {
            $buffer .= $this->getTextFromElement($child);
        }

        return $this->cleanText($buffer);
    }

    // Any other container: recurse into its child elements.
    if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
        foreach ($element->getElements() as $child) {
            if ($child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
                $buffer .= $this->getTextFromElement($child);
            }
        }
    }

    // Plain text runs.
    if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
        $buffer .= strtr((string)$element->getText(), $specialQuotesMap);
    }

    // Links: a mailto target is emitted first, then the visible link text.
    if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
        $target = (string)$element->getTarget();
        if (strpos($target, 'mailto:') === 0) {
            $buffer .= rtrim(str_replace('mailto:', '', $target)) . ' ';
        }
        $buffer .= strtr((string)$element->getText(), $specialQuotesMap) . ' ';
    }

    // Fields and notes contribute their content.
    if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
        $buffer .= (string)$element->getContent() . ' ';
    }
    if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
        $buffer .= (string)$element->getContent() . ' ';
    }

    return $this->cleanText($buffer);
}
|
||
|
||
/**
 * Normalise extracted text: valid UTF-8, no control characters, single spaces.
 *
 * Control characters (C0, DEL, C1), tab, CR, LF, the no-break space, the
 * ideographic space (U+3000) and the narrow no-break space (U+202F) all
 * become an ordinary space, and runs of whitespace collapse to one space.
 * The text is not trimmed.
 *
 * @param string $text Text to clean.
 *
 * @return string Cleaned text; '' when the input cannot be processed as UTF-8.
 */
private function cleanText(string $text)
{
    // Make sure the text is UTF-8 before the Unicode-aware patterns run.
    if (!mb_check_encoding($text, 'UTF-8')) {
        $text = mb_convert_encoding(
            $text,
            'UTF-8',
            'GBK,GB2312,GB18030,Big5,ISO-8859-1,CP1252,UTF-16,UTF-32'
        );
    }

    // Control characters become spaces.
    $text = preg_replace('/[\x00-\x1F\x7F-\x9F]/u', ' ', $text);
    if ($text === null) {
        // The pattern engine rejected the input (e.g. invalid UTF-8).
        return '';
    }

    // Unify the remaining whitespace variants.
    $text = str_replace([
        "\t", "\r", "\n",
        chr(0xC2) . chr(0xA0),              // no-break space (U+00A0)
        chr(0xE3) . chr(0x80) . chr(0x80),  // ideographic space (U+3000)
        chr(0xE2) . chr(0x80) . chr(0xAF),  // narrow no-break space (U+202F)
    ], ' ', $text);

    // Collapse runs of whitespace.
    $text = preg_replace('/\s+/u', ' ', $text);

    return $text === null ? '' : $text;
}
|
||
// private function getTextFromElement($element, $lineNumber = 0){
|
||
// // 初始化默认空字符串(保持原有逻辑)
|
||
// $text = '';
|
||
|
||
// // 1. 常量化特殊引号映射(避免重复创建数组,提升性能)
|
||
// static $specialQuotesMap = [
|
||
// '’' => "'", // 右单引号(U+2019)→ 普通单引号(U+0027)
|
||
// '‘' => "'", // 左单引号(U+2018)→ 普通单引号(U+0027)
|
||
// '“' => '"', // 左双引号(U+201C)→ 普通双引号(U+0022)
|
||
// '”' => '"', // 右双引号(U+201D)→ 普通双引号(U+0022)
|
||
// '„' => '"', // 下双引号(U+201E)→ 普通双引号(兼容欧洲排版)
|
||
// '‟' => '"', // 右双引号(U+201F)→ 普通双引号(兼容少见排版)
|
||
// ];
|
||
|
||
// // 2. 提前校验元素合法性(避免后续 instanceof 无效判断,减少报错)
|
||
// if (!is_object($element) || !$element instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
|
||
// return $text;
|
||
// }
|
||
|
||
// // 支持H1标题格式(逻辑不变,优化变量命名可读性)
|
||
// if ($element instanceof \PhpOffice\PhpWord\Element\Title) {
|
||
// $titleContent = $element->getText();
|
||
// $titleText = '';
|
||
|
||
// // 关键修复:判断返回类型,递归提取文本(逻辑不变)
|
||
// if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) {
|
||
// $titleText = $this->getTextFromElement($titleContent);
|
||
// } else {
|
||
// $titleText = strtr((string)$titleContent, $specialQuotesMap);
|
||
// }
|
||
|
||
// $text .= $titleText . ' ';
|
||
// return $text;
|
||
// }
|
||
|
||
// // 项目编号(逻辑不变,优化空值判断为严格判断)
|
||
// if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) {
|
||
// $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0;
|
||
// $this->iNum++;
|
||
// $text .= $this->iNum . ' ';
|
||
// }
|
||
|
||
// // 处理PreserveText元素(核心逻辑不变,增强容错性)
|
||
// if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
|
||
// try {
|
||
// $reflection = new \ReflectionClass($element);
|
||
// $property = $reflection->getProperty('text');
|
||
// $property->setAccessible(true);
|
||
// // 空值兜底,避免遍历非数组报错
|
||
// $textParts = $property->getValue($element) ?? [];
|
||
// } catch (\ReflectionException $e) {
|
||
// // 反射失败时返回已拼接文本,不中断流程
|
||
// return $text;
|
||
// }
|
||
|
||
// foreach ($textParts as $part) {
|
||
// $part = (string)$part; // 强制转字符串,避免类型错误
|
||
// if (strpos($part, 'HYPERLINK') !== false) {
|
||
// $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5);
|
||
// // 邮箱正则不变,保持原有匹配逻辑
|
||
// if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) {
|
||
// $text .= $match[1] . ' ';
|
||
// }
|
||
// } else {
|
||
// $text .= $part;
|
||
// }
|
||
// }
|
||
// return $text;
|
||
// }
|
||
|
||
// // 处理表格和单元格(逻辑不变,优化循环变量命名)
|
||
// if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
|
||
// foreach ($element->getRows() as $row) {
|
||
// foreach ($row->getCells() as $cell) {
|
||
// $text .= $this->getTextFromElement($cell);
|
||
// }
|
||
// }
|
||
// return $text;
|
||
// }
|
||
|
||
// if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
|
||
// foreach ($element->getElements() as $child) {
|
||
// $text .= $this->getTextFromElement($child);
|
||
// }
|
||
// return $text;
|
||
// }
|
||
|
||
// // 处理嵌套元素(逻辑不变,增强方法存在性校验)
|
||
// if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) {
|
||
// foreach ($element->getElements() as $child) {
|
||
// // 双重校验,避免非元素对象传入
|
||
// if (is_object($child) && $child instanceof \PhpOffice\PhpWord\Element\AbstractElement) {
|
||
// $textPart = $this->getTextFromElement($child);
|
||
// $text .= $textPart;
|
||
// }
|
||
// }
|
||
// }
|
||
|
||
// // 处理文本元素(逻辑不变,保持特殊引号替换)
|
||
// if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
|
||
// $textPart = (string)$element->getText(); // 强制转字符串,避免空值
|
||
// $textPart = strtr($textPart, $specialQuotesMap);
|
||
// $text .= $textPart;
|
||
// }
|
||
|
||
// // 处理超链接(逻辑不变,优化变量类型转换)
|
||
// if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
|
||
// $target = (string)$element->getTarget();
|
||
// if (strpos($target, 'mailto:') === 0) {
|
||
// $text .= rtrim(str_replace('mailto:', '', $target)) . ' ';
|
||
// }
|
||
// $linkText = strtr((string)$element->getText(), $specialQuotesMap);
|
||
// $text .= $linkText . ' ';
|
||
// }
|
||
|
||
// // 处理字段和注释(逻辑不变,增加类型转换,避免非字符串拼接)
|
||
// if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
|
||
// $text .= (string)$element->getContent() . ' ';
|
||
// }
|
||
// if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
|
||
// $text .= (string)$element->getContent() . ' ';
|
||
// }
|
||
|
||
// // 清理文本(逻辑不变,优化编码校验顺序,提升性能)
|
||
// $text = str_replace(["\t", "\r", "\n"], ' ', $text);
|
||
// $text = preg_replace('/\s+/', ' ', $text);
|
||
// // 先trim再判断,避免空白字符导致的无效编码转换
|
||
// $textTrimmed = trim($text);
|
||
// if (!empty($textTrimmed) && !mb_check_encoding($textTrimmed, 'UTF-8')) {
|
||
// $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
|
||
// }
|
||
|
||
// return $text;
|
||
// }
|
||
/**
 * Extract two text intervals from a string: the text between the "abstract"
 * marker and the "keywords" marker, and the text between the "keywords"
 * marker and the end-of-element marker.
 *
 * The abstract and keywords markers are matched case-insensitively; the
 * end marker is matched exactly. Each marker must occur after the previous one.
 *
 * @param string $str     Text to scan.
 * @param array  $markers Optional overrides for the keys 'abstract', 'keywords'
 *                        and 'end_span'. Any other key is ignored.
 * @return array Keys: 'abstract_to_keywords' (string), 'keywords_to_end' (string),
 *               'positions' (byte offsets of the three markers, -1 when not found),
 *               'is_valid' (bool) and 'error' (string, empty on success).
 */
function extractContentIntervals($str, $markers = []) {
    $str = (string)$str;

    // 1. Resolve markers. Keys are read explicitly instead of via extract(),
    //    so a caller-supplied key such as 'str' cannot overwrite local variables.
    $defaultMarkers = [
        'abstract' => 'abstract',
        'keywords' => 'keywords',
        'end_span' => '===========end-span'
    ];
    $markers  = array_merge($defaultMarkers, (array)$markers);
    $abstract = (string)$markers['abstract'];
    $keywords = (string)$markers['keywords'];
    $end_span = (string)$markers['end_span'];

    // 2. Result skeleton, including marker positions (-1 means "not found").
    $result = [
        'abstract_to_keywords' => '',
        'keywords_to_end'      => '',
        'positions'            => [
            'abstract' => -1,
            'keywords' => -1,
            'end_span' => -1
        ],
        'is_valid'             => false,
        'error'                => ''
    ];

    // An empty needle would "match" at offset 0 (or raise a warning on older
    // PHP versions), so reject it up front with an explicit error.
    foreach (['abstract' => $abstract, 'keywords' => $keywords, 'end_span' => $end_span] as $markerName => $markerValue) {
        if ($markerValue === '') {
            $result['error'] = "标记不能为空: {$markerName}";
            return $result;
        }
    }

    // 3. Locate the abstract marker (case-insensitive).
    $absPos = stripos($str, $abstract);
    if ($absPos === false) {
        $result['error'] = "未找到标记: {$abstract}";
        return $result;
    }
    $result['positions']['abstract'] = $absPos;
    $absEndPos = $absPos + strlen($abstract);

    // 4. Locate the keywords marker after the abstract marker (case-insensitive).
    $keyPos = stripos($str, $keywords, $absEndPos);
    if ($keyPos === false) {
        $result['error'] = "未找到 {$keywords} 或在 {$abstract} 之前";
        return $result;
    }
    $result['positions']['keywords'] = $keyPos;
    $keyEndPos = $keyPos + strlen($keywords);

    // 5. Locate the end marker after the keywords marker (exact match).
    $endPos = strpos($str, $end_span, $keyEndPos);
    if ($endPos === false) {
        $result['error'] = "未找到 {$end_span} 或在 {$keywords} 之前";
        return $result;
    }
    $result['positions']['end_span'] = $endPos;

    // 6. Cut out both intervals and drop the separator that directly follows a
    //    marker (colon, space, hyphen, em dash). ltrim() works on bytes, so
    //    passing a multibyte em dash to it also strips the leading bytes of
    //    other multibyte characters (e.g. a curly opening quote). A
    //    Unicode-aware pattern removes whole characters only.
    $stripLeadingSeparators = static function ($segment) {
        $segment = trim($segment);
        $cleaned = preg_replace('/^(?:[:\x20\-]|\x{2014})+/u', '', $segment);
        if ($cleaned === null) {
            // The segment is not valid UTF-8: fall back to the ASCII separators.
            $cleaned = ltrim($segment, ': -');
        }
        return trim($cleaned);
    };

    $result['abstract_to_keywords'] = $stripLeadingSeparators(
        substr($str, $absEndPos, $keyPos - $absEndPos)
    );
    $result['keywords_to_end'] = $stripLeadingSeparators(
        substr($str, $keyEndPos, $endPos - $keyEndPos)
    );

    // 7. All three markers were found in order.
    $result['is_valid'] = true;
    return $result;
}
|
||
/**
 * Extract the abstract, the keywords and the funding statement from the
 * loaded document.
 *
 * The text of every element is concatenated, each followed by an
 * end-of-element marker, and the result is passed to extractContentIntervals().
 * The funding statement is the text that follows the first funding phrase
 * reported by getMatchedFundPhrases(), cut at "Peer review" when present.
 *
 * @return array Keys: 'status', 'msg' and 'data'. 'data' holds 'abstrart'
 *               (misspelled key kept for existing callers), 'keywords' and 'fund'.
 */
public function extractFromWord() {
    $endMarker = '===========end-span';

    // The constructor can return early without loading a document; iterating
    // over an unset section list would raise a warning instead of a result.
    if (!is_iterable($this->sections)) {
        return [
            'status' => 5,
            'msg'    => '文档未加载',
            'data'   => ['abstrart' => '', 'keywords' => '', 'fund' => '']
        ];
    }

    // Convert to UTF-8 when the text is not valid UTF-8 (source assumed GBK).
    $toUtf8 = static function ($text) {
        if (!empty($text) && !mb_check_encoding($text, 'UTF-8')) {
            return mb_convert_encoding($text, 'UTF-8', 'GBK');
        }
        return $text;
    };

    $sContent     = '';
    $sFundContent = '';
    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $textContent = $this->getTextFromElement($element);
            if (empty($textContent)) {
                continue;
            }
            $sContent .= $toUtf8($textContent) . $endMarker;

            // Search AFTER appending the current element. Searching before the
            // append only ever sees the previous elements, so a funding
            // statement in the last element would never be found.
            if (empty($sFundContent)) {
                $aFund = $this->getMatchedFundPhrases($sContent);
                if (!empty($aFund[0])) {
                    $position = stripos($sContent, $aFund[0]);
                    // Skip only the leading phrase; later occurrences of the
                    // same words inside the statement are kept.
                    $sFundContent = trim(substr($sContent, $position + strlen($aFund[0])));
                    if (preg_match('/^(.*?)Peer review/', $sFundContent, $matches)) {
                        $sFundContent = $matches[1];
                    }
                }
            }
        }
    }

    $result = $this->extractContentIntervals($toUtf8($sContent));

    $abstract     = $toUtf8(empty($result['abstract_to_keywords']) ? '' : $result['abstract_to_keywords']);
    $keywords     = $toUtf8(empty($result['keywords_to_end']) ? '' : $result['keywords_to_end']);
    $sFundContent = $toUtf8($sFundContent);

    // Remove the end-of-element markers and decode escapes/entities.
    $finalize = function ($text) use ($endMarker) {
        return empty($text) ? '' : $this->fullDecode(str_replace($endMarker, '', $text));
    };

    return [
        'status' => 1,
        'msg'    => '提取成功',
        'data'   => [
            'abstrart' => $finalize($abstract),
            'keywords' => $finalize($keywords),
            'fund'     => $finalize($sFundContent)
        ]
    ];
}
|
||
/**
 * Core decoding routine: decodes \uXXXX escapes and HTML entities, then
 * repairs garbled comparison symbols (≤, ≥, ≠) with a set of heuristics.
 *
 * The passes are repeated until the string stops changing or $maxDepth
 * passes have run. Any error makes the method return the trimmed input
 * unchanged (best effort).
 *
 * NOTE(review): the generic "?<digits>" rule runs before the "mixed symbol"
 * and "LE/NE" rules, so those two can no longer match once it has rewritten
 * the question mark. The order is kept as it was; confirm whether the
 * specific rules are meant to run first.
 *
 * @param string $str      String to decode.
 * @param int    $maxDepth Maximum number of decoding passes.
 * @return string
 */
private function fullDecode($str = '', int $maxDepth = 2){
    try {
        if ($str === null || trim((string)$str) === '' || $maxDepth <= 0) {
            return $str === null ? '' : trim((string)$str);
        }
        $str = (string)$str;

        // \uXXXX / \UXXXX escapes. Control characters are left untouched, and
        // so are code points mb_chr() cannot encode (lone surrogates), instead
        // of being replaced by an empty string.
        $unicodePattern  = '/\\\\[uU]([0-9a-fA-F]{4})/';
        $unicodeCallback = static function ($m) {
            $code = hexdec($m[1]);
            if ($code < 0x20) {
                return $m[0];
            }
            $char = mb_chr($code, 'UTF-8');
            return $char === false ? $m[0] : $char;
        };

        $decoded = method_exists($this, 'decodeUnicode')
            ? $this->decodeUnicode($str)
            : preg_replace_callback($unicodePattern, $unicodeCallback, $str);
        $str = $decoded ?? $str;

        // Named entities le/ge/ne with or without the closing semicolon. The
        // lookahead keeps longer entity names (e.g. "leq") intact so that
        // html_entity_decode() can handle them.
        $namedSymbols = ['le' => '≤', 'ge' => '≥', 'ne' => '≠'];
        // U+08D8 / U+08D9 / U+08D4 are what the DECIMAL references 2264 / 2265 /
        // 2260 decode to; they are treated as mis-encoded ≤ / ≥ / ≠.
        $misdecodedSymbols = ["\u{08D8}" => '≤', "\u{08D9}" => '≥', "\u{08D4}" => '≠'];
        // The literal characters "<" and ">" are deliberately NOT mapped to
        // ≤ / ≥: doing so changes the meaning of text such as "p < 0.05".
        $decodeSymbolEntities = static function ($text) use ($namedSymbols, $misdecodedSymbols) {
            $replaced = preg_replace_callback(
                '/&(le|ge|ne)(?![A-Za-z0-9]);?/',
                static function ($m) use ($namedSymbols) {
                    return $namedSymbols[$m[1]];
                },
                $text
            );
            return strtr($replaced ?? $text, $misdecodedSymbols);
        };

        // Byte dumps of symbols as they appear in damaged Word text.
        $wordBinPattern = '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i';
        $wordBinMap = [
            // raw byte sequences
            "\xE2\x89\x86" => '≤', "\xE2\x89\x87" => '≥', "\xE2\x89\x80" => '≠',
            // byte E2 followed by a textual dump (after whitespace collapsing)
            "\xE20x890x86" => '≤', "\xE20x890x87" => '≥', "\xE20x890x80" => '≠',
            // plain hex text and literal backslash-escaped text
            'e28986' => '≤', '\xe2\x89\x86' => '≤', '\xe20x890x86' => '≤',
            'e28987' => '≥', '\xe2\x89\x87' => '≥', '\xe20x890x87' => '≥',
            'e28980' => '≠', '\xe2\x89\x80' => '≠', '\xe20x890x80' => '≠',
        ];
        $wordEntityPattern = '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i';
        $wordEntityMap     = ['2264' => '≤', '2265' => '≥', '2260' => '≠'];
        $gbkSymbolMap      = ["\xA1\xF2" => '≤', "\xA1\xF3" => '≥', "\xA1\xF0" => '≠'];

        $depth      = 0;
        $currentStr = $str;

        do {
            $depth++;
            $prevStr = $currentStr;

            // preg_* returns null on failure (e.g. invalid UTF-8 with the /u
            // modifier); in that case the current value is kept.
            $currentStr = preg_replace_callback($unicodePattern, $unicodeCallback, $currentStr) ?? $currentStr;

            // HTML entities
            $currentStr = $decodeSymbolEntities($currentStr);
            $currentStr = html_entity_decode(
                $currentStr,
                ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE,
                'UTF-8'
            );

            // Word byte-dump repair. Whitespace is collapsed only INSIDE the
            // matched tokens; stripping spaces from the whole string would
            // glue every word of the text together.
            if (preg_match($wordBinPattern, $currentStr)) {
                $collapsed = preg_replace_callback(
                    $wordBinPattern,
                    static function ($m) {
                        return preg_replace('/\s+/', '', $m[0]);
                    },
                    $currentStr
                );
                $currentStr = str_ireplace(
                    array_keys($wordBinMap),
                    array_values($wordBinMap),
                    $collapsed ?? $currentStr
                );
            }

            // Numeric references that html_entity_decode() did not decode
            // (missing semicolon, embedded spaces).
            $currentStr = preg_replace_callback(
                $wordEntityPattern,
                static function ($m) use ($wordEntityMap) {
                    return $wordEntityMap[$m[1]] ?? $m[0];
                },
                $currentStr
            ) ?? $currentStr;

            $currentStr = strtr($currentStr, $gbkSymbolMap);

            // Collapse repeated symbols, then drop spaces between symbol and number.
            $currentStr = preg_replace('/(≤|≥|≠)\1+/u', '$1', $currentStr) ?? $currentStr;
            $currentStr = preg_replace('/≠\s*(\d+)/u', '≠$1', $currentStr) ?? $currentStr;
            $currentStr = preg_replace('/≤\s*(\d+)/u', '≤$1', $currentStr) ?? $currentStr;

            // Business-specific repairs: a "?" directly before a number is
            // read as a lost ≥.
            $currentStr = preg_replace('/\\?(\d+)/', '≥$1', $currentStr) ?? $currentStr;
            $currentStr = preg_replace('/\\?(\.\d+)/', '≥0$1', $currentStr) ?? $currentStr;
            $currentStr = preg_replace(
                '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u',
                '≤≥≠$1',
                $currentStr
            ) ?? $currentStr;
            $currentStr = preg_replace_callback(
                '/(LE|NE)\s*\\?(\d+)/i',
                static function ($m) {
                    return strtoupper($m[1]) === 'LE' ? '≤' . $m[2] : '≠' . $m[2];
                },
                $currentStr
            ) ?? $currentStr;
        } while ($depth < $maxDepth && $currentStr !== $prevStr);

        // Final clean-up
        $currentStr = trim($currentStr, ':');
        return $decodeSymbolEntities($currentStr);

    } catch (\Throwable $e) {
        // Best effort: on any failure return the input, trimmed.
        return trim((string)$str);
    }
}
|
||
|
||
// private function fullDecode($str, $maxDepth = 5) {
|
||
// // 空值/深度为0,直接返回(提前终止,避免无效操作)
|
||
// if (empty($str) || $maxDepth <= 0) {
|
||
// return $str;
|
||
// }
|
||
|
||
// // 【性能优化1:预编译所有正则表达式】避免每次循环重新解析正则
|
||
// // 预编译:≥专属场景正则
|
||
// $regOb0 = '/0B\s*\?0/';
|
||
// $regDl18 = '/DL\s*\?.18/';
|
||
// // 预编译:≥通用场景正则
|
||
// $regQMarkNum = '/\?(\d+)/';
|
||
// $regQMarkDotNum = '/\?(\.\d+)/';
|
||
// // 预编译:≤、≠空格修复正则
|
||
// $regNeNum = '/≠\s*(\d+)/';
|
||
// $regLeNum = '/≤\s*(\d+)/';
|
||
// // 预编译:混合符号乱码正则(中文顿号/英文逗号)
|
||
// $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/';
|
||
// $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/';
|
||
// // 预编译:≤、≠专属标识正则
|
||
// $regLeMark = '/LE\s*\?(\d+)/';
|
||
// $regNeMark = '/NE\s*\?(\d+)/';
|
||
// // 预编译:Unicode转义正则(提取到外部,避免闭包重复创建)
|
||
// $regUnicode = '/\\\\u([0-9a-fA-F]{4})/';
|
||
|
||
// // 【性能优化2:预定义常量/映射】避免循环内重复创建数组/字符串
|
||
// // HTML实体映射(一次性定义,避免循环内重复赋值)
|
||
// $htmlEntityMap = [
|
||
// '≤' => '≤', '≤' => '≤', '≤' => '≤',
|
||
// '≥' => '≥', '≥' => '≥', '≥' => '≥',
|
||
// '≠' => '≠', '≠' => '≠', '≠' => '≠',
|
||
// ];
|
||
// // 不间断空格替换数组
|
||
// $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)];
|
||
// // Unicode回调函数(预定义,避免循环内重复创建闭包)
|
||
// $unicodeCallback = function ($m) {
|
||
// return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
|
||
// };
|
||
|
||
// $original = $str;
|
||
// $depth = 0;
|
||
// $hasChange = false; // 标记是否有变化,提前终止循环
|
||
|
||
// // 循环解码:仅在有变化且未达最大深度时执行
|
||
// do {
|
||
// $depth++;
|
||
// $hasChange = false;
|
||
// $prevStr = $str; // 保存当前状态,用于判断变化
|
||
|
||
// // 1. 解码Unicode转义(\uXXXX格式)
|
||
// $str = $this->decodeUnicode($str);
|
||
|
||
// // 2. 解码HTML实体(先替换专属实体,再执行通用解码)
|
||
// $str = strtr($str, $htmlEntityMap); // 高性能替换(strtr比str_replace快)
|
||
// $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
|
||
|
||
// // 3. 再次处理遗漏的Unicode转义(使用预编译正则+预定义回调)
|
||
// $str = preg_replace_callback($regUnicode, $unicodeCallback, $str);
|
||
|
||
// // 4. 替换不间断空格为普通空格(strtr比str_replace更高效)
|
||
// $str = str_replace($nbspReplace, ' ', $str);
|
||
|
||
// // 5. 核心替换逻辑(优化执行顺序,避免覆盖)
|
||
// // 5.1 原有≥专属场景(保留)
|
||
// $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1);
|
||
// $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2);
|
||
// // 5.2 ≤、≠空格修复(保留)
|
||
// $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3);
|
||
// $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4);
|
||
// // 5.3 原有≥通用场景(保留)
|
||
// $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5);
|
||
// $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6);
|
||
// // 5.4 混合符号乱码还原(保留)
|
||
// $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7);
|
||
// $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8);
|
||
// // 5.5 ≤、≠专属标识还原(保留)
|
||
// $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9);
|
||
// $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10);
|
||
|
||
// // 5.6 修复前缀"d with "乱码(保留)
|
||
// $str = str_replace('d with ', 'd with ', $str, $count11);
|
||
|
||
// // 【性能优化3:统计所有替换次数,判断是否有变化】
|
||
// $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 +
|
||
// $count7 + $count8 + $count9 + $count10 + $count11;
|
||
// if ($totalCount > 0 || $str !== $prevStr) {
|
||
// $hasChange = true;
|
||
// $original = $str;
|
||
// }
|
||
|
||
// // 【性能优化4:提前终止】单次循环无变化,直接退出
|
||
// if (!$hasChange) {
|
||
// break;
|
||
// }
|
||
|
||
// } while ($depth < $maxDepth); // 改用do-while,减少循环判断次数
|
||
|
||
// // 最终清理:仅执行一次trim
|
||
// return trim($str, ':');
|
||
// }
|
||
// private function fullDecode($str, $maxDepth = 5) {
|
||
// if (empty($str) || $maxDepth <= 0) {
|
||
// return $str;
|
||
// }
|
||
|
||
// $original = $str;
|
||
// $depth = 0;
|
||
|
||
// // 循环解码,直到无变化或达到最大次数
|
||
// while (true) {
|
||
// $depth++;
|
||
// if ($depth > $maxDepth) {
|
||
// break; // 防止过度解码导致死循环
|
||
// }
|
||
|
||
// // 1. 解码 Unicode 转义(\uXXXX 格式)
|
||
// $str = $this->decodeUnicode($str);
|
||
|
||
// // 2. 解码 HTML 实体(&、'、< 等)
|
||
// $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');
|
||
|
||
// $str = preg_replace_callback('/\\\\u([0-9a-fA-F]{4})/', function ($m) {
|
||
// return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
|
||
// }, $str);
|
||
// $str = str_replace([chr(0xC2).chr(0xA0), chr(0xA0)], ' ', $str);
|
||
|
||
// // 2. 核心:强制匹配所有可能的乱码格式,还原≥
|
||
// // 匹配:0B?0、0B ?0、0B ?0(空格/制表符)→ 0B≥30
|
||
// $str = preg_replace('/0B\s*\?0/', '0B≥30', $str);
|
||
// // 匹配:DL?.18、DL ?.18、DL ?.18 → DL≥0.18
|
||
// $str = preg_replace('/DL\s*\?.18/', 'DL≥0.18', $str);
|
||
// // 通用匹配:数字前的?(如?30、?0.18)→ ≥30、≥0.18(防止其他变体)
|
||
// $str = preg_replace('/\?(\d+)/', '≥$1', $str);
|
||
// $str = preg_replace('/\?(\.\d+)/', '≥0$1', $str);
|
||
|
||
// // 3. 修复前缀的"d with "可能的乱码(若有)
|
||
// $str = str_replace('d with ', 'd with ', $str); // 若前缀也乱码,可同步替换
|
||
|
||
// // 若解码后无变化,退出循环
|
||
// if ($str === $original) {
|
||
// break;
|
||
// }
|
||
|
||
// $original = $str;
|
||
// }
|
||
|
||
// return trim($str,':');
|
||
// }
|
||
/**
 * Decode \uXXXX escape sequences into UTF-8 characters.
 *
 * Consecutive escapes are decoded together as UTF-16BE, so a surrogate pair
 * (two escapes describing one character outside the Basic Multilingual Plane)
 * becomes a single character. Decoding each escape separately as UCS-2 cannot
 * represent such characters.
 *
 * @param string $str Text that may contain \uXXXX escapes.
 * @return string|null Decoded text, or null when the underlying regex call fails.
 */
private function decodeUnicode($str) {
    return preg_replace_callback(
        '/(?:\\\\u[0-9a-fA-F]{4})+/',
        function ($matches) {
            // Drop the "\u" prefixes to get one hex string for the whole run,
            // then convert the resulting UTF-16BE bytes to UTF-8.
            $hex = str_replace('\\u', '', $matches[0]);
            return mb_convert_encoding(pack('H*', $hex), 'UTF-8', 'UTF-16BE');
        },
        $str
    );
}
|
||
/**
 * List the funding phrases that occur in the given text.
 *
 * Matching ignores case. Each hit is reported in the spelling used by the
 * phrase list, once, in the order of its first occurrence in the text.
 *
 * @param string $content Text to search.
 * @return array List of matched phrases; empty when nothing matches.
 */
private function getMatchedFundPhrases($content = '') {
    if (empty($content)) {
        return [];
    }

    // Known funding phrases
    $fundPhrases = [
        'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
        'Funding was provided by', 'Funded in part by','FUNDING:'
    ];

    // Build one alternation out of the quoted phrases, using # as delimiter.
    // The group is capturing so the concrete hit can be read back.
    $alternatives = [];
    foreach ($fundPhrases as $phrase) {
        $alternatives[] = preg_quote($phrase, '#');
    }
    $pattern = '#(' . implode('|', $alternatives) . ')#i';

    preg_match_all($pattern, $content, $hits);
    if (empty($hits[1])) {
        return [];
    }

    // Map every hit back to its list spelling. Array keys keep the
    // first-seen order and remove duplicates at the same time.
    $seen = [];
    foreach ($hits[1] as $hit) {
        foreach ($fundPhrases as $phrase) {
            if (strcasecmp($hit, $phrase) !== 0) {
                continue;
            }
            $seen[$phrase] = true;
            break;
        }
    }

    return array_keys($seen);
}
|
||
/**
 * Logging hook. Output is currently switched off, so calling this method
 * has no effect.
 *
 * @param string $msg Message that would be written.
 */
private function log($msg){
    // Disabled: echo date('[Y-m-d H:i:s] ') . $msg . "\n";
}
|
||
} |