5, 'msg' => '"文档不存在:{$filePath}"']);
    }
    // NOTE(review): the beginning of this constructor lies outside the visible chunk. The
    // statements below are reproduced unchanged, only re-flowed onto separate lines.
    try {
        // Key setting: turn off "read data only" so the full section structure is kept.
        $reader = IOFactory::createReader();
        $reader->setReadDataOnly(false);
        Settings::setCompatibility(false);
        Settings::setOutputEscapingEnabled(true); // avoid XML escaping conflicts
        $doc = $reader->load($filePath);
        $sectionCount = count($doc->getSections());
        // $this->log("Document loaded directly, section count: {$sectionCount}");
        $this->phpWord = $reader->load($filePath);
        $this->sections = $this->phpWord->getSections();
    } catch (\Exception $e) {
        // Pre-process: strip EMF images from the DOCX.
        $processedFilePath = $this->removeEmfFromDocx($filePath);
        // Load the processed document.
        $reader = IOFactory::createReader();
        $reader->setReadDataOnly(false);
        Settings::setCompatibility(false);
        Settings::setOutputEscapingEnabled(true);
        $this->phpWord = $reader->load($processedFilePath);
        $this->sections = $this->phpWord->getSections();
        // Optional: delete the temporary processed file (avoid clutter).
        unlink($processedFilePath);
        return json_encode(['status' => 5, 'msg' => $e->getMessage()]);
    }
}

/**
 * Build a copy of a DOCX archive with every *.emf file removed.
 *
 * The archive is unpacked into a uniquely named "docx_temp_*" directory under
 * ROOT_PATH/runtime, EMF files are deleted, and what is left is zipped again next to
 * that directory. The unpack directory is removed before returning, also when an
 * exception is thrown part-way.
 *
 * @param string $docxPath Path of the original DOCX file.
 * @return string Path of the processed temporary DOCX (the caller deletes it).
 * @throws \Exception When the archive cannot be opened, unpacked or re-created.
 */
private function removeEmfFromDocx($docxPath)
{
    $zip = new ZipArchive();
    if ($zip->open($docxPath) !== true) {
        throw new \Exception("无法打开 DOCX 文件:{$docxPath}");
    }

    // 1. Create the scratch directory that receives the unpacked archive.
    $tempDir = rtrim(ROOT_PATH, '/') . '/runtime/' . uniqid('docx_temp_');
    if (!mkdir($tempDir, 0700, true) && !is_dir($tempDir)) {
        $zip->close();
        throw new \Exception("无法创建临时目录:{$tempDir}");
    }

    try {
        // 2. Unpack the DOCX.
        $extracted = $zip->extractTo($tempDir);
        $zip->close();
        if (!$extracted) {
            throw new \Exception("无法解压 DOCX 文件:{$docxPath}");
        }

        // 3. Collect the EMF files first, then delete them, so the directory is not
        //    modified while it is being iterated.
        $walker = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($tempDir, RecursiveDirectoryIterator::SKIP_DOTS)
        );
        $emfFiles = [];
        foreach ($walker as $file) {
            $path = $file->getPathname();
            if ($file->isFile() && strtolower(pathinfo($path, PATHINFO_EXTENSION)) === 'emf') {
                $emfFiles[] = $path;
            }
        }
        foreach ($emfFiles as $emfFile) {
            unlink($emfFile);
        }

        // 4. Re-pack. Entry names are taken relative to $tempDir so the package layout
        //    ([Content_Types].xml at the root, word/..., etc.) is preserved.
        $processedPath = $tempDir . '_processed.docx';
        $newZip = new ZipArchive();
        if ($newZip->open($processedPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
            throw new \Exception("无法创建处理后的 DOCX 文件");
        }
        $this->addFilesToZip($tempDir, $newZip, $tempDir);
        // The files are only read when the archive is closed, so close() must succeed
        // before the scratch directory is removed.
        if (!$newZip->close()) {
            throw new \Exception("无法创建处理后的 DOCX 文件");
        }

        return $processedPath;
    } finally {
        // 5. Always remove the scratch directory.
        $this->deleteDir($tempDir);
    }
}

/**
 * Recursively add every file below $dir to an open ZipArchive.
 *
 * @param string      $dir     Directory whose files are added.
 * @param ZipArchive  $zip     Open archive receiving the files.
 * @param string|null $baseDir Directory that corresponds to the archive root; entry names
 *                             are paths relative to it. Defaults to $dir.
 * @throws \Exception When a directory cannot be listed.
 */
private function addFilesToZip($dir, $zip, $baseDir = null)
{
    $dir = rtrim($dir, '/');
    $baseDir = $baseDir === null ? $dir : rtrim($baseDir, '/');

    $entries = scandir($dir);
    if ($entries === false) {
        throw new \Exception("无法读取目录:{$dir}");
    }

    foreach ($entries as $entry) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }
        $path = $dir . '/' . $entry;
        if (is_dir($path)) {
            $this->addFilesToZip($path, $zip, $baseDir);
            continue;
        }
        // Strip only the leading base path. Deriving the name from dirname($dir) lost
        // the first path component for files nested more than one level deep and
        // prefixed root files with the scratch directory name.
        $entryName = ltrim(substr($path, strlen($baseDir)), '/');
        $zip->addFile($path, $entryName);
    }
}

/**
 * Delete one of this class's "docx_temp_*" scratch directories, including its content.
 *
 * The directory is only touched when its name starts with "docx_temp_" and its resolved
 * path lies inside ROOT_PATH/runtime.
 *
 * @param string $dir Directory to delete.
 * @return bool True when the directory and everything in it were removed.
 */
private function deleteDir($dir)
{
    // 1. Basic validation: non-empty string naming an existing directory.
    if (!is_string($dir) || trim($dir) === '' || !is_dir($dir)) {
        return false;
    }

    // 2. Only directories created by removeEmfFromDocx() are eligible.
    $dir = rtrim($dir, '/' . DIRECTORY_SEPARATOR);
    if (strpos(basename($dir), 'docx_temp_') !== 0) {
        return false;
    }

    // 3. The resolved path must be a real child of the runtime directory. The trailing
    //    separator stops a sibling such as ".../runtime_old" from passing the check.
    $realDir = realpath($dir);
    $realRuntimeDir = realpath(rtrim(ROOT_PATH, '/') . '/runtime');
    if ($realDir === false
        || $realRuntimeDir === false
        || strpos($realDir, $realRuntimeDir . DIRECTORY_SEPARATOR) !== 0
        || strpos(basename($realDir), 'docx_temp_') !== 0
    ) {
        return false;
    }

    // 4. The prefix rule applies to the top-level directory only. Its sub-directories
    //    ("word", "_rels", ...) never carry the prefix, so they are removed by a helper
    //    that does not re-check it.
    return $this->removeTree($realDir);
}

/**
 * Best-effort recursive removal of a directory that deleteDir() has already validated.
 *
 * Symbolic links are unlinked, never followed, so nothing outside $path is touched.
 *
 * @param string $path Absolute directory path.
 * @return bool True when $path and all of its entries were removed.
 */
private function removeTree($path)
{
    $entries = @scandir($path);
    if ($entries === false) {
        return false;
    }

    $allRemoved = true;
    foreach ($entries as $entry) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }
        $child = $path . DIRECTORY_SEPARATOR . $entry;
        if (is_dir($child) && !is_link($child)) {
            $removed = $this->removeTree($child);
        } else {
            $removed = @unlink($child);
        }
        if (!$removed) {
            $allRemoved = false;
        }
    }

    // rmdir() fails on a non-empty directory, which is the desired outcome when an
    // entry could not be removed.
    return @rmdir($path) && $allRemoved;
}

/**
 * Entry point: parse an uploaded manuscript and return its metadata as a JSON string.
 *
 * Status codes in the result: 1 success, 2 no file given, 3 file missing,
 * 4 file unreadable, 5 title could not be determined.
 *
 * @param string $sFileUrl Path of the uploaded document.
 * @return string JSON object with "status", "msg" and, on success, "data".
 */
public static function uploadAndParse($sFileUrl)
{
    // Required-value check.
    if (empty($sFileUrl)) {
        return json_encode(['status' => 2, 'msg' => 'Please upload the submission file']);
    }
    // The file must exist and be readable.
    if (!file_exists($sFileUrl)) {
        return json_encode(['status' => 3, 'msg' => 'The uploaded file does not exist']);
    }
    if (!is_readable($sFileUrl)) {
        return json_encode(['status' => 4, 'msg' => 'The uploaded file is unreadable']);
    }

    // Parse the document.
    $oDealFile = new self($sFileUrl);

    // Title.
    $sTitle = $oDealFile->getTitle();
    if (empty($sTitle)) {
        return json_encode(['status' => 5, 'msg' => 'Article title retrieval failed']);
    }

    // Authors: "author" holds every author, "report" the corresponding-author names.
    $aAuthor = $oDealFile->getAuthors(['title' => $sTitle]);
    $aParam = [
        'title' => $sTitle,
        'author' => empty($aAuthor['author']) ? [] : $aAuthor['author'],
        'report' => empty($aAuthor['report']) ? [] : $aAuthor['report'],
    ];

    // Affiliations, then corresponding-author contact details.
    $aParam['company'] = $oDealFile->getCompany($aParam);
    $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam);

    // Abstract, keywords and funding. Keys already present in $aParam win.
    $aContent = $oDealFile->extractFromWord();
    $aParam += empty($aContent['data']) ? [] : $aContent['data'];

    return json_encode(['status' => 1, 'msg' => 'success', 'data' => $aParam]);
}

/**
 * Return the article title: the text of the first element longer than ten characters.
 *
 * @return string Trimmed title, or '' when no sections are loaded or nothing qualifies.
 */
private function getTitle()
{
    if (empty($this->sections)) {
        return '';
    }

    $title = '';
    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $candidate = trim($this->getTextFromElement($element));
            // Titles are expected to be reasonably long; the first match wins.
            if (mb_strlen($candidate) > 10) {
                $title = $candidate;
                break 2;
            }
        }
    }

    if ($title !== '' && !mb_check_encoding($title, 'UTF-8')) {
        $title = mb_convert_encoding($title, 'UTF-8', 'GBK');
    }

    return $title;
}
// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#" // $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分) // //标记上标内的逗号+空格(多编号) // $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content); // // 原有步骤2:正则匹配(扩展上标符号支持,保持原有逻辑) // $pattern = '/ // ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格) // \s* # 姓名与上标间空格 // ( # 上标组(扩展符号支持) // \d+ # 起始数字 // (?:[†#*,]|\d+)* # 允许:†#*符号、逗号、+数字(兼容1,†、1,*等) // ) // \s*,? # 作者间逗号(可选) // (?=\s|$) # 确保后面是空格或结尾 // /ux'; // preg_match_all($pattern, $tempStr, $matches); // $authorList = []; // if(!empty($matches[1])){ // foreach ($matches[1] as $i => $name) { // $name = trim($name); // $superscript = trim($matches[2][$i]); // $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号 // $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号 // // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样) // $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript); // if (!empty($name)) { // $authorList[] = [ // 'name' => $name, // 'superscript' => $superscript // ]; // } // } // }else { // // 按“两个或多个连续空格”拆分(姓名之间的分隔) // $authorList = array_filter( // array_map('trim', // preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent) // ) // ); // } // // //处理作者 // // $authorList = []; // // // 新正则:匹配“姓名+上标”整体,允许上标含逗号(如1,†) // // // 逻辑:姓名以字母/中文开头,上标以数字开头、以符号/数字结尾 // // // if (preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*([\d,†#*]+)/u', $sAuthorContent, $matches)) { // // if(preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*(\d[\d,†#\s*]*)/u', $sAuthorContent, $matches)){ // // for ($i = 0; $i < count($matches[1]); $i++) { // // $authorList[] = trim($matches[1][$i] . 
$matches[2][$i]); // // } // // } else { // // // 按“两个或多个连续空格”拆分(姓名之间的分隔) // // $authorList = array_filter( // // array_map('trim', // // preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent) // // ) // // ); // // } // $aAuthorData = []; // $aReport = []; // $namePattern = '/ // (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符) // [\x{4e00}-\x{9fa5}]+| # 中文姓名 // [\x{1800}-\x{18AF}]+| # 蒙古文姓名 // [A-Z]\.) # 单字母缩写(如 J.) // /ux'; // var_dump($authorList);exit; // foreach ($authorList as $authorStr) { // if (empty($authorStr)) continue; // var_dump($authorList);exit; // //分离姓名与上标(支持上标含逗号,如1,†) // $superscript = ''; // // 新正则:匹配以数字开头、含逗号/符号的完整上标(如1,†、2*#) // $authorStr = trim(trim($authorStr,','),' '); // // if (preg_match('/([\d,†#*]+)$/u', $authorStr, $supMatch)) { // // if(preg_match('/\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)){ // // if (preg_match('/.*?\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)) { // // if (preg_match('/.*?\s*([\d,\x{2020}#* ]+?)\s*$/u', $authorStr, $supMatch)) { // // if (preg_match('/^(.+?)\D*?(\d[\d,#*†,\s]*)$/u', $authorStr, $supMatch)) { // // $superscript = $supMatch[1]; // // // 移除上标,保留纯姓名(避免残留符号) // // $nameStr = trim(preg_replace('/' . preg_quote($superscript, '/') . '$/', '', $authorStr)); // // } else { // // $nameStr = $authorStr; // // } // $pattern = '/^(.+?)\s*(\d[\d,#*†\s]*?)\s*$/u'; // if (preg_match($pattern, $authorStr, $supMatch)) { // $nameStr = empty($supMatch[1]) ? '' : trim($supMatch[1]); // 姓名部分:"Liguo Zhang" // $superscript = empty($supMatch[2]) ? 
/**
 * Split an author line such as "Alice Smith 1,2*, Bob Lee 2#" into author records.
 *
 * The line is normalised and then scanned one character at a time: letters, spaces,
 * dots, hyphens and apostrophes build the name; digits, commas and the marks # * † ‡ §
 * build the superscript. A comma followed by a space, or a name character following a
 * superscript, ends the current author.
 *
 * Despite the historical name, small regular expressions are used for normalisation;
 * the author splitting itself is a plain character scan.
 *
 * @param string $str Raw author line.
 * @return array List of ['name' => string, 'superscript' => string,
 *               'company_id' => int[], 'is_super' => 0|1, 'is_report' => 0|1].
 *               Entries without a name are not emitted.
 */
private function parseAuthorsWithoutRegex($str = '')
{
    if (empty($str)) {
        return [];
    }

    // Only convert when the input is not already valid UTF-8.
    if (!mb_check_encoding($str, 'UTF-8')) {
        $str = mb_convert_encoding($str, 'UTF-8', 'auto');
    }

    // Normalise look-alike characters. The full-width forms are written as escapes so
    // they cannot be collapsed into their ASCII twins. An ASCII ',' in this list would
    // strip every separator the scanner below depends on.
    $str = strtr($str, [
        "\u{00A0}" => ' ',          // no-break space
        "\u{00EF}\u{00BC}" => ' ',  // "ï¼": leading bytes of a mis-decoded full-width sign
        "\u{FFFD}" => ' ',          // replacement character
        "\u{FF0C}" => ',',          // full-width comma
        "\u{FF10}" => '0', "\u{FF11}" => '1', "\u{FF12}" => '2', "\u{FF13}" => '3',
        "\u{FF14}" => '4', "\u{FF15}" => '5', "\u{FF16}" => '6', "\u{FF17}" => '7',
        "\u{FF18}" => '8', "\u{FF19}" => '9',
    ]);
    $str = str_replace([' and ', ' AND ', ' And '], ', ', $str);

    // Collapse runs of spaces, then glue superscripts together:
    // "2, 3" -> "2,3" and "1 *" -> "1*".
    $str = preg_replace('/ {2,}/', ' ', $str);
    $str = preg_replace('/(?<=[0-9]), (?=[0-9])/', ',', $str);
    $glued = preg_replace('/(?<=[0-9]) (?=[#*\x{2020}\x{2021}\x{00A7}])/u', '', $str);
    $str = trim($glued === null ? $str : $glued);

    // Split once into characters instead of calling mb_substr() per position.
    $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
    if ($chars === false) {
        $chars = str_split($str);
    }
    $total = count($chars);

    $isNameChar = static function ($char) {
        // Any Unicode letter or combining mark, plus the punctuation found in names.
        return preg_match('/^[\p{L}\p{M} .\'\x{2019}\-]$/u', $char) === 1;
    };
    $markChars = ['#', '*', '†', ',', '‡', '§'];

    $authors = [];
    $name = '';
    $marks = '';
    $inName = true;

    // Store the author collected so far (if it has a name) and reset the buffers.
    $flush = static function () use (&$authors, &$name, &$marks) {
        $cleanName = trim($name);
        if ($cleanName !== '') {
            $authors[] = [
                'name' => $cleanName,
                'superscript' => trim(rtrim($marks, ',')),
            ];
        }
        $name = '';
        $marks = '';
    };

    for ($i = 0; $i < $total; $i++) {
        $char = $chars[$i];

        // "comma + space" separates two authors.
        if ($char === ',' && $i + 1 < $total && $chars[$i + 1] === ' ') {
            $flush();
            $inName = true;
            $i++;
            continue;
        }

        if ($isNameChar($char)) {
            if (!$inName) {
                // A name character after a superscript starts the next author.
                $flush();
                $inName = true;
            }
            $name .= $char;
        } elseif (ctype_digit($char) || in_array($char, $markChars, true)) {
            $inName = false;
            $marks .= $char;
        }
        // Every other character is ignored.
    }
    $flush();

    // Derive affiliation numbers and role flags from each superscript.
    $result = [];
    foreach ($authors as $author) {
        $superscript = $author['superscript'];
        preg_match_all('/[0-9]+/', $superscript, $numbers);
        $author['company_id'] = array_values(array_unique(array_map('intval', $numbers[0])));
        // "#" marks a first author; "*" or "†" marks a corresponding author.
        $author['is_super'] = strpos($superscript, '#') !== false ? 1 : 0;
        $author['is_report'] = (strpos($superscript, '*') !== false
            || strpos($superscript, '†') !== false) ? 1 : 0;
        $result[] = $author;
    }

    return $result;
}

/**
 * Extract the authors from the paragraph that follows the title.
 *
 * @param array $aParam Optional 'title' => string; when absent the title is looked up.
 * @return array ['author' => list of author records (see parseAuthorsWithoutRegex()),
 *                'report' => names of the corresponding authors].
 */
private function getAuthors($aParam = [])
{
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    if (empty($sAuthorContent)) {
        return ['author' => [], 'report' => []];
    }

    // Encoding repair, only for text that is not valid UTF-8. Converting valid UTF-8
    // "from Windows-1252" double-encodes it, and an encoding list containing a name
    // mbstring does not know raises a ValueError on PHP 8.
    if (!mb_check_encoding($sAuthorContent, 'UTF-8')) {
        $detected = mb_detect_encoding($sAuthorContent, ['GBK', 'Windows-1252'], true);
        $sAuthorContent = mb_convert_encoding(
            $sAuthorContent,
            'UTF-8',
            $detected !== false ? $detected : 'Windows-1252'
        );
    }

    // Remove control and zero-width characters.
    $stripped = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
    if ($stripped !== null) {
        $sAuthorContent = $stripped;
    }

    // Repair known garbled forms of the dagger and normalise punctuation. The garbled
    // sequences are spelled as escapes; a bare "â" is deliberately not mapped because it
    // is a legitimate letter in names.
    $symbolMap = [
        "\u{00E2}\u{20AC}\u{00A0}" => '†', // dagger read as Windows-1252
        "\u{00E2}\u{20AC}" => '†',
        "\u{00E2}\u{0080}\u{00A0}" => '†', // dagger read as ISO-8859-1
        "\u{00E2}\u{0080}" => '†',
        '?†' => '†',
        "\u{FF1A}" => ':',                 // full-width colon
        "\u{FF0C}" => ',',                 // full-width comma
        '—' => '-',
        '啊' => '',                         // known stray character from a bad conversion
    ];
    $sAuthorContent = strtr($sAuthorContent, $symbolMap);

    // Unify separators: full-width comma/semicolon, semicolon and the enumeration comma.
    $sAuthorContent = str_replace(["\u{FF0C}", "\u{FF1B}", ';', "\u{3001}"], ',', $sAuthorContent);
    $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent);
    $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent);
    $sAuthorContent = trim($sAuthorContent);

    $aAuthor = $this->parseAuthorsWithoutRegex($sAuthorContent);
    if (empty($aAuthor)) {
        return ['author' => [], 'report' => []];
    }

    $aAuthorData = [];
    $aReport = [];
    foreach ($aAuthor as $record) {
        if (empty($record['name']) && empty($record['superscript'])) {
            continue;
        }
        if (!mb_check_encoding($record['name'], 'UTF-8')) {
            $record['name'] = mb_convert_encoding($record['name'], 'UTF-8', 'GBK');
        }
        if (!empty($record['name']) && !empty($record['is_report']) && $record['is_report'] == 1) {
            $aReport[] = $record['name'];
        }
        $aAuthorData[] = $record;
    }

    return ['author' => $aAuthorData, 'report' => array_unique($aReport)];
}
$this->getTitle() : $aParam['title']; // $sAuthorContent = $this->getNextParagraphAfterText($title); // if (empty($sAuthorContent)) { // return ['author' => [], 'report' => []]; // } // //编码修复 // $possibleEncodings = [ // 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', // 'Latin-1', 'ISO-8859-1', 'CP1252' // ]; // $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings)); // $sAuthorContent = $encodedContent ?: $sAuthorContent; // //清理不可见字符 // $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent); // //修复特殊符号乱码 // $symbolMap = [ // '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†', // ':' => ':', ',' => ',', '—' => '-', // '啊' => '' // 针对性移除异常字符“啊”(若为固定乱码) // ]; // $sAuthorContent = strtr($sAuthorContent, $symbolMap); // //格式标准化 // $sAuthorContent = str_replace([',', ';', ';', '、'], ',', $sAuthorContent); // 统一分隔符 // $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号 // $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格 // $sAuthorContent = trim($sAuthorContent); // var_dump($this->parseAuthorsWithoutRegex($sAuthorContent));exit; // // 关键预处理:兼容"and"分隔符、清理乱码、统一空格 // $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // $content = str_replace(["\xC2\xA0", 'ï¼', '�', ','], ' ', $content); // 清理乱码和全角符号 // $content = preg_replace('/\band\b/i', ',', $content); // 将 "and" 转为逗号(统一分隔符) // $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并数字与符号间的空格(如"1 *"→"1*") // $content = trim(preg_replace('/\s+/', ' ', $content)); // 合并连续空格 // // 标记上标内的逗号(多编号处理) // $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content); // // 核心正则(保持原有结构,扩展符号支持) // $pattern = '/ // ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格、连字符) // \s* # 姓名与上标间的空格(允许0或多个) // ( # 上标组(扩展兼容所有符号) // \d+ # 起始数字(至少1个数字) // (?:[†#*,]|\d+)* # 允许:符号(†#*)、逗号、+数字(多编号) // ) // \s*,? 
# 作者间的逗号(可选,允许逗号前有空格) // (?=\s|$) # 确保后面是空格或字符串结尾(避免跨作者匹配) // /ux'; // preg_match_all($pattern, $tempStr, $matches); // // 解析结果并格式化 // $authorList = []; // if (!empty($matches[1])) { // foreach ($matches[1] as $i => $name) { // $name = trim($name); // $superscript = trim($matches[2][$i]); // $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号 // $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾多余逗号 // if (!empty($name)) { // $authorList[] = [ // 'name' => $name, // 'superscript' => $superscript // ]; // } // } // } // // 输出结果 // echo "
";
// print_r($authorList);
// echo "
"; // exit; // // 处理作者 // $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确 // $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格 // $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#" // $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分) // //标记上标内的逗号+空格(多编号) // $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content); // // 原有步骤2:正则匹配(扩展上标符号支持,保持原有逻辑) // $pattern = '/ // ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格) // \s* # 姓名与上标间空格 // ( # 上标组(扩展符号支持) // \d+ # 起始数字 // (?:[†#*,]|\d+)* # 允许:†#*符号、逗号、+数字(兼容1,†、1,*等) // ) // \s*,? # 作者间逗号(可选) // (?=\s|$) # 确保后面是空格或结尾 // /ux'; // preg_match_all($pattern, $tempStr, $matches); // var_dump($matches);exit; // $authorList = []; // if(!empty($matches[1])){ // foreach ($matches[1] as $i => $name) { // $name = trim($name); // $superscript = trim($matches[2][$i]); // $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号 // $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号 // // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样) // $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript); // if (!empty($name)) { // $authorList[] = [ // 'name' => $name, // 'superscript' => $superscript // ]; // } // } // }else { // // 按“两个或多个连续空格”拆分(姓名之间的分隔) // $authorList = array_filter( // array_map('trim', // preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent) // ) // ); // } // // //处理作者 // $aAuthorData = []; // $aReport = []; // $namePattern = '/ // (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符) // [\x{4e00}-\x{9fa5}]+| # 中文姓名 // [\x{1800}-\x{18AF}]+| # 蒙古文姓名 // [A-Z]\.) # 单字母缩写(如 J.) // /ux'; // foreach ($authorList as $authorStr){ // if (empty($authorStr)) continue; // //获取下标 // $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript']; // $nameStr = empty($authorStr['name']) ? 
/**
 * Return $text as valid UTF-8.
 *
 * Text that already is valid UTF-8 is returned untouched; anything else is converted
 * from GBK when it validates as GBK, otherwise from Windows-1252.
 *
 * @param mixed $text Value to normalise (cast to string).
 * @return string
 */
private function normalizeToUtf8($text)
{
    $text = (string) $text;
    if ($text === '' || mb_check_encoding($text, 'UTF-8')) {
        return $text;
    }
    $source = mb_detect_encoding($text, ['GBK', 'Windows-1252'], true);

    return mb_convert_encoding($text, 'UTF-8', $source !== false ? $source : 'Windows-1252');
}

/**
 * Extract the numbered affiliation list that follows the author paragraph.
 *
 * @param array $aParam Optional 'title' and 'authors' (raw author paragraph text).
 * @return array Affiliation text keyed by its leading number.
 */
private function getCompany($aParam = [])
{
    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    // Author paragraph: taken from the caller when given, otherwise looked up.
    $sAuthorContent = empty($aParam['authors'])
        ? $this->getNextParagraphAfterText($title)
        : $aParam['authors'];

    $allLines = $this->getContentAfterText($sAuthorContent, 1);
    if (empty($allLines)) {
        return [];
    }

    // Group by leading number; lines without one continue the previous entry.
    $grouped = [];
    $currentNumber = null;
    foreach ($allLines as $line) {
        $line = trim($line);
        if ($line === '') {
            continue;
        }

        $digits = strspn($line, '0123456789');
        if ($digits > 0) {
            $currentNumber = substr($line, 0, $digits);
            // Skip the '.', '*' and spaces that may follow the number.
            $grouped[$currentNumber] = trim(ltrim(substr($line, $digits), '.* '));
            continue;
        }

        if ($currentNumber !== null) {
            $grouped[$currentNumber] .= ' ' . $line;
        }
    }

    $aCompany = [];
    foreach ($grouped as $number => $institution) {
        $institution = $this->normalizeToUtf8($institution);
        $institution = preg_replace('/\s+/', ' ', $institution);
        $institution = rtrim($institution, '.');
        $institution = preg_replace('/^\d+\s+/', '', $institution);
        $institution = trim($institution);

        // Keep the part up to "..., City 12345, Country" when the entry has that shape.
        if (preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $shape)) {
            $institution = trim($shape[1]);
        }
        // Drop a trailing "*Email ..." note.
        if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $beforeEmail)) {
            $institution = trim($beforeEmail[1]);
        }

        $aCompany[$number] = $institution;
    }

    return $aCompany;
}

/**
 * Extract contact details (e-mail, postal address, phone) of the corresponding authors.
 *
 * @param array $aParam 'report' => corresponding-author names; optional 'title'.
 * @return array List of ['name', 'email', 'postal_address', 'tel'].
 */
private function getCorrespondingAuthors($aParam = [])
{
    $aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
    if (empty($aCorrespondingAuthor)) {
        return [];
    }

    $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
    $sAuthorContent = $this->getNextParagraphAfterText($title);
    // Raw text of the paragraph that holds the affiliations.
    $sCompany = $this->getNextParagraphAfterText($sAuthorContent);
    if (empty($sCompany)) {
        // Fallback: join the parsed affiliation names.
        $sCompany = implode(' ', array_values($this->getCompany($aParam)));
    }

    // Everything after the affiliations.
    $corrText = $this->normalizeToUtf8($this->getContentAfterText($sCompany));

    // Normalise full-width ':' and '@' and exotic spaces, then collapse whitespace.
    $corrText = str_replace(["\u{FF1A}", "\u{FF20}"], [':', '@'], $corrText);
    $corrText = str_replace(["\u{00A0}", "\u{3000}"], ' ', $corrText);
    $corrText = preg_replace('/\s+/', ' ', $corrText);

    // One block per "*" marker.
    $corrBlocks = array_filter(array_map('trim', preg_split('/\s*\*\s*/', $corrText)));

    $aCorresponding = [];
    foreach ($corrBlocks as $block) {
        $sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
        if (empty($sName)) {
            continue;
        }
        preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
        preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
        preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
        $aCorresponding[] = [
            'name' => $sName,
            'email' => isset($email[2]) ? trim($email[2]) : '',
            'postal_address' => isset($address[2]) ? trim($address[2]) : '',
            'tel' => isset($tel[2]) ? trim($tel[2]) : '',
        ];
    }

    if (empty($aCorresponding)) {
        // Fallback layout: "Corresponding Authors: Name, E-mail: x@y; ..."
        preg_match('/Corresponding Authors: (.*?)(?=$|;)/s', $corrText, $match);
        if (!empty($match[1])) {
            preg_match_all('/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/', $match[1], $authors);
            foreach ($authors[1] as $i => $rawName) {
                // Same keys as the primary path so consumers see one record shape.
                $aCorresponding[] = [
                    'name' => trim($rawName),
                    'email' => isset($authors[2][$i]) ? trim($authors[2][$i]) : '',
                    'postal_address' => '',
                    'tel' => '',
                ];
            }
        }
    }

    return $aCorresponding;
}

/**
 * Find which corresponding-author name occurs in a text block.
 *
 * Matching ignores case and also tries the name with its words in reverse order.
 *
 * @param string $block     Text to search.
 * @param array  $corrNames Candidate names.
 * @return string The matching name as given in $corrNames, or '' when none occurs.
 */
private function matchCorrespondingName($block, $corrNames)
{
    $haystack = mb_strtolower((string) $block, 'UTF-8');
    foreach ($corrNames as $name) {
        $needle = mb_strtolower(trim((string) $name), 'UTF-8');
        if ($needle === '') {
            continue; // an empty needle would match every block
        }
        if (strpos($haystack, $needle) !== false) {
            return $name;
        }
        $parts = explode(' ', $needle);
        if (count($parts) >= 2
            && strpos($haystack, implode(' ', array_reverse($parts))) !== false
        ) {
            return $name;
        }
    }

    return '';
}

/**
 * Return the text of the first non-empty element after the one containing $targetText.
 *
 * @param string $targetText Text to look for (case-insensitive substring match).
 * @return string Text of the following element, or '' when there is none.
 */
private function getNextParagraphAfterText($targetText)
{
    $targetText = (string) $targetText;
    // An empty needle matches everywhere on PHP 8 and raises a warning before that.
    if ($targetText === '' || empty($this->sections)) {
        return '';
    }

    $found = false;
    foreach ($this->sections as $section) {
        foreach ($section->getElements() as $element) {
            $text = $this->getTextFromElement($element);
            if (empty($text)) {
                continue;
            }
            if ($found) {
                return $text;
            }
            if (stripos($text, $targetText) !== false) {
                $found = true;
            }
        }
    }

    return '';
}

/**
 * Collect the element texts that follow $targetText, up to a stop keyword.
 *
 * The target is located by fuzzy comparison: after removing everything except ASCII
 * letters and digits, more than half of the target must be shared with an element.
 * A target without ASCII letters or digits is located by plain substring match.
 * Collection stops at 200 lines or at the first element containing a stop keyword.
 *
 * @param string $targetText  Text after which collection starts.
 * @param int    $return_type 1 returns the lines as an array; any other value returns
 *                            them joined with "\n".
 * @return array|string
 */
private function getContentAfterText($targetText, $return_type = 2)
{
    $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract', 'ABSTRACT'];
    $maxLines = 200;

    // Both forms of the target are loop-invariant.
    $plainTarget = trim((string) $targetText);
    $cleanTarget = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($plainTarget));

    $found = false;
    $content = [];
    $lineNumber = 0;
    $sections = empty($this->sections) ? [] : $this->sections;

    foreach ($sections as $section) {
        foreach ($section->getElements() as $element) {
            $lineNumber++;
            if (count($content) >= $maxLines) {
                break 2;
            }

            $text = trim($this->getTextFromElement($element, $lineNumber));
            if ($text === '') {
                continue;
            }

            if (!$found) {
                if ($cleanTarget !== '') {
                    $cleanText = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($text));
                    $found = similar_text($cleanText, $cleanTarget) / strlen($cleanTarget) > 0.5;
                } elseif ($plainTarget !== '') {
                    $found = mb_stripos($text, $plainTarget) !== false;
                }
                continue; // the matching element itself is not part of the result
            }

            foreach ($stopKeywords as $keyword) {
                if (stripos($text, $keyword) !== false) {
                    break 3;
                }
            }

            $content[] = $text;
        }
    }

    if ($return_type == 1) {
        return $content;
    }

    return $this->normalizeToUtf8(implode("\n", $content));
}

/**
 * Extract the visible text of a PhpWord element, recursing into containers.
 *
 * @param object $element    PhpWord element.
 * @param int    $lineNumber Position of the element; accepted for callers, not used.
 * @return string Text with control characters replaced and whitespace collapsed.
 */
private function getTextFromElement($element, $lineNumber = 0)
{
    $text = '';

    // PreserveText: field code fragments; e-mail addresses hide in HYPERLINK parts.
    if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
        foreach ((array) $element->getText() as $part) {
            if (strpos($part, 'HYPERLINK') !== false) {
                // Decode HTML entities (e.g. &quot;) before looking for the address.
                $decoded = html_entity_decode($part);
                if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
                    $text .= $match[1] . ' ';
                }
            } else {
                $text .= $part;
            }
        }

        return $text;
    }

    // Tables and cells (e-mail addresses may sit in a table).
    if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
        foreach ($element->getRows() as $row) {
            foreach ($row->getCells() as $cell) {
                $text .= $this->getTextFromElement($cell);
            }
        }

        return $text;
    }
    if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }

        return $text;
    }

    // Generic containers.
    if (method_exists($element, 'getElements')) {
        foreach ($element->getElements() as $child) {
            $text .= $this->getTextFromElement($child);
        }
    }

    // Plain and formatted text.
    if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
        $text .= $element->getText();
    }

    // Hyperlinks: the target may be the e-mail address.
    if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
        $target = $element->getTarget();
        if (strpos($target, 'mailto:') === 0) {
            $text .= str_replace('mailto:', '', $target) . ' ';
        }
        $text .= $element->getText() . ' ';
    }

    // Fields and notes may carry a hidden address. The accessor is probed because not
    // every element class is guaranteed to offer getContent().
    if ($element instanceof \PhpOffice\PhpWord\Element\Field
        || $element instanceof \PhpOffice\PhpWord\Element\Note
    ) {
        if (method_exists($element, 'getContent')) {
            $text .= $element->getContent() . ' ';
        } elseif (method_exists($element, 'getText') && is_string($element->getText())) {
            $text .= $element->getText() . ' ';
        }
    }

    // Clean-up. The text is made valid UTF-8 first and the control-character class is
    // matched with the "u" modifier. Without it the range \x7F-\x9F is applied to single
    // bytes and wipes out UTF-8 continuation bytes (the dagger, curly quotes, many CJK
    // characters), leaving invalid text behind.
    $text = $this->normalizeToUtf8($text);
    $cleaned = preg_replace('/[\x{00}-\x{1F}\x{7F}-\x{9F}]/u', ' ', $text);
    if ($cleaned === null) {
        // Still not decodable as UTF-8: only touch the ASCII control characters.
        $cleaned = preg_replace('/[\x00-\x1F\x7F]/', ' ', $text);
    }

    return preg_replace('/\s+/', ' ', $cleaned);
}

/**
 * Turn the raw text after a "Keywords" label into a clean list of keywords.
 *
 * @param string $raw Text following the label.
 * @return array List of non-empty keywords, indexed from 0.
 */
private function splitKeywords($raw)
{
    // Drop the marker this class inserts, then the colon that follows the label.
    $raw = str_replace('Keywords-End-Flag', '', (string) $raw);
    $withoutColon = preg_replace('/^[\s:\x{FF1A}]+/u', '', $raw);
    if ($withoutColon !== null) {
        $raw = $withoutColon;
    }
    $raw = trim(rtrim(trim($raw), ';,. '));
    if ($raw === '') {
        return [];
    }

    $parts = array_map('trim', preg_split('/[,;]\s*/', $raw));

    // Re-index so the list is encoded as a JSON array even when items were dropped.
    return array_values(array_filter($parts, static function ($item) {
        return $item !== '';
    }));
}

/**
 * Extract the abstract, the keywords and the funding statement from the document.
 *
 * @return array ['status' => 1, 'msg' => string, 'data' => ['abstrart' => string,
 *               'abstract' => string, 'keywords' => string[], 'fund' => string]].
 *               'abstrart' is the historical (misspelled) key and is kept for existing
 *               consumers; 'abstract' carries the same value.
 */
public function extractFromWord()
{
    $sContent = '';
    $sFundContent = '';
    $sections = empty($this->sections) ? [] : $this->sections;

    foreach ($sections as $section) {
        foreach ($section->getElements() as $element) {
            $textContent = $this->getTextFromElement($element);
            if (empty($textContent)) {
                continue;
            }

            $sContent .= $this->normalizeToUtf8($textContent);
            // Mark the end of the keywords paragraph so it can be cut out later.
            if (stripos($textContent, 'Keywords:') !== false) {
                $sContent .= "Keywords-End-Flag";
            }

            // Funding: the text that follows the first funding phrase seen so far.
            if (empty($sFundContent)) {
                $aFund = $this->getMatchedFundPhrases($sContent);
                if (!empty($aFund[0])) {
                    $position = stripos($sContent, $aFund[0]);
                    $sFundContent = substr($sContent, $position);
                    $sFundContent = str_ireplace($aFund[0], '', $sFundContent);
                    $sFundContent = trim(str_replace('Keywords-End-Flag', '', $sFundContent));
                    if (preg_match('/^(.*?)Peer review/', $sFundContent, $matches)) {
                        $sFundContent = $matches[1];
                    }
                }
            }

            $sContent .= "\n";
        }
    }

    // Collapse every whitespace run (line breaks included) to one space.
    $sContent = $this->normalizeToUtf8($sContent);
    $textContent = trim(preg_replace('/(\S)\s+/', '$1 ', $sContent));

    // Abstract: from the label up to the keywords label (or the end of the text).
    $abstract = '';
    if (preg_match('/Abstract\s*([\s\S]*?)(?=Keywords|$)/i', $textContent, $abstractMatches)) {
        $abstract = trim($abstractMatches[1]);
        $withoutColon = preg_replace('/^[\s:\x{FF1A}]+/u', '', $abstract);
        if ($withoutColon !== null) {
            $abstract = $withoutColon;
        }
    }

    // Keywords: first the marked paragraph, otherwise everything up to "Introduction".
    $keywords = [];
    if (preg_match('/Keywords\s*(.*?)\s*Keywords-End-Flag/is', $textContent, $keywordMatches)) {
        $keywords = $this->splitKeywords($keywordMatches[1]);
    }
    if (empty($keywords)
        && preg_match('/Keywords\s*([\s\S]*?)(?=Introduction|$)/i', $textContent, $keywordMatches)
    ) {
        $keywords = $this->splitKeywords($keywordMatches[1]);
    }

    return [
        'status' => 1,
        'msg' => '提取成功',
        'data' => [
            'abstrart' => $abstract,
            'abstract' => $abstract,
            'keywords' => $keywords,
            'fund' => $sFundContent,
        ],
    ];
}

/**
 * List the funding phrases that occur in $content.
 *
 * @param string $content Text to scan (case-insensitive).
 * @return array Matching phrases in their canonical spelling, without duplicates, in
 *               order of first occurrence.
 */
private function getMatchedFundPhrases($content = '')
{
    if (empty($content)) {
        return [];
    }

    $fundPhrases = [
        'Supported by',
        'Funded by',
        'Sponsored by',
        'Supported in part by',
        'Funding was provided by',
        'Funded in part by',
    ];

    // One alternation of all phrases, quoted for use with the '#' delimiter.
    $quoted = array_map(static function ($phrase) {
        return preg_quote($phrase, '#');
    }, $fundPhrases);
    if (!preg_match_all('#(' . implode('|', $quoted) . ')#i', $content, $matches)) {
        return [];
    }

    // Map each hit back to the canonical spelling; array keys provide the de-duplication.
    $canonical = [];
    foreach ($fundPhrases as $phrase) {
        $canonical[strtolower($phrase)] = $phrase;
    }
    $matched = [];
    foreach ($matches[1] as $hit) {
        $key = strtolower($hit);
        if (isset($canonical[$key])) {
            $matched[$canonical[$key]] = true;
        }
    }

    return array_keys($matched);
}

/**
 * Logging hook; currently a no-op.
 *
 * @param string $msg Message to log.
 */
private function log($msg)
{
    // echo date('[Y-m-d H:i:s] ') . $msg . "\n";
}
} // end of class