// NOTE(review): the visible text starts part-way through the constructor, so the
// opening PHP tag, any namespace / `use` lines, the class header and the creation
// of the reader are NOT shown. They are reconstructed below with fully-qualified
// names — confirm them against the real file before merging.

/**
 * Extracts manuscript metadata (title, authors, affiliations, corresponding
 * authors, abstract, keywords, funding) from a Word document read with PHPWord.
 *
 * Entry point: {@see ArticleParserService::uploadAndParse()}.
 */
class ArticleParserService
{
    /** @var object|null Loaded PHPWord document, or null when loading failed. */
    private $phpWord;

    /** @var array Sections of the loaded document (empty when loading failed). */
    private $sections = [];

    /** @var string Message of the exception raised while loading, '' on success. */
    private $loadError = '';

    /**
     * Loads the document once and caches its sections.
     *
     * A constructor cannot return a response, so a load failure is recorded in
     * $loadError instead; the parser then behaves like an empty document.
     *
     * @param string $filePath Path of the Word file to read.
     */
    public function __construct($filePath)
    {
        try {
            // NOTE(review): reader creation was not visible in the source — presumably Word2007; confirm.
            $reader = \PhpOffice\PhpWord\IOFactory::createReader('Word2007');
            $reader->setReadDataOnly(false);
            \PhpOffice\PhpWord\Settings::setCompatibility(false);
            \PhpOffice\PhpWord\Settings::setOutputEscapingEnabled(true); // avoid XML escaping conflicts

            $this->phpWord = $reader->load($filePath);
            $this->sections = $this->phpWord->getSections();
        } catch (\Exception $e) {
            $this->phpWord = null;
            $this->sections = [];
            $this->loadError = $e->getMessage();
        }
    }

    /**
     * @return string The load error message, or '' when the document loaded.
     */
    public function getLoadError()
    {
        return $this->loadError;
    }

    /**
     * Validates the file path, parses the document and returns a JSON string.
     *
     * Status codes: 1 success, 2 empty path, 3 file missing, 4 file unreadable,
     * 5 no title could be extracted (also the outcome when loading failed).
     *
     * @param string $sFileUrl Path of the uploaded file.
     * @return string JSON with keys status, msg and (on success) data.
     */
    public static function uploadAndParse($sFileUrl)
    {
        if (empty($sFileUrl)) {
            return json_encode(['status' => 2, 'msg' => 'Please upload the submission file']);
        }
        if (!file_exists($sFileUrl)) {
            return json_encode(['status' => 3, 'msg' => 'The uploaded file does not exist']);
        }
        if (!is_readable($sFileUrl)) {
            return json_encode(['status' => 4, 'msg' => 'The uploaded file is unreadable']);
        }

        $oDealFile = new self($sFileUrl);

        $sTitle = $oDealFile->getTitle();
        if (empty($sTitle)) {
            return json_encode(['status' => 5, 'msg' => 'Article title retrieval failed']);
        }

        $aParam = ['title' => $sTitle];

        // Authors, plus the names flagged as corresponding authors.
        $aAuthor = $oDealFile->getAuthors($aParam);
        $aParam['author'] = empty($aAuthor['author']) ? [] : $aAuthor['author'];
        $aParam['report'] = empty($aAuthor['report']) ? [] : $aAuthor['report'];

        // Affiliations, then contact details of the corresponding authors.
        $aParam['company'] = $oDealFile->getCompany($aParam);
        $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam);

        // Abstract, keywords and funding are merged in without overwriting existing keys.
        $aContent = $oDealFile->extractFromWord();
        $aParam += empty($aContent['data']) ? [] : $aContent['data'];

        return json_encode(['status' => 1, 'msg' => 'success', 'data' => $aParam]);
    }

    /**
     * Returns the first paragraph longer than 10 characters, taken as the title.
     *
     * @return string Trimmed title, or '' when no paragraph qualifies.
     */
    private function getTitle()
    {
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $candidate = trim($this->getTextFromElement($element));
                if (mb_strlen($candidate) > 10) {
                    return $candidate;
                }
            }
        }
        return '';
    }

    /**
     * Parses the paragraph that follows the title into a list of authors.
     *
     * Each author is expected as "Name<superscript>", where the superscript is
     * one or more affiliation numbers optionally combined with '#' (flagged as
     * is_super) and '*' or '†' (flagged as is_report, i.e. corresponding author).
     *
     * @param array $aParam Optional 'title'; looked up when absent.
     * @return array ['author' => list of [name, company_id[], is_super, is_report],
     *                'report' => list of corresponding author names]
     */
    private function getAuthors($aParam = [])
    {
        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = $this->getNextParagraphAfterText($title);
        if (empty($sAuthorContent)) {
            return ['author' => [], 'report' => []];
        }

        $sAuthorContent = $this->ensureUtf8($sAuthorContent);

        // Strip control and zero-width characters.
        $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);

        // Symbol repair table, kept as in the original implementation.
        $symbolMap = [
            '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†',
            ':' => ':', ',' => ',', '—' => '-',
            '啊' => ''
        ];
        $sAuthorContent = strtr($sAuthorContent, $symbolMap);

        // Unify separators (ASCII, ideographic and full-width variants) to ','.
        $sAuthorContent = str_replace(
            [',', ';', '、', "\xEF\xBC\x8C", "\xEF\xBC\x9B"],
            ',',
            $sAuthorContent
        );
        $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent);
        $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent);
        $sAuthorContent = trim($sAuthorContent);

        $content = str_replace("\xC2\xA0", ' ', $sAuthorContent);               // NBSP -> space
        $content = preg_replace('/(\d+)\s*([*#†])/u', '$1$2', $content);        // "1 *" -> "1*"
        // Tighten "1, 2" to "1,2" so a multi-affiliation superscript stays one
        // token; the numbers must remain separated or "1, 2" would read as "12".
        $tempStr = preg_replace('/(\d+)\s*,\s*(?=\d)/', '$1,', $content);

        // name, optional spaces, superscript (digits mixed with † # * ,),
        // optional separating comma, then whitespace or end of string.
        $pattern = '/([A-Za-z\s\.\-]+?)\s*(\d+(?:[†#*,]|\d+)*)\s*,?(?=\s|$)/u';
        preg_match_all($pattern, $tempStr, $matches);

        $authorList = [];
        if (!empty($matches[1])) {
            foreach ($matches[1] as $i => $name) {
                $name = trim($name);
                if (empty($name)) {
                    continue;
                }
                $authorList[] = [
                    'name' => $name,
                    // The greedy superscript may swallow the author separator.
                    'superscript' => preg_replace('/,$/', '', trim($matches[2][$i]))
                ];
            }
        } else {
            // No "name + number" pairs: split on commas or runs of spaces and
            // let each piece act as both the name and the superscript source.
            $pieces = array_filter(
                array_map('trim', preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent))
            );
            foreach ($pieces as $piece) {
                $authorList[] = ['name' => $piece, 'superscript' => $piece];
            }
        }

        $aAuthorData = [];
        $aReport = [];
        foreach ($authorList as $author) {
            $superscript = $author['superscript'];
            $nameStr = $author['name'];

            // Every number in the superscript is an affiliation id.
            $numMatch = [];
            preg_match_all('/\d+/', $superscript, $numMatch);
            $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
            $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;

            // Keep only the leading Latin-letter part of the name.
            if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
                $nameStr = trim($match[1]);
            }

            $aAuthorData[] = [
                'name' => $nameStr,
                'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
                'is_super' => $isSuper,
                'is_report' => $isReport
            ];
            if ($isReport) {
                $aReport[] = $nameStr;
            }
        }

        // array_values() keeps 'report' a JSON list after de-duplication.
        return ['author' => $aAuthorData, 'report' => array_values(array_unique($aReport))];
    }

    /**
     * Collects the numbered affiliation lines that follow the author paragraph.
     *
     * @param array $aParam Optional 'title' and 'authors' (raw author paragraph).
     * @return array Map of affiliation number => affiliation text.
     */
    private function getCompany($aParam = [])
    {
        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = empty($aParam['authors'])
            ? $this->getNextParagraphAfterText($title)
            : $aParam['authors'];

        $sCompany = $this->getContentAfterText($sAuthorContent);
        if (empty($sCompany)) {
            return [];
        }
        $sCompany = $this->ensureUtf8($sCompany);
        $sCompany = str_replace(["\r\n", "\r"], "\n", $sCompany);

        $aCompany = [];
        foreach (explode("\n", $sCompany) as $line) {
            // Only lines that start with a number describe an affiliation.
            if (!preg_match('/^(\d+)\s*(.+)$/', trim($line), $match)) {
                continue;
            }
            if (empty($match[1]) || empty($match[2])) {
                continue;
            }
            $aCompany[$match[1]] = ltrim(trim(ltrim($match[2]), '.'), ' ');
        }
        return $aCompany;
    }

    /**
     * Extracts e-mail, postal address and telephone for corresponding authors.
     *
     * The text after the affiliation paragraph is split on '*'; a chunk is kept
     * only when it mentions one of the names in $aParam['report'].
     *
     * @param array $aParam Needs 'report'; 'title' is optional.
     * @return array List of [name, email, postal_address, tel].
     */
    private function getCorrespondingAuthors($aParam = [])
    {
        $aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
        if (empty($aCorrespondingAuthor)) {
            return [];
        }

        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = $this->getNextParagraphAfterText($title);
        $sCompany = $this->getNextParagraphAfterText($sAuthorContent);
        if (empty($sCompany)) {
            // Fallback: rebuild the anchor text from the parsed affiliations.
            $sCompany = implode(' ', array_values($this->getCompany($aParam)));
        }

        $corrText = $this->ensureUtf8($this->getContentAfterText($sCompany));

        // Normalise full-width ':' / '@' and NBSP so the ASCII patterns below match.
        $corrText = str_replace(["\xEF\xBC\x9A", "\xEF\xBC\xA0"], [':', '@'], $corrText);
        $corrText = str_replace("\xC2\xA0", ' ', $corrText);
        $corrText = preg_replace('/\s+/', ' ', $corrText);

        $corrBlocks = array_filter(array_map('trim', preg_split('/\s*\*\s*/', $corrText)));

        $aCorresponding = [];
        foreach ($corrBlocks as $block) {
            $sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
            if (empty($sName)) {
                continue;
            }
            preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
            preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
            preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);

            $aCorresponding[] = [
                'name' => $sName,
                'email' => isset($email[2]) ? trim($email[2]) : '',
                'postal_address' => isset($address[2]) ? trim($address[2]) : '',
                'tel' => isset($tel[2]) ? trim($tel[2]) : ''
            ];
        }
        return $aCorresponding;
    }

    /**
     * Finds which corresponding-author name a text chunk mentions.
     *
     * Comparison is case-insensitive and also tries the name with its
     * space-separated parts in reverse order.
     *
     * @param string $block     Text chunk to search.
     * @param array  $corrNames Candidate names.
     * @return string The matching name as given in $corrNames, or ''.
     */
    private function matchCorrespondingName($block, $corrNames)
    {
        $haystack = strtolower($block);
        foreach ($corrNames as $name) {
            if (strpos($haystack, strtolower($name)) !== false) {
                return $name;
            }
            $parts = explode(' ', $name);
            if (count($parts) < 2) {
                continue;
            }
            $reversed = strtolower(implode(' ', array_reverse($parts)));
            if (strpos($haystack, $reversed) !== false) {
                return $name;
            }
        }
        return '';
    }

    /**
     * Returns the first non-empty paragraph after the one containing $targetText.
     *
     * @param string $targetText Text to locate (case-insensitive substring).
     * @return string Following paragraph text, or '' when not found.
     */
    private function getNextParagraphAfterText($targetText)
    {
        $targetText = (string) $targetText;
        if ($targetText === '') {
            // An empty needle is not a meaningful anchor for stripos().
            return '';
        }

        $found = false;
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $text = $this->getTextFromElement($element);
                if (empty($text)) {
                    continue;
                }
                if ($found) {
                    return $text;
                }
                if (stripos($text, $targetText) !== false) {
                    $found = true;
                }
            }
        }
        return '';
    }

    /**
     * Collects paragraphs after the one that fuzzily matches $targetText.
     *
     * The anchor is the first paragraph whose alphanumeric characters overlap
     * more than 50% of the target's (via similar_text). Collection stops at a
     * paragraph containing a stop keyword or after 200 collected paragraphs.
     *
     * @param string $targetText Anchor text.
     * @return string Collected paragraphs joined with "\n" ('' when no anchor).
     */
    private function getContentAfterText($targetText)
    {
        $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract'];
        $maxLines = 200;

        // Loop-invariant: normalise the anchor once.
        $cleanTarget = preg_replace('/[^a-zA-Z0-9]/', '', strtolower((string) $targetText));
        $targetLength = strlen($cleanTarget);
        if ($targetLength === 0) {
            return '';
        }

        $found = false;
        $content = [];
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                if (count($content) >= $maxLines) {
                    break 2;
                }

                $text = trim($this->getTextFromElement($element));
                if (empty($text)) {
                    continue;
                }

                if (!$found) {
                    $cleanText = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($text));
                    if (similar_text($cleanText, $cleanTarget) / $targetLength > 0.5) {
                        $found = true;
                    }
                    continue; // the anchor paragraph itself is never collected
                }

                foreach ($stopKeywords as $kw) {
                    if (stripos($text, $kw) !== false) {
                        break 3;
                    }
                }

                $content[] = $text;
            }
        }
        return implode("\n", $content);
    }

    /**
     * Flattens a PHPWord element (and its children) into plain text.
     *
     * @param object $element    PHPWord element or container.
     * @param int    $lineNumber Unused; kept for call compatibility.
     * @return string Extracted text; for most element types control characters
     *                are blanked and whitespace runs collapsed.
     */
    private function getTextFromElement($element, $lineNumber = 0)
    {
        $text = '';

        if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
            // The text parts are read from the private 'text' property.
            $reflection = new \ReflectionClass($element);
            $property = $reflection->getProperty('text');
            $property->setAccessible(true);
            foreach ((array) $property->getValue($element) as $part) {
                if (strpos($part, 'HYPERLINK') === false) {
                    $text .= $part;
                    continue;
                }
                // Hyperlink field code: keep only the mailto: address, if any.
                $decoded = html_entity_decode($part);
                if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
                    $text .= $match[1] . ' ';
                }
            }
            return $text;
        }

        if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
            foreach ($element->getRows() as $row) {
                foreach ($row->getCells() as $cell) {
                    $text .= $this->getTextFromElement($cell);
                }
            }
            return $text;
        }

        if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
            foreach ($element->getElements() as $child) {
                $text .= $this->getTextFromElement($child);
            }
            return $text;
        }

        // Generic containers: recurse into every child.
        if (method_exists($element, 'getElements')) {
            foreach ($element->getElements() as $child) {
                $text .= $this->getTextFromElement($child);
            }
        }

        if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
            $text .= $element->getText();
        }

        if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
            $target = $element->getTarget();
            if (strpos($target, 'mailto:') === 0) {
                $text .= str_replace('mailto:', '', $target) . ' ';
            }
            $text .= $element->getText() . ' ';
        }

        if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
            $text .= $this->readElementContent($element) . ' ';
        }
        if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
            $text .= $this->readElementContent($element) . ' ';
        }

        // Blank control characters. The 'u' flag is essential: without it the
        // \x7F-\x9F range hits UTF-8 continuation bytes and corrupts characters
        // such as '†'. On invalid UTF-8 fall back to the ASCII-only range.
        $cleaned = preg_replace('/[\x00-\x1F\x7F-\x{9F}]/u', ' ', $text);
        if ($cleaned === null) {
            $cleaned = preg_replace('/[\x00-\x1F\x7F]/', ' ', $text);
        }

        return preg_replace('/\s+/', ' ', $cleaned);
    }

    /**
     * Reads an element's textual payload via getContent() or, failing that,
     * getText(), so a missing accessor cannot raise a fatal error.
     *
     * @param object $element Element to read.
     * @return string Text payload, or '' when none is available.
     */
    private function readElementContent($element)
    {
        foreach (['getContent', 'getText'] as $accessor) {
            if (!method_exists($element, $accessor)) {
                continue;
            }
            $value = $element->$accessor();
            if (is_string($value)) {
                return $value;
            }
            if (is_object($value)) {
                return $this->getTextFromElement($value);
            }
            return '';
        }
        return '';
    }

    /**
     * Returns $text as UTF-8, converting only when it is not valid UTF-8 already.
     *
     * Re-encoding text that is already UTF-8 would double-encode multi-byte
     * characters, so valid input is returned untouched.
     *
     * @param string $text Input text.
     * @return string UTF-8 text, or the input unchanged when no conversion applies.
     */
    private function ensureUtf8($text)
    {
        if (!is_string($text) || $text === '' || mb_check_encoding($text, 'UTF-8')) {
            return $text;
        }
        $detected = mb_detect_encoding($text, ['CP936', 'Windows-1252', 'ISO-8859-1'], true);
        if ($detected === false) {
            return $text;
        }
        $converted = mb_convert_encoding($text, 'UTF-8', $detected);
        return ($converted === false || $converted === '') ? $text : $converted;
    }

    /**
     * Extracts the abstract, the keywords and the funding statement.
     *
     * The keyword list runs from "Keywords:" to the end of the paragraph that
     * contains it (marked internally with "Keywords-End-Flag").
     * NOTE: the result key 'abstrart' is misspelled but kept, since callers may rely on it.
     *
     * @return array ['status' => 1, 'msg' => ..., 'data' => [abstrart, keywords, fund]]
     */
    public function extractFromWord()
    {
        $sContent = '';
        $sFundContent = '';
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $paragraph = $this->getTextFromElement($element);
                if (empty($paragraph)) {
                    continue;
                }

                $sContent .= $this->ensureUtf8($paragraph);
                if (stripos($paragraph, 'Keywords:') !== false) {
                    $sContent .= "Keywords-End-Flag";
                }

                if (empty($sFundContent)) {
                    $aFund = $this->getMatchedFundPhrases($sContent);
                    if (!empty($aFund[0])) {
                        // Funding text = what follows the phrase, cut at "Peer review".
                        $position = stripos($sContent, $aFund[0]);
                        $sFundContent = substr($sContent, $position);
                        $sFundContent = trim(str_ireplace($aFund[0], '', $sFundContent));
                        if (preg_match('/^(.*?)Peer review/', $sFundContent, $matches)) {
                            $sFundContent = $matches[1];
                        }
                    }
                }
                $sContent .= "\n";
            }
        }

        // Collapse every whitespace run (newlines included) to a single space.
        $textContent = trim(preg_replace('/(\S)\s+/', '$1 ', $sContent));

        $abstract = '';
        if (preg_match('/Abstract\s*([\s\S]*?)(?=Keywords:|$)/i', $textContent, $abstractMatches)) {
            $abstract = preg_replace('/\n+/', ' ', trim($abstractMatches[1]));
        }

        $keywords = [];
        if (preg_match('/Keywords:\s*(.*?)\s*Keywords-End-Flag/s', $textContent, $keywordMatches)) {
            $keywordStr = preg_replace('/\n+/', ' ', trim($keywordMatches[1]));
            $keywordStr = trim(rtrim($keywordStr, ';,. '));

            $keywords = array_filter(
                array_map('trim', preg_split('/[,;]\s*/', $keywordStr)),
                function ($item) {
                    return !empty($item) && !ctype_space($item);
                }
            );
            // Re-index so the list stays a JSON array after empty items are dropped.
            $keywords = array_values($keywords);
        }

        return [
            'status' => 1,
            'msg' => '提取成功',
            'data' => [
                'abstrart' => $abstract,
                'keywords' => $keywords,
                'fund' => $sFundContent
            ]
        ];
    }

    /**
     * Lists the funding phrases found in $content, in order of first appearance.
     *
     * Matching ignores case; results use the canonical spelling from the list.
     *
     * @param string $content Text to scan.
     * @return array Unique matched phrases (empty when none or $content is empty).
     */
    private function getMatchedFundPhrases($content = '')
    {
        if (empty($content)) {
            return [];
        }

        $fundPhrases = [
            'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
            'Funding was provided by', 'Funded in part by'
        ];

        $canonical = [];
        $alternatives = [];
        foreach ($fundPhrases as $phrase) {
            $canonical[strtolower($phrase)] = $phrase;
            $alternatives[] = preg_quote($phrase, '#');
        }

        preg_match_all('#(' . implode('|', $alternatives) . ')#i', $content, $matches);
        if (empty($matches[1])) {
            return [];
        }

        $matched = [];
        foreach ($matches[1] as $hit) {
            $key = strtolower($hit);
            if (isset($canonical[$key])) {
                $matched[] = $canonical[$key];
            }
        }
        return array_values(array_unique($matched));
    }

    /**
     * Debug logging hook; intentionally a no-op.
     *
     * @param string $msg Message to log.
     */
    private function log($msg)
    {
        // echo date('[Y-m-d H:i:s] ') . $msg . "\n";
    }
}