From 1d9971373aa97301746c1f8fd629e8916d73c064 Mon Sep 17 00:00:00 2001 From: chengxl Date: Wed, 5 Nov 2025 13:14:31 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8D=87=E7=BA=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- application/common/ArticleParserService.php | 536 ++++++++++++++++---- 1 file changed, 428 insertions(+), 108 deletions(-) diff --git a/application/common/ArticleParserService.php b/application/common/ArticleParserService.php index 7cd305e..8a4a978 100644 --- a/application/common/ArticleParserService.php +++ b/application/common/ArticleParserService.php @@ -18,7 +18,7 @@ class ArticleParserService public function __construct($filePath = '') { if (!file_exists($filePath)) { - throw new Exception("文档不存在:{$filePath}"); + return json_encode(['status' => 5, 'msg' => '"文档不存在:{$filePath}"']); } try { // 关键配置:关闭“仅读数据”,保留完整节结构 @@ -32,9 +32,8 @@ class ArticleParserService // $this->log("✅ 文档直接加载成功,节数量:{$sectionCount}"); $this->phpWord = $reader->load($filePath); $this->sections = $this->phpWord->getSections(); - } catch (\Exception $e) { - return json(['status' => 'error', 'msg' => $e->getMessage()]); + return json_encode(['status' => 5, 'msg' => $e->getMessage()]); } } @@ -260,6 +259,168 @@ class ArticleParserService // var_dump($aAuthorData);exit; // return ['author' => $aAuthorData,'report' => array_unique($aReport)]; // } + + // 提取作者 + private function parseAuthorsWithoutRegex($str = '') { + if (empty($str)) { + return []; + } + // 清理乱码和特殊字符(扩展全角数字处理) + $str = mb_convert_encoding($str, 'UTF-8', 'auto'); + $str = str_replace(["\xC2\xA0", 'ï¼', '�', ',', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], + [' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], $str); + $str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str)); + + // 合并上标中数字与逗号间的空格(如"2, 3"→"2,3") + $len = mb_strlen($str); + $processed = ''; + for ($i = 0; $i < $len; $i++) { + $char = mb_substr($str, $i, 1); + if ($char === ',' && $i - 1 >= 0 && $i + 2 < $len) { + $prevChar = mb_substr($str, $i - 1, 1); + $next1 = mb_substr($str, $i + 1, 1); + $next2 = mb_substr($str, $i + 2, 1); + // 兼容全角数字转半角后的判断 + if ((ctype_digit($prevChar) || is_numeric($prevChar)) && $next1 === ' ' && (ctype_digit($next2) || is_numeric($next2))) { + $processed .= $char; + $i += 1; + continue; + } + } + $processed .= $char; + } + $str = $processed; + + // 合并数字与符号间的空格(如"1 *"→"1*") + $len = mb_strlen($str); + $processed = ''; + for ($i = 0; $i < $len; $i++) { + $char = mb_substr($str, $i, 1); + if ((ctype_digit($char) || is_numeric($char)) && $i + 2 < $len) { // 支持数字判断 + $next1 = mb_substr($str, $i + 1, 1); + $next2 = mb_substr($str, $i + 2, 1); + if ($next1 === ' ' && in_array($next2, ['#', '*', '†', '‡', '§'])) { // 扩展符号支持 + $processed .= $char; + $i += 2; + $processed .= $next2; + continue; + } + } + $processed .= $char; + } + $str = $processed; + + // 合并连续空格 + $len = mb_strlen($str); + $processed = ''; + $prevSpace = false; + for ($i = 0; $i < $len; $i++) { + $char = mb_substr($str, $i, 1); + if ($char === ' ') { + if (!$prevSpace) { + $processed .= $char; + $prevSpace = true; + } + } else { + $processed .= $char; + $prevSpace = false; + } + } + $str = trim($processed); + + // 作者处理 + $authors = []; + $currentName = ''; + $currentSuperscript = ''; + $inName = true; + $len = mb_strlen($str); + for ($i = 0; $i < $len; $i++) { + $char = mb_substr($str, $i, 1); + + // 处理作者分隔符:逗号+空格 + if ($char === ',' && $i + 1 < $len) { + $nextChar = mb_substr($str, $i + 1, 1); + if ($nextChar === ' ') { + if (!empty($currentName)) { + $currentSuperscript = rtrim($currentSuperscript, ','); + $authors[] = [ + 'name' => trim($currentName), + 'superscript' => trim($currentSuperscript) + ]; + } + $currentName = ''; + $currentSuperscript = ''; + $inName = true; + $i++; + continue; + } + } + + // 支持姓名中的点、连字符、特殊字母(如带重音的字母) + if (ctype_alpha($char) || in_array($char, [' ', '.', '-', 'à', 'á', 'â', 'ã', 'ä', 'ç', 'è', 'é', 'ê', 'ë'])) { + if ($inName) { + $currentName .= $char; + } else { + $currentSuperscript = rtrim($currentSuperscript, ','); + $authors[] = [ + 'name' => trim($currentName), + 'superscript' => trim($currentSuperscript) + ]; + $currentName = $char; + $currentSuperscript = ''; + $inName = true; + } + } + // 解析上标(数字、逗号、#、*、†等) + elseif ((ctype_digit($char) || is_numeric($char)) || in_array($char, ['#', '*', '†', ',', '‡', '§'])) { + $inName = false; + $currentSuperscript .= $char; + } + // 忽略其他字符 + else { + continue; + } + } + + // 处理最后一个作者 + if (!empty($currentName)) { + $currentSuperscript = rtrim($currentSuperscript, ','); + $authors[] = [ + 'name' => trim($currentName), + 'superscript' => trim($currentSuperscript) + ]; + } + + // 提取机构编号为数组、判断通讯作者和第一作者 + foreach ($authors as $index => &$author) { + // 提取机构编号(兼容多字节数字) + $institutionIds = []; + $superscript = $author['superscript']; + $numStr = ''; + for ($i = 0; $i < mb_strlen($superscript); $i++) { + $c = mb_substr($superscript, $i, 1); + if (ctype_digit($c) || is_numeric($c)) { // 支持数字判断 + $numStr .= $c; + } else { + if (!empty($numStr)) { + $institutionIds[] = (int)$numStr; + $numStr = ''; + } + } + } + if (!empty($numStr)) { + $institutionIds[] = (int)$numStr; + } + $institutionIds = array_values(array_unique($institutionIds)); + $author['company_id'] = $institutionIds; + + // 判断第一作者(#标记)和通讯作者(*、†标记) + $author['is_super'] = strpos($superscript, '#') !== false ? 1 : 0; + $author['is_report'] = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0; + } + unset($author); // 释放引用 + return $authors; + } private function getAuthors($aParam = []) { $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title']; $sAuthorContent = $this->getNextParagraphAfterText($title); @@ -291,95 +452,192 @@ class ArticleParserService $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号 $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格 $sAuthorContent = trim($sAuthorContent); - - // 处理作者 - $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确 - $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格 - $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#" - $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分) - //标记上标内的逗号+空格(多编号) - $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content); - // 原有步骤2:正则匹配(扩展上标符号支持,保持原有逻辑) - $pattern = '/ - ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格) - \s* # 姓名与上标间空格 - ( # 上标组(扩展符号支持) - \d+ # 起始数字 - (?:[†#*,]|\d+)* # 允许:†#*符号、逗号、+数字(兼容1,†、1,*等) - ) - \s*,? # 作者间逗号(可选) - (?=\s|$) # 确保后面是空格或结尾 - /ux'; - - preg_match_all($pattern, $tempStr, $matches); - $authorList = []; - if(!empty($matches[1])){ - foreach ($matches[1] as $i => $name) { - $name = trim($name); - $superscript = trim($matches[2][$i]); - $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号 - $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号 - // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样) - $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript); - if (!empty($name)) { - $authorList[] = [ - 'name' => $name, - 'superscript' => $superscript - ]; - } - } - }else { - // 按“两个或多个连续空格”拆分(姓名之间的分隔) - $authorList = array_filter( - array_map('trim', - preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent) - ) - ); + $aAuthor = $this->parseAuthorsWithoutRegex($sAuthorContent); + if(empty($aAuthor)){ + return ['author' => [],'report' => []]; } - + $aReport = $aAuthorData = []; - // //处理作者 - $aAuthorData = []; - $aReport = []; - $namePattern = '/ - (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符) - [\x{4e00}-\x{9fa5}]+| # 中文姓名 - [\x{1800}-\x{18AF}]+| # 蒙古文姓名 - [A-Z]\.) # 单字母缩写(如 J.) - /ux'; - - foreach ($authorList as $authorStr){ - if (empty($authorStr)) continue; - - //获取下标 - $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript']; - $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name']; - - $companyId = []; - $isSuper = 0; - $isReport = 0; - if (!empty($superscript)) { - // 提取机构编号(忽略上标中的逗号,如1,† → 提取1) - preg_match_all('/\d+/', $superscript, $numMatch); - // 识别特殊符号(#为超级作者,*†为通讯作者) - $isSuper = strpos($superscript, '#') !== false ? 1 : 0; - $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0; + foreach ($aAuthor as $key => $value) { + if(empty($value['name']) && empty($value['superscript'])){ + continue; } - if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) { - $nameStr = trim($match[1]); - } - $aAuthorData[] = [ - 'name' => $nameStr, - 'company_id' => empty($numMatch[0]) ? [] : $numMatch[0], - 'is_super' => $isSuper, - 'is_report' => $isReport - ]; - if ($isReport) { - $aReport[] = $nameStr; + if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){ + $aReport[] = $value['name']; } + $aAuthorData[] = $value; } return ['author' => $aAuthorData,'report' => array_unique($aReport)]; } +// private function getAuthors($aParam = []) { +// $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title']; +// $sAuthorContent = $this->getNextParagraphAfterText($title); +// if (empty($sAuthorContent)) { +// return ['author' => [], 'report' => []]; +// } + +// //编码修复 +// $possibleEncodings = [ +// 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', +// 'Latin-1', 'ISO-8859-1', 'CP1252' +// ]; +// $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings)); +// $sAuthorContent = $encodedContent ?: $sAuthorContent; + +// //清理不可见字符 +// $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent); + +// //修复特殊符号乱码 +// $symbolMap = [ +// '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†', +// ':' => ':', ',' => ',', '—' => '-', +// '啊' => '' // 针对性移除异常字符“啊”(若为固定乱码) +// ]; +// $sAuthorContent = strtr($sAuthorContent, $symbolMap); + +// //格式标准化 +// $sAuthorContent = str_replace([',', ';', ';', '、'], ',', $sAuthorContent); // 统一分隔符 +// $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号 +// $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格 +// $sAuthorContent = trim($sAuthorContent); +// var_dump($this->parseAuthorsWithoutRegex($sAuthorContent));exit; +// // 关键预处理:兼容"and"分隔符、清理乱码、统一空格 +// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); +// $content = str_replace(["\xC2\xA0", 'ï¼', '�', ','], ' ', $content); // 清理乱码和全角符号 +// $content = preg_replace('/\band\b/i', ',', $content); // 将 "and" 转为逗号(统一分隔符) +// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并数字与符号间的空格(如"1 *"→"1*") +// $content = trim(preg_replace('/\s+/', ' ', $content)); // 合并连续空格 + +// // 标记上标内的逗号(多编号处理) +// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content); + +// // 核心正则(保持原有结构,扩展符号支持) +// $pattern = '/ +// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格、连字符) +// \s* # 姓名与上标间的空格(允许0或多个) +// ( # 上标组(扩展兼容所有符号) +// \d+ # 起始数字(至少1个数字) +// (?:[†#*,]|\d+)* # 允许:符号(†#*)、逗号、+数字(多编号) +// ) +// \s*,? # 作者间的逗号(可选,允许逗号前有空格) +// (?=\s|$) # 确保后面是空格或字符串结尾(避免跨作者匹配) +// /ux'; + +// preg_match_all($pattern, $tempStr, $matches); + +// // 解析结果并格式化 +// $authorList = []; +// if (!empty($matches[1])) { +// foreach ($matches[1] as $i => $name) { +// $name = trim($name); +// $superscript = trim($matches[2][$i]); +// $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号 +// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾多余逗号 +// if (!empty($name)) { +// $authorList[] = [ +// 'name' => $name, +// 'superscript' => $superscript +// ]; +// } +// } +// } + +// // 输出结果 +// echo "
";
+// print_r($authorList);
+// echo "
"; +// exit; + +// // 处理作者 +// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确 +// $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格 +// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#" +// $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分) + +// //标记上标内的逗号+空格(多编号) +// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content); +// // 原有步骤2:正则匹配(扩展上标符号支持,保持原有逻辑) +// $pattern = '/ +// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格) +// \s* # 姓名与上标间空格 +// ( # 上标组(扩展符号支持) +// \d+ # 起始数字 +// (?:[†#*,]|\d+)* # 允许:†#*符号、逗号、+数字(兼容1,†、1,*等) +// ) +// \s*,? # 作者间逗号(可选) +// (?=\s|$) # 确保后面是空格或结尾 +// /ux'; + +// preg_match_all($pattern, $tempStr, $matches); +// var_dump($matches);exit; +// $authorList = []; +// if(!empty($matches[1])){ +// foreach ($matches[1] as $i => $name) { +// $name = trim($name); +// $superscript = trim($matches[2][$i]); +// $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号 +// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号 +// // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样) +// $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript); +// if (!empty($name)) { +// $authorList[] = [ +// 'name' => $name, +// 'superscript' => $superscript +// ]; +// } +// } +// }else { +// // 按“两个或多个连续空格”拆分(姓名之间的分隔) +// $authorList = array_filter( +// array_map('trim', +// preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent) +// ) +// ); +// } + + +// // //处理作者 +// $aAuthorData = []; +// $aReport = []; +// $namePattern = '/ +// (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符) +// [\x{4e00}-\x{9fa5}]+| # 中文姓名 +// [\x{1800}-\x{18AF}]+| # 蒙古文姓名 +// [A-Z]\.) # 单字母缩写(如 J.) +// /ux'; + +// foreach ($authorList as $authorStr){ +// if (empty($authorStr)) continue; + +// //获取下标 +// $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript']; +// $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name']; + +// $companyId = []; +// $isSuper = 0; +// $isReport = 0; +// if (!empty($superscript)) { +// // 提取机构编号(忽略上标中的逗号,如1,† → 提取1) +// preg_match_all('/\d+/', $superscript, $numMatch); +// // 识别特殊符号(#为超级作者,*†为通讯作者) +// $isSuper = strpos($superscript, '#') !== false ? 1 : 0; +// $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0; +// } +// if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) { +// $nameStr = trim($match[1]); +// } +// $aAuthorData[] = [ +// 'name' => $nameStr, +// 'company_id' => empty($numMatch[0]) ? [] : $numMatch[0], +// 'is_super' => $isSuper, +// 'is_report' => $isReport +// ]; +// if ($isReport) { +// $aReport[] = $nameStr; +// } +// } +// return ['author' => $aAuthorData,'report' => array_unique($aReport)]; +// } // 获取机构 private function getCompany($aParam = []){ @@ -388,32 +646,68 @@ class ArticleParserService //获取标题下的作者 $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors']; //获取作者结构 - $sCompany = $this->getContentAfterText($sAuthorContent); - if(empty($sCompany)){ + $allLines = $this->getContentAfterText($sAuthorContent,1); + if(empty($allLines)){ return []; } - //编码修复 + // 2. 按序号分组,合并同一序号的多行内容 + $grouped = []; + $currentNumber = null; // 当前序号 + foreach ($allLines as $line) { + $line = trim($line); + if (empty($line)) continue; + + // 判断是否是新条目的开头:行首为数字(后续可接任意字符或直接接内容) + $number = ''; + $i = 0; + $lineLen = strlen($line); + // 提取行首的连续数字(作为序号) + while ($i < $lineLen && ctype_digit($line[$i])) { + $number .= $line[$i]; + $i++; + } + + // 若行首有数字,则视为新条目 + if (!empty($number)) { + $currentNumber = $number; + // 提取序号后的内容(跳过数字后的符号/空格,保留核心内容) + // 从数字后的位置开始,跳过可能的符号(./*)或空格 + while ($i < $lineLen && (in_array($line[$i], ['.', '*', ' ']))) { + $i++; + } + $content = trim(substr($line, $i)); // 序号后的内容 + $grouped[$currentNumber] = $content; + continue; + } + + // 非新条目,合并到当前序号的内容中 + if ($currentNumber !== null) { + $grouped[$currentNumber] .= ' ' . $line; + } + } + + //清理结果 $possibleEncodings = [ 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', 'Latin-1', 'ISO-8859-1', 'CP1252' ]; - $encodedContent = @mb_convert_encoding($sCompany, 'UTF-8', implode(',', $possibleEncodings)); - $sCompany = $encodedContent ?: $sCompany; - //按行拆分,保留数字开头的行 - $sCompany = str_replace(["\r\n", "\r"], "\n", $sCompany); - $aCompanyLines = explode("\n", $sCompany); - $aCompanyLines = array_filter(array_map('trim', $aCompanyLines), function($line) { - return preg_match('/^\d+/', $line); // 仅保留数字开头的行 - }); - $aCompany = []; - foreach ($aCompanyLines as $line) { - if (preg_match('/^(\d+)\s*(.+)$/', $line, $match)) { - if(empty($match[1]) || empty($match[2])){ - continue; - } - $aCompany[$match[1]] = ltrim(trim(ltrim($match[2]),'.'),' '); + foreach ($grouped as $number => $institution) { + $encodedContent = @mb_convert_encoding($institution, 'UTF-8', implode(',', $possibleEncodings)); + $sCompany = $encodedContent ?: $sCompany; + $institution = preg_replace('/\s+/', ' ', $institution); // 合并多余空格 + $institution = rtrim($institution, '.'); + $institution = preg_replace('/^\d+\s+/', '', $institution); + $institution = trim($institution); // 清理首尾空格 + preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);; + $institution = trim($institutionmatches[1] ?? $institution); + if(!mb_check_encoding($institution, 'UTF-8')){ + $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK'); } + if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) { + $institution = trim($matches[1]); // trim() 去除内容前后多余空格 + } + $aCompany[$number] = $institution; } return $aCompany; } @@ -451,11 +745,10 @@ class ArticleParserService $corrText = str_replace([':', '@'], [':', '@'], $corrText); $corrText = preg_replace('/\s+/', ' ', $corrText); // 统一空格 $corrText = str_replace(' ', ' ', $corrText); // 去除多余空格 - //按"*"分割通讯作者 $corrBlocks = preg_split('/\s*\*\s*/', $corrText); $corrBlocks = array_filter(array_map('trim', $corrBlocks)); - + $aCorresponding = []; foreach ($corrBlocks as $block) { //匹配通讯作者姓名 @@ -466,7 +759,6 @@ class ArticleParserService preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email); preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address); preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel); - $aCorresponding[] = [ 'name' => $sName, 'email' => isset($email[2]) ? trim($email[2]) : '', @@ -474,6 +766,24 @@ class ArticleParserService 'tel' => isset($tel[2]) ? trim($tel[2]) : '' ]; } + if(empty($aCorresponding)){ + $pattern = '/Corresponding Authors: (.*?)(?=$|;)/s'; + preg_match($pattern, $corrText, $match); + if (!empty($match[1])) { + $corrContent = $match[1]; + // 提取每个作者的名称和邮箱(优化正则,支持更多字符) + $authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/'; + preg_match_all($authorPattern, $corrContent, $authors); + if(!empty($authors[1])){ + for ($i = 0; $i < count($authors[1]); $i++) { + $aCorresponding[] = [ + 'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]), + 'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i]) + ]; + } + } + } + } return $aCorresponding; } @@ -518,7 +828,7 @@ class ArticleParserService } // 获取目标文本后的所有内容 - private function getContentAfterText($targetText){ + private function getContentAfterText($targetText,$return_type = 2){ $found = false; $content = []; $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract']; @@ -559,7 +869,14 @@ class ArticleParserService } if (count($content) >= $maxLines || (isset($shouldStop) && $shouldStop)) break; } - return implode("\n", $content); + if($return_type == 1){ + return $content; + } + $content = implode("\n", $content); + if(!mb_check_encoding($content, 'UTF-8')){ + $content = mb_convert_encoding($content, 'UTF-8', 'GBK'); + } + return $content; } // 统一提取元素文本 @@ -676,6 +993,9 @@ class ArticleParserService $sContent .= "\n"; } } + if(!mb_check_encoding($sContent, 'UTF-8')){ + $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK'); + } // 2. 基础文本清理(合并多余空格,保留有效换行) $textContent = preg_replace('/(\S)\s+/', '$1 ', $sContent); $textContent = trim($textContent);