diff --git a/application/common/ArticleParserService.php b/application/common/ArticleParserService.php index 9b93a37..526ac52 100644 --- a/application/common/ArticleParserService.php +++ b/application/common/ArticleParserService.php @@ -225,6 +225,10 @@ class ArticleParserService $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam); //keywords 和 摘要 $aContent = $oDealFile->extractFromWord(); + if(!mb_check_encoding($sTitle, 'UTF-8')){ + $sTitle = mb_convert_encoding($sTitle, 'UTF-8', 'GBK'); + } + $aParam['title'] = $oDealFile->fullDecode($aParam['title']); $aParam += empty($aContent['data']) ? [] : $aContent['data']; return json_encode(['status' => 1,'msg' => 'success','data' => $aParam]); } @@ -240,190 +244,25 @@ class ArticleParserService foreach ($section->getElements() as $element) { $text = $this->getTextFromElement($element); $length = mb_strlen(trim($text)); - if ($length > $maxLength && $length > 10) { // 标题通常较长 + if ($length > $maxLength && $length > 3) { // 标题通常较长 $title = trim($text); $maxLength = $length; break 2; // 取第一个最长段落作为标题 } } } - if(!empty($title) && !mb_check_encoding($title, 'UTF-8')){ - $title = mb_convert_encoding($title, 'UTF-8', 'GBK'); - } return $title; } - // 提取作者 - // private function getAuthors($aParam = []) { - // $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title']; - // $sAuthorContent = $this->getNextParagraphAfterText($title); - // if (empty($sAuthorContent)) { - // return ['author' => [], 'report' => []]; - // } - - // //编码修复 - // $possibleEncodings = [ - // 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', - // 'Latin-1', 'ISO-8859-1', 'CP1252' - // ]; - // $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings)); - // $sAuthorContent = $encodedContent ?: $sAuthorContent; - - // //清理不可见字符 - // $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent); - - // //修复特殊符号乱码 - // $symbolMap = [ - // '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†', - // ':' => ':', ',' => ',', '—' => '-', - // '啊' => '' // 针对性移除异常字符“啊”(若为固定乱码) - // ]; - // $sAuthorContent = strtr($sAuthorContent, $symbolMap); - - // //格式标准化 - // $sAuthorContent = str_replace([',', ';', ';', '、'], ',', $sAuthorContent); // 统一分隔符 - // $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号 - // $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格 - // $sAuthorContent = trim($sAuthorContent); - - // // 处理作者 - // $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确 - // $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格 - // $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#" - // $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分) - // //标记上标内的逗号+空格(多编号) - // $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content); - // // 原有步骤2:正则匹配(扩展上标符号支持,保持原有逻辑) - // $pattern = '/ - // ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格) - // \s* # 姓名与上标间空格 - // ( # 上标组(扩展符号支持) - // \d+ # 起始数字 - // (?:[†#*,]|\d+)* # 允许:†#*符号、逗号、+数字(兼容1,†、1,*等) - // ) - // \s*,? # 作者间逗号(可选) - // (?=\s|$) # 确保后面是空格或结尾 - // /ux'; - - // preg_match_all($pattern, $tempStr, $matches); - // $authorList = []; - // if(!empty($matches[1])){ - // foreach ($matches[1] as $i => $name) { - // $name = trim($name); - // $superscript = trim($matches[2][$i]); - // $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号 - // $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号 - // // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样) - // $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript); - // if (!empty($name)) { - // $authorList[] = [ - // 'name' => $name, - // 'superscript' => $superscript - // ]; - // } - // } - // }else { - // // 按“两个或多个连续空格”拆分(姓名之间的分隔) - // $authorList = array_filter( - // array_map('trim', - // preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent) - // ) - // ); - // } - - - // // //处理作者 - // // $authorList = []; - // // // 新正则:匹配“姓名+上标”整体,允许上标含逗号(如1,†) - // // // 逻辑:姓名以字母/中文开头,上标以数字开头、以符号/数字结尾 - // // // if (preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*([\d,†#*]+)/u', $sAuthorContent, $matches)) { - // // if(preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*(\d[\d,†#\s*]*)/u', $sAuthorContent, $matches)){ - // // for ($i = 0; $i < count($matches[1]); $i++) { - // // $authorList[] = trim($matches[1][$i] . $matches[2][$i]); - // // } - // // } else { - // // // 按“两个或多个连续空格”拆分(姓名之间的分隔) - // // $authorList = array_filter( - // // array_map('trim', - // // preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent) - // // ) - // // ); - // // } - // $aAuthorData = []; - // $aReport = []; - // $namePattern = '/ - // (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符) - // [\x{4e00}-\x{9fa5}]+| # 中文姓名 - // [\x{1800}-\x{18AF}]+| # 蒙古文姓名 - // [A-Z]\.) # 单字母缩写(如 J.) - // /ux'; - // var_dump($authorList);exit; - // foreach ($authorList as $authorStr) { - // if (empty($authorStr)) continue; - // var_dump($authorList);exit; - // //分离姓名与上标(支持上标含逗号,如1,†) - // $superscript = ''; - // // 新正则:匹配以数字开头、含逗号/符号的完整上标(如1,†、2*#) - // $authorStr = trim(trim($authorStr,','),' '); - // // if (preg_match('/([\d,†#*]+)$/u', $authorStr, $supMatch)) { - // // if(preg_match('/\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)){ - // // if (preg_match('/.*?\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)) { - // // if (preg_match('/.*?\s*([\d,\x{2020}#* ]+?)\s*$/u', $authorStr, $supMatch)) { - // // if (preg_match('/^(.+?)\D*?(\d[\d,#*†,\s]*)$/u', $authorStr, $supMatch)) { - // // $superscript = $supMatch[1]; - // // // 移除上标,保留纯姓名(避免残留符号) - // // $nameStr = trim(preg_replace('/' . preg_quote($superscript, '/') . '$/', '', $authorStr)); - // // } else { - // // $nameStr = $authorStr; - // // } - // $pattern = '/^(.+?)\s*(\d[\d,#*†\s]*?)\s*$/u'; - // if (preg_match($pattern, $authorStr, $supMatch)) { - // $nameStr = empty($supMatch[1]) ? '' : trim($supMatch[1]); // 姓名部分:"Liguo Zhang" - // $superscript = empty($supMatch[2]) ? $nameStr : $nameStr.trim($supMatch[2]); // 上标部分:"1 - // // echo "姓名: $nameStr, 上标: $superscript\n"; - // } else { - // $nameStr = $authorStr; - // } - // //验证姓名合法性(过滤无效内容) - // if (!preg_match($namePattern, $nameStr)) { - // continue; - // } - // //解析上标信息(正确识别1,†中的机构编号和符号) - // $companyId = ''; - // $isSuper = 0; - // $isReport = 0; - // if (!empty($superscript)) { - // // 提取机构编号(忽略上标中的逗号,如1,† → 提取1) - // if (preg_match('/(\d+)/', $superscript, $numMatch)) { - // $companyId = $numMatch[1]; - // } - // // 识别特殊符号(#为超级作者,*†为通讯作者) - // $isSuper = strpos($superscript, '#') !== false ? 1 : 0; - // $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0; - // } - // if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) { - // $nameStr = trim($match[1]); - // } - // $aAuthorData[] = [ - // 'name' => $nameStr, - // 'company_id' => $companyId, - // 'is_super' => $isSuper, - // 'is_report' => $isReport - // ]; - // if ($isReport) { - // $aReport[] = $nameStr; - // } - // } - // var_dump($aAuthorData);exit; - // return ['author' => $aAuthorData,'report' => array_unique($aReport)]; - // } // 提取作者 private function parseAuthorsWithoutRegex($str = '') { if (empty($str)) { return []; } - // 清理乱码和特殊字符(扩展全角数字处理) - $str = mb_convert_encoding($str, 'UTF-8', 'auto'); + if(!mb_check_encoding($str, 'UTF-8')){ + $str = mb_convert_encoding($str, 'UTF-8', 'GBK'); + } + $str = $this->fullDecode($str); $str = str_replace(["\xC2\xA0", 'ï¼', '�', ',', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], [' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], $str); $str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str)); @@ -584,15 +423,10 @@ class ArticleParserService if (empty($sAuthorContent)) { return ['author' => [], 'report' => []]; } - - //编码修复 - $possibleEncodings = [ - 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', - 'Latin-1', 'ISO-8859-1', 'CP1252' - ]; - $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings)); - $sAuthorContent = $encodedContent ?: $sAuthorContent; - + if(!mb_check_encoding($sAuthorContent, 'UTF-8')){ + $sAuthorContent = mb_convert_encoding($sAuthorContent, 'UTF-8', 'GBK'); + } + $sAuthorContent = $this->fullDecode($sAuthorContent); //清理不可见字符 $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent); @@ -614,14 +448,10 @@ class ArticleParserService return ['author' => [],'report' => []]; } $aReport = $aAuthorData = []; - foreach ($aAuthor as $key => $value) { if(empty($value['name']) && empty($value['superscript'])){ continue; } - if(!mb_check_encoding($value['name'], 'UTF-8')){ - $value['name'] = mb_convert_encoding($value['name'], 'UTF-8', 'GBK'); - } if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){ $aReport[] = $value['name']; } @@ -629,175 +459,6 @@ class ArticleParserService } return ['author' => $aAuthorData,'report' => array_unique($aReport)]; } -// private function getAuthors($aParam = []) { -// $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title']; -// $sAuthorContent = $this->getNextParagraphAfterText($title); -// if (empty($sAuthorContent)) { -// return ['author' => [], 'report' => []]; -// } - -// //编码修复 -// $possibleEncodings = [ -// 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', -// 'Latin-1', 'ISO-8859-1', 'CP1252' -// ]; -// $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings)); -// $sAuthorContent = $encodedContent ?: $sAuthorContent; - -// //清理不可见字符 -// $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent); - -// //修复特殊符号乱码 -// $symbolMap = [ -// '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†', -// ':' => ':', ',' => ',', '—' => '-', -// '啊' => '' // 针对性移除异常字符“啊”(若为固定乱码) -// ]; -// $sAuthorContent = strtr($sAuthorContent, $symbolMap); - -// //格式标准化 -// $sAuthorContent = str_replace([',', ';', ';', '、'], ',', $sAuthorContent); // 统一分隔符 -// $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号 -// $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格 -// $sAuthorContent = trim($sAuthorContent); -// var_dump($this->parseAuthorsWithoutRegex($sAuthorContent));exit; -// // 关键预处理:兼容"and"分隔符、清理乱码、统一空格 -// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); -// $content = str_replace(["\xC2\xA0", 'ï¼', '�', ','], ' ', $content); // 清理乱码和全角符号 -// $content = preg_replace('/\band\b/i', ',', $content); // 将 "and" 转为逗号(统一分隔符) -// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并数字与符号间的空格(如"1 *"→"1*") -// $content = trim(preg_replace('/\s+/', ' ', $content)); // 合并连续空格 - -// // 标记上标内的逗号(多编号处理) -// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content); - -// // 核心正则(保持原有结构,扩展符号支持) -// $pattern = '/ -// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格、连字符) -// \s* # 姓名与上标间的空格(允许0或多个) -// ( # 上标组(扩展兼容所有符号) -// \d+ # 起始数字(至少1个数字) -// (?:[†#*,]|\d+)* # 允许:符号(†#*)、逗号、+数字(多编号) -// ) -// \s*,? # 作者间的逗号(可选,允许逗号前有空格) -// (?=\s|$) # 确保后面是空格或字符串结尾(避免跨作者匹配) -// /ux'; - -// preg_match_all($pattern, $tempStr, $matches); - -// // 解析结果并格式化 -// $authorList = []; -// if (!empty($matches[1])) { -// foreach ($matches[1] as $i => $name) { -// $name = trim($name); -// $superscript = trim($matches[2][$i]); -// $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号 -// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾多余逗号 -// if (!empty($name)) { -// $authorList[] = [ -// 'name' => $name, -// 'superscript' => $superscript -// ]; -// } -// } -// } - -// // 输出结果 -// echo "
";
-// print_r($authorList);
-// echo "
"; -// exit; - -// // 处理作者 -// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确 -// $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格 -// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#" -// $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分) - -// //标记上标内的逗号+空格(多编号) -// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content); -// // 原有步骤2:正则匹配(扩展上标符号支持,保持原有逻辑) -// $pattern = '/ -// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格) -// \s* # 姓名与上标间空格 -// ( # 上标组(扩展符号支持) -// \d+ # 起始数字 -// (?:[†#*,]|\d+)* # 允许:†#*符号、逗号、+数字(兼容1,†、1,*等) -// ) -// \s*,? # 作者间逗号(可选) -// (?=\s|$) # 确保后面是空格或结尾 -// /ux'; - -// preg_match_all($pattern, $tempStr, $matches); -// var_dump($matches);exit; -// $authorList = []; -// if(!empty($matches[1])){ -// foreach ($matches[1] as $i => $name) { -// $name = trim($name); -// $superscript = trim($matches[2][$i]); -// $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号 -// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号 -// // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样) -// $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript); -// if (!empty($name)) { -// $authorList[] = [ -// 'name' => $name, -// 'superscript' => $superscript -// ]; -// } -// } -// }else { -// // 按“两个或多个连续空格”拆分(姓名之间的分隔) -// $authorList = array_filter( -// array_map('trim', -// preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent) -// ) -// ); -// } - - -// // //处理作者 -// $aAuthorData = []; -// $aReport = []; -// $namePattern = '/ -// (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符) -// [\x{4e00}-\x{9fa5}]+| # 中文姓名 -// [\x{1800}-\x{18AF}]+| # 蒙古文姓名 -// [A-Z]\.) # 单字母缩写(如 J.) -// /ux'; - -// foreach ($authorList as $authorStr){ -// if (empty($authorStr)) continue; - -// //获取下标 -// $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript']; -// $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name']; - -// $companyId = []; -// $isSuper = 0; -// $isReport = 0; -// if (!empty($superscript)) { -// // 提取机构编号(忽略上标中的逗号,如1,† → 提取1) -// preg_match_all('/\d+/', $superscript, $numMatch); -// // 识别特殊符号(#为超级作者,*†为通讯作者) -// $isSuper = strpos($superscript, '#') !== false ? 1 : 0; -// $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0; -// } -// if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) { -// $nameStr = trim($match[1]); -// } -// $aAuthorData[] = [ -// 'name' => $nameStr, -// 'company_id' => empty($numMatch[0]) ? [] : $numMatch[0], -// 'is_super' => $isSuper, -// 'is_report' => $isReport -// ]; -// if ($isReport) { -// $aReport[] = $nameStr; -// } -// } -// return ['author' => $aAuthorData,'report' => array_unique($aReport)]; -// } // 获取机构 private function getCompany($aParam = []){ @@ -815,16 +476,39 @@ class ArticleParserService $currentNumber = null; // 当前序号 foreach ($allLines as $line) { $line = trim($line); - if (empty($line)) continue; - - // 判断是否是新条目的开头:行首为数字(后续可接任意字符或直接接内容) + if (empty($line)) { + continue; + } + if(!mb_check_encoding($line, 'UTF-8')){ + $line = mb_convert_encoding($line, 'UTF-8', 'GBK'); + } + $line = $this->fullDecode($line); $number = ''; $i = 0; $lineLen = strlen($line); // 提取行首的连续数字(作为序号) - while ($i < $lineLen && ctype_digit($line[$i])) { - $number .= $line[$i]; - $i++; + $hasFirstChar = false; + while ($i < $lineLen) { + $currentChar = $line[$i]; + // 首字符处理:允许 26个字母(大小写)或数字 + if (!$hasFirstChar) { + if (ctype_digit($currentChar) || ctype_alpha($currentChar)) { + $number .= $currentChar; + $hasFirstChar = true; + $i++; + } else { + // 首字符不符合(非字母/数字),终止循环 + break; + } + } else { + // 后续字符必须是数字(保持原逻辑) + if (ctype_digit($currentChar)) { + $number .= $currentChar; + $i++; + } else { + break; + } + } } // 若行首有数字,则视为新条目 @@ -840,31 +524,33 @@ class ArticleParserService continue; } - // 非新条目,合并到当前序号的内容中 - if ($currentNumber !== null) { - $grouped[$currentNumber] .= ' ' . $line; - } + // // 非新条目,合并到当前序号的内容中 + // if ($currentNumber !== null) { + // $grouped[$currentNumber] .= ' ' . $line; + // } } - //清理结果 - $possibleEncodings = [ - 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', - 'Latin-1', 'ISO-8859-1', 'CP1252' - ]; $aCompany = []; foreach ($grouped as $number => $institution) { - $encodedContent = @mb_convert_encoding($institution, 'UTF-8', implode(',', $possibleEncodings)); - $sCompany = $encodedContent ?: $sCompany; + $institution = $this->fullDecode($institution); + // 原有基础清理逻辑不变 $institution = preg_replace('/\s+/', ' ', $institution); // 合并多余空格 - $institution = rtrim($institution, '.'); - $institution = preg_replace('/^\d+\s+/', '', $institution); + $institution = rtrim($institution, '.'); // 去除末尾句号 + $institution = preg_replace('/^\d+\s+/', '', $institution); // 去除开头数字 $institution = trim($institution); // 清理首尾空格 - preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);; - $institution = trim($institutionmatches[1] ?? $institution); - if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) { - $institution = trim($matches[1]); // trim() 去除内容前后多余空格 + + // 增强地址提取:匹配"机构名, 城市 邮编, 国家"格式(兼容更多变体) + // 允许地址中包含多个逗号(如子机构、街道信息),最终以"城市 邮编, 国家"结尾 + // preg_match('/(.*?, [A-Za-z\s]+ \d+, [A-Za-z\s]+)/', $institution, $institutionmatches); + // $institution = trim($institutionmatches[1] ?? $institution); + // 强化冗余信息过滤:去除"*"及之后的内容(包括通讯作者、邮箱等) + // 新增对"#"、"†"等标记的过滤,兼容更多期刊格式 + if (preg_match('/^(.*?)(?=\s*[\*#†]|(?i)\s*Email)/', $institution, $matches)) { + $institution = trim($matches[1]); } - if(!empty($institution) && !mb_check_encoding($institution, 'UTF-8')){ + + // 编码校验不变 + if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) { $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK'); } $aCompany[$number] = $institution; @@ -891,13 +577,10 @@ class ArticleParserService // 获取机构后的完整内容 $corrText = $this->getContentAfterText($sCompany); - //编码修复 - $possibleEncodings = [ - 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', - 'Latin-1', 'ISO-8859-1', 'CP1252' - ]; - $encodedContent = @mb_convert_encoding($corrText, 'UTF-8', implode(',', $possibleEncodings)); - $corrText = $encodedContent ?: $corrText; + if(!mb_check_encoding($corrText, 'UTF-8')){ + $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK'); + } + $corrText = $this->fullDecode($corrText); // // 调试 // file_put_contents(ROOT_PATH . 'runtime/corr_text_raw.log', $corrText); @@ -927,7 +610,8 @@ class ArticleParserService ]; } if(empty($aCorresponding)){ - $pattern = '/Corresponding Authors: (.*?)(?=$|;)/s'; + $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s'; + $corrText = trim($corrText,'*'); preg_match($pattern, $corrText, $match); if (!empty($match[1])) { $corrContent = $match[1]; @@ -942,6 +626,16 @@ class ArticleParserService ]; } } + if(empty($authors[1])){ + $authorPattern = '/([A-Za-z0-9\s]+?),\s*([\w@\.\-]+)(?=\.?)/'; + preg_match_all($authorPattern, $corrContent, $authors); + for ($i = 0; $i < count($authors[1]); $i++) { + $aCorresponding[] = [ + 'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]), + 'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i]) + ]; + } + } } } return $aCorresponding; @@ -1122,24 +816,88 @@ class ArticleParserService * 从 Word 文档提取摘要和关键词 * @return array 提取结果 */ + function extractContentIntervals($str, $markers = []) { + // 1. 初始化标记(支持自定义,默认值兼容原逻辑) + $defaultMarkers = [ + 'abstract' => 'abstract', + 'keywords' => 'keywords', + 'end_span' => '===========end-span' + ]; + $markers = array_merge($defaultMarkers, $markers); + extract($markers); // 解析为变量 $abstract, $keywords, $end_span + + // 2. 初始化结果(包含元信息) + $result = [ + 'abstract_to_keywords' => '', + 'keywords_to_end' => '', + 'positions' => [ // 标记位置信息(-1 表示未找到) + 'abstract' => -1, + 'keywords' => -1, + 'end_span' => -1 + ], + 'is_valid' => false, // 整体区间是否有效 + 'error' => '' // 错误信息(如标记顺序异常) + ]; + + // 3. 定位 Abstract(不区分大小写) + $absPos = stripos($str, $abstract); + if ($absPos === false) { + $result['error'] = "未找到标记: {$abstract}"; + return $result; + } + $result['positions']['abstract'] = $absPos; + $absEndPos = $absPos + strlen($abstract); + + // 4. 定位 Keywords(需在 Abstract 之后,不区分大小写) + $keyPos = stripos($str, $keywords, $absEndPos); + if ($keyPos === false) { + $result['error'] = "未找到 {$keywords} 或在 {$abstract} 之前"; + return $result; + } + $result['positions']['keywords'] = $keyPos; + $keyEndPos = $keyPos + strlen($keywords); + + // 5. 定位 end-span(需在 Keywords 之后,严格匹配) + $endPos = strpos($str, $end_span, $keyEndPos); + if ($endPos === false) { + $result['error'] = "未找到 {$end_span} 或在 {$keywords} 之前"; + return $result; + } + $result['positions']['end_span'] = $endPos; + + // 6. 截取区间内容(清理标记后的紧邻符号) + // 区间1:Abstract 结束 → Keywords 开始(清理标记后的冒号/空格) + $len1 = $keyPos - $absEndPos; + $part1 = substr($str, $absEndPos, $len1); + $part1 = trim($part1); + // 移除 Abstract 后可能的冒号/短横线(如 "Abstract: ..." → 去掉开头的 ":") + $part1 = ltrim($part1, ': -—'); + $result['abstract_to_keywords'] = trim($part1); + + // 区间2:Keywords 结束 → end-span 开始(同理清理) + $len2 = $endPos - $keyEndPos; + $part2 = substr($str, $keyEndPos, $len2); + $part2 = trim($part2); + $part2 = ltrim($part2, ': -—'); + $result['keywords_to_end'] = trim($part2); + + // 7. 标记为有效 + $result['is_valid'] = true; + return $result; + } public function extractFromWord() { $sContent = ''; //文本处理 $sFundContent = ''; + $aContent = []; foreach ($this->sections as $section) { foreach ($section->getElements() as $element) { $textContent = $this->getTextFromElement($element); if(empty($textContent)){ continue; } - //编码修复 - $possibleEncodings = [ - 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', - 'Latin-1', 'ISO-8859-1', 'CP1252' - ]; - $sContent .= @mb_convert_encoding($textContent, 'UTF-8', implode(',', $possibleEncodings)); - if(stripos($textContent, 'Keywords:') !== false){ - $sContent .= "Keywords-End-Flag"; + if(!empty($textContent) && !mb_check_encoding($textContent, 'UTF-8')){ + $textContent = mb_convert_encoding($textContent, 'UTF-8', 'GBK'); } if(empty($sFundContent)){ $aFund = $this->getMatchedFundPhrases($sContent); @@ -1152,69 +910,194 @@ class ArticleParserService } } } - $sContent .= "\n"; + $sContent .= $textContent."===========end-span"; } } - if(!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')){ $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK'); } - // 2. 基础文本清理(合并多余空格,保留有效换行) - $textContent = preg_replace('/(\S)\s+/', '$1 ', $sContent); - $textContent = trim($textContent); - + $result = $this->extractContentIntervals($sContent); // 3. 提取摘要 - $abstract = ''; - $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords|$)/i'; - if (preg_match($abstractPattern, $textContent, $abstractMatches)) { - $abstract = trim($abstractMatches[1]); - $abstract = preg_replace('/\n+/', ' ', $abstract); + $abstract = empty($result['abstract_to_keywords']) ? '' : $result['abstract_to_keywords']; + if(!empty($abstract) && !mb_check_encoding($abstract, 'UTF-8')){ + $abstract = mb_convert_encoding($abstract, 'UTF-8', 'GBK'); } - // 4. 提取关键词(核心:仅保留两种强制匹配逻辑) - $keywords = []; - // $keywordPattern = '/Keywords:\s*([\s\S]*?)(?=\s*\d+\.|[;,]\s*[\r\n]+\s*[\r\n]+|(?i)\bintroduction|abbreviations\b|$)/i'; - $keywordPattern = '/Keywords\s*(.*?)\s*Keywords-End-Flag/s'; - - if (preg_match($keywordPattern, $textContent, $keywordMatches)) { - $keywordStr = trim($keywordMatches[1]); - - // 清理关键词列表格式(去除换行、末尾多余符号) - $keywordStr = preg_replace('/\n+/', ' ', $keywordStr); - $keywordStr = rtrim($keywordStr, ';,. '); // 去除末尾分号、逗号等 - $keywordStr = trim($keywordStr); - - // 分割并过滤有效关键词 - $keywords = preg_split('/[,;]\s*/', $keywordStr); - $keywords = array_filter(array_map('trim', $keywords), function($item) { - return !empty($item) && !ctype_space($item); - }); + $keywords = empty($result['keywords_to_end']) ? '' : $result['keywords_to_end']; + if(!empty($keywords) && !mb_check_encoding($keywords, 'UTF-8')){ + $keywords = mb_convert_encoding($keywords, 'UTF-8', 'GBK'); } - if(empty($keywords)){ - $keywordPattern = '/Keywords\s*([\s\S]*?)(?=Introduction|$)/i'; - if (preg_match($keywordPattern, $textContent, $keywordMatches)) { - $keywordStr = trim($keywordMatches[1]); - // 清理关键词列表格式(去除换行、末尾多余符号) - $keywordStr = preg_replace('/\n+/', ' ', $keywordStr); - $keywordStr = rtrim($keywordStr, ';,. '); // 去除末尾分号、逗号等 - $keywordStr = trim($keywordStr); - - // 分割并过滤有效关键词 - $keywords = preg_split('/[,;]\s*/', $keywordStr); - $keywords = array_filter(array_map('trim', $keywords), function($item) { - return !empty($item) && !ctype_space($item); - }); - } + if(!empty($sFundContent) && !mb_check_encoding($sFundContent, 'UTF-8')){ + $sFundContent = mb_convert_encoding($sFundContent, 'UTF-8', 'GBK'); } + return [ 'status' => 1, 'msg' => '提取成功', 'data' => [ - 'abstrart' => $abstract, - 'keywords' => $keywords, - 'fund' => $sFundContent + 'abstrart' => empty($abstract) ? '' : $this->fullDecode(str_replace('===========end-span', '',$abstract)), + 'keywords' => empty($keywords) ? '' : $this->fullDecode(str_replace('===========end-span', '',$keywords)), + 'fund' => empty($sFundContent) ? '' : $this->fullDecode(str_replace('===========end-span', '',$sFundContent)) ] ]; } + private function fullDecode($str, $maxDepth = 5) { + // 空值/深度为0,直接返回(提前终止,避免无效操作) + if (empty($str) || $maxDepth <= 0) { + return $str; + } + + // 【性能优化1:预编译所有正则表达式】避免每次循环重新解析正则 + // 预编译:≥专属场景正则 + $regOb0 = '/0B\s*\?0/'; + $regDl18 = '/DL\s*\?.18/'; + // 预编译:≥通用场景正则 + $regQMarkNum = '/\?(\d+)/'; + $regQMarkDotNum = '/\?(\.\d+)/'; + // 预编译:≤、≠空格修复正则 + $regNeNum = '/≠\s*(\d+)/'; + $regLeNum = '/≤\s*(\d+)/'; + // 预编译:混合符号乱码正则(中文顿号/英文逗号) + $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/'; + $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/'; + // 预编译:≤、≠专属标识正则 + $regLeMark = '/LE\s*\?(\d+)/'; + $regNeMark = '/NE\s*\?(\d+)/'; + // 预编译:Unicode转义正则(提取到外部,避免闭包重复创建) + $regUnicode = '/\\\\u([0-9a-fA-F]{4})/'; + + // 【性能优化2:预定义常量/映射】避免循环内重复创建数组/字符串 + // HTML实体映射(一次性定义,避免循环内重复赋值) + $htmlEntityMap = [ + '≤' => '≤', '≤' => '≤', '≤' => '≤', + '≥' => '≥', '≥' => '≥', '≥' => '≥', + '≠' => '≠', '≠' => '≠', '≠' => '≠', + ]; + // 不间断空格替换数组 + $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)]; + // Unicode回调函数(预定义,避免循环内重复创建闭包) + $unicodeCallback = function ($m) { + return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0]; + }; + + $original = $str; + $depth = 0; + $hasChange = false; // 标记是否有变化,提前终止循环 + + // 循环解码:仅在有变化且未达最大深度时执行 + do { + $depth++; + $hasChange = false; + $prevStr = $str; // 保存当前状态,用于判断变化 + + // 1. 解码Unicode转义(\uXXXX格式) + $str = $this->decodeUnicode($str); + + // 2. 解码HTML实体(先替换专属实体,再执行通用解码) + $str = strtr($str, $htmlEntityMap); // 高性能替换(strtr比str_replace快) + $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8'); + + // 3. 再次处理遗漏的Unicode转义(使用预编译正则+预定义回调) + $str = preg_replace_callback($regUnicode, $unicodeCallback, $str); + + // 4. 替换不间断空格为普通空格(strtr比str_replace更高效) + $str = str_replace($nbspReplace, ' ', $str); + + // 5. 核心替换逻辑(优化执行顺序,避免覆盖) + // 5.1 原有≥专属场景(保留) + $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1); + $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2); + // 5.2 ≤、≠空格修复(保留) + $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3); + $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4); + // 5.3 原有≥通用场景(保留) + $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5); + $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6); + // 5.4 混合符号乱码还原(保留) + $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7); + $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8); + // 5.5 ≤、≠专属标识还原(保留) + $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9); + $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10); + + // 5.6 修复前缀"d with "乱码(保留) + $str = str_replace('d with ', 'd with ', $str, $count11); + + // 【性能优化3:统计所有替换次数,判断是否有变化】 + $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 + + $count7 + $count8 + $count9 + $count10 + $count11; + if ($totalCount > 0 || $str !== $prevStr) { + $hasChange = true; + $original = $str; + } + + // 【性能优化4:提前终止】单次循环无变化,直接退出 + if (!$hasChange) { + break; + } + + } while ($depth < $maxDepth); // 改用do-while,减少循环判断次数 + + // 最终清理:仅执行一次trim + return trim($str, ':'); + } + // private function fullDecode($str, $maxDepth = 5) { + // if (empty($str) || $maxDepth <= 0) { + // return $str; + // } + + // $original = $str; + // $depth = 0; + + // // 循环解码,直到无变化或达到最大次数 + // while (true) { + // $depth++; + // if ($depth > $maxDepth) { + // break; // 防止过度解码导致死循环 + // } + + // // 1. 解码 Unicode 转义(\uXXXX 格式) + // $str = $this->decodeUnicode($str); + + // // 2. 解码 HTML 实体(&、'、< 等) + // $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8'); + + // $str = preg_replace_callback('/\\\\u([0-9a-fA-F]{4})/', function ($m) { + // return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0]; + // }, $str); + // $str = str_replace([chr(0xC2).chr(0xA0), chr(0xA0)], ' ', $str); + + // // 2. 核心:强制匹配所有可能的乱码格式,还原≥ + // // 匹配:0B?0、0B ?0、0B ?0(空格/制表符)→ 0B≥30 + // $str = preg_replace('/0B\s*\?0/', '0B≥30', $str); + // // 匹配:DL?.18、DL ?.18、DL ?.18 → DL≥0.18 + // $str = preg_replace('/DL\s*\?.18/', 'DL≥0.18', $str); + // // 通用匹配:数字前的?(如?30、?0.18)→ ≥30、≥0.18(防止其他变体) + // $str = preg_replace('/\?(\d+)/', '≥$1', $str); + // $str = preg_replace('/\?(\.\d+)/', '≥0$1', $str); + + // // 3. 修复前缀的"d with "可能的乱码(若有) + // $str = str_replace('d with ', 'd with ', $str); // 若前缀也乱码,可同步替换 + + // // 若解码后无变化,退出循环 + // if ($str === $original) { + // break; + // } + + // $original = $str; + // } + + // return trim($str,':'); + // } + private function decodeUnicode($str) { + return preg_replace_callback( + '/\\\\u([0-9a-fA-F]{4})/', + function ($matches) { + // 将十六进制 Unicode 码转为 UTF-8 字符 + return mb_convert_encoding(pack('H*', $matches[1]), 'UTF-8', 'UCS-2BE'); + }, + $str + ); + } private function getMatchedFundPhrases($content = '') { if (empty($content)) { return []; @@ -1223,7 +1106,7 @@ class ArticleParserService // 基金支持词组列表 $fundPhrases = [ 'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by', - 'Funding was provided by', 'Funded in part by' + 'Funding was provided by', 'Funded in part by','FUNDING:' ]; // 1. 转义词组中的特殊字符,使用 # 作为分隔符