', ',', $superscript); // 恢复多编号逗号
+// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾多余逗号
+// if (!empty($name)) {
+// $authorList[] = [
+// 'name' => $name,
+// 'superscript' => $superscript
+// ];
+// }
+// }
+// }
+
+// // 输出结果
+// echo "";
+// print_r($authorList);
+// echo "
";
+// exit;
+
+// // 处理作者
+// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
+// $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
+// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
+// $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分)
+
+// //标记上标内的逗号+空格(多编号)
+// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content);
+// // 原有步骤2:正则匹配(扩展上标符号支持,保持原有逻辑)
+// $pattern = '/
+// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格)
+// \s* # 姓名与上标间空格
+// ( # 上标组(扩展符号支持)
+// \d+ # 起始数字
+// (?:[†#*,]|\d+)* # 允许:†#*符号、逗号、+数字(兼容1,†、1,*等)
+// )
+// \s*,? # 作者间逗号(可选)
+// (?=\s|$) # 确保后面是空格或结尾
+// /ux';
+
+// preg_match_all($pattern, $tempStr, $matches);
+// var_dump($matches);exit;
+// $authorList = [];
+// if(!empty($matches[1])){
+// foreach ($matches[1] as $i => $name) {
+// $name = trim($name);
+// $superscript = trim($matches[2][$i]);
+// $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号
+// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
+// // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样)
+// $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
+// if (!empty($name)) {
+// $authorList[] = [
+// 'name' => $name,
+// 'superscript' => $superscript
+// ];
+// }
+// }
+// }else {
+// // 按“两个或多个连续空格”拆分(姓名之间的分隔)
+// $authorList = array_filter(
+// array_map('trim',
+// preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
+// )
+// );
+// }
+
+
+// // //处理作者
+// $aAuthorData = [];
+// $aReport = [];
+// $namePattern = '/
+// (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符)
+// [\x{4e00}-\x{9fa5}]+| # 中文姓名
+// [\x{1800}-\x{18AF}]+| # 蒙古文姓名
+// [A-Z]\.) # 单字母缩写(如 J.)
+// /ux';
+
+// foreach ($authorList as $authorStr){
+// if (empty($authorStr)) continue;
+
+// //获取下标
+// $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript'];
+// $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name'];
+
+// $companyId = [];
+// $isSuper = 0;
+// $isReport = 0;
+// if (!empty($superscript)) {
+// // 提取机构编号(忽略上标中的逗号,如1,† → 提取1)
+// preg_match_all('/\d+/', $superscript, $numMatch);
+// // 识别特殊符号(#为超级作者,*†为通讯作者)
+// $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
+// $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
+// }
+// if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
+// $nameStr = trim($match[1]);
+// }
+// $aAuthorData[] = [
+// 'name' => $nameStr,
+// 'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
+// 'is_super' => $isSuper,
+// 'is_report' => $isReport
+// ];
+// if ($isReport) {
+// $aReport[] = $nameStr;
+// }
+// }
+// return ['author' => $aAuthorData,'report' => array_unique($aReport)];
+// }
// 获取机构
private function getCompany($aParam = []){
@@ -388,32 +806,68 @@ class ArticleParserService
//获取标题下的作者
$sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];
//获取作者结构
- $sCompany = $this->getContentAfterText($sAuthorContent);
- if(empty($sCompany)){
+ $allLines = $this->getContentAfterText($sAuthorContent,1);
+ if(empty($allLines)){
return [];
}
- //编码修复
+ // 2. 按序号分组,合并同一序号的多行内容
+ $grouped = [];
+ $currentNumber = null; // 当前序号
+ foreach ($allLines as $line) {
+ $line = trim($line);
+ if (empty($line)) continue;
+
+ // 判断是否是新条目的开头:行首为数字(后续可接任意字符或直接接内容)
+ $number = '';
+ $i = 0;
+ $lineLen = strlen($line);
+ // 提取行首的连续数字(作为序号)
+ while ($i < $lineLen && ctype_digit($line[$i])) {
+ $number .= $line[$i];
+ $i++;
+ }
+
+ // 若行首有数字,则视为新条目
+ if (!empty($number)) {
+ $currentNumber = $number;
+ // 提取序号后的内容(跳过数字后的符号/空格,保留核心内容)
+ // 从数字后的位置开始,跳过可能的符号(./*)或空格
+ while ($i < $lineLen && (in_array($line[$i], ['.', '*', ' ']))) {
+ $i++;
+ }
+ $content = trim(substr($line, $i)); // 序号后的内容
+ $grouped[$currentNumber] = $content;
+ continue;
+ }
+
+ // 非新条目,合并到当前序号的内容中
+ if ($currentNumber !== null) {
+ $grouped[$currentNumber] .= ' ' . $line;
+ }
+ }
+
+ //清理结果
$possibleEncodings = [
'Windows-1252', 'UTF-8', 'GBK', 'GB2312',
'Latin-1', 'ISO-8859-1', 'CP1252'
];
- $encodedContent = @mb_convert_encoding($sCompany, 'UTF-8', implode(',', $possibleEncodings));
- $sCompany = $encodedContent ?: $sCompany;
- //按行拆分,保留数字开头的行
- $sCompany = str_replace(["\r\n", "\r"], "\n", $sCompany);
- $aCompanyLines = explode("\n", $sCompany);
- $aCompanyLines = array_filter(array_map('trim', $aCompanyLines), function($line) {
- return preg_match('/^\d+/', $line); // 仅保留数字开头的行
- });
-
$aCompany = [];
- foreach ($aCompanyLines as $line) {
- if (preg_match('/^(\d+)\s*(.+)$/', $line, $match)) {
- if(empty($match[1]) || empty($match[2])){
- continue;
- }
- $aCompany[$match[1]] = ltrim(trim(ltrim($match[2]),'.'),' ');
+ // Clean each grouped affiliation string and store it under its leading number.
+ foreach ($grouped as $number => $institution) {
+     // Encoding is fixed up once, after cleanup, by the UTF-8/GBK check further down.
+     $institution = preg_replace('/\s+/', ' ', $institution); // collapse whitespace runs
+     $institution = rtrim($institution, '.');
+     $institution = preg_replace('/^\d+\s+/', '', $institution);
+     $institution = trim($institution); // strip leading/trailing whitespace
+     preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);
+     $institution = trim($institutionmatches[1] ?? $institution);
+     if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) {
+         $institution = trim($matches[1]); // keep only the text before a "*Email" marker
      }
+     if(!empty($institution) && !mb_check_encoding($institution, 'UTF-8')){
+         $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
+     }
+     $aCompany[$number] = $institution;
  }
return $aCompany;
}
@@ -451,11 +905,10 @@ class ArticleParserService
$corrText = str_replace([':', '@'], [':', '@'], $corrText);
$corrText = preg_replace('/\s+/', ' ', $corrText); // 统一空格
$corrText = str_replace(' ', ' ', $corrText); // 去除多余空格
-
//按"*"分割通讯作者
$corrBlocks = preg_split('/\s*\*\s*/', $corrText);
$corrBlocks = array_filter(array_map('trim', $corrBlocks));
-
+
$aCorresponding = [];
foreach ($corrBlocks as $block) {
//匹配通讯作者姓名
@@ -466,7 +919,6 @@ class ArticleParserService
preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
-
$aCorresponding[] = [
'name' => $sName,
'email' => isset($email[2]) ? trim($email[2]) : '',
@@ -474,6 +926,24 @@ class ArticleParserService
'tel' => isset($tel[2]) ? trim($tel[2]) : ''
];
}
+ if(empty($aCorresponding)){
+     // Fallback for text shaped like "Corresponding Authors: <name>, E-mail: <address>",
+     // used only when the "*"-separated blocks above produced no entries.
+     $pattern = '/Corresponding Authors: (.*?)(?=$|;)/s';
+     preg_match($pattern, $corrText, $match);
+     if (!empty($match[1])) {
+         $authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/';
+         preg_match_all($authorPattern, $match[1], $authors, PREG_SET_ORDER);
+         foreach ($authors as $author) {
+             $aCorresponding[] = [
+                 'name' => trim($author[1]),
+                 'email' => trim($author[2]),
+                 'address' => '', // not present in this layout; kept so every record has the same keys
+                 'tel' => ''
+             ];
+         }
+     }
+ }
return $aCorresponding;
}
@@ -518,10 +988,10 @@ class ArticleParserService
}
// 获取目标文本后的所有内容
- private function getContentAfterText($targetText){
+ private function getContentAfterText($targetText,$return_type = 2){
$found = false;
$content = [];
- $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract'];
+ $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract','ABSTRACT'];
$maxLines = 200;
$lineNumber = 0;
foreach ($this->sections as $section) {
@@ -559,7 +1029,14 @@ class ArticleParserService
}
if (count($content) >= $maxLines || (isset($shouldStop) && $shouldStop)) break;
}
- return implode("\n", $content);
+ if($return_type == 1){
+ return $content;
+ }
+ $content = implode("\n", $content);
+ if(!empty($content) && !mb_check_encoding($content, 'UTF-8')){
+ $content = mb_convert_encoding($content, 'UTF-8', 'GBK');
+ }
+ return $content;
}
// 统一提取元素文本
@@ -635,7 +1112,9 @@ class ArticleParserService
$text = preg_replace('/[\x00-\x1F\x7F-\x9F]/', ' ', $text); // 移除控制字符
$text = str_replace(["\t", "\r", "\n"], ' ', $text); // 统一空白字符
$text = preg_replace('/\s+/', ' ', $text); // 合并多个空格
-
+ if(!empty($text) && !mb_check_encoding($text, 'UTF-8')){
+ $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
+ }
return $text;
}
@@ -676,13 +1155,17 @@ class ArticleParserService
$sContent .= "\n";
}
}
+
+ if(!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')){
+ $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
+ }
// 2. 基础文本清理(合并多余空格,保留有效换行)
$textContent = preg_replace('/(\S)\s+/', '$1 ', $sContent);
$textContent = trim($textContent);
// 3. 提取摘要
$abstract = '';
- $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords:|$)/i';
+ $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords|$)/i';
if (preg_match($abstractPattern, $textContent, $abstractMatches)) {
$abstract = trim($abstractMatches[1]);
$abstract = preg_replace('/\n+/', ' ', $abstract);
@@ -690,7 +1173,8 @@ class ArticleParserService
// 4. 提取关键词(核心:仅保留两种强制匹配逻辑)
$keywords = [];
// $keywordPattern = '/Keywords:\s*([\s\S]*?)(?=\s*\d+\.|[;,]\s*[\r\n]+\s*[\r\n]+|(?i)\bintroduction|abbreviations\b|$)/i';
- $keywordPattern = '/Keywords:\s*(.*?)\s*Keywords-End-Flag/s';
+ $keywordPattern = '/Keywords\s*(.*?)\s*Keywords-End-Flag/s';
+
if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
$keywordStr = trim($keywordMatches[1]);
@@ -705,6 +1189,22 @@ class ArticleParserService
return !empty($item) && !ctype_space($item);
});
}
+ if(empty($keywords)){
+ $keywordPattern = '/Keywords\s*([\s\S]*?)(?=Introduction|$)/i';
+ if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
+ $keywordStr = trim($keywordMatches[1]);
+ // 清理关键词列表格式(去除换行、末尾多余符号)
+ $keywordStr = preg_replace('/\n+/', ' ', $keywordStr);
+ $keywordStr = rtrim($keywordStr, ';,. '); // 去除末尾分号、逗号等
+ $keywordStr = trim($keywordStr);
+
+ // 分割并过滤有效关键词
+ $keywords = preg_split('/[,;]\s*/', $keywordStr);
+ $keywords = array_filter(array_map('trim', $keywords), function($item) {
+ return !empty($item) && !ctype_space($item);
+ });
+ }
+ }
return [
'status' => 1,
'msg' => '提取成功',
diff --git a/application/common/ProofReadService.php b/application/common/ProofReadService.php
index 2dcecf3..1a52e26 100644
--- a/application/common/ProofReadService.php
+++ b/application/common/ProofReadService.php
@@ -301,7 +301,8 @@ class ProofReadService
// 3. 核心优先级:运算符规则(精准匹配,排除No.编号干扰)
[
- 'pattern' => '~(\S)\s*([<>!]=|===|!==)\s*(\S)~u',
+ // 'pattern' => '~(\S)\s*([<>!]=|===|!==)\s*(\S)~u',
+ 'pattern' => '~(?)\s*(\S)\s*([<>!]=|===|!==)\s*(\S)(?!<[a-z]+>)~u',
'replacement' => '$1 $2 $3',
'verbatim_texts' => '复合运算符前后空格不规范',
'explanation' => '复合运算符[>=、<=、==、!=、===、!==]前后应各留一个空格',