Files
tougao/application/common/ProofReadService.php
2025-11-11 11:26:33 +08:00

1435 lines
68 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
class ProofReadService
{
private $errors = [];
/**
 * Main entry point: runs every enabled proofreading pass over the text.
 *
 * The passes run in a fixed order and each one receives the output of the
 * previous one.
 *
 * @param mixed $content Text to proofread.
 * @return array Empty array when $this->errors is still empty after all passes;
 *               otherwise the keys 'proof_before', 'proof_after' and 'errors'.
 */
public function proofread($content)
{
    // Start from a clean state on every call.
    $this->errors = [];
    $this->excludedFormats = [];

    // Ordered list of passes; each entry is the name of a method on this class.
    $passes = [
        'checkTimeUnitAbbreviations', // time-unit abbreviations
        'checkTextFormat',            // dashes / operators
        'checkNumberFormat',          // number formatting
        'checkNoFormatUniformity',    // uniform "No. 123456" notation
        'checkMlUnit',                // millilitre unit
        'checkPSignificance',         // italic P for significance values
        'checkFigureTableTitle',      // "Figure 1" / "Table 1" spelled out, never "Fig. 1" / "Tab 1"
        // The reference-reachability pass (checkDoi) is currently disabled.
    ];

    $correctedContent = $content;
    foreach ($passes as $pass) {
        $correctedContent = $this->{$pass}($correctedContent);
    }

    // Only build the report when at least one error is present.
    if (!empty($this->errors)) {
        return [
            'proof_before' => $content,
            'proof_after' => $correctedContent,
            'errors' => $this->errors
        ];
    }

    return [];
}
/**
 * Dash / operator / number-and-unit formatting pass.
 *
 * Steps, in order:
 *  1. Return the input untouched when it is not a non-blank string.
 *  2. Detect the encoding (UTF-8, GBK, GB2312, ISO-8859-1) and convert to UTF-8.
 *  3. Replace <wmath>/<math> elements and URL/DOI strings with placeholder
 *     markers so that the formatting rules cannot touch them.
 *  4. Apply every rule returned by getTextCoreRules(); each changed match is
 *     recorded through createError().
 *  5. Put the protected fragments back and hand the errors to handleErrors().
 *
 * @param mixed $content Text to check.
 * @return mixed The corrected text, or the original input when the result is not a string.
 */
private function checkTextFormat($content) {
// Errors collected by this pass
$errors = [];
$defaultReturn = $content;
$originalContent = $content; // untouched copy, used for position look-ups
$searchOffsetForExclude = 0; // search offset used only while masking protected fragments
$searchOffsetForCore = 0; // search offset used only while applying the core rules
// Input validation: nothing to do for non-strings and blank strings
if (!is_string($content) || trim($content) === '') {
$this->handleErrors($errors);
return $defaultReturn;
}
$corrected = $content;
$excludeMarkers = []; // marker => original text, for URL/DOI and <wmath>/<math> fragments
$processedHashes = [];
// Encoding handling: convert to UTF-8 when an encoding was detected
$originalEncoding = mb_detect_encoding($content, ['UTF-8', 'GBK', 'GB2312', 'ISO-8859-1'], true);
if ($originalEncoding === false) {
// Detection failed: the text is processed as-is.
} else {
$converted = @mb_convert_encoding($content, 'UTF-8', $originalEncoding);
$corrected = $converted !== false ? $converted : $content;
if ($converted === false) {
// NOTE(review): these two values are reassigned before they are ever read.
$posStart = 0;
$posEnd = min(20, strlen($originalContent));
}
}
// Mask <wmath>/<math> elements
$mathTagRegex = '~<(wmath|math)[^>]*?>.*?</\1>~is';
if (@preg_match($mathTagRegex, '') === false) {
// Pattern did not compile: masking is skipped silently.
} elseif (preg_match_all($mathTagRegex, $corrected, $matches, PREG_SET_ORDER)) {
// Longest fragments first
usort($matches, function($a, $b) {
return strlen($b[0]) - strlen($a[0]);
});
foreach ($matches as $index => $match) {
$fullTag = $match[0];
$tagType = $match[1];
$marker = "___EXCLUDE_{$tagType}_" . time() . "_{$index}___";
$excludeMarkers[$marker] = $fullTag;
// Uses the dedicated offset $searchOffsetForExclude.
// NOTE(review): $posStart/$posEnd computed here only feed the offset update below.
$posStart = strpos($originalContent, $fullTag, $searchOffsetForExclude);
$posEnd = ($posStart !== false) ? $posStart + strlen($fullTag) : -1;
$searchOffsetForExclude = ($posEnd !== -1) ? $posEnd : $searchOffsetForExclude + strlen($fullTag);
$safeFullTag = preg_quote($fullTag, '~');
// Replace only the first occurrence of this fragment with its marker
$corrected = preg_replace("~{$safeFullTag}~u", $marker, $corrected, 1);
}
}
// Mask URL/DOI strings
$urlDoiRegex = '~(
https?://[^\s/]{1,100} # 协议(http/https) + 域名(非空白/字符)
(?:/+[A-Za-z0-9\.\-]+(?:-[A-Za-z0-9\.\-]+)*)* # 多级路径(支持.html后接/1/23等格式
(?:\?[A-Za-z0-9_\-=&%\+\.\~]+)? # 可选查询参数(如?J_num=8&page=1
(?:\#[A-Za-z0-9_\-]+)? # 可选锚点(如#section
|
\b[a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,} # 无协议域名如example.com
(?:/+[A-Za-z0-9\.\-]+(?:-[A-Za-z0-9\.\-]+)*)* # 无协议多级路径
(?:\?[A-Za-z0-9_\-=&%\+\.\~]+)? # 无协议查询参数
(?:\#[A-Za-z0-9_\-]+)? # 无协议锚点
(?=$|[\s\.,;!]) # 结束边界(空白或标点)
|
doi:\s{0,10}\d+\.\d+/[A-Za-z0-9-+×:]+(?:-[A-Za-z0-9-+×:]+)* # DOI格式
)~iux';
if (@preg_match($urlDoiRegex, '') === false) {
// Pattern did not compile: masking is skipped silently.
} elseif (preg_match_all($urlDoiRegex, $corrected, $matches, PREG_SET_ORDER)) {
// Longest first, so a short URL contained in a longer one is not replaced inside it
usort($matches, function($a, $b) { return strlen($b[1]) - strlen($a[1]); });
foreach ($matches as $index => $match) {
$original = $match[1];
$marker = "___EXCLUDE_URL_" . time() . "_{$index}___";
$excludeMarkers[$marker] = $original;
// Same dedicated offset as the math-tag masking above
$posStart = strpos($originalContent, $original, $searchOffsetForExclude);
$posEnd = ($posStart !== false) ? $posStart + strlen($original) : -1;
$searchOffsetForExclude = ($posEnd !== -1) ? $posEnd : $searchOffsetForExclude + strlen($original);
// Replace exactly one occurrence of this URL with its marker
$corrected = preg_replace("~" . preg_quote($original, '~') . "~u", $marker, $corrected, 1);
}
}
// Apply the core formatting rules
$coreRules = $this->getTextCoreRules();
foreach ($coreRules as $rule) {
// Rules whose pattern does not compile are skipped without any report
if (@preg_match($rule['pattern'], '') === false) {
continue;
}
// Capture byte offsets together with the matches
$matchCount = preg_match_all(
$rule['pattern'],
$corrected,
$matches,
PREG_SET_ORDER | PREG_OFFSET_CAPTURE
);
if ($matchCount === 0) {
continue;
}
foreach ($matches as $match) {
$original = $match[0][0]; // matched text
$originalLen = strlen($original);
$hash = md5($original);
// Each distinct matched text is handled once across all rules
if (isset($processedHashes[$hash])) {
continue;
}
$offsetInCorrected = $match[0][1]; // offset of the match inside $corrected at match time
$prefixInCorrected = substr($corrected, 0, $offsetInCorrected);
$prefixInOriginal = strtr($prefixInCorrected, $excludeMarkers); // expand markers back to their text
$posStart = strlen($prefixInOriginal);
$posEnd = $posStart + $originalLen;
$contentCheck = substr($originalContent, $posStart, $originalLen);
$contentCheckConv = iconv('UTF-8', 'UTF-8//IGNORE', $contentCheck); // drop invalid byte sequences
$originalConv = iconv('UTF-8', 'UTF-8//IGNORE', $original);
if (strcmp($contentCheckConv, $originalConv) !== 0) {
// Computed position does not hold the matched text: search for it from the core offset instead
$localPattern = '~' . preg_quote($original, '~') . '~u';
if (preg_match($localPattern, $originalContent, $localMatch, PREG_OFFSET_CAPTURE, $searchOffsetForCore)) {
$posStart = $localMatch[0][1];
$posEnd = $posStart + $originalLen;
} else {
continue;
}
}
// Build the corrected text for this match
$fixed = is_callable($rule['replacement'])
? call_user_func($rule['replacement'], $match)
: preg_replace($rule['pattern'], $rule['replacement'], $original);
// Only act when the rule actually changes the text
if ($original !== $fixed && $fixed !== null) {
$searchOffsetForCore = $posEnd; // later fallback searches start after this match
// NOTE(review): str_replace rewrites every occurrence of $original in $corrected, not only this match.
$currentCorrected = str_replace($original, $fixed, $corrected);
// Record the error
$errors[] = $this->createError(
$original,
$fixed,
$rule['explanation'],
$originalContent,
$currentCorrected,
$posStart,
$posEnd,
$rule['error_type'] ?? ''
);
$processedHashes[$hash] = true;
$corrected = $currentCorrected;
}
}
}
// Restore the masked URL/DOI and math fragments
// NOTE(review): $restoreErrors is filled below but never read in this method.
$restoreErrors = [];
if (!empty($excludeMarkers)) {
$corrected = strtr($corrected, $excludeMarkers);
// Look for markers that survived the bulk restore
if (preg_match_all('~___EXCLUDE_(wmath|math|URL)_\d+_\d+___~', $corrected, $remaining)) {
foreach ($remaining[0] as $marker) {
$original = $excludeMarkers[$marker] ?? '未知内容';
$restoreErrors[] = "未正常还原的占位符: {$marker}(原始内容: {$original}";
$corrected = str_replace($marker, $original, $corrected); // force the restore
}
}
}
$this->handleErrors($errors);
return is_string($corrected) ? $corrected : $defaultReturn;
}
/**
 * Returns the ordered list of text-formatting rules.
 *
 * Every rule is an array with the keys 'pattern' (PCRE), 'replacement',
 * 'verbatim_texts', 'explanation' and 'error_type'.
 *
 * NOTE(review): several patterns below place unbounded quantifiers (\s*, \s+,
 * [a-z]+) inside a look-behind. PCRE requires look-behinds of bounded length,
 * so these patterns are expected to fail compilation — confirm with
 * preg_last_error_msg(). checkTextFormat() silently skips any rule whose
 * pattern does not compile.
 *
 * @return array[] List of rule definitions, highest priority first.
 */
private function getTextCoreRules()
{
return [
// 1. Highest priority: formats that must be left alone ("No." identifiers first).
// The replacement is '$0', i.e. the matched text is reproduced unchanged.
// NOTE(review): the year-range look-behind contains from\s+ and :\s* (unbounded).
[
'pattern' => '~
# 【首优先级】No.编号专属排除如No.: 2023YJZX-LN03/13、NO: KHYJ-2023-05、no. 123-ABC/45
# 支持变体No.大小写、冒号可带/不带点、冒号前后空格、编号含-/_/数字/字母
\b(?:No|NO|no)\.?:?\s* # 前缀No./NO./no.(冒号可选,点可选,后接任意空格)
[A-Za-z0-9\-\/_]+ # 编号主体:支持字母、数字、-、/、_覆盖2023YJZX-LN03/13
(?:-[A-Za-z0-9\-\/_]+)* # 编号后缀支持多段连接如2023YJZX-LN03/13-001
\b # 单词边界避免编号后接多余字符如No.: 2023abc
|
# 括号内分数及百分比组合(如(45/45)、(15.6%, 7/45)
\(\s*(?:\d+(?:\.\d+)?%?\s*,?\s*)?\d+(?:\.\d+)?\s*/\s*\d+(?:\.\d+)?\s*\)
|
# 独立年份范围如1849-1850、2023 - 2025
(?<!\d|[\+\-\*\/=<>]|from\s+|:\s*|\[[MZACDP]\]\.\s*)\d{4}\s*-\s*\d{4}
(?!\d|[\+\-\*\/=<>]|[,\.:;!]|\+\d+\.)
|
# from+年份范围如from 1849-1850
\bfrom\s+\d{4}\s*-\s*\d{4}\b
|
# 带单位的数字范围/倍数如50-200 nm、10×5 cm
\b\d+\s*[-×]\s*\d+\s*[a-zA-Z%]
|
# 无No.前缀的项目编号如2023YJZX-LN03/13、KHYJ-2023-05-01
[A-Za-z]+-?\d+[-/]\d+[-/]\d*
|
# 参考文献格式(期刊/专著等)
\d{4},\s*\d{1,3}\(\d{1,2}\):\s*\d+-\d+(?:\+\d+)*\.|[^\n]+\[[MZACDP]\]\.\s*[^\n]+,\s*\d{4}:\s*\d+-\d+\.
~ux',
'replacement' => '$0', // keep the original text exactly as it is
'verbatim_texts' => 'No.编号及非运算场景无需处理',
'explanation' => 'No.系列编号如No.: 2023YJZX-LN03/13、括号内分数、年份范围、带单位数字范围、项目编号、参考文献等非运算场景的符号不做处理',
'error_type' => 'exclude'
],
// 2. Second priority: numeric ranges (kept ahead of the "-" operator rules)
// Bracketed range written with an em dash (U+2014) -> hyphen
[
'pattern' => '~(\[\s*[-]?\d+\s*)\x{2014}\s*(\d+\s*\])~u',
'replacement' => '$1-$2',
'verbatim_texts' => '带括号数字范围长划线不规范',
'explanation' => '带括号的数字范围应使用短划线[-]',
'error_type' => 'en-dash'
],
// Bracketed range: no spaces around the hyphen
[
'pattern' => '~(\[\s*[-]?\d+)\s*-\s*(\d+\s*\])~u',
'replacement' => '$1-$2',
'verbatim_texts' => '带括号数字范围短划线空格不规范',
'explanation' => '带括号数字范围的短划线[-]前后不应留空格',
'error_type' => 'en-dash'
],
// Unbracketed range written with an em dash -> hyphen
[
'pattern' => '~(\b\d+)\s*—\s*(\d+\b)~u',
'replacement' => '$1-$2',
'verbatim_texts' => '无括号数字范围长划线不规范',
'explanation' => '无括号的数字范围应使用短划线[-]',
'error_type' => 'bracket_en-dash'
],
// Unbracketed range of 1-3 digit numbers: no spaces around the hyphen
// NOTE(review): look-behind contains :\s* (unbounded).
[
'pattern' => '~
(?<!\d{4}|:\s*|(?:No|NO|no)\.?:\s*) # 排除No.编号前缀
\b(\d{1,3})\s*-\s*(\d{1,3})\b
(?!\d{4}|\+\d+\.)
~ux',
'replacement' => '$1-$2',
'verbatim_texts' => '无括号数字范围短划线空格不规范',
'explanation' => '无括号数字范围的短划线[-]前后不应留空格',
'error_type' => 'bracket_en-dash'
],
// 3. Core priority: operator rules
// Compound comparison operators get one space on each side
// NOTE(review): look-behind contains [a-z]+ (unbounded).
[
// Earlier variant, kept for reference:
// 'pattern' => '~(\S)\s*([<>!]=|===|!==)\s*(\S)~u',
'pattern' => '~(?<!</[a-z]+>)\s*(\S)\s*([<>!]=|===|!==)\s*(\S)(?!<[a-z]+>)~u',
'replacement' => '$1 $2 $3',
'verbatim_texts' => '复合运算符前后空格不规范',
'explanation' => '复合运算符[>=、<=、==、!=、===、!==]前后应各留一个空格',
'error_type' => 'composite_operator'
],
// Stand-alone equals sign gets one space on each side
[
'pattern' => '~
(?<!=|<|>|\*|\+|-|/)
(\S+?)\s*=\s*(\S+?)
(?!=|<|>|\*|\+|-|/)
~ux',
'replacement' => '$1 = $2',
'verbatim_texts' => '等号前后空格不规范',
'explanation' => '独立等号[=]前后应各留一个空格',
'error_type' => 'equal'
],
// Multiplication: "*" between numbers becomes " × "
// NOTE(review): look-behind contains ×\s* and :\s* (unbounded).
[
'pattern' => '~
(?<!\D|×\s*|(?:No|NO|no)\.?:\s*) # 排除No.编号前缀
(\d+(?:\.\d+)?)
\s*(\*)
\s*(\d+(?:\.\d+)?)
(?!\D|\s*\])
~ux',
'replacement' => '$1 × $3',
'verbatim_texts' => '乘法运算符格式不规范',
'explanation' => '乘法运算应使用标准乘号[×],前后各留一个空格',
'error_type' => 'ride'
],
// Division: one space on each side of "/" between numbers
// NOTE(review): the "\#" on the look-behind line is an escaped, literal "#" under the x
// modifier, so the text after it looks like part of the pattern rather than a comment —
// confirm. The look-behind also contains \(\s* and :\s* (unbounded).
[
'pattern' => '~
(?<!\D|\(\s*|(?:No|NO|no)\.?:\s*) \# 排除No.编号前缀
(\d+(?:\.\d+)?)
\s*(\/)
\s*(\d+(?:\.\d+)?)
(?!\D|\s*\))
~ux',
'replacement' => '$1 $2 $3',
'verbatim_texts' => '除法运算符前后空格不规范',
'explanation' => '除法运算符[/]前后应各留一个空格(纯数字运算场景)',
'error_type' => 'except'
],
// Addition: one space on each side of "+" between numbers
// NOTE(review): look-behind contains -\s* and :\s* (unbounded).
[
'pattern' => '~
(?<!\D|-\s*|(?:No|NO|no)\.?:\s*) # 排除No.编号前缀
(\d+(?:\.\d+)?)
\s*(\+)
\s*(\d+(?:\.\d+)?)
(?!\D|\s*\.)
~ux',
'replacement' => '$1 $2 $3',
'verbatim_texts' => '加法运算符前后空格不规范',
'explanation' => '加法运算符[+]前后应各留一个空格(纯数字运算场景)',
'error_type' => 'plus'
],
// Subtraction: one space on each side of "-" between numbers
// NOTE(review): look-behind contains from\s+ and :\s* (unbounded). If this rule were made to
// compile it would ask for spaces around "-" while the range rules above remove them.
[
'pattern' => '~
(?<!\D|from\s+|:\s*|(?:No|NO|no)\.?:\s*) # 排除No.编号前缀
(\d+(?:\.\d+)?)
\s*(-)
\s*(\d+(?:\.\d+)?)
(?!\D|\s*\,)
~ux',
'replacement' => '$1 $2 $3',
'verbatim_texts' => '减法运算符前后空格不规范',
'explanation' => '减法运算符[-]前后应各留一个空格(纯数字运算场景)',
'error_type' => 'reduce'
],
// 4. Low priority: special symbols
// No space between a number and "%"
[
'pattern' => '~(\d+)\s+%~u',
'replacement' => '$1%',
'verbatim_texts' => '数字与百分号空格不规范',
'explanation' => '数字与百分号[%]之间不应留空格',
'error_type' => 'number_percentage'
],
// No spaces around "×" between two numbers
[
'pattern' => '~(\d+)\s+×\s+(\d+)~u',
'replacement' => '$1×$2',
'verbatim_texts' => '倍数乘号空格不规范',
'explanation' => '乘号[×]表示倍数时前后不应留空格',
'error_type' => 'multiple'
],
// No spaces around ":" between two numbers
[
'pattern' => '~(\d+)\s+:\s+(\d+)~u',
'replacement' => '$1:$2',
'verbatim_texts' => '比值符号空格不规范',
'explanation' => '比值符号[:]前后不应留空格',
'error_type' => 'biliel'
]
];
}
/**
 * Number-format pass: trims insignificant trailing zeros from plain decimals
 * and adds thousands separators to plain integers of seven or more digits.
 *
 * Fragments that must not be touched (bracketed numbers, postal codes and
 * phone numbers, software versions, special decimals, years, decimals glued
 * to a unit, short-prefix identifiers, URLs/DOIs) are swapped for placeholders
 * first and restored at the end.
 *
 * Fixes compared with the previous implementation:
 *  - Corrections are applied at the matched position only. The former global
 *    strtr() map also rewrote look-alike substrings of other numbers
 *    (a "2.0" -> "2" entry turned "12.05" into "125").
 *  - Placeholders are restored in reverse creation order, so a placeholder
 *    captured inside a later one (e.g. a year inside a URL) is expanded
 *    instead of being deleted by the final clean-up.
 *  - Thousands grouping works on the digit string, not through a float, so
 *    long integers keep all their digits. Digit strings with a leading zero
 *    are left alone (number_format() used to drop the zeros).
 *  - Every occurrence of a repeated number is corrected; the error is still
 *    reported once per distinct number.
 *  - Error positions are derived from the match offsets instead of one
 *    forward-only strpos() cursor shared by all phases.
 *
 * @param mixed $content Text to check.
 * @return mixed Corrected text; the input itself when it is not a non-blank
 *               string or when a PCRE call fails at runtime.
 */
private function checkNumberFormat($content) {
    $errors = [];
    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }
    $originalContent = $content;
    $working = $content;

    // Protection phases, applied in this order. The key becomes part of the
    // placeholder name, which the look-aheads further down rely on.
    $protectPatterns = [
        // "(960-1279)" style bracketed numbers / ranges
        'BRACKETED_NUM' => '~\(\d+[-\d]*\d+\)~u',
        // "<words> 223300" postal codes, and area code + landline number
        'POSTAL_CODE' => '~\b(?:[A-Za-z]+(?:\s+[A-Za-z]+)*|[\x{4e00}-\x{9fa5}]+)\s+\d{4,6}\b|\b0\d{2,3}\d{7}\b~ui',
        // "<words> 1.2.3" software versions
        'SOFTWARE_VERSION' => '~\b(?:[A-Za-z]+(?:\s+[A-Za-z]+)*|[\x{4e00}-\x{9fa5}]+(?:\s+[\x{4e00}-\x{9fa5}]+)*)\s+\d+\.\d+(?:\.\d+)*\b~ui',
        // special decimals (a= ..., decimal+letters, decimal +/- decimal)
        'SPECIAL_DECIMAL' => '~a=\s*[\d+\.\d+[A-Za-z]+\d*\-+]+|\b\d+\.\d+[A-Za-z]+\d*\b|\b\d+\.\d+[-+]\d+\.\d+[A-Za-z]+\d*\b|\b\d+\.\d+[-+]\d+\.\d+\b~u',
        // years / year-month / "2023-0021"
        'DATE_PROTECT' => '~\b(?:20\d{2}|20\d{2}(0[1-9]|1[0-2])|20\d{2}-00\d{2})\b(?!\s*[A-Za-z]|\.)~u',
        // decimals glued to a unit, e.g. 1.20mL
        'DECIMAL_ALPHA' => '~\b(?:\d+\.\d+[A-Za-z]+|\d+\.[A-Za-z]+)\b(?!\s*[0-9.])~u',
        // short alphabetic prefix followed by an identifier
        'UNIVERSAL_PREFIX' => '~(?:^|\s|\()(?:(?!No\.|NO\.|PO|SO|SN|BN|REF|ORD|ID|PID)[A-Za-z]{1,3}(?:s?\.?))\s*(?:[A-Za-z]+\d+|\d+[A-Za-z]+|[A-Za-z]+\d+[A-Za-z]+|\d{1,3}(?:,\d{3})*|\d+)(?:$|\s|\)|\,|\.)~u',
        // URLs and DOIs
        'URL_DOI' => '#([^\w]|^)(https?://[^<>\s]+|doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9]{1,30})([^\w]|$)#i',
    ];

    // One placeholder => original-text map per phase, in creation order.
    $placeholderLayers = [];
    foreach ($protectPatterns as $label => $pattern) {
        if (@preg_match($pattern, '') === false) {
            continue;
        }
        $layer = [];
        $protected = preg_replace_callback(
            $pattern,
            function ($matches) use (&$layer, $label) {
                $placeholder = '___' . $label . '_' . uniqid() . '___';
                $layer[$placeholder] = $matches[0];
                return $placeholder;
            },
            $working
        );
        if (!is_string($protected)) {
            // PCRE runtime failure (e.g. invalid UTF-8): leave the text alone.
            $this->handleErrors($errors);
            return $content;
        }
        $working = $protected;
        $placeholderLayers[] = $layer;
        unset($layer); // drop the reference so the next phase starts with a fresh map
    }

    // Expands placeholders newest-first, so nested placeholders come back too.
    $expand = function ($text) use ($placeholderLayers) {
        foreach (array_reverse($placeholderLayers) as $layer) {
            if (!empty($layer)) {
                $text = strtr($text, $layer);
            }
        }
        return $text;
    };

    // ---- Phase 1: trailing zeros of plain decimals ----
    $zeroRules = [
        [
            'pattern' => '~(-?\d+\.\d*[1-9])0+(?!\d|e|E|___DATE_PROTECT_|___DECIMAL_ALPHA_|___UNIVERSAL_PREFIX_|No\.|PO|SO|___SPECIAL_DECIMAL_|___SOFTWARE_VERSION_|___POSTAL_CODE_|___BRACKETED_NUM_|\-|\+|[A-Za-z])~ix',
            'message' => "删除普通小数后末尾无效零",
        ],
        [
            'pattern' => '~(-?\d+)\.0+(?!\d|e|E|___DATE_PROTECT_|___DECIMAL_ALPHA_|___UNIVERSAL_PREFIX_|No\.|PO|SO|___SPECIAL_DECIMAL_|___SOFTWARE_VERSION_|___POSTAL_CODE_|___BRACKETED_NUM_|\-|\+|[A-Za-z])~ix',
            'message' => "删除普通小数后全量无效零",
        ],
    ];
    $trimEdits = []; // byte offset in $working => edit
    foreach ($zeroRules as $rule) {
        if (!preg_match_all($rule['pattern'], $working, $found, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
            continue;
        }
        foreach ($found as $m) {
            $start = $m[0][1];
            if (!isset($trimEdits[$start])) {
                $trimEdits[$start] = [
                    'original' => $m[0][0],
                    'fixed' => $m[1][0],
                    'message' => $rule['message'],
                ];
            }
        }
    }
    ksort($trimEdits);

    $trimmed = '';
    $cursor = 0;
    $trimLog = [];  // [end offset of the edit inside $trimmed, bytes removed]
    $reported = [];
    foreach ($trimEdits as $start => $edit) {
        if ($start < $cursor) {
            continue; // overlaps an edit that was already applied
        }
        $length = strlen($edit['original']);
        $trimmed .= substr($working, $cursor, $start - $cursor) . $edit['fixed'];
        $cursor = $start + $length;
        $trimLog[] = [strlen($trimmed), $length - strlen($edit['fixed'])];
        if (isset($reported[$edit['original']])) {
            continue; // fixed here as well, but reported only once
        }
        $reported[$edit['original']] = true;
        // Position in the untouched input = length of the expanded prefix.
        $posStart = strlen($expand((string) substr($working, 0, $start)));
        $errors[] = $this->createError(
            $edit['original'], $edit['fixed'], $edit['message'],
            $originalContent, $expand($trimmed . substr($working, $cursor)),
            $posStart, $posStart + $length, 'invalid_zero'
        );
    }
    $trimmed .= substr($working, $cursor);

    // Maps an offset in $trimmed back to the matching offset in $working.
    $toWorkingOffset = function ($offset) use ($trimLog) {
        $shift = 0;
        foreach ($trimLog as $entry) {
            if ($entry[0] > $offset) {
                break;
            }
            $shift += $entry[1];
        }
        return $offset + $shift;
    };

    // ---- Phase 2: thousands separators ----
    $excludePatterns = implode('|', [
        'https?://[^<>\s]+|doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9]{1,30}',
        '20\d{2}(?:0[1-9]|1[0-2])?(?:0[1-9]|[12]\d|3[01])?|20\d{2}-00\d{2}',
        '\d+\.\d+[A-Za-z]+|\d+\.[A-Za-z]+',
        '\(\d+[-\d]*\d+\)',
        '(?:[A-Za-z]+(?:\s+[A-Za-z]+)*|[\x{4e00}-\x{9fa5}]+)\s+\d{4,6}\b|0\d{2,3}\d{7}\b',
        '(?:[A-Za-z]+(?:\s+[A-Za-z]+)*|[\x{4e00}-\x{9fa5}]+(?:\s+[\x{4e00}-\x{9fa5}]+)*)\s+\d+\.\d+(?:\.\d+)*',
        '[A-Za-z]{1,3}s?\.?\s*(?:[A-Za-z]+\d+|\d+[A-Za-z]+|\d{1,3}(?:,\d{3})*|\d+)',
        'No\.?\s*\d+|PO\s*\d+|SO\s*\d+|SN\s*\d+',
        'a=\s*[\d+\.\d+[A-Za-z]+\d*\-+]+',
        '___DATE_PROTECT_.*?___|___DECIMAL_ALPHA_.*?___|___UNIVERSAL_PREFIX_.*?___|___URL_DOI_.*?___|___SPECIAL_DECIMAL_.*?___|___SOFTWARE_VERSION_.*?___|___POSTAL_CODE_.*?___|___BRACKETED_NUM_.*?___'
    ]);
    // Only pure integers of seven or more digits are grouped, so that
    // six-digit batch numbers are not touched.
    $thousandPattern = sprintf(
        '#(?<!\.)\b(?!(?:%s))\d{7,}\b(?!\.)#ixu',
        str_replace('#', '\#', $excludePatterns)
    );
    // Digit runs that look like dates, postal codes or phone numbers are skipped.
    $skipPattern = '~20\d{2}(0[1-9]|1[0-2])?(0[1-9]|[12]\d|3[01])?|20\d{2}-00\d{2}|(?:[A-Za-z]+|[\x{4e00}-\x{9fa5}]+)\s*\d{4,6}\b|0\d{2,3}\d{7}\b~u';

    $grouped = $trimmed;
    if (@preg_match($thousandPattern, '') !== false
        && preg_match_all($thousandPattern, $trimmed, $found, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
        $grouped = '';
        $cursor = 0;
        $reported = [];
        foreach ($found as $m) {
            $digits = $m[0][0];
            $start = $m[0][1];
            // ASCII digits only (\d is Unicode-aware under /u) and no leading zero.
            if (!ctype_digit($digits) || $digits[0] === '0' || preg_match($skipPattern, $digits)) {
                continue;
            }
            // Group from the right on the string itself: no float round-trip.
            $formatted = strrev(implode(',', str_split(strrev($digits), 3)));
            $grouped .= substr($trimmed, $cursor, $start - $cursor) . $formatted;
            $cursor = $start + strlen($digits);
            if (isset($reported[$digits])) {
                continue;
            }
            $reported[$digits] = true;
            $posStart = strlen($expand((string) substr($working, 0, $toWorkingOffset($start))));
            $errors[] = $this->createError(
                $digits, $formatted, "四位及以上的数字需要每三位加一个逗号",
                $originalContent, $expand($grouped . substr($trimmed, $cursor)),
                $posStart, $posStart + strlen($digits), 'thousandth_separator'
            );
        }
        $grouped .= substr($trimmed, $cursor);
    }

    // ---- Restore the protected fragments ----
    $restored = $expand($grouped);
    // Safety net kept from the previous implementation: drop any placeholder that is still left.
    $restored = preg_replace('~___(BRACKETED_NUM|POSTAL_CODE|SOFTWARE_VERSION|SPECIAL_DECIMAL|DATE_PROTECT|DECIMAL_ALPHA|UNIVERSAL_PREFIX|URL_DOI)_.*?___~', '', $restored);
    $this->handleErrors($errors);
    return is_string($restored) ? $restored : $content;
}
/**
 * Normalises prefixed identifiers to "<Prefix> <number>":
 * "NO.45-A" -> "No. 45-A", "po123" -> "PO 123".
 *
 * Fixes compared with the previous implementation:
 *  - The number part is read from the rule's own capture group. The former
 *    code always took group 1, which is the prefix for the business-prefix
 *    rule, so "PO123" was rewritten to "PO PO".
 *  - Corrections are applied at the matched position only. The former global
 *    strtr() map also rewrote substrings the pattern had rejected
 *    (a match on "no 1" changed "casino 1" as well).
 *  - Error positions come from the match offsets instead of a forward-only
 *    strpos() cursor shared by all rules.
 *  - The former "混合批号" rule returned its input unchanged in every case and
 *    has been removed, together with the commented-out multi-segment rule.
 *
 * @param mixed $content Text to check.
 * @return mixed Corrected text, or the input itself when it is not a non-blank string.
 */
private function checkNoFormatUniformity($content) {
    $errors = [];
    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }
    $originalContent = $content;

    // A match that also looks like one of these is left alone.
    $skipPatterns = [
        '~(?:[A-Za-z]+(?:\s+[A-Za-z]+)*|[\x{4e00}-\x{9fa5}]+)\s+\d{4,6}\b~u', // postal code
        '~0\d{2,3}\d{7}\b~u',                                                 // area code + phone number
        '~20\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])?|20\d{2}-00\d{2}~',      // date-like number
        '~https?://[^<>\s]+~i',                                               // URL
        '~doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9\-_]+~i',                  // DOI
    ];

    // Shared tail: not followed by a date unit, a date, a decimal part, a URL or a DOI.
    $notFollowedBy = '(?!\s*[年月日]|20\d{2}(?:0[1-9]|1[0-2])?|\.\d+|20\d{2}-00\d{2}'
        . '|https?://[^<>\s]+|doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9\-_]+)';

    // prefixGroup: capture group holding the prefix (null = use standardPrefix as-is).
    // bodyGroup:   capture group holding the number part.
    $batchNumberRules = [
        [
            'name' => 'No.前缀批号',
            'pattern' => '~\b(?:[Nn][Oo]\.|[Nn][Oo]|NO\.|NO)\s*(\d+[A-Za-z0-9\-_]*)\b' . $notFollowedBy . '~u',
            'standardPrefix' => 'No.',
            'prefixGroup' => null,
            'bodyGroup' => 1,
            'description' => '带No.前缀的编号如No. 123、NO.45-A'
        ],
        [
            'name' => '业务前缀批号',
            'pattern' => '~\b(PO|SO|SN|BN|REF|ORD|ID|PID)\s*(\d+[A-Za-z0-9\-_]*)\b' . $notFollowedBy . '~iu',
            'standardPrefix' => null,
            'prefixGroup' => 1,
            'bodyGroup' => 2,
            'description' => '带业务前缀的编号'
        ],
    ];

    // Applies the collected edits right-to-left so that earlier offsets stay valid.
    $applyEdits = function ($text, array $edits) {
        krsort($edits);
        foreach ($edits as $start => $edit) {
            $text = substr_replace($text, $edit['fixed'], $start, $edit['length']);
        }
        return $text;
    };

    $edits = []; // byte offset in the input => ['length' => int, 'fixed' => string]
    foreach ($batchNumberRules as $rule) {
        if (@preg_match($rule['pattern'], '') === false) {
            continue;
        }
        if (!preg_match_all($rule['pattern'], $originalContent, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
            continue;
        }
        foreach ($matches as $match) {
            $originalFull = $match[0][0];
            $posStart = $match[0][1];
            $posEnd = $posStart + strlen($originalFull);

            $skip = false;
            foreach ($skipPatterns as $skipPattern) {
                if (preg_match($skipPattern, $originalFull)) {
                    $skip = true;
                    break;
                }
            }
            if ($skip) {
                continue;
            }

            $prefix = $rule['prefixGroup'] === null
                ? $rule['standardPrefix']
                : strtoupper($match[$rule['prefixGroup']][0]);
            $fixedFull = $prefix . ' ' . $match[$rule['bodyGroup']][0];
            if ($originalFull === $fixedFull) {
                continue;
            }

            // Never stack two edits on the same bytes.
            $overlaps = false;
            foreach ($edits as $start => $edit) {
                if ($posStart < $start + $edit['length'] && $start < $posEnd) {
                    $overlaps = true;
                    break;
                }
            }
            if ($overlaps) {
                continue;
            }
            $edits[$posStart] = ['length' => strlen($originalFull), 'fixed' => $fixedFull];

            // Report each distinct correction once; every occurrence is still fixed.
            $errorHash = md5($originalFull . $fixedFull);
            if (!isset($errors[$errorHash])) {
                $errors[$errorHash] = $this->createError(
                    $originalFull, $fixedFull,
                    "{$rule['description']}格式不规范,标准格式为「{$fixedFull}",
                    $originalContent, $applyEdits($originalContent, $edits),
                    $posStart, $posEnd, $rule['name']
                );
            }
        }
    }

    $corrected = !empty($edits) ? $applyEdits($originalContent, $edits) : $originalContent;
    $this->handleErrors($errors);
    return $corrected;
}
/**
 * Time-unit pass: rewrites "<number> <unit>" to "<number><abbr>" for hours
 * (h), minutes (min) and seconds (s), e.g. "5 hours" -> "5h", "30 MIN" -> "30min".
 *
 * Fixes compared with the previous implementation:
 *  - The former pattern could not compile: it had an unbounded \d+ inside the
 *    look-behind, "\#" sequences that turned the inline comments into pattern
 *    text, and a literal "ucfirst(...)" alternative. Because the failure was
 *    suppressed with @, this pass never changed anything. The pattern is now
 *    a single expression with a one-character look-behind.
 *  - A number is only matched from its first digit (the look-behind also
 *    rejects a preceding digit or dot), so "v5.0 h" is not picked up as "0 h".
 *  - All units are matched in one scan and corrections are applied at the
 *    matched position. The former strtr() map would also have rewritten
 *    look-alike substrings such as the "5 S" in "15 Samples".
 *  - Error positions come from the match offsets.
 *
 * NOTE(review): this pass is now active. A number followed by a lone "h" or
 * "s" token is treated as a time value — confirm this is acceptable for the
 * manuscripts being checked.
 *
 * @param mixed $content Text to check.
 * @return mixed Corrected text, or the input itself when it is not a non-blank string.
 */
private function checkTimeUnitAbbreviations($content) {
    $errors = [];
    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }
    $originalContent = $content;

    // Supported units: English singular/plural, abbreviation, Chinese name.
    $timeUnits = [
        ['full' => 'hour', 'plural' => 'hours', 'abbr' => 'h', 'description' => '小时', 'cn_full' => '小时'],
        ['full' => 'minute', 'plural' => 'minutes', 'abbr' => 'min', 'description' => '分钟', 'cn_full' => '分钟'],
        ['full' => 'second', 'plural' => 'seconds', 'abbr' => 's', 'description' => '秒', 'cn_full' => '秒'],
    ];

    // Lower-case spelling => unit definition.
    $unitBySpelling = [];
    foreach ($timeUnits as $unit) {
        foreach ([$unit['plural'], $unit['full'], $unit['abbr'], $unit['cn_full']] as $spelling) {
            $unitBySpelling[$spelling] = $unit;
        }
    }
    // Longest spelling first, so "hours" is tried before "h".
    $spellings = array_keys($unitBySpelling);
    usort($spellings, function ($a, $b) {
        return strlen($b) - strlen($a);
    });
    $alternation = implode('|', array_map(function ($spelling) {
        return preg_quote($spelling, '~');
    }, $spellings));

    // Left guard:  not after a letter, "_", "/", "-", "(", ":", a digit or ".".
    // Group 1:     the number (integer or decimal).
    // Group 2:     the unit, in any letter case.
    // Right guard: not before a letter, "_", "/", "-", a digit, ")", "," or ".";
    //              the unit must also end at a word boundary.
    $pattern = '~(?<![A-Za-z_/\-(:\d.])(\d+(?:\.\d+)?)\s*(' . $alternation . ')(?![A-Za-z_/\-\d),.])\b~iu';

    if (!preg_match_all($pattern, $originalContent, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
        // No match, or a PCRE runtime failure such as invalid UTF-8.
        $this->handleErrors($errors);
        return $content;
    }

    $rebuilt = '';
    $cursor = 0;
    foreach ($matches as $match) {
        $original = $match[0][0];
        $posStart = $match[0][1];
        $number = $match[1][0];
        $unitPart = $match[2][0];

        $spelling = mb_strtolower($unitPart, 'UTF-8');
        if (!isset($unitBySpelling[$spelling])) {
            continue; // case-folded look-alike that is not one of our spellings
        }
        $unit = $unitBySpelling[$spelling];
        $fixed = $number . $unit['abbr'];
        if ($original === $fixed) {
            continue; // already in the standard form
        }

        $posEnd = $posStart + strlen($original);
        $rebuilt .= substr($originalContent, $cursor, $posStart - $cursor) . $fixed;
        $cursor = $posEnd;

        // Report each distinct correction once; every occurrence is still fixed.
        $errorHash = md5($original . $fixed);
        if (isset($errors[$errorHash])) {
            continue;
        }
        if (stripos($unitPart, $unit['full']) !== false || strpos($unitPart, $unit['cn_full']) !== false) {
            $errorReason = "应使用缩写'{$unit['abbr']}'(不使用全称'{$unitPart}'";
        } elseif (preg_match('~\s~u', $original)) {
            $errorReason = "数字与单位间不应有空格";
        } else {
            $errorReason = "单位缩写应小写'{$unit['abbr']}'(不使用'{$unitPart}'";
        }
        $errors[$errorHash] = $this->createError(
            $original,
            $fixed,
            "{$unit['description']}格式不规范:{$errorReason},标准格式为[数字{$unit['abbr']}]如3h、2.5min",
            $originalContent,
            $rebuilt . substr($originalContent, $cursor),
            $posStart,
            $posEnd,
            $unit['full']
        );
    }
    $corrected = $rebuilt . substr($originalContent, $cursor);

    $this->handleErrors($errors);
    return $corrected;
}
/**
 * Millilitre pass: rewrites every letter-case variant of the unit "ml"
 * ("ml", "ML", "Ml"), with or without a leading number, to "mL".
 *
 * Fixes compared with the previous implementation:
 *  - The former pattern used "/" as its delimiter, contained a "/" inside an
 *    inline comment and lacked the x modifier, so it never compiled. Because
 *    the failure was suppressed with @, this pass never changed anything. The
 *    pattern is now a single-line expression with "~" delimiters.
 *  - Corrections are applied at the matched position. The former strtr() map
 *    would also have rewritten the "ml" inside words such as "html".
 *  - Error positions come from the match offsets.
 *
 * NOTE(review): "ML" standing on its own after a space (e.g. initials in
 * "Yeh ML") still matches, because only the character directly before the
 * number/unit is inspected — confirm whether author initials need extra
 * protection.
 *
 * @param mixed $content Text to check.
 * @return mixed Corrected text, or the input itself when it is not a non-blank string.
 */
private function checkMlUnit($content) {
    $errors = [];
    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }
    $originalContent = $content;

    // Group 1: optional number plus optional whitespace. Group 2: the unit.
    // A letter directly before or after rejects the match.
    $mlPattern = '~(?<!\p{L})(\d+(?:\.\d+)?\s*)?(ml)(?!\p{L})~iu';
    if (!preg_match_all($mlPattern, $originalContent, $allMatches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
        // No match, or a PCRE runtime failure such as invalid UTF-8.
        $this->handleErrors($errors);
        return $content;
    }

    $rebuilt = '';
    $cursor = 0;
    foreach ($allMatches as $matchItem) {
        $originalFull = $matchItem[0][0];
        $posStart = $matchItem[0][1];
        $prefix = isset($matchItem[1][0]) ? $matchItem[1][0] : ''; // '' when no number precedes the unit
        $fixedFull = $prefix . 'mL';
        if ($originalFull === $fixedFull) {
            continue; // already in the standard form
        }

        $posEnd = $posStart + strlen($originalFull);
        $rebuilt .= substr($originalContent, $cursor, $posStart - $cursor) . $fixedFull;
        $cursor = $posEnd;

        // Report each distinct correction once; every occurrence is still fixed.
        $errorHash = md5($originalFull . $fixedFull);
        if (!isset($errors[$errorHash])) {
            $errors[$errorHash] = $this->createError(
                $originalFull,
                $fixedFull,
                '毫升单位格式不规范,标准写法为[mL]',
                $originalContent,
                $rebuilt . substr($originalContent, $cursor),
                $posStart,
                $posEnd,
                'mL'
            );
        }
    }
    $corrected = $rebuilt . substr($originalContent, $cursor);

    $this->handleErrors($errors);
    return $corrected;
}
/**
 * Proofread significance P-values: the leading "P"/"p" must be set in italics.
 *
 * Every occurrence the pattern recognises (e.g. "P=0.05", "p = 1.2e-3") gets its letter
 * wrapped in <i>…</i>; separator and number are kept verbatim. Only the first occurrence
 * of each distinct spelling is reported, but all recognised occurrences are corrected.
 * Reported positions are byte offsets into the input.
 *
 * NOTE(review): the pattern only allows an optional "=" between the letter and the
 * number, so "P < 0.05" is not recognised while bare tokens such as "p53" are —
 * confirm whether that is intended before widening or narrowing it.
 *
 * @param mixed $content Text to proofread; non-string or blank input is returned as-is.
 * @return mixed Corrected text, or the untouched input when nothing matched.
 */
private function checkPSignificance($content) {
    $errors = [];
    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }
    $originalContent = $content;
    // P/p, optional "=" with optional spacing, then an integer/decimal, optionally in
    // scientific notation.
    $pValuePattern = '/\b([Pp])(\s*=?\s*)(\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\b/';
    $matchCount = preg_match_all($pValuePattern, $content, $allMatches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
    if (empty($matchCount)) {
        // false = PCRE runtime failure; 0 = no P-value present.
        $this->handleErrors($errors);
        return $content;
    }
    // Applies a "wrong => fixed" map to genuine regex matches only. A plain strtr()
    // would also rewrite the same characters inside tokens the pattern rejected
    // (e.g. the "p1" inside "exp1" once a standalone "p1" was seen).
    $applyMap = static function (array $map) use ($pValuePattern, $originalContent) {
        $rewritten = preg_replace_callback(
            $pValuePattern,
            static function (array $hit) use ($map) {
                return isset($map[$hit[0]]) ? $map[$hit[0]] : $hit[0];
            },
            $originalContent
        );
        return is_string($rewritten) ? $rewritten : $originalContent;
    };
    $replaceMap = [];
    foreach ($allMatches as $matchItem) {
        $original = $matchItem[0][0];
        $pChar = $matchItem[1][0];
        $separator = $matchItem[2][0];
        $number = $matchItem[3][0];
        // Only the letter is italicised.
        $fixed = "<i>{$pChar}</i>{$separator}{$number}";
        if ($original === $fixed) {
            continue;
        }
        // Exact byte offsets straight from the regex engine.
        $posStart = $matchItem[0][1];
        $posEnd = $posStart + strlen($original);
        // One report per distinct "wrong => fixed" pair.
        $errorHash = md5($original . $fixed);
        if (!isset($errors[$errorHash])) {
            $errors[$errorHash] = $this->createError(
                $original,
                $fixed,
                '显著性P值格式不规范,P/p应使用斜体',
                $originalContent,
                // Preview: every fix found so far plus the current one.
                $applyMap($replaceMap + [$original => $fixed]),
                $posStart,
                $posEnd,
                'P'
            );
        }
        $replaceMap[$original] = $fixed;
    }
    $corrected = empty($replaceMap) ? $content : $applyMap($replaceMap);
    $this->handleErrors($errors);
    return $corrected;
}
/**
 * Proofread figure/table references: abbreviations must be spelled out in full
 * ("Figure 1", "Table 1" — never "Fig. 1" or "Tab 1").
 *
 * Recognises Fig/Figs/Tab/Tabs in any letter case, an optional dot, optional whitespace
 * (including the no-break space) and a number or number range joined by "-", an en dash
 * or an em dash. The replacement is always "<full name> <number>" with one plain space.
 * Only the first occurrence of each distinct spelling is reported, but all recognised
 * occurrences are corrected. Reported positions are byte offsets into the input.
 *
 * NOTE(review): because of the trailing (?!\w), panel references such as "Fig. 1A" are
 * not recognised — confirm whether they should be.
 *
 * @param mixed $content Text to proofread; non-string or blank input is returned as-is.
 * @return mixed Corrected text, or the untouched input when nothing matched.
 */
private function checkFigureTableTitle($content) {
    $errors = [];
    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }
    $originalContent = $content;
    // PCRE has no \uXXXX escape: the previous pattern used it, failed to compile, and the
    // failure was swallowed, so this check never ran. \x{XXXX} (with /u) is the
    // supported spelling for code points.
    $titlePattern = '/(?<!\w)(Fig|Figs|Tab|Tabs)(\.?)([\s\x{00A0}]*)(\d+(?:[-\x{2013}\x{2014}]\d+)?)(?!\w)/iu';
    // Lower-cased abbreviation => [full name, explanation shown to the user].
    $expansions = [
        'fig'  => ['Figure', '图表标题使用缩写"Fig",正确:"Figure"'],
        'figs' => ['Figures', '图表标题复数使用缩写"Figs",正确:"Figures"'],
        'tab'  => ['Table', '表格标题使用缩写"Tab",正确:"Table"'],
        'tabs' => ['Tables', '表格标题复数使用缩写"Tabs",正确:"Tables"'],
    ];
    $matchCount = preg_match_all($titlePattern, $content, $allMatches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
    if (empty($matchCount)) {
        // false = PCRE runtime failure (e.g. malformed UTF-8 under /u); 0 = nothing to fix.
        $this->handleErrors($errors);
        return $content;
    }
    // Applies a "wrong => fixed" map to genuine regex matches only, so text the pattern
    // rejected is never rewritten (unlike a plain strtr()).
    $applyMap = static function (array $map) use ($titlePattern, $originalContent) {
        $rewritten = preg_replace_callback(
            $titlePattern,
            static function (array $hit) use ($map) {
                return isset($map[$hit[0]]) ? $map[$hit[0]] : $hit[0];
            },
            $originalContent
        );
        return is_string($rewritten) ? $rewritten : $originalContent;
    };
    $replaceMap = [];
    foreach ($allMatches as $matchItem) {
        $originalFull = $matchItem[0][0];
        $abbrKey = strtolower($matchItem[1][0]);
        $number = $matchItem[4][0];
        if (!isset($expansions[$abbrKey])) {
            continue; // defensive: the pattern only captures the four keys above
        }
        list($fullName, $errorDesc) = $expansions[$abbrKey];
        // Standard form: full name, one space, number (dot and odd spacing dropped).
        $fixed = "{$fullName} {$number}";
        // Exact byte offsets straight from the regex engine.
        $posStart = $matchItem[0][1];
        $posEnd = $posStart + strlen($originalFull);
        // One report per distinct "wrong => fixed" pair.
        $errorHash = md5($originalFull . $fixed);
        if (!isset($errors[$errorHash])) {
            $errors[$errorHash] = $this->createError(
                $originalFull,
                $fixed,
                $errorDesc,
                $originalContent,
                // Preview: every fix found so far plus the current one.
                $applyMap($replaceMap + [$originalFull => $fixed]),
                $posStart,
                $posEnd,
                $fullName
            );
        }
        $replaceMap[$originalFull] = $fixed;
    }
    $corrected = empty($replaceMap) ? $content : $applyMap($replaceMap);
    $this->handleErrors($errors);
    return $corrected;
}
/**
 * Append a single error record to the list collected on this instance.
 *
 * Empty or non-array input is ignored. Any of the base fields missing from the record
 * is filled with an empty string before the record is stored.
 *
 * @param array $error Error record (e.g. one built by createError()).
 * @return void
 */
private function addError($error = []) {
    // Nothing usable to store.
    if (empty($error) || !is_array($error)) {
        return;
    }
    // Base shape every stored record is guaranteed to have.
    $blankRecord = [
        'verbatim_texts' => '',
        'revised_content' => '',
        'explanation' => '',
        'original' => '',
        'corrected' => '',
        'position_start' => '',
        'position_end' => '',
    ];
    // Values supplied by the caller win over the blanks.
    $this->errors[] = array_merge($blankRecord, $error);
}
/**
 * De-duplicate a batch of error records and store the survivors via addError().
 *
 * Two records count as the same when their verbatim text, revised text, start position
 * and end position are all equal; the first one seen is kept.
 *
 * @param array $errors Error records to store; an empty batch is a no-op.
 * @return void
 */
private function handleErrors($errors) {
    if (empty($errors)) {
        return;
    }
    // Pass 1: keep the first record for every fingerprint.
    $firstSeen = [];
    foreach ($errors as $candidate) {
        $fingerprint = md5(
            $candidate['verbatim_texts']
            . $candidate['revised_content']
            . $candidate['position_start']
            . $candidate['position_end']
        );
        if (isset($firstSeen[$fingerprint])) {
            continue;
        }
        $firstSeen[$fingerprint] = $candidate;
    }
    // Pass 2: store the survivors in the order they were first encountered.
    foreach ($firstSeen as $record) {
        $this->addError($record);
    }
}
/**
 * Build one error record in the standard shape used by this class.
 *
 * @param string    $verbatim       Offending text exactly as found.
 * @param string    $revised        Suggested replacement for that text.
 * @param string    $explanation    Human-readable reason for the correction.
 * @param string    $original       Full content before correction.
 * @param string    $corrected      Full content after correction.
 * @param int|false $position_start Start position of the offending text (-1 by default).
 * @param int       $position_end   End position of the offending text (-1 by default).
 * @param string    $error_type     Category label for the error.
 * @return array Record keyed by verbatim_texts, revised_content, explanation, original,
 *               corrected, position_start, position_end and error_type.
 */
private function createError($verbatim='', $revised='', $explanation='',$original='',$corrected='', $position_start=-1, $position_end=-1,$error_type='') {
    // Field names and their values are listed in the same order.
    $fieldNames = [
        'verbatim_texts',
        'revised_content',
        'explanation',
        'original',
        'corrected',
        'position_start',
        'position_end',
        'error_type',
    ];
    $fieldValues = [
        $verbatim,
        $revised,
        $explanation,
        $original,
        $corrected,
        $position_start,
        $position_end,
        $error_type,
    ];
    return array_combine($fieldNames, $fieldValues);
}
/**
 * Check whether every "doi:<prefix>/<suffix>" reference in the text resolves.
 *
 * The text itself is never modified. For each distinct DOI one status record is
 * produced (reachable or not); when no DOI is present a single "none found" record is
 * produced instead.
 *
 * NOTE(review): status records are stored through handleErrors() like real errors, so
 * with this check enabled the collected error list is never empty, even for a clean
 * text — decide whether reachable / absent DOIs should be reported at all.
 *
 * @param mixed $content Text to scan; non-string or blank input is returned as-is.
 * @return mixed The input text, unchanged.
 */
private function checkDoi($content) {
    $errors = [];
    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }
    $corrected = $content;
    $originalContent = $content;
    $checkedDois = []; // DOIs already tested, so each one is requested only once
    try {
        // (?<!\w)            "doi" must not be the tail of a longer word
        // (doi):\s*          the label (any letter case), a colon, optional spaces
        // (\d+\.\d+\/[...]+) prefix such as 10.1017, a slash, then the suffix
        //                    (letters, digits, "/", ".", "-", "_")
        // (?!\w)             must not run straight into another word character
        $doiPattern = '/(?<!\w)(doi):\s*(\d+\.\d+\/[A-Za-z0-9\/\.\-\_]+)(?!\w)/iu';
        $matchCount = preg_match_all($doiPattern, $corrected, $allMatches, PREG_SET_ORDER);
        if ($matchCount > 0) {
            foreach ($allMatches as $matchItem) {
                // The suffix class accepts ".", so a sentence-ending full stop is
                // captured too ("doi:10.1017/abc."); strip it or the lookup fails.
                $doiCore = rtrim($matchItem[2], '.');
                $fullDoi = strtolower($matchItem[1]) . ':' . $doiCore;
                if (isset($checkedDois[$fullDoi])) {
                    continue;
                }
                $checkedDois[$fullDoi] = true;
                $isAccessible = $this->testDoiAccessibility($doiCore);
                if ($isAccessible) {
                    $errorDesc = "DOI「{$fullDoi}」格式规范,且链接可正常访问";
                } else {
                    $errorDesc = "DOI「{$fullDoi}」格式规范,但链接无法访问(可能无效或网络问题)";
                }
                // Status only: the "revised" text equals the original DOI.
                $errors[] = $this->createError(
                    $fullDoi,
                    $fullDoi,
                    $errorDesc,
                    $originalContent,
                    $corrected
                );
            }
        } else {
            // No DOI found (or the scan failed, e.g. on malformed UTF-8).
            $errors[] = $this->createError(
                '未匹配到DOI',
                '无修正',
                '文本中未发现符合标准格式的DOI如doi:10.1017/abc、DOI: 10.1038/nature12345',
                $originalContent,
                $corrected
            );
        }
    } catch (\Exception $e) {
        // Fully qualified on purpose: inside this namespace a bare "Exception" names a
        // class of the namespace itself, so the handler could never match.
        $errors[] = $this->createError(
            'DOI校验全局异常',
            '已回滚原始内容',
            "DOI校验出错{$e->getMessage()}(行号:{$e->getLine()}),已恢复原始输入",
            $originalContent,
            $originalContent
        );
        $corrected = $originalContent;
    }
    $this->handleErrors($errors);
    return $corrected;
}
/**
 * Test whether a DOI resolves through the official resolver at https://doi.org/.
 *
 * Sends one HTTP request (redirects are followed, 15 s timeout) and judges the final
 * status code. A blank DOI, a failed cURL initialisation or a failed transfer all
 * count as "not accessible".
 *
 * NOTE(review): TLS peer/host verification is switched off below, as in the original
 * code (described there as a test-environment convenience) — confirm before relying
 * on this in production.
 *
 * @param string $doiCore DOI without the "doi:" label, e.g. "10.1017/abc".
 * @return bool True when the final HTTP status is in the 200–399 range.
 */
private function testDoiAccessibility($doiCore) {
    $doiCore = trim($doiCore);
    if ($doiCore === '') {
        // A bare resolver URL would answer successfully and fake a valid DOI.
        return false;
    }
    // Percent-encode each path segment but keep the "/" separators.
    $encodedPath = implode('/', array_map('rawurlencode', explode('/', $doiCore)));
    $doiUrl = 'https://doi.org/' . $encodedPath;
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }
    curl_setopt_array($ch, [
        CURLOPT_URL => $doiUrl,
        CURLOPT_RETURNTRANSFER => true, // keep the response out of the output
        CURLOPT_HEADER => true,
        CURLOPT_TIMEOUT => 15, // never block the request for long
        CURLOPT_FOLLOWLOCATION => true, // DOIs redirect to the publisher's page
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false
    ]);
    curl_exec($ch);
    // Stays 0 when the transfer failed, which falls outside the accepted range.
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);
    return $httpCode >= 200 && $httpCode < 400;
}
}
?>