1272 lines
59 KiB
PHP
1272 lines
59 KiB
PHP
<?php
|
||
namespace app\common;
|
||
|
||
class ProofReadService
|
||
{
|
||
private $errors = [];
|
||
|
||
/**
 * Main proofreading entry point.
 *
 * Clears the per-run state, then feeds the text through every check in a fixed
 * order; each check receives the text returned by the one before it.
 *
 * @param mixed $content Text to proofread.
 * @return array Empty array when $this->errors is still empty after all checks;
 *               otherwise 'proof_before' (the input), 'proof_after' (the text
 *               after all checks) and 'errors' ($this->errors).
 */
public function proofread($content)
{
    $this->errors = [];
    $this->excludedFormats = [];

    // Order matters: every step works on the output of the previous step.
    $pipeline = [
        'checkTimeUnitAbbreviations', // time unit abbreviations
        'checkTextFormat',            // dashes / operators
        'checkNumberFormat',          // number formatting
        'checkMlUnit',                // millilitre unit
        'checkPSignificance',         // italic P for significance values
        'checkNoFormatUniformity',    // uniform "No. 123456" spelling
        'checkFigureTableTitle',      // "Figure 1" / "Table 1" instead of "Fig. 1" / "Tab 1"
        // The reference reachability check (checkDoi) is currently switched off.
    ];

    $proofed = $content;
    foreach ($pipeline as $step) {
        $proofed = $this->{$step}($proofed);
    }

    // Nothing recorded: signal "no findings" with an empty array.
    if (!$this->errors) {
        return [];
    }

    return [
        'proof_before' => $content,
        'proof_after' => $proofed,
        'errors' => $this->errors
    ];
}
|
||
|
||
/**
 * Proofreads dashes, operators and number/symbol spacing with the rules from getTextCoreRules().
 *
 * Flow:
 *  1. Convert the text to UTF-8 when an encoding can be detected.
 *  2. Swap <wmath>/<math> elements and URL/DOI-like strings for placeholders so no rule can touch them.
 *  3. Apply every core rule. Each distinct matched fragment is fixed everywhere it occurs and reported once.
 *  4. Put the protected fragments back.
 *
 * Reported positions are byte offsets into the original input. They are best effort: once an
 * earlier fix has changed the working text, the offset of a later match is re-derived and, if it
 * does not line up with the original, searched for again from the end of the last reported fix.
 *
 * @param mixed $content Text to check. Non-strings and blank strings are returned untouched.
 * @return mixed The corrected text, or the input itself when no string result is available.
 */
private function checkTextFormat($content) {
    $errors = [];
    $defaultReturn = $content;
    $originalContent = $content; // untouched input, used for positions and error entries
    $searchOffsetForCore = 0;    // where the fallback position search resumes

    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $defaultReturn;
    }

    $corrected = $content;
    $excludeMarkers = [];  // placeholder => protected original fragment
    $processedHashes = []; // md5(fragment) => true for fragments already fixed

    // Normalise to UTF-8 when the source encoding is recognised; otherwise work on the raw bytes.
    $originalEncoding = mb_detect_encoding($content, ['UTF-8', 'GBK', 'GB2312', 'ISO-8859-1'], true);
    if ($originalEncoding !== false) {
        $converted = @mb_convert_encoding($content, 'UTF-8', $originalEncoding);
        $corrected = $converted !== false ? $converted : $content;
    }

    // Byte-safe "replace first occurrence". The previous preg_replace(...~u) returned null for
    // malformed UTF-8, which silently discarded the whole working text.
    $replaceFirst = static function ($haystack, $needle, $replacement) {
        if ($needle === '') {
            return $haystack;
        }
        $at = strpos($haystack, $needle);
        return $at === false
            ? $haystack
            : substr_replace($haystack, $replacement, $at, strlen($needle));
    };

    // Protect <wmath>/<math> elements first (longest first, so shorter duplicates cannot clip them).
    $mathTagRegex = '~<(wmath|math)[^>]*?>.*?</\1>~is';
    if (@preg_match($mathTagRegex, '') !== false
        && preg_match_all($mathTagRegex, $corrected, $matches, PREG_SET_ORDER)
    ) {
        usort($matches, function ($a, $b) {
            return strlen($b[0]) - strlen($a[0]);
        });

        foreach ($matches as $index => $match) {
            $fullTag = $match[0];
            $tagType = $match[1];
            $marker = "___EXCLUDE_{$tagType}_" . time() . "_{$index}___";
            $excludeMarkers[$marker] = $fullTag;
            $corrected = $replaceFirst($corrected, $fullTag, $marker);
        }
    }

    // Protect URL / bare-domain / DOI strings (after math, so links inside math stay with the tag).
    $urlDoiRegex = '~(https?://[^\s/]{1,100}(?:/+[A-Za-z0-9\.\-]+(?:-[A-Za-z0-9\.\-]+)*)*|\b[a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,}(?:/+[A-Za-z0-9\.\-]+(?:-[A-Za-z0-9\.\-]+)*)*(?=$|[\s\.,;!])|doi:\s{0,10}\d+\.\d+/[A-Za-z0-9-+×:]+(?:-[A-Za-z0-9-+×:]+)*)~iu';
    if (@preg_match($urlDoiRegex, '') !== false
        && preg_match_all($urlDoiRegex, $corrected, $matches, PREG_SET_ORDER)
    ) {
        usort($matches, function ($a, $b) {
            return strlen($b[1]) - strlen($a[1]);
        });

        foreach ($matches as $index => $match) {
            $original = $match[1];
            $marker = "___EXCLUDE_URL_" . time() . "_{$index}___";
            $excludeMarkers[$marker] = $original;
            $corrected = $replaceFirst($corrected, $original, $marker);
        }
    }

    // Apply the core formatting rules in their declared priority order.
    foreach ($this->getTextCoreRules() as $rule) {
        if (@preg_match($rule['pattern'], '') === false) {
            continue; // rule pattern does not compile
        }

        $matchCount = preg_match_all(
            $rule['pattern'],
            $corrected,
            $matches,
            PREG_SET_ORDER | PREG_OFFSET_CAPTURE
        );
        if (!$matchCount) {
            continue; // no match (0) or matching failed (false, e.g. malformed UTF-8)
        }

        foreach ($matches as $match) {
            $original = $match[0][0];
            $originalLen = strlen($original);
            $hash = md5($original);

            if (isset($processedHashes[$hash])) {
                continue;
            }

            // Map the offset in the working text back to the original: take everything in
            // front of the match, expand the placeholders, and measure the result.
            $prefixInCorrected = substr($corrected, 0, $match[0][1]);
            $prefixInOriginal = strtr($prefixInCorrected, $excludeMarkers);
            $posStart = strlen($prefixInOriginal);
            $posEnd = $posStart + $originalLen;

            // Verify the derived position; when it does not line up, look the fragment up again.
            $contentCheck = substr($originalContent, $posStart, $originalLen);
            $contentCheckConv = iconv('UTF-8', 'UTF-8//IGNORE', $contentCheck);
            $originalConv = iconv('UTF-8', 'UTF-8//IGNORE', $original);
            if (strcmp($contentCheckConv, $originalConv) !== 0) {
                $localPattern = '~' . preg_quote($original, '~') . '~u';
                if (preg_match($localPattern, $originalContent, $localMatch, PREG_OFFSET_CAPTURE, $searchOffsetForCore)) {
                    $posStart = $localMatch[0][1];
                    $posEnd = $posStart + $originalLen;
                }
            }

            // A rule's replacement is either a callable receiving the match or a preg replacement string.
            $fixed = is_callable($rule['replacement'])
                ? call_user_func($rule['replacement'], $match)
                : preg_replace($rule['pattern'], $rule['replacement'], $original);

            if ($original !== $fixed && $fixed !== null) {
                $searchOffsetForCore = ($posEnd !== -1) ? $posEnd : $searchOffsetForCore + $originalLen;

                // The fix is applied to every occurrence of the fragment in the working text.
                $currentCorrected = str_replace($original, $fixed, $corrected);
                $errors[] = $this->createError(
                    $original,
                    $fixed,
                    $rule['explanation'],
                    $originalContent,
                    $currentCorrected,
                    $posStart,
                    $posEnd,
                    empty($rule['error_type']) ? '' : $rule['error_type']
                );
                $processedHashes[$hash] = true;
                $corrected = $currentCorrected;
            }
        }
    }

    // Restore the protected fragments; anything still looking like a placeholder is forced back.
    if (!empty($excludeMarkers)) {
        $corrected = strtr($corrected, $excludeMarkers);
        if (preg_match_all('~___EXCLUDE_(wmath|math|URL)_\d+_\d+___~', $corrected, $remaining)) {
            foreach ($remaining[0] as $marker) {
                $original = $excludeMarkers[$marker] ?? '未知数学公式/链接';
                $corrected = str_replace($marker, $original, $corrected);
            }
        }
    }

    $this->handleErrors($errors);
    return is_string($corrected) ? $corrected : $defaultReturn;
}
|
||
/**
 * Returns the ordered rule table used by checkTextFormat().
 *
 * Every rule is an array with:
 *  - 'pattern'        PCRE pattern that finds the fragment,
 *  - 'replacement'    preg replacement string that produces the fixed fragment,
 *  - 'verbatim_texts' short label of the problem,
 *  - 'explanation'    message reported with the finding,
 *  - 'error_type'     machine-readable category.
 *
 * The array order is the order in which the rules are applied.
 *
 * NOTE(review): the bracketed-range rules carry error_type 'en-dash' while the unbracketed ones
 * carry 'bracket_en-dash'; that looks swapped but is left alone because consumers may rely on it.
 * NOTE(review): the unbracketed range rule collapses "5 - 3" to "5-3" and the subtraction rule can
 * then expand it again; confirm which of the two is meant to win.
 *
 * @return array List of rule definitions.
 */
private function getTextCoreRules() {
    return [
        // ===== 1. Bracketed number ranges (highest priority, ahead of the minus rules) =====
        [
            'pattern' => '~(\[\s*[-]?\d+\s*)\x{2014}\s*(\d+\s*\])~u', // em dash inside brackets
            'replacement' => '$1-$2',
            'verbatim_texts' => '带括号数字范围使用长划线(—)不规范',
            'explanation' => '带括号的数字范围应使用短划线[-]',
            'error_type' => 'en-dash'
        ],
        [
            'pattern' => '~(\[\s*[-]?\d+\s*)-\s*(\d+\s*\])~u', // hyphen with optional spaces
            'replacement' => '$1-$2',
            'verbatim_texts' => '带括号数字范围使用连接符(-)格式不规范',
            'explanation' => '带括号的数字范围应使用短划线[-]且前后无空格',
            'error_type' => 'en-dash'
        ],
        [
            'pattern' => '~(\[\s*[-]?\d+)\s+-\s*(\d+\s*\])~u', // spaces before the hyphen
            'replacement' => '$1-$2',
            'verbatim_texts' => '数字范围短划线前有多余空格',
            'explanation' => '带括号数字范围的短划线[-]前不应留空格',
            'error_type' => 'en-dash'
        ],
        [
            'pattern' => '~(\[\s*[-]?\d+)\s*-\s+(\d+\s*\])~u', // spaces after the hyphen
            'replacement' => '$1-$2',
            'verbatim_texts' => '数字范围短划线后有多余空格',
            'explanation' => '带括号数字范围的短划线[-]后不应留空格',
            'error_type' => 'en-dash'
        ],

        // ===== 2. Unbracketed number ranges =====
        [
            'pattern' => '~(\b\d+)\s*—\s*(\d+\b)~u', // em dash
            'replacement' => '$1-$2',
            'verbatim_texts' => '无括号数字范围使用长划线(—)不规范',
            'explanation' => '无括号的数字范围应使用短划线[-]',
            'error_type' => 'bracket_en-dash'
        ],
        [
            'pattern' => '~(\b\d+)\s*-\s*(\d+\b)~u', // hyphen with optional spaces
            'replacement' => '$1-$2',
            'verbatim_texts' => '无括号数字范围使用连接符(-)格式不规范',
            'explanation' => '无括号的数字范围应使用短划线[-]且前后无空格',
            'error_type' => 'bracket_en-dash'
        ],

        // ===== 3. Operator spacing (compound operators before single-character ones) =====
        [
            // Three-character operators must be tried first: with "[<>!]=" in front, "!==" was
            // consumed as "!=" plus a stray "=". "==" is listed in the explanation and is now
            // actually matched.
            'pattern' => '~(\S)\s*(===|!==|[<>!=]=)\s*(\S)~u',
            'replacement' => '$1 $2 $3',
            'verbatim_texts' => '复合运算符前后空格不规范',
            'explanation' => '复合运算符[>=、<=、==、!=、===、!==]前后应各留一个空格',
            'error_type' => 'composite_operator'
        ],
        [
            // $1: text before the "=" (must end in a non-operator character),
            // $2: text after the "=".
            // The character directly before "=" may not be an operator character and "=" may not
            // be followed by another "=", so ">=", "==", "+=", "===" etc. are left alone. The
            // previous pattern only guarded the position before $1 and split them into "> =".
            'pattern' => '~(?<!=|<|>|\*|\+|-|/)(\S*?[^\s=<>!*+\-/])\s*=(?!=)\s*(\S+?)(?!=|<|>|\*|\+|-|/)~u',
            'replacement' => '$1 = $2',
            'verbatim_texts' => '等号前后空格不规范',
            'explanation' => '独立等号[=]前后应各留一个空格',
            'error_type' => 'equal'
        ],
        [
            'pattern' => '~(\d+)\s*\+\s*(\d+)~u', // addition
            'replacement' => '$1 + $2',
            'verbatim_texts' => '加法运算符前后空格不规范',
            'explanation' => '加法运算符[+]前后应各留一个空格',
            'error_type' => 'plus'
        ],
        [
            'pattern' => '~(\d+)\s*\*\s*(\d+)~u', // multiplication written with "*"
            'replacement' => '$1 * $2',
            'verbatim_texts' => '乘法运算符前后空格不规范',
            'explanation' => '乘法运算符[*]前后应各留一个空格',
            'error_type' => 'ride'
        ],
        [
            'pattern' => '~(\d+)\s*/\s*(\d+)~u', // division
            'replacement' => '$1 / $2',
            'verbatim_texts' => '除法运算符前后空格不规范',
            'explanation' => '除法运算符[/]前后应各留一个空格',
            'error_type' => 'except'
        ],
        [
            // Subtraction between plain numbers only; extended (x) syntax, comments are part of the pattern.
            'pattern' => '~
                (?<!\D)              # 排除左侧有非数字字符的场景(如Notch1、文献年份后的数字)
                (\d+)\s*-\s*
                (?!
                    \d+[a-zA-Z%]     # 排除带单位(如min、kg)
                    |\d+\]           # 排除带右括号(如[1-5])
                    |\d+[.:,;)]      # 排除文献引用中数字范围后接标点(如.、:、))
                )
                (\d+)
            ~ux',
            'replacement' => '$1 - $2',
            'verbatim_texts' => '减法运算符前后空格不规范',
            'explanation' => '减法运算符[-]前后应各留一个空格(非数字范围场景)',
            'error_type' => 'reduce'
        ],

        // ===== 4. Special symbols (lowest priority) =====
        [
            'pattern' => '~(\d+)\s+%~u', // number followed by a spaced percent sign
            'replacement' => '$1%',
            'verbatim_texts' => '数字与百分号之间有多余空格',
            'explanation' => '数字与百分号[%]之间有多余空格',
            'error_type' => 'number_percentage'
        ],
        [
            'pattern' => '~(\(\s*\d+)\s+×\s+(\d+\s*\))~u', // "(number × number)" first
            'replacement' => '$1×$2',
            'verbatim_texts' => '带括号的乘号表示倍数时前后有多余空格',
            'explanation' => '带括号的乘号[×]表示倍数关系时前后有多余空格',
            'error_type' => 'multiple'
        ],
        [
            'pattern' => '~(\d+)\s+×\s+(\d+)~u', // then "number × number" without brackets
            'replacement' => '$1×$2',
            'verbatim_texts' => '乘号表示倍数时前后有多余空格',
            'explanation' => '乘号[×]表示倍数关系时前后不应留空格',
            'error_type' => 'multiple'
        ],
        [
            'pattern' => '~(\d+)\s*\*\s*(\d+)~u', // asterisk converted to the multiplication sign
            'replacement' => '$1 × $2',
            'verbatim_texts' => '使用星号(*)作为乘法运算符不规范',
            'explanation' => '乘法运算应使用标准乘号[×]替代星号[*]',
            'error_type' => 'ride'
        ],
        [
            'pattern' => '~(\d+)\s+:\s+(\d+)~u', // ratio colon
            'replacement' => '$1:$2',
            'verbatim_texts' => '比值符号前后有多余空格',
            'explanation' => '比值符号[:]前后有多余空格',
            'error_type' => 'biliel'
        ]
    ];
}
|
||
/**
 * Proofreads number formatting: drops an all-zero fractional part ("5.0" -> "5") and runs the
 * thousands-separator pass.
 *
 * URL and DOI strings are swapped for placeholders before any number rule runs and restored at
 * the end. Each distinct offending number is reported once, with the byte position of its first
 * occurrence in the original input.
 *
 * @param mixed $content Text to check. Non-strings and blank strings are returned untouched.
 * @return mixed The corrected text, or the input itself when no string result is available.
 */
private function checkNumberFormat($content) {
    $errors = [];
    $defaultReturn = $content;
    $originalContent = $content;
    $searchOffset = 0; // resume point for position lookups in the thousands pass

    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $defaultReturn;
    }

    $correctedContent = $content;
    $replacements = [];       // offending number => corrected number
    $urlDoiPlaceholders = []; // placeholder => protected original text

    // --- Protect URLs / DOIs ------------------------------------------------------------
    $urlDoiPattern = '#([^\w]|^)(https?://[^<>\s]+|doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9]{1,30})([^\w]|$)#i';
    if (@preg_match($urlDoiPattern, '') !== false) {
        $protected = preg_replace_callback(
            $urlDoiPattern,
            function ($matches) use (&$urlDoiPlaceholders) {
                // The running count keeps placeholders distinct even if uniqid() repeats.
                $placeholder = '___URL_DOI_' . time() . '_' . uniqid() . '_' . count($urlDoiPlaceholders) . '___';
                $urlDoiPlaceholders[$placeholder] = $matches[0];
                return $placeholder;
            },
            $correctedContent
        );
        if ($protected !== null) {
            $correctedContent = $protected;
        } else {
            $urlDoiPlaceholders = []; // protection failed; continue on the unprotected text
        }
    }

    // --- All-zero fractional part (e.g. -5.0 -> -5) ---------------------------------------
    $decimalZeroPattern = '~(-?\d+)\.0+(?!\d|e|E)~ix';
    $protectedContent = $correctedContent;

    // Rewrites only real pattern matches that are present in $map. A plain strtr() also rewrote
    // look-alike substrings, e.g. the "5.0" inside "5.05".
    $applyDecimalFixes = static function ($text, array $map) use ($decimalZeroPattern) {
        $result = preg_replace_callback(
            $decimalZeroPattern,
            static function ($m) use ($map) {
                return isset($map[$m[0]]) ? $map[$m[0]] : $m[0];
            },
            $text
        );
        return $result === null ? $text : $result;
    };

    if (preg_match_all($decimalZeroPattern, $protectedContent, $decimalMatches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
        foreach ($decimalMatches as $hit) {
            $number = $hit[0][0];
            if (isset($replacements[$number])) {
                continue; // this spelling was already reported
            }
            $corrected = $hit[1][0]; // integer part only
            $replacements[$number] = $corrected;

            // Offset in the protected text -> offset in the original: expand the placeholders
            // in everything that precedes the match and measure it.
            $posStart = strlen(strtr(substr($protectedContent, 0, $hit[0][1]), $urlDoiPlaceholders));
            $posEnd = $posStart + strlen($number);

            $currentCorrected = strtr(
                $applyDecimalFixes($protectedContent, $replacements),
                $urlDoiPlaceholders
            );
            $errors[] = $this->createError(
                $number,
                $corrected,
                "删除小数点后无效零",
                $originalContent,
                $currentCorrected,
                $posStart,
                $posEnd,
                'invalid_zero'
            );
        }
        $correctedContent = $applyDecimalFixes($protectedContent, $replacements);
    }

    // --- Thousands separators -------------------------------------------------------------
    // NOTE(review): the negative lookahead below ends with the alternative '\d{1,3}', which
    // matches at the start of any digit run, so this pattern appears unable to match anything.
    // Behaviour is kept as is; confirm whether the pass is meant to be disabled before anchoring
    // the exclusions.
    $excludePatterns = implode('|', [
        'https?://[^<>\s]+|doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9]{1,30}',
        '\d{1,3}(,\d{3})+', '[A-Za-z]+\d+|\d+[A-Za-z]+',
        '1\d{3}|2\d{3}', '\d{6}', '1[3-9]\d{9}',
        '\d{3}[-\s]?\d{3}[-\s]?\d{4}', '\d{1,3}'
    ]);
    $thousandPattern = sprintf(
        '#\b(?!(?:%s))\d{4,}\b#ixu',
        str_replace('#', '\#', $excludePatterns)
    );

    if (@preg_match($thousandPattern, '') !== false) {
        $grouped = preg_replace_callback(
            $thousandPattern,
            function (array $matches) use (&$replacements, &$errors, $originalContent, &$searchOffset): string {
                $original = $matches[0];
                if (isset($replacements[$original]) || strpos($original, ',') !== false) {
                    return $original;
                }

                $formatted = number_format((float) $original);
                $replacements[$original] = $formatted;

                // Never hand strpos() an offset past the end of the string (ValueError on PHP 8).
                $posStart = $searchOffset <= strlen($originalContent)
                    ? strpos($originalContent, $original, $searchOffset)
                    : false;
                if ($posStart === false) {
                    $posStart = -1;
                    $posEnd = -1;
                } else {
                    $posEnd = $posStart + strlen($original);
                    $searchOffset = $posEnd;
                }

                $currentCorrected = strtr($originalContent, $replacements);
                $errors[] = $this->createError(
                    $original,
                    $formatted,
                    "4位及以上整数添加千分位分隔符",
                    $originalContent,
                    $currentCorrected,
                    $posStart,
                    $posEnd,
                    'thousandth_separator'
                );
                return $formatted;
            },
            $correctedContent
        );
        if ($grouped !== null) {
            $correctedContent = $grouped;
        }
    }

    // --- Restore URLs / DOIs --------------------------------------------------------------
    if (!empty($urlDoiPlaceholders)) {
        $correctedContent = strtr($correctedContent, $urlDoiPlaceholders);

        // Anything that still looks like a placeholder is forced back (or labelled unknown).
        if (preg_match_all('#___URL_DOI_.*?___#', $correctedContent, $remaining)) {
            foreach ($remaining[0] as $marker) {
                $original = $urlDoiPlaceholders[$marker] ?? '未知链接';
                $correctedContent = str_replace($marker, $original, $correctedContent);
            }
        }
    }

    $this->handleErrors($errors);
    return is_string($correctedContent) ? $correctedContent : $defaultReturn;
}
|
||
|
||
/**
 * Proofreads time units: "<number> hour(s)/minute(s)/second(s)" and spaced or upper-case
 * abbreviations are rewritten to "<number>h", "<number>min", "<number>s".
 *
 * Each distinct offending fragment is reported once, with the byte position of its first
 * occurrence. Positions come straight from the regex match, and replacements are applied only to
 * real matches, so look-alike substrings (the "5 h" inside "25 hectares") are left untouched.
 *
 * @param mixed $content Text to check. Non-strings and blank strings are returned untouched.
 * @return mixed The corrected text.
 */
private function checkTimeUnitAbbreviations($content) {
    $errors = [];
    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }

    $originalContent = $content;
    $replaceMap = []; // offending fragment => corrected fragment

    $timeUnits = [
        [
            'full' => 'hour',
            'plural' => 'hours',
            'abbr' => 'h',
            'description' => '小时'
        ],
        [
            'full' => 'minute',
            'plural' => 'minutes',
            'abbr' => 'min',
            'description' => '分钟'
        ],
        [
            'full' => 'second',
            'plural' => 'seconds',
            'abbr' => 's',
            'description' => '秒'
        ]
    ];

    // One pattern per unit: number, optional whitespace, then full word / capitalised word / abbreviation.
    $patterns = [];
    foreach ($timeUnits as $key => $unit) {
        $fullPattern = $unit['full'] . 's?';
        $capitalizedPattern = ucfirst($unit['full']) . 's?';
        $abbrPattern = $unit['abbr'] . '|' . strtoupper($unit['abbr']);

        $combinedPattern = "~(\d+(?:\.\d+)?)(?:\s+|)(" .
            "{$fullPattern}|{$capitalizedPattern}|{$abbrPattern}" .
            ")\b~i";

        if (@preg_match($combinedPattern, '') !== false) {
            $patterns[$key] = $combinedPattern; // units whose pattern does not compile are skipped
        }
    }

    // Rewrites only genuine pattern matches whose text is a key of $map.
    $applyMap = static function ($text, array $map) use ($patterns) {
        foreach ($patterns as $pattern) {
            $replaced = preg_replace_callback(
                $pattern,
                static function ($m) use ($map) {
                    return isset($map[$m[0]]) ? $map[$m[0]] : $m[0];
                },
                $text
            );
            if ($replaced !== null) {
                $text = $replaced;
            }
        }
        return $text;
    };

    foreach ($timeUnits as $key => $unit) {
        if (!isset($patterns[$key])) {
            continue;
        }
        if (!preg_match_all($patterns[$key], $originalContent, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
            continue;
        }

        foreach ($matches as $match) {
            $original = $match[0][0]; // e.g. "5 Hour", "3 MIN"
            $number = $match[1][0];
            $unitPart = $match[2][0];
            $fixed = $number . strtolower($unit['abbr']);

            if ($original === $fixed) {
                continue; // already in the target form
            }

            // Pick the most specific reason for the message.
            if (stripos($unitPart, $unit['full']) !== false) {
                $errorReason = "应使用缩写形式'{$unit['abbr']}'";
            } elseif (strpos($original, ' ') !== false) {
                $errorReason = "数字与缩写间不应有空格";
            } else {
                $errorReason = "单位缩写应使用小写'{$unit['abbr']}'";
            }

            // Nothing has been rewritten yet, so the match offset is the position in the input.
            // The old shared strpos() offset lost every match located before the last match of
            // an earlier unit and could run past the end of the string.
            $posStart = $match[0][1];
            $posEnd = $posStart + strlen($original);

            $errorHash = md5($original . $fixed);
            $errorType = empty($unit['full']) ? '' : $unit['full'];
            if (!isset($errors[$errorHash])) {
                $errors[$errorHash] = $this->createError(
                    $original,
                    $fixed,
                    "{$unit['description']}单位格式不规范:{$errorReason},正确格式为[数字{$unit['abbr']}]",
                    $originalContent,
                    $applyMap($originalContent, $replaceMap + [$original => $fixed]),
                    $posStart,
                    $posEnd,
                    $errorType
                );
            }

            if (!isset($replaceMap[$original])) {
                $replaceMap[$original] = $fixed;
            }
        }
    }

    $corrected = empty($replaceMap) ? $content : $applyMap($content, $replaceMap);

    $this->handleErrors($errors);
    return $corrected;
}
|
||
|
||
|
||
/**
 * Proofreads the millilitre unit: any casing of "ml", with or without a leading number, becomes "mL".
 *
 * Each distinct offending fragment is reported once, with the byte position of its first
 * occurrence. Only genuine word-bounded matches are rewritten; the previous strtr() based
 * replacement also rewrote the "ml" inside words such as "html" or "xml" whenever a bare "ml"
 * had been found elsewhere in the text.
 *
 * @param mixed $content Text to check. Non-strings and blank strings are returned untouched.
 * @return mixed The corrected text.
 */
private function checkMlUnit($content) {
    $errors = [];
    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }

    $originalContent = $content;
    $corrected = $content;
    $replaceMap = []; // offending fragment => corrected fragment

    // Optional "<number><spaces>" prefix followed by a word-bounded "ml" in any casing.
    $mlPattern = '/\b(\d+(?:\.\d+)?\s*)?(ml)\b/i';

    // Rewrites only genuine pattern matches whose text is a key of $map.
    $applyMap = static function ($text, array $map) use ($mlPattern) {
        $replaced = preg_replace_callback(
            $mlPattern,
            static function ($m) use ($map) {
                return isset($map[$m[0]]) ? $map[$m[0]] : $m[0];
            },
            $text
        );
        return $replaced === null ? $text : $replaced;
    };

    if (@preg_match($mlPattern, '') !== false
        && preg_match_all($mlPattern, $originalContent, $allMatches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)
    ) {
        foreach ($allMatches as $matchItem) {
            $originalFull = $matchItem[0][0];                           // e.g. "5ml", "ML"
            $prefix = isset($matchItem[1][0]) ? $matchItem[1][0] : ''; // "" when there is no number
            $fixedFull = "{$prefix}mL";

            if ($originalFull === $fixedFull) {
                continue;
            }

            // Exact byte position of this match; strpos() could land on a look-alike substring.
            $posStart = $matchItem[0][1];
            $posEnd = $posStart + strlen($originalFull);

            $errorHash = md5($originalFull . $fixedFull);
            if (!isset($errors[$errorHash])) {
                $errors[$errorHash] = $this->createError(
                    $originalFull,
                    $fixedFull,
                    '毫升单位格式不规范,标准写法为[mL]',
                    $originalContent,
                    $applyMap($originalContent, $replaceMap + [$originalFull => $fixedFull]),
                    $posStart,
                    $posEnd,
                    'mL'
                );
            }

            if (!isset($replaceMap[$originalFull])) {
                $replaceMap[$originalFull] = $fixedFull;
            }
        }

        if (!empty($replaceMap)) {
            $corrected = $applyMap($corrected, $replaceMap);
        }
    }

    $this->handleErrors($errors);
    return $corrected;
}
|
||
|
||
|
||
/**
 * Proofreads significance values: the P/p in front of a number is wrapped in <i>…</i>.
 *
 * The pattern accepts "P", optional whitespace, an optional "=", optional whitespace and a number
 * (decimal and exponent forms included).
 * NOTE(review): comparison operators such as "<" are not part of the pattern, and a bare
 * "p53"-style token does match; confirm whether that is intended.
 *
 * Each distinct fragment is reported once, with the byte position of its first occurrence, and
 * only genuine word-bounded matches are rewritten (strtr() also hit fragments embedded in longer
 * words, e.g. the "P=0.05" inside "SP=0.05").
 *
 * @param mixed $content Text to check. Non-strings and blank strings are returned untouched.
 * @return mixed The corrected text.
 */
private function checkPSignificance($content) {
    $errors = [];
    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }

    $originalContent = $content;
    $corrected = $content;
    $replaceMap = []; // offending fragment => corrected fragment

    $pValuePattern = '/\b([Pp])(\s*=?\s*)(\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\b/';

    // Rewrites only genuine pattern matches whose text is a key of $map.
    $applyMap = static function ($text, array $map) use ($pValuePattern) {
        $replaced = preg_replace_callback(
            $pValuePattern,
            static function ($m) use ($map) {
                return isset($map[$m[0]]) ? $map[$m[0]] : $m[0];
            },
            $text
        );
        return $replaced === null ? $text : $replaced;
    };

    if (@preg_match($pValuePattern, '') !== false
        && preg_match_all($pValuePattern, $originalContent, $allMatches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)
    ) {
        foreach ($allMatches as $matchItem) {
            $original = $matchItem[0][0];  // e.g. "P=0.05"
            $pChar = $matchItem[1][0];     // "P" or "p"
            $separator = $matchItem[2][0]; // whitespace / "=" between the letter and the number
            $number = $matchItem[3][0];    // e.g. "0.05", "1.2e-3"

            // Only the letter is italicised; separator and number are kept verbatim.
            $fixed = "<i>{$pChar}</i>{$separator}{$number}";
            if ($original === $fixed) {
                continue;
            }

            // Exact byte position of this match in the input.
            $posStart = $matchItem[0][1];
            $posEnd = $posStart + strlen($original);

            $errorHash = md5($original . $fixed);
            if (!isset($errors[$errorHash])) {
                $errors[$errorHash] = $this->createError(
                    $original,
                    $fixed,
                    '显著性P值格式不规范,P/p应使用斜体',
                    $originalContent,
                    $applyMap($originalContent, $replaceMap + [$original => $fixed]),
                    $posStart,
                    $posEnd,
                    'P'
                );
            }

            if (!isset($replaceMap[$original])) {
                $replaceMap[$original] = $fixed;
            }
        }

        if (!empty($replaceMap)) {
            $corrected = $applyMap($corrected, $replaceMap);
        }
    }

    $this->handleErrors($errors);
    return $corrected;
}
|
||
|
||
/**
 * Unifies the "No. 123456" notation: any casing of "no." followed by any amount of whitespace and
 * digits becomes "No." + one space + the digits.
 *
 * Each distinct offending fragment is reported once, with the byte position of its first
 * occurrence. Only genuine word-bounded matches are rewritten; the previous strtr() based
 * replacement could also rewrite the same characters embedded in a longer token.
 *
 * @param mixed $content Text to check. Non-strings and blank strings are returned untouched.
 * @return mixed The corrected text.
 */
private function checkNoFormatUniformity($content) {
    $errors = [];
    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }

    $originalContent = $content;
    $corrected = $content;
    $replaceMap = []; // offending fragment => corrected fragment

    // "No." in any casing, optional whitespace, digits.
    $combinedPattern = '/\b([Nn][Oo]\.)(\s*)(\d+)\b/';

    // Rewrites only genuine pattern matches whose text is a key of $map.
    $applyMap = static function ($text, array $map) use ($combinedPattern) {
        $replaced = preg_replace_callback(
            $combinedPattern,
            static function ($m) use ($map) {
                return isset($map[$m[0]]) ? $map[$m[0]] : $m[0];
            },
            $text
        );
        return $replaced === null ? $text : $replaced;
    };

    if (@preg_match($combinedPattern, '') !== false
        && preg_match_all($combinedPattern, $originalContent, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)
    ) {
        foreach ($matches as $item) {
            $originalFull = $item[0][0]; // e.g. "NO.123", "no.  456"
            $number = $item[3][0];

            // Target form: "No." + exactly one space + digits.
            $fixedFull = "No. {$number}";
            if ($originalFull === $fixedFull) {
                continue;
            }

            // Exact byte position of this match in the input.
            $posStart = $item[0][1];
            $posEnd = $posStart + strlen($originalFull);

            // The map is extended first so the preview handed to createError() includes this fix.
            if (!isset($replaceMap[$originalFull])) {
                $replaceMap[$originalFull] = $fixedFull;
            }

            $errorHash = md5($originalFull . $fixedFull);
            if (!isset($errors[$errorHash])) {
                $errors[$errorHash] = $this->createError(
                    $originalFull,                               // offending fragment
                    $fixedFull,                                  // corrected fragment
                    'No. 格式不规范,正确格式为「No. 数字」',
                    $originalContent,                            // full original text
                    $applyMap($originalContent, $replaceMap),    // full text with the fixes known so far
                    $posStart,
                    $posEnd,
                    'No.'
                );
            }
        }

        if (!empty($replaceMap)) {
            $corrected = $applyMap($corrected, $replaceMap);
        }
    }

    $this->handleErrors($errors);
    return $corrected;
}
|
||
|
||
/**
|
||
* 图表标题一律使用全称Figure 1, Table 1.不能写成Fig. 1, Tab 1.
|
||
*/
|
||
/**
 * Expand abbreviated figure/table references to their full words.
 *
 * "Fig. 1", "Figs 2-3", "Tab 4" and "Tabs. 5–6" become "Figure 1",
 * "Figures 2-3", "Table 4" and "Tables 5–6" (full word, one space, number).
 * Each distinct abbreviated fragment is reported once through
 * handleErrors(); matching is case-insensitive.
 *
 * @param mixed $content Text to check. Non-string or blank input is returned unchanged.
 * @return mixed The text with every matched abbreviation rewritten.
 */
private function checkFigureTableTitle($content) {
    $errors = [];

    // Strict input validation: nothing to do for non-strings or blank text.
    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }

    $originalContent = $content; // untouched copy used in the error records
    $corrected = $content;
    $replaceMap = [];

    // Abbreviation (lower-cased) => [full word, explanation shown to the user].
    $abbreviations = [
        'fig'  => ['Figure', '图表标题使用缩写"Fig",正确:"Figure"'],
        'figs' => ['Figures', '图表标题复数使用缩写"Figs",正确:"Figures"'],
        'tab'  => ['Table', '表格标题使用缩写"Tab",正确:"Table"'],
        'tabs' => ['Tables', '表格标题复数使用缩写"Tabs",正确:"Tables"'],
    ];

    // PCRE has no \uXXXX escape: a pattern containing it fails to compile, which
    // previously made this whole check a silent no-op. Code points are written
    // as \x{...} instead (valid together with the /u modifier).
    //   group 1: abbreviation, group 2: optional dot,
    //   group 3: optional whitespace (including no-break space),
    //   group 4: number or number range joined by "-", en dash or em dash.
    $titlePattern = '/(?<!\w)(Figs|Fig|Tabs|Tab)(\.?)([\s\x{00A0}]*)(\d+(?:[-\x{2013}\x{2014}]\d+)?)(?!\w)/iu';

    $matchCount = preg_match_all(
        $titlePattern,
        $originalContent,
        $allMatches,
        PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );

    // false => the regex engine failed (e.g. invalid UTF-8 input); 0 => nothing to fix.
    if ($matchCount === false || $matchCount === 0) {
        $this->handleErrors($errors);
        return $corrected;
    }

    foreach ($allMatches as $matchItem) {
        // With PREG_OFFSET_CAPTURE every group is [text, byte offset].
        $originalFull = $matchItem[0][0];
        $posStart = $matchItem[0][1];
        $abbrKey = strtolower($matchItem[1][0]);
        $number = $matchItem[4][0];

        if (!isset($abbreviations[$abbrKey])) {
            continue;
        }
        list($fullName, $errorDesc) = $abbreviations[$abbrKey];

        // Canonical form: full word + single space + number.
        $fixed = "{$fullName} {$number}";
        if ($originalFull === $fixed) {
            continue;
        }

        $posEnd = $posStart + strlen($originalFull);

        // Remember the rewrite once per distinct fragment.
        if (!isset($replaceMap[$originalFull])) {
            $replaceMap[$originalFull] = $fixed;
        }

        // Report each distinct (fragment, fix) pair only once, at its first occurrence.
        $errorHash = md5($originalFull . $fixed);
        if (!isset($errors[$errorHash])) {
            $errors[$errorHash] = $this->createError(
                $originalFull,
                $fixed,
                $errorDesc,
                $originalContent,
                strtr($originalContent, $replaceMap), // preview with all rewrites found so far
                $posStart,
                $posEnd,
                $fullName
            );
        }
    }

    // Apply all rewrites in a single pass.
    if (!empty($replaceMap)) {
        $corrected = strtr($corrected, $replaceMap);
    }

    $this->handleErrors($errors);
    return $corrected;
}
|
||
|
||
|
||
|
||
/**
|
||
* 添加错误信息
|
||
*/
|
||
/**
 * Append one error record to the service-wide error list.
 *
 * Missing fields are filled with empty-string defaults so every stored record
 * has the same keys, including `error_type`, which createError() always emits.
 * Empty or non-array input is ignored.
 *
 * @param array $error Error record, normally built by createError().
 * @return void
 */
private function addError($error = []) {
    if (empty($error) || !is_array($error)) {
        return;
    }

    $defaults = [
        'verbatim_texts' => '',
        'revised_content' => '',
        'explanation' => '',
        'original' => '',
        'corrected' => '',
        'position_start' => '',
        'position_end' => '',
        'error_type' => '',
    ];

    // Values supplied by the caller win over the defaults.
    $this->errors[] = array_merge($defaults, $error);
}
|
||
|
||
/**
|
||
* 处理错误信息(去重和存储)
|
||
*/
|
||
/**
 * De-duplicate a batch of error records and store them via addError().
 *
 * Two records are duplicates when their verbatim text, revised text, start
 * position and end position are all equal; the first one is kept.
 *
 * @param array $errors Error records collected by one check (keys are ignored).
 * @return void
 */
private function handleErrors($errors) {
    if (empty($errors) || !is_array($errors)) {
        return;
    }

    $uniqueErrors = [];
    foreach ($errors as $error) {
        // addError() would drop non-array entries anyway; skip them before indexing.
        if (!is_array($error)) {
            continue;
        }

        // Join the fields with a NUL separator so that different field
        // combinations cannot collapse to the same key, and read each field
        // null-safely because addError() treats every key as optional.
        $signature = md5(implode("\x00", [
            (string) ($error['verbatim_texts'] ?? ''),
            (string) ($error['revised_content'] ?? ''),
            (string) ($error['position_start'] ?? ''),
            (string) ($error['position_end'] ?? ''),
        ]));

        if (!isset($uniqueErrors[$signature])) {
            $uniqueErrors[$signature] = $error;
        }
    }

    foreach ($uniqueErrors as $error) {
        $this->addError($error);
    }
}
|
||
|
||
/**
|
||
* 创建标准化错误信息
|
||
*/
|
||
/**
 * Build one error record in the standard shape used by this service.
 *
 * @param mixed $verbatim       The offending fragment as found in the text.
 * @param mixed $revised        The suggested replacement for that fragment.
 * @param mixed $explanation    Human-readable description of the problem.
 * @param mixed $original       The complete original text.
 * @param mixed $corrected      The complete text after correction.
 * @param mixed $position_start Start position of the fragment (-1 when not supplied).
 * @param mixed $position_end   End position of the fragment (-1 when not supplied).
 * @param mixed $error_type     Category label for the error.
 * @return array Record keyed by verbatim_texts, revised_content, explanation,
 *               original, corrected, position_start, position_end, error_type.
 */
private function createError($verbatim='', $revised='', $explanation='',$original='',$corrected='', $position_start=-1, $position_end=-1,$error_type='') {
    $fieldNames = [
        'verbatim_texts',
        'revised_content',
        'explanation',
        'original',
        'corrected',
        'position_start',
        'position_end',
        'error_type',
    ];
    $fieldValues = [
        $verbatim,
        $revised,
        $explanation,
        $original,
        $corrected,
        $position_start,
        $position_end,
        $error_type,
    ];

    // Pair names with values positionally; key order matches $fieldNames.
    return array_combine($fieldNames, $fieldValues);
}
|
||
/**
|
||
* 检查doi链接是否都能打开
|
||
*/
|
||
/**
 * Find every "doi:<prefix>/<suffix>" reference in the text and test whether
 * it resolves, recording one status entry per distinct DOI.
 *
 * The text itself is never modified; the return value is the input text.
 *
 * NOTE(review): an entry is recorded for reachable DOIs and for "no DOI
 * found" as well, so the error list is never empty after this check runs.
 * Confirm that this is wanted before re-enabling the call in proofread().
 *
 * @param mixed $content Text to scan. Non-string or blank input is returned unchanged.
 * @return mixed The unmodified text.
 */
private function checkDoi($content) {
    $errors = [];

    if (!is_string($content) || trim($content) === '') {
        $this->handleErrors($errors);
        return $content;
    }

    $corrected = $content;
    $originalContent = $corrected;
    $checkedDois = []; // DOIs already tested, so each one is checked only once

    try {
        // Pattern breakdown:
        //   (?<!\w)   not preceded by a word character (rejects "xdoi:...")
        //   (doi)     the literal prefix, case-insensitive via /i
        //   :\s*      colon followed by optional whitespace
        //   (\d+\.\d+\/[A-Za-z0-9\/\.\-\_]+)
        //             registrant prefix such as 10.1017, a slash, then a suffix
        //             made of letters, digits, "/", ".", "-" and "_"
        //   (?!\w)    not followed by a word character
        $doiPattern = '/(?<!\w)(doi):\s*(\d+\.\d+\/[A-Za-z0-9\/\.\-\_]+)(?!\w)/iu';

        // Guard against a pattern that does not compile.
        if (@preg_match($doiPattern, '') === false) {
            $errors[] = $this->createError(
                'DOI正则错误',
                '跳过DOI校验',
                "DOI匹配正则语法错误:{$doiPattern},已跳过该校验流程",
                $originalContent,
                $corrected
            );
        } else {
            $matchCount = preg_match_all($doiPattern, $corrected, $allMatches, PREG_SET_ORDER);

            if ($matchCount > 0) {
                foreach ($allMatches as $matchItem) {
                    // The suffix class accepts ".", so a sentence-ending period
                    // directly after the DOI is captured too; drop it before lookup.
                    $doiCore = rtrim($matchItem[2], '.');
                    // Normalised form, e.g. "doi:10.1017/abc".
                    $fullDoi = strtolower($matchItem[1]) . ':' . $doiCore;

                    if (isset($checkedDois[$fullDoi])) {
                        continue;
                    }
                    $checkedDois[$fullDoi] = true;

                    $isAccessible = $this->testDoiAccessibility($doiCore);
                    if ($isAccessible) {
                        $errorDesc = "DOI「{$fullDoi}」格式规范,且链接可正常访问";
                    } else {
                        $errorDesc = "DOI「{$fullDoi}」格式规范,但链接无法访问(可能无效或网络问题)";
                    }

                    // The DOI text is never rewritten, so "revised" equals "verbatim".
                    $errors[] = $this->createError(
                        $fullDoi,
                        $fullDoi,
                        $errorDesc,
                        $originalContent,
                        $corrected
                    );
                }
            } else {
                // Nothing matched: record an informational entry.
                $errors[] = $this->createError(
                    '未匹配到DOI',
                    '无修正',
                    '文本中未发现符合标准格式的DOI(如doi:10.1017/abc、DOI: 10.1038/nature12345)',
                    $originalContent,
                    $corrected
                );
            }
        }
    } catch (\Exception $e) {
        // Fully qualified on purpose: inside a namespace an unqualified
        // "Exception" names a class of that namespace, so the handler would
        // never catch the built-in exception.
        $errors[] = $this->createError(
            'DOI校验全局异常',
            '已回滚原始内容',
            "DOI校验出错:{$e->getMessage()}(行号:{$e->getLine()}),已恢复原始输入",
            $originalContent,
            $originalContent
        );
        $corrected = $originalContent;
    }

    $this->handleErrors($errors);
    return $corrected;
}
|
||
|
||
/**
|
||
* 测试DOI链接是否可访问(基于DOI官方解析地址)
|
||
* @param string $doiCore DOI核心部分(如“10.1017/abc”,不含“doi:”前缀)
|
||
* @return bool 可访问返回true,否则返回false
|
||
*/
|
||
/**
 * Check whether a DOI resolves through the official resolver at https://doi.org/.
 *
 * Performs a blocking HTTP request (15 s timeout) and follows redirects.
 *
 * @param string $doiCore DOI without the "doi:" prefix, e.g. "10.1017/abc".
 * @return bool True when the final HTTP status is in the 200-399 range;
 *              false for an empty DOI, a cURL failure or any other status.
 */
private function testDoiAccessibility($doiCore) {
    $doiCore = trim((string) $doiCore);
    if ($doiCore === '') {
        return false;
    }

    // Percent-encode each path segment but keep the "/" separators, so DOIs
    // containing characters such as "(", ")", "<" or ";" form a valid URL.
    $encodedDoi = implode('/', array_map('rawurlencode', explode('/', $doiCore)));
    $doiUrl = 'https://doi.org/' . $encodedDoi;

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    curl_setopt_array($ch, [
        CURLOPT_URL => $doiUrl,
        CURLOPT_RETURNTRANSFER => true,  // do not echo the response body
        CURLOPT_HEADER => true,          // include response headers in the result
        CURLOPT_TIMEOUT => 15,           // avoid blocking the proofreading run for long
        CURLOPT_FOLLOWLOCATION => true,  // DOIs redirect to the publisher's landing page
        // NOTE(review): certificate verification is disabled here; that is only
        // acceptable in a test environment and should be enabled in production.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false
    ]);

    curl_exec($ch);
    // Status of the last response; 0 when the transfer itself failed.
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    return $httpCode >= 200 && $httpCode < 400;
}
|
||
}
|
||
?>
|