errors = []; $this->excludedFormats = []; $correctedContent = $content; //时间单位缩写校对 $correctedContent = $this->checkTimeUnitAbbreviations($correctedContent); //横线/运算符校对 $correctedContent = $this->checkTextFormat($correctedContent); //数字格式校对 $correctedContent = $this->checkNumberFormat($correctedContent); //No. 123456的写法统一 $correctedContent = $this->checkNoFormatUniformity($correctedContent); //毫升单位校对 $correctedContent = $this->checkMlUnit($correctedContent); //显著性P斜体校对 $correctedContent = $this->checkPSignificance($correctedContent); //图表标题一律使用全称Figure 1, Table 1.不能写成Fig. 1, Tab 1. $correctedContent = $this->checkFigureTableTitle($correctedContent); //检测参考文献是否能打开 // $correctedContent = $this->checkDoi($correctedContent); //判断是否为空错误信息 if(empty($this->errors)){ return []; } return [ 'proof_before' => $content, 'proof_after' => $correctedContent, 'errors' => $this->errors ]; } /** * 横线/运算符校对/数字和单位(高可用版) */ private function checkTextFormat($content) { // 初始化错误数组 $errors = []; $defaultReturn = $content; $originalContent = $content; // 保存完整原始内容 $searchOffsetForExclude = 0; // 【新增】仅用于「特殊内容过滤」的偏移量 $searchOffsetForCore = 0; // 【新增】仅用于「核心规则处理」的偏移量 // 验证数据 if (!is_string($content) || trim($content) === '') { $this->handleErrors($errors); return $defaultReturn; } $corrected = $content; $excludeMarkers = []; // 存储 URL/DOI + / 的占位符映射 $processedHashes = []; // 编码处理 $originalEncoding = mb_detect_encoding($content, ['UTF-8', 'GBK', 'GB2312', 'ISO-8859-1'], true); if ($originalEncoding === false) { } else { $converted = @mb_convert_encoding($content, 'UTF-8', $originalEncoding); $corrected = $converted !== false ? 
$converted : $content; if ($converted === false) { $posStart = 0; $posEnd = min(20, strlen($originalContent)); } } // 过滤 / $mathTagRegex = '~<(wmath|math)[^>]*?>.*?~is'; if (@preg_match($mathTagRegex, '') === false) { // 正则错误处理(不变) } elseif (preg_match_all($mathTagRegex, $corrected, $matches, PREG_SET_ORDER)) { usort($matches, function($a, $b) { return strlen($b[0]) - strlen($a[0]); }); foreach ($matches as $index => $match) { $fullTag = $match[0]; $tagType = $match[1]; $marker = "___EXCLUDE_{$tagType}_" . time() . "_{$index}___"; $excludeMarkers[$marker] = $fullTag; // 【修改】使用独立偏移量 $searchOffsetForExclude $posStart = strpos($originalContent, $fullTag, $searchOffsetForExclude); $posEnd = ($posStart !== false) ? $posStart + strlen($fullTag) : -1; $searchOffsetForExclude = ($posEnd !== -1) ? $posEnd : $searchOffsetForExclude + strlen($fullTag); $safeFullTag = preg_quote($fullTag, '~'); $corrected = preg_replace("~{$safeFullTag}~u", $marker, $corrected, 1); } } // 过滤 URL/DOI $urlDoiRegex = '~( https?://[^\s/]{1,100} # 协议(http/https) + 域名(非空白/字符) (?:/+[A-Za-z0-9\.\-]+(?:-[A-Za-z0-9\.\-]+)*)* # 多级路径(支持.html后接/1/23等格式) (?:\?[A-Za-z0-9_\-=&%\+\.\~]+)? # 可选查询参数(如?J_num=8&page=1) (?:\#[A-Za-z0-9_\-]+)? # 可选锚点(如#section) | \b[a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,} # 无协议域名(如example.com) (?:/+[A-Za-z0-9\.\-]+(?:-[A-Za-z0-9\.\-]+)*)* # 无协议多级路径 (?:\?[A-Za-z0-9_\-=&%\+\.\~]+)? # 无协议查询参数 (?:\#[A-Za-z0-9_\-]+)? # 无协议锚点 (?=$|[\s\.,;!]) # 结束边界(空白或标点) | doi:\s{0,10}\d+\.\d+/[A-Za-z0-9-+×:]+(?:-[A-Za-z0-9-+×:]+)* # DOI格式 )~iux'; if (@preg_match($urlDoiRegex, '') === false) { // 正则错误处理(不变) } elseif (preg_match_all($urlDoiRegex, $corrected, $matches, PREG_SET_ORDER)) { // 按长度降序排序,优先处理长URL(避免短URL被包含时误替换) usort($matches, function($a, $b) { return strlen($b[1]) - strlen($a[1]); }); foreach ($matches as $index => $match) { $original = $match[1]; $marker = "___EXCLUDE_URL_" . time() . 
"_{$index}___"; $excludeMarkers[$marker] = $original; // 独立偏移量避免重复匹配,兼容特殊URL格式 $posStart = strpos($originalContent, $original, $searchOffsetForExclude); $posEnd = ($posStart !== false) ? $posStart + strlen($original) : -1; $searchOffsetForExclude = ($posEnd !== -1) ? $posEnd : $searchOffsetForExclude + strlen($original); // 精准替换当前URL为标记(仅1次,避免全局替换干扰) $corrected = preg_replace("~" . preg_quote($original, '~') . "~u", $marker, $corrected, 1); } } // 核心格式规则处理(优化偏移量计算与验证逻辑) $coreRules = $this->getTextCoreRules(); foreach ($coreRules as $rule) { if (@preg_match($rule['pattern'], '') === false) { continue; } // 匹配时保留偏移量信息,用于精准定位 $matchCount = preg_match_all( $rule['pattern'], $corrected, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE ); if ($matchCount === 0) { continue; } foreach ($matches as $match) { $original = $match[0][0]; // 匹配到的原始内容 $originalLen = strlen($original); $hash = md5($original); // 跳过已处理的内容,避免重复修正 if (isset($processedHashes[$hash])) { continue; } $offsetInCorrected = $match[0][1]; // 匹配内容在$corrected中的偏移量 $prefixInCorrected = substr($corrected, 0, $offsetInCorrected); $prefixInOriginal = strtr($prefixInCorrected, $excludeMarkers); // 还原占位符为原始内容 $posStart = strlen($prefixInOriginal); $posEnd = $posStart + $originalLen; $contentCheck = substr($originalContent, $posStart, $originalLen); $contentCheckConv = iconv('UTF-8', 'UTF-8//IGNORE', $contentCheck); // 忽略无效字符 $originalConv = iconv('UTF-8', 'UTF-8//IGNORE', $original); if (strcmp($contentCheckConv, $originalConv) !== 0) { // 验证失败时,基于当前偏移量重新定位 $localPattern = '~' . preg_quote($original, '~') . '~u'; if (preg_match($localPattern, $originalContent, $localMatch, PREG_OFFSET_CAPTURE, $searchOffsetForCore)) { $posStart = $localMatch[0][1]; $posEnd = $posStart + $originalLen; } else { continue; } } // 生成修正后的内容 $fixed = is_callable($rule['replacement']) ? 
call_user_func($rule['replacement'], $match) : preg_replace($rule['pattern'], $rule['replacement'], $original); // 仅在内容有变化时更新 if ($original !== $fixed && $fixed !== null) { $searchOffsetForCore = $posEnd; // 更新核心规则偏移量,避免重复匹配 $currentCorrected = str_replace($original, $fixed, $corrected); // 记录错误信息 $errors[] = $this->createError( $original, $fixed, $rule['explanation'], $originalContent, $currentCorrected, $posStart, $posEnd, $rule['error_type'] ?? '' ); $processedHashes[$hash] = true; $corrected = $currentCorrected; } } } // 批量还原 URL/DOI 和数学标签(保持不变,优化错误提示) $restoreErrors = []; if (!empty($excludeMarkers)) { $corrected = strtr($corrected, $excludeMarkers); // 检查未正常还原的占位符 if (preg_match_all('~___EXCLUDE_(wmath|math|URL)_\d+_\d+___~', $corrected, $remaining)) { foreach ($remaining[0] as $marker) { $original = $excludeMarkers[$marker] ?? '未知内容'; $restoreErrors[] = "未正常还原的占位符: {$marker}(原始内容: {$original})"; $corrected = str_replace($marker, $original, $corrected); // 强制还原 } } } $this->handleErrors($errors); return is_string($corrected) ? $corrected : $defaultReturn; } /** * 获取文本格式核心规则 */ private function getTextCoreRules() { return [ // 1. 最高优先级:特殊格式排除规则(首行专属排除No.编号) [ 'pattern' => '~ # 【首优先级】No.编号专属排除(如No.: 2023YJZX-LN03/13、NO: KHYJ-2023-05、no. 123-ABC/45) # 支持变体:No.大小写、冒号可带/不带点、冒号前后空格、编号含-/_/数字/字母 \b(?:No|NO|no)\.?:?\s* # 前缀:No./NO./no.(冒号可选,点可选,后接任意空格) [A-Za-z0-9\-\/_]+ # 编号主体:支持字母、数字、-、/、_(覆盖2023YJZX-LN03/13) (?:-[A-Za-z0-9\-\/_]+)* # 编号后缀:支持多段连接(如2023YJZX-LN03/13-001) \b # 单词边界:避免编号后接多余字符(如No.: 2023abc) | # 括号内分数及百分比组合(如(45/45)、(15.6%, 7/45)) \(\s*(?:\d+(?:\.\d+)?%?\s*,?\s*)?\d+(?:\.\d+)?\s*/\s*\d+(?:\.\d+)?\s*\) | # 独立年份范围(如1849-1850、2023 - 2025) (?]|from\s+|:\s*|\[[MZACDP]\]\.\s*)\d{4}\s*-\s*\d{4} (?!\d|[\+\-\*\/=<>]|[,\.:;!]|\+\d+\.) 
| # from+年份范围(如from 1849-1850) \bfrom\s+\d{4}\s*-\s*\d{4}\b | # 带单位的数字范围/倍数(如50-200 nm、10×5 cm) \b\d+\s*[-×]\s*\d+\s*[a-zA-Z%] | # 无No.前缀的项目编号(如2023YJZX-LN03/13、KHYJ-2023-05-01) [A-Za-z]+-?\d+[-/]\d+[-/]\d* | # 参考文献格式(期刊/专著等) \d{4},\s*\d{1,3}\(\d{1,2}\):\s*\d+-\d+(?:\+\d+)*\.|[^\n]+\[[MZACDP]\]\.\s*[^\n]+,\s*\d{4}:\s*\d+-\d+\. ~ux', 'replacement' => '$0', // 完全保留原始格式,不做任何修改 'verbatim_texts' => 'No.编号及非运算场景无需处理', 'explanation' => 'No.系列编号(如No.: 2023YJZX-LN03/13)、括号内分数、年份范围、带单位数字范围、项目编号、参考文献等非运算场景的符号不做处理', 'error_type' => 'exclude' ], // 2. 次高优先级:数字范围规则(避免与-冲突) [ 'pattern' => '~(\[\s*[-]?\d+\s*)\x{2014}\s*(\d+\s*\])~u', 'replacement' => '$1-$2', 'verbatim_texts' => '带括号数字范围长划线不规范', 'explanation' => '带括号的数字范围应使用短划线[-]', 'error_type' => 'en-dash' ], [ 'pattern' => '~(\[\s*[-]?\d+)\s*-\s*(\d+\s*\])~u', 'replacement' => '$1-$2', 'verbatim_texts' => '带括号数字范围短划线空格不规范', 'explanation' => '带括号数字范围的短划线[-]前后不应留空格', 'error_type' => 'en-dash' ], [ 'pattern' => '~(\b\d+)\s*—\s*(\d+\b)~u', 'replacement' => '$1-$2', 'verbatim_texts' => '无括号数字范围长划线不规范', 'explanation' => '无括号的数字范围应使用短划线[-]', 'error_type' => 'bracket_en-dash' ], [ 'pattern' => '~ (? '$1-$2', 'verbatim_texts' => '无括号数字范围短划线空格不规范', 'explanation' => '无括号数字范围的短划线[-]前后不应留空格', 'error_type' => 'bracket_en-dash' ], // 3. 核心优先级:运算符规则(精准匹配,排除No.编号干扰) [ // 'pattern' => '~(\S)\s*([<>!]=|===|!==)\s*(\S)~u', 'pattern' => '~(?)\s*(\S)\s*([<>!]=|===|!==)\s*(\S)(?!<[a-z]+>)~u', 'replacement' => '$1 $2 $3', 'verbatim_texts' => '复合运算符前后空格不规范', 'explanation' => '复合运算符[>=、<=、==、!=、===、!==]前后应各留一个空格', 'error_type' => 'composite_operator' ], [ 'pattern' => '~ (?|\*|\+|-|/) (\S+?)\s*=\s*(\S+?) (?!=|<|>|\*|\+|-|/) ~ux', 'replacement' => '$1 = $2', 'verbatim_texts' => '等号前后空格不规范', 'explanation' => '独立等号[=]前后应各留一个空格', 'error_type' => 'equal' ], // 乘法(排除No.编号中的*) [ 'pattern' => '~ (? '$1 × $3', 'verbatim_texts' => '乘法运算符格式不规范', 'explanation' => '乘法运算应使用标准乘号[×],前后各留一个空格', 'error_type' => 'ride' ], // 除法(排除No.编号中的/) [ 'pattern' => '~ (? 
'$1 $2 $3', 'verbatim_texts' => '除法运算符前后空格不规范', 'explanation' => '除法运算符[/]前后应各留一个空格(纯数字运算场景)', 'error_type' => 'except' ], // 加法(排除No.编号中的+) [ 'pattern' => '~ (? '$1 $2 $3', 'verbatim_texts' => '加法运算符前后空格不规范', 'explanation' => '加法运算符[+]前后应各留一个空格(纯数字运算场景)', 'error_type' => 'plus' ], // 减法(排除No.编号中的-) [ 'pattern' => '~ (? '$1 $2 $3', 'verbatim_texts' => '减法运算符前后空格不规范', 'explanation' => '减法运算符[-]前后应各留一个空格(纯数字运算场景)', 'error_type' => 'reduce' ], // 4. 低优先级:特殊符号规则 [ 'pattern' => '~(\d+)\s+%~u', 'replacement' => '$1%', 'verbatim_texts' => '数字与百分号空格不规范', 'explanation' => '数字与百分号[%]之间不应留空格', 'error_type' => 'number_percentage' ], [ 'pattern' => '~(\d+)\s+×\s+(\d+)~u', 'replacement' => '$1×$2', 'verbatim_texts' => '倍数乘号空格不规范', 'explanation' => '乘号[×]表示倍数时前后不应留空格', 'error_type' => 'multiple' ], [ 'pattern' => '~(\d+)\s+:\s+(\d+)~u', 'replacement' => '$1:$2', 'verbatim_texts' => '比值符号空格不规范', 'explanation' => '比值符号[:]前后不应留空格', 'error_type' => 'biliel' ] ]; } /** * 数字格式处理 */ private function checkNumberFormat($content) { $errors = []; $defaultReturn = $content; $originalContent = $content; $searchOffset = 0; if (!is_string($content) || trim($content) === '') { $this->handleErrors($errors); return $defaultReturn; } $correctedContent = $content; $replacements = []; $urlDoiPlaceholders = []; $prefixFormatPlaceholders = []; $decimalAlphaPlaceholders = []; $dateRelatedPlaceholders = []; $specialDecimalPlaceholders = []; $softwareVersionPlaceholders = []; $postalCodePlaceholders = []; // 精准保护邮编 $bracketedNumPlaceholders = []; // 精准保护括号内数字 // 保护括号内数字(仅匹配(960-1279)这类格式) $bracketedNumPattern = '~ \(\d+[-\d]*\d+\) # 仅匹配带括号的数字/数字范围 ~ux'; if (@preg_match($bracketedNumPattern, '') !== false) { $correctedContent = preg_replace_callback( $bracketedNumPattern, function ($matches) use (&$bracketedNumPlaceholders, $originalContent, &$searchOffset) { $fullMatch = $matches[0]; $placeholder = '___BRACKETED_NUM_' . uniqid() . 
'___'; $bracketedNumPlaceholders[$placeholder] = $fullMatch; $posStart = strpos($originalContent, $fullMatch, $searchOffset); $searchOffset = $posStart !== false ? $posStart + strlen($fullMatch) : $searchOffset; return $placeholder; }, $correctedContent ); } // 精准保护邮编(仅匹配“地名+空格+4-6位数字” $postalCodePattern = '~ \b(?:[A-Za-z]+(?:\s+[A-Za-z]+)*|[\x{4e00}-\x{9fa5}]+)\s+\d{4,6}\b # 强制空格(如Jiangsu 223300、北京 100000) |\b0\d{2,3}\d{7}\b # 兼容区号+固定电话(02588888888、01012345678) ~uix'; if (@preg_match($postalCodePattern, '') !== false) { $correctedContent = preg_replace_callback( $postalCodePattern, function ($matches) use (&$postalCodePlaceholders, $originalContent, &$searchOffset) { $fullMatch = $matches[0]; $placeholder = '___POSTAL_CODE_' . uniqid() . '___'; $postalCodePlaceholders[$placeholder] = $fullMatch; $posStart = strpos($originalContent, $fullMatch, $searchOffset); $searchOffset = $posStart !== false ? $posStart + strlen($fullMatch) : $searchOffset; return $placeholder; }, $correctedContent ); } //保护软件版本 $softwareVersionPattern = '~ \b(?:[A-Za-z]+(?:\s+[A-Za-z]+)*|[\x{4e00}-\x{9fa5}]+(?:\s+[\x{4e00}-\x{9fa5}]+)*)\s+\d+\.\d+(?:\.\d+)*\b ~uix'; if (@preg_match($softwareVersionPattern, '') !== false) { $correctedContent = preg_replace_callback( $softwareVersionPattern, function ($matches) use (&$softwareVersionPlaceholders, $originalContent, &$searchOffset) { $fullMatch = $matches[0]; $placeholder = '___SOFTWARE_VERSION_' . uniqid() . '___'; $softwareVersionPlaceholders[$placeholder] = $fullMatch; $posStart = strpos($originalContent, $fullMatch, $searchOffset); $searchOffset = $posStart !== false ? 
$posStart + strlen($fullMatch) : $searchOffset; return $placeholder; }, $correctedContent ); } //保护特殊小数 $specialDecimalPattern = '~ a=\s*[\d+\.\d+[A-Za-z]+\d*\-+]+ |\b\d+\.\d+[A-Za-z]+\d*\b |\b\d+\.\d+[-+]\d+\.\d+[A-Za-z]+\d*\b |\b\d+\.\d+[-+]\d+\.\d+\b ~ux'; if (@preg_match($specialDecimalPattern, '') !== false) { $correctedContent = preg_replace_callback( $specialDecimalPattern, function ($matches) use (&$specialDecimalPlaceholders, $originalContent, &$searchOffset) { $fullMatch = $matches[0]; $placeholder = '___SPECIAL_DECIMAL_' . uniqid() . '___'; $specialDecimalPlaceholders[$placeholder] = $fullMatch; $posStart = strpos($originalContent, $fullMatch, $searchOffset); $searchOffset = $posStart !== false ? $posStart + strlen($fullMatch) : $searchOffset; return $placeholder; }, $correctedContent ); } // 保护年份/年月格式(2023、202309、2023-0021等) $dateRelatedPattern = '~ \b(?:20\d{2}|20\d{2}(0[1-9]|1[0-2])|20\d{2}-00\d{2})\b(?!\s*[A-Za-z]|\.) ~ux'; if (@preg_match($dateRelatedPattern, '') !== false) { $correctedContent = preg_replace_callback( $dateRelatedPattern, function ($matches) use (&$dateRelatedPlaceholders, $originalContent, &$searchOffset) { $fullMatch = $matches[0]; $placeholder = '___DATE_PROTECT_' . uniqid() . '___'; $dateRelatedPlaceholders[$placeholder] = $fullMatch; $posStart = strpos($originalContent, $fullMatch, $searchOffset); $searchOffset = $posStart !== false ? $posStart + strlen($fullMatch) : $searchOffset; return $placeholder; }, $correctedContent ); } //6. 保护0.00Ac类格式(如1.20mL、0.50mg,避免误删末尾零) $decimalAlphaPattern = '~ \b(?:\d+\.\d+[A-Za-z]+|\d+\.[A-Za-z]+)\b(?!\s*[0-9.]) ~ux'; if (@preg_match($decimalAlphaPattern, '') !== false) { $correctedContent = preg_replace_callback( $decimalAlphaPattern, function ($matches) use (&$decimalAlphaPlaceholders, $originalContent, &$searchOffset) { $fullMatch = $matches[0]; $placeholder = '___DECIMAL_ALPHA_' . uniqid() . 
'___'; $decimalAlphaPlaceholders[$placeholder] = $fullMatch; $posStart = strpos($originalContent, $fullMatch, $searchOffset); $searchOffset = $posStart !== false ? $posStart + strlen($fullMatch) : $searchOffset; return $placeholder; }, $correctedContent ); } //7. 保护通用前缀格式(如ID 123、REF AB456) $universalPrefixPattern = '~ (?:^|\s|\() (?:(?!No\.|NO\.|PO|SO|SN|BN|REF|ORD|ID|PID)[A-Za-z]{1,3}(?:s?\.?)) \s* (?:[A-Za-z]+\d+|\d+[A-Za-z]+|[A-Za-z]+\d+[A-Za-z]+|\d{1,3}(?:,\d{3})*|\d+) (?:$|\s|\)|\,|\.) ~ux'; if (@preg_match($universalPrefixPattern, '') !== false) { $correctedContent = preg_replace_callback( $universalPrefixPattern, function ($matches) use (&$prefixFormatPlaceholders, $originalContent, &$searchOffset) { $fullMatch = $matches[0]; $placeholder = '___UNIVERSAL_PREFIX_' . uniqid() . '___'; $prefixFormatPlaceholders[$placeholder] = $fullMatch; $posStart = strpos($originalContent, $fullMatch, $searchOffset); $searchOffset = $posStart !== false ? $posStart + strlen($fullMatch) : $searchOffset; return $placeholder; }, $correctedContent ); } // 保护URL/DOI(避免链接中的数字被误加千分位) $urlDoiPattern = '#([^\w]|^)(https?://[^<>\s]+|doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9]{1,30})([^\w]|$)#i'; if (@preg_match($urlDoiPattern, '') !== false) { $correctedContent = preg_replace_callback( $urlDoiPattern, function ($matches) use (&$urlDoiPlaceholders, $originalContent, &$searchOffset) { $fullMatch = $matches[0]; $placeholder = '___URL_DOI_' . uniqid() . '___'; $urlDoiPlaceholders[$placeholder] = $fullMatch; $posStart = strpos($originalContent, $fullMatch, $searchOffset); $searchOffset = $posStart !== false ? 
$posStart + strlen($fullMatch) : $searchOffset; return $placeholder; }, $correctedContent ); } // 小数零处理(仅删除普通小数的无效零,跳过特殊格式) $decimalTrailingZeroPattern = '~(-?\d+\.\d*[1-9])0+(?!\d|e|E|___DATE_PROTECT_|___DECIMAL_ALPHA_|___UNIVERSAL_PREFIX_|No\.|PO|SO|___SPECIAL_DECIMAL_|___SOFTWARE_VERSION_|___POSTAL_CODE_|___BRACKETED_NUM_|\-|\+|[A-Za-z])~ix'; preg_match_all($decimalTrailingZeroPattern, $correctedContent, $trailingMatches); foreach (array_unique($trailingMatches[0]) as $number) { if (strpos($number, '___POSTAL_CODE_') !== false || strpos($number, '___BRACKETED_NUM_') !== false) { continue; } if (preg_match($decimalTrailingZeroPattern, $number, $numMatch)) { $replacements[$number] = $numMatch[1]; $posStart = strpos($originalContent, $number, $searchOffset); $posEnd = $posStart !== false ? $posStart + strlen($number) : -1; $searchOffset = $posEnd !== -1 ? $posEnd : $searchOffset; $currentCorrected = strtr($originalContent, $replacements); $errors[] = $this->createError( $number, $numMatch[1], "删除普通小数后末尾无效零", $originalContent, $currentCorrected, $posStart, $posEnd, 'invalid_zero' ); } } $decimalAllZeroPattern = '~(-?\d+)\.0+(?!\d|e|E|___DATE_PROTECT_|___DECIMAL_ALPHA_|___UNIVERSAL_PREFIX_|No\.|PO|SO|___SPECIAL_DECIMAL_|___SOFTWARE_VERSION_|___POSTAL_CODE_|___BRACKETED_NUM_|\-|\+|[A-Za-z])~ix'; preg_match_all($decimalAllZeroPattern, $correctedContent, $allZeroMatches); foreach (array_unique($allZeroMatches[0]) as $number) { if (strpos($number, '___POSTAL_CODE_') !== false || strpos($number, '___BRACKETED_NUM_') !== false) { continue; } if (preg_match($decimalAllZeroPattern, $number, $numMatch)) { $replacements[$number] = $numMatch[1]; $posStart = strpos($originalContent, $number, $searchOffset); $posEnd = $posStart !== false ? $posStart + strlen($number) : -1; $searchOffset = $posEnd !== -1 ? 
$posEnd : $searchOffset; $currentCorrected = strtr($originalContent, $replacements); $errors[] = $this->createError( $number, $numMatch[1], "删除普通小数后全量无效零", $originalContent, $currentCorrected, $posStart, $posEnd, 'invalid_zero' ); } } $correctedContent = strtr($correctedContent, $replacements); // 千分位处理 $excludePatterns = implode('|', [ 'https?://[^<>\s]+|doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9]{1,30}', '20\d{2}(?:0[1-9]|1[0-2])?(?:0[1-9]|[12]\d|3[01])?|20\d{2}-00\d{2}', '\d+\.\d+[A-Za-z]+|\d+\.[A-Za-z]+', '\(\d+[-\d]*\d+\)', '(?:[A-Za-z]+(?:\s+[A-Za-z]+)*|[\x{4e00}-\x{9fa5}]+)\s+\d{4,6}\b|0\d{2,3}\d{7}\b', '(?:[A-Za-z]+(?:\s+[A-Za-z]+)*|[\x{4e00}-\x{9fa5}]+(?:\s+[\x{4e00}-\x{9fa5}]+)*)\s+\d+\.\d+(?:\.\d+)*', '[A-Za-z]{1,3}s?\.?\s*(?:[A-Za-z]+\d+|\d+[A-Za-z]+|\d{1,3}(?:,\d{3})*|\d+)', 'No\.?\s*\d+|PO\s*\d+|SO\s*\d+|SN\s*\d+', 'a=\s*[\d+\.\d+[A-Za-z]+\d*\-+]+', '___DATE_PROTECT_.*?___|___DECIMAL_ALPHA_.*?___|___UNIVERSAL_PREFIX_.*?___|___URL_DOI_.*?___|___SPECIAL_DECIMAL_.*?___|___SOFTWARE_VERSION_.*?___|___POSTAL_CODE_.*?___|___BRACKETED_NUM_.*?___' ]); $thousandPattern = sprintf( '#(?createError( $original, $formatted, "四位及以上的数字需要每三位加一个逗号", $originalContent, $currentCorrected, $posStart, $posEnd, 'thousandth_separator' ); return $formatted; }, $correctedContent ); } // 恢复所有保护内容(按优先级反向,避免相互干扰) $correctedContent = strtr($correctedContent, $bracketedNumPlaceholders); $correctedContent = strtr($correctedContent, $postalCodePlaceholders); $correctedContent = strtr($correctedContent, $softwareVersionPlaceholders); $correctedContent = strtr($correctedContent, $specialDecimalPlaceholders); $correctedContent = strtr($correctedContent, $dateRelatedPlaceholders); $correctedContent = strtr($correctedContent, $decimalAlphaPlaceholders); $correctedContent = strtr($correctedContent, $prefixFormatPlaceholders); $correctedContent = strtr($correctedContent, $urlDoiPlaceholders); // 清理残留占位符(防止异常情况下占位符未替换) $correctedContent = 
preg_replace('~___(BRACKETED_NUM|POSTAL_CODE|SOFTWARE_VERSION|SPECIAL_DECIMAL|DATE_PROTECT|DECIMAL_ALPHA|UNIVERSAL_PREFIX|URL_DOI)_.*?___~', '', $correctedContent); $this->handleErrors($errors); return is_string($correctedContent) ? $correctedContent : $defaultReturn; } /** * No. 123456格式统一 */ private function checkNoFormatUniformity($content) { $errors = []; if (!is_string($content) || trim($content) === '') { $this->handleErrors($errors); return $content; } $corrected = $content; $replaceMap = []; $originalContent = $corrected; $searchOffset = 0; // 关键:精准排除规则 $postalCodePattern = '~(?:[A-Za-z]+(?:\s+[A-Za-z]+)*|[\x{4e00}-\x{9fa5}]+)\s+\d{4,6}\b~u'; // 邮编 $areaCodePattern = '~0\d{2,3}\d{7}\b~u'; // 区号 $urlPattern = '~https?://[^<>\s]+~i'; // URL(如https://test.com/10.1101/2024.11.10) $doiPattern = '~doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9\-_]+~i'; // DOI(如doi:11.1/1-1-1-1-9_2) $batchNumberRules = [ [ 'name' => 'No.前缀批号', 'pattern' => '~ \b (?:[Nn][Oo]\.|[Nn][Oo]|NO\.|NO) \s* (\d+[A-Za-z0-9\-_]*) \b (?!\s*[年月日]|20\d{2}(?:0[1-9]|1[0-2])?|\.\d+|20\d{2}-00\d{2} |https?://[^<>\s]+ # 排除URL |doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9\-_]+) # 排除DOI ~ux', 'standardPrefix' => 'No.', 'spaceAfterPrefix' => true, 'description' => '带No.前缀的编号(如No. 123、NO.45-A)' ], [ 'name' => '业务前缀批号', 'pattern' => '~ \b (PO|SO|SN|BN|REF|ORD|ID|PID) \s* (\d+[A-Za-z0-9\-_]*) \b (?!\s*[年月日]|20\d{2}(?:0[1-9]|1[0-2])?|\.\d+|20\d{2}-00\d{2} |https?://[^<>\s]+ # 排除URL |doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9\-_]+) # 排除DOI ~iux', 'standardPrefix' => function($match) { return strtoupper($match[1]); }, 'spaceAfterPrefix' => true, 'description' => '带业务前缀的编号' ], // [ // 'name' => '多段式批号', // 'pattern' => '~ // \b // (?:\d+[A-Za-z]?[-_/])+ // \d+[A-Za-z]? 
// \b // (?!\s*[年月日]|20\d{2}(?:0[1-9]|1[0-2])?|20\d{2}-00\d{2} // |\d+\.\d+[A-Za-z]+ // |https?://[^<>\s]+ # 排除URL // |doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9\-_]+) # 排除DOI // ~ux', // 'standardize' => function($original) use ($postalCodePattern, $areaCodePattern, $urlPattern, $doiPattern) { // // 排除URL、DOI、邮编、区号、日期 // if (preg_match('~20\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])?|20\d{2}-00\d{2}~', $original) // || preg_match($postalCodePattern, $original) // || preg_match($areaCodePattern, $original) // || preg_match($urlPattern, $original) // || preg_match($doiPattern, $original)) { // return $original; // } // return preg_replace(['~[-_/]+~', '~\s+~'], ['-', ''], $original); // }, // 'description' => '多段式编号(如2023-AB-123、XY_456-78)' // ], [ 'name' => '混合批号', 'pattern' => '~ \b (?: \d{6,}(?!20\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])?|20\d{2}-00\d{2} |(?<=[A-Za-z\s])\d{4,6}\b # 排除邮编 |0\d{2,3}\d{7}\b # 排除区号 |https?://[^<>\s]+ # 排除URL |doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9\-_]+) # 排除DOI |[A-Za-z]{2,}\d{4,} |[A-Za-z0-9]{8,}(?!20\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])?|20\d{2}-00\d{2} |https?://[^<>\s]+ # 排除URL |doi:\s{0,10}\d{1,10}\.\d{1,10}/[A-Za-z0-9\-_]+) # 排除DOI ) \b (?!\s*[年月日]) (?!\.\d+) (?!\d+\.\d+[A-Za-z]+) (?!(?:^|\s|\()(?:[A-Za-z]{1,3}(?:s?\.?))\s*) ~ux', 'standardize' => function($original) use ($postalCodePattern, $areaCodePattern, $urlPattern, $doiPattern) { // 排除URL、DOI、邮编、区号、日期 if (preg_match('~20\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])?|20\d{2}-00\d{2}~', $original) || preg_match($postalCodePattern, $original) || preg_match($areaCodePattern, $original) || preg_match($urlPattern, $original) || preg_match($doiPattern, $original)) { return $original; } return ctype_digit($original) ? 
$original : $original; }, 'description' => '纯数字/字母混合编号' ] ]; foreach ($batchNumberRules as $rule) { if (@preg_match($rule['pattern'], '') === false) continue; if (preg_match_all($rule['pattern'], $corrected, $matches, PREG_SET_ORDER)) { foreach ($matches as $match) { $originalFull = $match[0]; $fixedFull = $originalFull; // 核心排除逻辑:新增URL和DOI的判断 if (preg_match($postalCodePattern, $originalFull) || preg_match($areaCodePattern, $originalFull) || preg_match('~20\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])?|20\d{2}-00\d{2}~', $originalFull) || preg_match($urlPattern, $originalFull) // 跳过URL || preg_match($doiPattern, $originalFull)) { // 跳过DOI continue; } if (isset($rule['standardPrefix'])) { preg_match($rule['pattern'], $originalFull, $parts); $body = $parts[1]; $standardPrefix = is_callable($rule['standardPrefix']) ? $rule['standardPrefix']($parts) : $rule['standardPrefix']; $space = $rule['spaceAfterPrefix'] ? ' ' : ''; $fixedFull = $standardPrefix . $space . $body; } elseif (isset($rule['standardize']) && is_callable($rule['standardize'])) { $fixedFull = $rule['standardize']($originalFull); } if ($originalFull !== $fixedFull && !isset($replaceMap[$originalFull])) { $replaceMap[$originalFull] = $fixedFull; $posStart = strpos($originalContent, $originalFull, $searchOffset); $posEnd = $posStart !== false ? $posStart + strlen($originalFull) : -1; $searchOffset = $posEnd !== -1 ? $posEnd : $searchOffset; $errorHash = md5($originalFull . $fixedFull); $errors[$errorHash] = $this->createError( $originalFull, $fixedFull, "{$rule['description']}格式不规范,标准格式为「{$fixedFull}」", $originalContent, strtr($originalContent, $replaceMap), $posStart, $posEnd, $rule['name'] ); } } } } $corrected = !empty($replaceMap) ? 
strtr($corrected, $replaceMap) : $corrected; $this->handleErrors($errors); return $corrected; } /** * 时间单位缩写校对 */ private function checkTimeUnitAbbreviations($content) { $errors = []; if (!is_string($content) || trim($content) === '') { $this->handleErrors($errors); return $content; } $corrected = $content; $replaceMap = []; $originalContent = $corrected; $searchOffset = 0; // 定义时间单位规则 $timeUnits = [ [ 'full' => 'hour', 'plural' => 'hours', 'abbr' => 'h', 'description' => '小时', 'cn_full' => '小时', // 中文全称 'cn_plural' => '小时' // 中文单复数同形 ], [ 'full' => 'minute', 'plural' => 'minutes', 'abbr' => 'min', 'description' => '分钟', 'cn_full' => '分钟', 'cn_plural' => '分钟' ], [ 'full' => 'second', 'plural' => 'seconds', 'abbr' => 's', 'description' => '秒', 'cn_full' => '秒', 'cn_plural' => '秒' ] ]; foreach ($timeUnits as $unit) { $pattern = "~ (?createError( $original, $fixed, "{$unit['description']}格式不规范:{$errorReason},标准格式为[数字{$unit['abbr']}](如3h、2.5min)", $originalContent, strtr($originalContent, $replaceMap + [$original => $fixed]), $posStart, $posEnd, $errorType ); } // 记录替换映射 if (!isset($replaceMap[$original])) { $replaceMap[$original] = $fixed; } } } } } // 批量替换并处理错误 if (!empty($replaceMap)) { $corrected = strtr($corrected, $replaceMap); } $this->handleErrors($errors); return $corrected; } /** * 毫升单位校对 */ private function checkMlUnit($content) { $errors = []; if (!is_string($content) || trim($content) === '') { $this->handleErrors($errors); return $content; } $corrected = $content; $replaceMap = []; $originalContent = $corrected; // 保存完整原始内容 $searchOffset = 0; // 用于计算错误位置的偏移量(避免重复定位) // 优化正则规则: // 1. 排除字母后接ML(如Yeh ML,ML为人名缩写) // 2. 
精准匹配毫升单位(支持数字前缀如“5ml”“3.0 ML”,或纯单位如“ml”“ML”) $mlPattern = '/ (?createError( $originalFull, $fixedFull, '毫升单位格式不规范,标准写法为[mL]', $originalContent, strtr($originalContent, $replaceMap + [$originalFull => $fixedFull]), $posStart, $posEnd, $errorType ); } // 记录替换映射(去重,避免同一内容多次替换) if (!isset($replaceMap[$originalFull])) { $replaceMap[$originalFull] = $fixedFull; } } } // 批量替换所有不规范单位(高效处理,避免循环替换) if (!empty($replaceMap)) { $corrected = strtr($corrected, $replaceMap); } } $this->handleErrors($errors); return $corrected; } /** * 显著性P斜体校对 */ private function checkPSignificance($content) { $errors = []; if (!is_string($content) || trim($content) === '') { $this->handleErrors($errors); return $content; } $corrected = $content; $replaceMap = []; $originalContent = $corrected; // 保存完整原始内容 $searchOffset = 0; // 用于计算错误位置的偏移量(避免重复定位) // 优化正则规则(覆盖P/p全场景,支持科学计数法) $pValuePattern = '/\b([Pp])(\s*=?\s*)(\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\b/'; // 正则有效性校验 if (@preg_match($pValuePattern, '') === false) { } elseif (preg_match_all($pValuePattern, $corrected, $allMatches, PREG_SET_ORDER)) { foreach ($allMatches as $matchItem) { $original = $matchItem[0]; // 原始P值内容(如 "P=0.05"、"p < 0.01") $pChar = $matchItem[1]; // P/p字符(如 "P"、"p") $separator = $matchItem[2];// 分隔符(如 "="、" < ") $number = $matchItem[3]; // 数值部分(如 "0.05"、"1.2e-3") // 生成修正内容(仅P/p加斜体) $fixed = "{$pChar}{$separator}{$number}"; // 仅处理有变化的场景 if ($original !== $fixed) { // 计算原始P值内容在完整原始文本中的位置 $posStart = strpos($originalContent, $original, $searchOffset); $posEnd = ($posStart !== false) ? $posStart + strlen($original) : -1; $searchOffset = ($posEnd !== -1) ? $posEnd : $searchOffset + strlen($original); // 更新偏移量 // 错误去重(哈希机制) $errorHash = md5($original . 
$fixed); $errorType = 'P'; if (!isset($errors[$errorHash])) { $errors[$errorHash] = $this->createError( $original, $fixed, '显著性P值格式不规范,P/p应使用斜体', $originalContent, strtr($originalContent, $replaceMap + [$original => $fixed]), $posStart, $posEnd, $errorType ); } // 记录替换映射(去重) if (!isset($replaceMap[$original])) { $replaceMap[$original] = $fixed; } } } // 批量替换 if (!empty($replaceMap)) { $corrected = strtr($corrected, $replaceMap); } } $this->handleErrors($errors); return $corrected; } /** * 图表标题一律使用全称Figure 1, Table 1.不能写成Fig. 1, Tab 1. */ private function checkFigureTableTitle($content) { $errors = []; // 严格输入验证:空内容/非字符串直接返回 if (!is_string($content) || trim($content) === '') { $this->handleErrors($errors); return $content; } $corrected = $content; $replaceMap = []; $originalContent = $corrected; // 备份原始内容,用于错误信息 $searchOffset = 0; // 错误位置计算偏移量,避免重复定位 // 图表标题匹配正则(支持 Fig/Figs/Tab/Tabs、特殊空格、数字范围) $titlePattern = '/(? 0) { foreach ($allMatches as $matchItem) { $originalFull = $matchItem[0]; // 完整错误片段(如 "Fig 1"、"Tabs-2") $abbrBase = $matchItem[1]; // 缩写主体(Fig/Figs/Tab/Tabs) $dot = $matchItem[2]; // 可能的点(.) $space = $matchItem[3]; // 可能的空格(含特殊空格) $number = $matchItem[4]; // 数字部分(支持范围如 "2-3") // 确定全称及错误描述 switch (strtolower($abbrBase)) { case 'fig': $fullName = 'Figure'; $errorDesc = '图表标题使用缩写"Fig",正确:"Figure"'; break; case 'figs': $fullName = 'Figures'; $errorDesc = '图表标题复数使用缩写"Figs",正确:"Figures"'; break; case 'tab': $fullName = 'Table'; $errorDesc = '表格标题使用缩写"Tab",正确:"Table"'; break; case 'tabs': $fullName = 'Tables'; $errorDesc = '表格标题复数使用缩写"Tabs",正确:"Tables"'; break; default: $fullName = ''; $errorDesc = ''; continue 2; // 修复警告:跳出 switch + 跳过当前 foreach 迭代 } // 生成标准格式(全称 + 单个空格 + 数字) $fixed = "{$fullName} {$number}"; // 仅处理需要修正的场景(避免无意义操作) if ($originalFull !== $fixed) { // 计算错误片段在原始文本中的位置 $posStart = strpos($originalContent, $originalFull, $searchOffset); $posEnd = ($posStart !== false) ? $posStart + strlen($originalFull) : -1; $searchOffset = ($posEnd !== -1) ? 
$posEnd : $searchOffset + strlen($originalFull);
                        // De-duplicate error entries by hashing original + corrected text
                        $errorHash = md5($originalFull . $fixed);
                        if (!isset($errors[$errorHash])) {
                            // Build a temporary corrected text for the error preview
                            $tempReplace = $replaceMap;
                            $tempReplace[$originalFull] = $fixed;
                            $currentCorrected = strtr($originalContent, $tempReplace);
                            $errors[$errorHash] = $this->createError(
                                $originalFull,
                                $fixed,
                                $errorDesc,
                                $originalContent,
                                $currentCorrected,
                                $posStart,
                                $posEnd,
                                $fullName
                            );
                        }
                        // Record the replacement once per distinct fragment
                        if (!isset($replaceMap[$originalFull])) {
                            $replaceMap[$originalFull] = $fixed;
                        }
                    }
                }
                // Apply all collected replacements in one pass
                if (!empty($replaceMap)) {
                    $corrected = strtr($corrected, $replaceMap);
                }
            }
        }
        // Hand the collected errors to handleErrors()
        $this->handleErrors($errors);
        return $corrected;
    }

    /**
     * Append one error record to $this->errors.
     *
     * Missing keys are filled with defaults so every stored record has the
     * same shape as the records produced by createError(): empty strings for
     * the text fields and -1 for unknown positions.
     *
     * @param array $error Error record (normally built by createError()).
     *                     Empty or non-array values are ignored.
     * @return void
     */
    private function addError($error = []) {
        if (empty($error) || !is_array($error)) {
            return;
        }
        // Defaults mirror createError(): -1 is the "position unknown" sentinel.
        $defaults = [
            'verbatim_texts' => '',
            'revised_content' => '',
            'explanation' => '',
            'original' => '',
            'corrected' => '',
            'position_start' => -1,
            'position_end' => -1,
            'error_type' => '',
        ];
        $this->errors[] = array_merge($defaults, $error);
    }

    /**
     * De-duplicate a batch of error records and store them via addError().
     *
     * Two records are duplicates when their verbatim text, revised text,
     * start position and end position are all equal. The first occurrence
     * wins; input keys are discarded.
     *
     * @param array $errors List (or hash-keyed map) of error records.
     * @return void
     */
    private function handleErrors($errors) {
        if (empty($errors) || !is_array($errors)) {
            return;
        }
        $uniqueErrors = [];
        foreach ($errors as $error) {
            if (!is_array($error)) {
                continue;
            }
            // Serialise the fields as a tuple instead of concatenating them:
            // plain concatenation lets different records collide, e.g.
            // ('a','bc') vs ('ab','c') or positions (1,23) vs (12,3).
            $errorHash = md5(serialize([
                $error['verbatim_texts'] ?? '',
                $error['revised_content'] ?? '',
                $error['position_start'] ?? '',
                $error['position_end'] ?? '',
            ]));
            if (!isset($uniqueErrors[$errorHash])) {
                $uniqueErrors[$errorHash] = $error;
            }
        }
        foreach ($uniqueErrors as $error) {
            $this->addError($error);
        }
    }

    /**
     * Build a normalised error record.
     *
     * Callers in this class compute $position_start with strpos(), which
     * yields boolean false on a miss while they pass -1 as the end position.
     * Both positions are normalised so that "not found" is always -1.
     *
     * @param string    $verbatim       Offending text as found in the input.
     * @param string    $revised        Suggested replacement text.
     * @param string    $explanation    Human-readable reason for the change.
     * @param string    $original       Full text before correction.
     * @param string    $corrected      Full text after correction.
     * @param int|false $position_start Byte offset of $verbatim, -1/false if unknown.
     * @param int|false $position_end   Byte offset just past $verbatim, -1/false if unknown.
     * @param string    $error_type     Rule identifier.
     * @return array Record with keys verbatim_texts, revised_content,
     *               explanation, original, corrected, position_start,
     *               position_end, error_type.
     */
    private function createError($verbatim='', $revised='', $explanation='',$original='',$corrected='', $position_start=-1, $position_end=-1,$error_type='') {
        return [
            'verbatim_texts' => $verbatim,
            'revised_content' => $revised,
            'explanation' => $explanation,
            'original' => $original,
            'corrected' => $corrected,
            'position_start' => ($position_start === false) ? -1 : $position_start,
            'position_end' => ($position_end === false) ? -1 : $position_end,
            'error_type' => $error_type
        ];
    }

    /**
     * Check whether every DOI link in the text can be opened.
     */
    private function checkDoi($content) {
        $errors = [];
        if (!is_string($content) || trim($content) === '') {
            $this->handleErrors($errors); // note: earlier typo "handleErrorsErrors" was fixed here
            return $content;
        }
        $corrected = $content;
        $originalContent = $corrected;
        $checkedDois = []; // de-duplication map so each DOI is checked once
        try {
            // Regex intended to match standard DOI notation
            // Matching rules:
            // NOTE(review): the source text is truncated between "(?" and
            // "createError(" below — the remaining rule comments, the
            // $doiPattern definition, its validity check and the
            // "$errors[] = $this->" prefix are missing. Restore them from
            // version control; left untouched here.
            // 1. (?createError(
                    'DOI正则错误',
                    '跳过DOI校验',
                    "DOI匹配正则语法错误:{$doiPattern},已跳过该校验流程",
                    $originalContent,
                    $corrected
                );
            } else {
                // Collect every DOI match (PREG_SET_ORDER groups by match)
                $matchCount = preg_match_all($doiPattern, $corrected, $allMatches, PREG_SET_ORDER);
                if ($matchCount > 0) {
                    foreach ($allMatches as $matchItem) {
                        $fullDoi = strtolower($matchItem[1]) . ':' .
$matchItem[2]; // full DOI with the prefix lower-cased, e.g. "doi:10.1017/abc"
                        $doiCore = $matchItem[2]; // DOI without the prefix, used to build the resolver URL
                        // Each distinct DOI is checked only once
                        if (isset($checkedDois[$fullDoi])) {
                            continue;
                        }
                        $checkedDois[$fullDoi] = true;
                        // Probe the resolver
                        $isAccessible = $this->testDoiAccessibility($doiCore);
                        // Build the status message
                        if ($isAccessible) {
                            $errorDesc = "DOI「{$fullDoi}」格式规范,且链接可正常访问";
                        } else {
                            $errorDesc = "DOI「{$fullDoi}」格式规范,但链接无法访问(可能无效或网络问题)";
                        }
                        // Record the outcome; the DOI text itself is never rewritten
                        $errors[] = $this->createError(
                            $fullDoi,
                            $fullDoi, // revised text equals the original
                            $errorDesc,
                            $originalContent,
                            $corrected
                        );
                    }
                } else {
                    // No DOI found: record an informational entry
                    $errors[] = $this->createError(
                        '未匹配到DOI',
                        '无修正',
                        '文本中未发现符合标准格式的DOI(如doi:10.1017/abc、DOI: 10.1038/nature12345)',
                        $originalContent,
                        $corrected
                    );
                }
            }
        } catch (Exception $e) {
            $errors[] = $this->createError(
                'DOI校验全局异常',
                '已回滚原始内容',
                "DOI校验出错:{$e->getMessage()}(行号:{$e->getLine()}),已恢复原始输入",
                $originalContent,
                $originalContent
            );
            $corrected = $originalContent;
        }
        $this->handleErrors($errors);
        return $corrected;
    }

    /**
     * Test whether a DOI resolves through the public doi.org resolver.
     *
     * Issues one GET request to https://doi.org/<encoded DOI>, follows
     * redirects, and reports success when the final HTTP status is in the
     * 200-399 range. Transport failures (DNS, timeout, TLS) leave the status
     * at 0 and therefore yield false.
     *
     * @param string $doiCore DOI without the "doi:" prefix (e.g. "10.1017/abc").
     * @return bool true when the final status is 2xx/3xx, false otherwise
     *              (including empty input and cURL initialisation failure).
     */
    private function testDoiAccessibility($doiCore) {
        $doiCore = is_string($doiCore) ? trim($doiCore) : '';
        if ($doiCore === '') {
            return false;
        }

        // Percent-encode each path segment but keep the "/" separators, so
        // DOIs containing characters such as "#", "?", "<" or spaces still
        // form a valid resolver URL.
        $encodedDoi = implode('/', array_map('rawurlencode', explode('/', $doiCore)));
        $doiUrl = 'https://doi.org/' . $encodedDoi;

        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        // TLS verification is left at cURL's secure defaults. The DOI comes
        // from document text, so redirects are capped and limited to
        // http/https.
        curl_setopt_array($ch, [
            CURLOPT_URL => $doiUrl,
            CURLOPT_RETURNTRANSFER => true,   // keep the response out of the output
            CURLOPT_CONNECTTIMEOUT => 5,
            CURLOPT_TIMEOUT => 15,            // overall cap to avoid long blocking
            CURLOPT_FOLLOWLOCATION => true,   // doi.org answers with a redirect to the publisher
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        ]);
        curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        return $httpCode >= 200 && $httpCode < 400;
    }
}
?>