From 4704b61448d6603c91cee058ced40bd309980a15 Mon Sep 17 00:00:00 2001 From: chengxl Date: Sun, 18 Jan 2026 17:07:57 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A4=84=E7=90=86=E6=AD=A3=E6=96=87=E5=86=85?= =?UTF-8?q?=E5=AE=B9=E8=A1=A8=E6=A0=BC/=E5=9B=BE=E7=89=87=E7=9B=B8?= =?UTF-8?q?=E5=85=B3=E8=81=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- application/common/ArticleMain.php | 13 + application/common/FigureTagProcessor.php | 292 ++++++++++++++++++++ application/common/TableTagProcessor.php | 316 ++++++++++++++++++++++ 3 files changed, 621 insertions(+) create mode 100644 application/common/ArticleMain.php create mode 100644 application/common/FigureTagProcessor.php create mode 100644 application/common/TableTagProcessor.php diff --git a/application/common/ArticleMain.php b/application/common/ArticleMain.php new file mode 100644 index 0000000..6d52449 --- /dev/null +++ b/application/common/ArticleMain.php @@ -0,0 +1,13 @@ +状态码, 'data'=>处理后文本] + * status: 2-空输入, 4-无匹配, 5-处理异常, 1-处理成功 + */ + public function dealFigureStr($html = '') { + // 1. 基础输入校验 + if (!is_string($html) || trim($html) === '') { + return ['status' => 2, 'data' => '']; + } + // 2. 超大字符串拦截 + if (strlen($html) > self::MAX_HTML_LENGTH) { + return ['status' => 4, 'data' => $html]; + } + + $originalHtml = $html; + $hasReplace = false; + + try { + // 3. 合并嵌套样式标签 + $mergedHtml = $this->mergeFragmentStyleTags($html); + // 4. 提取纯文本(用于匹配Figure) + $plainText = preg_replace('/<[^>]+>/', ' ', $mergedHtml); + $plainText = preg_replace('/\s+/', ' ', trim($plainText)); + + // 5. 提取所有匹配的Figure数字 + $allMatches = $this->extractAllFigureMatches($plainText); + if (empty($allMatches)) { + return ['status' => 4, 'data' => $originalHtml]; + } + + // 6. 替换为myfigure标签 + $html = $this->replaceFigureWithTag($html, $allMatches, $hasReplace); + + // 7. 清理冗余内容(仅替换成功后执行) + if ($hasReplace) { + $html = $this->cleanRedundantStyles($html); + $html = $this->cleanRedundantPunctuation($html); + $html = $this->cleanUnclosedTags($html); + $html = $this->optimizeFormat($html); + } + + } catch (\Throwable $e) { + // 8. 异常处理(记录详细日志) + $errorMsg = sprintf( + '[%s] FigureTagProcessor-dealFigureStr 异常:%s | 文件:%s | 行:%d | 入参MD5:%s | 正则错误:%s', + date('Y-m-d H:i:s'), + $e->getMessage(), + $e->getFile(), + $e->getLine(), + md5($originalHtml), + preg_last_error() ? preg_last_error_msg() : '无' + ); + error_log($errorMsg); + return ['status' => 5, 'data' => $originalHtml]; + } + + return [ + 'status' => $hasReplace ? 1 : 4, + 'data' => $hasReplace ? $html : $originalHtml + ]; + } + + /** + * 合并嵌套的样式标签(如aaabbb → aaa bbb) + * @param string $html + * @return string + */ + private function mergeFragmentStyleTags($html) { + foreach (self::STYLE_TAGS as $tag) { + $pattern = '/(?:<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>(?:\s*<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>/is'; + while (@preg_match($pattern, $html)) { // 抑制正则警告 + $html = preg_replace_callback($pattern, function($matches) { + return trim($matches[1]) . ' ' . trim($matches[2]); + }, $html); + } + } + + // 清理括号内的冗余标点/标签 + $html = preg_replace('/(\(.*?\d+)(?:\s*<[^>]+>)*\s*\.*\s*(?:<[^>]+>)*(\s*.*?\))/is', '$1$2', $html); + $html = preg_replace('/\(\s+/', '(', $html); + $html = preg_replace('/\s+\)/', ')', $html); + return $html; + } + + /** + * 从纯文本中提取所有Figure数字(兼容括号/标点/空格) + * @param string $plainText + * @return array + */ + private function extractAllFigureMatches($plainText) { + $allMatches = []; + $processedNums = []; + + // 匹配带括号的Figure(如 (Figure 1.)) + $pattern1 = '/\(Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\)\s*([\.,:]{0,1})/iu'; + if (@preg_match_all($pattern1, $plainText, $matchesFull, PREG_SET_ORDER)) { + foreach ($matchesFull as $match) { + $num = $match[1]; + if (!ctype_digit($num) || in_array($num, $processedNums)) continue; + $processedNums[] = $num; + $allMatches[$num] = [ + 'hasOuterBracket' => true, + 'validPunct' => $match[2] ?? '', + 'content' => "Figure {$num}" + ]; + } + } + + // 匹配无括号的Figure(如 Figure 1.) + $pattern2 = '/Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\s*([\.,:]{0,1})/iu'; + if (@preg_match_all($pattern2, $plainText, $matchesOther, PREG_SET_ORDER)) { + foreach ($matchesOther as $match) { + $num = $match[1]; + if (!ctype_digit($num) || in_array($num, $processedNums)) continue; + $processedNums[] = $num; + $allMatches[$num] = [ + 'hasOuterBracket' => false, + 'validPunct' => $match[2] ?? '', + 'content' => "Figure {$num}" + ]; + } + } + + krsort($allMatches); + return $allMatches; + } + + /** + * 将匹配的Figure替换为myfigure标签(优化标签格式) + * @param string $html + * @param array $allMatches + * @param bool $hasReplace + * @return string + */ + private function replaceFigureWithTag($html, $allMatches, &$hasReplace) { + foreach ($allMatches as $num => $info) { + $innerContent = $info['hasOuterBracket'] + ? "({$info['content']})" + : $info['content']; + + // 核心修改:规范myfigure标签格式(去掉属性值空格、加双引号) + // 最终生成:Figure 1 + $targetTag = "{$innerContent}"; + if (!empty($info['validPunct']) && !$info['hasOuterBracket']) { + $targetTag .= $info['validPunct']; + } + + $patternSuffix = '(?!\p{L}|\s+\p{L})'; + $pattern = $info['hasOuterBracket'] + ? '/\(\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*\)/iu' + : '/\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*([\.,:]{0,1})/iu'; + + // 执行替换(最多替换1次,避免重复) + $html = @preg_replace($pattern, $targetTag, $html, 1, $count); + if ($count > 0) { + $hasReplace = true; + error_log("[FigureTagProcessor] 替换成功 - ID:{$num} 括号:".($info['hasOuterBracket']?'是':'否')); + } + } + return $html; + } + + /** + * 清理myfigure标签周围的冗余样式标签(适配新标签格式) + * @param string $html + * @return string + */ + private function cleanRedundantStyles($html) { + foreach (self::STYLE_TAGS as $tag) { + // 修改正则:适配 data-id="数字" 的格式 + $pattern = '/<' . $tag . '>\s*]*)>(.*?)<\/myfigure>([\.,:]{0,1})\s*<\/' . $tag . '>/is'; + $html = @preg_replace($pattern, '$2$3', $html); + } + // 清理孤立的样式闭标签 + $html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html); + return $html; + } + + /** + * 清理myfigure标签后的冗余标点(适配新标签格式) + * @param string $html + * @return string + */ + private function cleanRedundantPunctuation($html) { + // 修改正则:将 data-id = (\d+) 改为 data-id="(\d+)",适配新格式 + $html = preg_replace('/\(Figure \d+\)<\/myfigure>\)\./i', '(Figure $1).', $html); + $html = preg_replace('/<\/myfigure>\)\.([\.,:]{0,1})/', ')$1', $html); + $html = preg_replace('/<\/myfigure>\.\)([\.,:]{0,1})/', ')$1', $html); + $html = preg_replace('/<\/myfigure>([\.,:]){2,}/', '$1', $html); + // 同步修改此处正则的属性格式 + $html = preg_replace('/\((Figure \d+)\s*<\/myfigure>([\.,:]{0,1})/i', + '($2)$3', $html); + return $html; + } + + /** + * 清理孤立的样式标签(优先暴力清理myfigure后标签,再用栈算法兜底) + * @param string $html + * @return string + */ + private function cleanUnclosedTags($html) { + // 第一步:暴力清理myfigure后孤立的样式闭标签 + foreach (self::STYLE_TAGS as $tag) { + $html = @preg_replace('/(<\/myfigure>)\s*<\/' . $tag . '>/i', '$1', $html); + } + + // 第二步:栈算法清理其他孤立标签 + foreach (self::STYLE_TAGS as $tag) { + @preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE); + @preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE); + + $allTags = []; + foreach ($openMatches[0] as $m) { + $allTags[] = [ + 'offset' => $m[1], + 'type' => 'open', + 'content' => $m[0], + 'length' => strlen($m[0]) + ]; + } + foreach ($closeMatches[0] as $m) { + $allTags[] = [ + 'offset' => $m[1], + 'type' => 'close', + 'content' => $m[0], + 'length' => strlen($m[0]) + ]; + } + usort($allTags, function($a, $b) { + return $a['offset'] - $b['offset']; + }); + + $tagStack = []; + $removeOffsets = []; + foreach ($allTags as $t) { + if ($t['type'] == 'open') { + array_push($tagStack, $t); + } else { + if (!empty($tagStack)) { + array_pop($tagStack); + } else { + $removeOffsets[] = [ + 'pos' => $t['offset'], + 'len' => $t['length'], + 'content' => $t['content'] + ]; + } + } + } + foreach ($tagStack as $t) { + $removeOffsets[] = [ + 'pos' => $t['offset'], + 'len' => $t['length'], + 'content' => $t['content'] + ]; + } + + // 倒序删除,避免偏移错乱 + usort($removeOffsets, function($a, $b) { + return $b['pos'] - $a['pos']; + }); + foreach ($removeOffsets as $item) { + if ($item['pos'] >= 0 && $item['pos'] < strlen($html)) { + $html = substr_replace($html, '', $item['pos'], $item['len']); + } + } + } + return $html; + } + + /** + * 优化文本格式(合并多余空格,规范myfigure标签前后空格) + * @param string $html + * @return string + */ + private function optimizeFormat($html) { + $html = preg_replace('/\s{2,}/', ' ', trim($html)); + $html = preg_replace('/<\/myfigure>([A-Za-z0-9])/is', ' $1', $html); + $html = preg_replace('/([a-zA-Z0-9])1001, 2=>1002]) + * @return array ['status'=>状态码, 'data'=>处理后文本] + * status: 2-空输入, 4-无匹配/已处理, 5-处理异常, 1-处理成功 + */ + public function dealTableStr($html = '', $aTableMain = []) { + // 1. 基础输入校验 + if (!is_string($html) || trim($html) === '') { + return ['status' => 2, 'data' => '']; + } + // 2. 超大字符串拦截(防止内存溢出) + if (strlen($html) > self::MAX_HTML_LENGTH) { + $this->logWarning('处理文本超出最大长度限制', ['length' => strlen($html)]); + return ['status' => 4, 'data' => $html]; + } + + // 初始化主键映射数组(过滤非数字键/值,保证数据合法性) + if(!empty($aTableMain)){ + $aTableMainNew = []; + foreach ($aTableMain as $key => $value) { + if (!ctype_digit((string)$key) || !ctype_digit((string)$value)) { + continue; + } + $keyInt = (int)$key; + $aTableMainNew[$keyInt + 1] = $value; + } + $this->aTableMain = $aTableMainNew; + } + + $originalHtml = $html; + $hasReplace = false; + + try { + // 核心:直接在原始HTML中匹配所有符合规则的Table(含嵌套标签) + $html = $this->replaceTableInHtml($html, $hasReplace); + + // 清理冗余内容(仅替换成功后执行,保证输出整洁) + if ($hasReplace) { + $html = $this->cleanRedundantStyles($html); + $html = $this->cleanRedundantPunctuation($html); + $html = $this->cleanUnclosedTags($html); + $html = $this->optimizeFormat($html); + $html = $this->cleanDuplicateNestedTags($html); + } + + } catch (\Throwable $e) { + // 异常兜底:捕获所有异常,记录详细日志,返回原始文本避免业务中断 + $pregError = preg_last_error(); + $pregErrorMsg = $this->getPregErrorMsg($pregError); + $errorMsg = sprintf( + '[%s] TableTagProcessor-dealTableStr 异常:%s | 文件:%s | 行:%d | 入参MD5:%s | 正则错误:%s', + date('Y-m-d H:i:s'), + $e->getMessage(), + $e->getFile(), + $e->getLine(), + md5($originalHtml), + $pregErrorMsg + ); + $this->logError($errorMsg); + return ['status' => 5, 'data' => $originalHtml]; + } + + return [ + 'status' => $hasReplace ? 1 : 4, + 'data' => $html + ]; + } + + /** + * 核心方法:直接在HTML中匹配并替换Table(支持嵌套标签) + * @param string $html + * @param bool $hasReplace 引用传递:标记是否有替换 + * @return string + */ + private function replaceTableInHtml($html, &$hasReplace) { + $styleTagsPattern = implode('|', self::STYLE_TAGS); + $styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*"; // 匹配任意嵌套样式标签 + $styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*"; + + // 规则1:匹配带括号的Table(如 (Table 82)、(Table 1.)) + $pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*\)/iu"; + $html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) { + $num = $matches[1]; + $numInt = intval($num); + $suffix = $matches[2] ?? ''; + + // 校验:纯数字 + 有映射ID + 未被mytable包裹(避免重复替换) + if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) || + $this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) { + return $matches[0]; + } + + $primaryId = $this->aTableMain[$numInt]; + // 核心修改:规范mytable标签格式(属性值加双引号、去掉两侧空格) + $baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}"; + $target = "({$baseTag}{$suffix})"; + + $hasReplace = true; + $this->logInfo("替换带括号Table成功", ['num' => $num, 'primary_id' => $primaryId]); + return $target; + }, $html); + + // 规则2:匹配无括号的Table(如 Table 1、Table 2:Table 3.) + $pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z])/iu"; + $html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) { + $num = $matches[1]; + $numInt = intval($num); + $suffix = $matches[2] ?? ''; + + // 校验:纯数字 + 有映射ID + 未被mytable包裹 + 不是数字+字母组合 + if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) || + $this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) { + return $matches[0]; + } + + $primaryId = $this->aTableMain[$numInt]; + // 核心修改:规范mytable标签格式(属性值加双引号、去掉两侧空格) + $baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}"; + $target = "{$baseTag}{$suffix}"; + + $hasReplace = true; + $this->logInfo("替换无括号Table成功", ['num' => $num, 'primary_id' => $primaryId]); + return $target; + }, $html); + + return $html; + } + + /** + * 清理mytable标签周围的冗余样式标签 + * @param string $html + * @return string + */ + private function cleanRedundantStyles($html) { + foreach (self::STYLE_TAGS as $tag) { + $pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is'; + $html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2$3', $html); + } + // 清理孤立的样式闭标签(避免标签残留) + $html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html); + return $html; + } + + /** + * 清理mytable标签后的冗余标点(保证格式整洁) + * @param string $html + * @return string + */ + private function cleanRedundantPunctuation($html) { + // 核心修改:适配新的mytable标签格式(data-id="数字") + $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1).', $html); + $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', ')$1', $html); + $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', ')$1', $html); + $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '$1', $html); + $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i', + '<'.self::PROCESSED_TAG.' data-id="$1">($2)$3', $html); + return $html; + } + + /** + * 清理孤立的样式标签(栈算法兜底,避免标签不闭合) + * @param string $html + * @return string + */ + private function cleanUnclosedTags($html) { + // 清理mytable后孤立的样式闭标签 + foreach (self::STYLE_TAGS as $tag) { + $html = @preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html); + } + + // 栈算法清理其他孤立标签 + foreach (self::STYLE_TAGS as $tag) { + @preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE); + @preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE); + + $allTags = []; + foreach ($openMatches[0] as $m) { + $allTags[] = ['offset' => $m[1], 'type' => 'open', 'content' => $m[0], 'length' => strlen($m[0])]; + } + foreach ($closeMatches[0] as $m) { + $allTags[] = ['offset' => $m[1], 'type' => 'close', 'content' => $m[0], 'length' => strlen($m[0])]; + } + usort($allTags, function($a, $b) { + return $a['offset'] - $b['offset']; + }); + + $tagStack = []; + $removeOffsets = []; + foreach ($allTags as $t) { + if ($t['type'] == 'open') { + array_push($tagStack, $t); + } else { + if (!empty($tagStack)) { + array_pop($tagStack); + } else { + $removeOffsets[] = $t; + } + } + } + foreach ($tagStack as $t) { + $removeOffsets[] = $t; + } + + // 倒序删除,避免偏移错乱 + usort($removeOffsets, function($a, $b) { + return $b['offset'] - $a['offset']; + }); + foreach ($removeOffsets as $item) { + if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) { + $html = substr_replace($html, '', $item['offset'], $item['length']); + } + } + } + return $html; + } + + /** + * 优化文本格式(合并多余空格,规范标签前后空格) + * @param string $html + * @return string + */ + private function optimizeFormat($html) { + $html = preg_replace('/\s{2,}/', ' ', trim($html)); + $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', ' $1', $html); + $html = preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html); + return $html; + } + + /** + * 清理重复嵌套的mytable标签(兜底方案) + * @param string $html + * @return string + */ + private function cleanDuplicateNestedTags($html) { + $pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is'; + $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2', $html); + return $html; + } + + /** + * 判断指定Table内容是否被mytable标签包裹 + * @param string $content 待检查内容 + * @param string $tableText Table文本(如 "Table 1") + * @return bool + */ + private function isMatchPositionHasMyTableTag($content, $tableText) { + $escapedText = preg_quote($tableText, '/'); + $pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is'; + return @preg_match($pattern, $content) === 1; + } + + /** + * 获取正则错误信息(便于调试) + * @param int $pregError 正则错误码 + * @return string + */ + private function getPregErrorMsg($pregError) { + $errorCodes = [ + PREG_INTERNAL_ERROR => '内部错误', + PREG_BACKTRACK_LIMIT_ERROR => '回溯限制超出', + PREG_RECURSION_LIMIT_ERROR => '递归限制超出', + PREG_BAD_UTF8_ERROR => '无效UTF-8字符', + PREG_BAD_UTF8_OFFSET_ERROR => 'UTF-8偏移量无效', + PREG_JIT_STACKLIMIT_ERROR => 'JIT栈限制超出' + ]; + return isset($errorCodes[$pregError]) ? $errorCodes[$pregError] : "未知错误({$pregError})"; + } + + /** + * 记录错误日志(生产环境可对接日志系统) + * @param string $msg + * @param array $context + */ + private function logError($msg, $context = []) { + error_log(json_encode(['level' => 'error', 'msg' => $msg, 'context' => $context, 'time' => date('Y-m-d H:i:s')])); + } + + /** + * 记录警告日志 + * @param string $msg + * @param array $context + */ + private function logWarning($msg, $context = []) { + error_log(json_encode(['level' => 'warning', 'msg' => $msg, 'context' => $context, 'time' => date('Y-m-d H:i:s')])); + } + + /** + * 记录信息日志 + * @param string $msg + * @param array $context + */ + private function logInfo($msg, $context = []) { + error_log(json_encode(['level' => 'info', 'msg' => $msg, 'context' => $context, 'time' => date('Y-m-d H:i:s')])); + } +} \ No newline at end of file