From 4d0cec198fa78f00462c08fd0544e8abba830582 Mon Sep 17 00:00:00 2001 From: chengxl Date: Wed, 21 Jan 2026 13:13:19 +0800 Subject: [PATCH] =?UTF-8?q?latex=20=E4=BB=A3=E7=A0=81=E8=B0=83=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- application/common/FigureTagProcessor.php | 104 ++++++++++++++++------ 1 file changed, 76 insertions(+), 28 deletions(-) diff --git a/application/common/FigureTagProcessor.php b/application/common/FigureTagProcessor.php index 7956f88..ee82f1a 100644 --- a/application/common/FigureTagProcessor.php +++ b/application/common/FigureTagProcessor.php @@ -1,5 +1,6 @@ 2, 'data' => '']; } + //超长文本保护 if (strlen($html) > self::MAX_HTML_LENGTH) { return ['status' => 4, 'data' => $html]; } + //编码处理 if (!mb_check_encoding($html, 'UTF-8')) { $html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1'); } + //初始化映射数组(过滤非数字键值) $this->initImageMap($aImageMain); //原始内容 $originalHtml = $html; $hasReplace = false; + try { //只要包含数字+字母/数字后缀,直接返回原内容(核心修复) if ($this->hasFigureSuffix($html)) { return ['status' => 4, 'data' => $html]; } + //合并拆分标签的Figure+数字 $html = $this->preprocessSplitTags($html); - //替换 + + //替换(核心修复:适配样式标签+后缀标点场景) $html = $this->replaceFigureInHtml($html, $hasReplace); + //清理冗余样式/标签 if ($hasReplace) { $html = $this->cleanRedundantStyles($html); @@ -69,6 +78,7 @@ class FigureTagProcessor{ 'data' => $html ]; } + /** * 全局检测是否包含Figure数字+字母/数字后缀 * 覆盖所有拆分/嵌套/无标签场景,无论是否有空白 @@ -86,8 +96,11 @@ class FigureTagProcessor{ // 正则3:嵌套标签场景(4B / 4123) $pattern3 = "/figure\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*\d+[a-zA-Z0-9]\s*(?:<\/(?:{$styleTagsPattern})>)/iu"; - return preg_match($pattern1, $html) || preg_match($pattern2, $html) || preg_match($pattern3, $html); + + // 加@抑制正则警告 + return @preg_match($pattern1, $html) || @preg_match($pattern2, $html) || @preg_match($pattern3, $html); } + /** * 初始化Figure数字映射数组 * @param array $aImageMain 原始映射数组 @@ -106,6 +119,7 @@ class FigureTagProcessor{ } $this->aImageMain = $imageMap; } + /** * 合并所有拆分标签的Figure+数字(含空白样式标签) * @param string $html 待处理HTML @@ -116,19 +130,22 @@ class FigureTagProcessor{ // 正则1:匹配基础拆分标签的Figure+数字 $pattern = "/(figure)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu"; - $html = preg_replace_callback($pattern, function($matches) { + $html = @preg_replace_callback($pattern, function($matches) { return $matches[1] . ' ' . $matches[2]; }, $html); // 正则2:匹配多轮拆分标签的Figure+数字(含空白) $pattern2 = "/(figure)(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\s*)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu"; - $html = preg_replace_callback($pattern2, function($matches) { + $html = @preg_replace_callback($pattern2, function($matches) { return $matches[1] . $matches[2] . $matches[3]; }, $html); + return $html; } + /** * 核心替换逻辑:将纯数字Figure替换为myfigure标签 + * 修复:适配样式标签包裹 + 后缀标点场景(如 Figure 2.) * @param string $html 待处理HTML * @param bool $hasReplace 是否发生替换(引用传递) * @return string @@ -138,13 +155,14 @@ class FigureTagProcessor{ $styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*"; $styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*"; - // 正则1:匹配括号内的纯数字Figure(如 (Figure 2)、(Figure 3)) - // $pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iu"; - $pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD"; - $html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) { + // 正则1:匹配括号内的纯数字Figure(如 (Figure 2)、(Figure 3)、(Figure 3).) + $pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+){$styleTagsCloseRegex}\s*([\.,:]{0,1})\s*\)\s*([\.,:]{0,1})/iuD"; + $html = @preg_replace_callback($pattern1, function($matches) use (&$hasReplace) { $num = $matches[1]; $numInt = (int)$num; - $suffix = $matches[2] ?? ''; + $suffix1 = $matches[2] ?? ''; + $suffix2 = $matches[3] ?? ''; + $suffix = $suffix1 . $suffix2; // 过滤条件:非数字、无映射、已处理过的标签 if (!ctype_digit($num) || !isset($this->aImageMain[$numInt]) || @@ -161,10 +179,9 @@ class FigureTagProcessor{ return $target; }, $html); - // 正则2:匹配无括号的纯数字Figure(如 Figure 2、Figure 3:) - // $pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z0-9])/iu"; - $pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD"; - $html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) { + // 正则2:匹配无括号的纯数字Figure(核心修复:适配 Figure 2. 场景) + $pattern2 = "/{$styleTagsRegex}figure\s*(\d+){$styleTagsCloseRegex}\s*([\.,:]{0,1})(?![a-zA-Z0-9])/iuD"; + $html = @preg_replace_callback($pattern2, function($matches) use (&$hasReplace) { $num = $matches[1]; $numInt = (int)$num; $suffix = $matches[2] ?? ''; @@ -186,6 +203,7 @@ class FigureTagProcessor{ return $html; } + /** * 检测当前匹配内容是否已包含myfigure标签(避免重复替换) * @param string $content 匹配的文本片段 @@ -195,8 +213,9 @@ class FigureTagProcessor{ private function isMatchPositionHasMyFigureTag($content, $figureText){ $escapedText = preg_quote($figureText, '/'); $pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is'; - return (bool)preg_match($pattern, $content); + return (bool)@preg_match($pattern, $content); } + /** * 清理myfigure标签周围的冗余样式标签 * @param string $html 待处理HTML @@ -205,14 +224,15 @@ class FigureTagProcessor{ private function cleanRedundantStyles($html){ foreach (self::STYLE_TAGS as $tag) { $pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is'; - $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2$3', $html); + $html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2$3', $html); } // 清理无匹配的闭合样式标签 - $html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html); + $html = @preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html); return $html; } + /** * 清理myfigure标签周围的冗余标点 * @param string $html 待处理HTML @@ -220,17 +240,43 @@ class FigureTagProcessor{ */ private function cleanRedundantPunctuation($html){ // 修复括号+标点的冗余格式 - $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Figure \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', + $html = @preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Figure \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', '<'.self::PROCESSED_TAG.' data-id="$1">(Figure $1).', $html); - $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', ')$1', $html); - $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', ')$1', $html); + $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', ')$1', $html); + $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', ')$1', $html); // 清理重复标点 - $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '$1', $html); + $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '$1', $html); // 修复括号内的标签冗余 - $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Figure \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i', + $html = @preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Figure \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i', '<'.self::PROCESSED_TAG.' data-id="$1">($2)$3', $html); + + $html = $this->cleanExtraParentheses($html); + + return $html; + } + + /** + * 清理文本中多余的成对括号(仅处理myfigure标签相关的括号) + * @param string $html 待处理文本 + * @return string + */ + private function cleanExtraParentheses($html){ + // 匹配myfigure标签周围的括号区域 + $pattern = '/(\()*(<'.self::PROCESSED_TAG.'[^>]*>.*?<\/'.self::PROCESSED_TAG.'>)(\))*/is'; + + $html = @preg_replace_callback($pattern, function($matches) { + $tagContent = $matches[2]; + $leftParen = $matches[1] ?? ''; + $rightParen = $matches[3] ?? ''; + + // 只保留1个左括号和1个右括号(无论原始有多少) + $newLeft = $leftParen ? '(' : ''; + $newRight = $rightParen ? ')' : ''; + + return $newLeft . $tagContent . $newRight; + }, $html); return $html; } @@ -242,11 +288,11 @@ class FigureTagProcessor{ private function cleanUnclosedTags($html){ foreach (self::STYLE_TAGS as $tag) { // 清理myfigure标签后的冗余闭合标签 - $html = preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html); + $html = @preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html); // 定位所有该标签的开闭标签位置 - preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE); - preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE); + @preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE); + @preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE); $allTags = []; // 收集开标签 @@ -308,6 +354,7 @@ class FigureTagProcessor{ return $html; } + /** * 优化文本格式(清理多余空格) * @param string $html 待处理HTML @@ -315,14 +362,15 @@ class FigureTagProcessor{ */ private function optimizeFormat($html){ // 清理连续空格 - $html = preg_replace('/\s{2,}/', ' ', trim($html)); + $html = @preg_replace('/\s{2,}/', ' ', trim($html)); // 标签后紧跟字母/数字时加空格 - $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', ' $1', $html); + $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', ' $1', $html); // 字母/数字紧跟标签前时加空格 - $html = preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html); + $html = @preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html); return $html; } + /** * 清理嵌套的myfigure标签(避免重复嵌套) * @param string $html 待处理HTML @@ -330,7 +378,7 @@ class FigureTagProcessor{ */ private function cleanDuplicateNestedTags($html){ $pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is'; - $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2', $html); + $html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2', $html); return $html; }