处理正文内容表格/图片相关联

2026-01-19 13:52:09 +08:00
parent b217ab03fd
commit c2ee3b170f
2 changed files with 445 additions and 269 deletions
--- a/application/common/FigureTagProcessor.php
+++ b/application/common/FigureTagProcessor.php
@@ -1,52 +1,63 @@
 <?php
 namespace app\common;
-class FigureTagProcessor {
-    // 可配置的样式标签列表（解耦）
-    const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue'];
-    // 最大处理字符串长度（避免内存溢出）
-    const MAX_HTML_LENGTH = 100000;
-
+/**
+ * 功能：精准匹配并替换Figure相关格式为myfigure标签
+ * 支持格式：figure 数字、(figure 数字)、figure 数字:/figure 数字.（含嵌套/拆分标签）
+ * 跳过已被myfigure包裹的Figure（含后缀）
+ * 跳过Figure 数字+字母/数字后缀（含拆分标签场景，无论是否有空白）
+ * 正常处理Figure 数字+空白/样式标签场景
+ */
+class FigureTagProcessor{
+    //支持的样式标签列表
+    private const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue'];
+    //HTML文本最大处理长度（防止内存溢出）
+    private const MAX_HTML_LENGTH = 100000;
+    //替换后的目标标签名
+    private const PROCESSED_TAG = 'myfigure';
+    //Figure数字与对应ID的映射数组
+    private $aImageMain = [];
    /**
-     * 处理Figure文本，替换为myfigure标签并清理冗余内容
+     * Main entry point: replace Figure references in the HTML with myfigure tags.
     * @param string $html 待处理的HTML文本
-     * @return array ['status'=>状态码, 'data'=>处理后文本]
-     *         status: 2-空输入, 4-无匹配, 5-处理异常, 1-处理成功
+     * @param array $aImageMain Map of figure number => ID written into the data-id attribute
+     * @return array ['status' => int status code, 'data' => string resulting text]
+     *         status: 2 = empty or non-string input, 4 = nothing replaced, 1 = replaced, 5 = processing error
     */
-    public function dealFigureStr($html = '') {
-        //验证
-        if (!is_string($html) || trim($html) === '') {
+    public function dealFigureStr($html = '', $aImageMain = []){
+        // Check the type before trimming: trim() on a non-string raises a warning (PHP 7) or TypeError (PHP 8).
+        $html = is_string($html) ? trim($html) : '';
+        if ($html === '') {
            return ['status' => 2, 'data' => ''];
        }
-        //超大字符串拦截
+        // Oversized input (byte length) is handed back untouched.
        if (strlen($html) > self::MAX_HTML_LENGTH) {
            return ['status' => 4, 'data' => $html];
        }
-
+        // Convert input that is not valid UTF-8, since the patterns below use the /u modifier.
+        if (!mb_check_encoding($html, 'UTF-8')) {
+            $html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1');
+        }
+        // Store the mapping, keeping only pairs whose key and value are all digits.
+        $this->initImageMap($aImageMain);
+        // Snapshot of the trimmed, UTF-8 input.
        $originalHtml = $html;
        $hasReplace = false;
-
        try {
-            //合并嵌套样式标签
-            $mergedHtml = $this->mergeFragmentStyleTags($html);
-            //提取纯文本（用于匹配Figure）
-            $plainText = preg_replace('/<[^>]+>/', ' ', $mergedHtml);
-            $plainText = preg_replace('/\s+/', ' ', trim($plainText));
-
-            //提取所有匹配的Figure数字
-            $allMatches = $this->extractAllFigureMatches($plainText);
-            if (empty($allMatches)) {
-                return ['status' => 4, 'data' => $originalHtml];
+            //只要包含数字+字母/数字后缀，直接返回原内容（核心修复）
+            if ($this->hasFigureSuffix($html)) {
+                return ['status' => 4, 'data' => $html];
            }
-
-            //替换为myfigure标签
-            $html = $this->replaceFigureWithTag($html, $allMatches, $hasReplace);
-
-            //清理冗余内容（仅替换成功后执行）
+            //合并拆分标签的Figure+数字
+            $html = $this->preprocessSplitTags($html);
+            //替换
+            $html = $this->replaceFigureInHtml($html, $hasReplace);
+            //清理冗余样式/标签
            if ($hasReplace) {
                $html = $this->cleanRedundantStyles($html);
                $html = $this->cleanRedundantPunctuation($html);
                $html = $this->cleanUnclosedTags($html);
                $html = $this->optimizeFormat($html);
+                $html = $this->cleanDuplicateNestedTags($html);
            }

        } catch (\Throwable $e) {
@@ -55,220 +66,272 @@ class FigureTagProcessor {

        return [
            'status' => $hasReplace ? 1 : 4,
-            'data' => $hasReplace ? $html : $originalHtml
+            'data' => $html
        ];
    }
-
    /**
-     * 合并嵌套的样式标签
-     * @param string $html
-     * @return string
+     * Detect a "Figure <number><suffix>" reference (e.g. "Figure 4B") anywhere in the HTML.
+     * A letter suffix counts in plain text or inside one tag; a letter or digit counts when split across tags.
+     * @param string $html HTML to inspect
+     * @return bool true when such a reference exists
     */
-    private function mergeFragmentStyleTags($html) {
-        foreach (self::STYLE_TAGS as $tag) {
-            $pattern = '/(?:<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>(?:\s*<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>/is';
-            while (@preg_match($pattern, $html)) { // 抑制正则警告
-                $html = preg_replace_callback($pattern, function($matches) {
-                    return trim($matches[1]) . ' ' . trim($matches[2]);
-                }, $html);
+    private function hasFigureSuffix($html){
+        $styleTagsPattern = implode('|', self::STYLE_TAGS);
+        // Gap between split tags: whitespace and/or "&nbsp;" entities ("[\s|&nbsp;]" was a character class).
+        $gap = '(?:\s|&nbsp;)*';
+        // Pattern 1, no tags: digits glued to a letter ("Figure 4B"). "\d+[0-9]" would also hit "Figure 12".
+        $pattern1 = "/figure\s*\d+[a-zA-Z]/iu";
+        // Pattern 2, split tags: <b>4</b><b>B</b>, <b>4</b> <b>B</b>, <b>4</b>&nbsp;<b>B</b> (a split digit is ambiguous too).
+        $pattern2 = "/figure\s*(?:<\/(?:{$styleTagsPattern})>){$gap}(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)\s*(?:<\/(?:{$styleTagsPattern})>){$gap}(?:<(?:{$styleTagsPattern})[^>]*>)\s*([a-zA-Z0-9])/iu";
+        // Pattern 3, one tag wrapping number and letter: <b>4B</b>.
+        $pattern3 = "/figure\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*\d+[a-zA-Z]\s*(?:<\/(?:{$styleTagsPattern})>)/iu";
+        // Any hit makes dealFigureStr() return the document unchanged.
+        return preg_match($pattern1, $html) || preg_match($pattern2, $html) || preg_match($pattern3, $html);
+    }
+    /**
+     * Store the figure-number => ID mapping on the instance, dropping unusable entries.
+     * @param array $aImageMain Raw mapping supplied by the caller; a non-array is treated as empty
+     * @return void
+     */
+    private function initImageMap($aImageMain){
+        if (!is_array($aImageMain)) {
+            $aImageMain = [];
+        }
+        $imageMap = [];
+        foreach ($aImageMain as $key => $value) {
+            // Keep a pair only when key and value both consist solely of digits; both are stored as int.
+            if (ctype_digit((string)$key) && ctype_digit((string)$value)) {
+                $imageMap[(int)$key] = (int)$value;
            }
        }
-
-        // 清理括号内的冗余标点/标签
-        $html = preg_replace('/(\(.*?\d+)(?:\s*<[^>]+>)*\s*\.*\s*(?:<[^>]+>)*(\s*.*?\))/is', '$1$2', $html);
-        $html = preg_replace('/\(\s+/', '(', $html);
-        $html = preg_replace('/\s+\)/', ')', $html);
+        $this->aImageMain = $imageMap;
+    }
+    /**
+     * Join "Figure" and its number when style tags split them, producing "Figure <number>".
+     * @param string $html HTML to normalise
+     * @return string HTML with the split references merged (unchanged if a regex call fails)
+     */
+    private function preprocessSplitTags($html){
+        $styleTagsPattern = implode('|', self::STYLE_TAGS);
+        // preg_replace_callback() returns null on a PCRE error; fall back to the input in that case.
+        // Pattern 1: close tag + open tag between the word and the number, e.g. Figure</b><b>4
+        $pattern = "/(figure)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
+        $html = preg_replace_callback($pattern, function($matches) {
+            return $matches[1] . ' ' . $matches[2];
+        }, $html) ?? $html;
+        // Pattern 2: an extra whitespace-only styled run in between, e.g. Figure</b><b> </b><b>4
+        // Group 2 is always empty (the greedy \s* before it takes the whitespace), so a space is emitted explicitly.
+        $pattern2 = "/(figure)(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\s*)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
+        $html = preg_replace_callback($pattern2, function($matches) {
+            return $matches[1] . ' ' . $matches[3];
+        }, $html) ?? $html;
        return $html;
    }
-
    /**
-     * 从纯文本中提取所有Figure数字（兼容括号/标点/空格）
-     * @param string $plainText
-     * @return array
-     */
-    private function extractAllFigureMatches($plainText) {
-        $allMatches = [];
-        $processedNums = [];
-
-        // 匹配带括号的Figure（如 (Figure 1.)）
-        $pattern1 = '/\(Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\)\s*([\.,:]{0,1})/iu';
-        if (@preg_match_all($pattern1, $plainText, $matchesFull, PREG_SET_ORDER)) {
-            foreach ($matchesFull as $match) {
-                $num = $match[1];
-                if (!ctype_digit($num) || in_array($num, $processedNums)) continue;
-                $processedNums[] = $num;
-                $allMatches[$num] = [
-                    'hasOuterBracket' => true,
-                    'validPunct' => $match[2] ?? '',
-                    'content' => "Figure {$num}"
-                ];
-            }
-        }
-
-        // 匹配无括号的Figure（如 Figure 1.）
-        $pattern2 = '/Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\s*([\.,:]{0,1})/iu';
-        if (@preg_match_all($pattern2, $plainText, $matchesOther, PREG_SET_ORDER)) {
-            foreach ($matchesOther as $match) {
-                $num = $match[1];
-                if (!ctype_digit($num) || in_array($num, $processedNums)) continue;
-                $processedNums[] = $num;
-                $allMatches[$num] = [
-                    'hasOuterBracket' => false,
-                    'validPunct' => $match[2] ?? '',
-                    'content' => "Figure {$num}"
-                ];
-            }
-        }
-
-        krsort($allMatches);
-        return $allMatches;
-    }
-
-    /**
-     * 将匹配的Figure替换为myfigure标签（优化标签格式）
-     * @param string $html
-     * @param array $allMatches
-     * @param bool $hasReplace
+     * 核心替换逻辑：将纯数字Figure替换为myfigure标签
+     * @param string $html 待处理HTML
+     * @param bool $hasReplace 是否发生替换（引用传递）
     * @return string
     */
-    private function replaceFigureWithTag($html, $allMatches, &$hasReplace) {
-        foreach ($allMatches as $num => $info) {
-            $innerContent = $info['hasOuterBracket'] 
-                ? "({$info['content']})" 
-                : $info['content'];
+    private function replaceFigureInHtml($html, &$hasReplace){
+        $styleTagsPattern = implode('|', self::STYLE_TAGS);
+        $styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*";
+        $styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*";
+
+        // Pattern 1: a whole parenthesised reference, e.g. (Figure 2), (<b>Figure 3</b>), (Figure 4.)
+        // The closing ")" must be part of the match, because the replacement re-emits both parentheses.
+        $pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*\)/iu";
+        $html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
+            $num = $matches[1];
+            $numInt = (int)$num;
+            $suffix = $matches[2] ?? '';
+
+            // Leave the text as-is when the number is not all digits, has no mapped ID, or is already wrapped.
+            if (!ctype_digit($num) || !isset($this->aImageMain[$numInt]) ||
+                $this->isMatchPositionHasMyFigureTag($matches[0], "Figure {$num}")) {
+                return $matches[0];
+            }
+
+            // Build "(<myfigure data-id="ID">Figure N</myfigure>suffix)"; matched style tags are dropped.
+            $primaryId = $this->aImageMain[$numInt];
+            $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Figure {$num}</" . self::PROCESSED_TAG . ">";
+            $target = "({$baseTag}{$suffix})";
            
-            //<myfigure data-id="1">Figure 1</myfigure>
-            $targetTag = "<myfigure data-id=\"{$num}\">{$innerContent}</myfigure>";
-            if (!empty($info['validPunct']) && !$info['hasOuterBracket']) {
-                $targetTag .= $info['validPunct'];
+            $hasReplace = true;
+            return $target;
+        }, $html) ?? $html;
+
+        // Pattern 2: a reference outside parentheses, e.g. Figure 2, <i>Figure 3</i>: (not followed by a letter or digit).
+        // It also re-matches "Figure N" already wrapped by pattern 1; dealFigureStr() collapses that nesting afterwards.
+        $pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
+        $html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
+            $num = $matches[1];
+            $numInt = (int)$num;
+            $suffix = $matches[2] ?? '';
+
+            // Leave the text as-is when the number is not all digits, has no mapped ID, or is already wrapped.
+            if (!ctype_digit($num) || !isset($this->aImageMain[$numInt]) ||
+                $this->isMatchPositionHasMyFigureTag($matches[0], "Figure {$num}")) {
+                return $matches[0];
            }

-            $patternSuffix = '(?!\p{L}|\s+\p{L})';
-            $pattern = $info['hasOuterBracket']
-                ? '/\(\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*\)/iu'
-                : '/\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*([\.,:]{0,1})/iu';
+            // Build "<myfigure data-id="ID">Figure N</myfigure>suffix"; the punctuation stays outside the tag.
+            $primaryId = $this->aImageMain[$numInt];
+            $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Figure {$num}</" . self::PROCESSED_TAG . ">";
+            $target = "{$baseTag}{$suffix}";
+            // Tell the caller (by reference) that at least one replacement happened.
+            $hasReplace = true;
+            return $target;
+        }, $html) ?? $html;

-            //执行替换（最多替换1次，避免重复）
-            $html = @preg_replace($pattern, $targetTag, $html, 1, $count);
-            if ($count > 0) {
-                $hasReplace = true;
-                error_log("[FigureTagProcessor] 替换成功 - ID:{$num} 括号:".($info['hasOuterBracket']?'是':'否'));
-            }
-        }
        return $html;
    }
-
    /**
-     * 清理myfigure标签周围的冗余样式标签（适配新标签格式）
-     * @param string $html
+     * Tell whether a text fragment already contains the given Figure text wrapped in a myfigure tag.
+     * @param string $content Fragment to inspect (callers in this class pass the regex match text)
+     * @param string $figureText Figure text to look for, e.g. "Figure 2"; matched case-insensitively
+     * @return bool true when "<myfigure ...>Figure N</myfigure>" occurs in $content
+     */
+    private function isMatchPositionHasMyFigureTag($content, $figureText){
+        $escapedText = preg_quote($figureText, '/');
+        $pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is';
+        return (bool)preg_match($pattern, $content);
+    }
+    /**
+     * 清理myfigure标签周围的冗余样式标签
+     * @param string $html 待处理HTML
     * @return string
     */
-    private function cleanRedundantStyles($html) {
+    private function cleanRedundantStyles($html){
        foreach (self::STYLE_TAGS as $tag) {
-            $pattern = '/<' . $tag . '>\s*<myfigure([^>]*)>(.*?)<\/myfigure>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
-            $html = @preg_replace($pattern, '<myfigure$1>$2</myfigure>$3', $html);
+            $pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
+            $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>$3', $html);
        }
-        //清理闭标签
+        
+        // 清理无匹配的闭合样式标签
        $html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html);
+        
        return $html;
    }
-
    /**
-     * 清理myfigure标签后的冗余标点（适配新标签格式）
-     * @param string $html
+     * 清理myfigure标签周围的冗余标点
+     * @param string $html 待处理HTML
     * @return string
     */
-    private function cleanRedundantPunctuation($html) {
-        $html = preg_replace('/<myfigure data-id="(\d+)">\(Figure \d+\)<\/myfigure>\)\./i', '<myfigure data-id="$1">(Figure $1)</myfigure>.', $html);
-        $html = preg_replace('/<\/myfigure>\)\.([\.,:]{0,1})/', '</myfigure>)$1', $html);
-        $html = preg_replace('/<\/myfigure>\.\)([\.,:]{0,1})/', '</myfigure>)$1', $html);
-        $html = preg_replace('/<\/myfigure>([\.,:]){2,}/', '</myfigure>$1', $html);
-        $html = preg_replace('/<myfigure data-id="(\d+)">\((Figure \d+)\s*<\/myfigure>([\.,:]{0,1})/i', 
-            '<myfigure data-id="$1">($2)</myfigure>$3', $html);
+    private function cleanRedundantPunctuation($html){
+        // 修复括号+标点的冗余格式
+        $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Figure \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', 
+            '<'.self::PROCESSED_TAG.' data-id="$1">(Figure $1)</'.self::PROCESSED_TAG.'>.', $html);
+        $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
+        $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
+        
+        // 清理重复标点
+        $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '</'.self::PROCESSED_TAG.'>$1', $html);
+        
+        // 修复括号内的标签冗余
+        $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Figure \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i', 
+            '<'.self::PROCESSED_TAG.' data-id="$1">($2)</'.self::PROCESSED_TAG.'>$3', $html);
+        
        return $html;
    }
-
    /**
-     * 清理孤立的样式标签
-     * @param string $html
+     * 清理未闭合的样式标签
+     * @param string $html 待处理HTML
     * @return string
     */
-    private function cleanUnclosedTags($html) {
+    private function cleanUnclosedTags($html){
        foreach (self::STYLE_TAGS as $tag) {
-            $html = @preg_replace('/(<\/myfigure>)\s*<\/' . $tag . '>/i', '$1', $html);
-        }
-        foreach (self::STYLE_TAGS as $tag) {
-            @preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
-            @preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
-            
+            // 清理myfigure标签后的冗余闭合标签
+            $html = preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html);
+
+            // 定位所有该标签的开闭标签位置
+            preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
+            preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
+
            $allTags = [];
+            // 收集开标签
            foreach ($openMatches[0] as $m) {
                $allTags[] = [
-                    'offset' => $m[1], 
-                    'type' => 'open', 
-                    'content' => $m[0], 
+                    'offset' => $m[1],
+                    'type' => 'open',
+                    'content' => $m[0],
                    'length' => strlen($m[0])
                ];
            }
+            // 收集闭标签
            foreach ($closeMatches[0] as $m) {
                $allTags[] = [
-                    'offset' => $m[1], 
-                    'type' => 'close', 
-                    'content' => $m[0], 
+                    'offset' => $m[1],
+                    'type' => 'close',
+                    'content' => $m[0],
                    'length' => strlen($m[0])
                ];
            }
+
+            // 按位置排序
            usort($allTags, function($a, $b) {
                return $a['offset'] - $b['offset'];
            });

+            // 栈结构匹配开闭标签
            $tagStack = [];
            $removeOffsets = [];
            foreach ($allTags as $t) {
-                if ($t['type'] == 'open') {
+                if ($t['type'] === 'open') {
                    array_push($tagStack, $t);
                } else {
                    if (!empty($tagStack)) {
                        array_pop($tagStack);
                    } else {
-                        $removeOffsets[] = [
-                            'pos' => $t['offset'],
-                            'len' => $t['length'],
-                            'content' => $t['content']
-                        ];
+                        // 无匹配开标签的闭标签，标记删除
+                        $removeOffsets[] = $t;
                    }
                }
            }
+
+            // 无匹配闭标签的开标签，标记删除
            foreach ($tagStack as $t) {
-                $removeOffsets[] = [
-                    'pos' => $t['offset'],
-                    'len' => $t['length'],
-                    'content' => $t['content']
-                ];
+                $removeOffsets[] = $t;
            }

-            // 倒序删除，避免偏移错乱
+            // 按偏移量倒序删除（避免影响后续偏移）
            usort($removeOffsets, function($a, $b) {
-                return $b['pos'] - $a['pos'];
+                return $b['offset'] - $a['offset'];
            });
+
            foreach ($removeOffsets as $item) {
-                if ($item['pos'] >= 0 && $item['pos'] < strlen($html)) {
-                    $html = substr_replace($html, '', $item['pos'], $item['len']);
+                if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) {
+                    $html = substr_replace($html, '', $item['offset'], $item['length']);
                }
            }
        }
-        return $html;
-    }   

+        return $html;
+    }
    /**
-     * 优化文本格式（合并多余空格，规范myfigure标签前后空格）
-     * @param string $html
+     * 优化文本格式（清理多余空格）
+     * @param string $html 待处理HTML
     * @return string
     */
-    private function optimizeFormat($html) {
+    private function optimizeFormat($html){
+        // 清理连续空格
        $html = preg_replace('/\s{2,}/', ' ', trim($html));
-        $html = preg_replace('/<\/myfigure>([A-Za-z0-9])/is', '</myfigure> $1', $html);
-        $html = preg_replace('/([a-zA-Z0-9])<myfigure/is', '$1 <myfigure', $html);
+        // 标签后紧跟字母/数字时加空格
+        $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', '</'.self::PROCESSED_TAG.'> $1', $html);
+        // 字母/数字紧跟标签前时加空格
+        $html = preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html);
+        
+        return $html;
+    }
+    /**
+     * Collapse a myfigure tag that directly wraps another myfigure tag into the inner tag alone.
+     * @param string $html HTML to clean
+     * @return string HTML where the inner tag's attributes and content are kept and the outer tag is dropped
+     */
+    private function cleanDuplicateNestedTags($html){
+        $pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is';
+        $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>', $html);
+        
        return $html;
    }
 }
--- a/application/common/TableTagProcessor.php
+++ b/application/common/TableTagProcessor.php
@@ -2,59 +2,71 @@
 namespace app\common;

 /**
- * Table标签处理器（生产环境终极版）
 * 功能：精准匹配并替换Table相关格式为mytable标签
- * 支持格式：table 数字、(table 数字)、table 数字:/table 数字.（含嵌套标签）
- * 特性：支持任意嵌套标签/括号、不处理数字+字母、仅跳过已被mytable包裹的Table
+ * 支持格式：table 数字、(table 数字)、table 数字:/table 数字.（含嵌套/拆分标签）
+ * 跳过已被mytable包裹的table（含后缀）
+ * 跳过table 数字+字母/数字后缀（含拆分标签场景，无论是否有空白）
+ * 正常处理table 数字+空白/样式标签场景
 */
-class TableTagProcessor {
-    // 可配置的样式标签列表
-    const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em'];
-    // 最大处理字符串长度
-    const MAX_HTML_LENGTH = 100000;
-    // 目标替换标签
-    const PROCESSED_TAG = 'mytable';
-    // 数据库表格ID映射
+class TableTagProcessor{
+    // 支持的样式标签列表
+    private const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue'];
+    // HTML文本最大处理长度（防止内存溢出）
+    private const MAX_HTML_LENGTH = 100000;
+    // 替换后的目标标签名
+    private const PROCESSED_TAG = 'mytable';
+    // Table数字与对应ID的映射数组
    private $aTableMain = [];

    /**
-     * 处理Table文本，替换为mytable标签并清理冗余内容
+     * Main entry point: replace Table references in the HTML with mytable tags.
     * @param string $html 待处理的HTML文本
-     * @param array $aTableMain Table数字→主键ID的映射数组（如 [1=>1001, 2=>1002]）
-     * @return array ['status'=>状态码, 'data'=>处理后文本]
-     *         status: 2-空输入, 4-无匹配/已处理, 5-处理异常, 1-处理成功
+     * @param array $aTableMain Map of table number => ID (optional; 1=>1 ... 10=>10 is used when empty)
+     * @return array ['status' => int status code, 'data' => string resulting text]
+     *         status: 2 = empty or non-string input, 4 = nothing replaced, 1 = replaced, 5 = processing error
     */
-    public function dealTableStr($html = '', $aTableMain = []) {
-        //验证
-        if (!is_string($html) || trim($html) === '') {
+    public function dealTableStr($html = '', $aTableMain = []){
+        // Fallback used only when the caller passes no mapping: table numbers 1-10 map to themselves.
+        $defaultTableMap = [1=>1,2=>2,3=>3,4=>4,5=>5,6=>6,7=>7,8=>8,9=>9,10=>10];
+        // NOTE(review): keys are now taken as table numbers as-is (the removed code stored key + 1) - confirm callers.
+        $tableMap = !empty($aTableMain) ? $aTableMain : $defaultTableMap;
+
+        // Check the type before trimming: trim() on a non-string raises a warning (PHP 7) or TypeError (PHP 8).
+        $html = is_string($html) ? trim($html) : '';
+        if ($html === '') {
            return ['status' => 2, 'data' => ''];
        }
-        //超大字符串拦截（防止内存溢出）
+
+        // Oversized input (byte length) is handed back untouched.
        if (strlen($html) > self::MAX_HTML_LENGTH) {
            return ['status' => 4, 'data' => $html];
        }

-        //初始化主键映射数组
-        if(!empty($aTableMain)){
-            $aTableMainNew = [];
-            foreach ($aTableMain as $key => $value) {
-                if (!ctype_digit((string)$key) || !ctype_digit((string)$value)) {
-                    continue;
-                }
-                $keyInt = (int)$key;
-                $aTableMainNew[$keyInt + 1] = $value;
-            }
-            $this->aTableMain = $aTableMainNew;
+        // Convert input that is not valid UTF-8, since the patterns below use the /u modifier.
+        if (!mb_check_encoding($html, 'UTF-8')) {
+            $html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1');
        }

+        // Store the mapping, keeping only pairs whose key and value are all digits.
+        $this->initTableMap($tableMap);
+
+        // Snapshot of the trimmed, UTF-8 input.
        $originalHtml = $html;
        $hasReplace = false;

        try {
-            //原始HTML中匹配所有符合规则的Table
+            // 只要包含数字+字母/数字后缀，直接返回原内容
+            if ($this->hasTableSuffix($html)) {
+                return ['status' => 4, 'data' => $html];
+            }
+
+            // 合并拆分标签的Table+数字
+            $html = $this->preprocessSplitTags($html);
+
+            // 核心替换逻辑
            $html = $this->replaceTableInHtml($html, $hasReplace);

-            // 清理冗余内容
+            // 清理冗余样式/标签（仅当发生替换时执行）
            if ($hasReplace) {
                $html = $this->cleanRedundantStyles($html);
                $html = $this->cleanRedundantPunctuation($html);
@@ -74,50 +86,120 @@ class TableTagProcessor {
    }

    /**
-     * 核心方法：直接在HTML中匹配并替换Table
+     * Detect a "Table <number><suffix>" reference (e.g. "Table 4B") anywhere in the HTML.
+     * A letter suffix counts in plain text or inside one tag; a letter or digit counts when split across tags.
+     * @param string $html HTML to inspect
+     * @return bool true when such a reference exists
+     */
+    private function hasTableSuffix($html){
+        $styleTagsPattern = implode('|', self::STYLE_TAGS);
+        // Gap between split tags: whitespace and/or "&nbsp;" entities ("[\s|&nbsp;]" was a character class).
+        $gap = '(?:\s|&nbsp;)*';
+        // Pattern 1, no tags: digits glued to a letter ("Table 4B"). "\d+[0-9]" would also hit "Table 10".
+        $pattern1 = "/table\s*\d+[a-zA-Z]/iu";
+        // Pattern 2, split tags: <b>4</b><b>B</b>, <b>4</b> <b>B</b>, <b>4</b>&nbsp;<b>B</b>.
+        // A digit in the second tag counts as well, because a split digit run is ambiguous.
+        $pattern2 = "/table\s*(?:<\/(?:{$styleTagsPattern})>){$gap}(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)\s*(?:<\/(?:{$styleTagsPattern})>){$gap}(?:<(?:{$styleTagsPattern})[^>]*>)\s*([a-zA-Z0-9])/iu";
+        // Pattern 3, one tag wrapping number and letter: <b>4B</b>.
+        $pattern3 = "/table\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*\d+[a-zA-Z]\s*(?:<\/(?:{$styleTagsPattern})>)/iu";
+        // Any hit makes dealTableStr() return the document unchanged.
+        // "@" silences PCRE warnings; a failed match (false) is treated the same as "no suffix".
+        return @preg_match($pattern1, $html) || @preg_match($pattern2, $html) || @preg_match($pattern3, $html);
+    }
+
+    /**
+     * Store the table-number => ID mapping on the instance, dropping unusable entries.
+     * @param array $aTableMain Raw mapping supplied by the caller; a non-array is treated as empty
+     * @return void
+     */
+    private function initTableMap($aTableMain = []){
+        if (!is_array($aTableMain)) {
+            $aTableMain = [];
+        }
+        
+        $tableMap = [];
+        foreach ($aTableMain as $key => $value) {
+            // Keep a pair only when key and value both consist solely of digits; both are stored as int.
+            if (ctype_digit((string)$key) && ctype_digit((string)$value)) {
+                $tableMap[(int)$key] = (int)$value;
+            }
+        }
+        
+        $this->aTableMain = $tableMap;
+    }
+
+    /**
+     * Join "Table" and its number when style tags split them, producing "Table <number>".
+     * @param string $html HTML to normalise
     * @return string
     */
-    private function replaceTableInHtml($html, &$hasReplace) {
+    private function preprocessSplitTags($html){
        $styleTagsPattern = implode('|', self::STYLE_TAGS);
-        $styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*"; // 匹配任意嵌套样式标签
+        // preg_replace_callback() returns null on a PCRE error; fall back to the input in that case.
+        // Pattern 1: close tag + open tag between the word and the number, e.g. Table</b><b>4
+        $pattern = "/(table)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
+        $html = @preg_replace_callback($pattern, function($matches) {
+            return $matches[1] . ' ' . $matches[2];
+        }, $html) ?? $html;
+        // Pattern 2: an extra whitespace-only styled run in between, e.g. Table</b><b> </b><b>4
+        // Group 2 is always empty (the greedy \s* before it takes the whitespace), so a space is emitted explicitly.
+        $pattern2 = "/(table)(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\s*)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
+        $html = @preg_replace_callback($pattern2, function($matches) {
+            return $matches[1] . ' ' . $matches[3];
+        }, $html) ?? $html;
+        // Both passes done; hand back the normalised HTML.
+        return $html;
+    }
+
+    /**
+     * Replace purely numeric Table references that have a mapped ID with mytable tags.
+     * @param string $html HTML to process
+     * @param bool $hasReplace Set to true (by reference) when at least one replacement is made
+     * @return string
+     */
+    private function replaceTableInHtml($html, &$hasReplace){
        $styleTagsPattern = implode('|', self::STYLE_TAGS);
+        $styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*";
        $styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*";

-        // 规则1：匹配带括号的Table（如 (Table 82)、(<b>Table 1.</b>)）
-        $pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*\)/iu";
-        $html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
+        // Pattern 1: a whole parenthesised reference, e.g. (Table 2), (<b>Table 3</b>); ")" is matched because the replacement re-emits it.
+        $pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*\)/iu";
+        $html = @preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
            $num = $matches[1];
-            $numInt = intval($num);
+            $numInt = (int)$num;
            $suffix = $matches[2] ?? '';

-            // 校验：纯数字 + 有映射ID + 未被mytable包裹（避免重复替换）
+            // Leave the text as-is when the number is not all digits, has no mapped ID, or is already wrapped.
            if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) || 
                $this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) {
                return $matches[0];
            }

+            // Build the tag; the mapped ID goes into data-id and matched style tags are dropped.
            $primaryId = $this->aTableMain[$numInt];
-            $baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">";
+            $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Table {$num}</" . self::PROCESSED_TAG . ">";
            $target = "({$baseTag}{$suffix})";
            
            $hasReplace = true;
            return $target;
        }, $html);

-        // 规则2：匹配无括号的Table（如 Table 1、<b>Table 2:</b>、<i>Table 3.</i>）
-        $pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z])/iu";
-        $html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
+        // Pattern 2: a reference outside parentheses, e.g. Table 2, <i>Table 3</i>: (not followed by a letter or digit).
+        $pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
+        $html = @preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
            $num = $matches[1];
-            $numInt = intval($num);
+            $numInt = (int)$num;
            $suffix = $matches[2] ?? '';

-            // 校验：纯数字 + 有映射ID + 未被mytable包裹 + 不是数字+字母组合
+            // Leave the text as-is when the number is not all digits, has no mapped ID, or is already wrapped.
            if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) || 
                $this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) {
                return $matches[0];
            }

+            // Build the tag; the punctuation suffix stays outside the tag.
            $primaryId = $this->aTableMain[$numInt];
-            $baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">";
+            $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Table {$num}</" . self::PROCESSED_TAG . ">";
            $target = "{$baseTag}{$suffix}";
            
            $hasReplace = true;
@@ -127,126 +209,157 @@ class TableTagProcessor {
        return $html;
    }

+    /**
+     * Tell whether the matched fragment already holds the given Table text
+     * wrapped in a PROCESSED_TAG element, so the caller can skip wrapping it again.
+     * @param string $content   Fragment matched by the caller's pattern
+     * @param string $tableText Table text to look for (e.g. "Table 2")
+     * @return bool true only when preg_match() finds the wrapped text
+     */
+    private function isMatchPositionHasMyTableTag($content, $tableText){
+        // Same pattern as before, assembled with sprintf(): opening PROCESSED_TAG
+        // (any attributes), optional whitespace, the PCRE-escaped text, optional
+        // whitespace, closing PROCESSED_TAG; flags "is".
+        $wrappedRegex = sprintf(
+            '/<%1$s[^>]*>\s*%2$s\s*<\/%1$s>/is',
+            self::PROCESSED_TAG,
+            preg_quote($tableText, '/')
+        );
+        // preg_match() gives 1, 0 or false; only 1 means "already wrapped".
+        return @preg_match($wrappedRegex, $content) === 1;
+    }
+
    /**
-     * 清理mytable标签周围的冗余样式标签
-     * @param string $html
+     * Remove style tags that directly wrap a processed tag, then delete the
+     * closing style tags matched by the orphan pattern below.
+     * @param string $html HTML to clean
-     * @return string
+     * @return string Cleaned HTML; if a preg_replace() call fails, the text
+     *                from before that call is kept instead of null
     */
-    private function cleanRedundantStyles($html) {
+    private function cleanRedundantStyles($html){
        foreach (self::STYLE_TAGS as $tag) {
+            // Unwrap <tag><PROCESSED_TAG ...>text</PROCESSED_TAG>[.,:]</tag>. Only an
+            // attribute-less opening <tag> is matched; one trailing . , or : is kept.
            $pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
-            $html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>$3', $html);
+            // preg_replace() returns null on error; fall back to the current text so it is not lost.
+            $html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>$3', $html) ?? $html;
        }
-        // 清理孤立的样式闭标签（避免标签残留）
-        $html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html);
+
+        // Delete each closing style tag unless the text up to the next "<" is
+        // followed by an attribute-less opening tag of the same name.
+        // NOTE(review): the pattern looks ahead rather than checking for an earlier
+        // opener, so a correctly paired final closing tag is removed too - confirm intended.
+        $html = @preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html) ?? $html;
+
        return $html;
    }

    /**
-     * 清理mytable标签后的冗余标点（保证格式整洁）
-     * @param string $html
+     * Tidy stray brackets and punctuation around processed tags.
+     * In the comments below, T stands for self::PROCESSED_TAG.
+     * @param string $html HTML to clean
-     * @return string
+     * @return string Cleaned HTML; if a preg_replace() call fails, the text
+     *                from before that call is kept instead of null
     */
-    private function cleanRedundantPunctuation($html) {
-        $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1)</'.self::PROCESSED_TAG.'>.', $html);
-        $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
-        $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
-        $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '</'.self::PROCESSED_TAG.'>$1', $html);
-        $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i', 
+    private function cleanRedundantPunctuation($html){
+        // <T data-id="N">(Table M)</T>).  ->  <T data-id="N">(Table M)</T>.
+        // Re-emit the captured bracket text ($2): $1 is the data-id, not the table
+        // number, so it must not be written after "Table".
+        $html = @preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\)<\/'.self::PROCESSED_TAG.'>\)\./i',
+            '<'.self::PROCESSED_TAG.' data-id="$1">($2)</'.self::PROCESSED_TAG.'>.', $html) ?? $html;
+        // </T>).  ->  </T>)   and   </T>.)  ->  </T>)   (one following . , or : is kept)
+        // preg_replace() returns null on error; "?? $html" keeps the current text in that case.
+        $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html) ?? $html;
+        $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html) ?? $html;
+
+        // Collapse a run of two or more . , : right after </T> to the last character of the run.
+        $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '</'.self::PROCESSED_TAG.'>$1', $html) ?? $html;
+
+        // <T data-id="N">(Table M</T>  ->  <T data-id="N">(Table M)</T>   (adds the missing ")")
+        $html = @preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i',
-            '<'.self::PROCESSED_TAG.' data-id="$1">($2)</'.self::PROCESSED_TAG.'>$3', $html);
+            '<'.self::PROCESSED_TAG.' data-id="$1">($2)</'.self::PROCESSED_TAG.'>$3', $html) ?? $html;
+
        return $html;
    }

    /**
-     * 清理孤立的样式标签（栈算法兜底，避免标签不闭合）
-     * @param string $html
+     * Remove unbalanced style tags, one STYLE_TAGS entry at a time.
+     * For each tag: first drop a closing tag that directly follows a closing
+     * PROCESSED_TAG, then pair the remaining opening/closing tags with a stack
+     * and delete every tag that is left without a partner.
+     * @param string $html HTML to clean
-     * @return string
+     * @return string Cleaned HTML; if the preg_replace() step fails, the text
+     *                from before that call is kept instead of null
     */
-    private function cleanUnclosedTags($html) {
-        // 清理mytable后孤立的样式闭标签
+    private function cleanUnclosedTags($html){
        foreach (self::STYLE_TAGS as $tag) {
+            // Drop </tag> that follows </PROCESSED_TAG> (optional whitespace in between).
+            // preg_replace() returns null on error; fall back to the current text so it is not lost.
-            $html = @preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html);
+            $html = @preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html) ?? $html;
-        }

-        // 栈算法清理其他孤立标签
-        foreach (self::STYLE_TAGS as $tag) {
+            // Locate every opening and closing occurrence of this tag with its byte offset.
            @preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
            @preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
-            
+
            $allTags = [];
+            // Collect opening tags
            foreach ($openMatches[0] as $m) {
-                $allTags[] = ['offset' => $m[1], 'type' => 'open', 'content' => $m[0], 'length' => strlen($m[0])];
+                $allTags[] = [
+                    'offset' => $m[1],
+                    'type' => 'open',
+                    'content' => $m[0],
+                    'length' => strlen($m[0])
+                ];
            }
+            // Collect closing tags
            foreach ($closeMatches[0] as $m) {
-                $allTags[] = ['offset' => $m[1], 'type' => 'close', 'content' => $m[0], 'length' => strlen($m[0])];
+                $allTags[] = [
+                    'offset' => $m[1],
+                    'type' => 'close',
+                    'content' => $m[0],
+                    'length' => strlen($m[0])
+                ];
            }
+
+            // Order all occurrences by their position in the text
            usort($allTags, function($a, $b) {
                return $a['offset'] - $b['offset'];
            });

+            // Pair opening and closing tags with a stack
            $tagStack = [];
            $removeOffsets = [];
            foreach ($allTags as $t) {
-                if ($t['type'] == 'open') {
+                if ($t['type'] === 'open') {
                    array_push($tagStack, $t);
                } else {
                    if (!empty($tagStack)) {
                        array_pop($tagStack);
                    } else {
+                        // Closing tag with no opening tag before it: mark for removal
                        $removeOffsets[] = $t;
                    }
                }
            }
+
+            // Opening tags still on the stack never got a closing tag: mark for removal
            foreach ($tagStack as $t) {
                $removeOffsets[] = $t;
            }

-            // 倒序删除，避免偏移错乱
+            // Delete from the highest offset down so earlier offsets stay valid
            usort($removeOffsets, function($a, $b) {
                return $b['offset'] - $a['offset'];
            });
+
            foreach ($removeOffsets as $item) {
                if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) {
                    $html = substr_replace($html, '', $item['offset'], $item['length']);
                }
            }
        }
-        return $html;
-    }   

-    /**
-     * 优化文本格式（合并多余空格，规范标签前后空格）
-     * @param string $html
-     * @return string
-     */
-    private function optimizeFormat($html) {
-        $html = preg_replace('/\s{2,}/', ' ', trim($html));
-        $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', '</'.self::PROCESSED_TAG.'> $1', $html);
-        $html = preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html);
        return $html;
    }

    /**
-     * 清理重复嵌套的mytable标签（兜底方案）
-     * @param string $html
+     * Normalize spacing: collapse whitespace runs and put a space between a
+     * processed tag and an adjacent ASCII letter or digit.
+     * @param string $html HTML to normalize
-     * @return string
+     * @return string Normalized HTML; if a preg_replace() call fails, the text
+     *                from before that call is kept instead of null
     */
-    private function cleanDuplicateNestedTags($html) {
+    private function optimizeFormat($html){
+        // Trim first so the fallback value below is the trimmed text as well.
+        $html = trim($html);
+        // Collapse each run of two or more whitespace characters into one space.
+        // preg_replace() returns null on error; "?? $html" keeps the current text in that case.
+        $html = @preg_replace('/\s{2,}/', ' ', $html) ?? $html;
+        // Closing processed tag directly followed by a letter/digit: insert a space after the tag
+        $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', '</'.self::PROCESSED_TAG.'> $1', $html) ?? $html;
+        // Letter/digit directly followed by an opening processed tag: insert a space before the tag
+        $html = @preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html) ?? $html;
+
+        return $html;
+    }
+
+    /**
+     * Collapse a processed tag nested directly inside another processed tag
+     * into a single tag that keeps the inner tag's attributes and content.
+     * @param string $html HTML to clean
+     * @return string Cleaned HTML; the input text is returned unchanged if
+     *                preg_replace() fails
+     */
+    private function cleanDuplicateNestedTags($html){
+        // Outer open tag, optional whitespace, inner open tag (attributes -> $1),
+        // content (-> $2), inner close, optional whitespace, outer close.
        $pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is';
-        $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>', $html);
+        // preg_replace() returns null on error; "?? $html" keeps the original text in that case.
+        $html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>', $html) ?? $html;
+
        return $html;
    }
-
-    /**
-     * 判断指定Table内容是否被mytable标签包裹
-     * @param string $content 待检查内容
-     * @param string $tableText Table文本（如 "Table 1"）
-     * @return bool
-     */
-    private function isMatchPositionHasMyTableTag($content, $tableText) {
-        $escapedText = preg_quote($tableText, '/');
-        $pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is';
-        return @preg_match($pattern, $content) === 1;
-    }
-
 }