diff --git a/application/common/FigureTagProcessor.php b/application/common/FigureTagProcessor.php index 97d53d4..7956f88 100644 --- a/application/common/FigureTagProcessor.php +++ b/application/common/FigureTagProcessor.php @@ -1,52 +1,63 @@ 状态码, 'data'=>处理后文本] - * status: 2-空输入, 4-无匹配, 5-处理异常, 1-处理成功 + * @param array $aImageMain Figure数字=>ID的映射数组 + * @return array ['status' => 状态码, 'data' => 处理后文本] + * status说明:2-空文本, 4-无匹配/已处理, 1-处理成功, 5-处理异常 */ - public function dealFigureStr($html = '') { - //验证 - if (!is_string($html) || trim($html) === '') { + public function dealFigureStr($html = '', $aImageMain = []){ + //空文本校验 + $html = trim($html); + if ($html === '' || !is_string($html)) { return ['status' => 2, 'data' => '']; } - //超大字符串拦截 + //超长文本保护 if (strlen($html) > self::MAX_HTML_LENGTH) { return ['status' => 4, 'data' => $html]; } - + //编码处理 + if (!mb_check_encoding($html, 'UTF-8')) { + $html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1'); + } + //初始化映射数组(过滤非数字键值) + $this->initImageMap($aImageMain); + //原始内容 $originalHtml = $html; $hasReplace = false; - try { - //合并嵌套样式标签 - $mergedHtml = $this->mergeFragmentStyleTags($html); - //提取纯文本(用于匹配Figure) - $plainText = preg_replace('/<[^>]+>/', ' ', $mergedHtml); - $plainText = preg_replace('/\s+/', ' ', trim($plainText)); - - //提取所有匹配的Figure数字 - $allMatches = $this->extractAllFigureMatches($plainText); - if (empty($allMatches)) { - return ['status' => 4, 'data' => $originalHtml]; + //只要包含数字+字母/数字后缀,直接返回原内容(核心修复) + if ($this->hasFigureSuffix($html)) { + return ['status' => 4, 'data' => $html]; } - - //替换为myfigure标签 - $html = $this->replaceFigureWithTag($html, $allMatches, $hasReplace); - - //清理冗余内容(仅替换成功后执行) + //合并拆分标签的Figure+数字 + $html = $this->preprocessSplitTags($html); + //替换 + $html = $this->replaceFigureInHtml($html, $hasReplace); + //清理冗余样式/标签 if ($hasReplace) { $html = $this->cleanRedundantStyles($html); $html = $this->cleanRedundantPunctuation($html); $html = $this->cleanUnclosedTags($html); $html = $this->optimizeFormat($html); + $html = $this->cleanDuplicateNestedTags($html); } } catch (\Throwable $e) { @@ -55,220 +66,272 @@ class FigureTagProcessor { return [ 'status' => $hasReplace ? 1 : 4, - 'data' => $hasReplace ? $html : $originalHtml + 'data' => $html ]; } - /** - * 合并嵌套的样式标签 - * @param string $html - * @return string + * 全局检测是否包含Figure数字+字母/数字后缀 + * 覆盖所有拆分/嵌套/无标签场景,无论是否有空白 + * @param string $html 待检测HTML + * @return bool */ - private function mergeFragmentStyleTags($html) { - foreach (self::STYLE_TAGS as $tag) { - $pattern = '/(?:<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>(?:\s*<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>/is'; - while (@preg_match($pattern, $html)) { // 抑制正则警告 - $html = preg_replace_callback($pattern, function($matches) { - return trim($matches[1]) . ' ' . trim($matches[2]); - }, $html); + private function hasFigureSuffix($html){ + $styleTagsPattern = implode('|', self::STYLE_TAGS); + + // 正则1:无标签场景(Figure 4B/4123) + $pattern1 = "/figure\s*\d+[a-zA-Z0-9]/iu"; + + // 正则2:拆分标签场景(4B / 4 B / 4 B) + $pattern2 = "/figure\s*(?:<\/(?:{$styleTagsPattern})>)\s*[\s| ]*\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)\s*(?:<\/(?:{$styleTagsPattern})>)\s*[\s| ]*\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*([a-zA-Z0-9])/iu"; + + // 正则3:嵌套标签场景(4B / 4123) + $pattern3 = "/figure\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*\d+[a-zA-Z0-9]\s*(?:<\/(?:{$styleTagsPattern})>)/iu"; + return preg_match($pattern1, $html) || preg_match($pattern2, $html) || preg_match($pattern3, $html); + } + /** + * 初始化Figure数字映射数组 + * @param array $aImageMain 原始映射数组 + * @return void + */ + private function initImageMap($aImageMain){ + if (!is_array($aImageMain)) { + $aImageMain = []; + } + $imageMap = []; + foreach ($aImageMain as $key => $value) { + // 严格校验键值均为数字 + if (ctype_digit((string)$key) && ctype_digit((string)$value)) { + $imageMap[(int)$key] = (int)$value; } } - - // 清理括号内的冗余标点/标签 - $html = preg_replace('/(\(.*?\d+)(?:\s*<[^>]+>)*\s*\.*\s*(?:<[^>]+>)*(\s*.*?\))/is', '$1$2', $html); - $html = preg_replace('/\(\s+/', '(', $html); - $html = preg_replace('/\s+\)/', ')', $html); + $this->aImageMain = $imageMap; + } + /** + * 合并所有拆分标签的Figure+数字(含空白样式标签) + * @param string $html 待处理HTML + * @return string + */ + private function preprocessSplitTags($html){ + $styleTagsPattern = implode('|', self::STYLE_TAGS); + + // 正则1:匹配基础拆分标签的Figure+数字 + $pattern = "/(figure)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu"; + $html = preg_replace_callback($pattern, function($matches) { + return $matches[1] . ' ' . $matches[2]; + }, $html); + + // 正则2:匹配多轮拆分标签的Figure+数字(含空白) + $pattern2 = "/(figure)(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\s*)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu"; + $html = preg_replace_callback($pattern2, function($matches) { + return $matches[1] . $matches[2] . $matches[3]; + }, $html); return $html; } - /** - * 从纯文本中提取所有Figure数字(兼容括号/标点/空格) - * @param string $plainText - * @return array - */ - private function extractAllFigureMatches($plainText) { - $allMatches = []; - $processedNums = []; - - // 匹配带括号的Figure(如 (Figure 1.)) - $pattern1 = '/\(Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\)\s*([\.,:]{0,1})/iu'; - if (@preg_match_all($pattern1, $plainText, $matchesFull, PREG_SET_ORDER)) { - foreach ($matchesFull as $match) { - $num = $match[1]; - if (!ctype_digit($num) || in_array($num, $processedNums)) continue; - $processedNums[] = $num; - $allMatches[$num] = [ - 'hasOuterBracket' => true, - 'validPunct' => $match[2] ?? '', - 'content' => "Figure {$num}" - ]; - } - } - - // 匹配无括号的Figure(如 Figure 1.) - $pattern2 = '/Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\s*([\.,:]{0,1})/iu'; - if (@preg_match_all($pattern2, $plainText, $matchesOther, PREG_SET_ORDER)) { - foreach ($matchesOther as $match) { - $num = $match[1]; - if (!ctype_digit($num) || in_array($num, $processedNums)) continue; - $processedNums[] = $num; - $allMatches[$num] = [ - 'hasOuterBracket' => false, - 'validPunct' => $match[2] ?? '', - 'content' => "Figure {$num}" - ]; - } - } - - krsort($allMatches); - return $allMatches; - } - - /** - * 将匹配的Figure替换为myfigure标签(优化标签格式) - * @param string $html - * @param array $allMatches - * @param bool $hasReplace + * 核心替换逻辑:将纯数字Figure替换为myfigure标签 + * @param string $html 待处理HTML + * @param bool $hasReplace 是否发生替换(引用传递) * @return string */ - private function replaceFigureWithTag($html, $allMatches, &$hasReplace) { - foreach ($allMatches as $num => $info) { - $innerContent = $info['hasOuterBracket'] - ? "({$info['content']})" - : $info['content']; + private function replaceFigureInHtml($html, &$hasReplace){ + $styleTagsPattern = implode('|', self::STYLE_TAGS); + $styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*"; + $styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*"; + + // 正则1:匹配括号内的纯数字Figure(如 (Figure 2)、(Figure 3)) + // $pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iu"; + $pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD"; + $html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) { + $num = $matches[1]; + $numInt = (int)$num; + $suffix = $matches[2] ?? ''; + + // 过滤条件:非数字、无映射、已处理过的标签 + if (!ctype_digit($num) || !isset($this->aImageMain[$numInt]) || + $this->isMatchPositionHasMyFigureTag($matches[0], "Figure {$num}")) { + return $matches[0]; + } + + // 执行替换 + $primaryId = $this->aImageMain[$numInt]; + $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Figure {$num}"; + $target = "({$baseTag}{$suffix})"; - //Figure 1 - $targetTag = "{$innerContent}"; - if (!empty($info['validPunct']) && !$info['hasOuterBracket']) { - $targetTag .= $info['validPunct']; + $hasReplace = true; + return $target; + }, $html); + + // 正则2:匹配无括号的纯数字Figure(如 Figure 2、Figure 3:) + // $pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z0-9])/iu"; + $pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD"; + $html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) { + $num = $matches[1]; + $numInt = (int)$num; + $suffix = $matches[2] ?? ''; + + // 过滤条件:非数字、无映射、已处理过的标签 + if (!ctype_digit($num) || !isset($this->aImageMain[$numInt]) || + $this->isMatchPositionHasMyFigureTag($matches[0], "Figure {$num}")) { + return $matches[0]; } - $patternSuffix = '(?!\p{L}|\s+\p{L})'; - $pattern = $info['hasOuterBracket'] - ? '/\(\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*\)/iu' - : '/\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*([\.,:]{0,1})/iu'; + // 执行替换 + $primaryId = $this->aImageMain[$numInt]; + $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Figure {$num}"; + $target = "{$baseTag}{$suffix}"; + + $hasReplace = true; + return $target; + }, $html); - //执行替换(最多替换1次,避免重复) - $html = @preg_replace($pattern, $targetTag, $html, 1, $count); - if ($count > 0) { - $hasReplace = true; - error_log("[FigureTagProcessor] 替换成功 - ID:{$num} 括号:".($info['hasOuterBracket']?'是':'否')); - } - } return $html; } - /** - * 清理myfigure标签周围的冗余样式标签(适配新标签格式) - * @param string $html + * 检测当前匹配内容是否已包含myfigure标签(避免重复替换) + * @param string $content 匹配的文本片段 + * @param string $figureText 待检测的Figure文本(如 Figure 2) + * @return bool + */ + private function isMatchPositionHasMyFigureTag($content, $figureText){ + $escapedText = preg_quote($figureText, '/'); + $pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is'; + return (bool)preg_match($pattern, $content); + } + /** + * 清理myfigure标签周围的冗余样式标签 + * @param string $html 待处理HTML * @return string */ - private function cleanRedundantStyles($html) { + private function cleanRedundantStyles($html){ foreach (self::STYLE_TAGS as $tag) { - $pattern = '/<' . $tag . '>\s*]*)>(.*?)<\/myfigure>([\.,:]{0,1})\s*<\/' . $tag . '>/is'; - $html = @preg_replace($pattern, '$2$3', $html); + $pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is'; + $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2$3', $html); } - //清理闭标签 + + // 清理无匹配的闭合样式标签 $html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html); + return $html; } - /** - * 清理myfigure标签后的冗余标点(适配新标签格式) - * @param string $html + * 清理myfigure标签周围的冗余标点 + * @param string $html 待处理HTML * @return string */ - private function cleanRedundantPunctuation($html) { - $html = preg_replace('/\(Figure \d+\)<\/myfigure>\)\./i', '(Figure $1).', $html); - $html = preg_replace('/<\/myfigure>\)\.([\.,:]{0,1})/', ')$1', $html); - $html = preg_replace('/<\/myfigure>\.\)([\.,:]{0,1})/', ')$1', $html); - $html = preg_replace('/<\/myfigure>([\.,:]){2,}/', '$1', $html); - $html = preg_replace('/\((Figure \d+)\s*<\/myfigure>([\.,:]{0,1})/i', - '($2)$3', $html); + private function cleanRedundantPunctuation($html){ + // 修复括号+标点的冗余格式 + $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Figure \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', + '<'.self::PROCESSED_TAG.' data-id="$1">(Figure $1).', $html); + $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', ')$1', $html); + $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', ')$1', $html); + + // 清理重复标点 + $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '$1', $html); + + // 修复括号内的标签冗余 + $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Figure \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i', + '<'.self::PROCESSED_TAG.' data-id="$1">($2)$3', $html); + return $html; } - /** - * 清理孤立的样式标签 - * @param string $html + * 清理未闭合的样式标签 + * @param string $html 待处理HTML * @return string */ - private function cleanUnclosedTags($html) { + private function cleanUnclosedTags($html){ foreach (self::STYLE_TAGS as $tag) { - $html = @preg_replace('/(<\/myfigure>)\s*<\/' . $tag . '>/i', '$1', $html); - } - foreach (self::STYLE_TAGS as $tag) { - @preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE); - @preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE); - + // 清理myfigure标签后的冗余闭合标签 + $html = preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html); + + // 定位所有该标签的开闭标签位置 + preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE); + preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE); + $allTags = []; + // 收集开标签 foreach ($openMatches[0] as $m) { $allTags[] = [ - 'offset' => $m[1], - 'type' => 'open', - 'content' => $m[0], + 'offset' => $m[1], + 'type' => 'open', + 'content' => $m[0], 'length' => strlen($m[0]) ]; } + // 收集闭标签 foreach ($closeMatches[0] as $m) { $allTags[] = [ - 'offset' => $m[1], - 'type' => 'close', - 'content' => $m[0], + 'offset' => $m[1], + 'type' => 'close', + 'content' => $m[0], 'length' => strlen($m[0]) ]; } + + // 按位置排序 usort($allTags, function($a, $b) { return $a['offset'] - $b['offset']; }); + // 栈结构匹配开闭标签 $tagStack = []; $removeOffsets = []; foreach ($allTags as $t) { - if ($t['type'] == 'open') { + if ($t['type'] === 'open') { array_push($tagStack, $t); } else { if (!empty($tagStack)) { array_pop($tagStack); } else { - $removeOffsets[] = [ - 'pos' => $t['offset'], - 'len' => $t['length'], - 'content' => $t['content'] - ]; + // 无匹配开标签的闭标签,标记删除 + $removeOffsets[] = $t; } } } + + // 无匹配闭标签的开标签,标记删除 foreach ($tagStack as $t) { - $removeOffsets[] = [ - 'pos' => $t['offset'], - 'len' => $t['length'], - 'content' => $t['content'] - ]; + $removeOffsets[] = $t; } - // 倒序删除,避免偏移错乱 + // 按偏移量倒序删除(避免影响后续偏移) usort($removeOffsets, function($a, $b) { - return $b['pos'] - $a['pos']; + return $b['offset'] - $a['offset']; }); + foreach ($removeOffsets as $item) { - if ($item['pos'] >= 0 && $item['pos'] < strlen($html)) { - $html = substr_replace($html, '', $item['pos'], $item['len']); + if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) { + $html = substr_replace($html, '', $item['offset'], $item['length']); } } } - return $html; - } + return $html; + } /** - * 优化文本格式(合并多余空格,规范myfigure标签前后空格) - * @param string $html + * 优化文本格式(清理多余空格) + * @param string $html 待处理HTML * @return string */ - private function optimizeFormat($html) { + private function optimizeFormat($html){ + // 清理连续空格 $html = preg_replace('/\s{2,}/', ' ', trim($html)); - $html = preg_replace('/<\/myfigure>([A-Za-z0-9])/is', ' $1', $html); - $html = preg_replace('/([a-zA-Z0-9])([A-Za-z0-9])/is', ' $1', $html); + // 字母/数字紧跟标签前时加空格 + $html = preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html); + + return $html; + } + /** + * 清理嵌套的myfigure标签(避免重复嵌套) + * @param string $html 待处理HTML + * @return string + */ + private function cleanDuplicateNestedTags($html){ + $pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is'; + $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2', $html); + return $html; } } \ No newline at end of file diff --git a/application/common/TableTagProcessor.php b/application/common/TableTagProcessor.php index 3a7e50f..9ae92ec 100644 --- a/application/common/TableTagProcessor.php +++ b/application/common/TableTagProcessor.php @@ -2,59 +2,71 @@ namespace app\common; /** - * Table标签处理器(生产环境终极版) * 功能:精准匹配并替换Table相关格式为mytable标签 - * 支持格式:table 数字、(table 数字)、table 数字:/table 数字.(含嵌套标签) - * 特性:支持任意嵌套标签/括号、不处理数字+字母、仅跳过已被mytable包裹的Table + * 支持格式:table 数字、(table 数字)、table 数字:/table 数字.(含嵌套/拆分标签) + * 跳过已被mytable包裹的table(含后缀) + * 跳过table 数字+字母/数字后缀(含拆分标签场景,无论是否有空白) + * 正常处理table 数字+空白/样式标签场景 */ -class TableTagProcessor { - // 可配置的样式标签列表 - const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em']; - // 最大处理字符串长度 - const MAX_HTML_LENGTH = 100000; - // 目标替换标签 - const PROCESSED_TAG = 'mytable'; - // 数据库表格ID映射 +class TableTagProcessor{ + // 支持的样式标签列表 + private const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue']; + // HTML文本最大处理长度(防止内存溢出) + private const MAX_HTML_LENGTH = 100000; + // 替换后的目标标签名 + private const PROCESSED_TAG = 'mytable'; + // Table数字与对应ID的映射数组 private $aTableMain = []; /** - * 处理Table文本,替换为mytable标签并清理冗余内容 + * 处理Table标签替换的主方法 * @param string $html 待处理的HTML文本 - * @param array $aTableMain Table数字→主键ID的映射数组(如 [1=>1001, 2=>1002]) - * @return array ['status'=>状态码, 'data'=>处理后文本] - * status: 2-空输入, 4-无匹配/已处理, 5-处理异常, 1-处理成功 + * @param array $aTableMain Table数字=>ID的映射数组(可选,默认1=>1~10=>10) + * @return array ['status' => 状态码, 'data' => 处理后文本] + * status说明:2-空文本, 4-无匹配/已处理, 1-处理成功, 5-处理异常 */ - public function dealTableStr($html = '', $aTableMain = []) { - //验证 - if (!is_string($html) || trim($html) === '') { + public function dealTableStr($html = '', $aTableMain = []){ + // 初始化默认映射数组(仅当入参为空时使用) + $defaultTableMap = [1=>1,2=>2,3=>3,4=>4,5=>5,6=>6,7=>7,8=>8,9=>9,10=>10]; + // 优先使用入参,入参为空则用默认值 + $tableMap = !empty($aTableMain) ? $aTableMain : $defaultTableMap; + + // 空文本校验 + $html = trim($html); + if ($html === '' || !is_string($html)) { return ['status' => 2, 'data' => '']; } - //超大字符串拦截(防止内存溢出) + + // 超长文本保护 if (strlen($html) > self::MAX_HTML_LENGTH) { return ['status' => 4, 'data' => $html]; } - //初始化主键映射数组 - if(!empty($aTableMain)){ - $aTableMainNew = []; - foreach ($aTableMain as $key => $value) { - if (!ctype_digit((string)$key) || !ctype_digit((string)$value)) { - continue; - } - $keyInt = (int)$key; - $aTableMainNew[$keyInt + 1] = $value; - } - $this->aTableMain = $aTableMainNew; + // 编码处理(统一转为UTF-8,避免中文乱码) + if (!mb_check_encoding($html, 'UTF-8')) { + $html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1'); } + // 初始化映射数组(过滤非数字键值) + $this->initTableMap($tableMap); + + // 原始内容(异常时返回) $originalHtml = $html; $hasReplace = false; try { - //原始HTML中匹配所有符合规则的Table + // 只要包含数字+字母/数字后缀,直接返回原内容 + if ($this->hasTableSuffix($html)) { + return ['status' => 4, 'data' => $html]; + } + + // 合并拆分标签的Table+数字 + $html = $this->preprocessSplitTags($html); + + // 核心替换逻辑 $html = $this->replaceTableInHtml($html, $hasReplace); - // 清理冗余内容 + // 清理冗余样式/标签(仅当发生替换时执行) if ($hasReplace) { $html = $this->cleanRedundantStyles($html); $html = $this->cleanRedundantPunctuation($html); @@ -74,50 +86,120 @@ class TableTagProcessor { } /** - * 核心方法:直接在HTML中匹配并替换Table + * 全局检测是否包含Table数字+字母/数字后缀 + * 覆盖所有拆分/嵌套/无标签场景,无论是否有空白 + * @param string $html 待检测HTML + * @return bool + */ + private function hasTableSuffix($html){ + $styleTagsPattern = implode('|', self::STYLE_TAGS); + + // 正则1:无标签场景(Table 4B/4123) + $pattern1 = "/table\s*\d+[a-zA-Z0-9]/iu"; + + // 正则2:拆分标签场景(4B / 4 B / 4 B) + $pattern2 = "/table\s*(?:<\/(?:{$styleTagsPattern})>)\s*[\s| ]*\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)\s*(?:<\/(?:{$styleTagsPattern})>)\s*[\s| ]*\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*([a-zA-Z0-9])/iu"; + + // 正则3:嵌套标签场景(4B / 4123) + $pattern3 = "/table\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*\d+[a-zA-Z0-9]\s*(?:<\/(?:{$styleTagsPattern})>)/iu"; + + // 加@抑制正则警告,避免极端文本导致报错 + return @preg_match($pattern1, $html) || @preg_match($pattern2, $html) || @preg_match($pattern3, $html); + } + + /** + * 初始化Table数字映射数组(过滤非数字键值) + * @param array $aTableMain 原始映射数组 + * @return void + */ + private function initTableMap($aTableMain = []){ + if (!is_array($aTableMain)) { + $aTableMain = []; + } + + $tableMap = []; + foreach ($aTableMain as $key => $value) { + // 严格校验键值均为数字 + if (ctype_digit((string)$key) && ctype_digit((string)$value)) { + $tableMap[(int)$key] = (int)$value; + } + } + + $this->aTableMain = $tableMap; + } + + /** + * 合并所有拆分标签的Table+数字(含空白样式标签) + * @param string $html 待处理HTML * @return string */ - private function replaceTableInHtml($html, &$hasReplace) { + private function preprocessSplitTags($html){ $styleTagsPattern = implode('|', self::STYLE_TAGS); - $styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*"; // 匹配任意嵌套样式标签 + + // 正则1:匹配基础拆分标签的Table+数字 + $pattern = "/(table)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu"; + $html = @preg_replace_callback($pattern, function($matches) { + return $matches[1] . ' ' . $matches[2]; + }, $html); + + // 正则2:匹配多轮拆分标签的Table+数字(含空白) + $pattern2 = "/(table)(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\s*)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu"; + $html = @preg_replace_callback($pattern2, function($matches) { + return $matches[1] . $matches[2] . $matches[3]; + }, $html); + + return $html; + } + + /** + * 核心替换逻辑:将纯数字Table替换为mytable标签 + * @param string $html 待处理HTML + * @param bool $hasReplace 是否发生替换(引用传递) + * @return string + */ + private function replaceTableInHtml($html, &$hasReplace){ + $styleTagsPattern = implode('|', self::STYLE_TAGS); + $styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*"; $styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*"; - // 规则1:匹配带括号的Table(如 (Table 82)、(Table 1.)) - $pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*\)/iu"; - $html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) { + // 正则1:匹配括号内的纯数字Table(如 (Table 2)、(Table 3)) + $pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD"; + $html = @preg_replace_callback($pattern1, function($matches) use (&$hasReplace) { $num = $matches[1]; - $numInt = intval($num); + $numInt = (int)$num; $suffix = $matches[2] ?? ''; - // 校验:纯数字 + 有映射ID + 未被mytable包裹(避免重复替换) + // 过滤条件:非数字、无映射、已处理过的标签 if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) || $this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) { return $matches[0]; } + // 执行替换 $primaryId = $this->aTableMain[$numInt]; - $baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}"; + $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Table {$num}"; $target = "({$baseTag}{$suffix})"; $hasReplace = true; return $target; }, $html); - // 规则2:匹配无括号的Table(如 Table 1、Table 2:Table 3.) - $pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z])/iu"; - $html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) { + // 正则2:匹配无括号的纯数字Table(如 Table 2、Table 3:) + $pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD"; + $html = @preg_replace_callback($pattern2, function($matches) use (&$hasReplace) { $num = $matches[1]; - $numInt = intval($num); + $numInt = (int)$num; $suffix = $matches[2] ?? ''; - // 校验:纯数字 + 有映射ID + 未被mytable包裹 + 不是数字+字母组合 + // 过滤条件:非数字、无映射、已处理过的标签 if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) || $this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) { return $matches[0]; } + // 执行替换 $primaryId = $this->aTableMain[$numInt]; - $baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}"; + $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Table {$num}"; $target = "{$baseTag}{$suffix}"; $hasReplace = true; @@ -127,126 +209,157 @@ class TableTagProcessor { return $html; } + /** + * 检测当前匹配内容是否已包含mytable标签(避免重复替换) + * @param string $content 匹配的文本片段 + * @param string $tableText 待检测的Table文本(如 Table 2) + * @return bool + */ + private function isMatchPositionHasMyTableTag($content, $tableText){ + $escapedText = preg_quote($tableText, '/'); + $pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is'; + return (bool)@preg_match($pattern, $content); + } + /** * 清理mytable标签周围的冗余样式标签 - * @param string $html + * @param string $html 待处理HTML * @return string */ - private function cleanRedundantStyles($html) { + private function cleanRedundantStyles($html){ foreach (self::STYLE_TAGS as $tag) { $pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is'; $html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2$3', $html); } - // 清理孤立的样式闭标签(避免标签残留) - $html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html); + + // 清理无匹配的闭合样式标签 + $html = @preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html); + return $html; } /** - * 清理mytable标签后的冗余标点(保证格式整洁) - * @param string $html + * 清理mytable标签周围的冗余标点 + * @param string $html 待处理HTML * @return string */ - private function cleanRedundantPunctuation($html) { - $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1).', $html); - $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', ')$1', $html); - $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', ')$1', $html); - $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '$1', $html); - $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i', + private function cleanRedundantPunctuation($html){ + // 修复括号+标点的冗余格式 + $html = @preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', + '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1).', $html); + $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', ')$1', $html); + $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', ')$1', $html); + + // 清理重复标点 + $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '$1', $html); + + // 修复括号内的标签冗余 + $html = @preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i', '<'.self::PROCESSED_TAG.' data-id="$1">($2)$3', $html); + return $html; } /** - * 清理孤立的样式标签(栈算法兜底,避免标签不闭合) - * @param string $html + * 清理未闭合的样式标签 + * @param string $html 待处理HTML * @return string */ - private function cleanUnclosedTags($html) { - // 清理mytable后孤立的样式闭标签 + private function cleanUnclosedTags($html){ foreach (self::STYLE_TAGS as $tag) { + // 清理mytable标签后的冗余闭合标签 $html = @preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html); - } - // 栈算法清理其他孤立标签 - foreach (self::STYLE_TAGS as $tag) { + // 定位所有该标签的开闭标签位置 @preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE); @preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE); - + $allTags = []; + // 收集开标签 foreach ($openMatches[0] as $m) { - $allTags[] = ['offset' => $m[1], 'type' => 'open', 'content' => $m[0], 'length' => strlen($m[0])]; + $allTags[] = [ + 'offset' => $m[1], + 'type' => 'open', + 'content' => $m[0], + 'length' => strlen($m[0]) + ]; } + // 收集闭标签 foreach ($closeMatches[0] as $m) { - $allTags[] = ['offset' => $m[1], 'type' => 'close', 'content' => $m[0], 'length' => strlen($m[0])]; + $allTags[] = [ + 'offset' => $m[1], + 'type' => 'close', + 'content' => $m[0], + 'length' => strlen($m[0]) + ]; } + + // 按位置排序 usort($allTags, function($a, $b) { return $a['offset'] - $b['offset']; }); + // 栈结构匹配开闭标签 $tagStack = []; $removeOffsets = []; foreach ($allTags as $t) { - if ($t['type'] == 'open') { + if ($t['type'] === 'open') { array_push($tagStack, $t); } else { if (!empty($tagStack)) { array_pop($tagStack); } else { + // 无匹配开标签的闭标签,标记删除 $removeOffsets[] = $t; } } } + + // 无匹配闭标签的开标签,标记删除 foreach ($tagStack as $t) { $removeOffsets[] = $t; } - // 倒序删除,避免偏移错乱 + // 按偏移量倒序删除(避免影响后续偏移) usort($removeOffsets, function($a, $b) { return $b['offset'] - $a['offset']; }); + foreach ($removeOffsets as $item) { if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) { $html = substr_replace($html, '', $item['offset'], $item['length']); } } } - return $html; - } - /** - * 优化文本格式(合并多余空格,规范标签前后空格) - * @param string $html - * @return string - */ - private function optimizeFormat($html) { - $html = preg_replace('/\s{2,}/', ' ', trim($html)); - $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', ' $1', $html); - $html = preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html); return $html; } /** - * 清理重复嵌套的mytable标签(兜底方案) - * @param string $html + * 优化文本格式(清理多余空格) + * @param string $html 待处理HTML * @return string */ - private function cleanDuplicateNestedTags($html) { + private function optimizeFormat($html){ + // 清理连续空格 + $html = @preg_replace('/\s{2,}/', ' ', trim($html)); + // 标签后紧跟字母/数字时加空格 + $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', ' $1', $html); + // 字母/数字紧跟标签前时加空格 + $html = @preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html); + + return $html; + } + + /** + * 清理嵌套的mytable标签(避免重复嵌套) + * @param string $html 待处理HTML + * @return string + */ + private function cleanDuplicateNestedTags($html){ $pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is'; - $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2', $html); + $html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2', $html); + return $html; } - - /** - * 判断指定Table内容是否被mytable标签包裹 - * @param string $content 待检查内容 - * @param string $tableText Table文本(如 "Table 1") - * @return bool - */ - private function isMatchPositionHasMyTableTag($content, $tableText) { - $escapedText = preg_quote($tableText, '/'); - $pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is'; - return @preg_match($pattern, $content) === 1; - } - } \ No newline at end of file