diff --git a/application/common/FigureTagProcessor.php b/application/common/FigureTagProcessor.php
index 97d53d4..7956f88 100644
--- a/application/common/FigureTagProcessor.php
+++ b/application/common/FigureTagProcessor.php
@@ -1,52 +1,63 @@
状态码, 'data'=>处理后文本]
- * status: 2-空输入, 4-无匹配, 5-处理异常, 1-处理成功
+ * @param array $aImageMain Figure数字=>ID的映射数组
+ * @return array ['status' => 状态码, 'data' => 处理后文本]
+ * status说明:2-空文本, 4-无匹配/已处理, 1-处理成功, 5-处理异常
*/
- public function dealFigureStr($html = '') {
- //验证
- if (!is_string($html) || trim($html) === '') {
+ public function dealFigureStr($html = '', $aImageMain = []){
+ //空文本校验
+ $html = is_string($html) ? trim($html) : ''; // check the type before trim(): non-string input is treated as empty text (status 2) instead of reaching trim()
+ if ($html === '') {
return ['status' => 2, 'data' => ''];
}
- //超大字符串拦截
+ //超长文本保护
if (strlen($html) > self::MAX_HTML_LENGTH) {
return ['status' => 4, 'data' => $html];
}
-
+ //编码处理
+ if (!mb_check_encoding($html, 'UTF-8')) {
+ $html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1');
+ }
+ //初始化映射数组(过滤非数字键值)
+ $this->initImageMap($aImageMain);
+ //原始内容
$originalHtml = $html;
$hasReplace = false;
-
try {
- //合并嵌套样式标签
- $mergedHtml = $this->mergeFragmentStyleTags($html);
- //提取纯文本(用于匹配Figure)
- $plainText = preg_replace('/<[^>]+>/', ' ', $mergedHtml);
- $plainText = preg_replace('/\s+/', ' ', trim($plainText));
-
- //提取所有匹配的Figure数字
- $allMatches = $this->extractAllFigureMatches($plainText);
- if (empty($allMatches)) {
- return ['status' => 4, 'data' => $originalHtml];
+ //只要包含数字+字母/数字后缀,直接返回原内容(核心修复)
+ if ($this->hasFigureSuffix($html)) {
+ return ['status' => 4, 'data' => $html];
}
-
- //替换为myfigure标签
- $html = $this->replaceFigureWithTag($html, $allMatches, $hasReplace);
-
- //清理冗余内容(仅替换成功后执行)
+ //合并拆分标签的Figure+数字
+ $html = $this->preprocessSplitTags($html);
+ //替换
+ $html = $this->replaceFigureInHtml($html, $hasReplace);
+ //清理冗余样式/标签
if ($hasReplace) {
$html = $this->cleanRedundantStyles($html);
$html = $this->cleanRedundantPunctuation($html);
$html = $this->cleanUnclosedTags($html);
$html = $this->optimizeFormat($html);
+ $html = $this->cleanDuplicateNestedTags($html);
}
} catch (\Throwable $e) {
@@ -55,220 +66,272 @@ class FigureTagProcessor {
return [
'status' => $hasReplace ? 1 : 4,
- 'data' => $hasReplace ? $html : $originalHtml
+ 'data' => $html
];
}
-
/**
- * 合并嵌套的样式标签
- * @param string $html
- * @return string
+ * 全局检测是否包含Figure数字+字母/数字后缀
+ * 覆盖所有拆分/嵌套/无标签场景,无论是否有空白
+ * @param string $html 待检测HTML
+ * @return bool
*/
- private function mergeFragmentStyleTags($html) {
- foreach (self::STYLE_TAGS as $tag) {
- $pattern = '/(?:<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>(?:\s*<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>/is';
- while (@preg_match($pattern, $html)) { // 抑制正则警告
- $html = preg_replace_callback($pattern, function($matches) {
- return trim($matches[1]) . ' ' . trim($matches[2]);
- }, $html);
+ private function hasFigureSuffix($html){ // true when $html contains "figure <digits>" directly followed by a letter or digit; dealFigureStr then returns the text untouched with status 4
+ $styleTagsPattern = implode('|', self::STYLE_TAGS);
+
+ // Pattern 1: plain text ("Figure 4B", "Figure 4123"). NOTE(review): \d+ can give back a digit, so ordinary multi-digit numbers such as "Figure 12" also match - confirm this is intended.
+ $pattern1 = "/figure\s*\d+[a-zA-Z0-9]/iu";
+
+ // Pattern 2: split-tag case - "figure", the digits and the suffix character each end/start a separate style tag (close tag, optional whitespace or "|" characters, open tag between them).
+ $pattern2 = "/figure\s*(?:<\/(?:{$styleTagsPattern})>)\s*[\s| ]*\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)\s*(?:<\/(?:{$styleTagsPattern})>)\s*[\s| ]*\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*([a-zA-Z0-9])/iu";
+
+ // Pattern 3: nested case - "figure" followed by one style tag that wraps the digits and the suffix character together.
+ $pattern3 = "/figure\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*\d+[a-zA-Z0-9]\s*(?:<\/(?:{$styleTagsPattern})>)/iu";
+ return preg_match($pattern1, $html) || preg_match($pattern2, $html) || preg_match($pattern3, $html); // a single match from any pattern is enough
+ }
+ /**
+ * 初始化Figure数字映射数组
+ * @param array $aImageMain 原始映射数组
+ * @return void
+ */
+ private function initImageMap($aImageMain){ // stores a sanitized copy of the number=>ID map in $this->aImageMain
+ if (!is_array($aImageMain)) {
+ $aImageMain = [];
+ }
+ $imageMap = [];
+ foreach ($aImageMain as $key => $value) {
+ // Keep only entries whose key and value both consist solely of decimal digits (ctype_digit on their string form); all other entries are dropped silently.
+ if (ctype_digit((string)$key) && ctype_digit((string)$value)) {
+ $imageMap[(int)$key] = (int)$value;
+ }
}
-
- // 清理括号内的冗余标点/标签
- $html = preg_replace('/(\(.*?\d+)(?:\s*<[^>]+>)*\s*\.*\s*(?:<[^>]+>)*(\s*.*?\))/is', '$1$2', $html);
- $html = preg_replace('/\(\s+/', '(', $html);
- $html = preg_replace('/\s+\)/', ')', $html);
+ $this->aImageMain = $imageMap;
+ }
+ /**
+ * 合并所有拆分标签的Figure+数字(含空白样式标签)
+ * @param string $html 待处理HTML
+ * @return string
+ */
+ private function preprocessSplitTags($html){ // rejoins "figure" and its number when style tags were closed and reopened between them
+ $styleTagsPattern = implode('|', self::STYLE_TAGS);
+
+ // Pattern 1: "figure", a closing style tag, an opening style tag, then digits -> rewritten as "figure <digits>" (the tag pair in between is dropped).
+ $pattern = "/(figure)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
+ $html = preg_replace_callback($pattern, function($matches) {
+ return $matches[1] . ' ' . $matches[2];
+ }, $html);
+
+ // Pattern 2: same, but with an extra whitespace-only style tag in between. NOTE(review): the \s* before (\s*) is greedy, so capture group 2 is always empty and the result is "figure" immediately followed by the digits.
+ $pattern2 = "/(figure)(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\s*)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
+ $html = preg_replace_callback($pattern2, function($matches) {
+ return $matches[1] . $matches[2] . $matches[3];
+ }, $html);
return $html;
}
-
/**
- * 从纯文本中提取所有Figure数字(兼容括号/标点/空格)
- * @param string $plainText
- * @return array
- */
- private function extractAllFigureMatches($plainText) {
- $allMatches = [];
- $processedNums = [];
-
- // 匹配带括号的Figure(如 (Figure 1.))
- $pattern1 = '/\(Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\)\s*([\.,:]{0,1})/iu';
- if (@preg_match_all($pattern1, $plainText, $matchesFull, PREG_SET_ORDER)) {
- foreach ($matchesFull as $match) {
- $num = $match[1];
- if (!ctype_digit($num) || in_array($num, $processedNums)) continue;
- $processedNums[] = $num;
- $allMatches[$num] = [
- 'hasOuterBracket' => true,
- 'validPunct' => $match[2] ?? '',
- 'content' => "Figure {$num}"
- ];
- }
- }
-
- // 匹配无括号的Figure(如 Figure 1.)
- $pattern2 = '/Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\s*([\.,:]{0,1})/iu';
- if (@preg_match_all($pattern2, $plainText, $matchesOther, PREG_SET_ORDER)) {
- foreach ($matchesOther as $match) {
- $num = $match[1];
- if (!ctype_digit($num) || in_array($num, $processedNums)) continue;
- $processedNums[] = $num;
- $allMatches[$num] = [
- 'hasOuterBracket' => false,
- 'validPunct' => $match[2] ?? '',
- 'content' => "Figure {$num}"
- ];
- }
- }
-
- krsort($allMatches);
- return $allMatches;
- }
-
- /**
- * 将匹配的Figure替换为myfigure标签(优化标签格式)
- * @param string $html
- * @param array $allMatches
- * @param bool $hasReplace
+ * 核心替换逻辑:将纯数字Figure替换为myfigure标签
+ * @param string $html 待处理HTML
+ * @param bool $hasReplace 是否发生替换(引用传递)
* @return string
*/
- private function replaceFigureWithTag($html, $allMatches, &$hasReplace) {
- foreach ($allMatches as $num => $info) {
- $innerContent = $info['hasOuterBracket']
- ? "({$info['content']})"
- : $info['content'];
+ private function replaceFigureInHtml($html, &$hasReplace){
+ $styleTagsPattern = implode('|', self::STYLE_TAGS);
+ $styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*";
+ $styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*";
+
+ // 正则1:匹配括号内的纯数字Figure(如 (Figure 2)、(Figure 3))
+ // $pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iu";
+ $pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
+ $html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
+ $num = $matches[1];
+ $numInt = (int)$num;
+ $suffix = $matches[2] ?? '';
+
+ // 过滤条件:非数字、无映射、已处理过的标签
+ if (!ctype_digit($num) || !isset($this->aImageMain[$numInt]) ||
+ $this->isMatchPositionHasMyFigureTag($matches[0], "Figure {$num}")) {
+ return $matches[0];
+ }
+
+ // 执行替换
+ $primaryId = $this->aImageMain[$numInt];
+ $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Figure {$num}" . self::PROCESSED_TAG . ">";
+ $target = "({$baseTag}{$suffix})";
- //Figure 1
- $targetTag = "{$innerContent}";
- if (!empty($info['validPunct']) && !$info['hasOuterBracket']) {
- $targetTag .= $info['validPunct'];
+ $hasReplace = true;
+ return $target;
+ }, $html);
+
+ // 正则2:匹配无括号的纯数字Figure(如 Figure 2、Figure 3:)
+ // $pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z0-9])/iu";
+ $pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
+ $html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
+ $num = $matches[1];
+ $numInt = (int)$num;
+ $suffix = $matches[2] ?? '';
+
+ // 过滤条件:非数字、无映射、已处理过的标签
+ if (!ctype_digit($num) || !isset($this->aImageMain[$numInt]) ||
+ $this->isMatchPositionHasMyFigureTag($matches[0], "Figure {$num}")) {
+ return $matches[0];
}
- $patternSuffix = '(?!\p{L}|\s+\p{L})';
- $pattern = $info['hasOuterBracket']
- ? '/\(\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*\)/iu'
- : '/\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*([\.,:]{0,1})/iu';
+ // 执行替换
+ $primaryId = $this->aImageMain[$numInt];
+ $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Figure {$num}" . self::PROCESSED_TAG . ">";
+ $target = "{$baseTag}{$suffix}";
+
+ $hasReplace = true;
+ return $target;
+ }, $html);
- //执行替换(最多替换1次,避免重复)
- $html = @preg_replace($pattern, $targetTag, $html, 1, $count);
- if ($count > 0) {
- $hasReplace = true;
- error_log("[FigureTagProcessor] 替换成功 - ID:{$num} 括号:".($info['hasOuterBracket']?'是':'否'));
- }
- }
return $html;
}
-
/**
- * 清理myfigure标签周围的冗余样式标签(适配新标签格式)
- * @param string $html
+ * 检测当前匹配内容是否已包含myfigure标签(避免重复替换)
+ * @param string $content 匹配的文本片段
+ * @param string $figureText 待检测的Figure文本(如 Figure 2)
+ * @return bool
+ */
+ private function isMatchPositionHasMyFigureTag($content, $figureText){ // true when $content already contains $figureText wrapped directly in a PROCESSED_TAG element
+ $escapedText = preg_quote($figureText, '/'); // make the literal text safe to embed in a "/"-delimited pattern
+ $pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is'; // open tag with any attributes, optional whitespace, the text, optional whitespace, close tag; case-insensitive
+ return (bool)preg_match($pattern, $content);
+ }
+ /**
+ * 清理myfigure标签周围的冗余样式标签
+ * @param string $html 待处理HTML
* @return string
*/
- private function cleanRedundantStyles($html) {
+ private function cleanRedundantStyles($html){
foreach (self::STYLE_TAGS as $tag) {
- $pattern = '/<' . $tag . '>\s*]*)>(.*?)<\/myfigure>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
- $html = @preg_replace($pattern, '$2$3', $html);
+ $pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
+ $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2'.self::PROCESSED_TAG.'>$3', $html);
}
- //清理闭标签
+
+ // 清理无匹配的闭合样式标签
$html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html);
+
return $html;
}
-
/**
- * 清理myfigure标签后的冗余标点(适配新标签格式)
- * @param string $html
+ * 清理myfigure标签周围的冗余标点
+ * @param string $html 待处理HTML
* @return string
*/
- private function cleanRedundantPunctuation($html) {
- $html = preg_replace('/\(Figure \d+\)<\/myfigure>\)\./i', '(Figure $1).', $html);
- $html = preg_replace('/<\/myfigure>\)\.([\.,:]{0,1})/', ')$1', $html);
- $html = preg_replace('/<\/myfigure>\.\)([\.,:]{0,1})/', ')$1', $html);
- $html = preg_replace('/<\/myfigure>([\.,:]){2,}/', '$1', $html);
- $html = preg_replace('/\((Figure \d+)\s*<\/myfigure>([\.,:]{0,1})/i',
- '($2)$3', $html);
+ private function cleanRedundantPunctuation($html){
+ // 修复括号+标点的冗余格式
+ $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Figure \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i',
+ '<'.self::PROCESSED_TAG.' data-id="$1">(Figure $1)'.self::PROCESSED_TAG.'>.', $html);
+ $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', ''.self::PROCESSED_TAG.'>)$1', $html);
+ $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', ''.self::PROCESSED_TAG.'>)$1', $html);
+
+ // 清理重复标点
+ $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', ''.self::PROCESSED_TAG.'>$1', $html);
+
+ // 修复括号内的标签冗余
+ $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Figure \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i',
+ '<'.self::PROCESSED_TAG.' data-id="$1">($2)'.self::PROCESSED_TAG.'>$3', $html);
+
return $html;
}
-
/**
- * 清理孤立的样式标签
- * @param string $html
+ * 清理未闭合的样式标签
+ * @param string $html 待处理HTML
* @return string
*/
- private function cleanUnclosedTags($html) {
+ private function cleanUnclosedTags($html){
foreach (self::STYLE_TAGS as $tag) {
- $html = @preg_replace('/(<\/myfigure>)\s*<\/' . $tag . '>/i', '$1', $html);
- }
- foreach (self::STYLE_TAGS as $tag) {
- @preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
- @preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
-
+ // 清理myfigure标签后的冗余闭合标签
+ $html = preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html);
+
+ // 定位所有该标签的开闭标签位置
+ preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
+ preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
+
$allTags = [];
+ // 收集开标签
foreach ($openMatches[0] as $m) {
$allTags[] = [
- 'offset' => $m[1],
- 'type' => 'open',
- 'content' => $m[0],
+ 'offset' => $m[1],
+ 'type' => 'open',
+ 'content' => $m[0],
'length' => strlen($m[0])
];
}
+ // 收集闭标签
foreach ($closeMatches[0] as $m) {
$allTags[] = [
- 'offset' => $m[1],
- 'type' => 'close',
- 'content' => $m[0],
+ 'offset' => $m[1],
+ 'type' => 'close',
+ 'content' => $m[0],
'length' => strlen($m[0])
];
}
+
+ // 按位置排序
usort($allTags, function($a, $b) {
return $a['offset'] - $b['offset'];
});
+ // 栈结构匹配开闭标签
$tagStack = [];
$removeOffsets = [];
foreach ($allTags as $t) {
- if ($t['type'] == 'open') {
+ if ($t['type'] === 'open') {
array_push($tagStack, $t);
} else {
if (!empty($tagStack)) {
array_pop($tagStack);
} else {
- $removeOffsets[] = [
- 'pos' => $t['offset'],
- 'len' => $t['length'],
- 'content' => $t['content']
- ];
+ // 无匹配开标签的闭标签,标记删除
+ $removeOffsets[] = $t;
}
}
}
+
+ // 无匹配闭标签的开标签,标记删除
foreach ($tagStack as $t) {
- $removeOffsets[] = [
- 'pos' => $t['offset'],
- 'len' => $t['length'],
- 'content' => $t['content']
- ];
+ $removeOffsets[] = $t;
}
- // 倒序删除,避免偏移错乱
+ // 按偏移量倒序删除(避免影响后续偏移)
usort($removeOffsets, function($a, $b) {
- return $b['pos'] - $a['pos'];
+ return $b['offset'] - $a['offset'];
});
+
foreach ($removeOffsets as $item) {
- if ($item['pos'] >= 0 && $item['pos'] < strlen($html)) {
- $html = substr_replace($html, '', $item['pos'], $item['len']);
+ if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) {
+ $html = substr_replace($html, '', $item['offset'], $item['length']);
}
}
}
- return $html;
- }
+ return $html;
+ }
/**
- * 优化文本格式(合并多余空格,规范myfigure标签前后空格)
- * @param string $html
+ * 优化文本格式(清理多余空格)
+ * @param string $html 待处理HTML
* @return string
*/
- private function optimizeFormat($html) {
+ private function optimizeFormat($html){
+ // 清理连续空格
$html = preg_replace('/\s{2,}/', ' ', trim($html));
- $html = preg_replace('/<\/myfigure>([A-Za-z0-9])/is', ' $1', $html);
- $html = preg_replace('/([a-zA-Z0-9])([A-Za-z0-9])/is', ''.self::PROCESSED_TAG.'> $1', $html);
+ // 字母/数字紧跟标签前时加空格
+ $html = preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html);
+
+ return $html;
+ }
+ /**
+ * 清理嵌套的myfigure标签(避免重复嵌套)
+ * @param string $html 待处理HTML
+ * @return string
+ */
+ private function cleanDuplicateNestedTags($html){
+ $pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is';
+ $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2'.self::PROCESSED_TAG.'>', $html);
+
return $html;
}
}
\ No newline at end of file
diff --git a/application/common/TableTagProcessor.php b/application/common/TableTagProcessor.php
index 3a7e50f..9ae92ec 100644
--- a/application/common/TableTagProcessor.php
+++ b/application/common/TableTagProcessor.php
@@ -2,59 +2,71 @@
namespace app\common;
/**
- * Table标签处理器(生产环境终极版)
* 功能:精准匹配并替换Table相关格式为mytable标签
- * 支持格式:table 数字、(table 数字)、table 数字:/table 数字.(含嵌套标签)
- * 特性:支持任意嵌套标签/括号、不处理数字+字母、仅跳过已被mytable包裹的Table
+ * 支持格式:table 数字、(table 数字)、table 数字:/table 数字.(含嵌套/拆分标签)
+ * 跳过已被mytable包裹的table(含后缀)
+ * 跳过table 数字+字母/数字后缀(含拆分标签场景,无论是否有空白)
+ * 正常处理table 数字+空白/样式标签场景
*/
-class TableTagProcessor {
- // 可配置的样式标签列表
- const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em'];
- // 最大处理字符串长度
- const MAX_HTML_LENGTH = 100000;
- // 目标替换标签
- const PROCESSED_TAG = 'mytable';
- // 数据库表格ID映射
+class TableTagProcessor{
+ // 支持的样式标签列表
+ private const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue'];
+ // HTML文本最大处理长度(防止内存溢出)
+ private const MAX_HTML_LENGTH = 100000;
+ // 替换后的目标标签名
+ private const PROCESSED_TAG = 'mytable';
+ // Table数字与对应ID的映射数组
private $aTableMain = [];
/**
- * 处理Table文本,替换为mytable标签并清理冗余内容
+ * 处理Table标签替换的主方法
* @param string $html 待处理的HTML文本
- * @param array $aTableMain Table数字→主键ID的映射数组(如 [1=>1001, 2=>1002])
- * @return array ['status'=>状态码, 'data'=>处理后文本]
- * status: 2-空输入, 4-无匹配/已处理, 5-处理异常, 1-处理成功
+ * @param array $aTableMain Table数字=>ID的映射数组(可选,默认1=>1~10=>10)
+ * @return array ['status' => 状态码, 'data' => 处理后文本]
+ * status说明:2-空文本, 4-无匹配/已处理, 1-处理成功, 5-处理异常
*/
- public function dealTableStr($html = '', $aTableMain = []) {
- //验证
- if (!is_string($html) || trim($html) === '') {
+ public function dealTableStr($html = '', $aTableMain = []){
+ // 初始化默认映射数组(仅当入参为空时使用)
+ $defaultTableMap = [1=>1,2=>2,3=>3,4=>4,5=>5,6=>6,7=>7,8=>8,9=>9,10=>10];
+ // 优先使用入参,入参为空则用默认值
+ $tableMap = !empty($aTableMain) ? $aTableMain : $defaultTableMap;
+
+ // 空文本校验
+ $html = is_string($html) ? trim($html) : ''; // check the type before trim(): non-string input is treated as empty text (status 2) instead of reaching trim()
+ if ($html === '') {
return ['status' => 2, 'data' => ''];
}
- //超大字符串拦截(防止内存溢出)
+
+ // 超长文本保护
if (strlen($html) > self::MAX_HTML_LENGTH) {
return ['status' => 4, 'data' => $html];
}
- //初始化主键映射数组
- if(!empty($aTableMain)){
- $aTableMainNew = [];
- foreach ($aTableMain as $key => $value) {
- if (!ctype_digit((string)$key) || !ctype_digit((string)$value)) {
- continue;
- }
- $keyInt = (int)$key;
- $aTableMainNew[$keyInt + 1] = $value;
- }
- $this->aTableMain = $aTableMainNew;
+ // 编码处理(统一转为UTF-8,避免中文乱码)
+ if (!mb_check_encoding($html, 'UTF-8')) {
+ $html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1');
}
+ // 初始化映射数组(过滤非数字键值)
+ $this->initTableMap($tableMap);
+
+ // 原始内容(异常时返回)
$originalHtml = $html;
$hasReplace = false;
try {
- //原始HTML中匹配所有符合规则的Table
+ // 只要包含数字+字母/数字后缀,直接返回原内容
+ if ($this->hasTableSuffix($html)) {
+ return ['status' => 4, 'data' => $html];
+ }
+
+ // 合并拆分标签的Table+数字
+ $html = $this->preprocessSplitTags($html);
+
+ // 核心替换逻辑
$html = $this->replaceTableInHtml($html, $hasReplace);
- // 清理冗余内容
+ // 清理冗余样式/标签(仅当发生替换时执行)
if ($hasReplace) {
$html = $this->cleanRedundantStyles($html);
$html = $this->cleanRedundantPunctuation($html);
@@ -74,50 +86,120 @@ class TableTagProcessor {
}
/**
- * 核心方法:直接在HTML中匹配并替换Table
+ * 全局检测是否包含Table数字+字母/数字后缀
+ * 覆盖所有拆分/嵌套/无标签场景,无论是否有空白
+ * @param string $html 待检测HTML
+ * @return bool
+ */
+ private function hasTableSuffix($html){ // true when $html contains "table <digits>" directly followed by a letter or digit; dealTableStr then returns the text untouched with status 4
+ $styleTagsPattern = implode('|', self::STYLE_TAGS);
+
+ // Pattern 1: plain text ("Table 4B", "Table 4123"). NOTE(review): \d+ can give back a digit, so "Table 10" also matches even though dealTableStr's default map has an entry for 10 - confirm this is intended.
+ $pattern1 = "/table\s*\d+[a-zA-Z0-9]/iu";
+
+ // Pattern 2: split-tag case - "table", the digits and the suffix character each end/start a separate style tag (close tag, optional whitespace or "|" characters, open tag between them).
+ $pattern2 = "/table\s*(?:<\/(?:{$styleTagsPattern})>)\s*[\s| ]*\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)\s*(?:<\/(?:{$styleTagsPattern})>)\s*[\s| ]*\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*([a-zA-Z0-9])/iu";
+
+ // Pattern 3: nested case - "table" followed by one style tag that wraps the digits and the suffix character together.
+ $pattern3 = "/table\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*\d+[a-zA-Z0-9]\s*(?:<\/(?:{$styleTagsPattern})>)/iu";
+
+ // The @ operator hides PCRE warnings; a failing preg_match returns false and therefore simply counts as "no match" here.
+ return @preg_match($pattern1, $html) || @preg_match($pattern2, $html) || @preg_match($pattern3, $html);
+ }
+
+ /**
+ * 初始化Table数字映射数组(过滤非数字键值)
+ * @param array $aTableMain 原始映射数组
+ * @return void
+ */
+ private function initTableMap($aTableMain = []){ // stores a sanitized copy of the number=>ID map in $this->aTableMain
+ if (!is_array($aTableMain)) {
+ $aTableMain = [];
+ }
+
+ $tableMap = [];
+ foreach ($aTableMain as $key => $value) {
+ // Keep only entries whose key and value both consist solely of decimal digits (ctype_digit on their string form); all other entries are dropped silently.
+ if (ctype_digit((string)$key) && ctype_digit((string)$value)) {
+ $tableMap[(int)$key] = (int)$value;
+ }
+ }
+
+ $this->aTableMain = $tableMap;
+ }
+
+ /**
+ * 合并所有拆分标签的Table+数字(含空白样式标签)
+ * @param string $html 待处理HTML
* @return string
*/
- private function replaceTableInHtml($html, &$hasReplace) {
+ private function preprocessSplitTags($html){ // rejoins "table" and its number when style tags were closed and reopened between them
$styleTagsPattern = implode('|', self::STYLE_TAGS);
- $styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*"; // 匹配任意嵌套样式标签
+
+ // Pattern 1: "table", a closing style tag, an opening style tag, then digits -> rewritten as "table <digits>" (the tag pair in between is dropped).
+ $pattern = "/(table)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
+ $html = @preg_replace_callback($pattern, function($matches) {
+ return $matches[1] . ' ' . $matches[2];
+ }, $html);
+
+ // Pattern 2: same, but with an extra whitespace-only style tag in between. NOTE(review): preg_replace_callback returns null on a PCRE error and the @ hides the warning, so $html could silently become null - verify callers tolerate that.
+ $pattern2 = "/(table)(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\s*)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
+ $html = @preg_replace_callback($pattern2, function($matches) {
+ return $matches[1] . $matches[2] . $matches[3];
+ }, $html);
+
+ return $html;
+ }
+
+ /**
+ * 核心替换逻辑:将纯数字Table替换为mytable标签
+ * @param string $html 待处理HTML
+ * @param bool $hasReplace 是否发生替换(引用传递)
+ * @return string
+ */
+ private function replaceTableInHtml($html, &$hasReplace){
+ $styleTagsPattern = implode('|', self::STYLE_TAGS);
+ $styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*";
$styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*";
- // 规则1:匹配带括号的Table(如 (Table 82)、(Table 1.))
- $pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*\)/iu";
- $html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
+ // 正则1:匹配括号内的纯数字Table(如 (Table 2)、(Table 3))
+ $pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
+ $html = @preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
$num = $matches[1];
- $numInt = intval($num);
+ $numInt = (int)$num;
$suffix = $matches[2] ?? '';
- // 校验:纯数字 + 有映射ID + 未被mytable包裹(避免重复替换)
+ // 过滤条件:非数字、无映射、已处理过的标签
if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) ||
$this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) {
return $matches[0];
}
+ // 执行替换
$primaryId = $this->aTableMain[$numInt];
- $baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}".self::PROCESSED_TAG.">";
+ $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Table {$num}" . self::PROCESSED_TAG . ">";
$target = "({$baseTag}{$suffix})";
$hasReplace = true;
return $target;
}, $html);
- // 规则2:匹配无括号的Table(如 Table 1、Table 2:、Table 3.)
- $pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z])/iu";
- $html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
+ // 正则2:匹配无括号的纯数字Table(如 Table 2、Table 3:)
+ $pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
+ $html = @preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
$num = $matches[1];
- $numInt = intval($num);
+ $numInt = (int)$num;
$suffix = $matches[2] ?? '';
- // 校验:纯数字 + 有映射ID + 未被mytable包裹 + 不是数字+字母组合
+ // 过滤条件:非数字、无映射、已处理过的标签
if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) ||
$this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) {
return $matches[0];
}
+ // 执行替换
$primaryId = $this->aTableMain[$numInt];
- $baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}".self::PROCESSED_TAG.">";
+ $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Table {$num}" . self::PROCESSED_TAG . ">";
$target = "{$baseTag}{$suffix}";
$hasReplace = true;
@@ -127,126 +209,157 @@ class TableTagProcessor {
return $html;
}
+ /**
+ * 检测当前匹配内容是否已包含mytable标签(避免重复替换)
+ * @param string $content 匹配的文本片段
+ * @param string $tableText 待检测的Table文本(如 Table 2)
+ * @return bool
+ */
+ private function isMatchPositionHasMyTableTag($content, $tableText){ // true when $content already contains $tableText wrapped directly in a PROCESSED_TAG element
+ $escapedText = preg_quote($tableText, '/'); // make the literal text safe to embed in a "/"-delimited pattern
+ $pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is'; // open tag with any attributes, optional whitespace, the text, optional whitespace, close tag; case-insensitive
+ return (bool)@preg_match($pattern, $content); // a PCRE failure (false) is cast to false, i.e. treated as "not wrapped"
+ }
+
/**
* 清理mytable标签周围的冗余样式标签
- * @param string $html
+ * @param string $html 待处理HTML
* @return string
*/
- private function cleanRedundantStyles($html) {
+ private function cleanRedundantStyles($html){
foreach (self::STYLE_TAGS as $tag) {
$pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
$html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2'.self::PROCESSED_TAG.'>$3', $html);
}
- // 清理孤立的样式闭标签(避免标签残留)
- $html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html);
+
+ // 清理无匹配的闭合样式标签
+ $html = @preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html);
+
return $html;
}
/**
- * 清理mytable标签后的冗余标点(保证格式整洁)
- * @param string $html
+ * 清理mytable标签周围的冗余标点
+ * @param string $html 待处理HTML
* @return string
*/
- private function cleanRedundantPunctuation($html) {
- $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1)'.self::PROCESSED_TAG.'>.', $html);
- $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', ''.self::PROCESSED_TAG.'>)$1', $html);
- $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', ''.self::PROCESSED_TAG.'>)$1', $html);
- $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', ''.self::PROCESSED_TAG.'>$1', $html);
- $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i',
+ private function cleanRedundantPunctuation($html){
+ // 修复括号+标点的冗余格式
+ $html = @preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i',
+ '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1)'.self::PROCESSED_TAG.'>.', $html);
+ $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', ''.self::PROCESSED_TAG.'>)$1', $html);
+ $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', ''.self::PROCESSED_TAG.'>)$1', $html);
+
+ // 清理重复标点
+ $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', ''.self::PROCESSED_TAG.'>$1', $html);
+
+ // 修复括号内的标签冗余
+ $html = @preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i',
'<'.self::PROCESSED_TAG.' data-id="$1">($2)'.self::PROCESSED_TAG.'>$3', $html);
+
return $html;
}
/**
- * 清理孤立的样式标签(栈算法兜底,避免标签不闭合)
- * @param string $html
+ * 清理未闭合的样式标签
+ * @param string $html 待处理HTML
* @return string
*/
- private function cleanUnclosedTags($html) {
- // 清理mytable后孤立的样式闭标签
+ private function cleanUnclosedTags($html){
foreach (self::STYLE_TAGS as $tag) {
+ // 清理mytable标签后的冗余闭合标签
$html = @preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html);
- }
- // 栈算法清理其他孤立标签
- foreach (self::STYLE_TAGS as $tag) {
+ // 定位所有该标签的开闭标签位置
@preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
@preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
-
+
$allTags = [];
+ // 收集开标签
foreach ($openMatches[0] as $m) {
- $allTags[] = ['offset' => $m[1], 'type' => 'open', 'content' => $m[0], 'length' => strlen($m[0])];
+ $allTags[] = [
+ 'offset' => $m[1],
+ 'type' => 'open',
+ 'content' => $m[0],
+ 'length' => strlen($m[0])
+ ];
}
+ // 收集闭标签
foreach ($closeMatches[0] as $m) {
- $allTags[] = ['offset' => $m[1], 'type' => 'close', 'content' => $m[0], 'length' => strlen($m[0])];
+ $allTags[] = [
+ 'offset' => $m[1],
+ 'type' => 'close',
+ 'content' => $m[0],
+ 'length' => strlen($m[0])
+ ];
}
+
+ // 按位置排序
usort($allTags, function($a, $b) {
return $a['offset'] - $b['offset'];
});
+ // 栈结构匹配开闭标签
$tagStack = [];
$removeOffsets = [];
foreach ($allTags as $t) {
- if ($t['type'] == 'open') {
+ if ($t['type'] === 'open') {
array_push($tagStack, $t);
} else {
if (!empty($tagStack)) {
array_pop($tagStack);
} else {
+ // 无匹配开标签的闭标签,标记删除
$removeOffsets[] = $t;
}
}
}
+
+ // 无匹配闭标签的开标签,标记删除
foreach ($tagStack as $t) {
$removeOffsets[] = $t;
}
- // 倒序删除,避免偏移错乱
+ // 按偏移量倒序删除(避免影响后续偏移)
usort($removeOffsets, function($a, $b) {
return $b['offset'] - $a['offset'];
});
+
foreach ($removeOffsets as $item) {
if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) {
$html = substr_replace($html, '', $item['offset'], $item['length']);
}
}
}
- return $html;
- }
- /**
- * 优化文本格式(合并多余空格,规范标签前后空格)
- * @param string $html
- * @return string
- */
- private function optimizeFormat($html) {
- $html = preg_replace('/\s{2,}/', ' ', trim($html));
- $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', ''.self::PROCESSED_TAG.'> $1', $html);
- $html = preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html);
return $html;
}
/**
- * 清理重复嵌套的mytable标签(兜底方案)
- * @param string $html
+ * 优化文本格式(清理多余空格)
+ * @param string $html 待处理HTML
* @return string
*/
- private function cleanDuplicateNestedTags($html) {
+ private function optimizeFormat($html){
+ // 清理连续空格
+ $html = @preg_replace('/\s{2,}/', ' ', trim($html));
+ // 标签后紧跟字母/数字时加空格
+ $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', ''.self::PROCESSED_TAG.'> $1', $html);
+ // 字母/数字紧跟标签前时加空格
+ $html = @preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html);
+
+ return $html;
+ }
+
+ /**
+ * 清理嵌套的mytable标签(避免重复嵌套)
+ * @param string $html 待处理HTML
+ * @return string
+ */
+ private function cleanDuplicateNestedTags($html){
$pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is';
- $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2'.self::PROCESSED_TAG.'>', $html);
+ $html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2'.self::PROCESSED_TAG.'>', $html);
+
return $html;
}
-
- /**
- * 判断指定Table内容是否被mytable标签包裹
- * @param string $content 待检查内容
- * @param string $tableText Table文本(如 "Table 1")
- * @return bool
- */
- private function isMatchPositionHasMyTableTag($content, $tableText) {
- $escapedText = preg_quote($tableText, '/');
- $pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is';
- return @preg_match($pattern, $content) === 1;
- }
-
}
\ No newline at end of file