Merge remote-tracking branch 'origin/master'

This commit is contained in:
wangjinlei
2026-01-20 14:18:10 +08:00
2 changed files with 445 additions and 269 deletions

View File

@@ -1,52 +1,63 @@
<?php <?php
namespace app\common; namespace app\common;
class FigureTagProcessor { /**
// 可配置的样式标签列表(解耦) * 功能精准匹配并替换Figure相关格式为myfigure标签
const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue']; * 支持格式figure 数字、(figure 数字)、figure 数字:/figure 数字.(含嵌套/拆分标签)
// 最大处理字符串长度(避免内存溢出 * 跳过已被myfigure包裹的Figure含后缀
const MAX_HTML_LENGTH = 100000; * 跳过Figure 数字+字母/数字后缀(含拆分标签场景,无论是否有空白)
* 正常处理Figure 数字+空白/样式标签场景
*/
class FigureTagProcessor{
//支持的样式标签列表
private const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue'];
//HTML文本最大处理长度防止内存溢出
private const MAX_HTML_LENGTH = 100000;
//替换后的目标标签名
private const PROCESSED_TAG = 'myfigure';
//Figure数字与对应ID的映射数组
private $aImageMain = [];
/** /**
* 处理Figure文本替换为myfigure标签并清理冗余内容 * 处理Figure标签替换的主方法
* @param string $html 待处理的HTML文本 * @param string $html 待处理的HTML文本
* @return array ['status'=>状态码, 'data'=>处理后文本] * @param array $aImageMain Figure数字=>ID的映射数组
* status: 2-空输入, 4-无匹配, 5-处理异常, 1-处理成功 * @return array ['status' => 状态码, 'data' => 处理后文本]
* status说明2-空文本, 4-无匹配/已处理, 1-处理成功, 5-处理异常
*/ */
public function dealFigureStr($html = '') { public function dealFigureStr($html = '', $aImageMain = []){
//验 //空文本校
if (!is_string($html) || trim($html) === '') { $html = trim($html);
if ($html === '' || !is_string($html)) {
return ['status' => 2, 'data' => '']; return ['status' => 2, 'data' => ''];
} }
//超大字符串拦截 //超长文本保护
if (strlen($html) > self::MAX_HTML_LENGTH) { if (strlen($html) > self::MAX_HTML_LENGTH) {
return ['status' => 4, 'data' => $html]; return ['status' => 4, 'data' => $html];
} }
//编码处理
if (!mb_check_encoding($html, 'UTF-8')) {
$html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1');
}
//初始化映射数组(过滤非数字键值)
$this->initImageMap($aImageMain);
//原始内容
$originalHtml = $html; $originalHtml = $html;
$hasReplace = false; $hasReplace = false;
try { try {
//合并嵌套样式标签 //只要包含数字+字母/数字后缀,直接返回原内容(核心修复)
$mergedHtml = $this->mergeFragmentStyleTags($html); if ($this->hasFigureSuffix($html)) {
//提取纯文本用于匹配Figure return ['status' => 4, 'data' => $html];
$plainText = preg_replace('/<[^>]+>/', ' ', $mergedHtml);
$plainText = preg_replace('/\s+/', ' ', trim($plainText));
//提取所有匹配的Figure数字
$allMatches = $this->extractAllFigureMatches($plainText);
if (empty($allMatches)) {
return ['status' => 4, 'data' => $originalHtml];
} }
//合并拆分标签的Figure+数字
//替换为myfigure标签 $html = $this->preprocessSplitTags($html);
$html = $this->replaceFigureWithTag($html, $allMatches, $hasReplace); //替换
$html = $this->replaceFigureInHtml($html, $hasReplace);
//清理冗余内容(仅替换成功后执行) //清理冗余样式/标签
if ($hasReplace) { if ($hasReplace) {
$html = $this->cleanRedundantStyles($html); $html = $this->cleanRedundantStyles($html);
$html = $this->cleanRedundantPunctuation($html); $html = $this->cleanRedundantPunctuation($html);
$html = $this->cleanUnclosedTags($html); $html = $this->cleanUnclosedTags($html);
$html = $this->optimizeFormat($html); $html = $this->optimizeFormat($html);
$html = $this->cleanDuplicateNestedTags($html);
} }
} catch (\Throwable $e) { } catch (\Throwable $e) {
@@ -55,153 +66,190 @@ class FigureTagProcessor {
return [ return [
'status' => $hasReplace ? 1 : 4, 'status' => $hasReplace ? 1 : 4,
'data' => $hasReplace ? $html : $originalHtml 'data' => $html
]; ];
} }
/** /**
* 合并嵌套的样式标签 * 全局检测是否包含Figure数字+字母/数字后缀
* @param string $html * 覆盖所有拆分/嵌套/无标签场景,无论是否有空白
* @return string * @param string $html 待检测HTML
* @return bool
*/ */
private function mergeFragmentStyleTags($html) { private function hasFigureSuffix($html){
foreach (self::STYLE_TAGS as $tag) { $styleTagsPattern = implode('|', self::STYLE_TAGS);
$pattern = '/(?:<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>(?:\s*<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>/is';
while (@preg_match($pattern, $html)) { // 抑制正则警告 // 正则1无标签场景Figure 4B/4123
$html = preg_replace_callback($pattern, function($matches) { $pattern1 = "/figure\s*\d+[a-zA-Z0-9]/iu";
return trim($matches[1]) . ' ' . trim($matches[2]);
}, $html); // 正则2拆分标签场景<b>4</b><b>B</b> / <b>4</b> <b>B</b> / <b>4</b>&nbsp;<b>B</b>
$pattern2 = "/figure\s*(?:<\/(?:{$styleTagsPattern})>)\s*[\s|&nbsp;]*\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)\s*(?:<\/(?:{$styleTagsPattern})>)\s*[\s|&nbsp;]*\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*([a-zA-Z0-9])/iu";
// 正则3嵌套标签场景<b>4B</b> / <i>4123</i>
$pattern3 = "/figure\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*\d+[a-zA-Z0-9]\s*(?:<\/(?:{$styleTagsPattern})>)/iu";
return preg_match($pattern1, $html) || preg_match($pattern2, $html) || preg_match($pattern3, $html);
}
/**
* 初始化Figure数字映射数组
* @param array $aImageMain 原始映射数组
* @return void
*/
private function initImageMap($aImageMain){
if (!is_array($aImageMain)) {
$aImageMain = [];
}
$imageMap = [];
foreach ($aImageMain as $key => $value) {
// 严格校验键值均为数字
if (ctype_digit((string)$key) && ctype_digit((string)$value)) {
$imageMap[(int)$key] = (int)$value;
} }
} }
$this->aImageMain = $imageMap;
}
/**
* 合并所有拆分标签的Figure+数字(含空白样式标签)
* @param string $html 待处理HTML
* @return string
*/
private function preprocessSplitTags($html){
$styleTagsPattern = implode('|', self::STYLE_TAGS);
// 清理括号内的冗余标点/标签 // 正则1匹配基础拆分标签的Figure+数字
$html = preg_replace('/(\(.*?\d+)(?:\s*<[^>]+>)*\s*\.*\s*(?:<[^>]+>)*(\s*.*?\))/is', '$1$2', $html); $pattern = "/(figure)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
$html = preg_replace('/\(\s+/', '(', $html); $html = preg_replace_callback($pattern, function($matches) {
$html = preg_replace('/\s+\)/', ')', $html); return $matches[1] . ' ' . $matches[2];
}, $html);
// 正则2匹配多轮拆分标签的Figure+数字(含空白)
$pattern2 = "/(figure)(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\s*)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
$html = preg_replace_callback($pattern2, function($matches) {
return $matches[1] . $matches[2] . $matches[3];
}, $html);
return $html; return $html;
} }
/** /**
* 从纯文本中提取所有Figure数字兼容括号/标点/空格) * 核心替换逻辑将纯数字Figure替换为myfigure标签
* @param string $plainText * @param string $html 待处理HTML
* @return array * @param bool $hasReplace 是否发生替换(引用传递)
*/
private function extractAllFigureMatches($plainText) {
$allMatches = [];
$processedNums = [];
// 匹配带括号的Figure如 (Figure 1.)
$pattern1 = '/\(Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\)\s*([\.,:]{0,1})/iu';
if (@preg_match_all($pattern1, $plainText, $matchesFull, PREG_SET_ORDER)) {
foreach ($matchesFull as $match) {
$num = $match[1];
if (!ctype_digit($num) || in_array($num, $processedNums)) continue;
$processedNums[] = $num;
$allMatches[$num] = [
'hasOuterBracket' => true,
'validPunct' => $match[2] ?? '',
'content' => "Figure {$num}"
];
}
}
// 匹配无括号的Figure如 Figure 1.
$pattern2 = '/Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\s*([\.,:]{0,1})/iu';
if (@preg_match_all($pattern2, $plainText, $matchesOther, PREG_SET_ORDER)) {
foreach ($matchesOther as $match) {
$num = $match[1];
if (!ctype_digit($num) || in_array($num, $processedNums)) continue;
$processedNums[] = $num;
$allMatches[$num] = [
'hasOuterBracket' => false,
'validPunct' => $match[2] ?? '',
'content' => "Figure {$num}"
];
}
}
krsort($allMatches);
return $allMatches;
}
/**
* 将匹配的Figure替换为myfigure标签优化标签格式
* @param string $html
* @param array $allMatches
* @param bool $hasReplace
* @return string * @return string
*/ */
private function replaceFigureWithTag($html, $allMatches, &$hasReplace) { private function replaceFigureInHtml($html, &$hasReplace){
foreach ($allMatches as $num => $info) { $styleTagsPattern = implode('|', self::STYLE_TAGS);
$innerContent = $info['hasOuterBracket'] $styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*";
? "({$info['content']})" $styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*";
: $info['content'];
//<myfigure data-id="1">Figure 1</myfigure> // 正则1匹配括号内的纯数字Figure如 (Figure 2)、(<b>Figure 3</b>)
$targetTag = "<myfigure data-id=\"{$num}\">{$innerContent}</myfigure>"; // $pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iu";
if (!empty($info['validPunct']) && !$info['hasOuterBracket']) { $pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
$targetTag .= $info['validPunct']; $html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
$num = $matches[1];
$numInt = (int)$num;
$suffix = $matches[2] ?? '';
// 过滤条件:非数字、无映射、已处理过的标签
if (!ctype_digit($num) || !isset($this->aImageMain[$numInt]) ||
$this->isMatchPositionHasMyFigureTag($matches[0], "Figure {$num}")) {
return $matches[0];
} }
$patternSuffix = '(?!\p{L}|\s+\p{L})'; // 执行替换
$pattern = $info['hasOuterBracket'] $primaryId = $this->aImageMain[$numInt];
? '/\(\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*\)/iu' $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Figure {$num}</" . self::PROCESSED_TAG . ">";
: '/\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*([\.,:]{0,1})/iu'; $target = "({$baseTag}{$suffix})";
//执行替换最多替换1次避免重复 $hasReplace = true;
$html = @preg_replace($pattern, $targetTag, $html, 1, $count); return $target;
if ($count > 0) { }, $html);
$hasReplace = true;
error_log("[FigureTagProcessor] 替换成功 - ID:{$num} 括号:".($info['hasOuterBracket']?'是':'否')); // 正则2匹配无括号的纯数字Figure如 Figure 2、<i>Figure 3</i>:
// $pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z0-9])/iu";
$pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
$html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
$num = $matches[1];
$numInt = (int)$num;
$suffix = $matches[2] ?? '';
// 过滤条件:非数字、无映射、已处理过的标签
if (!ctype_digit($num) || !isset($this->aImageMain[$numInt]) ||
$this->isMatchPositionHasMyFigureTag($matches[0], "Figure {$num}")) {
return $matches[0];
} }
}
// 执行替换
$primaryId = $this->aImageMain[$numInt];
$baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Figure {$num}</" . self::PROCESSED_TAG . ">";
$target = "{$baseTag}{$suffix}";
$hasReplace = true;
return $target;
}, $html);
return $html; return $html;
} }
/** /**
* 清理myfigure标签周围的冗余样式标签适配新标签格式 * 检测当前匹配内容是否已包含myfigure标签避免重复替换
* @param string $html * @param string $content 匹配的文本片段
* @param string $figureText 待检测的Figure文本如 Figure 2
* @return bool
*/
private function isMatchPositionHasMyFigureTag($content, $figureText){
$escapedText = preg_quote($figureText, '/');
$pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is';
return (bool)preg_match($pattern, $content);
}
/**
* 清理myfigure标签周围的冗余样式标签
* @param string $html 待处理HTML
* @return string * @return string
*/ */
private function cleanRedundantStyles($html) { private function cleanRedundantStyles($html){
foreach (self::STYLE_TAGS as $tag) { foreach (self::STYLE_TAGS as $tag) {
$pattern = '/<' . $tag . '>\s*<myfigure([^>]*)>(.*?)<\/myfigure>([\.,:]{0,1})\s*<\/' . $tag . '>/is'; $pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
$html = @preg_replace($pattern, '<myfigure$1>$2</myfigure>$3', $html); $html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>$3', $html);
} }
//清理闭标签
// 清理无匹配的闭合样式标签
$html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html); $html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html);
return $html; return $html;
} }
/** /**
* 清理myfigure标签的冗余标点(适配新标签格式) * 清理myfigure标签周围的冗余标点
* @param string $html * @param string $html 待处理HTML
* @return string * @return string
*/ */
private function cleanRedundantPunctuation($html) { private function cleanRedundantPunctuation($html){
$html = preg_replace('/<myfigure data-id="(\d+)">\(Figure \d+\)<\/myfigure>\)\./i', '<myfigure data-id="$1">(Figure $1)</myfigure>.', $html); // 修复括号+标点的冗余格式
$html = preg_replace('/<\/myfigure>\)\.([\.,:]{0,1})/', '</myfigure>)$1', $html); $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Figure \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i',
$html = preg_replace('/<\/myfigure>\.\)([\.,:]{0,1})/', '</myfigure>)$1', $html); '<'.self::PROCESSED_TAG.' data-id="$1">(Figure $1)</'.self::PROCESSED_TAG.'>.', $html);
$html = preg_replace('/<\/myfigure>([\.,:]){2,}/', '</myfigure>$1', $html); $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
$html = preg_replace('/<myfigure data-id="(\d+)">\((Figure \d+)\s*<\/myfigure>([\.,:]{0,1})/i', $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
'<myfigure data-id="$1">($2)</myfigure>$3', $html);
// 清理重复标点
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '</'.self::PROCESSED_TAG.'>$1', $html);
// 修复括号内的标签冗余
$html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Figure \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i',
'<'.self::PROCESSED_TAG.' data-id="$1">($2)</'.self::PROCESSED_TAG.'>$3', $html);
return $html; return $html;
} }
/** /**
* 清理孤立的样式标签 * 清理未闭合的样式标签
* @param string $html * @param string $html 待处理HTML
* @return string * @return string
*/ */
private function cleanUnclosedTags($html) { private function cleanUnclosedTags($html){
foreach (self::STYLE_TAGS as $tag) { foreach (self::STYLE_TAGS as $tag) {
$html = @preg_replace('/(<\/myfigure>)\s*<\/' . $tag . '>/i', '$1', $html); // 清理myfigure标签后的冗余闭合标签
} $html = preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html);
foreach (self::STYLE_TAGS as $tag) {
@preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE); // 定位所有该标签的开闭标签位置
@preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE); preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
$allTags = []; $allTags = [];
// 收集开标签
foreach ($openMatches[0] as $m) { foreach ($openMatches[0] as $m) {
$allTags[] = [ $allTags[] = [
'offset' => $m[1], 'offset' => $m[1],
@@ -210,6 +258,7 @@ class FigureTagProcessor {
'length' => strlen($m[0]) 'length' => strlen($m[0])
]; ];
} }
// 收集闭标签
foreach ($closeMatches[0] as $m) { foreach ($closeMatches[0] as $m) {
$allTags[] = [ $allTags[] = [
'offset' => $m[1], 'offset' => $m[1],
@@ -218,57 +267,71 @@ class FigureTagProcessor {
'length' => strlen($m[0]) 'length' => strlen($m[0])
]; ];
} }
// 按位置排序
usort($allTags, function($a, $b) { usort($allTags, function($a, $b) {
return $a['offset'] - $b['offset']; return $a['offset'] - $b['offset'];
}); });
// 栈结构匹配开闭标签
$tagStack = []; $tagStack = [];
$removeOffsets = []; $removeOffsets = [];
foreach ($allTags as $t) { foreach ($allTags as $t) {
if ($t['type'] == 'open') { if ($t['type'] === 'open') {
array_push($tagStack, $t); array_push($tagStack, $t);
} else { } else {
if (!empty($tagStack)) { if (!empty($tagStack)) {
array_pop($tagStack); array_pop($tagStack);
} else { } else {
$removeOffsets[] = [ // 无匹配开标签的闭标签,标记删除
'pos' => $t['offset'], $removeOffsets[] = $t;
'len' => $t['length'],
'content' => $t['content']
];
} }
} }
} }
// 无匹配闭标签的开标签,标记删除
foreach ($tagStack as $t) { foreach ($tagStack as $t) {
$removeOffsets[] = [ $removeOffsets[] = $t;
'pos' => $t['offset'],
'len' => $t['length'],
'content' => $t['content']
];
} }
// 倒序删除避免偏移错乱 // 按偏移量倒序删除避免影响后续偏移)
usort($removeOffsets, function($a, $b) { usort($removeOffsets, function($a, $b) {
return $b['pos'] - $a['pos']; return $b['offset'] - $a['offset'];
}); });
foreach ($removeOffsets as $item) { foreach ($removeOffsets as $item) {
if ($item['pos'] >= 0 && $item['pos'] < strlen($html)) { if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) {
$html = substr_replace($html, '', $item['pos'], $item['len']); $html = substr_replace($html, '', $item['offset'], $item['length']);
} }
} }
} }
return $html; return $html;
} }
/** /**
* 优化文本格式(合并多余空格规范myfigure标签前后空格) * 优化文本格式(清理多余空格)
* @param string $html * @param string $html 待处理HTML
* @return string * @return string
*/ */
private function optimizeFormat($html) { private function optimizeFormat($html){
// 清理连续空格
$html = preg_replace('/\s{2,}/', ' ', trim($html)); $html = preg_replace('/\s{2,}/', ' ', trim($html));
$html = preg_replace('/<\/myfigure>([A-Za-z0-9])/is', '</myfigure> $1', $html); // 标签后紧跟字母/数字时加空格
$html = preg_replace('/([a-zA-Z0-9])<myfigure/is', '$1 <myfigure', $html); $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', '</'.self::PROCESSED_TAG.'> $1', $html);
// 字母/数字紧跟标签前时加空格
$html = preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html);
return $html;
}
/**
* 清理嵌套的myfigure标签避免重复嵌套
* @param string $html 待处理HTML
* @return string
*/
private function cleanDuplicateNestedTags($html){
$pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is';
$html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>', $html);
return $html; return $html;
} }
} }

View File

@@ -2,59 +2,71 @@
namespace app\common; namespace app\common;
/** /**
* Table标签处理器生产环境终极版
* 功能精准匹配并替换Table相关格式为mytable标签 * 功能精准匹配并替换Table相关格式为mytable标签
* 支持格式table 数字、(table 数字)、table 数字:/table 数字.(含嵌套标签) * 支持格式table 数字、(table 数字)、table 数字:/table 数字.(含嵌套/拆分标签)
* 特性:支持任意嵌套标签/括号、不处理数字+字母、仅跳过已被mytable包裹的Table * 跳过已被mytable包裹的table(含后缀)
* 跳过table 数字+字母/数字后缀(含拆分标签场景,无论是否有空白)
* 正常处理table 数字+空白/样式标签场景
*/ */
class TableTagProcessor { class TableTagProcessor{
// 可配置的样式标签列表 // 支持的样式标签列表
const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em']; private const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue'];
// 最大处理字符串长度 // HTML文本最大处理长度防止内存溢出
const MAX_HTML_LENGTH = 100000; private const MAX_HTML_LENGTH = 100000;
// 目标替换标签 // 替换后的目标标签
const PROCESSED_TAG = 'mytable'; private const PROCESSED_TAG = 'mytable';
// 数据库表格ID映射 // Table数字与对应ID映射数组
private $aTableMain = []; private $aTableMain = [];
/** /**
* 处理Table文本替换为mytable标签并清理冗余内容 * 处理Table标签替换的主方法
* @param string $html 待处理的HTML文本 * @param string $html 待处理的HTML文本
* @param array $aTableMain Table数字→主键ID的映射数组如 [1=>1001, 2=>1002] * @param array $aTableMain Table数字=>ID的映射数组可选默认1=>1~10=>10
* @return array ['status'=>状态码, 'data'=>处理后文本] * @return array ['status' => 状态码, 'data' => 处理后文本]
* status: 2-空输入, 4-无匹配/已处理, 5-处理异常, 1-处理成功 * status说明:2-空文本, 4-无匹配/已处理, 1-处理成功, 5-处理异常
*/ */
public function dealTableStr($html = '', $aTableMain = []) { public function dealTableStr($html = '', $aTableMain = []){
//验证 // 初始化默认映射数组(仅当入参为空时使用)
if (!is_string($html) || trim($html) === '') { $defaultTableMap = [1=>1,2=>2,3=>3,4=>4,5=>5,6=>6,7=>7,8=>8,9=>9,10=>10];
// 优先使用入参,入参为空则用默认值
$tableMap = !empty($aTableMain) ? $aTableMain : $defaultTableMap;
// 空文本校验
$html = trim($html);
if ($html === '' || !is_string($html)) {
return ['status' => 2, 'data' => '']; return ['status' => 2, 'data' => ''];
} }
//超大字符串拦截(防止内存溢出)
// 超长文本保护
if (strlen($html) > self::MAX_HTML_LENGTH) { if (strlen($html) > self::MAX_HTML_LENGTH) {
return ['status' => 4, 'data' => $html]; return ['status' => 4, 'data' => $html];
} }
//初始化主键映射数组 // 编码处理统一转为UTF-8避免中文乱码
if(!empty($aTableMain)){ if (!mb_check_encoding($html, 'UTF-8')) {
$aTableMainNew = []; $html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1');
foreach ($aTableMain as $key => $value) {
if (!ctype_digit((string)$key) || !ctype_digit((string)$value)) {
continue;
}
$keyInt = (int)$key;
$aTableMainNew[$keyInt + 1] = $value;
}
$this->aTableMain = $aTableMainNew;
} }
// 初始化映射数组(过滤非数字键值)
$this->initTableMap($tableMap);
// 原始内容(异常时返回)
$originalHtml = $html; $originalHtml = $html;
$hasReplace = false; $hasReplace = false;
try { try {
//原始HTML中匹配所有符合规则的Table // 只要包含数字+字母/数字后缀,直接返回原内容
if ($this->hasTableSuffix($html)) {
return ['status' => 4, 'data' => $html];
}
// 合并拆分标签的Table+数字
$html = $this->preprocessSplitTags($html);
// 核心替换逻辑
$html = $this->replaceTableInHtml($html, $hasReplace); $html = $this->replaceTableInHtml($html, $hasReplace);
// 清理冗余内容 // 清理冗余样式/标签(仅当发生替换时执行)
if ($hasReplace) { if ($hasReplace) {
$html = $this->cleanRedundantStyles($html); $html = $this->cleanRedundantStyles($html);
$html = $this->cleanRedundantPunctuation($html); $html = $this->cleanRedundantPunctuation($html);
@@ -74,50 +86,120 @@ class TableTagProcessor {
} }
/** /**
* 核心方法直接在HTML中匹配并替换Table * 全局检测是否包含Table数字+字母/数字后缀
* 覆盖所有拆分/嵌套/无标签场景,无论是否有空白
* @param string $html 待检测HTML
* @return bool
*/
private function hasTableSuffix($html){
$styleTagsPattern = implode('|', self::STYLE_TAGS);
// 正则1无标签场景Table 4B/4123
$pattern1 = "/table\s*\d+[a-zA-Z0-9]/iu";
// 正则2拆分标签场景<b>4</b><b>B</b> / <b>4</b> <b>B</b> / <b>4</b>&nbsp;<b>B</b>
$pattern2 = "/table\s*(?:<\/(?:{$styleTagsPattern})>)\s*[\s|&nbsp;]*\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)\s*(?:<\/(?:{$styleTagsPattern})>)\s*[\s|&nbsp;]*\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*([a-zA-Z0-9])/iu";
// 正则3嵌套标签场景<b>4B</b> / <i>4123</i>
$pattern3 = "/table\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*\d+[a-zA-Z0-9]\s*(?:<\/(?:{$styleTagsPattern})>)/iu";
// 加@抑制正则警告,避免极端文本导致报错
return @preg_match($pattern1, $html) || @preg_match($pattern2, $html) || @preg_match($pattern3, $html);
}
/**
* 初始化Table数字映射数组过滤非数字键值
* @param array $aTableMain 原始映射数组
* @return void
*/
private function initTableMap($aTableMain = []){
if (!is_array($aTableMain)) {
$aTableMain = [];
}
$tableMap = [];
foreach ($aTableMain as $key => $value) {
// 严格校验键值均为数字
if (ctype_digit((string)$key) && ctype_digit((string)$value)) {
$tableMap[(int)$key] = (int)$value;
}
}
$this->aTableMain = $tableMap;
}
/**
* 合并所有拆分标签的Table+数字(含空白样式标签)
* @param string $html 待处理HTML
* @return string * @return string
*/ */
private function replaceTableInHtml($html, &$hasReplace) { private function preprocessSplitTags($html){
$styleTagsPattern = implode('|', self::STYLE_TAGS); $styleTagsPattern = implode('|', self::STYLE_TAGS);
$styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*"; // 匹配任意嵌套样式标签
// 正则1匹配基础拆分标签的Table+数字
$pattern = "/(table)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
$html = @preg_replace_callback($pattern, function($matches) {
return $matches[1] . ' ' . $matches[2];
}, $html);
// 正则2匹配多轮拆分标签的Table+数字(含空白)
$pattern2 = "/(table)(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\s*)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
$html = @preg_replace_callback($pattern2, function($matches) {
return $matches[1] . $matches[2] . $matches[3];
}, $html);
return $html;
}
/**
* 核心替换逻辑将纯数字Table替换为mytable标签
* @param string $html 待处理HTML
* @param bool $hasReplace 是否发生替换(引用传递)
* @return string
*/
private function replaceTableInHtml($html, &$hasReplace){
$styleTagsPattern = implode('|', self::STYLE_TAGS);
$styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*";
$styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*"; $styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*";
// 则1匹配括号Table如 (Table 82)、(<b>Table 1.</b>) // 则1匹配括号内的纯数字Table如 (Table 2)、(<b>Table 3</b>)
$pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*\)/iu"; $pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
$html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) { $html = @preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
$num = $matches[1]; $num = $matches[1];
$numInt = intval($num); $numInt = (int)$num;
$suffix = $matches[2] ?? ''; $suffix = $matches[2] ?? '';
// 校验:纯数字 + 有映射ID + 未被mytable包裹避免重复替换 // 过滤条件:非数字、无映射、已处理过的标签
if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) || if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) ||
$this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) { $this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) {
return $matches[0]; return $matches[0];
} }
// 执行替换
$primaryId = $this->aTableMain[$numInt]; $primaryId = $this->aTableMain[$numInt];
$baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">"; $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Table {$num}</" . self::PROCESSED_TAG . ">";
$target = "({$baseTag}{$suffix})"; $target = "({$baseTag}{$suffix})";
$hasReplace = true; $hasReplace = true;
return $target; return $target;
}, $html); }, $html);
// 则2匹配无括号的Table如 Table 1、<b>Table 2:</b>、<i>Table 3.</i> // 则2匹配无括号的纯数字Table如 Table 2、<i>Table 3</i>:
$pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z])/iu"; $pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
$html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) { $html = @preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
$num = $matches[1]; $num = $matches[1];
$numInt = intval($num); $numInt = (int)$num;
$suffix = $matches[2] ?? ''; $suffix = $matches[2] ?? '';
// 校验:纯数字 + 有映射ID + 未被mytable包裹 + 不是数字+字母组合 // 过滤条件:非数字、无映射、已处理过的标签
if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) || if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) ||
$this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) { $this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) {
return $matches[0]; return $matches[0];
} }
// 执行替换
$primaryId = $this->aTableMain[$numInt]; $primaryId = $this->aTableMain[$numInt];
$baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">"; $baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Table {$num}</" . self::PROCESSED_TAG . ">";
$target = "{$baseTag}{$suffix}"; $target = "{$baseTag}{$suffix}";
$hasReplace = true; $hasReplace = true;
@@ -127,126 +209,157 @@ class TableTagProcessor {
return $html; return $html;
} }
/**
* 检测当前匹配内容是否已包含mytable标签避免重复替换
* @param string $content 匹配的文本片段
* @param string $tableText 待检测的Table文本如 Table 2
* @return bool
*/
private function isMatchPositionHasMyTableTag($content, $tableText){
$escapedText = preg_quote($tableText, '/');
$pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is';
return (bool)@preg_match($pattern, $content);
}
/** /**
* 清理mytable标签周围的冗余样式标签 * 清理mytable标签周围的冗余样式标签
* @param string $html * @param string $html 待处理HTML
* @return string * @return string
*/ */
private function cleanRedundantStyles($html) { private function cleanRedundantStyles($html){
foreach (self::STYLE_TAGS as $tag) { foreach (self::STYLE_TAGS as $tag) {
$pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is'; $pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
$html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>$3', $html); $html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>$3', $html);
} }
// 清理孤立的样式闭标签(避免标签残留)
$html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html); // 清理无匹配的闭合样式标签
$html = @preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html);
return $html; return $html;
} }
/** /**
* 清理mytable标签的冗余标点(保证格式整洁) * 清理mytable标签周围的冗余标点
* @param string $html * @param string $html 待处理HTML
* @return string * @return string
*/ */
private function cleanRedundantPunctuation($html) { private function cleanRedundantPunctuation($html){
$html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1)</'.self::PROCESSED_TAG.'>.', $html); // 修复括号+标点的冗余格式
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html); $html = @preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i',
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html); '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1)</'.self::PROCESSED_TAG.'>.', $html);
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '</'.self::PROCESSED_TAG.'>$1', $html); $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
$html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i', $html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
// 清理重复标点
$html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '</'.self::PROCESSED_TAG.'>$1', $html);
// 修复括号内的标签冗余
$html = @preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i',
'<'.self::PROCESSED_TAG.' data-id="$1">($2)</'.self::PROCESSED_TAG.'>$3', $html); '<'.self::PROCESSED_TAG.' data-id="$1">($2)</'.self::PROCESSED_TAG.'>$3', $html);
return $html; return $html;
} }
/** /**
* 清理孤立的样式标签(栈算法兜底,避免标签不闭合) * 清理未闭合的样式标签
* @param string $html * @param string $html 待处理HTML
* @return string * @return string
*/ */
private function cleanUnclosedTags($html) { private function cleanUnclosedTags($html){
// 清理mytable后孤立的样式闭标签
foreach (self::STYLE_TAGS as $tag) { foreach (self::STYLE_TAGS as $tag) {
// 清理mytable标签后的冗余闭合标签
$html = @preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html); $html = @preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html);
}
// 栈算法清理其他孤立标签 // 定位所有该标签的开闭标签位置
foreach (self::STYLE_TAGS as $tag) {
@preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE); @preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
@preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE); @preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
$allTags = []; $allTags = [];
// 收集开标签
foreach ($openMatches[0] as $m) { foreach ($openMatches[0] as $m) {
$allTags[] = ['offset' => $m[1], 'type' => 'open', 'content' => $m[0], 'length' => strlen($m[0])]; $allTags[] = [
'offset' => $m[1],
'type' => 'open',
'content' => $m[0],
'length' => strlen($m[0])
];
} }
// 收集闭标签
foreach ($closeMatches[0] as $m) { foreach ($closeMatches[0] as $m) {
$allTags[] = ['offset' => $m[1], 'type' => 'close', 'content' => $m[0], 'length' => strlen($m[0])]; $allTags[] = [
'offset' => $m[1],
'type' => 'close',
'content' => $m[0],
'length' => strlen($m[0])
];
} }
// 按位置排序
usort($allTags, function($a, $b) { usort($allTags, function($a, $b) {
return $a['offset'] - $b['offset']; return $a['offset'] - $b['offset'];
}); });
// 栈结构匹配开闭标签
$tagStack = []; $tagStack = [];
$removeOffsets = []; $removeOffsets = [];
foreach ($allTags as $t) { foreach ($allTags as $t) {
if ($t['type'] == 'open') { if ($t['type'] === 'open') {
array_push($tagStack, $t); array_push($tagStack, $t);
} else { } else {
if (!empty($tagStack)) { if (!empty($tagStack)) {
array_pop($tagStack); array_pop($tagStack);
} else { } else {
// 无匹配开标签的闭标签,标记删除
$removeOffsets[] = $t; $removeOffsets[] = $t;
} }
} }
} }
// 无匹配闭标签的开标签,标记删除
foreach ($tagStack as $t) { foreach ($tagStack as $t) {
$removeOffsets[] = $t; $removeOffsets[] = $t;
} }
// 倒序删除避免偏移错乱 // 按偏移量倒序删除避免影响后续偏移)
usort($removeOffsets, function($a, $b) { usort($removeOffsets, function($a, $b) {
return $b['offset'] - $a['offset']; return $b['offset'] - $a['offset'];
}); });
foreach ($removeOffsets as $item) { foreach ($removeOffsets as $item) {
if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) { if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) {
$html = substr_replace($html, '', $item['offset'], $item['length']); $html = substr_replace($html, '', $item['offset'], $item['length']);
} }
} }
} }
return $html; return $html;
} }
/** /**
* 优化文本格式(合并多余空格,规范标签前后空格) * 优化文本格式(清理多余空格)
* @param string $html * @param string $html 待处理HTML
* @return string * @return string
*/ */
private function optimizeFormat($html) { private function optimizeFormat($html){
$html = preg_replace('/\s{2,}/', ' ', trim($html)); // 清理连续空格
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', '</'.self::PROCESSED_TAG.'> $1', $html); $html = @preg_replace('/\s{2,}/', ' ', trim($html));
$html = preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html); // 标签后紧跟字母/数字时加空格
$html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', '</'.self::PROCESSED_TAG.'> $1', $html);
// 字母/数字紧跟标签前时加空格
$html = @preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html);
return $html; return $html;
} }
/** /**
* 清理重复嵌套的mytable标签兜底方案 * 清理嵌套的mytable标签避免重复嵌套
* @param string $html * @param string $html 待处理HTML
* @return string * @return string
*/ */
private function cleanDuplicateNestedTags($html) { private function cleanDuplicateNestedTags($html){
$pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is'; $pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is';
$html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>', $html); $html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>', $html);
return $html; return $html;
} }
/**
* 判断指定Table内容是否被mytable标签包裹
* @param string $content 待检查内容
* @param string $tableText Table文本如 "Table 1"
* @return bool
*/
private function isMatchPositionHasMyTableTag($content, $tableText) {
$escapedText = preg_quote($tableText, '/');
$pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is';
return @preg_match($pattern, $content) === 1;
}
} }