处理正文内容表格/图片相关联

This commit is contained in:
chengxl
2026-01-19 13:52:09 +08:00
parent b217ab03fd
commit c2ee3b170f
2 changed files with 445 additions and 269 deletions

View File

@@ -1,52 +1,63 @@
<?php
namespace app\common;
class FigureTagProcessor {
// 可配置的样式标签列表(解耦)
const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue'];
// 最大处理字符串长度(避免内存溢出
const MAX_HTML_LENGTH = 100000;
/**
* 功能精准匹配并替换Figure相关格式为myfigure标签
* 支持格式figure 数字、(figure 数字)、figure 数字:/figure 数字.(含嵌套/拆分标签)
* 跳过已被myfigure包裹的Figure含后缀
* 跳过Figure 数字+字母/数字后缀(含拆分标签场景,无论是否有空白)
* 正常处理Figure 数字+空白/样式标签场景
*/
class FigureTagProcessor{
//支持的样式标签列表
private const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue'];
//HTML文本最大处理长度防止内存溢出
private const MAX_HTML_LENGTH = 100000;
//替换后的目标标签名
private const PROCESSED_TAG = 'myfigure';
//Figure数字与对应ID的映射数组
private $aImageMain = [];
/**
* 处理Figure文本替换为myfigure标签并清理冗余内容
* 处理Figure标签替换的主方法
* @param string $html 待处理的HTML文本
* @return array ['status'=>状态码, 'data'=>处理后文本]
* status: 2-空输入, 4-无匹配, 5-处理异常, 1-处理成功
* @param array $aImageMain Figure数字=>ID的映射数组
* @return array ['status' => 状态码, 'data' => 处理后文本]
* status说明2-空文本, 4-无匹配/已处理, 1-处理成功, 5-处理异常
*/
public function dealFigureStr($html = '') {
//验
if (!is_string($html) || trim($html) === '') {
public function dealFigureStr($html = '', $aImageMain = []){
//空文本校
$html = trim($html);
if ($html === '' || !is_string($html)) {
return ['status' => 2, 'data' => ''];
}
//超大字符串拦截
//超长文本保护
if (strlen($html) > self::MAX_HTML_LENGTH) {
return ['status' => 4, 'data' => $html];
}
//编码处理
if (!mb_check_encoding($html, 'UTF-8')) {
$html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1');
}
//初始化映射数组(过滤非数字键值)
$this->initImageMap($aImageMain);
//原始内容
$originalHtml = $html;
$hasReplace = false;
try {
//合并嵌套样式标签
$mergedHtml = $this->mergeFragmentStyleTags($html);
//提取纯文本用于匹配Figure
$plainText = preg_replace('/<[^>]+>/', ' ', $mergedHtml);
$plainText = preg_replace('/\s+/', ' ', trim($plainText));
//提取所有匹配的Figure数字
$allMatches = $this->extractAllFigureMatches($plainText);
if (empty($allMatches)) {
return ['status' => 4, 'data' => $originalHtml];
//只要包含数字+字母/数字后缀,直接返回原内容(核心修复)
if ($this->hasFigureSuffix($html)) {
return ['status' => 4, 'data' => $html];
}
//替换为myfigure标签
$html = $this->replaceFigureWithTag($html, $allMatches, $hasReplace);
//清理冗余内容(仅替换成功后执行)
//合并拆分标签的Figure+数字
$html = $this->preprocessSplitTags($html);
//替换
$html = $this->replaceFigureInHtml($html, $hasReplace);
//清理冗余样式/标签
if ($hasReplace) {
$html = $this->cleanRedundantStyles($html);
$html = $this->cleanRedundantPunctuation($html);
$html = $this->cleanUnclosedTags($html);
$html = $this->optimizeFormat($html);
$html = $this->cleanDuplicateNestedTags($html);
}
} catch (\Throwable $e) {
@@ -55,220 +66,272 @@ class FigureTagProcessor {
return [
'status' => $hasReplace ? 1 : 4,
'data' => $hasReplace ? $html : $originalHtml
'data' => $html
];
}
/**
* 合并嵌套的样式标签
* @param string $html
* @return string
* 全局检测是否包含Figure数字+字母/数字后缀
* 覆盖所有拆分/嵌套/无标签场景,无论是否有空白
* @param string $html 待检测HTML
* @return bool
*/
private function mergeFragmentStyleTags($html) {
foreach (self::STYLE_TAGS as $tag) {
$pattern = '/(?:<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>(?:\s*<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>/is';
while (@preg_match($pattern, $html)) { // 抑制正则警告
$html = preg_replace_callback($pattern, function($matches) {
return trim($matches[1]) . ' ' . trim($matches[2]);
}, $html);
private function hasFigureSuffix($html){
    // Reports whether $html contains "figure" + digits that is directly followed by one more
    // letter or digit (e.g. "Figure 4B"), in any of three markup layouts. Matching is
    // case-insensitive and UTF-8 aware.
    //
    // NOTE(review): because the trailing class [a-zA-Z0-9] includes digits, "\d+[a-zA-Z0-9]"
    // also matches every number with two or more digits (e.g. "Figure 12") - confirm that
    // treating multi-digit numbers as "suffixed" is intended.
    $tagNames = implode('|', self::STYLE_TAGS);
    // \b stops the short tag names from matching longer tags (<b> vs <br>, <i> vs <img>).
    $openTag  = '<(?:' . $tagNames . ')\b[^>]*>';
    $closeTag = '<\/(?:' . $tagNames . ')>';
    // Whitespace or literal &nbsp; entities between fragments. This must be an alternation:
    // a character class such as [\s|&nbsp;] would accept the individual characters
    // '|', '&', 'n', 'b', 's', 'p' and ';' instead of the entity as a whole.
    $gap = '(?:\s|&nbsp;)*';

    // Layout 1 - plain text: "Figure 4B".
    $inline = '/figure\s*\d+[a-zA-Z0-9]/iu';
    // Layout 2 - number and suffix in separate style fragments: "Figure</b><b>4</b><b>B".
    $split = '/figure\s*' . $closeTag . $gap . $openTag . '\s*\d+\s*'
        . $closeTag . $gap . $openTag . '\s*[a-zA-Z0-9]/iu';
    // Layout 3 - number and suffix inside one style tag: "Figure <b>4B</b>".
    $nested = '/figure\s*' . $openTag . '\s*\d+[a-zA-Z0-9]\s*' . $closeTag . '/iu';

    // preg_match returns 1, 0 or false; the || chain turns that into a plain bool.
    return preg_match($inline, $html) || preg_match($split, $html) || preg_match($nested, $html);
}
/**
* 初始化Figure数字映射数组
* @param array $aImageMain 原始映射数组
* @return void
*/
private function initImageMap($aImageMain){
if (!is_array($aImageMain)) {
$aImageMain = [];
}
$imageMap = [];
foreach ($aImageMain as $key => $value) {
// 严格校验键值均为数字
if (ctype_digit((string)$key) && ctype_digit((string)$value)) {
$imageMap[(int)$key] = (int)$value;
}
}
// 清理括号内的冗余标点/标签
$html = preg_replace('/(\(.*?\d+)(?:\s*<[^>]+>)*\s*\.*\s*(?:<[^>]+>)*(\s*.*?\))/is', '$1$2', $html);
$html = preg_replace('/\(\s+/', '(', $html);
$html = preg_replace('/\s+\)/', ')', $html);
$this->aImageMain = $imageMap;
}
/**
 * Join a "Figure" keyword with its number when style tags separate the two.
 *
 * Pass 1 handles "Figure</b><b>4"; pass 2 handles an extra whitespace-only fragment in
 * between, "Figure</b><b> </b><b>4". In both cases the separating tags are dropped and the
 * keyword and number are joined by a single space. The keyword keeps its original casing.
 *
 * @param string $html HTML to normalise
 * @return string Normalised HTML; the input (or the pass-1 result) is returned unchanged
 *                if a PCRE error occurs
 */
private function preprocessSplitTags($html){
    $tagNames = implode('|', self::STYLE_TAGS);
    // \b keeps short tag names from matching longer tags (<b> vs <br>, <i> vs <img>).
    $openTag  = '<(?:' . $tagNames . ')\b[^>]*>';
    $closeTag = '<\/(?:' . $tagNames . ')>';

    // Pass 1: keyword, closing tag, opening tag, number.
    $direct = '/(figure)\s*' . $closeTag . '\s*' . $openTag . '\s*(\d+)/iu';
    $afterDirect = preg_replace($direct, '$1 $2', $html);
    if ($afterDirect === null) {
        // PCRE failure (e.g. backtrack limit): keep the caller's text instead of passing null on.
        return $html;
    }

    // Pass 2: keyword, a whitespace-only style fragment, then the number in the next fragment.
    // A single space is emitted so the keyword and number are not glued together.
    $viaBlank = '/(figure)' . $closeTag . '\s*' . $openTag . '\s*'
        . $closeTag . '\s*' . $openTag . '\s*(\d+)/iu';
    $afterBlank = preg_replace($viaBlank, '$1 $2', $afterDirect);

    return $afterBlank === null ? $afterDirect : $afterBlank;
}
/**
 * Collect the distinct figure numbers referenced in a plain-text string.
 *
 * Two forms are recognised, in this order: a parenthesised reference such as "(Figure 1.)"
 * and a bare reference such as "Figure 1.". A number is rejected when it is followed by a
 * letter, or by whitespace and then a letter. Each number is recorded once; the first form
 * that finds it wins.
 *
 * @param string $plainText Text with markup already stripped
 * @return array Map keyed by figure number, sorted by key in descending order; each entry
 *               holds 'hasOuterBracket', 'validPunct' and 'content'
 */
private function extractAllFigureMatches($plainText) {
    // [regex, whether the reference is wrapped in parentheses]
    $rules = [
        ['/\(Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\)\s*([\.,:]{0,1})/iu', true],
        ['/Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\s*([\.,:]{0,1})/iu', false],
    ];

    $found = [];
    $seenNumbers = [];
    foreach ($rules as [$regex, $bracketed]) {
        $hits = [];
        // A zero count and a PCRE failure are both treated as "nothing found".
        if (!@preg_match_all($regex, $plainText, $hits, PREG_SET_ORDER)) {
            continue;
        }
        foreach ($hits as $hit) {
            $figureNo = $hit[1];
            if (ctype_digit($figureNo) && !in_array($figureNo, $seenNumbers)) {
                $seenNumbers[] = $figureNo;
                $found[$figureNo] = [
                    'hasOuterBracket' => $bracketed,
                    'validPunct' => isset($hit[2]) ? $hit[2] : '',
                    'content' => 'Figure ' . $figureNo,
                ];
            }
        }
    }

    krsort($found);
    return $found;
}
/**
* 将匹配的Figure替换为myfigure标签优化标签格式
* @param string $html
* @param array $allMatches
* @param bool $hasReplace
* 核心替换逻辑将纯数字Figure替换为myfigure标签
* @param string $html 待处理HTML
* @param bool $hasReplace 是否发生替换(引用传递)
* @return string
*/
private function replaceFigureWithTag($html, $allMatches, &$hasReplace) {
foreach ($allMatches as $num => $info) {
$innerContent = $info['hasOuterBracket']
? "({$info['content']})"
: $info['content'];
private function replaceFigureInHtml($html, &$hasReplace){
$styleTagsPattern = implode('|', self::STYLE_TAGS);
$styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*";
$styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*";
// 正则1匹配括号内的纯数字Figure如 (Figure 2)、(<b>Figure 3</b>)
// $pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iu";
$pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
$html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
$num = $matches[1];
$numInt = (int)$num;
$suffix = $matches[2] ?? '';
// 过滤条件:非数字、无映射、已处理过的标签
if (!ctype_digit($num) || !isset($this->aImageMain[$numInt]) ||
$this->isMatchPositionHasMyFigureTag($matches[0], "Figure {$num}")) {
return $matches[0];
}
// 执行替换
$primaryId = $this->aImageMain[$numInt];
$baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Figure {$num}</" . self::PROCESSED_TAG . ">";
$target = "({$baseTag}{$suffix})";
//<myfigure data-id="1">Figure 1</myfigure>
$targetTag = "<myfigure data-id=\"{$num}\">{$innerContent}</myfigure>";
if (!empty($info['validPunct']) && !$info['hasOuterBracket']) {
$targetTag .= $info['validPunct'];
$hasReplace = true;
return $target;
}, $html);
// 正则2匹配无括号的纯数字Figure如 Figure 2、<i>Figure 3</i>:
// $pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z0-9])/iu";
$pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
$html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
$num = $matches[1];
$numInt = (int)$num;
$suffix = $matches[2] ?? '';
// 过滤条件:非数字、无映射、已处理过的标签
if (!ctype_digit($num) || !isset($this->aImageMain[$numInt]) ||
$this->isMatchPositionHasMyFigureTag($matches[0], "Figure {$num}")) {
return $matches[0];
}
$patternSuffix = '(?!\p{L}|\s+\p{L})';
$pattern = $info['hasOuterBracket']
? '/\(\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*\)/iu'
: '/\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*([\.,:]{0,1})/iu';
// 执行替换
$primaryId = $this->aImageMain[$numInt];
$baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Figure {$num}</" . self::PROCESSED_TAG . ">";
$target = "{$baseTag}{$suffix}";
$hasReplace = true;
return $target;
}, $html);
//执行替换最多替换1次避免重复
$html = @preg_replace($pattern, $targetTag, $html, 1, $count);
if ($count > 0) {
$hasReplace = true;
error_log("[FigureTagProcessor] 替换成功 - ID:{$num} 括号:".($info['hasOuterBracket']?'是':'否'));
}
}
return $html;
}
/**
* 清理myfigure标签周围的冗余样式标签适配新标签格式
* @param string $html
* 检测当前匹配内容是否已包含myfigure标签避免重复替换
* @param string $content 匹配的文本片段
* @param string $figureText 待检测的Figure文本如 Figure 2
* @return bool
*/
private function isMatchPositionHasMyFigureTag($content, $figureText){
    // True when $content contains $figureText already wrapped in the processed tag, i.e.
    // "<tag ...> text </tag>" with optional whitespace around the text (case-insensitive).
    $tag = self::PROCESSED_TAG;
    // Quote the label so it is matched literally.
    $literal = preg_quote($figureText, '/');
    $wrapped = sprintf('/<%1$s[^>]*>\s*%2$s\s*<\/%1$s>/is', $tag, $literal);
    // preg_match gives 1, 0 or false; only 1 maps to true.
    return preg_match($wrapped, $content) ? true : false;
}
/**
* 清理myfigure标签周围的冗余样式标签
* @param string $html 待处理HTML
* @return string
*/
private function cleanRedundantStyles($html) {
private function cleanRedundantStyles($html){
foreach (self::STYLE_TAGS as $tag) {
$pattern = '/<' . $tag . '>\s*<myfigure([^>]*)>(.*?)<\/myfigure>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
$html = @preg_replace($pattern, '<myfigure$1>$2</myfigure>$3', $html);
$pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
$html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>$3', $html);
}
//清理闭标签
// 清理无匹配的闭合样式标签
$html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html);
return $html;
}
/**
* 清理myfigure标签的冗余标点(适配新标签格式)
* @param string $html
* 清理myfigure标签周围的冗余标点
* @param string $html 待处理HTML
* @return string
*/
private function cleanRedundantPunctuation($html) {
$html = preg_replace('/<myfigure data-id="(\d+)">\(Figure \d+\)<\/myfigure>\)\./i', '<myfigure data-id="$1">(Figure $1)</myfigure>.', $html);
$html = preg_replace('/<\/myfigure>\)\.([\.,:]{0,1})/', '</myfigure>)$1', $html);
$html = preg_replace('/<\/myfigure>\.\)([\.,:]{0,1})/', '</myfigure>)$1', $html);
$html = preg_replace('/<\/myfigure>([\.,:]){2,}/', '</myfigure>$1', $html);
$html = preg_replace('/<myfigure data-id="(\d+)">\((Figure \d+)\s*<\/myfigure>([\.,:]{0,1})/i',
'<myfigure data-id="$1">($2)</myfigure>$3', $html);
private function cleanRedundantPunctuation($html){
// 修复括号+标点的冗余格式
$html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Figure \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i',
'<'.self::PROCESSED_TAG.' data-id="$1">(Figure $1)</'.self::PROCESSED_TAG.'>.', $html);
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
// 清理重复标点
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '</'.self::PROCESSED_TAG.'>$1', $html);
// 修复括号内的标签冗余
$html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Figure \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i',
'<'.self::PROCESSED_TAG.' data-id="$1">($2)</'.self::PROCESSED_TAG.'>$3', $html);
return $html;
}
/**
* 清理孤立的样式标签
* @param string $html
* 清理未闭合的样式标签
* @param string $html 待处理HTML
* @return string
*/
private function cleanUnclosedTags($html) {
private function cleanUnclosedTags($html){
foreach (self::STYLE_TAGS as $tag) {
$html = @preg_replace('/(<\/myfigure>)\s*<\/' . $tag . '>/i', '$1', $html);
}
foreach (self::STYLE_TAGS as $tag) {
@preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
@preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
// 清理myfigure标签后的冗余闭合标签
$html = preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html);
// 定位所有该标签的开闭标签位置
preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
$allTags = [];
// 收集开标签
foreach ($openMatches[0] as $m) {
$allTags[] = [
'offset' => $m[1],
'type' => 'open',
'content' => $m[0],
'offset' => $m[1],
'type' => 'open',
'content' => $m[0],
'length' => strlen($m[0])
];
}
// 收集闭标签
foreach ($closeMatches[0] as $m) {
$allTags[] = [
'offset' => $m[1],
'type' => 'close',
'content' => $m[0],
'offset' => $m[1],
'type' => 'close',
'content' => $m[0],
'length' => strlen($m[0])
];
}
// 按位置排序
usort($allTags, function($a, $b) {
return $a['offset'] - $b['offset'];
});
// 栈结构匹配开闭标签
$tagStack = [];
$removeOffsets = [];
foreach ($allTags as $t) {
if ($t['type'] == 'open') {
if ($t['type'] === 'open') {
array_push($tagStack, $t);
} else {
if (!empty($tagStack)) {
array_pop($tagStack);
} else {
$removeOffsets[] = [
'pos' => $t['offset'],
'len' => $t['length'],
'content' => $t['content']
];
// 无匹配开标签的闭标签,标记删除
$removeOffsets[] = $t;
}
}
}
// 无匹配闭标签的开标签,标记删除
foreach ($tagStack as $t) {
$removeOffsets[] = [
'pos' => $t['offset'],
'len' => $t['length'],
'content' => $t['content']
];
$removeOffsets[] = $t;
}
// 倒序删除避免偏移错乱
// 按偏移量倒序删除避免影响后续偏移)
usort($removeOffsets, function($a, $b) {
return $b['pos'] - $a['pos'];
return $b['offset'] - $a['offset'];
});
foreach ($removeOffsets as $item) {
if ($item['pos'] >= 0 && $item['pos'] < strlen($html)) {
$html = substr_replace($html, '', $item['pos'], $item['len']);
if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) {
$html = substr_replace($html, '', $item['offset'], $item['length']);
}
}
}
return $html;
}
return $html;
}
/**
* 优化文本格式(合并多余空格规范myfigure标签前后空格)
* @param string $html
* 优化文本格式(清理多余空格)
* @param string $html 待处理HTML
* @return string
*/
private function optimizeFormat($html) {
private function optimizeFormat($html){
// 清理连续空格
$html = preg_replace('/\s{2,}/', ' ', trim($html));
$html = preg_replace('/<\/myfigure>([A-Za-z0-9])/is', '</myfigure> $1', $html);
$html = preg_replace('/([a-zA-Z0-9])<myfigure/is', '$1 <myfigure', $html);
// 标签后紧跟字母/数字时加空格
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', '</'.self::PROCESSED_TAG.'> $1', $html);
// 字母/数字紧跟标签前时加空格
$html = preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html);
return $html;
}
/**
 * Collapse processed tags that are directly nested inside one another.
 *
 * "<tag a><tag b>X</tag></tag>" becomes "<tag b>X</tag>": the inner tag's attributes and
 * content are kept, the outer wrapper is dropped. The pass is repeated until nothing
 * changes, so deeper nesting is flattened as well.
 *
 * @param string $html HTML to clean
 * @return string Cleaned HTML; the last good text is returned if a PCRE error occurs
 */
private function cleanDuplicateNestedTags($html){
    $tag = self::PROCESSED_TAG;
    $pattern = '/<' . $tag . '[^>]*>\s*<' . $tag . '([^>]*)>(.*?)<\/' . $tag . '>\s*<\/' . $tag . '>/is';
    $replacement = '<' . $tag . '$1>$2</' . $tag . '>';

    do {
        $count = 0;
        $collapsed = preg_replace($pattern, $replacement, $html, -1, $count);
        if ($collapsed === null) {
            // PCRE failure: keep what we have rather than returning null to the caller.
            return $html;
        }
        $html = $collapsed;
        // Every replacement removes one wrapper pair, so the text shrinks and the loop ends.
    } while ($count > 0);

    return $html;
}
}

View File

@@ -2,59 +2,71 @@
namespace app\common;
/**
* Table标签处理器生产环境终极版
* 功能精准匹配并替换Table相关格式为mytable标签
* 支持格式table 数字、(table 数字)、table 数字:/table 数字.(含嵌套标签)
* 特性:支持任意嵌套标签/括号、不处理数字+字母、仅跳过已被mytable包裹的Table
* 支持格式table 数字、(table 数字)、table 数字:/table 数字.(含嵌套/拆分标签)
* 跳过已被mytable包裹的table(含后缀)
* 跳过table 数字+字母/数字后缀(含拆分标签场景,无论是否有空白)
* 正常处理table 数字+空白/样式标签场景
*/
class TableTagProcessor {
// 可配置的样式标签列表
const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em'];
// 最大处理字符串长度
const MAX_HTML_LENGTH = 100000;
// 目标替换标签
const PROCESSED_TAG = 'mytable';
// 数据库表格ID映射
class TableTagProcessor{
// 支持的样式标签列表
private const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue'];
// HTML文本最大处理长度防止内存溢出
private const MAX_HTML_LENGTH = 100000;
// 替换后的目标标签
private const PROCESSED_TAG = 'mytable';
// Table数字与对应ID映射数组
private $aTableMain = [];
/**
* 处理Table文本替换为mytable标签并清理冗余内容
* 处理Table标签替换的主方法
* @param string $html 待处理的HTML文本
* @param array $aTableMain Table数字→主键ID的映射数组如 [1=>1001, 2=>1002]
* @return array ['status'=>状态码, 'data'=>处理后文本]
* status: 2-空输入, 4-无匹配/已处理, 5-处理异常, 1-处理成功
* @param array $aTableMain Table数字=>ID的映射数组可选默认1=>1~10=>10
* @return array ['status' => 状态码, 'data' => 处理后文本]
* status说明:2-空文本, 4-无匹配/已处理, 1-处理成功, 5-处理异常
*/
public function dealTableStr($html = '', $aTableMain = []) {
//验证
if (!is_string($html) || trim($html) === '') {
public function dealTableStr($html = '', $aTableMain = []){
// 初始化默认映射数组(仅当入参为空时使用)
$defaultTableMap = [1=>1,2=>2,3=>3,4=>4,5=>5,6=>6,7=>7,8=>8,9=>9,10=>10];
// 优先使用入参,入参为空则用默认值
$tableMap = !empty($aTableMain) ? $aTableMain : $defaultTableMap;
// 空文本校验
$html = trim($html);
if ($html === '' || !is_string($html)) {
return ['status' => 2, 'data' => ''];
}
//超大字符串拦截(防止内存溢出)
// 超长文本保护
if (strlen($html) > self::MAX_HTML_LENGTH) {
return ['status' => 4, 'data' => $html];
}
//初始化主键映射数组
if(!empty($aTableMain)){
$aTableMainNew = [];
foreach ($aTableMain as $key => $value) {
if (!ctype_digit((string)$key) || !ctype_digit((string)$value)) {
continue;
}
$keyInt = (int)$key;
$aTableMainNew[$keyInt + 1] = $value;
}
$this->aTableMain = $aTableMainNew;
// 编码处理统一转为UTF-8避免中文乱码
if (!mb_check_encoding($html, 'UTF-8')) {
$html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1');
}
// 初始化映射数组(过滤非数字键值)
$this->initTableMap($tableMap);
// 原始内容(异常时返回)
$originalHtml = $html;
$hasReplace = false;
try {
//原始HTML中匹配所有符合规则的Table
// 只要包含数字+字母/数字后缀,直接返回原内容
if ($this->hasTableSuffix($html)) {
return ['status' => 4, 'data' => $html];
}
// 合并拆分标签的Table+数字
$html = $this->preprocessSplitTags($html);
// 核心替换逻辑
$html = $this->replaceTableInHtml($html, $hasReplace);
// 清理冗余内容
// 清理冗余样式/标签(仅当发生替换时执行)
if ($hasReplace) {
$html = $this->cleanRedundantStyles($html);
$html = $this->cleanRedundantPunctuation($html);
@@ -74,50 +86,120 @@ class TableTagProcessor {
}
/**
* 核心方法直接在HTML中匹配并替换Table
* 全局检测是否包含Table数字+字母/数字后缀
* 覆盖所有拆分/嵌套/无标签场景,无论是否有空白
* @param string $html 待检测HTML
* @return bool
*/
private function hasTableSuffix($html){
    // Reports whether $html contains "table" + digits that is directly followed by one more
    // letter or digit (e.g. "Table 4B"), in any of three markup layouts. Matching is
    // case-insensitive and UTF-8 aware.
    //
    // NOTE(review): because the trailing class [a-zA-Z0-9] includes digits, "\d+[a-zA-Z0-9]"
    // also matches every number with two or more digits (e.g. "Table 10") - confirm that
    // treating multi-digit numbers as "suffixed" is intended.
    $tagNames = implode('|', self::STYLE_TAGS);
    // \b stops the short tag names from matching longer tags (<b> vs <br>, <i> vs <img>).
    $openTag  = '<(?:' . $tagNames . ')\b[^>]*>';
    $closeTag = '<\/(?:' . $tagNames . ')>';
    // Whitespace or literal &nbsp; entities between fragments. This must be an alternation:
    // a character class such as [\s|&nbsp;] would accept the individual characters
    // '|', '&', 'n', 'b', 's', 'p' and ';' instead of the entity as a whole.
    $gap = '(?:\s|&nbsp;)*';

    // Layout 1 - plain text: "Table 4B".
    $inline = '/table\s*\d+[a-zA-Z0-9]/iu';
    // Layout 2 - number and suffix in separate style fragments: "Table</b><b>4</b><b>B".
    $split = '/table\s*' . $closeTag . $gap . $openTag . '\s*\d+\s*'
        . $closeTag . $gap . $openTag . '\s*[a-zA-Z0-9]/iu';
    // Layout 3 - number and suffix inside one style tag: "Table <b>4B</b>".
    $nested = '/table\s*' . $openTag . '\s*\d+[a-zA-Z0-9]\s*' . $closeTag . '/iu';

    // The patterns are fixed and valid, and a PCRE runtime failure makes preg_match return
    // false without a warning, so no error suppression is needed; false simply counts as
    // "no match" in the || chain.
    return preg_match($inline, $html) || preg_match($split, $html) || preg_match($nested, $html);
}
/**
 * Store a sanitised copy of the table-number => ID map on the instance.
 *
 * Only pairs whose key and value both consist solely of decimal digits are kept; both are
 * cast to int. A non-array argument is treated as an empty map.
 *
 * @param array $aTableMain Raw map supplied by the caller
 * @return void
 */
private function initTableMap($aTableMain = []){
    $source = is_array($aTableMain) ? $aTableMain : [];

    $clean = [];
    foreach ($source as $number => $id) {
        if (!ctype_digit((string)$number)) {
            continue;
        }
        if (!ctype_digit((string)$id)) {
            continue;
        }
        $clean[(int)$number] = (int)$id;
    }

    $this->aTableMain = $clean;
}
/**
* 合并所有拆分标签的Table+数字(含空白样式标签)
* @param string $html 待处理HTML
* @return string
*/
private function replaceTableInHtml($html, &$hasReplace) {
private function preprocessSplitTags($html){
$styleTagsPattern = implode('|', self::STYLE_TAGS);
$styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*"; // 匹配任意嵌套样式标签
// 正则1匹配基础拆分标签的Table+数字
$pattern = "/(table)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
$html = @preg_replace_callback($pattern, function($matches) {
return $matches[1] . ' ' . $matches[2];
}, $html);
// 正则2匹配多轮拆分标签的Table+数字(含空白)
$pattern2 = "/(table)(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\s*)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
$html = @preg_replace_callback($pattern2, function($matches) {
return $matches[1] . $matches[2] . $matches[3];
}, $html);
return $html;
}
/**
* 核心替换逻辑将纯数字Table替换为mytable标签
* @param string $html 待处理HTML
* @param bool $hasReplace 是否发生替换(引用传递)
* @return string
*/
private function replaceTableInHtml($html, &$hasReplace){
$styleTagsPattern = implode('|', self::STYLE_TAGS);
$styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*";
$styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*";
// 则1匹配括号Table如 (Table 82)、(<b>Table 1.</b>)
$pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*\)/iu";
$html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
// 则1匹配括号内的纯数字Table如 (Table 2)、(<b>Table 3</b>)
$pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
$html = @preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
$num = $matches[1];
$numInt = intval($num);
$numInt = (int)$num;
$suffix = $matches[2] ?? '';
// 校验:纯数字 + 有映射ID + 未被mytable包裹避免重复替换
// 过滤条件:非数字、无映射、已处理过的标签
if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) ||
$this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) {
return $matches[0];
}
// 执行替换
$primaryId = $this->aTableMain[$numInt];
$baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">";
$baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Table {$num}</" . self::PROCESSED_TAG . ">";
$target = "({$baseTag}{$suffix})";
$hasReplace = true;
return $target;
}, $html);
// 则2匹配无括号的Table如 Table 1、<b>Table 2:</b>、<i>Table 3.</i>
$pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z])/iu";
$html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
// 则2匹配无括号的纯数字Table如 Table 2、<i>Table 3</i>:
$pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
$html = @preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
$num = $matches[1];
$numInt = intval($num);
$numInt = (int)$num;
$suffix = $matches[2] ?? '';
// 校验:纯数字 + 有映射ID + 未被mytable包裹 + 不是数字+字母组合
// 过滤条件:非数字、无映射、已处理过的标签
if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) ||
$this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) {
return $matches[0];
}
// 执行替换
$primaryId = $this->aTableMain[$numInt];
$baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">";
$baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Table {$num}</" . self::PROCESSED_TAG . ">";
$target = "{$baseTag}{$suffix}";
$hasReplace = true;
@@ -127,126 +209,157 @@ class TableTagProcessor {
return $html;
}
/**
 * Tell whether a matched fragment already contains the table label wrapped in the
 * processed tag.
 *
 * @param string $content Fragment to inspect
 * @param string $tableText Label to look for, e.g. "Table 2"
 * @return bool True when "<tag ...> label </tag>" occurs in $content (case-insensitive,
 *              optional whitespace around the label)
 */
private function isMatchPositionHasMyTableTag($content, $tableText){
    $tag = self::PROCESSED_TAG;
    // Quote the label so it is matched literally.
    $literal = preg_quote($tableText, '/');
    $wrapped = sprintf('/<%1$s[^>]*>\s*%2$s\s*<\/%1$s>/is', $tag, $literal);
    // preg_match gives 1, 0 or false; only 1 maps to true.
    return @preg_match($wrapped, $content) ? true : false;
}
/**
* 清理mytable标签周围的冗余样式标签
* @param string $html
* @param string $html 待处理HTML
* @return string
*/
private function cleanRedundantStyles($html) {
private function cleanRedundantStyles($html){
foreach (self::STYLE_TAGS as $tag) {
$pattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
$html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>$3', $html);
}
// 清理孤立的样式闭标签(避免标签残留)
$html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html);
// 清理无匹配的闭合样式标签
$html = @preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html);
return $html;
}
/**
* 清理mytable标签的冗余标点(保证格式整洁)
* @param string $html
* 清理mytable标签周围的冗余标点
* @param string $html 待处理HTML
* @return string
*/
private function cleanRedundantPunctuation($html) {
$html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1)</'.self::PROCESSED_TAG.'>.', $html);
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '</'.self::PROCESSED_TAG.'>$1', $html);
$html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i',
private function cleanRedundantPunctuation($html){
// 修复括号+标点的冗余格式
$html = @preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i',
'<'.self::PROCESSED_TAG.' data-id="$1">(Table $1)</'.self::PROCESSED_TAG.'>.', $html);
$html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
$html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
// 清理重复标点
$html = @preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '</'.self::PROCESSED_TAG.'>$1', $html);
// 修复括号内的标签冗余
$html = @preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i',
'<'.self::PROCESSED_TAG.' data-id="$1">($2)</'.self::PROCESSED_TAG.'>$3', $html);
return $html;
}
/**
* 清理孤立的样式标签(栈算法兜底,避免标签不闭合)
* @param string $html
* 清理未闭合的样式标签
* @param string $html 待处理HTML
* @return string
*/
private function cleanUnclosedTags($html) {
// 清理mytable后孤立的样式闭标签
private function cleanUnclosedTags($html){
foreach (self::STYLE_TAGS as $tag) {
// 清理mytable标签后的冗余闭合标签
$html = @preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html);
}
// 栈算法清理其他孤立标签
foreach (self::STYLE_TAGS as $tag) {
// 定位所有该标签的开闭标签位置
@preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
@preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
$allTags = [];
// 收集开标签
foreach ($openMatches[0] as $m) {
$allTags[] = ['offset' => $m[1], 'type' => 'open', 'content' => $m[0], 'length' => strlen($m[0])];
$allTags[] = [
'offset' => $m[1],
'type' => 'open',
'content' => $m[0],
'length' => strlen($m[0])
];
}
// 收集闭标签
foreach ($closeMatches[0] as $m) {
$allTags[] = ['offset' => $m[1], 'type' => 'close', 'content' => $m[0], 'length' => strlen($m[0])];
$allTags[] = [
'offset' => $m[1],
'type' => 'close',
'content' => $m[0],
'length' => strlen($m[0])
];
}
// 按位置排序
usort($allTags, function($a, $b) {
return $a['offset'] - $b['offset'];
});
// 栈结构匹配开闭标签
$tagStack = [];
$removeOffsets = [];
foreach ($allTags as $t) {
if ($t['type'] == 'open') {
if ($t['type'] === 'open') {
array_push($tagStack, $t);
} else {
if (!empty($tagStack)) {
array_pop($tagStack);
} else {
// 无匹配开标签的闭标签,标记删除
$removeOffsets[] = $t;
}
}
}
// 无匹配闭标签的开标签,标记删除
foreach ($tagStack as $t) {
$removeOffsets[] = $t;
}
// 倒序删除避免偏移错乱
// 按偏移量倒序删除避免影响后续偏移)
usort($removeOffsets, function($a, $b) {
return $b['offset'] - $a['offset'];
});
foreach ($removeOffsets as $item) {
if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) {
$html = substr_replace($html, '', $item['offset'], $item['length']);
}
}
}
return $html;
}
/**
 * Tidy whitespace in the text and around the processed tag.
 * @param string $html HTML to tidy
 * @return string Tidied HTML
 */
private function optimizeFormat($html) {
// Trim the ends, then collapse every run of two or more whitespace characters to one space.
$html = preg_replace('/\s{2,}/', ' ', trim($html));
// Insert a space when a letter or digit directly follows the closing processed tag.
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', '</'.self::PROCESSED_TAG.'> $1', $html);
// Insert a space when a letter or digit directly precedes the opening processed tag.
$html = preg_replace('/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.'', $html);
return $html;
}
/**
* 清理重复嵌套的mytable标签兜底方案
* @param string $html
* 优化文本格式(清理多余空格
* @param string $html 待处理HTML
* @return string
*/
private function cleanDuplicateNestedTags($html) {
private function optimizeFormat($html){
    // Tidies whitespace: trims the text, collapses whitespace runs, and makes sure the
    // processed tag is separated by a space from any adjacent letter or digit.
    $tag = self::PROCESSED_TAG;
    // Applied in order; each entry is [regex, replacement].
    $steps = [
        // Runs of two or more whitespace characters become a single space.
        ['/\s{2,}/', ' '],
        // Closing tag immediately followed by a letter/digit.
        ['/<\/' . $tag . '>([A-Za-z0-9])/is', '</' . $tag . '> $1'],
        // Letter/digit immediately followed by the opening tag.
        ['/([a-zA-Z0-9])<' . $tag . '/is', '$1 <' . $tag],
    ];

    $text = trim($html);
    foreach ($steps as [$regex, $replacement]) {
        $text = @preg_replace($regex, $replacement, $text);
    }
    return $text;
}
/**
* 清理嵌套的mytable标签避免重复嵌套
* @param string $html 待处理HTML
* @return string
*/
private function cleanDuplicateNestedTags($html){
$pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is';
$html = preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>', $html);
$html = @preg_replace($pattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>', $html);
return $html;
}
/**
 * Report whether the given content contains the table label wrapped in the processed tag.
 * @param string $content Content to inspect
 * @param string $tableText Table label, e.g. "Table 1"
 * @return bool True only when the wrapped form is found
 */
private function isMatchPositionHasMyTableTag($content, $tableText) {
// Escape the label so it is matched literally inside the pattern.
$escapedText = preg_quote($tableText, '/');
// "<tag ...> label </tag>" with optional whitespace around the label; case-insensitive.
$pattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $escapedText . '\s*<\/' . self::PROCESSED_TAG . '>/is';
// preg_match returns 1 on a match; 0 (no match) and false (error) both yield false here.
return @preg_match($pattern, $content) === 1;
}
}