Merge remote-tracking branch 'origin/master'
This commit is contained in:
@@ -1,52 +1,63 @@
|
||||
<?php
|
||||
namespace app\common;
|
||||
class FigureTagProcessor {
|
||||
// 可配置的样式标签列表(解耦)
|
||||
const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue'];
|
||||
// 最大处理字符串长度(避免内存溢出)
|
||||
const MAX_HTML_LENGTH = 100000;
|
||||
|
||||
/**
|
||||
* 功能:精准匹配并替换Figure相关格式为myfigure标签
|
||||
* 支持格式:figure 数字、(figure 数字)、figure 数字:/figure 数字.(含嵌套/拆分标签)
|
||||
* 跳过已被myfigure包裹的Figure(含后缀)
|
||||
* 跳过Figure 数字+字母/数字后缀(含拆分标签场景,无论是否有空白)
|
||||
* 正常处理Figure 数字+空白/样式标签场景
|
||||
*/
|
||||
class FigureTagProcessor{
|
||||
//支持的样式标签列表
|
||||
private const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue'];
|
||||
//HTML文本最大处理长度(防止内存溢出)
|
||||
private const MAX_HTML_LENGTH = 100000;
|
||||
//替换后的目标标签名
|
||||
private const PROCESSED_TAG = 'myfigure';
|
||||
//Figure数字与对应ID的映射数组
|
||||
private $aImageMain = [];
|
||||
/**
|
||||
* 处理Figure文本,替换为myfigure标签并清理冗余内容
|
||||
* 处理Figure标签替换的主方法
|
||||
* @param string $html 待处理的HTML文本
|
||||
* @return array ['status'=>状态码, 'data'=>处理后文本]
|
||||
* status: 2-空输入, 4-无匹配, 5-处理异常, 1-处理成功
|
||||
* @param array $aImageMain Figure数字=>ID的映射数组
|
||||
* @return array ['status' => 状态码, 'data' => 处理后文本]
|
||||
* status说明:2-空文本, 4-无匹配/已处理, 1-处理成功, 5-处理异常
|
||||
*/
|
||||
public function dealFigureStr($html = '') {
|
||||
//验证
|
||||
if (!is_string($html) || trim($html) === '') {
|
||||
public function dealFigureStr($html = '', $aImageMain = []){
|
||||
//空文本校验
|
||||
$html = trim($html);
|
||||
if ($html === '' || !is_string($html)) {
|
||||
return ['status' => 2, 'data' => ''];
|
||||
}
|
||||
//超大字符串拦截
|
||||
//超长文本保护
|
||||
if (strlen($html) > self::MAX_HTML_LENGTH) {
|
||||
return ['status' => 4, 'data' => $html];
|
||||
}
|
||||
|
||||
//编码处理
|
||||
if (!mb_check_encoding($html, 'UTF-8')) {
|
||||
$html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1');
|
||||
}
|
||||
//初始化映射数组(过滤非数字键值)
|
||||
$this->initImageMap($aImageMain);
|
||||
//原始内容
|
||||
$originalHtml = $html;
|
||||
$hasReplace = false;
|
||||
|
||||
try {
|
||||
//合并嵌套样式标签
|
||||
$mergedHtml = $this->mergeFragmentStyleTags($html);
|
||||
//提取纯文本(用于匹配Figure)
|
||||
$plainText = preg_replace('/<[^>]+>/', ' ', $mergedHtml);
|
||||
$plainText = preg_replace('/\s+/', ' ', trim($plainText));
|
||||
|
||||
//提取所有匹配的Figure数字
|
||||
$allMatches = $this->extractAllFigureMatches($plainText);
|
||||
if (empty($allMatches)) {
|
||||
return ['status' => 4, 'data' => $originalHtml];
|
||||
//只要包含数字+字母/数字后缀,直接返回原内容(核心修复)
|
||||
if ($this->hasFigureSuffix($html)) {
|
||||
return ['status' => 4, 'data' => $html];
|
||||
}
|
||||
|
||||
//替换为myfigure标签
|
||||
$html = $this->replaceFigureWithTag($html, $allMatches, $hasReplace);
|
||||
|
||||
//清理冗余内容(仅替换成功后执行)
|
||||
//合并拆分标签的Figure+数字
|
||||
$html = $this->preprocessSplitTags($html);
|
||||
//替换
|
||||
$html = $this->replaceFigureInHtml($html, $hasReplace);
|
||||
//清理冗余样式/标签
|
||||
if ($hasReplace) {
|
||||
$html = $this->cleanRedundantStyles($html);
|
||||
$html = $this->cleanRedundantPunctuation($html);
|
||||
$html = $this->cleanUnclosedTags($html);
|
||||
$html = $this->optimizeFormat($html);
|
||||
$html = $this->cleanDuplicateNestedTags($html);
|
||||
}
|
||||
|
||||
} catch (\Throwable $e) {
|
||||
@@ -55,220 +66,272 @@ class FigureTagProcessor {
|
||||
|
||||
return [
|
||||
'status' => $hasReplace ? 1 : 4,
|
||||
'data' => $hasReplace ? $html : $originalHtml
|
||||
'data' => $html
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* 合并嵌套的样式标签
|
||||
* @param string $html
|
||||
* @return string
|
||||
* 全局检测是否包含Figure数字+字母/数字后缀
|
||||
* 覆盖所有拆分/嵌套/无标签场景,无论是否有空白
|
||||
* @param string $html 待检测HTML
|
||||
* @return bool
|
||||
*/
|
||||
private function mergeFragmentStyleTags($html) {
|
||||
foreach (self::STYLE_TAGS as $tag) {
|
||||
$pattern = '/(?:<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>(?:\s*<' . $tag . '>)\s*([^<]+?)\s*<\/' . $tag . '>/is';
|
||||
while (@preg_match($pattern, $html)) { // 抑制正则警告
|
||||
$html = preg_replace_callback($pattern, function($matches) {
|
||||
return trim($matches[1]) . ' ' . trim($matches[2]);
|
||||
}, $html);
|
||||
/**
 * Report whether the HTML contains a "Figure <number><suffix>" reference,
 * where the suffix is a letter or digit glued to the number.
 *
 * Three layouts are checked, all case-insensitively:
 *   1. plain text:        Figure 4B
 *   2. split style tags:  Figure</b> <b>4</b>&nbsp;<b>B
 *   3. one wrapping tag:  Figure <b>4B</b>
 *
 * NOTE(review): the suffix class includes digits, so any number with two or
 * more digits ("Figure 12") is also reported as suffixed, because the regex
 * backtracks to "1" + "2". Confirm that this is intended.
 *
 * @param string $html HTML to inspect
 * @return bool true when at least one layout matches
 */
private function hasFigureSuffix($html){
    $tagNames = implode('|', self::STYLE_TAGS);
    $openTag  = '(?:<(?:' . $tagNames . ')[^>]*>)';
    $closeTag = '(?:<\/(?:' . $tagNames . ')>)';

    // Filler allowed between a closing tag and the next opening tag:
    // whitespace, a literal no-break space, or the &nbsp; entity.
    // The previous "[\s| ]" was a character class, so "|" matched a literal
    // pipe and "&nbsp;" was never skipped; it was also wrapped in redundant
    // "\s*" quantifiers that only added backtracking.
    $gap = '(?:[\s\x{00A0}]|&nbsp;)*';

    $patterns = [
        // 1. No tags between the number and its suffix.
        '/figure\s*\d+[a-zA-Z0-9]/iu',
        // 2. Number and suffix live in separate, adjacent style tags.
        '/figure\s*' . $closeTag . $gap . $openTag . '\s*\d+\s*'
            . $closeTag . $gap . $openTag . '\s*[a-zA-Z0-9]/iu',
        // 3. Number and suffix share one style tag.
        '/figure\s*' . $openTag . '\s*\d+[a-zA-Z0-9]\s*' . $closeTag . '/iu',
    ];

    foreach ($patterns as $pattern) {
        // preg_match() yields false on PCRE errors; treat that as "no match".
        if (preg_match($pattern, $html) === 1) {
            return true;
        }
    }

    return false;
}
|
||||
/**
 * Store a sanitised copy of the figure-number => ID map on the instance.
 *
 * Non-array input is treated as an empty map. An entry is kept only when
 * both its key and its value consist solely of decimal digits; kept entries
 * are cast to int on both sides.
 *
 * @param array $aImageMain raw map supplied by the caller
 * @return void
 */
private function initImageMap($aImageMain){
    $rawMap   = is_array($aImageMain) ? $aImageMain : [];
    $cleanMap = [];

    foreach ($rawMap as $figureNo => $imageId) {
        // Reject anything that is not a pure digit string on either side.
        if (!ctype_digit((string)$figureNo)) {
            continue;
        }
        if (!ctype_digit((string)$imageId)) {
            continue;
        }
        $cleanMap[(int)$figureNo] = (int)$imageId;
    }

    $this->aImageMain = $cleanMap;
}
|
||||
/**
 * Join a "Figure" word and its number when style tags split them apart.
 *
 * Two layouts are rewritten to "Figure <number>" (original casing of the
 * word is kept, the tags between word and number are removed):
 *   1. Figure</b><b>2                (one close/open pair in between)
 *   2. Figure</b><b> </b><b>2        (an extra whitespace-only tag pair)
 *
 * Both layouts now produce a single separating space. Layout 2 previously
 * tried to re-emit the captured whitespace, but the capture group followed
 * a greedy "\s*" and was therefore always empty, yielding "Figure2".
 *
 * @param string $html HTML to normalise
 * @return string normalised HTML; the input is returned unchanged for a
 *                pass whose regex fails
 */
private function preprocessSplitTags($html){
    $tagNames = implode('|', self::STYLE_TAGS);
    $closeTag = '(?:<\/(?:' . $tagNames . ')>)';
    $openTag  = '(?:<(?:' . $tagNames . ')[^>]*>)';

    // Layout 1: word, closing tag, opening tag, number.
    $direct = '/(figure)\s*' . $closeTag . '\s*' . $openTag . '\s*(\d+)/iu';
    // preg_replace() returns null on PCRE errors; keep the text in that case.
    $html = preg_replace($direct, '${1} ${2}', $html) ?? $html;

    // Layout 2: same, with one whitespace-only tag pair in the middle.
    $viaBlankTag = '/(figure)' . $closeTag . '\s*' . $openTag . '\s*'
        . $closeTag . '\s*' . $openTag . '\s*(\d+)/iu';
    $html = preg_replace($viaBlankTag, '${1} ${2}', $html) ?? $html;

    return $html;
}
|
||||
|
||||
/**
|
||||
* 从纯文本中提取所有Figure数字(兼容括号/标点/空格)
|
||||
* @param string $plainText
|
||||
* @return array
|
||||
*/
|
||||
private function extractAllFigureMatches($plainText) {
|
||||
$allMatches = [];
|
||||
$processedNums = [];
|
||||
|
||||
// 匹配带括号的Figure(如 (Figure 1.))
|
||||
$pattern1 = '/\(Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\)\s*([\.,:]{0,1})/iu';
|
||||
if (@preg_match_all($pattern1, $plainText, $matchesFull, PREG_SET_ORDER)) {
|
||||
foreach ($matchesFull as $match) {
|
||||
$num = $match[1];
|
||||
if (!ctype_digit($num) || in_array($num, $processedNums)) continue;
|
||||
$processedNums[] = $num;
|
||||
$allMatches[$num] = [
|
||||
'hasOuterBracket' => true,
|
||||
'validPunct' => $match[2] ?? '',
|
||||
'content' => "Figure {$num}"
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
// 匹配无括号的Figure(如 Figure 1.)
|
||||
$pattern2 = '/Figure\s*(\d+)\b(?!\p{L}|\s+\p{L})(?:\s*[\.,;:]*\s*)\s*([\.,:]{0,1})/iu';
|
||||
if (@preg_match_all($pattern2, $plainText, $matchesOther, PREG_SET_ORDER)) {
|
||||
foreach ($matchesOther as $match) {
|
||||
$num = $match[1];
|
||||
if (!ctype_digit($num) || in_array($num, $processedNums)) continue;
|
||||
$processedNums[] = $num;
|
||||
$allMatches[$num] = [
|
||||
'hasOuterBracket' => false,
|
||||
'validPunct' => $match[2] ?? '',
|
||||
'content' => "Figure {$num}"
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
krsort($allMatches);
|
||||
return $allMatches;
|
||||
}
|
||||
|
||||
/**
|
||||
* 将匹配的Figure替换为myfigure标签(优化标签格式)
|
||||
* @param string $html
|
||||
* @param array $allMatches
|
||||
* @param bool $hasReplace
|
||||
* 核心替换逻辑:将纯数字Figure替换为myfigure标签
|
||||
* @param string $html 待处理HTML
|
||||
* @param bool $hasReplace 是否发生替换(引用传递)
|
||||
* @return string
|
||||
*/
|
||||
private function replaceFigureWithTag($html, $allMatches, &$hasReplace) {
|
||||
foreach ($allMatches as $num => $info) {
|
||||
$innerContent = $info['hasOuterBracket']
|
||||
? "({$info['content']})"
|
||||
: $info['content'];
|
||||
/**
 * Wrap plain-number "Figure N" references in the processed tag.
 *
 * A reference is replaced only when N is all digits and has an entry in
 * $this->aImageMain; the mapped value becomes the tag's data-id. Style tags
 * directly around the reference are consumed by the match and not re-emitted.
 * One optional trailing ".", "," or ":" is kept after the tag.
 *
 * Pass 1 handles references that directly follow "(", pass 2 handles the
 * rest. Pass 2 can match text that pass 1 has just wrapped, because
 * isMatchPositionHasMyFigureTag() only sees the matched fragment.
 *
 * Fix: pass 1 used to append ")" unconditionally although its pattern never
 * consumed the original ")", so "(Figure 2)" became "(<tag>…</tag>))". The
 * closing bracket is now captured when present and re-emitted as matched.
 *
 * @param string $html       HTML to process
 * @param bool   $hasReplace set to true (by reference) once any reference
 *                           has been replaced
 * @return string processed HTML
 */
private function replaceFigureInHtml($html, &$hasReplace){
    $tagNames     = implode('|', self::STYLE_TAGS);
    $leadingTags  = '(?:<(?:' . $tagNames . ')[^>]*>)*\s*';
    $trailingTags = '\s*(?:<\/(?:' . $tagNames . ')>)*';

    // Build the replacement tag, or return null when the match must be left alone.
    $buildTag = function ($num, $fragment) {
        if (!ctype_digit($num)) {
            return null;
        }
        $figureNo = (int)$num;
        if (!isset($this->aImageMain[$figureNo])) {
            return null;
        }
        if ($this->isMatchPositionHasMyFigureTag($fragment, "Figure {$num}")) {
            return null;
        }
        $primaryId = $this->aImageMain[$figureNo];

        return '<' . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Figure {$num}</" . self::PROCESSED_TAG . '>';
    };

    // Pass 1: "(" + optional style tags + Figure N + optional punctuation
    // + optional closing tags + optional ")". The look-ahead rejects a
    // following letter or digit.
    $bracketed = '/\(\s*' . $leadingTags . 'figure\s*(\d+)\s*([\.,:]{0,1})'
        . $trailingTags . '\s*(\)?)(?![a-zA-Z0-9])/iuD';
    $result = preg_replace_callback($bracketed, function ($m) use (&$hasReplace, $buildTag) {
        $tag = $buildTag($m[1], $m[0]);
        if ($tag === null) {
            return $m[0];
        }
        $hasReplace = true;

        // Re-emit ")" only if the match actually consumed one.
        return '(' . $tag . ($m[2] ?? '') . ($m[3] ?? '');
    }, $html);
    if ($result !== null) {
        // null signals a PCRE error; keep the previous text in that case.
        $html = $result;
    }

    // Pass 2: the same reference without a leading bracket.
    $bare = '/' . $leadingTags . 'figure\s*(\d+)\s*([\.,:]{0,1})'
        . $trailingTags . '\s*(?![a-zA-Z0-9])/iuD';
    $result = preg_replace_callback($bare, function ($m) use (&$hasReplace, $buildTag) {
        $tag = $buildTag($m[1], $m[0]);
        if ($tag === null) {
            return $m[0];
        }
        $hasReplace = true;

        return $tag . ($m[2] ?? '');
    }, $html);
    if ($result !== null) {
        $html = $result;
    }

    return $html;
}
|
||||
|
||||
/**
|
||||
* 清理myfigure标签周围的冗余样式标签(适配新标签格式)
|
||||
* @param string $html
|
||||
* 检测当前匹配内容是否已包含myfigure标签(避免重复替换)
|
||||
* @param string $content 匹配的文本片段
|
||||
* @param string $figureText 待检测的Figure文本(如 Figure 2)
|
||||
* @return bool
|
||||
*/
|
||||
/**
 * Tell whether a text fragment already contains the given figure text
 * wrapped in the processed tag (optional whitespace inside the tag allowed).
 *
 * @param string $content    fragment to inspect
 * @param string $figureText literal figure text, e.g. "Figure 2"
 * @return bool
 */
private function isMatchPositionHasMyFigureTag($content, $figureText){
    $pattern = sprintf(
        '/<%1$s[^>]*>\s*%2$s\s*<\/%1$s>/is',
        self::PROCESSED_TAG,
        preg_quote($figureText, '/')
    );

    return preg_match($pattern, $content) === 1;
}
|
||||
/**
|
||||
* 清理myfigure标签周围的冗余样式标签
|
||||
* @param string $html 待处理HTML
|
||||
* @return string
|
||||
*/
|
||||
private function cleanRedundantStyles($html) {
|
||||
/**
 * Strip style tags that wrap a processed tag, then drop stray closing tags.
 *
 * Step 1: for each style tag, "<tag><myfigure …>…</myfigure>[.,:]</tag>"
 * is reduced to the processed tag plus the optional punctuation.
 * Step 2: a closing style tag is removed unless the text after it, up to
 * the next "<", is followed by a bare opening tag of the same name.
 *
 * NOTE(review): step 2 runs over the whole document, not only near
 * processed tags. Confirm that this breadth is intended.
 *
 * @param string $html HTML to clean
 * @return string
 */
private function cleanRedundantStyles($html){
    $target    = self::PROCESSED_TAG;
    $unwrapped = '<' . $target . '$1>$2</' . $target . '>$3';

    foreach (self::STYLE_TAGS as $styleTag) {
        $wrapped = '/<' . $styleTag . '>\s*<' . $target . '([^>]*?)>(.*?)<\/' . $target
            . '>([\.,:]{0,1})\s*<\/' . $styleTag . '>/is';
        $html = preg_replace($wrapped, $unwrapped, $html);
    }

    $anyStyleTag = implode('|', self::STYLE_TAGS);

    return preg_replace('/<\/(' . $anyStyleTag . ')>(?![^<]*<\1>)/is', '', $html);
}
|
||||
|
||||
/**
|
||||
* 清理myfigure标签后的冗余标点(适配新标签格式)
|
||||
* @param string $html
|
||||
* 清理myfigure标签周围的冗余标点
|
||||
* @param string $html 待处理HTML
|
||||
* @return string
|
||||
*/
|
||||
private function cleanRedundantPunctuation($html) {
|
||||
$html = preg_replace('/<myfigure data-id="(\d+)">\(Figure \d+\)<\/myfigure>\)\./i', '<myfigure data-id="$1">(Figure $1)</myfigure>.', $html);
|
||||
$html = preg_replace('/<\/myfigure>\)\.([\.,:]{0,1})/', '</myfigure>)$1', $html);
|
||||
$html = preg_replace('/<\/myfigure>\.\)([\.,:]{0,1})/', '</myfigure>)$1', $html);
|
||||
$html = preg_replace('/<\/myfigure>([\.,:]){2,}/', '</myfigure>$1', $html);
|
||||
$html = preg_replace('/<myfigure data-id="(\d+)">\((Figure \d+)\s*<\/myfigure>([\.,:]{0,1})/i',
|
||||
'<myfigure data-id="$1">($2)</myfigure>$3', $html);
|
||||
/**
 * Tidy brackets and punctuation around processed tags.
 *
 * The rules run in order, each on the output of the previous one.
 *
 * @param string $html HTML to clean
 * @return string
 */
private function cleanRedundantPunctuation($html){
    $tag      = self::PROCESSED_TAG;
    $closeRx  = '<\/' . $tag . '>';   // closing tag, regex-escaped
    $closeOut = '</' . $tag . '>';    // closing tag, literal output

    $rules = [
        // Bracketed figure inside the tag followed by ")." outside it.
        [
            '/<' . $tag . ' data-id="(\d+)">\(Figure \d+\)' . $closeRx . '\)\./i',
            '<' . $tag . ' data-id="$1">(Figure $1)' . $closeOut . '.',
        ],
        // ")." after the tag: drop the dot, keep one trailing mark.
        ['/' . $closeRx . '\)\.([\.,:]{0,1})/', $closeOut . ')$1'],
        // ".)" after the tag: drop the dot, keep one trailing mark.
        ['/' . $closeRx . '\.\)([\.,:]{0,1})/', $closeOut . ')$1'],
        // Runs of two or more marks collapse to the last one captured.
        ['/' . $closeRx . '([\.,:]){2,}/', $closeOut . '$1'],
        // Opening bracket inside the tag without its closing bracket.
        [
            '/<' . $tag . ' data-id="(\d+)">\((Figure \d+)\s*' . $closeRx . '([\.,:]{0,1})/i',
            '<' . $tag . ' data-id="$1">($2)' . $closeOut . '$3',
        ],
    ];

    foreach ($rules as [$pattern, $replacement]) {
        $html = preg_replace($pattern, $replacement, $html);
    }

    return $html;
}
|
||||
|
||||
/**
|
||||
* 清理孤立的样式标签
|
||||
* @param string $html
|
||||
* 清理未闭合的样式标签
|
||||
* @param string $html 待处理HTML
|
||||
* @return string
|
||||
*/
|
||||
private function cleanUnclosedTags($html) {
|
||||
private function cleanUnclosedTags($html){
|
||||
foreach (self::STYLE_TAGS as $tag) {
|
||||
$html = @preg_replace('/(<\/myfigure>)\s*<\/' . $tag . '>/i', '$1', $html);
|
||||
}
|
||||
foreach (self::STYLE_TAGS as $tag) {
|
||||
@preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
|
||||
@preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
|
||||
|
||||
// 清理myfigure标签后的冗余闭合标签
|
||||
$html = preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html);
|
||||
|
||||
// 定位所有该标签的开闭标签位置
|
||||
preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
|
||||
preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
|
||||
|
||||
$allTags = [];
|
||||
// 收集开标签
|
||||
foreach ($openMatches[0] as $m) {
|
||||
$allTags[] = [
|
||||
'offset' => $m[1],
|
||||
'type' => 'open',
|
||||
'content' => $m[0],
|
||||
'offset' => $m[1],
|
||||
'type' => 'open',
|
||||
'content' => $m[0],
|
||||
'length' => strlen($m[0])
|
||||
];
|
||||
}
|
||||
// 收集闭标签
|
||||
foreach ($closeMatches[0] as $m) {
|
||||
$allTags[] = [
|
||||
'offset' => $m[1],
|
||||
'type' => 'close',
|
||||
'content' => $m[0],
|
||||
'offset' => $m[1],
|
||||
'type' => 'close',
|
||||
'content' => $m[0],
|
||||
'length' => strlen($m[0])
|
||||
];
|
||||
}
|
||||
|
||||
// 按位置排序
|
||||
usort($allTags, function($a, $b) {
|
||||
return $a['offset'] - $b['offset'];
|
||||
});
|
||||
|
||||
// 栈结构匹配开闭标签
|
||||
$tagStack = [];
|
||||
$removeOffsets = [];
|
||||
foreach ($allTags as $t) {
|
||||
if ($t['type'] == 'open') {
|
||||
if ($t['type'] === 'open') {
|
||||
array_push($tagStack, $t);
|
||||
} else {
|
||||
if (!empty($tagStack)) {
|
||||
array_pop($tagStack);
|
||||
} else {
|
||||
$removeOffsets[] = [
|
||||
'pos' => $t['offset'],
|
||||
'len' => $t['length'],
|
||||
'content' => $t['content']
|
||||
];
|
||||
// 无匹配开标签的闭标签,标记删除
|
||||
$removeOffsets[] = $t;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 无匹配闭标签的开标签,标记删除
|
||||
foreach ($tagStack as $t) {
|
||||
$removeOffsets[] = [
|
||||
'pos' => $t['offset'],
|
||||
'len' => $t['length'],
|
||||
'content' => $t['content']
|
||||
];
|
||||
$removeOffsets[] = $t;
|
||||
}
|
||||
|
||||
// 倒序删除,避免偏移错乱
|
||||
// 按偏移量倒序删除(避免影响后续偏移)
|
||||
usort($removeOffsets, function($a, $b) {
|
||||
return $b['pos'] - $a['pos'];
|
||||
return $b['offset'] - $a['offset'];
|
||||
});
|
||||
|
||||
foreach ($removeOffsets as $item) {
|
||||
if ($item['pos'] >= 0 && $item['pos'] < strlen($html)) {
|
||||
$html = substr_replace($html, '', $item['pos'], $item['len']);
|
||||
if ($item['offset'] >= 0 && $item['offset'] < strlen($html)) {
|
||||
$html = substr_replace($html, '', $item['offset'], $item['length']);
|
||||
}
|
||||
}
|
||||
}
|
||||
return $html;
|
||||
}
|
||||
|
||||
return $html;
|
||||
}
|
||||
/**
|
||||
* 优化文本格式(合并多余空格,规范myfigure标签前后空格)
|
||||
* @param string $html
|
||||
* 优化文本格式(清理多余空格)
|
||||
* @param string $html 待处理HTML
|
||||
* @return string
|
||||
*/
|
||||
private function optimizeFormat($html) {
|
||||
/**
 * Normalise spacing in the processed text.
 *
 * Trims the input, collapses runs of two or more whitespace characters to
 * one space, and inserts a space where a letter or digit touches a
 * processed tag on either side.
 *
 * @param string $html HTML to format
 * @return string
 */
private function optimizeFormat($html){
    $tag = self::PROCESSED_TAG;

    $steps = [
        // Collapse repeated whitespace.
        '/\s{2,}/' => ' ',
        // Closing tag immediately followed by a letter or digit.
        '/<\/' . $tag . '>([A-Za-z0-9])/is' => '</' . $tag . '> $1',
        // Letter or digit immediately followed by an opening tag.
        '/([a-zA-Z0-9])<' . $tag . '/is' => '$1 <' . $tag,
    ];

    $html = trim($html);
    foreach ($steps as $pattern => $replacement) {
        $html = preg_replace($pattern, $replacement, $html);
    }

    return $html;
}
|
||||
/**
 * Collapse a processed tag that directly wraps another processed tag.
 *
 * The outer tag is discarded; the inner tag's attributes and content are
 * kept. Whitespace between the two levels is allowed.
 *
 * @param string $html HTML to clean
 * @return string
 */
private function cleanDuplicateNestedTags($html){
    $tag     = self::PROCESSED_TAG;
    $openRx  = '<' . $tag;
    $closeRx = '<\/' . $tag . '>';

    $nested = '/' . $openRx . '[^>]*>\s*' . $openRx . '([^>]*)>(.*?)' . $closeRx . '\s*' . $closeRx . '/is';

    return preg_replace($nested, '<' . $tag . '$1>$2</' . $tag . '>', $html);
}
|
||||
}
|
||||
@@ -2,59 +2,71 @@
|
||||
namespace app\common;
|
||||
|
||||
/**
|
||||
* Table标签处理器(生产环境终极版)
|
||||
* 功能:精准匹配并替换Table相关格式为mytable标签
|
||||
* 支持格式:table 数字、(table 数字)、table 数字:/table 数字.(含嵌套标签)
|
||||
* 特性:支持任意嵌套标签/括号、不处理数字+字母、仅跳过已被mytable包裹的Table
|
||||
* 支持格式:table 数字、(table 数字)、table 数字:/table 数字.(含嵌套/拆分标签)
|
||||
* 跳过已被mytable包裹的table(含后缀)
|
||||
* 跳过table 数字+字母/数字后缀(含拆分标签场景,无论是否有空白)
|
||||
* 正常处理table 数字+空白/样式标签场景
|
||||
*/
|
||||
class TableTagProcessor {
|
||||
// 可配置的样式标签列表
|
||||
const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em'];
|
||||
// 最大处理字符串长度
|
||||
const MAX_HTML_LENGTH = 100000;
|
||||
// 目标替换标签
|
||||
const PROCESSED_TAG = 'mytable';
|
||||
// 数据库表格ID映射
|
||||
class TableTagProcessor{
|
||||
// 支持的样式标签列表
|
||||
private const STYLE_TAGS = ['i', 'b', 'font', 'strong', 'em','blue'];
|
||||
// HTML文本最大处理长度(防止内存溢出)
|
||||
private const MAX_HTML_LENGTH = 100000;
|
||||
// 替换后的目标标签名
|
||||
private const PROCESSED_TAG = 'mytable';
|
||||
// Table数字与对应ID的映射数组
|
||||
private $aTableMain = [];
|
||||
|
||||
/**
|
||||
* 处理Table文本,替换为mytable标签并清理冗余内容
|
||||
* 处理Table标签替换的主方法
|
||||
* @param string $html 待处理的HTML文本
|
||||
* @param array $aTableMain Table数字→主键ID的映射数组(如 [1=>1001, 2=>1002])
|
||||
* @return array ['status'=>状态码, 'data'=>处理后文本]
|
||||
* status: 2-空输入, 4-无匹配/已处理, 5-处理异常, 1-处理成功
|
||||
* @param array $aTableMain Table数字=>ID的映射数组(可选,默认1=>1~10=>10)
|
||||
* @return array ['status' => 状态码, 'data' => 处理后文本]
|
||||
* status说明:2-空文本, 4-无匹配/已处理, 1-处理成功, 5-处理异常
|
||||
*/
|
||||
public function dealTableStr($html = '', $aTableMain = []) {
|
||||
//验证
|
||||
if (!is_string($html) || trim($html) === '') {
|
||||
public function dealTableStr($html = '', $aTableMain = []){
|
||||
// 初始化默认映射数组(仅当入参为空时使用)
|
||||
$defaultTableMap = [1=>1,2=>2,3=>3,4=>4,5=>5,6=>6,7=>7,8=>8,9=>9,10=>10];
|
||||
// 优先使用入参,入参为空则用默认值
|
||||
$tableMap = !empty($aTableMain) ? $aTableMain : $defaultTableMap;
|
||||
|
||||
// 空文本校验
|
||||
$html = trim($html);
|
||||
if ($html === '' || !is_string($html)) {
|
||||
return ['status' => 2, 'data' => ''];
|
||||
}
|
||||
//超大字符串拦截(防止内存溢出)
|
||||
|
||||
// 超长文本保护
|
||||
if (strlen($html) > self::MAX_HTML_LENGTH) {
|
||||
return ['status' => 4, 'data' => $html];
|
||||
}
|
||||
|
||||
//初始化主键映射数组
|
||||
if(!empty($aTableMain)){
|
||||
$aTableMainNew = [];
|
||||
foreach ($aTableMain as $key => $value) {
|
||||
if (!ctype_digit((string)$key) || !ctype_digit((string)$value)) {
|
||||
continue;
|
||||
}
|
||||
$keyInt = (int)$key;
|
||||
$aTableMainNew[$keyInt + 1] = $value;
|
||||
}
|
||||
$this->aTableMain = $aTableMainNew;
|
||||
// 编码处理(统一转为UTF-8,避免中文乱码)
|
||||
if (!mb_check_encoding($html, 'UTF-8')) {
|
||||
$html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1');
|
||||
}
|
||||
|
||||
// 初始化映射数组(过滤非数字键值)
|
||||
$this->initTableMap($tableMap);
|
||||
|
||||
// 原始内容(异常时返回)
|
||||
$originalHtml = $html;
|
||||
$hasReplace = false;
|
||||
|
||||
try {
|
||||
//原始HTML中匹配所有符合规则的Table
|
||||
// 只要包含数字+字母/数字后缀,直接返回原内容
|
||||
if ($this->hasTableSuffix($html)) {
|
||||
return ['status' => 4, 'data' => $html];
|
||||
}
|
||||
|
||||
// 合并拆分标签的Table+数字
|
||||
$html = $this->preprocessSplitTags($html);
|
||||
|
||||
// 核心替换逻辑
|
||||
$html = $this->replaceTableInHtml($html, $hasReplace);
|
||||
|
||||
// 清理冗余内容
|
||||
// 清理冗余样式/标签(仅当发生替换时执行)
|
||||
if ($hasReplace) {
|
||||
$html = $this->cleanRedundantStyles($html);
|
||||
$html = $this->cleanRedundantPunctuation($html);
|
||||
@@ -74,50 +86,120 @@ class TableTagProcessor {
|
||||
}
|
||||
|
||||
/**
|
||||
* 核心方法:直接在HTML中匹配并替换Table
|
||||
* 全局检测是否包含Table数字+字母/数字后缀
|
||||
* 覆盖所有拆分/嵌套/无标签场景,无论是否有空白
|
||||
* @param string $html 待检测HTML
|
||||
* @return bool
|
||||
*/
|
||||
/**
 * Report whether the HTML contains a "Table <number><suffix>" reference,
 * where the suffix is a letter or digit glued to the number.
 *
 * Three layouts are checked, all case-insensitively:
 *   1. plain text:        Table 4B
 *   2. split style tags:  Table</b> <b>4</b>&nbsp;<b>B
 *   3. one wrapping tag:  Table <b>4B</b>
 *
 * NOTE(review): the suffix class includes digits, so any number with two or
 * more digits ("Table 10") is also reported as suffixed, because the regex
 * backtracks to "1" + "0". Confirm that this is intended.
 *
 * @param string $html HTML to inspect
 * @return bool true when at least one layout matches
 */
private function hasTableSuffix($html){
    $tagNames = implode('|', self::STYLE_TAGS);
    $openTag  = '(?:<(?:' . $tagNames . ')[^>]*>)';
    $closeTag = '(?:<\/(?:' . $tagNames . ')>)';

    // Filler allowed between a closing tag and the next opening tag:
    // whitespace, a literal no-break space, or the &nbsp; entity.
    // The previous "[\s| ]" was a character class, so "|" matched a literal
    // pipe and "&nbsp;" was never skipped; it was also wrapped in redundant
    // "\s*" quantifiers that only added backtracking.
    $gap = '(?:[\s\x{00A0}]|&nbsp;)*';

    $patterns = [
        // 1. No tags between the number and its suffix.
        '/table\s*\d+[a-zA-Z0-9]/iu',
        // 2. Number and suffix live in separate, adjacent style tags.
        '/table\s*' . $closeTag . $gap . $openTag . '\s*\d+\s*'
            . $closeTag . $gap . $openTag . '\s*[a-zA-Z0-9]/iu',
        // 3. Number and suffix share one style tag.
        '/table\s*' . $openTag . '\s*\d+[a-zA-Z0-9]\s*' . $closeTag . '/iu',
    ];

    foreach ($patterns as $pattern) {
        // The patterns are fixed, so no compile warning can occur and no "@"
        // is needed; a runtime PCRE error returns false and counts as no match.
        if (preg_match($pattern, $html) === 1) {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/**
 * Store a sanitised copy of the table-number => ID map on the instance.
 *
 * Non-array input is treated as an empty map. An entry is kept only when
 * both its key and its value consist solely of decimal digits; kept entries
 * are cast to int on both sides.
 *
 * @param array $aTableMain raw map supplied by the caller
 * @return void
 */
private function initTableMap($aTableMain = []){
    $rawMap   = is_array($aTableMain) ? $aTableMain : [];
    $cleanMap = [];

    foreach ($rawMap as $tableNo => $tableId) {
        // Reject anything that is not a pure digit string on either side.
        if (!ctype_digit((string)$tableNo)) {
            continue;
        }
        if (!ctype_digit((string)$tableId)) {
            continue;
        }
        $cleanMap[(int)$tableNo] = (int)$tableId;
    }

    $this->aTableMain = $cleanMap;
}
|
||||
|
||||
/**
 * Join a "Table" word and its number when style tags split them apart.
 *
 * Two layouts are rewritten to "Table <number>" (original casing of the
 * word is kept, the tags between word and number are removed):
 *   1. Table</b><b>2                (one close/open pair in between)
 *   2. Table</b><b> </b><b>2        (an extra whitespace-only tag pair)
 *
 * Both layouts now produce a single separating space. Layout 2 previously
 * tried to re-emit the captured whitespace, but the capture group followed
 * a greedy "\s*" and was therefore always empty, yielding "Table2".
 *
 * @param string $html HTML to normalise
 * @return string normalised HTML; the input is returned unchanged for a
 *                pass whose regex fails
 */
private function preprocessSplitTags($html){
    $tagNames = implode('|', self::STYLE_TAGS);
    $closeTag = '(?:<\/(?:' . $tagNames . ')>)';
    $openTag  = '(?:<(?:' . $tagNames . ')[^>]*>)';

    // Layout 1: word, closing tag, opening tag, number.
    $direct = '/(table)\s*' . $closeTag . '\s*' . $openTag . '\s*(\d+)/iu';
    // preg_replace() returns null on PCRE errors; keep the text in that case
    // rather than suppressing diagnostics with "@".
    $html = preg_replace($direct, '${1} ${2}', $html) ?? $html;

    // Layout 2: same, with one whitespace-only tag pair in the middle.
    $viaBlankTag = '/(table)' . $closeTag . '\s*' . $openTag . '\s*'
        . $closeTag . '\s*' . $openTag . '\s*(\d+)/iu';
    $html = preg_replace($viaBlankTag, '${1} ${2}', $html) ?? $html;

    return $html;
}
|
||||
|
||||
/**
|
||||
* 核心替换逻辑:将纯数字Table替换为mytable标签
|
||||
* @param string $html 待处理HTML
|
||||
* @param bool $hasReplace 是否发生替换(引用传递)
|
||||
* @return string
|
||||
*/
|
||||
private function replaceTableInHtml($html, &$hasReplace){
|
||||
$styleTagsPattern = implode('|', self::STYLE_TAGS);
|
||||
$styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*";
|
||||
$styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*";
|
||||
|
||||
// 规则1:匹配带括号的Table(如 (Table 82)、(<b>Table 1.</b>))
|
||||
$pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*\)/iu";
|
||||
$html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
|
||||
// 正则1:匹配括号内的纯数字Table(如 (Table 2)、(<b>Table 3</b>))
|
||||
$pattern1 = "/\(\s*{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
|
||||
$html = @preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
|
||||
$num = $matches[1];
|
||||
$numInt = intval($num);
|
||||
$numInt = (int)$num;
|
||||
$suffix = $matches[2] ?? '';
|
||||
|
||||
// 校验:纯数字 + 有映射ID + 未被mytable包裹(避免重复替换)
|
||||
// 过滤条件:非数字、无映射、已处理过的标签
|
||||
if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) ||
|
||||
$this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) {
|
||||
return $matches[0];
|
||||
}
|
||||
|
||||
// 执行替换
|
||||
$primaryId = $this->aTableMain[$numInt];
|
||||
$baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">";
|
||||
$baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Table {$num}</" . self::PROCESSED_TAG . ">";
|
||||
$target = "({$baseTag}{$suffix})";
|
||||
|
||||
$hasReplace = true;
|
||||
return $target;
|
||||
}, $html);
|
||||
|
||||
// 规则2:匹配无括号的Table(如 Table 1、<b>Table 2:</b>、<i>Table 3.</i>)
|
||||
$pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z])/iu";
|
||||
$html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
|
||||
// 正则2:匹配无括号的纯数字Table(如 Table 2、<i>Table 3</i>:)
|
||||
$pattern2 = "/{$styleTagsRegex}table\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
|
||||
$html = @preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
|
||||
$num = $matches[1];
|
||||
$numInt = intval($num);
|
||||
$numInt = (int)$num;
|
||||
$suffix = $matches[2] ?? '';
|
||||
|
||||
// 校验:纯数字 + 有映射ID + 未被mytable包裹 + 不是数字+字母组合
|
||||
// 过滤条件:非数字、无映射、已处理过的标签
|
||||
if (!ctype_digit($num) || !isset($this->aTableMain[$numInt]) ||
|
||||
$this->isMatchPositionHasMyTableTag($matches[0], "Table {$num}")) {
|
||||
return $matches[0];
|
||||
}
|
||||
|
||||
// 执行替换
|
||||
$primaryId = $this->aTableMain[$numInt];
|
||||
$baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">";
|
||||
$baseTag = "<" . self::PROCESSED_TAG . " data-id=\"{$primaryId}\">Table {$num}</" . self::PROCESSED_TAG . ">";
|
||||
$target = "{$baseTag}{$suffix}";
|
||||
|
||||
$hasReplace = true;
|
||||
@@ -127,126 +209,157 @@ class TableTagProcessor {
|
||||
return $html;
|
||||
}
|
||||
|
||||
/**
 * Tell whether a text fragment already contains the given table text
 * wrapped in the processed tag (optional whitespace inside the tag allowed).
 *
 * @param string $content   fragment to inspect
 * @param string $tableText literal table text, e.g. "Table 2"
 * @return bool
 */
private function isMatchPositionHasMyTableTag($content, $tableText){
    $pattern = sprintf(
        '/<%1$s[^>]*>\s*%2$s\s*<\/%1$s>/is',
        self::PROCESSED_TAG,
        preg_quote($tableText, '/')
    );

    return @preg_match($pattern, $content) === 1;
}
|
||||
|
||||
/**
|
||||
* 清理mytable标签周围的冗余样式标签
|
||||
* @param string $html
|
||||
* @param string $html 待处理HTML
|
||||
* @return string
|
||||
*/
|
||||
private function cleanRedundantStyles($html) {
|
||||
/**
 * Strip style tags that wrap a processed tag, then drop stray closing tags.
 *
 * Step 1: for each style tag, "<tag><mytable …>…</mytable>[.,:]</tag>"
 * is reduced to the processed tag plus the optional punctuation.
 * Step 2: a closing style tag is removed unless the text after it, up to
 * the next "<", is followed by a bare opening tag of the same name.
 *
 * NOTE(review): step 2 runs over the whole document, not only near
 * processed tags. Confirm that this breadth is intended.
 *
 * @param string $html HTML to clean
 * @return string
 */
private function cleanRedundantStyles($html){
    $target    = self::PROCESSED_TAG;
    $unwrapped = '<' . $target . '$1>$2</' . $target . '>$3';

    foreach (self::STYLE_TAGS as $styleTag) {
        $wrapped = '/<' . $styleTag . '>\s*<' . $target . '([^>]*?)>(.*?)<\/' . $target
            . '>([\.,:]{0,1})\s*<\/' . $styleTag . '>/is';
        $html = @preg_replace($wrapped, $unwrapped, $html);
    }

    $anyStyleTag = implode('|', self::STYLE_TAGS);

    return @preg_replace('/<\/(' . $anyStyleTag . ')>(?![^<]*<\1>)/is', '', $html);
}
|
||||
|
||||
/**
|
||||
* 清理mytable标签后的冗余标点(保证格式整洁)
|
||||
* @param string $html
|
||||
* 清理mytable标签周围的冗余标点
|
||||
* @param string $html 待处理HTML
|
||||
* @return string
|
||||
*/
|
||||
private function cleanRedundantPunctuation($html) {
|
||||
$html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1)</'.self::PROCESSED_TAG.'>.', $html);
|
||||
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
|
||||
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
|
||||
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>([\.,:]){2,}/', '</'.self::PROCESSED_TAG.'>$1', $html);
|
||||
$html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\((Table \d+)\s*<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})/i',
|
||||
/**
 * Tidy brackets and punctuation around processed tags.
 *
 * The rules run in order, each on the output of the previous one.
 *
 * @param string $html HTML to clean
 * @return string
 */
private function cleanRedundantPunctuation($html){
    $tag      = self::PROCESSED_TAG;
    $closeRx  = '<\/' . $tag . '>';   // closing tag, regex-escaped
    $closeOut = '</' . $tag . '>';    // closing tag, literal output

    $rules = [
        // Bracketed table inside the tag followed by ")." outside it.
        [
            '/<' . $tag . ' data-id="(\d+)">\(Table \d+\)' . $closeRx . '\)\./i',
            '<' . $tag . ' data-id="$1">(Table $1)' . $closeOut . '.',
        ],
        // ")." after the tag: drop the dot, keep one trailing mark.
        ['/' . $closeRx . '\)\.([\.,:]{0,1})/', $closeOut . ')$1'],
        // ".)" after the tag: drop the dot, keep one trailing mark.
        ['/' . $closeRx . '\.\)([\.,:]{0,1})/', $closeOut . ')$1'],
        // Runs of two or more marks collapse to the last one captured.
        ['/' . $closeRx . '([\.,:]){2,}/', $closeOut . '$1'],
        // Opening bracket inside the tag without its closing bracket.
        [
            '/<' . $tag . ' data-id="(\d+)">\((Table \d+)\s*' . $closeRx . '([\.,:]{0,1})/i',
            '<' . $tag . ' data-id="$1">($2)' . $closeOut . '$3',
        ],
    ];

    foreach ($rules as [$pattern, $replacement]) {
        $html = @preg_replace($pattern, $replacement, $html);
    }

    return $html;
}
|
||||
|
||||
/**
 * Remove style tags that have no matching partner.
 *
 * For every tag name in self::STYLE_TAGS:
 *  1. a closing style tag that directly follows the closing PROCESSED_TAG
 *     (optionally separated by whitespace) is dropped;
 *  2. the remaining open/close tags are paired in document order with a stack;
 *     closing tags with no pending open tag and open tags that are never closed
 *     are deleted from the text.
 *
 * @param string $html HTML to clean
 * @return string cleaned HTML; a step whose PCRE call fails is skipped and leaves
 *                the text as it was
 */
private function cleanUnclosedTags($html){
    // Step 1: closing style tags placed right after the processed tag.
    foreach (self::STYLE_TAGS as $tag) {
        $stripped = preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html);
        // null signals a PCRE error; never let it replace the document.
        if ($stripped !== null) {
            $html = $stripped;
        }
    }

    // Step 2: stack-based pairing, one tag name at a time.
    foreach (self::STYLE_TAGS as $tag) {
        $openMatches = [];
        $closeMatches = [];
        $openFound = preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
        $closeFound = preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
        // If either scan failed the tag list is incomplete; pairing on it would
        // delete valid tags, so leave this tag name untouched.
        if ($openFound === false || $closeFound === false) {
            continue;
        }

        // Collect every occurrence with its byte offset and byte length.
        $tags = [];
        foreach ($openMatches[0] as $m) {
            $tags[] = ['offset' => $m[1], 'length' => strlen($m[0]), 'type' => 'open'];
        }
        foreach ($closeMatches[0] as $m) {
            $tags[] = ['offset' => $m[1], 'length' => strlen($m[0]), 'type' => 'close'];
        }

        // Document order.
        usort($tags, static function ($a, $b) {
            return $a['offset'] <=> $b['offset'];
        });

        $pendingOpens = [];
        $orphans = [];
        foreach ($tags as $t) {
            if ($t['type'] === 'open') {
                $pendingOpens[] = $t;
                continue;
            }
            // array_pop() yields null on an empty stack: this closing tag has no opener.
            if (array_pop($pendingOpens) === null) {
                $orphans[] = $t;
            }
        }
        // Whatever is still on the stack was never closed.
        foreach ($pendingOpens as $t) {
            $orphans[] = $t;
        }

        // Delete from the end of the string backwards so earlier offsets stay valid.
        usort($orphans, static function ($a, $b) {
            return $b['offset'] <=> $a['offset'];
        });
        foreach ($orphans as $item) {
            $html = substr_replace($html, '', $item['offset'], $item['length']);
        }
    }

    return $html;
}
|
||||
|
||||
/**
 * Normalise spacing in the processed text.
 *
 * - trims the text and collapses every run of two or more whitespace characters
 *   into a single space;
 * - inserts a space between the closing PROCESSED_TAG and an ASCII letter/digit
 *   that immediately follows it;
 * - inserts a space between an ASCII letter/digit and an opening PROCESSED_TAG
 *   that immediately follows it.
 *
 * @param string $html HTML to format
 * @return string formatted HTML; a step whose PCRE call fails is skipped
 */
private function optimizeFormat($html){
    $html = trim($html);

    $steps = [
        // Collapse whitespace runs.
        ['/\s{2,}/', ' '],
        // "</tag>X" -> "</tag> X"
        ['/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', '</'.self::PROCESSED_TAG.'> $1'],
        // "X<tag" -> "X <tag"
        ['/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG.''],
    ];

    foreach ($steps as $step) {
        $result = preg_replace($step[0], $step[1], $html);
        // preg_replace() returns null on a PCRE runtime error; keep the last good text.
        if ($result !== null) {
            $html = $result;
        }
    }

    return $html;
}
|
||||
|
||||
/**
 * Collapse a PROCESSED_TAG that directly wraps another PROCESSED_TAG.
 *
 * "<tag A><tag B>text</tag></tag>" becomes "<tag B>text</tag>": the attributes of
 * the inner tag are kept, the outer wrapper is dropped. The replacement is repeated
 * until nothing changes, so wrappers nested more than two levels deep are fully
 * flattened as well.
 *
 * @param string $html HTML to clean
 * @return string HTML without directly nested PROCESSED_TAG wrappers; on a PCRE
 *                error the text reached so far is returned
 */
private function cleanDuplicateNestedTags($html){
    $pattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is';
    $replacement = '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>';

    // Every replacement removes one wrapper and shortens the text, so the loop ends.
    do {
        $replaced = 0;
        $collapsed = preg_replace($pattern, $replacement, $html, -1, $replaced);
        if ($collapsed === null) {
            // PCRE runtime error: stop and keep the last good text.
            break;
        }
        $html = $collapsed;
    } while ($replaced > 0);

    return $html;
}
|
||||
|
||||
/**
 * Tell whether the given label already sits inside a PROCESSED_TAG element.
 *
 * The label is matched literally (it is regex-quoted), case-insensitively, and may
 * be surrounded by whitespace between the opening and closing tag.
 *
 * @param string $content   text to inspect
 * @param string $tableText label to look for, e.g. "Table 1"
 * @return bool true only when a wrapped occurrence is found; false when there is
 *              none or when the match itself fails
 */
private function isMatchPositionHasMyTableTag($content, $tableText) {
    $tagName = self::PROCESSED_TAG;
    $literal = preg_quote($tableText, '/');

    $regex = '/<' . $tagName . '[^>]*>\s*' . $literal . '\s*<\/' . $tagName . '>/is';
    $found = @preg_match($regex, $content);

    return $found === 1;
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user