latex 代码调整

This commit is contained in:
chengxl
2026-01-21 13:13:19 +08:00
parent 06308bc319
commit 4d0cec198f

View File

@@ -1,5 +1,6 @@
<?php
namespace app\common;
/**
* 功能精准匹配并替换Figure相关格式为myfigure标签
* 支持格式figure 数字、(figure 数字)、figure 数字:/figure 数字.(含嵌套/拆分标签)
@@ -16,6 +17,7 @@ class FigureTagProcessor{
private const PROCESSED_TAG = 'myfigure';
//Figure数字与对应ID的映射数组
private $aImageMain = [];
/**
* 处理Figure标签替换的主方法
* @param string $html 待处理的HTML文本
@@ -29,28 +31,35 @@ class FigureTagProcessor{
if ($html === '' || !is_string($html)) {
return ['status' => 2, 'data' => ''];
}
//超长文本保护
if (strlen($html) > self::MAX_HTML_LENGTH) {
return ['status' => 4, 'data' => $html];
}
//编码处理
if (!mb_check_encoding($html, 'UTF-8')) {
$html = mb_convert_encoding($html, 'UTF-8', 'GBK,GB2312,ASCII,ISO-8859-1');
}
//初始化映射数组(过滤非数字键值)
$this->initImageMap($aImageMain);
//原始内容
$originalHtml = $html;
$hasReplace = false;
try {
//只要包含数字+字母/数字后缀,直接返回原内容(核心修复)
if ($this->hasFigureSuffix($html)) {
return ['status' => 4, 'data' => $html];
}
//合并拆分标签的Figure+数字
$html = $this->preprocessSplitTags($html);
//替换
//替换(核心修复:适配样式标签+后缀标点场景)
$html = $this->replaceFigureInHtml($html, $hasReplace);
//清理冗余样式/标签
if ($hasReplace) {
$html = $this->cleanRedundantStyles($html);
@@ -69,6 +78,7 @@ class FigureTagProcessor{
'data' => $html
];
}
/**
* 全局检测是否包含Figure数字+字母/数字后缀
* 覆盖所有拆分/嵌套/无标签场景,无论是否有空白
@@ -86,8 +96,11 @@ class FigureTagProcessor{
// 正则3嵌套标签场景<b>4B</b> / <i>4123</i>
$pattern3 = "/figure\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*\d+[a-zA-Z0-9]\s*(?:<\/(?:{$styleTagsPattern})>)/iu";
return preg_match($pattern1, $html) || preg_match($pattern2, $html) || preg_match($pattern3, $html);
// 加@抑制正则警告
return @preg_match($pattern1, $html) || @preg_match($pattern2, $html) || @preg_match($pattern3, $html);
}
/**
* 初始化Figure数字映射数组
* @param array $aImageMain 原始映射数组
@@ -106,6 +119,7 @@ class FigureTagProcessor{
}
$this->aImageMain = $imageMap;
}
/**
* 合并所有拆分标签的Figure+数字(含空白样式标签)
* @param string $html 待处理HTML
@@ -116,19 +130,22 @@ class FigureTagProcessor{
// 正则1匹配基础拆分标签的Figure+数字
$pattern = "/(figure)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
$html = preg_replace_callback($pattern, function($matches) {
$html = @preg_replace_callback($pattern, function($matches) {
return $matches[1] . ' ' . $matches[2];
}, $html);
// 正则2匹配多轮拆分标签的Figure+数字(含空白)
$pattern2 = "/(figure)(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\s*)\s*(?:<\/(?:{$styleTagsPattern})>)\s*(?:<(?:{$styleTagsPattern})[^>]*>)\s*(\d+)/iu";
$html = preg_replace_callback($pattern2, function($matches) {
$html = @preg_replace_callback($pattern2, function($matches) {
return $matches[1] . $matches[2] . $matches[3];
}, $html);
return $html;
}
/**
* 核心替换逻辑将纯数字Figure替换为myfigure标签
* 修复:适配样式标签包裹 + 后缀标点场景(如 <b>Figure 2</b>.
* @param string $html 待处理HTML
* @param bool $hasReplace 是否发生替换(引用传递)
* @return string
@@ -138,13 +155,14 @@ class FigureTagProcessor{
$styleTagsRegex = "(?:<(?:{$styleTagsPattern})[^>]*>)*\s*";
$styleTagsCloseRegex = "\s*(?:<\/(?:{$styleTagsPattern})>)*";
// 正则1匹配括号内的纯数字Figure如 (Figure 2)、(<b>Figure 3</b>)
// $pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iu";
$pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
$html = preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
// 正则1匹配括号内的纯数字Figure如 (Figure 2)、(<b>Figure 3</b>)、(<b>Figure 3</b>).
$pattern1 = "/\(\s*{$styleTagsRegex}figure\s*(\d+){$styleTagsCloseRegex}\s*([\.,:]{0,1})\s*\)\s*([\.,:]{0,1})/iuD";
$html = @preg_replace_callback($pattern1, function($matches) use (&$hasReplace) {
$num = $matches[1];
$numInt = (int)$num;
$suffix = $matches[2] ?? '';
$suffix1 = $matches[2] ?? '';
$suffix2 = $matches[3] ?? '';
$suffix = $suffix1 . $suffix2;
// 过滤条件:非数字、无映射、已处理过的标签
if (!ctype_digit($num) || !isset($this->aImageMain[$numInt]) ||
@@ -161,10 +179,9 @@ class FigureTagProcessor{
return $target;
}, $html);
// 正则2匹配无括号的纯数字Figure如 Figure 2、<i>Figure 3</i>:
// $pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}(?![a-zA-Z0-9])/iu";
$pattern2 = "/{$styleTagsRegex}figure\s*(\d+)\s*([\.,:]{0,1}){$styleTagsCloseRegex}\s*(?![a-zA-Z0-9])/iuD";
$html = preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
// 正则2匹配无括号的纯数字Figure核心修复:适配 <b>Figure 2</b>. 场景
$pattern2 = "/{$styleTagsRegex}figure\s*(\d+){$styleTagsCloseRegex}\s*([\.,:]{0,1})(?![a-zA-Z0-9])/iuD";
$html = @preg_replace_callback($pattern2, function($matches) use (&$hasReplace) {
$num = $matches[1];
$numInt = (int)$num;
$suffix = $matches[2] ?? '';
@@ -186,6 +203,7 @@ class FigureTagProcessor{
return $html;
}
/**
* 检测当前匹配内容是否已包含myfigure标签避免重复替换
* @param string $content 匹配的文本片段
@@ -195,8 +213,9 @@ class FigureTagProcessor{
private function isMatchPositionHasMyFigureTag($content, $figureText){
    // Reports whether $content already contains $figureText wrapped in a
    // PROCESSED_TAG element, so the same figure reference is not wrapped twice.
    //
    // $content    - the text fragment to inspect
    // $figureText - the literal figure text to look for inside the tag
    // Returns true only when the wrapped form is found.

    // preg_quote() keeps regex metacharacters in the figure text literal.
    $quotedText = preg_quote($figureText, '/');
    $wrappedPattern = '/<' . self::PROCESSED_TAG . '[^>]*>\s*' . $quotedText . '\s*<\/' . self::PROCESSED_TAG . '>/is';

    // preg_match() gives 1 on a hit, 0 on a miss and false on failure; only a
    // hit counts. "@" follows the class convention of silencing PCRE warnings.
    return @preg_match($wrappedPattern, $content) === 1;
}
/**
* 清理myfigure标签周围的冗余样式标签
* @param string $html 待处理HTML
@@ -205,14 +224,15 @@ class FigureTagProcessor{
private function cleanRedundantStyles($html){
    // Removes style tags that wrap nothing but a PROCESSED_TAG element (plus an
    // optional trailing ".", "," or ":"), then strips closing style tags that
    // pass the lookahead check below.
    //
    // $html - the HTML to clean
    // Returns the cleaned HTML; if a PCRE call fails, the text as it stood
    // before that call is kept instead of being replaced by null.
    foreach (self::STYLE_TAGS as $tag) {
        $wrapperPattern = '/<' . $tag . '>\s*<'.self::PROCESSED_TAG.'([^>]*?)>(.*?)<\/'.self::PROCESSED_TAG.'>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
        // "@" follows the class convention of silencing PCRE warnings; failure
        // is detected through the null result instead.
        $unwrapped = @preg_replace($wrapperPattern, '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>$3', $html);
        if ($unwrapped !== null) {
            $html = $unwrapped;
        }
    }

    // NOTE(review): this removes every closing style tag unless the text up to
    // the next "<" is immediately followed by an attribute-less opening tag of
    // the same name. That is broader than "unmatched closers" - confirm it is
    // intended before relying on it.
    $closerPattern = '/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is';
    $stripped = @preg_replace($closerPattern, '', $html);

    return $stripped ?? $html;
}
/**
* 清理myfigure标签周围的冗余标点
* @param string $html 待处理HTML
@@ -220,17 +240,43 @@ class FigureTagProcessor{
*/
private function cleanRedundantPunctuation($html){
    // Tidies punctuation and parentheses left next to PROCESSED_TAG elements
    // after replacement, then collapses repeated parentheses around them.
    //
    // $html - the HTML to clean
    // Returns the cleaned HTML; a rule whose PCRE call fails is skipped rather
    // than turning the whole document into null.
    $openTag = '<' . self::PROCESSED_TAG;
    $closeTag = '</' . self::PROCESSED_TAG . '>';
    $closeTagRe = '<\/' . self::PROCESSED_TAG . '>';

    // Each entry is [pattern, replacement]; order matters because later rules
    // work on the output of earlier ones.
    $rules = [
        // "<tag ...>(Figure N)</tag>)." -> "<tag ...>(Figure N)</tag>."
        // The figure number is captured and reused so the visible text keeps
        // its own number instead of being overwritten with the data-id value.
        [
            '/' . $openTag . ' data-id="(\d+)">\(Figure (\d+)\)' . $closeTagRe . '\)\./i',
            $openTag . ' data-id="$1">(Figure $2)' . $closeTag . '.',
        ],
        // "</tag>)." followed by optional punctuation -> "</tag>)" + that punctuation
        ['/' . $closeTagRe . '\)\.([\.,:]{0,1})/', $closeTag . ')$1'],
        // "</tag>.)" followed by optional punctuation -> "</tag>)" + that punctuation
        ['/' . $closeTagRe . '\.\)([\.,:]{0,1})/', $closeTag . ')$1'],
        // A run of two or more ".", "," or ":" after the tag collapses to the last one
        ['/' . $closeTagRe . '([\.,:]){2,}/', $closeTag . '$1'],
        // "<tag ...>(Figure N</tag>" with no closing parenthesis gets one inside the tag
        [
            '/' . $openTag . ' data-id="(\d+)">\((Figure \d+)\s*' . $closeTagRe . '([\.,:]{0,1})/i',
            $openTag . ' data-id="$1">($2)' . $closeTag . '$3',
        ],
    ];

    foreach ($rules as [$pattern, $replacement]) {
        // "@" follows the class convention of silencing PCRE warnings; failure
        // is detected through the null result instead.
        $next = @preg_replace($pattern, $replacement, $html);
        if ($next !== null) {
            $html = $next;
        }
    }

    // Keep the current text if the parenthesis clean-up reports failure with null.
    $withoutExtraParens = $this->cleanExtraParentheses($html);

    return $withoutExtraParens ?? $html;
}
/**
 * Collapse repeated parentheses directly around a myfigure element.
 *
 * Only parentheses adjacent to a PROCESSED_TAG element are touched. Each side
 * is handled on its own: a run of "(" before the element becomes a single "(",
 * a run of ")" after it becomes a single ")", and a side with no parenthesis
 * stays without one.
 *
 * @param string $html Text to clean
 * @return string Cleaned text, or the input unchanged if the PCRE call fails
 */
private function cleanExtraParentheses($html){
    // Group 1 / group 3 are repeated captures: they are non-empty whenever at
    // least one parenthesis was present on that side.
    $pattern = '/(\()*(<'.self::PROCESSED_TAG.'[^>]*>.*?<\/'.self::PROCESSED_TAG.'>)(\))*/is';

    // "@" follows the class convention of silencing PCRE warnings; failure is
    // detected through the null result instead.
    $collapsed = @preg_replace_callback($pattern, static function ($matches) {
        $opening = ($matches[1] ?? '') !== '' ? '(' : '';
        $closing = ($matches[3] ?? '') !== '' ? ')' : '';

        return $opening . $matches[2] . $closing;
    }, $html);

    // preg_replace_callback() returns null on error; never hand that back in
    // place of the document.
    return $collapsed ?? $html;
}
@@ -242,11 +288,11 @@ class FigureTagProcessor{
private function cleanUnclosedTags($html){
foreach (self::STYLE_TAGS as $tag) {
// 清理myfigure标签后的冗余闭合标签
$html = preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html);
$html = @preg_replace('/(<\/'.self::PROCESSED_TAG.'>)\s*<\/' . $tag . '>/i', '$1', $html);
// 定位所有该标签的开闭标签位置
preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
@preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
@preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
$allTags = [];
// 收集开标签
@@ -308,6 +354,7 @@ class FigureTagProcessor{
return $html;
}
/**
* 优化文本格式(清理多余空格)
* @param string $html 待处理HTML
@@ -315,14 +362,15 @@ class FigureTagProcessor{
*/
private function optimizeFormat($html){
    // Normalises spacing: trims the text, collapses whitespace runs to a single
    // space, and makes sure a PROCESSED_TAG element is separated by a space
    // from an ASCII letter or digit that touches it on either side.
    //
    // $html - the HTML to format
    // Returns the formatted HTML; a step whose PCRE call fails is skipped
    // rather than turning the text into null.
    $html = trim($html);

    // Each entry is [pattern, replacement].
    $steps = [
        // Two or more whitespace characters -> one space
        ['/\s{2,}/', ' '],
        // Closing tag immediately followed by a letter/digit -> insert a space
        ['/<\/'.self::PROCESSED_TAG.'>([A-Za-z0-9])/is', '</'.self::PROCESSED_TAG.'> $1'],
        // Letter/digit immediately followed by the opening tag -> insert a space
        ['/([a-zA-Z0-9])<'.self::PROCESSED_TAG.'/is', '$1 <'.self::PROCESSED_TAG],
    ];

    foreach ($steps as [$pattern, $replacement]) {
        // "@" follows the class convention of silencing PCRE warnings; failure
        // is detected through the null result instead.
        $next = @preg_replace($pattern, $replacement, $html);
        if ($next !== null) {
            $html = $next;
        }
    }

    return $html;
}
/**
* 清理嵌套的myfigure标签避免重复嵌套
* @param string $html 待处理HTML
@@ -330,7 +378,7 @@ class FigureTagProcessor{
*/
private function cleanDuplicateNestedTags($html){
    // Flattens a PROCESSED_TAG element nested directly inside another one,
    // keeping the inner element (its attributes and content) and dropping the
    // outer wrapper.
    //
    // $html - the HTML to clean
    // Returns the flattened HTML; if a PCRE call fails, the text as it stood
    // before that call is returned.
    $nestedPattern = '/<'.self::PROCESSED_TAG.'[^>]*>\s*<'.self::PROCESSED_TAG.'([^>]*)>(.*?)<\/'.self::PROCESSED_TAG.'>\s*<\/'.self::PROCESSED_TAG.'>/is';
    $innerOnly = '<'.self::PROCESSED_TAG.'$1>$2</'.self::PROCESSED_TAG.'>';

    // One pass removes a single wrapper level, so repeat until nothing matches.
    // Every replacement is strictly shorter than what it replaces, so the loop
    // always terminates.
    do {
        $replaced = 0;
        // "@" follows the class convention of silencing PCRE warnings; failure
        // is detected through the null result instead.
        $flattened = @preg_replace($nestedPattern, $innerOnly, $html, -1, $replaced);
        if ($flattened === null) {
            break;
        }
        $html = $flattened;
    } while ($replaced > 0);

    return $html;
}