处理正文内容表格/图片相关联

This commit is contained in:
chengxl
2026-01-18 17:13:58 +08:00
parent 4704b61448
commit b217ab03fd
2 changed files with 18 additions and 100 deletions

View File

@@ -13,11 +13,11 @@ class FigureTagProcessor {
* status: 2-空输入, 4-无匹配, 5-处理异常, 1-处理成功 * status: 2-空输入, 4-无匹配, 5-处理异常, 1-处理成功
*/ */
public function dealFigureStr($html = '') { public function dealFigureStr($html = '') {
// 1. 基础输入校 //验
if (!is_string($html) || trim($html) === '') { if (!is_string($html) || trim($html) === '') {
return ['status' => 2, 'data' => '']; return ['status' => 2, 'data' => ''];
} }
// 2. 超大字符串拦截 //超大字符串拦截
if (strlen($html) > self::MAX_HTML_LENGTH) { if (strlen($html) > self::MAX_HTML_LENGTH) {
return ['status' => 4, 'data' => $html]; return ['status' => 4, 'data' => $html];
} }
@@ -26,22 +26,22 @@ class FigureTagProcessor {
$hasReplace = false; $hasReplace = false;
try { try {
// 3. 合并嵌套样式标签 //合并嵌套样式标签
$mergedHtml = $this->mergeFragmentStyleTags($html); $mergedHtml = $this->mergeFragmentStyleTags($html);
// 4. 提取纯文本用于匹配Figure //提取纯文本用于匹配Figure
$plainText = preg_replace('/<[^>]+>/', ' ', $mergedHtml); $plainText = preg_replace('/<[^>]+>/', ' ', $mergedHtml);
$plainText = preg_replace('/\s+/', ' ', trim($plainText)); $plainText = preg_replace('/\s+/', ' ', trim($plainText));
// 5. 提取所有匹配的Figure数字 //提取所有匹配的Figure数字
$allMatches = $this->extractAllFigureMatches($plainText); $allMatches = $this->extractAllFigureMatches($plainText);
if (empty($allMatches)) { if (empty($allMatches)) {
return ['status' => 4, 'data' => $originalHtml]; return ['status' => 4, 'data' => $originalHtml];
} }
// 6. 替换为myfigure标签 //替换为myfigure标签
$html = $this->replaceFigureWithTag($html, $allMatches, $hasReplace); $html = $this->replaceFigureWithTag($html, $allMatches, $hasReplace);
// 7. 清理冗余内容(仅替换成功后执行) //清理冗余内容(仅替换成功后执行)
if ($hasReplace) { if ($hasReplace) {
$html = $this->cleanRedundantStyles($html); $html = $this->cleanRedundantStyles($html);
$html = $this->cleanRedundantPunctuation($html); $html = $this->cleanRedundantPunctuation($html);
@@ -50,17 +50,6 @@ class FigureTagProcessor {
} }
} catch (\Throwable $e) { } catch (\Throwable $e) {
// 8. 异常处理(记录详细日志)
$errorMsg = sprintf(
'[%s] FigureTagProcessor-dealFigureStr 异常:%s | 文件:%s | 行:%d | 入参MD5%s | 正则错误:%s',
date('Y-m-d H:i:s'),
$e->getMessage(),
$e->getFile(),
$e->getLine(),
md5($originalHtml),
preg_last_error() ? preg_last_error_msg() : '无'
);
error_log($errorMsg);
return ['status' => 5, 'data' => $originalHtml]; return ['status' => 5, 'data' => $originalHtml];
} }
@@ -71,7 +60,7 @@ class FigureTagProcessor {
} }
/** /**
* 合并嵌套的样式标签(如<i>aaa</i><i>bbb</i> → aaa bbb * 合并嵌套的样式标签
* @param string $html * @param string $html
* @return string * @return string
*/ */
@@ -148,8 +137,7 @@ class FigureTagProcessor {
? "({$info['content']})" ? "({$info['content']})"
: $info['content']; : $info['content'];
// 核心修改规范myfigure标签格式去掉属性值空格、加双引号 //<myfigure data-id="1">Figure 1</myfigure>
// 最终生成:<myfigure data-id="1">Figure 1</myfigure>
$targetTag = "<myfigure data-id=\"{$num}\">{$innerContent}</myfigure>"; $targetTag = "<myfigure data-id=\"{$num}\">{$innerContent}</myfigure>";
if (!empty($info['validPunct']) && !$info['hasOuterBracket']) { if (!empty($info['validPunct']) && !$info['hasOuterBracket']) {
$targetTag .= $info['validPunct']; $targetTag .= $info['validPunct'];
@@ -160,7 +148,7 @@ class FigureTagProcessor {
? '/\(\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*\)/iu' ? '/\(\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*\)/iu'
: '/\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*([\.,:]{0,1})/iu'; : '/\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*([\.,:]{0,1})/iu';
// 执行替换最多替换1次避免重复 //执行替换最多替换1次避免重复
$html = @preg_replace($pattern, $targetTag, $html, 1, $count); $html = @preg_replace($pattern, $targetTag, $html, 1, $count);
if ($count > 0) { if ($count > 0) {
$hasReplace = true; $hasReplace = true;
@@ -177,11 +165,10 @@ class FigureTagProcessor {
*/ */
private function cleanRedundantStyles($html) { private function cleanRedundantStyles($html) {
foreach (self::STYLE_TAGS as $tag) { foreach (self::STYLE_TAGS as $tag) {
// 修改正则:适配 data-id="数字" 的格式
$pattern = '/<' . $tag . '>\s*<myfigure([^>]*)>(.*?)<\/myfigure>([\.,:]{0,1})\s*<\/' . $tag . '>/is'; $pattern = '/<' . $tag . '>\s*<myfigure([^>]*)>(.*?)<\/myfigure>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
$html = @preg_replace($pattern, '<myfigure$1>$2</myfigure>$3', $html); $html = @preg_replace($pattern, '<myfigure$1>$2</myfigure>$3', $html);
} }
// 清理孤立的样式闭标签 //清理闭标签
$html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html); $html = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html);
return $html; return $html;
} }
@@ -192,29 +179,24 @@ class FigureTagProcessor {
* @return string * @return string
*/ */
private function cleanRedundantPunctuation($html) { private function cleanRedundantPunctuation($html) {
// 修改正则:将 data-id = (\d+) 改为 data-id="(\d+)",适配新格式
$html = preg_replace('/<myfigure data-id="(\d+)">\(Figure \d+\)<\/myfigure>\)\./i', '<myfigure data-id="$1">(Figure $1)</myfigure>.', $html); $html = preg_replace('/<myfigure data-id="(\d+)">\(Figure \d+\)<\/myfigure>\)\./i', '<myfigure data-id="$1">(Figure $1)</myfigure>.', $html);
$html = preg_replace('/<\/myfigure>\)\.([\.,:]{0,1})/', '</myfigure>)$1', $html); $html = preg_replace('/<\/myfigure>\)\.([\.,:]{0,1})/', '</myfigure>)$1', $html);
$html = preg_replace('/<\/myfigure>\.\)([\.,:]{0,1})/', '</myfigure>)$1', $html); $html = preg_replace('/<\/myfigure>\.\)([\.,:]{0,1})/', '</myfigure>)$1', $html);
$html = preg_replace('/<\/myfigure>([\.,:]){2,}/', '</myfigure>$1', $html); $html = preg_replace('/<\/myfigure>([\.,:]){2,}/', '</myfigure>$1', $html);
// 同步修改此处正则的属性格式
$html = preg_replace('/<myfigure data-id="(\d+)">\((Figure \d+)\s*<\/myfigure>([\.,:]{0,1})/i', $html = preg_replace('/<myfigure data-id="(\d+)">\((Figure \d+)\s*<\/myfigure>([\.,:]{0,1})/i',
'<myfigure data-id="$1">($2)</myfigure>$3', $html); '<myfigure data-id="$1">($2)</myfigure>$3', $html);
return $html; return $html;
} }
/** /**
* 清理孤立的样式标签优先暴力清理myfigure后标签再用栈算法兜底 * 清理孤立的样式标签
* @param string $html * @param string $html
* @return string * @return string
*/ */
private function cleanUnclosedTags($html) { private function cleanUnclosedTags($html) {
// 第一步暴力清理myfigure后孤立的样式闭标签
foreach (self::STYLE_TAGS as $tag) { foreach (self::STYLE_TAGS as $tag) {
$html = @preg_replace('/(<\/myfigure>)\s*<\/' . $tag . '>/i', '$1', $html); $html = @preg_replace('/(<\/myfigure>)\s*<\/' . $tag . '>/i', '$1', $html);
} }
// 第二步:栈算法清理其他孤立标签
foreach (self::STYLE_TAGS as $tag) { foreach (self::STYLE_TAGS as $tag) {
@preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE); @preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
@preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE); @preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);

View File

@@ -25,17 +25,16 @@ class TableTagProcessor {
* status: 2-空输入, 4-无匹配/已处理, 5-处理异常, 1-处理成功 * status: 2-空输入, 4-无匹配/已处理, 5-处理异常, 1-处理成功
*/ */
public function dealTableStr($html = '', $aTableMain = []) { public function dealTableStr($html = '', $aTableMain = []) {
// 1. 基础输入校 //验
if (!is_string($html) || trim($html) === '') { if (!is_string($html) || trim($html) === '') {
return ['status' => 2, 'data' => '']; return ['status' => 2, 'data' => ''];
} }
// 2. 超大字符串拦截(防止内存溢出) //超大字符串拦截(防止内存溢出)
if (strlen($html) > self::MAX_HTML_LENGTH) { if (strlen($html) > self::MAX_HTML_LENGTH) {
$this->logWarning('处理文本超出最大长度限制', ['length' => strlen($html)]);
return ['status' => 4, 'data' => $html]; return ['status' => 4, 'data' => $html];
} }
// 初始化主键映射数组(过滤非数字键/值,保证数据合法性) //初始化主键映射数组
if(!empty($aTableMain)){ if(!empty($aTableMain)){
$aTableMainNew = []; $aTableMainNew = [];
foreach ($aTableMain as $key => $value) { foreach ($aTableMain as $key => $value) {
@@ -52,10 +51,10 @@ class TableTagProcessor {
$hasReplace = false; $hasReplace = false;
try { try {
// 核心:直接在原始HTML中匹配所有符合规则的Table(含嵌套标签) //原始HTML中匹配所有符合规则的Table
$html = $this->replaceTableInHtml($html, $hasReplace); $html = $this->replaceTableInHtml($html, $hasReplace);
// 清理冗余内容(仅替换成功后执行,保证输出整洁) // 清理冗余内容
if ($hasReplace) { if ($hasReplace) {
$html = $this->cleanRedundantStyles($html); $html = $this->cleanRedundantStyles($html);
$html = $this->cleanRedundantPunctuation($html); $html = $this->cleanRedundantPunctuation($html);
@@ -65,19 +64,6 @@ class TableTagProcessor {
} }
} catch (\Throwable $e) { } catch (\Throwable $e) {
// 异常兜底:捕获所有异常,记录详细日志,返回原始文本避免业务中断
$pregError = preg_last_error();
$pregErrorMsg = $this->getPregErrorMsg($pregError);
$errorMsg = sprintf(
'[%s] TableTagProcessor-dealTableStr 异常:%s | 文件:%s | 行:%d | 入参MD5%s | 正则错误:%s',
date('Y-m-d H:i:s'),
$e->getMessage(),
$e->getFile(),
$e->getLine(),
md5($originalHtml),
$pregErrorMsg
);
$this->logError($errorMsg);
return ['status' => 5, 'data' => $originalHtml]; return ['status' => 5, 'data' => $originalHtml];
} }
@@ -88,9 +74,7 @@ class TableTagProcessor {
} }
/** /**
* 核心方法直接在HTML中匹配并替换Table(支持嵌套标签) * 核心方法直接在HTML中匹配并替换Table
* @param string $html
* @param bool $hasReplace 引用传递:标记是否有替换
* @return string * @return string
*/ */
private function replaceTableInHtml($html, &$hasReplace) { private function replaceTableInHtml($html, &$hasReplace) {
@@ -112,12 +96,10 @@ class TableTagProcessor {
} }
$primaryId = $this->aTableMain[$numInt]; $primaryId = $this->aTableMain[$numInt];
// 核心修改规范mytable标签格式属性值加双引号、去掉两侧空格
$baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">"; $baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">";
$target = "({$baseTag}{$suffix})"; $target = "({$baseTag}{$suffix})";
$hasReplace = true; $hasReplace = true;
$this->logInfo("替换带括号Table成功", ['num' => $num, 'primary_id' => $primaryId]);
return $target; return $target;
}, $html); }, $html);
@@ -135,12 +117,10 @@ class TableTagProcessor {
} }
$primaryId = $this->aTableMain[$numInt]; $primaryId = $this->aTableMain[$numInt];
// 核心修改规范mytable标签格式属性值加双引号、去掉两侧空格
$baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">"; $baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">";
$target = "{$baseTag}{$suffix}"; $target = "{$baseTag}{$suffix}";
$hasReplace = true; $hasReplace = true;
$this->logInfo("替换无括号Table成功", ['num' => $num, 'primary_id' => $primaryId]);
return $target; return $target;
}, $html); }, $html);
@@ -168,7 +148,6 @@ class TableTagProcessor {
* @return string * @return string
*/ */
private function cleanRedundantPunctuation($html) { private function cleanRedundantPunctuation($html) {
// 核心修改适配新的mytable标签格式data-id="数字"
$html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1)</'.self::PROCESSED_TAG.'>.', $html); $html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1)</'.self::PROCESSED_TAG.'>.', $html);
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html); $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html); $html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
@@ -270,47 +249,4 @@ class TableTagProcessor {
return @preg_match($pattern, $content) === 1; return @preg_match($pattern, $content) === 1;
} }
/**
 * Translate a PCRE error code into a short message suitable for log output.
 *
 * @param int $pregError Error code, as returned by preg_last_error().
 * @return string The message for a known code; '无' ("none") for PREG_NO_ERROR;
 *                "未知错误(<code>)" for any code that is not listed.
 */
private function getPregErrorMsg($pregError) {
    $errorCodes = [
        // "No error" is listed explicitly so that a call made when no regex failure
        // occurred is not reported as an unknown error with code 0.
        PREG_NO_ERROR => '无',
        PREG_INTERNAL_ERROR => '内部错误',
        PREG_BACKTRACK_LIMIT_ERROR => '回溯限制超出',
        PREG_RECURSION_LIMIT_ERROR => '递归限制超出',
        PREG_BAD_UTF8_ERROR => '无效UTF-8字符',
        PREG_BAD_UTF8_OFFSET_ERROR => 'UTF-8偏移量无效',
        PREG_JIT_STACKLIMIT_ERROR => 'JIT栈限制超出'
    ];
    return isset($errorCodes[$pregError]) ? $errorCodes[$pregError] : "未知错误({$pregError})";
}
/**
 * Record an error-level log entry (can be pointed at a real logging system in production).
 *
 * @param string $msg     Log message.
 * @param array  $context Extra data stored under the "context" key of the entry.
 */
private function logError($msg, $context = []) {
    $this->writeLogEntry('error', $msg, $context);
}

/**
 * Record a warning-level log entry.
 *
 * @param string $msg     Log message.
 * @param array  $context Extra data stored under the "context" key of the entry.
 */
private function logWarning($msg, $context = []) {
    $this->writeLogEntry('warning', $msg, $context);
}

/**
 * Record an info-level log entry.
 *
 * @param string $msg     Log message.
 * @param array  $context Extra data stored under the "context" key of the entry.
 */
private function logInfo($msg, $context = []) {
    $this->writeLogEntry('info', $msg, $context);
}

/**
 * Build one JSON log line with the keys level, msg, context and time, and pass it to error_log().
 *
 * @param string $level   Value stored under the "level" key ('error', 'warning' or 'info').
 * @param string $msg     Log message.
 * @param array  $context Extra data stored under the "context" key.
 */
private function writeLogEntry($level, $msg, $context) {
    // JSON_UNESCAPED_UNICODE keeps non-ASCII messages readable instead of \uXXXX escapes;
    // JSON_PARTIAL_OUTPUT_ON_ERROR substitutes unencodable values rather than failing the whole entry.
    $line = json_encode(
        ['level' => $level, 'msg' => $msg, 'context' => $context, 'time' => date('Y-m-d H:i:s')],
        JSON_UNESCAPED_UNICODE | JSON_PARTIAL_OUTPUT_ON_ERROR
    );
    if ($line === false) {
        // json_encode() still failed: log a plain-text line so the event is not lost as an empty entry.
        $line = sprintf(
            '[%s] %s: log entry could not be JSON-encoded (%s)',
            date('Y-m-d H:i:s'),
            $level,
            json_last_error_msg()
        );
    }
    error_log($line);
}
} }