处理正文内容表格/图片相关联
This commit is contained in:
@@ -13,11 +13,11 @@ class FigureTagProcessor {
|
||||
* status: 2-空输入, 4-无匹配, 5-处理异常, 1-处理成功
|
||||
*/
|
||||
public function dealFigureStr($html = '') {
|
||||
// 1. 基础输入校验
|
||||
//验证
|
||||
if (!is_string($html) || trim($html) === '') {
|
||||
return ['status' => 2, 'data' => ''];
|
||||
}
|
||||
// 2. 超大字符串拦截
|
||||
//超大字符串拦截
|
||||
if (strlen($html) > self::MAX_HTML_LENGTH) {
|
||||
return ['status' => 4, 'data' => $html];
|
||||
}
|
||||
@@ -26,22 +26,22 @@ class FigureTagProcessor {
|
||||
$hasReplace = false;
|
||||
|
||||
try {
|
||||
// 3. 合并嵌套样式标签
|
||||
//合并嵌套样式标签
|
||||
$mergedHtml = $this->mergeFragmentStyleTags($html);
|
||||
// 4. 提取纯文本(用于匹配Figure)
|
||||
//提取纯文本(用于匹配Figure)
|
||||
$plainText = preg_replace('/<[^>]+>/', ' ', $mergedHtml);
|
||||
$plainText = preg_replace('/\s+/', ' ', trim($plainText));
|
||||
|
||||
// 5. 提取所有匹配的Figure数字
|
||||
//提取所有匹配的Figure数字
|
||||
$allMatches = $this->extractAllFigureMatches($plainText);
|
||||
if (empty($allMatches)) {
|
||||
return ['status' => 4, 'data' => $originalHtml];
|
||||
}
|
||||
|
||||
// 6. 替换为myfigure标签
|
||||
//替换为myfigure标签
|
||||
$html = $this->replaceFigureWithTag($html, $allMatches, $hasReplace);
|
||||
|
||||
// 7. 清理冗余内容(仅替换成功后执行)
|
||||
//清理冗余内容(仅替换成功后执行)
|
||||
if ($hasReplace) {
|
||||
$html = $this->cleanRedundantStyles($html);
|
||||
$html = $this->cleanRedundantPunctuation($html);
|
||||
@@ -50,17 +50,6 @@ class FigureTagProcessor {
|
||||
}
|
||||
|
||||
} catch (\Throwable $e) {
|
||||
// 8. 异常处理(记录详细日志)
|
||||
$errorMsg = sprintf(
|
||||
'[%s] FigureTagProcessor-dealFigureStr 异常:%s | 文件:%s | 行:%d | 入参MD5:%s | 正则错误:%s',
|
||||
date('Y-m-d H:i:s'),
|
||||
$e->getMessage(),
|
||||
$e->getFile(),
|
||||
$e->getLine(),
|
||||
md5($originalHtml),
|
||||
preg_last_error() ? preg_last_error_msg() : '无'
|
||||
);
|
||||
error_log($errorMsg);
|
||||
return ['status' => 5, 'data' => $originalHtml];
|
||||
}
|
||||
|
||||
@@ -71,7 +60,7 @@ class FigureTagProcessor {
|
||||
}
|
||||
|
||||
/**
|
||||
* 合并嵌套的样式标签(如<i>aaa</i><i>bbb</i> → aaa bbb)
|
||||
* 合并嵌套的样式标签
|
||||
* @param string $html
|
||||
* @return string
|
||||
*/
|
||||
@@ -148,8 +137,7 @@ class FigureTagProcessor {
|
||||
? "({$info['content']})"
|
||||
: $info['content'];
|
||||
|
||||
// 核心修改:规范myfigure标签格式(去掉属性值空格、加双引号)
|
||||
// 最终生成:<myfigure data-id="1">Figure 1</myfigure>
|
||||
//<myfigure data-id="1">Figure 1</myfigure>
|
||||
$targetTag = "<myfigure data-id=\"{$num}\">{$innerContent}</myfigure>";
|
||||
if (!empty($info['validPunct']) && !$info['hasOuterBracket']) {
|
||||
$targetTag .= $info['validPunct'];
|
||||
@@ -160,7 +148,7 @@ class FigureTagProcessor {
|
||||
? '/\(\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*\)/iu'
|
||||
: '/\s*(?:<[^>]+>|\s)*Figure(?:<[^>]+>|\s)*' . $num . '\b' . $patternSuffix . '(?:\s*[\.,;:]*\s*|\s*<[^>]+>)*\s*([\.,:]{0,1})/iu';
|
||||
|
||||
// 执行替换(最多替换1次,避免重复)
|
||||
//执行替换(最多替换1次,避免重复)
|
||||
$html = @preg_replace($pattern, $targetTag, $html, 1, $count);
|
||||
if ($count > 0) {
|
||||
$hasReplace = true;
|
||||
@@ -177,11 +165,10 @@ class FigureTagProcessor {
|
||||
*/
|
||||
private function cleanRedundantStyles($html) {
    // Purpose: strip style wrappers that directly enclose a <myfigure> element,
    // then remove style closing tags that are not followed by a matching opener.
    // A failed regex step (preg_replace returning null) is skipped so the markup
    // produced so far is never replaced by null.
    foreach (self::STYLE_TAGS as $tag) {
        // <tag> <myfigure ...>...</myfigure>[.,:] </tag>  ->  <myfigure ...>...</myfigure>[.,:]
        $pattern = '/<' . $tag . '>\s*<myfigure([^>]*)>(.*?)<\/myfigure>([\.,:]{0,1})\s*<\/' . $tag . '>/is';
        $unwrapped = @preg_replace($pattern, '<myfigure$1>$2</myfigure>$3', $html);
        if ($unwrapped !== null) {
            $html = $unwrapped;
        }
    }

    // Remove a style closing tag unless the text up to the next "<" is
    // immediately followed by an opening tag of the same name.
    $stripped = preg_replace('/<\/('.implode('|', self::STYLE_TAGS).')>(?![^<]*<\1>)/is', '', $html);

    return $stripped !== null ? $stripped : $html;
}
|
||||
@@ -192,29 +179,24 @@ class FigureTagProcessor {
|
||||
* @return string
|
||||
*/
|
||||
private function cleanRedundantPunctuation($html) {
    // Purpose: normalise brackets and punctuation left around generated
    // <myfigure data-id="N"> elements. Rules run in the listed order; each one
    // sees the output of the previous one.
    $rules = [
        // "(Figure N)</myfigure>)."  ->  "(Figure N)</myfigure>."
        '/<myfigure data-id="(\d+)">\(Figure \d+\)<\/myfigure>\)\./i'
            => '<myfigure data-id="$1">(Figure $1)</myfigure>.',
        // "</myfigure>)." plus optional mark  ->  "</myfigure>)" plus that mark
        '/<\/myfigure>\)\.([\.,:]{0,1})/'
            => '</myfigure>)$1',
        // "</myfigure>.)" plus optional mark  ->  "</myfigure>)" plus that mark
        '/<\/myfigure>\.\)([\.,:]{0,1})/'
            => '</myfigure>)$1',
        // A run of two or more marks collapses to the last mark of the run.
        '/<\/myfigure>([\.,:]){2,}/'
            => '</myfigure>$1',
        // "(Figure N</myfigure>" with a missing ")" gets its bracket closed.
        '/<myfigure data-id="(\d+)">\((Figure \d+)\s*<\/myfigure>([\.,:]{0,1})/i'
            => '<myfigure data-id="$1">($2)</myfigure>$3',
    ];

    foreach ($rules as $pattern => $replacement) {
        $result = preg_replace($pattern, $replacement, $html);
        // preg_replace yields null on a PCRE failure; keep the last good
        // markup instead of letting null flow into the next rule.
        if ($result !== null) {
            $html = $result;
        }
    }

    return $html;
}
|
||||
|
||||
/**
|
||||
* 清理孤立的样式标签(优先暴力清理myfigure后标签,再用栈算法兜底)
|
||||
* 清理孤立的样式标签
|
||||
* @param string $html
|
||||
* @return string
|
||||
*/
|
||||
private function cleanUnclosedTags($html) {
|
||||
// 第一步:暴力清理myfigure后孤立的样式闭标签
|
||||
foreach (self::STYLE_TAGS as $tag) {
|
||||
$html = @preg_replace('/(<\/myfigure>)\s*<\/' . $tag . '>/i', '$1', $html);
|
||||
}
|
||||
|
||||
// 第二步:栈算法清理其他孤立标签
|
||||
foreach (self::STYLE_TAGS as $tag) {
|
||||
@preg_match_all("/<{$tag}\b[^>]*>/i", $html, $openMatches, PREG_OFFSET_CAPTURE);
|
||||
@preg_match_all("/<\/{$tag}>/i", $html, $closeMatches, PREG_OFFSET_CAPTURE);
|
||||
|
||||
@@ -25,17 +25,16 @@ class TableTagProcessor {
|
||||
* status: 2-空输入, 4-无匹配/已处理, 5-处理异常, 1-处理成功
|
||||
*/
|
||||
public function dealTableStr($html = '', $aTableMain = []) {
|
||||
// 1. 基础输入校验
|
||||
//验证
|
||||
if (!is_string($html) || trim($html) === '') {
|
||||
return ['status' => 2, 'data' => ''];
|
||||
}
|
||||
// 2. 超大字符串拦截(防止内存溢出)
|
||||
//超大字符串拦截(防止内存溢出)
|
||||
if (strlen($html) > self::MAX_HTML_LENGTH) {
|
||||
$this->logWarning('处理文本超出最大长度限制', ['length' => strlen($html)]);
|
||||
return ['status' => 4, 'data' => $html];
|
||||
}
|
||||
|
||||
// 初始化主键映射数组(过滤非数字键/值,保证数据合法性)
|
||||
//初始化主键映射数组
|
||||
if(!empty($aTableMain)){
|
||||
$aTableMainNew = [];
|
||||
foreach ($aTableMain as $key => $value) {
|
||||
@@ -52,10 +51,10 @@ class TableTagProcessor {
|
||||
$hasReplace = false;
|
||||
|
||||
try {
|
||||
// 核心:直接在原始HTML中匹配所有符合规则的Table(含嵌套标签)
|
||||
//原始HTML中匹配所有符合规则的Table
|
||||
$html = $this->replaceTableInHtml($html, $hasReplace);
|
||||
|
||||
// 清理冗余内容(仅替换成功后执行,保证输出整洁)
|
||||
// 清理冗余内容
|
||||
if ($hasReplace) {
|
||||
$html = $this->cleanRedundantStyles($html);
|
||||
$html = $this->cleanRedundantPunctuation($html);
|
||||
@@ -65,19 +64,6 @@ class TableTagProcessor {
|
||||
}
|
||||
|
||||
} catch (\Throwable $e) {
|
||||
// 异常兜底:捕获所有异常,记录详细日志,返回原始文本避免业务中断
|
||||
$pregError = preg_last_error();
|
||||
$pregErrorMsg = $this->getPregErrorMsg($pregError);
|
||||
$errorMsg = sprintf(
|
||||
'[%s] TableTagProcessor-dealTableStr 异常:%s | 文件:%s | 行:%d | 入参MD5:%s | 正则错误:%s',
|
||||
date('Y-m-d H:i:s'),
|
||||
$e->getMessage(),
|
||||
$e->getFile(),
|
||||
$e->getLine(),
|
||||
md5($originalHtml),
|
||||
$pregErrorMsg
|
||||
);
|
||||
$this->logError($errorMsg);
|
||||
return ['status' => 5, 'data' => $originalHtml];
|
||||
}
|
||||
|
||||
@@ -88,9 +74,7 @@ class TableTagProcessor {
|
||||
}
|
||||
|
||||
/**
|
||||
* 核心方法:直接在HTML中匹配并替换Table(支持嵌套标签)
|
||||
* @param string $html
|
||||
* @param bool $hasReplace 引用传递:标记是否有替换
|
||||
* 核心方法:直接在HTML中匹配并替换Table
|
||||
* @return string
|
||||
*/
|
||||
private function replaceTableInHtml($html, &$hasReplace) {
|
||||
@@ -112,12 +96,10 @@ class TableTagProcessor {
|
||||
}
|
||||
|
||||
$primaryId = $this->aTableMain[$numInt];
|
||||
// 核心修改:规范mytable标签格式(属性值加双引号、去掉两侧空格)
|
||||
$baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">";
|
||||
$target = "({$baseTag}{$suffix})";
|
||||
|
||||
$hasReplace = true;
|
||||
$this->logInfo("替换带括号Table成功", ['num' => $num, 'primary_id' => $primaryId]);
|
||||
return $target;
|
||||
}, $html);
|
||||
|
||||
@@ -135,12 +117,10 @@ class TableTagProcessor {
|
||||
}
|
||||
|
||||
$primaryId = $this->aTableMain[$numInt];
|
||||
// 核心修改:规范mytable标签格式(属性值加双引号、去掉两侧空格)
|
||||
$baseTag = "<".self::PROCESSED_TAG." data-id=\"{$primaryId}\">Table {$num}</".self::PROCESSED_TAG.">";
|
||||
$target = "{$baseTag}{$suffix}";
|
||||
|
||||
$hasReplace = true;
|
||||
$this->logInfo("替换无括号Table成功", ['num' => $num, 'primary_id' => $primaryId]);
|
||||
return $target;
|
||||
}, $html);
|
||||
|
||||
@@ -168,7 +148,6 @@ class TableTagProcessor {
|
||||
* @return string
|
||||
*/
|
||||
private function cleanRedundantPunctuation($html) {
|
||||
// 核心修改:适配新的mytable标签格式(data-id="数字")
|
||||
$html = preg_replace('/<'.self::PROCESSED_TAG.' data-id="(\d+)">\(Table \d+\)<\/'.self::PROCESSED_TAG.'>\)\./i', '<'.self::PROCESSED_TAG.' data-id="$1">(Table $1)</'.self::PROCESSED_TAG.'>.', $html);
|
||||
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\)\.([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
|
||||
$html = preg_replace('/<\/'.self::PROCESSED_TAG.'>\.\)([\.,:]{0,1})/', '</'.self::PROCESSED_TAG.'>)$1', $html);
|
||||
@@ -270,47 +249,4 @@ class TableTagProcessor {
|
||||
return @preg_match($pattern, $content) === 1;
|
||||
}
|
||||
|
||||
/**
 * Translate a PCRE error code into a human-readable message for logging.
 *
 * PREG_NO_ERROR maps to '无' so that an exception unrelated to regular
 * expressions is not reported as an unknown regex error.
 *
 * @param int $pregError Error code as returned by preg_last_error()
 * @return string Message for the code, or "未知错误(<code>)" when it is not listed
 */
private function getPregErrorMsg($pregError) {
    $errorCodes = [
        PREG_NO_ERROR => '无',
        PREG_INTERNAL_ERROR => '内部错误',
        PREG_BACKTRACK_LIMIT_ERROR => '回溯限制超出',
        PREG_RECURSION_LIMIT_ERROR => '递归限制超出',
        PREG_BAD_UTF8_ERROR => '无效UTF-8字符',
        PREG_BAD_UTF8_OFFSET_ERROR => 'UTF-8偏移量无效',
        PREG_JIT_STACKLIMIT_ERROR => 'JIT栈限制超出'
    ];

    return isset($errorCodes[$pregError]) ? $errorCodes[$pregError] : "未知错误({$pregError})";
}
|
||||
|
||||
/**
 * Write an error-level entry to the PHP error log as one JSON line.
 *
 * Non-ASCII text is written unescaped so the messages stay readable, and a
 * value that cannot be encoded no longer results in an empty log line.
 *
 * @param string $msg     Message text
 * @param array  $context Extra data stored under the "context" key
 */
private function logError($msg, $context = []) {
    $record = ['level' => 'error', 'msg' => $msg, 'context' => $context, 'time' => date('Y-m-d H:i:s')];
    // JSON_PARTIAL_OUTPUT_ON_ERROR substitutes unencodable values (e.g. malformed
    // UTF-8) instead of making json_encode() return false.
    $line = json_encode($record, JSON_UNESCAPED_UNICODE | JSON_PARTIAL_OUTPUT_ON_ERROR);
    if ($line === false) {
        // Last resort: still emit an entry so the failure is visible in the log.
        $line = '{"level":"error","msg":"log entry could not be JSON-encoded: ' . json_last_error_msg() . '"}';
    }
    error_log($line);
}
|
||||
|
||||
/**
 * Write a warning-level entry to the PHP error log as one JSON line.
 *
 * Non-ASCII text is written unescaped so the messages stay readable, and a
 * value that cannot be encoded no longer results in an empty log line.
 *
 * @param string $msg     Message text
 * @param array  $context Extra data stored under the "context" key
 */
private function logWarning($msg, $context = []) {
    $record = ['level' => 'warning', 'msg' => $msg, 'context' => $context, 'time' => date('Y-m-d H:i:s')];
    // JSON_PARTIAL_OUTPUT_ON_ERROR substitutes unencodable values (e.g. malformed
    // UTF-8) instead of making json_encode() return false.
    $line = json_encode($record, JSON_UNESCAPED_UNICODE | JSON_PARTIAL_OUTPUT_ON_ERROR);
    if ($line === false) {
        // Last resort: still emit an entry so the failure is visible in the log.
        $line = '{"level":"warning","msg":"log entry could not be JSON-encoded: ' . json_last_error_msg() . '"}';
    }
    error_log($line);
}
|
||||
|
||||
/**
 * Write an info-level entry to the PHP error log as one JSON line.
 *
 * Non-ASCII text is written unescaped so the messages stay readable, and a
 * value that cannot be encoded no longer results in an empty log line.
 *
 * @param string $msg     Message text
 * @param array  $context Extra data stored under the "context" key
 */
private function logInfo($msg, $context = []) {
    $record = ['level' => 'info', 'msg' => $msg, 'context' => $context, 'time' => date('Y-m-d H:i:s')];
    // JSON_PARTIAL_OUTPUT_ON_ERROR substitutes unencodable values (e.g. malformed
    // UTF-8) instead of making json_encode() return false.
    $line = json_encode($record, JSON_UNESCAPED_UNICODE | JSON_PARTIAL_OUTPUT_ON_ERROR);
    if ($line === false) {
        // Last resort: still emit an entry so the failure is visible in the log.
        $line = '{"level":"info","msg":"log entry could not be JSON-encoded: ' . json_last_error_msg() . '"}';
    }
    error_log($line);
}
|
||||
}
|
||||
Reference in New Issue
Block a user