From c2ed27c0dfd767acbfa72380496e6debbff5a32f Mon Sep 17 00:00:00 2001 From: chengxl Date: Fri, 22 Aug 2025 16:24:59 +0800 Subject: [PATCH] =?UTF-8?q?job=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- application/api/controller/Aireview.php | 67 ++++++++++++++++--------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/application/api/controller/Aireview.php b/application/api/controller/Aireview.php index cd500eb..a126117 100644 --- a/application/api/controller/Aireview.php +++ b/application/api/controller/Aireview.php @@ -140,7 +140,6 @@ class Aireview extends Base $sContent = $oHelperFunction->filterAllTags($sContent); //将文章内容拆分参考文献 $aDealContent = $this->dealContent($sContent); - $sBefore= empty($aDealContent['before']) ? '' : $aDealContent['before']; $sReference = empty($aDealContent['after']) ? '' : $aDealContent['after']; if(in_array($sQuestionFields, ['attribute'])){//科学性和创新性 @@ -193,50 +192,72 @@ class Aireview extends Base * @param sContent 文章内容 */ private function dealContent($sContent = '',$regex = null){ - if(empty($sContent)){ + if (empty($sContent)) { return ['before' => '', 'after' => '']; } - // 1. 限制匹配范围(末尾30%) - $contentLength = strlen($sContent); - $searchStart = $contentLength > 5000 ? (int)($contentLength * 0.7) : 0; - $searchContent = substr($sContent, $searchStart); + // 1. 优化字符串长度计算 + $contentLength = mb_strlen($sContent); + $searchStart = $contentLength > 5000 ? (int)($contentLength * 0.6) : 0; - // 2. 正则模式优化 + // 2. 截取搜索区域 + $searchContent = $searchStart > 0 + ? mb_substr($sContent, $searchStart, null, 'UTF-8') + : $sContent; + + // 3. 正则模式优化 - 重点处理拼写错误和特殊格式 if ($regex === null) { + // 关键词列表增加容错处理 $keywords = [ - 'references?', 'bibliograph(?:y|ies)', - 'works? cited', 'citation(?:s)?' + 'r?reference[s]?', // 允许关键词前多一个r(处理Rreferences这类拼写错误) + 'bibliograph(?:y|ies)', + 'works? cited', + 'citation[s]?', + 'literature cited', + 'reference list' ]; + + // 正则模式优化: + // 1. 允许关键词与前面内容直接连接(解决"article.Rreferences"这种没有空格的情况) + // 2. 增强对冒号和方括号的支持(匹配"References:[1]"这种格式) $pattern = sprintf( - '/(?:^|\s)\s*#*\s*(%s)\s*[:\-–]?\s*(?:$|\s)/i', + '/(?:^|\s|[(\[{<"\']|[\p{P}])?\s*\#*[*_]*\s*(%s)\s*[*_]*\s*[:\-–=.]?\s*(?:$|\s|[)\]}>"\'.|[\d{[])*/i', implode('|', $keywords) ); $regex = $pattern; } - // 3. 匹配并处理结果 - if (preg_match_all($regex, $searchContent, $matches, PREG_OFFSET_CAPTURE)) { - $lastMatch = end($matches[0]); - $refPosition = $searchStart + $lastMatch[1]; + // 4. 高效匹配 - 从字符串末尾开始搜索最后一个匹配项 + // 使用带偏移量的循环匹配找到最后一个符合条件的关键词 + $lastPos = 0; + $matchFound = false; + $offset = 0; + $searchLen = mb_strlen($searchContent); - // 4. 合并字符串处理 - $before = substr($sContent, 0, $refPosition); - $after = substr($sContent, $refPosition); + // 从字符串末尾向前搜索,提高长文本效率 + while ($offset < $searchLen && preg_match($regex, $searchContent, $match, PREG_OFFSET_CAPTURE, $offset)) { + $lastPos = $match[0][1]; + $matchFound = true; + // 移动偏移量继续查找下一个匹配 + $offset = $lastPos + mb_strlen($match[0][0]); + } + + if ($matchFound) { + $refPosition = $searchStart + $lastPos; - // 一次性处理空白和换行 - $process = function($str) { - return str_replace("\n", '', trim($str)); + $process = static function($str) { + return trim(str_replace("\n", '', $str)); }; + return [ - 'before' => $process($before), - 'after' => $process($after) + 'before' => $process(mb_substr($sContent, 0, $refPosition, 'UTF-8')), + 'after' => $process(mb_substr($sContent, $refPosition, null, 'UTF-8')) ]; } // 未匹配时处理 return [ - 'before' => str_replace("\n", '', trim($sContent)), + 'before' => trim(str_replace("\n", '', $sContent)), 'after' => '' ]; // $lastPos = strrpos($sContent, 'Reference');