From 867621232bc318780f05b19f751aa320087006ce Mon Sep 17 00:00:00 2001 From: wyn <1074145239@qq.com> Date: Thu, 21 May 2026 13:55:13 +0800 Subject: [PATCH] Changes --- application/common/ReferenceCheckService.php | 152 ++++++++++++++++--- 1 file changed, 130 insertions(+), 22 deletions(-) diff --git a/application/common/ReferenceCheckService.php b/application/common/ReferenceCheckService.php index 194f60cd..27ff9d8c 100644 --- a/application/common/ReferenceCheckService.php +++ b/application/common/ReferenceCheckService.php @@ -193,7 +193,7 @@ class ReferenceCheckService 'created_at' => $now, 'updated_at' => $now, ]); - + continue; $this->pushJob(intval($checkId), $delay); $checkIds[] = $checkId; $queued++; @@ -631,33 +631,33 @@ class ReferenceCheckService public function extractReferences($content) { $result = []; - preg_match_all('/\[([\d,\-\s]+)\]<\/blue>/', $content, $matches,PREG_OFFSET_CAPTURE); + preg_match_all('/\[([\d,\-\s]+)\]<\/blue>/', $content, $matches, PREG_OFFSET_CAPTURE); if (empty($matches[0])) { return []; } + $tagSpans = []; foreach ($matches[0] as $index => $match) { + $tagSpans[] = [ + 'start' => $match[1], + 'end' => $match[1] + strlen($match[0]), + 'index' => $index, + ]; + } + foreach ($matches[0] as $index => $match) { $fullTag = $match[0]; $tagStart = $match[1]; $tagEnd = $tagStart + strlen($fullTag); $rawRef = trim($matches[1][$index][0]); $referenceNumbers = $this->expandReferenceNumbers($rawRef); - $sentenceStart = $this->findSentenceStart($content, $tagStart); - $sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd); - $originalText = $this->buildCitationContextText($content, $sentenceStart, $sentenceEnd); - - if (!$this->isMeaningfulCitationContext($originalText)) { - list($sentenceStart, $sentenceEnd) = $this->widenCitationContextBounds( - $content, - $tagStart, - $tagEnd, - $sentenceStart, - $sentenceEnd - ); - $originalText = $this->buildCitationContextText($content, $sentenceStart, $sentenceEnd); - } + list($localStart, $localEnd, $originalText) = $this->extractLocalCitationContext( + $content, + $tagStart, + $tagEnd, + $tagSpans + ); if (!$this->isMeaningfulCitationContext($originalText) || empty($referenceNumbers)) { continue; @@ -669,14 +669,81 @@ class ReferenceCheckService 'original_text' => $originalText, 'reference_start' => $tagStart, 'reference_end' => $tagEnd, - 'text_start' => $sentenceStart, - 'text_end' => $sentenceEnd, + 'text_start' => $localStart, + 'text_end' => $localEnd, ]; } return $result; } + /** + * 按引用位置截取局部上下文:优先取标签前叙述;同句多引时后续引用从上一标签后开始。 + */ + private function extractLocalCitationContext($content, $tagStart, $tagEnd, array $tagSpans) + { + $sentenceStart = $this->findSentenceStart($content, $tagStart); + $sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd); + + $prevTagEnd = $sentenceStart; + $nextTagStart = $sentenceEnd; + foreach ($tagSpans as $span) { + if ($span['end'] <= $tagStart && $span['end'] > $prevTagEnd && $span['end'] >= $sentenceStart) { + $prevTagEnd = $span['end']; + } + if ($span['start'] > $tagEnd && $span['start'] < $nextTagStart) { + $nextTagStart = $span['start']; + } + } + + $hasPriorCiteInSentence = ($prevTagEnd > $sentenceStart); + // 同句后续引用:从上一 标签后开始;首个引用:从整句开头到本标签前 + $localStart = $hasPriorCiteInSentence ? $prevTagEnd : $sentenceStart; + + // 默认:引用标签前的论述(如 Yin et al. [13] → 含 “higher than … Yin et al.”) + $localEnd = $tagStart; + $originalText = $this->buildCitationContextText($content, $localStart, $localEnd); + + // 标签前几乎无正文(如句末 … ICU nurses [14])→ 改用标签后至下一引用或句末 + if (!$this->isMeaningfulCitationContext($originalText) + || $this->shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd) + ) { + $trailEnd = ($nextTagStart < $sentenceEnd) ? $nextTagStart : $sentenceEnd; + $trailText = $this->buildCitationContextText($content, $tagEnd, $trailEnd); + if ($this->isMeaningfulCitationContext($trailText)) { + $localStart = $tagEnd; + $localEnd = $trailEnd; + $originalText = $trailText; + } + } + + if (!$this->isMeaningfulCitationContext($originalText)) { + list($localStart, $localEnd) = $this->widenCitationContextBounds( + $content, + $tagStart, + $tagEnd, + $localStart, + $localEnd + ); + $originalText = $this->buildCitationContextText($content, $localStart, $localEnd); + } + + return [$localStart, $localEnd, $originalText]; + } + + /** + * 标签前仅有作者缩写等极短片段时,改用标签后上下文 + */ + private function shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd) + { + $before = $this->buildCitationContextText($content, $localStart, $tagStart); + if (!$this->isMeaningfulCitationContext($before)) { + return true; + } + + return mb_strlen($before) < 25; + } + public function expandReferenceNumbers($refStr) { $refStr = str_replace( @@ -703,12 +770,43 @@ class ReferenceCheckService return array_values(array_unique($numbers)); } + /** + * 返回 $bytePos 处 UTF-8 码点占用的最后一字节之后的位置(下一字符起始) + */ + private function utf8CharEnd($content, $bytePos) + { + $len = strlen($content); + if ($bytePos < 0 || $bytePos >= $len) { + return max(0, min($len, $bytePos + 1)); + } + $next = $bytePos + 1; + while ($next < $len && (ord($content[$next]) & 0xC0) === 0x80) { + $next++; + } + + return $next; + } + + /** + * 按字节偏移截取(与 strpos/strlen 一致);勿用 mb_substr,否则遇中文前缀会截断英文词头 + */ + private function byteSubstr($content, $start, $end) + { + $length = max(0, $end - $start); + if ($length === 0) { + return ''; + } + + return (string)mb_strcut($content, $start, $length, 'UTF-8'); + } + private function buildCitationContextText($content, $start, $end) { - $text = mb_substr($content, $start, max(0, $end - $start)); + $text = $this->byteSubstr($content, $start, $end); $text = preg_replace('/\[[\d,\-\s]+\]<\/blue>/', '', $text); $text = trim(strip_tags($text)); $text = preg_replace('/\s+/u', ' ', $text); + $text = ltrim($text, "\xEF\xBB\xBF"); return $text; } @@ -768,7 +866,7 @@ class ReferenceCheckService } /** - * 句号是否可作为句界(排除 0.95、3.14 等小数点) + * 句号是否可作为句界(排除小数点、et al. 等缩写) */ private function isSentenceDelimiterAt($content, $pos, $delimiter) { @@ -783,6 +881,16 @@ class ReferenceCheckService return false; } + $before = substr($content, max(0, $pos - 12), min(12, $pos)); + if (preg_match('/\b(et\s+al|e\.g|i\.e|vs|etc|fig|no)\s*\.?\s*$/i', $before)) { + return false; + } + + $after = substr($content, $pos + 1, 24); + if (preg_match('/^\s*\s*\[/', $after)) { + return false; + } + return true; } @@ -792,7 +900,7 @@ class ReferenceCheckService foreach (['.', '。', '!', '?', "\n"] as $delimiter) { $pos = strrpos(substr($content, 0, $position), $delimiter); if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) { - $start = max($start, $pos + 1); + $start = max($start, $this->utf8CharEnd($content, $pos)); } } return $start; @@ -812,7 +920,7 @@ class ReferenceCheckService foreach (['.', '。', '!', '?', "\n"] as $delimiter) { $pos = strpos($content, $delimiter, $minPos); if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) { - $endPositions[] = $pos + 1; + $endPositions[] = $this->utf8CharEnd($content, $pos); } } if (empty($endPositions)) {