diff --git a/application/common/ReferenceCheckService.php b/application/common/ReferenceCheckService.php index 27ff9d8c..be13d089 100644 --- a/application/common/ReferenceCheckService.php +++ b/application/common/ReferenceCheckService.php @@ -193,7 +193,7 @@ class ReferenceCheckService 'created_at' => $now, 'updated_at' => $now, ]); - continue; + $this->pushJob(intval($checkId), $delay); $checkIds[] = $checkId; $queued++; @@ -682,13 +682,13 @@ class ReferenceCheckService */ private function extractLocalCitationContext($content, $tagStart, $tagEnd, array $tagSpans) { - $sentenceStart = $this->findSentenceStart($content, $tagStart); + $paragraphStart = $this->findParagraphStart($content, $tagStart); $sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd); - $prevTagEnd = $sentenceStart; + $prevTagEnd = $paragraphStart; $nextTagStart = $sentenceEnd; foreach ($tagSpans as $span) { - if ($span['end'] <= $tagStart && $span['end'] > $prevTagEnd && $span['end'] >= $sentenceStart) { + if ($span['end'] <= $tagStart && $span['end'] > $prevTagEnd) { $prevTagEnd = $span['end']; } if ($span['start'] > $tagEnd && $span['start'] < $nextTagStart) { @@ -696,9 +696,13 @@ class ReferenceCheckService } } - $hasPriorCiteInSentence = ($prevTagEnd > $sentenceStart); - // 同句后续引用:从上一 标签后开始;首个引用:从整句开头到本标签前 - $localStart = $hasPriorCiteInSentence ? $prevTagEnd : $sentenceStart; + $hasPriorCiteInParagraph = ($prevTagEnd > $paragraphStart); + // 同段后续引用:从上一 后开始;段内首个引用:从段落开头到本标签前(非仅最后一句) + if ($hasPriorCiteInParagraph) { + $localStart = $prevTagEnd; + } else { + $localStart = $this->capContextStartBeforeTag($content, $tagStart, $paragraphStart); + } // 默认:引用标签前的论述(如 Yin et al. 
[13] → 含 “higher than … Yin et al.”)
         $localEnd = $tagStart;
@@ -894,6 +898,89 @@ class ReferenceCheckService
         return true;
     }
 
+    /**
+     * Find the byte offset where the paragraph containing a citation tag starts.
+     *
+     * The start is the position right after the last paragraph delimiter that
+     * precedes the tag: an opening <p ...> tag, a closing </p> or a <br> / <br/>
+     * together with the whitespace that follows it, or a newline. Anchoring on
+     * the paragraph instead of the last full stop keeps every sentence of a
+     * multi-sentence paragraph in the context.
+     *
+     * @param string $content  Text being scanned.
+     * @param int    $tagStart Byte offset of the citation tag within $content.
+     * @return int Byte offset of the paragraph start; 0 if no delimiter precedes the tag.
+     */
+    private function findParagraphStart($content, $tagStart)
+    {
+        $search = substr($content, 0, max(0, $tagStart));
+        if ($search === '') {
+            return 0;
+        }
+
+        $best = 0;
+
+        // HTML delimiters: the paragraph begins right after the last match of each.
+        $patterns = ['/<p\b[^>]*>/i', '/<\/p>\s*/i', '/<br\s*\/?>\s*/i'];
+        foreach ($patterns as $pattern) {
+            if (preg_match_all($pattern, $search, $m, PREG_OFFSET_CAPTURE)) {
+                $last = end($m[0]);
+                $best = max($best, $last[1] + strlen($last[0]));
+            }
+        }
+
+        // Plain-text delimiter. No separate "\n\n" lookup is needed: the last
+        // "\n" always lies at or beyond the second newline of the last "\n\n".
+        $pos = strrpos($search, "\n");
+        if ($pos !== false) {
+            $best = max($best, $pos + 1);
+        }
+
+        return $best;
+    }
+
+    /**
+     * Cap how far back the context of a paragraph's first citation may reach.
+     *
+     * If the text between the paragraph start and the tag fits in $maxBytes,
+     * the paragraph start is returned as is. Otherwise the window is limited
+     * to the last $maxBytes bytes before the tag and, when possible, moved
+     * forward to the first sentence boundary inside that window so the
+     * context does not open in the middle of a sentence.
+     *
+     * @param string $content        Text being scanned.
+     * @param int    $tagStart       Byte offset of the citation tag.
+     * @param int    $paragraphStart Byte offset where the paragraph begins.
+     * @param int    $maxBytes       Maximum number of bytes kept before the tag.
+     * @return int Byte offset at which the context should start.
+     */
+    private function capContextStartBeforeTag($content, $tagStart, $paragraphStart, $maxBytes = 2500)
+    {
+        if ($tagStart - $paragraphStart <= $maxBytes) {
+            return $paragraphStart;
+        }
+
+        $start = $tagStart - $maxBytes;
+        // A byte-based cut can land inside a multi-byte UTF-8 character. Skip the
+        // continuation bytes (10xxxxxx); otherwise the /u match below fails and
+        // the context would begin with a broken character.
+        while ($start < $tagStart && isset($content[$start]) && (ord($content[$start]) & 0xC0) === 0x80) {
+            $start++;
+        }
+
+        $slice = substr($content, $start, $tagStart - $start);
+        // preg_match() yields one match: $m[0] is [matched text, byte offset].
+        if (preg_match('/[.!?。!?]\s+/u', $slice, $m, PREG_OFFSET_CAPTURE) === 1) {
+            $rel = $m[0][1] + strlen($m[0][0]);
+            // A boundary at the very end of the window would leave no context.
+            if ($rel < strlen($slice)) {
+                return $start + $rel;
+            }
+        }
+
+        return max($paragraphStart, $start);
+    }
+
     private function findSentenceStart($content, $position)
     {
         $start = 0;