原文内容截取的已经很好了
This commit is contained in:
wyn
2026-05-21 15:19:07 +08:00
parent 3663dd4ea6
commit 8cd033a56d

View File

@@ -193,7 +193,7 @@ class ReferenceCheckService
'created_at' => $now, 'created_at' => $now,
'updated_at' => $now, 'updated_at' => $now,
]); ]);
// continue; continue;
$this->pushJob(intval($checkId), $delay); $this->pushJob(intval($checkId), $delay);
$checkIds[] = $checkId; $checkIds[] = $checkId;
$queued++; $queued++;
@@ -677,6 +677,9 @@ class ReferenceCheckService
return $result; return $result;
} }
/** 与上一引用间距低于此值(字符)时视为同句并列,从整句开头截取而非仅取两标签之间 */
const CITE_GAP_SENTENCE_THRESHOLD = 60;
/** /**
* 按引用位置截取局部上下文:优先取标签前叙述;同句多引时后续引用从上一标签后开始。 * 按引用位置截取局部上下文:优先取标签前叙述;同句多引时后续引用从上一标签后开始。
*/ */
@@ -697,32 +700,37 @@ class ReferenceCheckService
} }
$hasPriorCiteInParagraph = ($prevTagEnd > $paragraphStart); $hasPriorCiteInParagraph = ($prevTagEnd > $paragraphStart);
// 同段后续引用:从上一 <blue> 后开始;段内首个引用:从段落开头到本标签前
if ($hasPriorCiteInParagraph) { if ($hasPriorCiteInParagraph) {
$localStart = $prevTagEnd; $gapText = $this->buildCitationContextText($content, $prevTagEnd, $tagStart);
// 如 motivation [23] and external environment [24]:间距短,取整句而非仅 “and external environment”
if (mb_strlen($gapText) < self::CITE_GAP_SENTENCE_THRESHOLD) {
$sentenceStart = $this->findSentenceStart($content, $tagStart);
$localStart = $this->capContextStartBeforeTag(
$content,
$tagStart,
max($paragraphStart, $sentenceStart)
);
} else {
// 如 … Yin et al. [13] on oncology nurses, but … Yang [14]:间距较长,取上一标签后至本标签前
$localStart = $prevTagEnd;
}
} else { } else {
$sentenceStart = $this->findSentenceStart($content, $tagStart); $localStart = $this->capContextStartBeforeTag($content, $tagStart, $paragraphStart);
$localStart = $this->capContextStartBeforeTag(
$content,
$tagStart,
max($paragraphStart, $sentenceStart)
);
} }
// 默认:引用标签前的论述 // 默认:引用标签前的论述(如 Yin et al. [13] → 含 “higher than … Yin et al.”)
$localEnd = $tagStart; $localEnd = $tagStart;
$originalText = $this->buildCitationContextText($content, $localStart, $localEnd); $originalText = $this->buildCitationContextText($content, $localStart, $localEnd);
// 同句多引(如 …[23] and external environment [24]):上一标签后仅几个词,回退到本句开头 // 标签前几乎无正文(如句末 … ICU nurses [14])→ 改用标签后至下一引用或句末
if ($hasPriorCiteInParagraph && mb_strlen($originalText) < 50) {
$sentenceStart = $this->findSentenceStart($content, $tagStart);
$localStart = max($paragraphStart, $sentenceStart);
$originalText = $this->buildCitationContextText($content, $localStart, $localEnd);
}
// 仅段内首个引用且标签前极短时才用标签后文(避免 [24] 误截到 [25] 所在句)
if (!$this->isMeaningfulCitationContext($originalText) if (!$this->isMeaningfulCitationContext($originalText)
|| (!$hasPriorCiteInParagraph && $this->shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)) || $this->shouldUseTrailingCitationContext(
$content,
$localStart,
$tagStart,
$tagEnd,
$hasPriorCiteInParagraph
)
) { ) {
$trailEnd = ($nextTagStart < $sentenceEnd) ? $nextTagStart : $sentenceEnd; $trailEnd = ($nextTagStart < $sentenceEnd) ? $nextTagStart : $sentenceEnd;
$trailText = $this->buildCitationContextText($content, $tagEnd, $trailEnd); $trailText = $this->buildCitationContextText($content, $tagEnd, $trailEnd);
@@ -749,12 +757,23 @@ class ReferenceCheckService
/** /**
* 标签前仅有作者缩写等极短片段时,改用标签后上下文 * 标签前仅有作者缩写等极短片段时,改用标签后上下文
*
* @param bool $hasPriorCiteInParagraph 同段多引时,短片段常为并列成分,不应误取标签后下一句
*/ */
private function shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd) private function shouldUseTrailingCitationContext(
{ $content,
$localStart,
$tagStart,
$tagEnd,
$hasPriorCiteInParagraph = false
) {
$before = $this->buildCitationContextText($content, $localStart, $tagStart); $before = $this->buildCitationContextText($content, $localStart, $tagStart);
if (!$this->isMeaningfulCitationContext($before)) { if (!$this->isMeaningfulCitationContext($before)) {
return true; return !$hasPriorCiteInParagraph;
}
if ($hasPriorCiteInParagraph) {
return false;
} }
return mb_strlen($before) < 25; return mb_strlen($before) < 25;
@@ -823,7 +842,7 @@ class ReferenceCheckService
$text = trim(strip_tags($text)); $text = trim(strip_tags($text));
$text = preg_replace('/\s+/u', ' ', $text); $text = preg_replace('/\s+/u', ' ', $text);
$text = ltrim($text, "\xEF\xBB\xBF"); $text = ltrim($text, "\xEF\xBB\xBF");
$text = preg_replace('/^[\s.,。;、:!?]+/u', '', $text); $text = preg_replace('/^[\s.!?。!?,,、;:]+/u', '', $text);
return $text; return $text;
} }