This commit is contained in:
wyn
2026-05-21 13:55:13 +08:00
parent 74383d24ea
commit 867621232b

View File

@@ -193,7 +193,7 @@ class ReferenceCheckService
'created_at' => $now,
'updated_at' => $now,
]);
continue;
$this->pushJob(intval($checkId), $delay);
$checkIds[] = $checkId;
$queued++;
@@ -631,33 +631,33 @@ class ReferenceCheckService
public function extractReferences($content)
{
$result = [];
preg_match_all('/<blue>\[([\d,\-\s]+)\]<\/blue>/', $content, $matches,PREG_OFFSET_CAPTURE);
preg_match_all('/<blue>\[([\d,\-\s]+)\]<\/blue>/', $content, $matches, PREG_OFFSET_CAPTURE);
if (empty($matches[0])) {
return [];
}
$tagSpans = [];
foreach ($matches[0] as $index => $match) {
$tagSpans[] = [
'start' => $match[1],
'end' => $match[1] + strlen($match[0]),
'index' => $index,
];
}
foreach ($matches[0] as $index => $match) {
$fullTag = $match[0];
$tagStart = $match[1];
$tagEnd = $tagStart + strlen($fullTag);
$rawRef = trim($matches[1][$index][0]);
$referenceNumbers = $this->expandReferenceNumbers($rawRef);
$sentenceStart = $this->findSentenceStart($content, $tagStart);
$sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd);
$originalText = $this->buildCitationContextText($content, $sentenceStart, $sentenceEnd);
if (!$this->isMeaningfulCitationContext($originalText)) {
list($sentenceStart, $sentenceEnd) = $this->widenCitationContextBounds(
$content,
$tagStart,
$tagEnd,
$sentenceStart,
$sentenceEnd
);
$originalText = $this->buildCitationContextText($content, $sentenceStart, $sentenceEnd);
}
list($localStart, $localEnd, $originalText) = $this->extractLocalCitationContext(
$content,
$tagStart,
$tagEnd,
$tagSpans
);
if (!$this->isMeaningfulCitationContext($originalText) || empty($referenceNumbers)) {
continue;
@@ -669,14 +669,81 @@ class ReferenceCheckService
'original_text' => $originalText,
'reference_start' => $tagStart,
'reference_end' => $tagEnd,
'text_start' => $sentenceStart,
'text_end' => $sentenceEnd,
'text_start' => $localStart,
'text_end' => $localEnd,
];
}
return $result;
}
/**
 * Pick the local context for one citation tag, bounded by its sentence and
 * by the neighbouring citation tags.
 *
 * The text in front of the tag is preferred. When several citations share a
 * sentence, a later citation starts right after the previous tag instead of
 * at the sentence start. If the leading text is unusable or very short, the
 * text following the tag (up to the next tag or the sentence end) is tried,
 * and as a last resort the bounds are widened.
 *
 * @param string $content  Full text being scanned; all offsets are byte offsets.
 * @param int    $tagStart Byte offset where the citation tag begins.
 * @param int    $tagEnd   Byte offset just past the citation tag.
 * @param array  $tagSpans Every citation tag as ['start' => int, 'end' => int, ...].
 * @return array [$localStart, $localEnd, $originalText]
 */
private function extractLocalCitationContext($content, $tagStart, $tagEnd, array $tagSpans)
{
    $sentenceFrom = $this->findSentenceStart($content, $tagStart);
    $sentenceTo = $this->findSentenceEnd($content, $tagEnd, $tagEnd);

    // Closest citation tag ending before this one and closest one starting
    // after it, both limited to the current sentence.
    $leftBound = $sentenceFrom;
    $rightBound = $sentenceTo;
    foreach ($tagSpans as $other) {
        $otherEnd = $other['end'];
        $otherStart = $other['start'];
        // $leftBound never drops below $sentenceFrom, so exceeding it already
        // implies the candidate lies inside the sentence.
        if ($otherEnd <= $tagStart && $otherEnd > $leftBound) {
            $leftBound = $otherEnd;
        }
        if ($otherStart > $tagEnd && $otherStart < $rightBound) {
            $rightBound = $otherStart;
        }
    }

    // First citation in the sentence: $leftBound is still the sentence start.
    // Later citation in the same sentence: $leftBound is the end of the previous tag.
    $localStart = $leftBound;
    // Default: the statement in front of the tag
    // (e.g. "... higher than ... Yin et al. [13]" keeps "higher than ... Yin et al.").
    $localEnd = $tagStart;
    $originalText = $this->buildCitationContextText($content, $localStart, $localEnd);

    // Almost nothing usable in front of the tag: switch to the text that
    // follows it, up to the next citation tag or the end of the sentence.
    $preferTrailing = !$this->isMeaningfulCitationContext($originalText)
        || $this->shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd);
    if ($preferTrailing) {
        if ($rightBound < $sentenceTo) {
            $trailEnd = $rightBound;
        } else {
            $trailEnd = $sentenceTo;
        }
        $trailText = $this->buildCitationContextText($content, $tagEnd, $trailEnd);
        if ($this->isMeaningfulCitationContext($trailText)) {
            $localStart = $tagEnd;
            $localEnd = $trailEnd;
            $originalText = $trailText;
        }
    }

    // Still nothing meaningful: let the widening helper choose new bounds.
    if (!$this->isMeaningfulCitationContext($originalText)) {
        $widened = $this->widenCitationContextBounds(
            $content,
            $tagStart,
            $tagEnd,
            $localStart,
            $localEnd
        );
        list($localStart, $localEnd) = $widened;
        $originalText = $this->buildCitationContextText($content, $localStart, $localEnd);
    }

    return [$localStart, $localEnd, $originalText];
}
/**
 * Decide whether the text after the citation tag should be used as context
 * because the text before it is only a tiny fragment (e.g. an author
 * abbreviation).
 *
 * @param string $content    Full text being scanned.
 * @param int    $localStart Byte offset where the leading context begins.
 * @param int    $tagStart   Byte offset where the citation tag begins.
 * @param int    $tagEnd     Byte offset just past the tag (not used by this check).
 * @return bool True when the leading text is not meaningful or is shorter than 25 characters.
 */
private function shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)
{
    $leadingText = $this->buildCitationContextText($content, $localStart, $tagStart);

    return !$this->isMeaningfulCitationContext($leadingText)
        || mb_strlen($leadingText) < 25;
}
public function expandReferenceNumbers($refStr)
{
$refStr = str_replace(
@@ -703,12 +770,43 @@ class ReferenceCheckService
return array_values(array_unique($numbers));
}
/**
 * Return the byte offset just past the UTF-8 character occupying $bytePos,
 * i.e. the offset at which the next character starts.
 *
 * For a position outside the string, $bytePos + 1 clamped to
 * [0, strlen($content)] is returned.
 *
 * @param string $content Text to inspect.
 * @param int    $bytePos Byte offset inside $content.
 * @return int
 */
private function utf8CharEnd($content, $bytePos)
{
    $total = strlen($content);

    if ($bytePos >= 0 && $bytePos < $total) {
        // Step over continuation bytes (bit pattern 10xxxxxx) that follow $bytePos.
        for ($cursor = $bytePos + 1; $cursor < $total; $cursor++) {
            if ((ord($content[$cursor]) & 0xC0) !== 0x80) {
                break;
            }
        }
        return $cursor;
    }

    return max(0, min($total, $bytePos + 1));
}
/**
 * Slice $content between two byte offsets (the same unit strpos/strlen use).
 *
 * mb_substr must not be used here: it counts characters, so with a
 * multi-byte (e.g. Chinese) prefix the byte offsets would land in the wrong
 * place and cut off the beginning of the following English word.
 *
 * @param string $content Text to slice.
 * @param int    $start   Starting byte offset.
 * @param int    $end     Byte offset just past the slice.
 * @return string Empty string when $end is not greater than $start.
 */
private function byteSubstr($content, $start, $end)
{
    $byteCount = $end - $start;
    if ($byteCount <= 0) {
        return '';
    }

    return (string)mb_strcut($content, $start, $byteCount, 'UTF-8');
}
/**
 * Build the plain-text citation context for the byte range [$start, $end)
 * of $content.
 *
 * The slice is taken by byte offsets, <blue>[n]</blue> citation markers and
 * any remaining tags are removed, surrounding whitespace is trimmed, inner
 * whitespace runs are collapsed to single spaces, and a leading UTF-8 BOM
 * is dropped.
 *
 * @param string $content Full text; offsets are byte offsets.
 * @param int    $start   Starting byte offset.
 * @param int    $end     Byte offset just past the range.
 * @return string
 */
private function buildCitationContextText($content, $start, $end)
{
    // Offsets are byte positions, so slice by bytes; a character-based
    // mb_substr would mis-slice whenever multi-byte text precedes $start.
    $text = $this->byteSubstr($content, $start, $end);
    $text = (string)preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $text);
    $text = trim(strip_tags($text));
    // Cast keeps the result a string if PCRE returns null (e.g. invalid UTF-8 with /u).
    $text = (string)preg_replace('/\s+/u', ' ', $text);
    // Remove a leading BOM only as a complete 3-byte sequence. ltrim() with
    // "\xEF\xBB\xBF" treats its argument as a set of single bytes and would
    // chop the first byte off any character that starts with 0xEF (such as
    // the full-width "(" = EF BC 88), leaving invalid UTF-8 behind.
    $text = (string)preg_replace('/^(?:\xEF\xBB\xBF)+/', '', $text);
    return $text;
}
@@ -768,7 +866,7 @@ class ReferenceCheckService
}
/**
* 句号是否可作为句界(排除 0.95、3.14 等小数点)
* Whether a period can serve as a sentence boundary (excluding decimal points and abbreviations such as "et al.").
*/
private function isSentenceDelimiterAt($content, $pos, $delimiter)
{
@@ -783,6 +881,16 @@ class ReferenceCheckService
return false;
}
$before = substr($content, max(0, $pos - 12), min(12, $pos));
if (preg_match('/\b(et\s+al|e\.g|i\.e|vs|etc|fig|no)\s*\.?\s*$/i', $before)) {
return false;
}
$after = substr($content, $pos + 1, 24);
if (preg_match('/^\s*<blue>\s*\[/', $after)) {
return false;
}
return true;
}
@@ -792,7 +900,7 @@ class ReferenceCheckService
foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
$pos = strrpos(substr($content, 0, $position), $delimiter);
if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
$start = max($start, $pos + 1);
$start = max($start, $this->utf8CharEnd($content, $pos));
}
}
return $start;
@@ -812,7 +920,7 @@ class ReferenceCheckService
foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
$pos = strpos($content, $delimiter, $minPos);
if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
$endPositions[] = $pos + 1;
$endPositions[] = $this->utf8CharEnd($content, $pos);
}
}
if (empty($endPositions)) {