Changes
This commit is contained in:
@@ -193,7 +193,7 @@ class ReferenceCheckService
|
||||
'created_at' => $now,
|
||||
'updated_at' => $now,
|
||||
]);
|
||||
|
||||
continue;
|
||||
$this->pushJob(intval($checkId), $delay);
|
||||
$checkIds[] = $checkId;
|
||||
$queued++;
|
||||
@@ -631,33 +631,33 @@ class ReferenceCheckService
|
||||
public function extractReferences($content)
|
||||
{
|
||||
$result = [];
|
||||
preg_match_all('/<blue>\[([\d,\-\s]+)\]<\/blue>/', $content, $matches,PREG_OFFSET_CAPTURE);
|
||||
preg_match_all('/<blue>\[([\d,\-\s]+)\]<\/blue>/', $content, $matches, PREG_OFFSET_CAPTURE);
|
||||
if (empty($matches[0])) {
|
||||
return [];
|
||||
}
|
||||
|
||||
$tagSpans = [];
|
||||
foreach ($matches[0] as $index => $match) {
|
||||
$tagSpans[] = [
|
||||
'start' => $match[1],
|
||||
'end' => $match[1] + strlen($match[0]),
|
||||
'index' => $index,
|
||||
];
|
||||
}
|
||||
|
||||
foreach ($matches[0] as $index => $match) {
|
||||
$fullTag = $match[0];
|
||||
$tagStart = $match[1];
|
||||
$tagEnd = $tagStart + strlen($fullTag);
|
||||
$rawRef = trim($matches[1][$index][0]);
|
||||
$referenceNumbers = $this->expandReferenceNumbers($rawRef);
|
||||
|
||||
$sentenceStart = $this->findSentenceStart($content, $tagStart);
|
||||
$sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd);
|
||||
$originalText = $this->buildCitationContextText($content, $sentenceStart, $sentenceEnd);
|
||||
|
||||
if (!$this->isMeaningfulCitationContext($originalText)) {
|
||||
list($sentenceStart, $sentenceEnd) = $this->widenCitationContextBounds(
|
||||
$content,
|
||||
$tagStart,
|
||||
$tagEnd,
|
||||
$sentenceStart,
|
||||
$sentenceEnd
|
||||
);
|
||||
$originalText = $this->buildCitationContextText($content, $sentenceStart, $sentenceEnd);
|
||||
}
|
||||
list($localStart, $localEnd, $originalText) = $this->extractLocalCitationContext(
|
||||
$content,
|
||||
$tagStart,
|
||||
$tagEnd,
|
||||
$tagSpans
|
||||
);
|
||||
|
||||
if (!$this->isMeaningfulCitationContext($originalText) || empty($referenceNumbers)) {
|
||||
continue;
|
||||
@@ -669,14 +669,81 @@ class ReferenceCheckService
|
||||
'original_text' => $originalText,
|
||||
'reference_start' => $tagStart,
|
||||
'reference_end' => $tagEnd,
|
||||
'text_start' => $sentenceStart,
|
||||
'text_end' => $sentenceEnd,
|
||||
'text_start' => $localStart,
|
||||
'text_end' => $localEnd,
|
||||
];
|
||||
}
|
||||
|
||||
return $result;
|
||||
}
|
||||
|
||||
/**
 * Pick the local context for a single citation tag.
 *
 * The text in front of the tag is preferred. When an earlier citation tag
 * lies in the same sentence, the context starts right after that earlier
 * tag instead of at the beginning of the sentence.
 *
 * @param string $content  Full text being scanned.
 * @param int    $tagStart Offset where the citation tag begins.
 * @param int    $tagEnd   Offset just past the citation tag.
 * @param array  $tagSpans Entries carrying 'start' and 'end' offsets of citation tags.
 *
 * @return array [context start offset, context end offset, context text]
 */
private function extractLocalCitationContext($content, $tagStart, $tagEnd, array $tagSpans)
{
    $sentenceStart = $this->findSentenceStart($content, $tagStart);
    $sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd);

    // Closest tag boundaries around this tag, clamped to the sentence bounds.
    $prevTagEnd = $sentenceStart;
    $nextTagStart = $sentenceEnd;
    foreach ($tagSpans as $other) {
        if ($other['end'] <= $tagStart) {
            $prevTagEnd = max($prevTagEnd, $other['end']);
        }
        if ($other['start'] > $tagEnd) {
            $nextTagStart = min($nextTagStart, $other['start']);
        }
    }

    // $prevTagEnd still equals the sentence start unless an earlier tag sits
    // inside this sentence, so it is the context start in both cases.
    $localStart = $prevTagEnd;

    // Default: the statement in front of the tag
    // (e.g. "... higher than Yin et al. [13]" keeps "higher than ... Yin et al.").
    $localEnd = $tagStart;
    $originalText = $this->buildCitationContextText($content, $localStart, $localEnd);

    // Hardly any text before the tag (or only a very short fragment):
    // try the text after the tag, up to the next citation or the sentence end.
    $preferTrailing = !$this->isMeaningfulCitationContext($originalText)
        || $this->shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd);
    if ($preferTrailing) {
        $trailEnd = min($nextTagStart, $sentenceEnd);
        $trailText = $this->buildCitationContextText($content, $tagEnd, $trailEnd);
        if ($this->isMeaningfulCitationContext($trailText)) {
            $localStart = $tagEnd;
            $localEnd = $trailEnd;
            $originalText = $trailText;
        }
    }

    if ($this->isMeaningfulCitationContext($originalText)) {
        return [$localStart, $localEnd, $originalText];
    }

    // Still nothing usable: let the widening helper pick new bounds.
    list($localStart, $localEnd) = $this->widenCitationContextBounds(
        $content,
        $tagStart,
        $tagEnd,
        $localStart,
        $localEnd
    );

    return [
        $localStart,
        $localEnd,
        $this->buildCitationContextText($content, $localStart, $localEnd),
    ];
}
|
||||
|
||||
/**
 * Decide whether the text after the tag should be used as context because
 * the text before it is only a very short fragment (e.g. an author abbreviation).
 *
 * @param string $content    Full text being scanned.
 * @param int    $localStart Offset where the leading context starts.
 * @param int    $tagStart   Offset where the citation tag begins.
 * @param int    $tagEnd     Offset just past the citation tag (not used here).
 *
 * @return bool True when the leading text is not meaningful or is shorter than 25 characters.
 */
private function shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)
{
    $leading = $this->buildCitationContextText($content, $localStart, $tagStart);

    return !$this->isMeaningfulCitationContext($leading)
        || mb_strlen($leading) < 25;
}
|
||||
|
||||
public function expandReferenceNumbers($refStr)
|
||||
{
|
||||
$refStr = str_replace(
|
||||
@@ -703,12 +770,43 @@ class ReferenceCheckService
|
||||
return array_values(array_unique($numbers));
|
||||
}
|
||||
|
||||
/**
 * Return the offset just past the last byte of the UTF-8 code point located
 * at $bytePos, i.e. the offset where the next character starts.
 *
 * @param string $content Text to inspect.
 * @param int    $bytePos Byte offset of the character.
 *
 * @return int Offset of the following character; an out-of-range $bytePos
 *             yields $bytePos + 1 clamped into [0, strlen($content)].
 */
private function utf8CharEnd($content, $bytePos)
{
    $total = strlen($content);

    $inRange = ($bytePos >= 0 && $bytePos < $total);
    if (!$inRange) {
        return max(0, min($total, $bytePos + 1));
    }

    // Skip every continuation byte (bit pattern 10xxxxxx) after $bytePos.
    for ($cursor = $bytePos + 1; $cursor < $total; $cursor++) {
        if ((ord($content[$cursor]) & 0xC0) !== 0x80) {
            break;
        }
    }

    return $cursor;
}
|
||||
|
||||
/**
 * Slice $content by byte offsets, matching the offsets produced by
 * strpos()/strlen(). mb_substr() must not be used here: it counts
 * characters, so a Chinese prefix would shift the cut and chop off the
 * beginning of a following English word.
 *
 * @param string $content Text to slice.
 * @param int    $start   Byte offset of the first byte to keep.
 * @param int    $end     Byte offset just past the last byte to keep.
 *
 * @return string The slice, or '' when $end does not lie beyond $start.
 */
private function byteSubstr($content, $start, $end)
{
    if ($end <= $start) {
        return '';
    }

    $slice = mb_strcut($content, $start, $end - $start, 'UTF-8');

    return (string)$slice;
}
|
||||
|
||||
/**
 * Build the plain-text citation context for the range [$start, $end) of $content.
 *
 * Citation tags of the form <blue>[...]</blue> and any other markup are
 * removed, whitespace runs are collapsed to a single space, and a leading
 * UTF-8 byte order mark is dropped.
 *
 * @param string $content Full text being scanned.
 * @param int    $start   Byte offset where the context starts.
 * @param int    $end     Byte offset just past the context.
 *
 * @return string Cleaned context text (may be empty).
 */
private function buildCitationContextText($content, $start, $end)
{
    // Offsets are byte positions, so the slice is taken by bytes, not by characters.
    $text = $this->byteSubstr($content, $start, $end);
    $text = preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $text);
    $text = trim(strip_tags($text));

    // preg_replace() yields null when the /u pattern meets invalid UTF-8;
    // keep the uncollapsed text in that case instead of passing null on.
    $collapsed = preg_replace('/\s+/u', ' ', $text);
    if ($collapsed !== null) {
        $text = $collapsed;
    }

    // Remove a byte order mark only as a complete 3-byte prefix. ltrim() with
    // a character list strips the bytes one by one and would damage any other
    // leading character whose first byte is 0xEF (e.g. full-width punctuation).
    while (strncmp($text, "\xEF\xBB\xBF", 3) === 0) {
        $text = (string)substr($text, 3);
    }

    return $text;
}
|
||||
@@ -768,7 +866,7 @@ class ReferenceCheckService
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether a period can serve as a sentence boundary (excludes decimal points such as 0.95 or 3.14, and abbreviations such as "et al.").
|
||||
*/
|
||||
private function isSentenceDelimiterAt($content, $pos, $delimiter)
|
||||
{
|
||||
@@ -783,6 +881,16 @@ class ReferenceCheckService
|
||||
return false;
|
||||
}
|
||||
|
||||
$before = substr($content, max(0, $pos - 12), min(12, $pos));
|
||||
if (preg_match('/\b(et\s+al|e\.g|i\.e|vs|etc|fig|no)\s*\.?\s*$/i', $before)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
$after = substr($content, $pos + 1, 24);
|
||||
if (preg_match('/^\s*<blue>\s*\[/', $after)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -792,7 +900,7 @@ class ReferenceCheckService
|
||||
foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
|
||||
$pos = strrpos(substr($content, 0, $position), $delimiter);
|
||||
if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
|
||||
$start = max($start, $pos + 1);
|
||||
$start = max($start, $this->utf8CharEnd($content, $pos));
|
||||
}
|
||||
}
|
||||
return $start;
|
||||
@@ -812,7 +920,7 @@ class ReferenceCheckService
|
||||
foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
|
||||
$pos = strpos($content, $delimiter, $minPos);
|
||||
if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
|
||||
$endPositions[] = $pos + 1;
|
||||
$endPositions[] = $this->utf8CharEnd($content, $pos);
|
||||
}
|
||||
}
|
||||
if (empty($endPositions)) {
|
||||
|
||||
Reference in New Issue
Block a user