Changes
This commit is contained in:
@@ -193,7 +193,7 @@ class ReferenceCheckService
|
|||||||
'created_at' => $now,
|
'created_at' => $now,
|
||||||
'updated_at' => $now,
|
'updated_at' => $now,
|
||||||
]);
|
]);
|
||||||
|
continue;
|
||||||
$this->pushJob(intval($checkId), $delay);
|
$this->pushJob(intval($checkId), $delay);
|
||||||
$checkIds[] = $checkId;
|
$checkIds[] = $checkId;
|
||||||
$queued++;
|
$queued++;
|
||||||
@@ -631,33 +631,33 @@ class ReferenceCheckService
|
|||||||
public function extractReferences($content)
|
public function extractReferences($content)
|
||||||
{
|
{
|
||||||
$result = [];
|
$result = [];
|
||||||
preg_match_all('/<blue>\[([\d,\-\s]+)\]<\/blue>/', $content, $matches,PREG_OFFSET_CAPTURE);
|
preg_match_all('/<blue>\[([\d,\-\s]+)\]<\/blue>/', $content, $matches, PREG_OFFSET_CAPTURE);
|
||||||
if (empty($matches[0])) {
|
if (empty($matches[0])) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$tagSpans = [];
|
||||||
foreach ($matches[0] as $index => $match) {
|
foreach ($matches[0] as $index => $match) {
|
||||||
|
$tagSpans[] = [
|
||||||
|
'start' => $match[1],
|
||||||
|
'end' => $match[1] + strlen($match[0]),
|
||||||
|
'index' => $index,
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach ($matches[0] as $index => $match) {
|
||||||
$fullTag = $match[0];
|
$fullTag = $match[0];
|
||||||
$tagStart = $match[1];
|
$tagStart = $match[1];
|
||||||
$tagEnd = $tagStart + strlen($fullTag);
|
$tagEnd = $tagStart + strlen($fullTag);
|
||||||
$rawRef = trim($matches[1][$index][0]);
|
$rawRef = trim($matches[1][$index][0]);
|
||||||
$referenceNumbers = $this->expandReferenceNumbers($rawRef);
|
$referenceNumbers = $this->expandReferenceNumbers($rawRef);
|
||||||
|
|
||||||
$sentenceStart = $this->findSentenceStart($content, $tagStart);
|
list($localStart, $localEnd, $originalText) = $this->extractLocalCitationContext(
|
||||||
$sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd);
|
$content,
|
||||||
$originalText = $this->buildCitationContextText($content, $sentenceStart, $sentenceEnd);
|
$tagStart,
|
||||||
|
$tagEnd,
|
||||||
if (!$this->isMeaningfulCitationContext($originalText)) {
|
$tagSpans
|
||||||
list($sentenceStart, $sentenceEnd) = $this->widenCitationContextBounds(
|
);
|
||||||
$content,
|
|
||||||
$tagStart,
|
|
||||||
$tagEnd,
|
|
||||||
$sentenceStart,
|
|
||||||
$sentenceEnd
|
|
||||||
);
|
|
||||||
$originalText = $this->buildCitationContextText($content, $sentenceStart, $sentenceEnd);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!$this->isMeaningfulCitationContext($originalText) || empty($referenceNumbers)) {
|
if (!$this->isMeaningfulCitationContext($originalText) || empty($referenceNumbers)) {
|
||||||
continue;
|
continue;
|
||||||
@@ -669,14 +669,81 @@ class ReferenceCheckService
|
|||||||
'original_text' => $originalText,
|
'original_text' => $originalText,
|
||||||
'reference_start' => $tagStart,
|
'reference_start' => $tagStart,
|
||||||
'reference_end' => $tagEnd,
|
'reference_end' => $tagEnd,
|
||||||
'text_start' => $sentenceStart,
|
'text_start' => $localStart,
|
||||||
'text_end' => $sentenceEnd,
|
'text_end' => $localEnd,
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
return $result;
|
return $result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Pick the local text span that a single citation tag refers to.
 *
 * The narrative in front of the tag is preferred. When the same sentence
 * holds several citations, a later citation starts right after the previous
 * tag instead of at the beginning of the sentence.
 *
 * @param string $content  Full text being scanned.
 * @param int    $tagStart Byte offset where the citation tag begins.
 * @param int    $tagEnd   Byte offset just past the citation tag.
 * @param array  $tagSpans Spans of every citation tag; each entry carries
 *                         'start' and 'end' byte offsets (plus 'index').
 * @return array [$localStart, $localEnd, $originalText]
 */
private function extractLocalCitationContext($content, $tagStart, $tagEnd, array $tagSpans)
{
    $sentenceStart = $this->findSentenceStart($content, $tagStart);
    $sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd);

    // Tighten the sentence bounds to the nearest neighbouring tags:
    // $leftEdge  = end of the closest tag that finishes before this one,
    // $rightEdge = start of the closest tag that begins after this one.
    // Both stay at the sentence bounds when no such neighbour exists.
    $leftEdge = $sentenceStart;
    $rightEdge = $sentenceEnd;
    foreach ($tagSpans as $other) {
        $otherEnd = $other['end'];
        if ($otherEnd > $leftEdge && $otherEnd <= $tagStart) {
            $leftEdge = $otherEnd;
        }

        $otherStart = $other['start'];
        if ($otherStart < $rightEdge && $otherStart > $tagEnd) {
            $rightEdge = $otherStart;
        }
    }

    // Default: the statement in front of the tag, e.g. for
    // "... higher than ... Yin et al. [13]" the text up to "Yin et al.".
    // A follow-up citation in the same sentence starts after the previous tag;
    // the first citation starts at the beginning of the sentence.
    $localStart = $leftEdge;
    $localEnd = $tagStart;
    $originalText = $this->buildCitationContextText($content, $localStart, $localEnd);

    // Hardly any text in front of the tag: try the text that follows the tag,
    // up to the next citation or the end of the sentence.
    $preferTrailing = !$this->isMeaningfulCitationContext($originalText)
        || $this->shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd);
    if ($preferTrailing) {
        $trailingText = $this->buildCitationContextText($content, $tagEnd, $rightEdge);
        if ($this->isMeaningfulCitationContext($trailingText)) {
            $localStart = $tagEnd;
            $localEnd = $rightEdge;
            $originalText = $trailingText;
        }
    }

    // Last resort: let widenCitationContextBounds() pick new bounds
    // (helper not visible here; presumably it enlarges the window).
    if (!$this->isMeaningfulCitationContext($originalText)) {
        list($localStart, $localEnd) = $this->widenCitationContextBounds(
            $content,
            $tagStart,
            $tagEnd,
            $localStart,
            $localEnd
        );
        $originalText = $this->buildCitationContextText($content, $localStart, $localEnd);
    }

    return [$localStart, $localEnd, $originalText];
}
|
||||||
|
|
||||||
|
/**
 * Decide whether the text after a citation tag should be used as its context.
 *
 * Returns true when the cleaned text in front of the tag is not a meaningful
 * context, or when it is shorter than 25 characters (e.g. only an author
 * abbreviation precedes the tag).
 *
 * @param string $content    Full text being scanned.
 * @param int    $localStart Byte offset where the leading context begins.
 * @param int    $tagStart   Byte offset where the citation tag begins.
 * @param int    $tagEnd     Unused; kept so the signature stays stable for callers.
 * @return bool
 */
private function shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)
{
    $before = $this->buildCitationContextText($content, $localStart, $tagStart);
    if (!$this->isMeaningfulCitationContext($before)) {
        return true;
    }

    // Count characters explicitly as UTF-8: the text is cut as UTF-8 elsewhere,
    // so the length must not depend on the ambient mb_internal_encoding().
    return mb_strlen($before, 'UTF-8') < 25;
}
|
||||||
|
|
||||||
public function expandReferenceNumbers($refStr)
|
public function expandReferenceNumbers($refStr)
|
||||||
{
|
{
|
||||||
$refStr = str_replace(
|
$refStr = str_replace(
|
||||||
@@ -703,12 +770,43 @@ class ReferenceCheckService
|
|||||||
return array_values(array_unique($numbers));
|
return array_values(array_unique($numbers));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Return the byte offset just past the UTF-8 code point located at $bytePos,
 * i.e. the offset where the next character starts.
 *
 * Offsets outside the string yield $bytePos + 1 clamped into [0, strlen($content)].
 *
 * @param string $content
 * @param int    $bytePos Byte offset of the character to step over.
 * @return int
 */
private function utf8CharEnd($content, $bytePos)
{
    $byteCount = strlen($content);
    $cursor = $bytePos + 1;

    $insideString = ($bytePos >= 0 && $bytePos < $byteCount);
    if (!$insideString) {
        return max(0, min($byteCount, $cursor));
    }

    // Step over continuation bytes (bit pattern 10xxxxxx).
    for (; $cursor < $byteCount; $cursor++) {
        $isContinuation = (ord($content[$cursor]) & 0xC0) === 0x80;
        if (!$isContinuation) {
            break;
        }
    }

    return $cursor;
}
|
||||||
|
|
||||||
|
/**
 * Slice $content by byte offsets [$start, $end), the same units that
 * strpos()/strlen() use.
 *
 * mb_substr() must not be used here: it counts characters, so with multibyte
 * text (e.g. a Chinese prefix) byte offsets would land on the wrong position
 * and cut off the head of the following word.
 *
 * Both offsets are clamped into [0, strlen($content)]. Without the clamp a
 * negative $start would make mb_strcut() count from the end of the string
 * and silently return an unrelated slice.
 *
 * @param string $content
 * @param int    $start Inclusive start byte offset.
 * @param int    $end   Exclusive end byte offset.
 * @return string Empty string when the clamped range is empty.
 */
private function byteSubstr($content, $start, $end)
{
    $content = (string)$content;
    $total = strlen($content);

    $start = max(0, min($total, (int)$start));
    $end = max($start, min($total, (int)$end));
    if ($end === $start) {
        return '';
    }

    // mb_strcut() works in bytes but never splits a multibyte character.
    return (string)mb_strcut($content, $start, $end - $start, 'UTF-8');
}
|
||||||
|
|
||||||
/**
 * Build the plain-text context for the byte range [$start, $end) of $content.
 *
 * Citation markers of the form <blue>[..]</blue> and any other tags are
 * removed, a leading UTF-8 BOM is dropped, and whitespace runs are collapsed
 * to a single space.
 *
 * @param string $content
 * @param int    $start Inclusive start byte offset.
 * @param int    $end   Exclusive end byte offset.
 * @return string
 */
private function buildCitationContextText($content, $start, $end)
{
    $text = $this->byteSubstr($content, $start, $end);
    $text = preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $text);
    $text = trim(strip_tags((string)$text));

    // Remove a leading BOM as a whole 3-byte sequence. ltrim() with
    // "\xEF\xBB\xBF" would treat the argument as a list of single bytes and
    // tear the lead byte 0xEF off any text starting with a U+F000..U+FFFF
    // character (e.g. a full-width parenthesis), leaving invalid UTF-8.
    $bom = "\xEF\xBB\xBF";
    while (strncmp($text, $bom, 3) === 0) {
        // Trim again: whitespace behind the BOM was shielded from trim() above.
        $text = ltrim((string)substr($text, 3));
    }

    // With the /u modifier preg_replace() returns null on invalid UTF-8;
    // fall back to a byte-wise pattern instead of losing the text.
    $collapsed = preg_replace('/\s+/u', ' ', $text);
    if ($collapsed === null) {
        $collapsed = preg_replace('/\s+/', ' ', $text);
    }

    return (string)$collapsed;
}
|
||||||
@@ -768,7 +866,7 @@ class ReferenceCheckService
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 句号是否可作为句界(排除 0.95、3.14 等小数点)
|
* 句号是否可作为句界(排除小数点、et al. 等缩写)
|
||||||
*/
|
*/
|
||||||
private function isSentenceDelimiterAt($content, $pos, $delimiter)
|
private function isSentenceDelimiterAt($content, $pos, $delimiter)
|
||||||
{
|
{
|
||||||
@@ -783,6 +881,16 @@ class ReferenceCheckService
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$before = substr($content, max(0, $pos - 12), min(12, $pos));
|
||||||
|
if (preg_match('/\b(et\s+al|e\.g|i\.e|vs|etc|fig|no)\s*\.?\s*$/i', $before)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
$after = substr($content, $pos + 1, 24);
|
||||||
|
if (preg_match('/^\s*<blue>\s*\[/', $after)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -792,7 +900,7 @@ class ReferenceCheckService
|
|||||||
foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
|
foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
|
||||||
$pos = strrpos(substr($content, 0, $position), $delimiter);
|
$pos = strrpos(substr($content, 0, $position), $delimiter);
|
||||||
if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
|
if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
|
||||||
$start = max($start, $pos + 1);
|
$start = max($start, $this->utf8CharEnd($content, $pos));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return $start;
|
return $start;
|
||||||
@@ -812,7 +920,7 @@ class ReferenceCheckService
|
|||||||
foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
|
foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
|
||||||
$pos = strpos($content, $delimiter, $minPos);
|
$pos = strpos($content, $delimiter, $minPos);
|
||||||
if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
|
if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
|
||||||
$endPositions[] = $pos + 1;
|
$endPositions[] = $this->utf8CharEnd($content, $pos);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (empty($endPositions)) {
|
if (empty($endPositions)) {
|
||||||
|
|||||||
Reference in New Issue
Block a user