Changes
This commit is contained in:
@@ -75,7 +75,10 @@ class ReferenceCheckService
|
||||
// ->find();
|
||||
$citations = $this->extractReferences((string)$main['content']);
|
||||
// return $citations;
|
||||
|
||||
if (empty($citations)) {
|
||||
$this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
|
||||
return;
|
||||
}
|
||||
$prod = Db::name('production_article')
|
||||
->where('article_id', $main['article_id'])
|
||||
->where('state', 0)
|
||||
@@ -128,15 +131,10 @@ class ReferenceCheckService
|
||||
|
||||
$this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
|
||||
}
|
||||
/**
|
||||
* 按 article_id 扫描 t_article_main,为每个 blue 引用 × 文献号入队
|
||||
*/
|
||||
public function enqueueByArticle($articleId, $clearPrevious = true)
|
||||
{
|
||||
public function enqueueByArticle($articleId){
|
||||
if ($articleId <= 0) {
|
||||
throw new \InvalidArgumentException('article_id is required');
|
||||
}
|
||||
|
||||
$prod = Db::name('production_article')
|
||||
->where('article_id', $articleId)
|
||||
->where('state', 0)
|
||||
@@ -144,25 +142,18 @@ class ReferenceCheckService
|
||||
if (empty($prod)) {
|
||||
throw new \RuntimeException('production_article not found for article_id=' . $articleId);
|
||||
}
|
||||
|
||||
$pArticleId = intval($prod['p_article_id']);
|
||||
$referMap = $this->loadReferMapByPArticleId($pArticleId);
|
||||
|
||||
$mains = Db::name('article_main')
|
||||
->field('am_id,content')
|
||||
->field('am_id,content,article_id')
|
||||
->where('article_id', $articleId)
|
||||
->whereIn('state', [0, 2])
|
||||
->order('sort asc')
|
||||
->select();
|
||||
|
||||
if (empty($mains)) {
|
||||
throw new \RuntimeException('article_main is empty');
|
||||
}
|
||||
|
||||
if ($clearPrevious) {
|
||||
$this->clearArticleChecks($articleId);
|
||||
}
|
||||
|
||||
$queued = 0;
|
||||
$skipped = 0;
|
||||
$checkIds = [];
|
||||
@@ -189,20 +180,16 @@ class ReferenceCheckService
|
||||
$now = date('Y-m-d H:i:s');
|
||||
// [70-73] 展开为 reference_no=70,71,72,73 共 4 条记录
|
||||
$checkId = Db::name('article_reference_check_result')->insertGetId([
|
||||
'article_id' => $articleId,
|
||||
'am_id' => intval($main['am_id']),
|
||||
'article_id' => $main['article_id'],
|
||||
'p_article_id' => $pArticleId,
|
||||
'p_refer_id' => intval($refer['p_refer_id']),
|
||||
'refer_index' => $referIndex,
|
||||
'am_id' => intval($main['am_id']),
|
||||
'reference_no' => $refNo,
|
||||
'reference_raw' => $cite['reference_raw'],
|
||||
'cite_tag_start' => intval($cite['reference_start']),
|
||||
'cite_tag_end' => intval($cite['reference_end']),
|
||||
'text_start' => intval($cite['text_start']),
|
||||
'text_end' => intval($cite['text_end']),
|
||||
'content_a' => $cite['original_text'],
|
||||
'content_b' => $referText,
|
||||
'status' => 0,
|
||||
'refer_index' => $refNo,
|
||||
'origin_text' => $cite['original_text'],
|
||||
'refer_text' => $referText,
|
||||
'p_refer_id' => $referMap[$referIndex]['p_refer_id'],
|
||||
'text_start' => $cite['text_start'],
|
||||
'text_end' => $cite['text_end'],
|
||||
'created_at' => $now,
|
||||
'updated_at' => $now,
|
||||
]);
|
||||
@@ -658,12 +645,21 @@ class ReferenceCheckService
|
||||
$referenceNumbers = $this->expandReferenceNumbers($rawRef);
|
||||
|
||||
$sentenceStart = $this->findSentenceStart($content, $tagStart);
|
||||
$sentenceEnd = $this->findSentenceEnd($content, $tagEnd);
|
||||
$originalText = mb_substr($content, $sentenceStart, $sentenceEnd - $sentenceStart);
|
||||
$originalText = preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $originalText);
|
||||
$originalText = trim(strip_tags($originalText));
|
||||
$sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd);
|
||||
$originalText = $this->buildCitationContextText($content, $sentenceStart, $sentenceEnd);
|
||||
|
||||
if ($originalText === '' || empty($referenceNumbers)) {
|
||||
if (!$this->isMeaningfulCitationContext($originalText)) {
|
||||
list($sentenceStart, $sentenceEnd) = $this->widenCitationContextBounds(
|
||||
$content,
|
||||
$tagStart,
|
||||
$tagEnd,
|
||||
$sentenceStart,
|
||||
$sentenceEnd
|
||||
);
|
||||
$originalText = $this->buildCitationContextText($content, $sentenceStart, $sentenceEnd);
|
||||
}
|
||||
|
||||
if (!$this->isMeaningfulCitationContext($originalText) || empty($referenceNumbers)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -707,29 +703,137 @@ class ReferenceCheckService
|
||||
return array_values(array_unique($numbers));
|
||||
}
|
||||
|
||||
/**
 * Builds the plain-text context of a citation from a byte range of $content.
 *
 * The range is cut out, <blue>[...]</blue> citation markers are removed, any
 * remaining markup is stripped and every run of whitespace is collapsed to a
 * single space.
 *
 * @param string $content Raw content; treated as UTF-8 (the whitespace pattern uses /u).
 * @param int    $start   Byte offset of the first byte of the range.
 * @param int    $end     Byte offset one past the last byte of the range.
 * @return string Cleaned text; '' when the range is empty or the text cannot be normalised.
 */
private function buildCitationContextText($content, $start, $end)
{
    $length = max(0, $end - $start);
    if ($length === 0) {
        return '';
    }

    // The sentence-boundary helpers of this class locate delimiters with
    // strpos()/strrpos()/strlen(), so $start and $end are BYTE offsets.
    // mb_substr() would read them as character offsets and return the wrong
    // slice for multibyte text. mb_strcut() takes byte offsets and never
    // splits a multibyte character at either edge.
    $text = (string)mb_strcut((string)$content, $start, $length, 'UTF-8');

    $text = (string)preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $text);
    $text = trim(strip_tags($text));

    // preg_replace() yields null on failure (e.g. malformed UTF-8 with /u);
    // report that as "no usable text" instead of leaking null to callers.
    $collapsed = preg_replace('/\s+/u', ' ', $text);

    return $collapsed === null ? '' : $collapsed;
}
|
||||
|
||||
/**
 * Decides whether a citation context is worth keeping.
 *
 * Rejects contexts that are empty, consist only of punctuation/symbols/
 * whitespace, contain no letter or number at all, or are shorter than two
 * characters (for example a lone "." left over once tags were removed).
 *
 * @param string $text Candidate context text.
 * @return bool True when the text is usable as context.
 */
private function isMeaningfulCitationContext($text)
{
    $candidate = trim($text);

    if ($candidate === '' || $this->isOnlyPunctuationOrSpace($candidate)) {
        return false;
    }

    // At least one Unicode letter or number must be present.
    $hasWordCharacter = (bool)preg_match('/[\p{L}\p{N}]/u', $candidate);

    return $hasWordCharacter && mb_strlen($candidate) >= 2;
}
|
||||
|
||||
/**
 * Tells whether $text is made up exclusively of whitespace, punctuation
 * (\p{P}) and symbol (\p{S}) characters.
 *
 * @param string $text Text to inspect.
 * @return bool True only for a non-empty string with no other characters.
 */
private function isOnlyPunctuationOrSpace($text)
{
    $matched = preg_match('/^[\s\p{P}\p{S}]+$/u', $text);

    return $matched === 1;
}
|
||||
|
||||
/**
 * Widens a too-short citation context by one sentence in each direction.
 *
 * The start is moved back to the beginning of the preceding sentence and the
 * end forward to the end of the following one. If the widened range exceeds
 * 2000 bytes (all offsets here are byte offsets, measured with strlen()), it
 * is replaced by a 2000-byte window positioned around the citation tag.
 *
 * @param string $content  Raw content.
 * @param int    $tagStart Byte offset where the citation tag starts.
 * @param int    $tagEnd   Byte offset where the citation tag ends.
 * @param int    $start    Current context start (byte offset).
 * @param int    $end      Current context end (byte offset).
 * @return int[] Two-element list: [new start, new end].
 */
private function widenCitationContextBounds($content, $tagStart, $tagEnd, $start, $end)
{
    $contentLength = strlen($content);
    $spanLimit = 2000;

    // Step back over the delimiter that closed the previous sentence so the
    // search finds the boundary before it.
    if ($start > 0) {
        $candidateStart = $this->findSentenceStart($content, max(0, $start - 1));
        $start = min($start, $candidateStart);
    }

    $candidateEnd = $this->findSentenceEnd($content, $end, $tagEnd);
    if ($candidateEnd > $end && $candidateEnd <= $contentLength) {
        $end = $candidateEnd;
    }

    // Too wide: fall back to a fixed-size window around the middle of the tag.
    if ($end - $start > $spanLimit) {
        $center = (int)floor(($tagStart + $tagEnd) / 2);
        $start = max(0, $center - (int)floor($spanLimit / 2));
        $end = min($contentLength, $start + $spanLimit);
    }

    return [$start, $end];
}
|
||||
|
||||
/**
 * Tells whether the delimiter found at $pos may act as a sentence boundary.
 *
 * Only "." is ever rejected: a period with a digit directly on both sides is
 * treated as a decimal point (0.95, 3.14) and not as the end of a sentence.
 *
 * @param string $content   Text being scanned.
 * @param int    $pos       Byte offset of the delimiter inside $content.
 * @param string $delimiter The delimiter that was matched at $pos.
 * @return bool False for a decimal point, true in every other case.
 */
private function isSentenceDelimiterAt($content, $pos, $delimiter)
{
    if ($delimiter !== '.') {
        return true;
    }

    $total = strlen($content);
    if ($pos < 0 || $pos >= $total) {
        return true;
    }

    // Both neighbours must exist before they can be inspected.
    $hasBothNeighbours = $pos > 0 && $pos + 1 < $total;
    $isDecimalPoint = $hasBothNeighbours
        && ctype_digit($content[$pos - 1])
        && ctype_digit($content[$pos + 1]);

    return !$isDecimalPoint;
}
|
||||
|
||||
/**
 * Finds the byte offset at which the sentence containing $position starts.
 *
 * Looks backwards from $position for the nearest sentence delimiter
 * (".", "。", "!", "?" or a newline) and returns the offset just after it.
 * A "." that is a decimal point (see isSentenceDelimiterAt()) is skipped and
 * the search continues further back, so "A. Value 0.95 here" still starts
 * after "A.".
 *
 * @param string $content  Text being scanned.
 * @param int    $position Byte offset; only the text before it is searched.
 * @return int Byte offset of the sentence start, 0 when no delimiter precedes $position.
 */
private function findSentenceStart($content, $position)
{
    $prefix = (string)substr($content, 0, $position);
    $start = 0;

    foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
        $haystack = $prefix;
        $pos = strrpos($haystack, $delimiter);

        // A rejected occurrence (decimal point) must not hide earlier, real
        // delimiters: keep walking backwards until one is accepted.
        while ($pos !== false && !$this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
            $haystack = (string)substr($haystack, 0, $pos);
            $pos = strrpos($haystack, $delimiter);
        }

        if ($pos !== false) {
            // Advance by the delimiter's byte length: "。" is multibyte, and
            // $pos + 1 would point into the middle of that character.
            $start = max($start, $pos + strlen($delimiter));
        }
    }

    return $start;
}
|
||||
|
||||
/**
 * Finds the byte offset just past the end of the sentence that follows $searchFrom.
 *
 * The nearest sentence delimiter (".", "。", "!", "?" or a newline) at or
 * after $searchFrom closes the sentence. A "." that is a decimal point (see
 * isSentenceDelimiterAt()) is skipped. When $tagEnd is given and nothing but
 * punctuation/whitespace lies between the citation tag and the delimiter
 * found (e.g. "...</blue>."), that delimiter is treated as belonging to the
 * citation and the search moves on to the next sentence end.
 *
 * @param string $content    Text being scanned.
 * @param int    $searchFrom Byte offset at which to start looking for the sentence end.
 * @param int    $tagEnd     Byte offset where the citation tag ends; used to skip a lone
 *                           delimiter directly after </blue>. 0 disables that check.
 * @return int Byte offset one past the closing delimiter, or strlen($content) when none is found.
 */
private function findSentenceEnd($content, $searchFrom, $tagEnd = 0)
{
    $length = strlen($content);
    $minPos = max(0, $searchFrom);

    while ($minPos < $length) {
        $end = null;

        foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
            $pos = strpos($content, $delimiter, $minPos);

            // Skip decimal points instead of giving up on this delimiter.
            while ($pos !== false && !$this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
                $pos = strpos($content, $delimiter, $pos + 1);
            }

            if ($pos === false) {
                continue;
            }

            // Advance by the delimiter's byte length so a multibyte "。" is
            // included whole rather than cut after its first byte.
            $candidate = $pos + strlen($delimiter);
            if ($end === null || $candidate < $end) {
                $end = $candidate;
            }
        }

        if ($end === null) {
            return $length;
        }

        if ($tagEnd <= 0 || $end <= $tagEnd) {
            return $end;
        }

        // Anything meaningful between the tag and the delimiter? If so, the
        // delimiter really closes the sentence.
        $gap = substr($content, $tagEnd, $end - $tagEnd);
        $gapText = trim(strip_tags((string)preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $gap)));
        if ($gapText !== '' && !$this->isOnlyPunctuationOrSpace($gapText)) {
            return $end;
        }

        // Only punctuation followed the tag: look for the next sentence end.
        $minPos = $end;
    }

    return $length;
}
|
||||
|
||||
private function pushJob($checkId, $delaySeconds = 0)
|
||||
|
||||
Reference in New Issue
Block a user