文章引用文献校验

This commit is contained in:
wyn
2026-05-21 10:02:05 +08:00
parent fa878334cd
commit 4aab7f5b7e
4 changed files with 1790 additions and 0 deletions

View File

@@ -0,0 +1,751 @@
<?php
namespace app\common;
use think\Db;
use think\Queue;
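/**
 * Checks whether each in-text citation <blue>[n]</blue> is relevant to the matching
 * t_production_article_refer row (refer index + 1 = n).
 * The LLM configuration is the same as PromotionLlmService; each single check runs as a job on the ReferenceCheck queue.
 */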
class ReferenceCheckService
{
const QUEUE_NAME = 'ReferenceCheck';
/** t_article_main.ref_check_status */
const AM_STATUS_NONE = 0;
const AM_STATUS_PASS = 1;
const AM_STATUS_FAIL = 2;
const AM_STATUS_RUNNING = 3;
/**
 * Array lookup with a fallback, kept for PHP versions that lack the ?? operator.
 *
 * @param array      $arr     Array to read from.
 * @param int|string $key     Key to look up.
 * @param mixed      $default Returned when the key is absent or its value is null.
 * @return mixed
 */
private function arrGet($arr, $key, $default = '')
{
    if (!isset($arr[$key])) {
        return $default;
    }

    return $arr[$key];
}
/**
 * Queues a single check from manually supplied citing text and reference text.
 *
 * Inserts one article_reference_check_result row with status 0, marks the section
 * (when $extra['am_id'] > 0) as running, then pushes the queue job.
 *
 * @param string $contentA Citing text; must not be blank after trimming.
 * @param string $contentB Reference text.
 * @param array  $extra    Optional column values (article_id, am_id, p_article_id, p_refer_id,
 *                         refer_index, reference_no, reference_raw, cite_tag_start, cite_tag_end,
 *                         text_start, text_end) plus queue_delay in seconds.
 * @return array ['check_id' => inserted id, 'queued' => 1]
 * @throws \InvalidArgumentException when $contentA is blank
 */
public function enqueue($contentA, $contentB, array $extra = [])
{
    $contentA = trim($contentA);
    if ($contentA === '') {
        throw new \InvalidArgumentException('content_a is required');
    }

    $timestamp = date('Y-m-d H:i:s');

    // Integer columns are copied from $extra with a 0 fallback.
    $record = [];
    foreach (['article_id', 'am_id', 'p_article_id', 'p_refer_id', 'refer_index', 'reference_no'] as $column) {
        $record[$column] = intval($this->arrGet($extra, $column, 0));
    }
    $record['reference_raw'] = (string)$this->arrGet($extra, 'reference_raw', '');
    foreach (['cite_tag_start', 'cite_tag_end', 'text_start', 'text_end'] as $column) {
        $record[$column] = intval($this->arrGet($extra, $column, 0));
    }
    $record['content_a'] = $contentA;
    $record['content_b'] = trim($contentB);
    $record['status'] = 0;
    $record['created_at'] = $timestamp;
    $record['updated_at'] = $timestamp;

    $checkId = Db::name('article_reference_check_result')->insertGetId($record);

    // Flag the section as running before the job becomes visible to a worker.
    $sectionId = intval($this->arrGet($extra, 'am_id', 0));
    if ($sectionId > 0) {
        $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
    }

    $delaySeconds = intval($this->arrGet($extra, 'queue_delay', 0));
    $this->pushJob(intval($checkId), $delaySeconds);

    return ['check_id' => $checkId, 'queued' => 1];
}
/**
 * Queues one check per cited reference number found in a single article_main row.
 *
 * A citation such as [70-73] expands to four rows (reference_no 70, 71, 72, 73).
 * Reference numbers without a matching production_article_refer row are skipped.
 *
 * Section status handling:
 *  - no citation in the content            -> AM_STATUS_PASS
 *  - at least one job queued               -> AM_STATUS_RUNNING (written before the first push,
 *                                             as enqueue() does)
 *  - citations found but none could queue  -> recomputed from existing rows through
 *                                             syncAmRefCheckStatus(), so the section is not left
 *                                             in RUNNING with no job that could ever finish it
 *
 * @param array $main Row carrying am_id, article_id and content.
 * @return void
 * @throws \RuntimeException when no production_article row with state=0 exists for the article
 * @throws \Exception rethrown by pushJob() when a job cannot be queued
 */
public function enqueueByArticleMain($main)
{
    $amId = intval($main['am_id']);
    $citations = $this->extractReferences((string)$main['content']);

    $prod = Db::name('production_article')
        ->where('article_id', $main['article_id'])
        ->where('state', 0)
        ->find();
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found for article_id=' . $main['article_id']);
    }

    if (empty($citations)) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_PASS);
        return;
    }

    // Only needed once we know there is something to check.
    $pArticleId = intval($prod['p_article_id']);
    $referMap = $this->loadReferMapByPArticleId($pArticleId);

    $queued = 0;
    foreach ($citations as $cite) {
        foreach ($cite['reference_numbers'] as $refNo) {
            // Reference number n maps to refer index n - 1.
            $referIndex = $refNo - 1;
            if ($referIndex < 0 || !isset($referMap[$referIndex])) {
                continue;
            }
            $refer = $referMap[$referIndex];
            $referText = $this->formatReferForLlm($refer);
            $now = date('Y-m-d H:i:s');

            // NOTE(review): enqueueByArticle() stores refer_index = $refNo - 1 and uses the
            // content_a / content_b / reference_raw columns; this method stores refer_index = $refNo
            // and uses origin_text / refer_text. Left unchanged here - confirm against the schema.
            $checkId = Db::name('article_reference_check_result')->insertGetId([
                'article_id' => $main['article_id'],
                'p_article_id' => $pArticleId,
                'am_id' => $amId,
                'reference_no' => $refNo,
                'refer_index' => $refNo,
                'origin_text' => $cite['original_text'],
                'refer_text' => $referText,
                'p_refer_id' => $refer['p_refer_id'],
                'text_start' => $cite['text_start'],
                'text_end' => $cite['text_end'],
                'created_at' => $now,
                'updated_at' => $now,
            ]);

            if ($queued === 0) {
                // Mark running before the first job is visible to a worker, so a fast worker
                // cannot have its final status overwritten by a late RUNNING write.
                $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
            }

            // Jobs are staggered by one second each: the n-th job gets a delay of n seconds.
            $this->pushJob(intval($checkId), $queued);
            $queued++;
        }
    }

    if ($queued === 0) {
        $this->syncAmRefCheckStatus($amId);
    }
}
/**
 * Scans every article_main section (state 0 or 2) of an article and queues one check
 * per blue citation x reference number.
 *
 * A citation such as [70-73] expands to four rows (reference_no 70, 71, 72, 73).
 * Sections without any citation are set to AM_STATUS_NONE; sections that received at
 * least one job are set to AM_STATUS_RUNNING after all jobs were pushed.
 *
 * @param int  $articleId     Article id; must be positive.
 * @param bool $clearPrevious When true, existing check rows of the article are deleted first.
 * @return array Summary with keys article_id, p_article_id, queued, skipped, check_ids, queue.
 * @throws \InvalidArgumentException when $articleId is not positive
 * @throws \RuntimeException when the production_article row or the article_main rows are missing
 */
public function enqueueByArticle($articleId, $clearPrevious = true)
{
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }

    $production = Db::name('production_article')
        ->where('article_id', $articleId)
        ->where('state', 0)
        ->find();
    if (empty($production)) {
        throw new \RuntimeException('production_article not found for article_id=' . $articleId);
    }
    $pArticleId = intval($production['p_article_id']);
    $referMap = $this->loadReferMapByPArticleId($pArticleId);

    $sections = Db::name('article_main')
        ->field('am_id,content')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->order('sort asc')
        ->select();
    if (empty($sections)) {
        throw new \RuntimeException('article_main is empty');
    }

    if ($clearPrevious) {
        $this->clearArticleChecks($articleId);
    }

    $summary = [
        'article_id' => $articleId,
        'p_article_id' => $pArticleId,
        'queued' => 0,
        'skipped' => 0,
        'check_ids' => [],
        'queue' => self::QUEUE_NAME,
    ];
    $sectionsWithJobs = [];

    foreach ($sections as $section) {
        $amId = intval($section['am_id']);
        $citations = $this->extractReferences((string)$section['content']);
        if (empty($citations)) {
            $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
            continue;
        }

        foreach ($citations as $citation) {
            foreach ($citation['reference_numbers'] as $refNo) {
                // Reference number n maps to refer index n - 1.
                $referIndex = $refNo - 1;
                $isKnown = $referIndex >= 0 && isset($referMap[$referIndex]);
                if (!$isKnown) {
                    $summary['skipped']++;
                    continue;
                }

                $refer = $referMap[$referIndex];
                $referText = $this->formatReferForLlm($refer);
                $timestamp = date('Y-m-d H:i:s');

                $checkId = Db::name('article_reference_check_result')->insertGetId([
                    'article_id' => $articleId,
                    'am_id' => $amId,
                    'p_article_id' => $pArticleId,
                    'p_refer_id' => intval($refer['p_refer_id']),
                    'refer_index' => $referIndex,
                    'reference_no' => $refNo,
                    'reference_raw' => $citation['reference_raw'],
                    'cite_tag_start' => intval($citation['reference_start']),
                    'cite_tag_end' => intval($citation['reference_end']),
                    'text_start' => intval($citation['text_start']),
                    'text_end' => intval($citation['text_end']),
                    'content_a' => $citation['original_text'],
                    'content_b' => $referText,
                    'status' => 0,
                    'created_at' => $timestamp,
                    'updated_at' => $timestamp,
                ]);

                // The delay grows by one second per queued job, so it equals the queued count.
                $this->pushJob(intval($checkId), $summary['queued']);
                $summary['check_ids'][] = $checkId;
                $summary['queued']++;
                $sectionsWithJobs[$amId] = true;
            }
        }
    }

    foreach (array_keys($sectionsWithJobs) as $runningAmId) {
        $this->setAmRefCheckStatus($runningAmId, self::AM_STATUS_RUNNING);
    }

    return $summary;
}
/**
 * Recomputes t_article_main.ref_check_status for one section from all of its check rows.
 *
 * Rules, in order:
 *  - no rows                                   -> AM_STATUS_NONE
 *  - any row with status 0 (pending)           -> AM_STATUS_RUNNING
 *  - any row with status 2, or status 1 with
 *    is_match = 0                              -> AM_STATUS_FAIL
 *  - every row has status 1                    -> AM_STATUS_PASS
 *  - anything else (unknown row status)        -> AM_STATUS_FAIL
 *
 * Only the status and is_match columns are fetched; the other columns are not needed here.
 *
 * @param int $amId Section id; values <= 0 return AM_STATUS_NONE without touching the database.
 * @return int The status that was written (one of the AM_STATUS_* constants).
 */
public function syncAmRefCheckStatus($amId)
{
    if ($amId <= 0) {
        return self::AM_STATUS_NONE;
    }

    $rows = Db::name('article_reference_check_result')
        ->field('status,is_match')
        ->where('am_id', $amId)
        ->select();
    if (empty($rows)) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
        return self::AM_STATUS_NONE;
    }

    $pending = 0;
    $done = 0;
    $hasFail = false;
    foreach ($rows as $row) {
        $st = intval($row['status']);
        if ($st === 0) {
            $pending++;
            continue;
        }
        if ($st === 2 || ($st === 1 && intval($row['is_match']) === 0)) {
            $hasFail = true;
        }
        if ($st === 1) {
            $done++;
        }
    }

    if ($pending > 0) {
        $status = self::AM_STATUS_RUNNING;
    } elseif (!$hasFail && $done === count($rows)) {
        $status = self::AM_STATUS_PASS;
    } else {
        $status = self::AM_STATUS_FAIL;
    }

    $this->setAmRefCheckStatus($amId, $status);
    return $status;
}
/**
 * Writes ref_check_status on one article_main row.
 *
 * @param int $amId   Section id; nothing is written when it is <= 0.
 * @param int $status Value to store (callers in this class pass AM_STATUS_* constants).
 * @return void
 */
public function setAmRefCheckStatus($amId, $status)
{
    if ($amId > 0) {
        $changes = ['ref_check_status' => $status];
        Db::name('article_main')->where('am_id', $amId)->update($changes);
    }
}
/**
 * Deletes every check row of an article and resets ref_check_status to AM_STATUS_NONE
 * on its article_main rows whose state is 0 or 2.
 *
 * @param int $articleId
 * @return void
 */
public function clearArticleChecks($articleId)
{
    $results = Db::name('article_reference_check_result');
    $results->where('article_id', $articleId)->delete();

    $reset = ['ref_check_status' => self::AM_STATUS_NONE];
    Db::name('article_main')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->update($reset);
}
/**
 * Maps an AM_STATUS_* value to its text label.
 *
 * @param int $status
 * @return string 'none', 'pass', 'fail', 'running', or 'unknown' for any other value.
 */
public static function amStatusLabel($status)
{
    $labels = [];
    $labels[self::AM_STATUS_NONE] = 'none';
    $labels[self::AM_STATUS_PASS] = 'pass';
    $labels[self::AM_STATUS_FAIL] = 'fail';
    $labels[self::AM_STATUS_RUNNING] = 'running';

    if (isset($labels[$status])) {
        return $labels[$status];
    }

    return 'unknown';
}
/**
 * Fetches one check row by its check_id.
 *
 * @param int $checkId
 * @return array|null The row, or null when $checkId <= 0 or no row was found.
 */
public function getResult($checkId)
{
    if ($checkId <= 0) {
        return null;
    }

    $found = Db::name('article_reference_check_result')
        ->where('check_id', $checkId)
        ->find();
    if (!$found) {
        return null;
    }

    return $found;
}
/**
 * Lists the check rows of an article, ordered by am_id, cite_tag_start, reference_no.
 *
 * @param int  $articleId
 * @param int  $status       When >= 0, only rows with this status are returned.
 * @param bool $onlyMismatch When true, additionally restricts to status = 1 and is_match = 0.
 * @return array|mixed Whatever the query builder's select() returns.
 */
public function listByArticle($articleId, $status = -1, $onlyMismatch = false)
{
    $query = Db::name('article_reference_check_result')->where('article_id', $articleId);

    if ($status >= 0) {
        $query->where('status', $status);
    }

    if ($onlyMismatch) {
        $query->where('status', 1);
        $query->where('is_match', 0);
    }

    $query->order('am_id asc, cite_tag_start asc, reference_no asc');

    return $query->select();
}
/**
 * Builds the manuscript preview: section content with mismatched reference numbers and
 * their citing sentences marked up, plus a flat issue list and overall counters.
 *
 * The check rows of the article are fetched once and used for both the mismatch index
 * and the counters (indexBadResults() itself ignores rows that are not status 1 / mismatch).
 * The counters always cover the whole article, even when $amId narrows the sections.
 *
 * @param int $articleId
 * @param int $amId When > 0, only this section is rendered.
 * @return array Keys: article_id, article_ref_check_pass (true|false|null), sections, issues, stats.
 */
public function buildArticlePreview($articleId, $amId = 0)
{
    $q = Db::name('article_main')
        ->field('am_id,content,sort,ref_check_status')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2]);
    if ($amId > 0) {
        $q->where('am_id', $amId);
    }
    $mains = $q->order('sort asc')->select();

    // Single fetch of every check row for the article.
    $rows = $this->listByArticle($articleId, -1);
    $badByAm = $this->indexBadResults($rows);

    $stats = ['total' => 0, 'mismatch' => 0, 'match' => 0, 'pending' => 0];
    foreach ($rows as $r) {
        $stats['total']++;
        if (intval($r['status']) === 0) {
            $stats['pending']++;
        } elseif (intval($r['is_match']) === 1) {
            $stats['match']++;
        } else {
            // Every non-pending row that is not a match is counted here, including status 2 rows.
            $stats['mismatch']++;
        }
    }

    $sections = [];
    $issues = [];
    foreach ($mains as $main) {
        $id = intval($main['am_id']);
        $content = (string)$main['content'];
        $badIndex = isset($badByAm[$id]) ? $badByAm[$id] : [];
        $marked = $this->markContentForPreview($content, $id, $badIndex);
        $amStatus = intval($this->arrGet($main, 'ref_check_status', 0));

        $sections[] = [
            'am_id' => $id,
            'ref_check_status' => $amStatus,
            'ref_check_pass' => $amStatus === self::AM_STATUS_PASS,
            'ref_check_label' => self::amStatusLabel($amStatus),
            'content' => $content,
            'content_marked' => $marked['html'],
            'issue_count' => $marked['issue_count'],
        ];
        foreach ($marked['issues'] as $issue) {
            $issues[] = $issue;
        }
    }

    return [
        'article_id' => $articleId,
        'article_ref_check_pass' => $this->resolveArticlePass($sections),
        'sections' => $sections,
        'issues' => $issues,
        'stats' => $stats,
    ];
}
/**
 * Decides whether the whole article passed: every checked section must be PASS.
 * Sections with status NONE (no citations) are ignored.
 *
 * @param array $sections Section entries carrying ref_check_status.
 * @return bool|null false as soon as a running/failed section is seen, true when all checked
 *                   sections passed, null when no section was checked at all.
 */
private function resolveArticlePass($sections)
{
    $checkedAny = false;

    foreach ($sections as $section) {
        $status = intval($this->arrGet($section, 'ref_check_status', 0));
        if ($status !== self::AM_STATUS_NONE) {
            if ($status !== self::AM_STATUS_PASS) {
                return false;
            }
            $checkedAny = true;
        }
    }

    if (!$checkedAny) {
        return null;
    }

    return true;
}
/**
 * Groups mismatch rows (status = 1 and is_match != 1) by section.
 *
 * Per section the result holds:
 *  - 'by_raw':   normalized reference_raw => [reference_no => row]
 *  - 'contexts': "textStart_textEnd" => merged info for one citing sentence
 *                (text_start, text_end, check_ids, reasons keyed by reference_no, ref_nos)
 *
 * @param array $rows Check rows; rows that are not finished mismatches are ignored.
 * @return array<int, array> am_id => index structure described above
 */
private function indexBadResults($rows)
{
    $grouped = [];

    foreach ($rows as $row) {
        $isMismatch = intval($row['status']) === 1 && intval($row['is_match']) !== 1;
        if (!$isMismatch) {
            continue;
        }

        $amId = intval($row['am_id']);
        $refNo = intval($row['reference_no']);
        if ($amId <= 0 || $refNo <= 0) {
            continue;
        }

        if (!isset($grouped[$amId])) {
            $grouped[$amId] = ['by_raw' => [], 'contexts' => []];
        }

        // Index by the bracket text so a rendered tag can find its bad numbers.
        $rawText = (string)$this->arrGet($row, 'reference_raw', '');
        $rawKey = $this->normalizeRefRawKey($rawText);
        if ($rawKey !== '') {
            $grouped[$amId]['by_raw'][$rawKey][$refNo] = $row;
        }

        // Rows sharing one sentence range are merged into a single context entry.
        $textStart = intval($row['text_start']);
        $textEnd = intval($row['text_end']);
        $contextKey = $textStart . '_' . $textEnd;
        if (!isset($grouped[$amId]['contexts'][$contextKey])) {
            $grouped[$amId]['contexts'][$contextKey] = [
                'text_start' => $textStart,
                'text_end' => $textEnd,
                'check_ids' => [],
                'reasons' => [],
                'ref_nos' => [],
            ];
        }
        $grouped[$amId]['contexts'][$contextKey]['check_ids'][] = intval($row['check_id']);
        $grouped[$amId]['contexts'][$contextKey]['ref_nos'][] = $refNo;

        $reason = trim((string)$this->arrGet($row, 'reason', ''));
        if ($reason !== '') {
            $grouped[$amId]['contexts'][$contextKey]['reasons'][$refNo] = $reason;
        }
    }

    return $grouped;
}
/**
 * Normalizes the text inside a citation bracket into a lookup key:
 * full-width comma -> ",", dash variants -> "-", whitespace removed, lower-cased.
 *
 * The non-ASCII search strings are written as UTF-8 byte escapes so they survive
 * tooling that strips or re-encodes full-width punctuation (an empty search string
 * would make str_replace() skip that entry silently).
 *
 * @param string $raw Bracket content, e.g. "70-73, 75".
 * @return string Normalized key, e.g. "70-73,75".
 */
private function normalizeRefRawKey($raw)
{
    $search = [
        "\xEF\xBC\x8C", // U+FF0C fullwidth comma
        "\xE2\x80\x93", // U+2013 en dash
        "\xE2\x80\x94", // U+2014 em dash
        "\xE2\x88\x92", // U+2212 minus sign
        "\xEF\xB9\xA3", // U+FE63 small hyphen-minus
        "\xEF\xBC\x8D", // U+FF0D fullwidth hyphen-minus
        ' ',
        "\t",
        "\n",
        "\r",
    ];
    $replace = [',', '-', '-', '-', '-', '-', '', '', '', ''];

    return strtolower(str_replace($search, $replace, trim($raw)));
}
/**
 * Produces the preview HTML for one section, highlighting mismatched citations.
 *
 * Pass 1 rewrites every <blue>[...]</blue> tag and wraps each reference number that has a
 * mismatch row in a span.ref-no-error. Pass 2 wraps each flagged sentence range
 * (text_start..text_end) in a span.ref-context-error.
 *
 * All positions are byte offsets into $content (preg offsets, strlen() and substr() are byte based).
 *
 * NOTE(review): mismatch rows are looked up by the normalized bracket text only, so two tags with
 * identical bracket text in the same section share the same mismatch rows - confirm this is intended.
 *
 * @param string $content  Section HTML.
 * @param int    $amId     Section id, copied into every issue entry.
 * @param array  $badIndex One section's entry from indexBadResults() ('by_raw' and 'contexts').
 * @return array ['html' => string, 'issues' => array, 'issue_count' => int]
 */
private function markContentForPreview($content, $amId, $badIndex)
{
$badByRaw = isset($badIndex['by_raw']) ? $badIndex['by_raw'] : array();
$contexts = isset($badIndex['contexts']) ? $badIndex['contexts'] : array();
$issues = array();
$issueCount = 0;
// Nothing to mark: return the content untouched.
if ($content === '' || (empty($badByRaw) && empty($contexts))) {
return array('html' => $content, 'issues' => array(), 'issue_count' => 0);
}
$html = $content;
// 1) Mark the individual numbers inside each blue tag first (offsets refer to the original
//    content; for a tag like [70-73] only the mismatched numbers present in the text are wrapped).
preg_match_all(
'/<blue>\[([\d,\-\s]+)\]<\/blue>/',
$html,
$matches,
PREG_OFFSET_CAPTURE
);
// Records where each tag started and by how many bytes its replacement grew.
$citeDeltas = [];
if (!empty($matches[0])) {
$replacements = [];
foreach ($matches[0] as $idx => $match) {
$fullTag = $match[0];
$tagStart = $match[1];
$tagEnd = $tagStart + strlen($fullTag);
$inner = $matches[1][$idx][0];
$rawKey = $this->normalizeRefRawKey($inner);
// reference_no => mismatch row for this bracket text (empty when the tag is fine).
$badNums = isset($badByRaw[$rawKey]) ? $badByRaw[$rawKey] : array();
// Wrap every digit run that is a known bad reference number; others are returned unchanged.
$innerMarked = preg_replace_callback(
'/\d+/',
function ($numMatch) use ($badNums, &$issues, &$issueCount, $amId, $inner) {
$num = intval($numMatch[0]);
if (!isset($badNums[$num])) {
return $numMatch[0];
}
$row = $badNums[$num];
$rowReason = isset($row['reason']) ? $row['reason'] : '';
$issueCount++;
$issues[] = array(
'am_id' => $amId,
'check_id' => intval($row['check_id']),
'reference_no' => $num,
'reference_raw' => $inner,
'reason' => $rowReason,
'confidence' => floatval(isset($row['confidence']) ? $row['confidence'] : 0),
);
// The reason text is escaped before it is placed into the title attribute.
$title = htmlspecialchars(
'引用[' . $num . ']不合理: ' . $rowReason,
ENT_QUOTES,
'UTF-8'
);
return '<span class="ref-no-error" data-check-id="' . intval($row['check_id'])
. '" data-ref-no="' . $num . '" title="' . $title . '">'
. $numMatch[0] . '</span>';
},
$inner
);
$tagClass = !empty($badNums) ? ' ref-cite-error' : '';
$groupIds = !empty($badNums)
? implode(',', array_map('intval', array_column($badNums, 'check_id')))
: '';
$newHtml = '<blue class="ref-cite-tag' . $tagClass . '" data-ref-raw="' . htmlspecialchars($inner, ENT_QUOTES, 'UTF-8')
. '" data-check-ids="' . $groupIds . '">[' . $innerMarked . ']</blue>';
$replacements[] = [
'start' => $tagStart,
'end' => $tagEnd,
'html' => $newHtml,
'delta' => strlen($newHtml) - ($tagEnd - $tagStart),
];
}
// Apply from the last tag to the first so earlier offsets stay valid while splicing.
usort($replacements, function ($a, $b) {
return $b['start'] - $a['start'];
});
foreach ($replacements as $rep) {
$html = substr($html, 0, $rep['start']) . $rep['html'] . substr($html, $rep['end']);
$citeDeltas[] = ['start' => $rep['start'], 'delta' => $rep['delta']];
}
}
// Translates an offset in the original content into the rewritten HTML by adding the growth
// of every tag that started before that offset.
$shiftByCite = function ($pos) use ($citeDeltas) {
$d = 0;
foreach ($citeDeltas as $cd) {
if ($cd['start'] < $pos) {
$d += $cd['delta'];
}
}
return $pos + $d;
};
// 2) Then wrap the citing sentences, processed from the end of the content backwards.
if (!empty($contexts)) {
$spans = array_values($contexts);
usort($spans, function ($a, $b) {
return $b['text_start'] - $a['text_start'];
});
foreach ($spans as $span) {
$start = $span['text_start'];
$end = $span['text_end'];
// Skip empty or inverted ranges.
if ($start < 0 || $end <= $start) {
continue;
}
$s = $shiftByCite($start);
$e = $shiftByCite($end);
// Clamp the end to the current HTML length.
if ($e > strlen($html)) {
$e = strlen($html);
}
$checkIds = array_values(array_unique($span['check_ids']));
$refNos = array_values(array_unique($span['ref_nos']));
sort($refNos);
// Collect "[n] reason" fragments for the tooltip, in ascending reference order.
$reasonParts = [];
foreach ($refNos as $rn) {
if (!empty($span['reasons'][$rn])) {
$reasonParts[] = '[' . $rn . '] ' . $span['reasons'][$rn];
}
}
$title = htmlspecialchars(
'引用句可能不合理: ' . implode('; ', $reasonParts),
ENT_QUOTES,
'UTF-8'
);
$open = '<span class="ref-context-error" data-check-ids="' . implode(',', $checkIds)
. '" data-ref-nos="' . implode(',', $refNos) . '" title="' . $title . '">';
$close = '</span>';
$html = substr($html, 0, $s) . $open . substr($html, $s, $e - $s) . $close . substr($html, $e);
}
}
return ['html' => $html, 'issues' => $issues, 'issue_count' => $issueCount];
}
/**
 * Loads the active (state = 0) reference rows of a production article, keyed by their index column.
 *
 * @param int $pArticleId
 * @return array<int, array> refer index => production_article_refer row; empty when $pArticleId <= 0
 */
public function loadReferMapByPArticleId($pArticleId)
{
    if ($pArticleId <= 0) {
        return [];
    }

    $referRows = Db::name('production_article_refer')
        ->where('p_article_id', $pArticleId)
        ->where('state', 0)
        ->order('index asc')
        ->select();

    $byIndex = [];
    foreach ($referRows as $referRow) {
        $position = intval($referRow['index']);
        $byIndex[$position] = $referRow;
    }

    return $byIndex;
}
/**
 * Renders a reference row as labelled lines for the LLM prompt.
 *
 * Non-empty fields are emitted in a fixed order as "Label: value", one per line,
 * followed by "Reference: <refer_content>" when that field is non-empty.
 *
 * @param array $refer production_article_refer row.
 * @return string Lines joined with "\n"; empty string when every field is blank.
 */
public function formatReferForLlm($refer)
{
    // Labels are the field names with their first letter upper-cased.
    $labels = [
        'title' => 'Title',
        'author' => 'Author',
        'joura' => 'Joura',
        'dateno' => 'Dateno',
        'refer_doi' => 'Refer_doi',
        'doilink' => 'Doilink',
    ];

    $lines = [];
    foreach ($labels as $field => $label) {
        $value = trim((string)$this->arrGet($refer, $field, ''));
        if ($value === '') {
            continue;
        }
        $lines[] = $label . ': ' . $value;
    }

    $body = trim((string)$this->arrGet($refer, 'refer_content', ''));
    if ($body !== '') {
        $lines[] = 'Reference: ' . $body;
    }

    return implode("\n", $lines);
}
/**
 * Extracts every <blue>[...]</blue> citation from article_main.content together with the
 * sentence that contains it.
 *
 * All positions in the result are byte offsets into $content: they come from
 * PREG_OFFSET_CAPTURE and from the byte-based sentence finders. The sentence is therefore cut
 * with mb_strcut(), which takes byte offsets and never splits a multi-byte character;
 * a character-based cut would return the wrong text for any non-ASCII content.
 *
 * @param string $content Section HTML.
 * @return array List of citations, each with reference_raw, reference_numbers, original_text
 *               (sentence without tags and without citation markers), reference_start,
 *               reference_end, text_start, text_end. Citations whose sentence text is empty or
 *               whose bracket yields no number are omitted.
 */
public function extractReferences($content)
{
    $result = [];
    preg_match_all('/<blue>\[([\d,\-\s]+)\]<\/blue>/', $content, $matches, PREG_OFFSET_CAPTURE);
    if (empty($matches[0])) {
        return [];
    }

    foreach ($matches[0] as $index => $match) {
        $fullTag = $match[0];
        $tagStart = $match[1];
        $tagEnd = $tagStart + strlen($fullTag);
        $rawRef = trim($matches[1][$index][0]);
        $referenceNumbers = $this->expandReferenceNumbers($rawRef);

        $sentenceStart = $this->findSentenceStart($content, $tagStart);
        $sentenceEnd = $this->findSentenceEnd($content, $tagEnd);

        // Byte-offset cut that stays on character boundaries.
        $originalText = mb_strcut($content, $sentenceStart, $sentenceEnd - $sentenceStart);
        // Drop all citation markers, then any remaining markup.
        $originalText = preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $originalText);
        $originalText = trim(strip_tags($originalText));

        if ($originalText === '' || empty($referenceNumbers)) {
            continue;
        }

        $result[] = [
            'reference_raw' => $rawRef,
            'reference_numbers' => $referenceNumbers,
            'original_text' => $originalText,
            'reference_start' => $tagStart,
            'reference_end' => $tagEnd,
            'text_start' => $sentenceStart,
            'text_end' => $sentenceEnd,
        ];
    }

    return $result;
}
/**
 * Expands the text inside a citation bracket into a list of reference numbers.
 *
 * "70-73, 75" becomes [70, 71, 72, 73, 75]. A full-width comma and common dash variants are
 * accepted as separators. Parts that are neither a number nor an ascending "a-b" range are
 * ignored, as are descending ranges.
 *
 * The non-ASCII search strings are written as UTF-8 byte escapes so they survive tooling that
 * strips or re-encodes full-width punctuation (an empty search string would make
 * str_replace() skip that entry silently).
 *
 * @param string $refStr Bracket content.
 * @return int[] Unique reference numbers in order of first appearance.
 */
public function expandReferenceNumbers($refStr)
{
    $search = [
        "\xEF\xBC\x8C", // U+FF0C fullwidth comma
        "\xE2\x80\x93", // U+2013 en dash
        "\xE2\x80\x94", // U+2014 em dash
        "\xE2\x88\x92", // U+2212 minus sign
        "\xEF\xB9\xA3", // U+FE63 small hyphen-minus
        "\xEF\xBC\x8D", // U+FF0D fullwidth hyphen-minus
    ];
    $replace = [',', '-', '-', '-', '-', '-'];
    $refStr = str_replace($search, $replace, trim($refStr));

    $numbers = [];
    foreach (explode(',', $refStr) as $part) {
        $part = trim($part);
        if ($part === '') {
            continue;
        }
        if (preg_match('/^(\d+)\s*-\s*(\d+)$/', $part, $m)) {
            $start = intval($m[1]);
            $end = intval($m[2]);
            if ($start <= $end) {
                $numbers = array_merge($numbers, range($start, $end));
            }
        } elseif (ctype_digit($part)) {
            $numbers[] = intval($part);
        }
    }

    return array_values(array_unique($numbers));
}
/**
 * Finds the byte offset at which the sentence containing $position begins: the position
 * right after the closest preceding sentence delimiter, or 0 when there is none.
 *
 * The delimiter's full byte length is skipped, so a multi-byte delimiter such as the
 * ideographic full stop never leaves the result in the middle of a character.
 *
 * @param string $content
 * @param int    $position Byte offset inside $content.
 * @return int Byte offset of the sentence start.
 */
private function findSentenceStart($content, $position)
{
    // Only the text before $position is searched; computed once for all delimiters.
    $prefix = substr($content, 0, $position);

    $start = 0;
    foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
        $pos = strrpos($prefix, $delimiter);
        if ($pos !== false) {
            $start = max($start, $pos + strlen($delimiter));
        }
    }

    return $start;
}
/**
 * Finds the byte offset at which the sentence containing $position ends: the position right
 * after the closest following sentence delimiter, or the content length when there is none.
 *
 * The delimiter's full byte length is included, so a multi-byte delimiter such as the
 * ideographic full stop never leaves the result in the middle of a character.
 *
 * @param string $content
 * @param int    $position Byte offset inside $content from which to search.
 * @return int Byte offset one past the end of the sentence.
 */
private function findSentenceEnd($content, $position)
{
    $end = strlen($content);
    foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
        $pos = strpos($content, $delimiter, $position);
        if ($pos !== false) {
            $end = min($end, $pos + strlen($delimiter));
        }
    }

    return $end;
}
/**
 * Pushes one check job onto the ReferenceCheck queue.
 *
 * The job id is written to the log instead of being dumped to the output stream, so
 * queueing never injects debug text into a response.
 *
 * @param int $checkId      Id of the article_reference_check_result row to process.
 * @param int $delaySeconds When > 0 the job is scheduled with Queue::later(), otherwise pushed immediately.
 * @return void
 * @throws \Exception the queue exception, rethrown after it has been logged
 */
private function pushJob($checkId, $delaySeconds = 0)
{
    $jobClass = 'app\api\job\ReferenceCheck@fire';
    $data = ['check_id' => $checkId];
    try {
        if ($delaySeconds > 0) {
            $jobId = Queue::later($delaySeconds, $jobClass, $data, self::QUEUE_NAME);
        } else {
            $jobId = Queue::push($jobClass, $data, self::QUEUE_NAME);
        }
        \think\Log::info('ReferenceCheck pushJob check_id=' . $checkId . ' jobId=' . $jobId);
    } catch (\Exception $e) {
        \think\Log::error('ReferenceCheck pushJob failed check_id=' . $checkId . ' ' . $e->getMessage());
        throw $e;
    }
}
}