Files
tougao/application/common/ReferenceCheckService.php
2026-05-26 17:33:34 +08:00

2579 lines
93 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use think\Db;
use think\Env;
use think\Queue;
use app\common\service\LLMService;
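/**
 * Relevance check between in-text <blue>[n]</blue> citations and the matching
 * t_production_article_refer row (index + 1 = n).
 * LLM configuration is the same as PromotionLlmService; each check runs as one job on the ReferenceCheck queue.
 */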
class ReferenceCheckService
{
const QUEUE_NAME = 'ReferenceCheck';

/** Values stored in t_article_main.ref_check_status. */
const AM_STATUS_NONE = 0;
const AM_STATUS_PASS = 1;
const AM_STATUS_FAIL = 2;
const AM_STATUS_RUNNING = 3;

/** Reference-check progress exposed per reference (detail rows grouped by reference_no). */
const PROGRESS_PENDING = 0;   // pending: every detail row in the group has status=0
const PROGRESS_CHECKING = 1;  // checking: some rows have finished, some are still 0
const PROGRESS_COMPLETED = 2; // completed: every detail row in the group has status=1
const PROGRESS_FAILED = 3;    // failed: all rows have finished and at least one has status=2

/** Whole-article reference-check state (used to choose between the "start" and "reset" actions). */
const ARTICLE_PROGRESS_NONE = 0;      // no check records exist yet
const ARTICLE_PROGRESS_RUNNING = 1;   // at least one row has status=0 (work still queued)
const ARTICLE_PROGRESS_COMPLETED = 2; // every row has status != 0 (all completed or failed)

/**
 * Externally visible state of a single check record (records[i].status returned by
 * getProgressByPArticleId).
 *
 * article_reference_check_result.status only stores 0/1/2 in the database.
 * RECORD_PROCESSING is a transient state derived from the Redis queue lock
 * (:status='processing') and is never persisted: while a worker is inside the LLM
 * call the DB status is still 0, so the queue lock is what identifies it.
 */
const RECORD_PENDING = 0;    // pending: enqueued but not yet picked up by a worker
const RECORD_COMPLETED = 1;  // check completed
const RECORD_FAILED = 2;     // check failed
const RECORD_PROCESSING = 3; // processing: a worker is running the LLM call (Redis :status='processing')

/** LLM confidence threshold; a score >= this value counts as "pass". */
const PASS_CONFIDENCE_THRESHOLD = 0.65;

/**
 * Character class allowed inside a <blue>[...]</blue> citation tag (used with the /u modifier).
 *
 * Besides ASCII digits, the ASCII comma, the ASCII hyphen and whitespace, these
 * typographic variants are accepted:
 *   U+FF0C FULLWIDTH COMMA
 *   U+2013 EN DASH
 *   U+2014 EM DASH
 *   U+2212 MINUS SIGN
 *   U+2010 HYPHEN
 *   U+2011 NON-BREAKING HYPHEN
 *
 * Without the dash variants a range citation written with e.g. an en dash would be
 * missed entirely by preg, and the reference_no check records for it would be lost.
 *
 * Every non-ASCII character is written as a \x{...} escape so the pattern does not
 * depend on look-alike literal characters surviving editors and copy/paste.
 */
const BLUE_TAG_REGEX = '/<blue>\[([\d,\x{FF0C}\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)\]<\/blue>/u';
/**
 * Null-coalescing lookup for PHP versions that lack the ?? operator.
 *
 * @param mixed $arr     Array (or array-like value) to read from.
 * @param mixed $key     Key to look up.
 * @param mixed $default Returned when the key is missing or its value is null.
 * @return mixed
 */
private function arrGet($arr, $key, $default = '')
{
    if (isset($arr[$key])) {
        return $arr[$key];
    }

    return $default;
}
/**
 * Enqueue a single check with caller-supplied citing text and reference text.
 *
 * Inserts one article_reference_check_result row with status 0, marks the owning
 * section as running when extra['am_id'] is positive, then pushes one queue job.
 *
 * @param string $contentA Citing text; trimmed, must not be empty.
 * @param string $contentB Reference text; trimmed before it is stored.
 * @param array  $extra    Optional ids/offsets copied into the row, plus 'queue_delay'
 *                         which is forwarded to pushJob().
 * @return array ['check_id' => inserted id, 'queued' => 1]
 * @throws \InvalidArgumentException when $contentA is empty after trimming.
 */
public function enqueue($contentA, $contentB, array $extra = [])
{
    $contentA = trim($contentA);
    if ($contentA === '') {
        throw new \InvalidArgumentException('content_a is required');
    }

    $stamp = date('Y-m-d H:i:s');

    // Build the row column by column, keeping the same column order as before.
    $record = [];
    foreach (['article_id', 'am_id', 'p_article_id', 'p_refer_id', 'refer_index', 'reference_no'] as $column) {
        $record[$column] = intval($this->arrGet($extra, $column, 0));
    }
    $record['reference_raw'] = (string)$this->arrGet($extra, 'reference_raw', '');
    foreach (['cite_tag_start', 'cite_tag_end', 'text_start', 'text_end'] as $column) {
        $record[$column] = intval($this->arrGet($extra, $column, 0));
    }
    $record['content_a'] = $contentA;
    $record['content_b'] = trim($contentB);
    $record['status'] = 0;
    $record['created_at'] = $stamp;
    $record['updated_at'] = $stamp;

    $checkId = Db::name('article_reference_check_result')->insertGetId($record);

    if ($record['am_id'] > 0) {
        $this->setAmRefCheckStatus($record['am_id'], self::AM_STATUS_RUNNING);
    }

    $delay = intval($this->arrGet($extra, 'queue_delay', 0));
    $this->pushJob(intval($checkId), $delay);

    return ['check_id' => $checkId, 'queued' => 1];
}
/**
 * Enqueue reference checks for every citation found in one article_main section.
 *
 * Each number of each <blue>[...]</blue> citation becomes one
 * article_reference_check_result row and one queue job (a range such as [70-73]
 * expands to reference_no 70, 71, 72 and 73). Numbers with no matching entry in
 * the reference map are skipped.
 *
 * Section status afterwards:
 *  - no citations in the content       -> AM_STATUS_NONE
 *  - at least one job queued           -> AM_STATUS_RUNNING
 *  - citations found but none queued   -> recomputed from the existing rows via
 *    syncAmRefCheckStatus(), so the section is not left in RUNNING with nothing
 *    in the queue to ever finish it.
 *
 * @param array $main article_main row; reads am_id, content and article_id.
 * @return void
 * @throws \RuntimeException when no production_article with state=0 exists for the article.
 */
public function enqueueByArticleMain($main){
    $amId = $main['am_id'];

    $citations = $this->extractReferences((string)$main['content']);
    if (empty($citations)) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
        return;
    }

    $prod = Db::name('production_article')
        ->where('article_id', $main['article_id'])
        ->where('state', 0)
        ->find();
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found for article_id=' . $main['article_id']);
    }

    $pArticleId = intval($prod['p_article_id']);
    $referMap = $this->loadReferMapByPArticleId($pArticleId);

    // Number of jobs pushed so far; also used as the delay passed to pushJob() (0, 1, 2, ...).
    $queued = 0;
    foreach ($citations as $cite) {
        foreach ($cite['reference_numbers'] as $refNo) {
            // The reference map is keyed by the zero-based index; citation numbers are one-based.
            $referIndex = $refNo - 1;
            if ($referIndex < 0 || !isset($referMap[$referIndex])) {
                continue;
            }
            $refer = $referMap[$referIndex];
            $now = date('Y-m-d H:i:s');
            $checkId = Db::name('article_reference_check_result')->insertGetId([
                'article_id' => $main['article_id'],
                'p_article_id' => $pArticleId,
                'am_id' => intval($main['am_id']),
                'reference_no' => $refNo,
                'refer_index' => $refNo,
                'origin_text' => $cite['original_text'],
                'refer_text' => $this->formatReferForLlm($refer),
                'p_refer_id' => $refer['p_refer_id'],
                'text_start' => $cite['text_start'],
                'text_end' => $cite['text_end'],
                'created_at' => $now,
                'updated_at' => $now,
            ]);
            $this->pushJob(intval($checkId), $queued);
            $queued++;
        }
    }

    if ($queued > 0) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
        return;
    }

    // Every citation pointed at a missing reference: nothing will run, so derive the
    // status from whatever rows already exist instead of claiming "running".
    $this->syncAmRefCheckStatus($amId);
}
/**
 * Manual trigger: queue the DOI second-pass review for completed records whose
 * confidence is at or below PASS_CONFIDENCE_THRESHOLD (0.65).
 *
 * NOTE(review): the query samples at most two rows in random order per call
 * (orderRaw('rand()') + limit(2)) — confirm that this sampling is intended.
 *
 * @param int $articleId Article id; must be positive.
 * @return array ['article_id' => int, 'check_ids2' => int[] ids accepted for the
 *               second pass, 'queued' => count of those ids]
 * @throws \InvalidArgumentException when $articleId is not positive.
 */
public function enqueueSecondPassByArticle($articleId)
{
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }

    $rows = Db::name('article_reference_check_result')
        ->where('article_id', $articleId)
        ->where('status', 1)
        ->where('confidence', '<=', self::PASS_CONFIDENCE_THRESHOLD)
        ->orderRaw('rand()')
        ->limit(2)
        ->select();

    $checkIds2 = [];
    foreach ($rows as $checkLog) {
        $rowId = $this->resolveCheckRowId($checkLog);
        if ($rowId <= 0) {
            // No usable primary key on this row; there is nothing to enqueue for it.
            continue;
        }
        if ($this->maybeEnqueueSecondPass($rowId, floatval($checkLog['confidence']))) {
            $checkIds2[] = $rowId;
        }
    }

    return [
        'article_id' => $articleId,
        'check_ids2' => $checkIds2,
        'queued' => count($checkIds2),
    ];
}
/**
 * Enqueue reference checks for every section of the article behind a
 * production_article row.
 *
 * For each article_main section (state 0 or 2, ordered by sort) the citations are
 * extracted and one article_reference_check_result row is inserted per cited
 * number (a range such as [70-73] expands to 70, 71, 72, 73). All rows are
 * inserted first; the jobs are then pushed through pushJobsSortedByReferenceNo().
 * Sections without citations are set to AM_STATUS_NONE; sections that received at
 * least one job are set to AM_STATUS_RUNNING.
 *
 * @param array $prod production_article row; reads p_article_id and article_id.
 * @return array article_id, p_article_id, queued, skipped, check_ids, queue
 * @throws \RuntimeException when $prod is empty or the article has no sections.
 */
public function enqueueByPArticle($prod){
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found');
    }

    $pArticleId = intval($prod['p_article_id']);
    $articleId = intval($prod['article_id']);
    $referMap = $this->loadReferMapByPArticleId($pArticleId);

    $sections = Db::name('article_main')
        ->field('am_id,content,article_id')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->order('sort asc')
        ->select();
    if (empty($sections)) {
        throw new \RuntimeException('article_main is empty');
    }

    $stamp = date('Y-m-d H:i:s');
    $jobs = [];
    $sectionsWithJobs = [];
    $skippedCount = 0;

    foreach ($sections as $section) {
        $sectionId = intval($section['am_id']);
        $found = $this->extractReferences((string)$section['content']);
        if (empty($found)) {
            $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_NONE);
            continue;
        }

        foreach ($found as $citation) {
            foreach ($citation['reference_numbers'] as $number) {
                // Citation numbers are one-based; the reference map is keyed by zero-based index.
                $slot = $number - 1;
                if ($slot < 0 || !isset($referMap[$slot])) {
                    $skippedCount++;
                    continue;
                }

                $referRow = $referMap[$slot];
                $newId = Db::name('article_reference_check_result')->insertGetId([
                    'article_id' => $section['article_id'],
                    'p_article_id' => $pArticleId,
                    'am_id' => $sectionId,
                    'reference_no' => $number,
                    'refer_index' => $number,
                    'origin_text' => $citation['original_text'],
                    'refer_text' => $this->formatReferForLlm($referRow),
                    'p_refer_id' => $referRow['p_refer_id'],
                    'text_start' => $citation['text_start'],
                    'text_end' => $citation['text_end'],
                    'created_at' => $stamp,
                    'updated_at' => $stamp,
                ]);

                $jobs[] = [
                    'check_id' => intval($newId),
                    'reference_no' => intval($number),
                    'am_id' => $sectionId,
                    'text_start' => intval($citation['text_start']),
                ];
                $sectionsWithJobs[$sectionId] = true;
            }
        }
    }

    $pushedIds = $this->pushJobsSortedByReferenceNo($jobs);

    foreach (array_keys($sectionsWithJobs) as $runningSectionId) {
        $this->setAmRefCheckStatus($runningSectionId, self::AM_STATUS_RUNNING);
    }

    return [
        'article_id' => $articleId,
        'p_article_id' => $pArticleId,
        'queued' => count($jobs),
        'skipped' => $skippedCount,
        'check_ids' => $pushedIds,
        'queue' => self::QUEUE_NAME,
    ];
}
/**
 * Enqueue reference checks for a whole article identified by article_id.
 *
 * Looks up the production_article row (state 0 or 2) for the article and hands it
 * to enqueueByPArticle(), which performs the per-section extraction, row inserts
 * and job pushes. This method used to carry a verbatim copy of that logic; it now
 * delegates so the two entry points cannot drift apart.
 *
 * @param int $articleId Article id; must be positive.
 * @return array Result of enqueueByPArticle(): article_id, p_article_id, queued,
 *               skipped, check_ids, queue.
 * @throws \InvalidArgumentException when $articleId is not positive.
 * @throws \RuntimeException when no production_article row is found, or when
 *                           enqueueByPArticle() finds no sections.
 */
public function enqueueByArticle($articleId){
    // Normalise first so non-numeric input is rejected instead of reaching the query.
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }

    $prod = Db::name('production_article')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->find();
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found for article_id=' . $articleId);
    }

    return $this->enqueueByPArticle($prod);
}
/**
 * Recompute t_article_main.ref_check_status for one section from all of its
 * check rows, store it, and return it.
 *
 * Rules: no rows -> NONE; any row with status 0 -> RUNNING; otherwise FAIL when a
 * row has status 2 or is a finished (status 1) row with is_match = 0; PASS only
 * when every row has status 1 and none of them failed; anything else -> FAIL.
 *
 * @param int $amId article_main primary key.
 * @return int One of the AM_STATUS_* constants (NONE for a non-positive $amId).
 */
public function syncAmRefCheckStatus($amId)
{
    if ($amId <= 0) {
        return self::AM_STATUS_NONE;
    }

    $details = Db::name('article_reference_check_result')->where('am_id', $amId)->select();
    if (empty($details)) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
        return self::AM_STATUS_NONE;
    }

    $waiting = 0;
    $finished = 0;
    $sawFailure = false;
    foreach ($details as $detail) {
        $state = intval($detail['status']);
        if ($state === 0) {
            $waiting++;
        } elseif ($state === 1) {
            $finished++;
            if (intval($detail['is_match']) === 0) {
                $sawFailure = true;
            }
        } elseif ($state === 2) {
            $sawFailure = true;
        }
    }

    if ($waiting > 0) {
        $result = self::AM_STATUS_RUNNING;
    } elseif (!$sawFailure && $finished === count($details)) {
        $result = self::AM_STATUS_PASS;
    } else {
        $result = self::AM_STATUS_FAIL;
    }

    $this->setAmRefCheckStatus($amId, $result);

    return $result;
}
/**
 * Write ref_check_status for one article_main row; does nothing for a
 * non-positive $amId.
 *
 * @param int $amId   article_main primary key.
 * @param int $status Value to store (callers pass AM_STATUS_* constants).
 * @return void
 */
public function setAmRefCheckStatus($amId, $status)
{
    if ($amId <= 0) {
        return;
    }

    $changes = ['ref_check_status' => $status];
    Db::name('article_main')
        ->where('am_id', $amId)
        ->update($changes);
}
/**
 * Delete every reference-check detail row of one p_article_id and reset the
 * section-level ref_check_status.
 *
 * Intended for the case where references were added or removed, so the old
 * reference_no values no longer line up and earlier results are invalid. Once the
 * rows are physically deleted, the article-level status query reports
 * ARTICLE_PROGRESS_NONE again.
 *
 * @param int $pArticleId production_article primary key.
 * @return int Number of detail rows deleted (0 for a non-positive id).
 */
public function clearArticleChecksByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        return 0;
    }

    // Resolve the owning article_id first; it is needed to reset article_main below.
    $ownerArticleId = intval(
        Db::name('production_article')
            ->where('p_article_id', $pArticleId)
            ->whereIn('state', [0, 2])
            ->value('article_id')
    );

    // Drop the queue locks of the old rows before deleting them, so in-flight
    // workers do not write results back.
    $staleIds = Db::name('article_reference_check_result')
        ->where('p_article_id', $pArticleId)
        ->column('id');
    foreach ($staleIds as $staleId) {
        $this->clearReferenceCheckQueueLock(intval($staleId));
    }

    $removed = Db::name('article_reference_check_result')
        ->where('p_article_id', $pArticleId)
        ->delete();

    if ($ownerArticleId > 0) {
        Db::name('article_main')
            ->where('article_id', $ownerArticleId)
            ->whereIn('state', [0, 2])
            ->update(['ref_check_status' => self::AM_STATUS_NONE]);
    }

    return intval($removed);
}
/**
 * Delete every reference-check detail row of one article_id and reset
 * ref_check_status to AM_STATUS_NONE on its sections (state 0 or 2).
 *
 * @param int $articleId Article id.
 * @return int Number of detail rows deleted (0 for a non-positive id).
 */
public function clearArticleChecks($articleId)
{
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        return 0;
    }

    // Clear the queue locks of the old rows first; otherwise the same check_id
    // would not run again until the lock's TTL expires.
    $staleIds = Db::name('article_reference_check_result')
        ->where('article_id', $articleId)
        ->column('id');
    foreach ($staleIds as $staleId) {
        $this->clearReferenceCheckQueueLock(intval($staleId));
    }

    $removed = Db::name('article_reference_check_result')
        ->where('article_id', $articleId)
        ->delete();

    $reset = ['ref_check_status' => self::AM_STATUS_NONE];
    Db::name('article_main')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->update($reset);

    return intval($removed);
}
/**
 * After a local move inside the reference list, refresh reference_no /
 * refer_index on the check rows of the given p_refer_ids only.
 *
 * The new number is computed from the current production_article_refer.index
 * (index + 1) rather than from a caller-supplied value, so stale input cannot be
 * written. Only rows whose reference_no differs from the new number are updated.
 *
 * @param int[] $pReferIds  Affected p_refer_ids (typically the moved entry and its neighbour).
 * @param int   $pArticleId Optional extra p_article_id filter applied to both queries.
 * @return array{p_refer_ids:int[], affected_rows:int, changes:array}
 */
public function syncReferenceNoByPReferIds(array $pReferIds, $pArticleId = 0)
{
    // Cast to int, drop zeros, de-duplicate and reindex.
    $ids = array_values(array_unique(array_filter(array_map('intval', $pReferIds))));
    $pArticleId = intval($pArticleId);
    $restrictToArticle = $pArticleId > 0;

    $outcome = [
        'p_refer_ids' => $ids,
        'affected_rows' => 0,
        'changes' => [],
    ];
    if (empty($ids)) {
        return $outcome;
    }

    $lookup = Db::name('production_article_refer')
        ->field('p_refer_id,p_article_id,index')
        ->whereIn('p_refer_id', $ids)
        ->where('state', 0);
    if ($restrictToArticle) {
        $lookup->where('p_article_id', $pArticleId);
    }
    $referRows = $lookup->select();
    if (empty($referRows)) {
        return $outcome;
    }

    $stamp = date('Y-m-d H:i:s');
    foreach ($referRows as $referRow) {
        $referId = intval($referRow['p_refer_id']);
        $number = intval($referRow['index']) + 1;

        $writer = Db::name('article_reference_check_result')
            ->where('p_refer_id', $referId)
            ->where('reference_no', '<>', $number);
        if ($restrictToArticle) {
            $writer->where('p_article_id', $pArticleId);
        }
        $touched = $writer->update([
            'reference_no' => $number,
            'refer_index' => $number,
            'updated_at' => $stamp,
        ]);

        if ($touched > 0) {
            $outcome['affected_rows'] += intval($touched);
            $outcome['changes'][] = [
                'p_refer_id' => $referId,
                'new_ref_no' => $number,
                'affected_rows' => intval($touched),
            ];
        }
    }

    return $outcome;
}
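// NOTE: the summary below has no method directly beneath it; it appears to describe
// resetAndRecheckByArticle(), which is defined further down in this class:
// reset the reference check of a whole manuscript — delete the old detail rows,
// clear the queue locks, and re-enqueue the full text for checking. Returns an array.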
/**
 * Overall reference-check state of one article, looked up by p_article_id.
 *
 * The unit counted is the reference (rows grouped by reference_no), not the
 * individual detail row: 50 references backed by 111 detail rows give total = 50.
 *
 * Returned status (whole article):
 *   0 = ARTICLE_PROGRESS_NONE       no check rows at all
 *   1 = ARTICLE_PROGRESS_RUNNING    at least one reference still has an unfinished row
 *   2 = ARTICLE_PROGRESS_COMPLETED  every row of every reference has finished
 *
 * Each reference falls into exactly one bucket:
 *   pending - any row in the group has status=0 (partly finished groups included)
 *   failed  - no pending rows and at least one row has status=2
 *   done    - everything else (no row with status 0 or 2)
 *
 * pending + done + failed = total; progress_percent = (done + failed) / total * 100,
 * rounded to one decimal. Per-group detail is available from getProgressByPArticleId().
 *
 * @param int $pArticleId production_article primary key; must be positive.
 * @return array{p_article_id:int, status:int, total:int, pending:int, done:int, failed:int, progress_percent:float}
 * @throws \InvalidArgumentException when $pArticleId is not positive.
 */
public function getArticleProgressStatusByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        throw new \InvalidArgumentException('p_article_id is required');
    }

    // One aggregated query: one result row per reference_no carrying its
    // pending / failed row counts.
    $columns = implode(', ', [
        'reference_no',
        'SUM(CASE WHEN status = 0 THEN 1 ELSE 0 END) AS pending_cnt',
        'SUM(CASE WHEN status = 2 THEN 1 ELSE 0 END) AS failed_cnt',
    ]);
    $perReference = Db::name('article_reference_check_result')
        ->field($columns)
        ->where('p_article_id', $pArticleId)
        ->group('reference_no')
        ->select();

    if (empty($perReference)) {
        return [
            'p_article_id' => $pArticleId,
            'status' => self::ARTICLE_PROGRESS_NONE,
            'total' => 0,
            'pending' => 0,
            'done' => 0,
            'failed' => 0,
            'progress_percent' => 0,
        ];
    }

    $tally = ['pending' => 0, 'done' => 0, 'failed' => 0];
    foreach ($perReference as $referenceRow) {
        if (intval($this->arrGet($referenceRow, 'pending_cnt', 0)) > 0) {
            $bucket = 'pending';
        } elseif (intval($this->arrGet($referenceRow, 'failed_cnt', 0)) > 0) {
            $bucket = 'failed';
        } else {
            $bucket = 'done';
        }
        $tally[$bucket]++;
    }

    $total = count($perReference);
    if ($tally['pending'] > 0) {
        $overall = self::ARTICLE_PROGRESS_RUNNING;
    } else {
        $overall = self::ARTICLE_PROGRESS_COMPLETED;
    }

    return [
        'p_article_id' => $pArticleId,
        'status' => $overall,
        'total' => $total,
        'pending' => $tally['pending'],
        'done' => $tally['done'],
        'failed' => $tally['failed'],
        'progress_percent' => round(($tally['done'] + $tally['failed']) / $total * 100, 1),
    ];
}
/**
 * Reference-check progress of one article by p_article_id: rows are grouped by
 * reference_no, each group gets an aggregated status, and its detail rows are
 * listed under 'records'.
 *
 * Single-row article_reference_check_result.status:
 *   0 = pending, 1 = completed, 2 = failed
 *
 * Group status (numeric 'status' field):
 *   0 = PROGRESS_PENDING    every row in the group has status=0
 *   1 = PROGRESS_CHECKING   some rows finished, some still 0
 *   2 = PROGRESS_COMPLETED  no pending rows and no row with status=2
 *   3 = PROGRESS_FAILED     no pending rows and at least one row with status=2
 *
 * Each record carries its status (0/1/2 as stored), confidence, and is_pass
 * (confidence >= PASS_CONFIDENCE_THRESHOLD). A group's is_pass is true only when
 * the group is COMPLETED and every one of its records passes.
 *
 * @param int $pArticleId production_article primary key; must be positive.
 * @return array{p_article_id:int, total_groups:int, summary:array, list:array}
 * @throws \InvalidArgumentException when $pArticleId is not positive.
 */
public function getProgressByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        throw new \InvalidArgumentException('p_article_id is required');
    }

    $details = Db::name('article_reference_check_result')
        ->field('id,p_refer_id,reference_no,am_id,status,confidence,is_match,reason,text_start,text_end,updated_at')
        ->where('p_article_id', $pArticleId)
        ->order('reference_no asc, id asc')
        ->select();

    // Keyed by the numeric PROGRESS_* values (0/1/2/3).
    $summary = [
        self::PROGRESS_PENDING => 0,
        self::PROGRESS_CHECKING => 0,
        self::PROGRESS_COMPLETED => 0,
        self::PROGRESS_FAILED => 0,
    ];

    if (empty($details)) {
        return [
            'p_article_id' => $pArticleId,
            'total_groups' => 0,
            'summary' => $summary,
            'list' => [],
        ];
    }

    // Which per-group counter each stored row status increments.
    $counterForStatus = [0 => 'pending', 1 => 'done', 2 => 'failed'];

    $groups = [];
    foreach ($details as $detail) {
        $refNo = intval($this->arrGet($detail, 'reference_no', 0));
        $referId = intval($this->arrGet($detail, 'p_refer_id', 0));

        if (!isset($groups[$refNo])) {
            $groups[$refNo] = [
                'reference_no' => $refNo,
                'p_refer_id' => $referId,
                'total' => 0,
                'pending' => 0,
                'done' => 0,
                'failed' => 0,
                'pass' => 0,
                'last_updated_at' => '',
                'records' => [],
            ];
        }
        $group = &$groups[$refNo];

        // A reference_no should map to a single p_refer_id; if the first row had
        // none, adopt the first positive id seen later.
        if ($group['p_refer_id'] <= 0 && $referId > 0) {
            $group['p_refer_id'] = $referId;
        }

        $group['total']++;

        $state = intval($this->arrGet($detail, 'status', 0));
        if (isset($counterForStatus[$state])) {
            $group[$counterForStatus[$state]]++;
        }

        $touchedAt = (string)$this->arrGet($detail, 'updated_at', '');
        if ($touchedAt > $group['last_updated_at']) {
            $group['last_updated_at'] = $touchedAt;
        }

        $score = floatval($this->arrGet($detail, 'confidence', 0));
        $passed = $score >= self::PASS_CONFIDENCE_THRESHOLD;
        if ($passed) {
            $group['pass']++;
        }

        $group['records'][] = [
            'check_id' => intval($this->arrGet($detail, 'id', 0)),
            'am_id' => intval($this->arrGet($detail, 'am_id', 0)),
            'status' => $state,
            'confidence' => $score,
            'is_pass' => $passed,
            'is_match' => intval($this->arrGet($detail, 'is_match', 0)),
            'reason' => (string)$this->arrGet($detail, 'reason', ''),
            'text_start' => intval($this->arrGet($detail, 'text_start', 0)),
            'text_end' => intval($this->arrGet($detail, 'text_end', 0)),
            'last_updated_at' => $touchedAt,
        ];

        unset($group);
    }

    // Groups are keyed by reference_no, so sorting the keys orders the list by reference_no.
    ksort($groups);

    $list = [];
    foreach ($groups as $entry) {
        if ($entry['pending'] === 0) {
            $groupStatus = $entry['failed'] > 0 ? self::PROGRESS_FAILED : self::PROGRESS_COMPLETED;
        } elseif ($entry['pending'] === $entry['total']) {
            $groupStatus = self::PROGRESS_PENDING;
        } else {
            $groupStatus = self::PROGRESS_CHECKING;
        }

        // Group passes only when fully completed and every record's confidence >= 0.65.
        $entry['is_pass'] = $groupStatus === self::PROGRESS_COMPLETED
            && $entry['total'] > 0
            && $entry['pass'] === $entry['total'];
        $entry['status'] = $groupStatus;

        $summary[$groupStatus]++;
        $list[] = $entry;
    }

    return [
        'p_article_id' => $pArticleId,
        'total_groups' => count($list),
        'summary' => $summary,
        'list' => $list,
    ];
}
/**
 * List every check detail row recorded for one reference (p_refer_id).
 *
 * Each list entry contains:
 *  - am_id       article_main primary key of the citing section
 *  - confidence  match confidence as stored
 *  - reason      verdict explanation as stored
 *  - is_match    article_reference_check_result.is_match
 *  - is_pass     confidence >= PASS_CONFIDENCE_THRESHOLD
 *
 * p_article_id and reference_no in the result are taken from the first row that
 * carries a positive value for them (0 when there are no rows).
 *
 * @param int $pReferId production_article_refer.p_refer_id; must be positive.
 * @return array{p_refer_id:int, p_article_id:int, reference_no:int, total:int, list:array}
 * @throws \InvalidArgumentException when $pReferId is not positive.
 */
public function getCheckDetailsByPReferId($pReferId)
{
    $pReferId = intval($pReferId);
    if ($pReferId <= 0) {
        throw new \InvalidArgumentException('p_refer_id is required');
    }

    $details = Db::name('article_reference_check_result')
        ->field('id,p_article_id,reference_no,am_id,confidence,is_match,reason')
        ->where('p_refer_id', $pReferId)
        ->order('id asc')
        ->select();

    $contextArticleId = 0;
    $contextRefNo = 0;
    $entries = [];
    foreach ($details as $detail) {
        if ($contextArticleId <= 0) {
            $contextArticleId = intval($this->arrGet($detail, 'p_article_id', 0));
        }
        if ($contextRefNo <= 0) {
            $contextRefNo = intval($this->arrGet($detail, 'reference_no', 0));
        }

        $score = floatval($this->arrGet($detail, 'confidence', 0));
        $entry = [];
        $entry['am_id'] = intval($this->arrGet($detail, 'am_id', 0));
        $entry['confidence'] = $score;
        $entry['reason'] = (string)$this->arrGet($detail, 'reason', '');
        $entry['is_match'] = intval($this->arrGet($detail, 'is_match', 0));
        $entry['is_pass'] = $score >= self::PASS_CONFIDENCE_THRESHOLD;
        $entries[] = $entry;
    }

    return [
        'p_refer_id' => $pReferId,
        'p_article_id' => $contextArticleId,
        'reference_no' => $contextRefNo,
        'total' => count($entries),
        'list' => $entries,
    ];
}
/**
 * Reset the reference check of a whole manuscript and run it again.
 *
 * Requires that check rows already exist for the p_article_id; then deletes the
 * article's rows via clearArticleChecks() and re-enqueues the full text via
 * enqueueByArticle().
 *
 * @param array $aProductionArticle production_article row with positive
 *                                  p_article_id and article_id.
 * @return array enqueueByArticle() result plus 'cleared' (deleted row count) and 'reset' => 1.
 * @throws \InvalidArgumentException when the row is missing, not an array, or lacks either id.
 * @throws \RuntimeException when no check rows exist yet for the p_article_id.
 */
public function resetAndRecheckByArticle($aProductionArticle)
{
    $usable = !empty($aProductionArticle) && is_array($aProductionArticle);
    if (!$usable) {
        throw new \InvalidArgumentException('production_article is required');
    }

    $pArticleId = intval($this->arrGet($aProductionArticle, 'p_article_id', 0));
    $articleId = intval($this->arrGet($aProductionArticle, 'article_id', 0));
    if (!($pArticleId > 0 && $articleId > 0)) {
        throw new \InvalidArgumentException('production_article requires both p_article_id and article_id');
    }

    $rowCount = Db::name('article_reference_check_result')
        ->where('p_article_id', $pArticleId)
        ->count();
    if (intval($rowCount) <= 0) {
        throw new \RuntimeException('no existing reference check records for p_article_id=' . $pArticleId);
    }

    $removed = $this->clearArticleChecks($articleId);

    $outcome = $this->enqueueByArticle($articleId);
    if (!is_array($outcome)) {
        $outcome = [];
    }
    $outcome['cleared'] = $removed;
    $outcome['reset'] = 1;

    return $outcome;
}
/**
 * Text label for an AM_STATUS_* value.
 *
 * @param int $status Section status value.
 * @return string 'none', 'pass', 'fail' or 'running'; 'unknown' for any other value.
 */
public static function amStatusLabel($status)
{
    $labels = [
        self::AM_STATUS_NONE => 'none',
        self::AM_STATUS_PASS => 'pass',
        self::AM_STATUS_FAIL => 'fail',
        self::AM_STATUS_RUNNING => 'running',
    ];

    if (isset($labels[$status])) {
        return $labels[$status];
    }

    return 'unknown';
}
/**
 * Extract the check row id from a row array.
 *
 * The table's primary key column is `id`, while the external API parameter is
 * still called `check_id`; `id` is preferred and `check_id` is the fallback.
 *
 * @param mixed $row Row array.
 * @return int Positive id, or 0 when $row is not an array or holds no positive id.
 */
public function resolveCheckRowId($row)
{
    if (!is_array($row)) {
        return 0;
    }

    foreach (['id', 'check_id'] as $column) {
        if (isset($row[$column]) && intval($row[$column]) > 0) {
            return intval($row[$column]);
        }
    }

    return 0;
}
/**
 * Interpret the is_match value returned by the LLM.
 *
 * Accepts a bool (returned as is), an int/float (true only when its integer
 * value is 1), or a string (true for '1', 'true', 'yes', 'match', 'matched',
 * compared case-insensitively after trimming).
 *
 * @param mixed $value Raw is_match value.
 * @return bool
 */
public function parseLlmIsMatch($value)
{
    if (is_bool($value)) {
        return $value;
    }

    $isNumber = is_int($value) || is_float($value);
    if ($isNumber) {
        return intval($value) === 1;
    }

    $text = strtolower(trim((string)$value));
    $accepted = ['1', 'true', 'yes', 'match', 'matched'];

    return in_array($text, $accepted, true);
}
/**
 * Write the result fields of one check row.
 *
 * 'reason' and 'error_msg' are trimmed and cut to 512 characters before the
 * write so an oversized value cannot make the UPDATE fail on a varchar(512)
 * column. 'updated_at' is always set to the current time.
 *
 * @param int   $checkId Row id; must be positive.
 * @param array $fields  Column => value pairs to store.
 * @return int Number of affected rows reported by the update.
 * @throws \InvalidArgumentException when $checkId is not positive.
 * @throws \RuntimeException when the row does not exist or the update reports false.
 */
public function updateCheckResult($checkId, array $fields)
{
    $checkId = intval($checkId);
    if ($checkId <= 0) {
        throw new \InvalidArgumentException('invalid check id');
    }

    foreach (['reason', 'error_msg'] as $textColumn) {
        if (isset($fields[$textColumn])) {
            $fields[$textColumn] = mb_substr(trim((string)$fields[$textColumn]), 0, 512);
        }
    }
    $fields['updated_at'] = date('Y-m-d H:i:s');

    $current = Db::name('article_reference_check_result')->where('id', $checkId)->find();
    if (empty($current)) {
        throw new \RuntimeException('article_reference_check_result not found, id=' . $checkId);
    }

    $changed = Db::name('article_reference_check_result')->where('id', $checkId)->update($fields);
    if ($changed === false) {
        throw new \RuntimeException('article_reference_check_result update failed, id=' . $checkId);
    }

    $changedCount = intval($changed);
    \think\Log::info('updateCheckResult id=' . $checkId . ' affected=' . $changedCount);

    return $changedCount;
}
/**
 * Fetch one check row by id.
 *
 * @param int $checkId Row id.
 * @return array|null The row, or null when the id is not positive or nothing is found.
 */
public function getResult($checkId)
{
    if ($checkId <= 0) {
        return null;
    }

    $found = Db::name('article_reference_check_result')
        ->where('id', $checkId)
        ->find();
    if (!$found) {
        return null;
    }

    return $found;
}
/**
 * List the check rows of one article, ordered by section, citation tag start
 * and reference number.
 *
 * @param int  $articleId    Article id.
 * @param int  $status       When >= 0, only rows with this status are returned.
 * @param bool $onlyMismatch When true, additionally restrict to status=1 and is_match=0.
 * @return mixed Result of the query's select().
 */
public function listByArticle($articleId, $status = -1, $onlyMismatch = false)
{
    $query = Db::name('article_reference_check_result')->where('article_id', $articleId);

    if ($status >= 0) {
        $query->where('status', $status);
    }

    if ($onlyMismatch) {
        $query->where('status', 1)->where('is_match', 0);
    }

    $query->order('am_id asc, cite_tag_start asc, reference_no asc');

    return $query->select();
}
/**
 * Manuscript preview: mark unreasonable citation numbers and citing sentences
 * in each section's content.
 *
 * The check rows are loaded once. The same result set feeds the statistics and
 * (restricted to status=1 rows) the bad-result index, so both views come from a
 * single snapshot instead of two separate queries.
 *
 * Stats: 'pending' counts rows with status 0; every other row counts as 'match'
 * when is_match = 1 and as 'mismatch' otherwise.
 *
 * @param int $articleId Article id.
 * @param int $amId      When > 0, restrict the preview to this section.
 * @return array article_id, article_ref_check_pass (true/false/null), sections, issues, stats
 */
public function buildArticlePreview($articleId, $amId = 0)
{
    $q = Db::name('article_main')
        ->field('am_id,content,sort,ref_check_status')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2]);
    if ($amId > 0) {
        $q->where('am_id', $amId);
    }
    $mains = $q->order('sort asc')->select();

    $stats = ['total' => 0, 'mismatch' => 0, 'match' => 0, 'pending' => 0];
    $finishedRows = [];
    foreach ($this->listByArticle($articleId, -1) as $r) {
        $stats['total']++;
        $st = intval($r['status']);
        if ($st === 0) {
            $stats['pending']++;
            continue;
        }
        if ($st === 1) {
            // Same rows (same order) the former status=1 query returned.
            $finishedRows[] = $r;
        }
        if (intval($r['is_match']) === 1) {
            $stats['match']++;
        } else {
            $stats['mismatch']++;
        }
    }
    $badByAm = $this->indexBadResults($finishedRows);

    $sections = [];
    $issues = [];
    foreach ($mains as $main) {
        $id = intval($main['am_id']);
        $content = (string)$main['content'];
        $badIndex = isset($badByAm[$id]) ? $badByAm[$id] : [];
        $marked = $this->markContentForPreview($content, $id, $badIndex);
        $amStatus = intval($this->arrGet($main, 'ref_check_status', 0));
        $sections[] = [
            'am_id' => $id,
            'ref_check_status' => $amStatus,
            'ref_check_pass' => $amStatus === self::AM_STATUS_PASS,
            'ref_check_label' => self::amStatusLabel($amStatus),
            'content' => $content,
            'content_marked' => $marked['html'],
            'issue_count' => $marked['issue_count'],
        ];
        foreach ($marked['issues'] as $issue) {
            $issues[] = $issue;
        }
    }

    return [
        'article_id' => $articleId,
        'article_ref_check_pass' => $this->resolveArticlePass($sections),
        'sections' => $sections,
        'issues' => $issues,
        'stats' => $stats,
    ];
}
/**
 * Decide whether the whole article passes.
 *
 * Sections with status NONE (no citations) are ignored. Any other section that is
 * not PASS (i.e. running or failed) makes the article fail.
 *
 * @param array $sections Section entries carrying 'ref_check_status'.
 * @return bool|null true when at least one section was checked and all checked
 *                   sections pass, false when any checked section does not pass,
 *                   null when no section was checked at all.
 */
private function resolveArticlePass($sections)
{
    $checkedAny = false;

    foreach ($sections as $section) {
        $state = intval($this->arrGet($section, 'ref_check_status', 0));
        if ($state === self::AM_STATUS_NONE) {
            continue;
        }
        if ($state !== self::AM_STATUS_PASS) {
            return false;
        }
        $checkedAny = true;
    }

    if (!$checkedAny) {
        return null;
    }

    return true;
}
/**
 * Index mismatching check results per section for the preview marker.
 *
 * Only rows with status = 1 and is_match != 1 are kept, and only when both am_id
 * and reference_no are positive. For each section two lookups are built:
 *  - 'by_raw'   : normalized reference_raw => reference_no => row (only for rows
 *                 that carry a non-empty reference_raw)
 *  - 'contexts' : "text_start_text_end" => the span plus the check ids, reference
 *                 numbers and non-empty reasons that fall on it
 *
 * @param array $rows Check result rows.
 * @return array<int, array> am_id => ['by_raw' => array, 'contexts' => array]
 */
private function indexBadResults($rows)
{
    $index = [];

    foreach ($rows as $row) {
        if (intval($row['status']) !== 1) {
            continue;
        }
        if (intval($row['is_match']) === 1) {
            continue;
        }

        $sectionId = intval($row['am_id']);
        $number = intval($row['reference_no']);
        if ($sectionId <= 0 || $number <= 0) {
            continue;
        }

        if (!isset($index[$sectionId])) {
            $index[$sectionId] = ['by_raw' => [], 'contexts' => []];
        }
        $bucket = &$index[$sectionId];

        $tagKey = $this->normalizeRefRawKey((string)$this->arrGet($row, 'reference_raw', ''));
        if ($tagKey !== '') {
            $bucket['by_raw'][$tagKey][$number] = $row;
        }

        $spanStart = intval($row['text_start']);
        $spanEnd = intval($row['text_end']);
        $spanKey = $spanStart . '_' . $spanEnd;
        if (!isset($bucket['contexts'][$spanKey])) {
            $bucket['contexts'][$spanKey] = [
                'text_start' => $spanStart,
                'text_end' => $spanEnd,
                'check_ids' => [],
                'reasons' => [],
                'ref_nos' => [],
            ];
        }

        $bucket['contexts'][$spanKey]['check_ids'][] = $this->resolveCheckRowId($row);
        $bucket['contexts'][$spanKey]['ref_nos'][] = $number;

        $why = trim((string)$this->arrGet($row, 'reason', ''));
        if ($why !== '') {
            $bucket['contexts'][$spanKey]['reasons'][$number] = $why;
        }

        unset($bucket);
    }

    return $index;
}
/**
 * Normalize the text inside a citation tag into a lookup key.
 *
 * Trims the input, maps the typographic variants accepted by BLUE_TAG_REGEX to
 * their ASCII form (full-width comma -> ',', the dash variants -> '-'), removes
 * ASCII spaces and lower-cases the result, so e.g. an en-dash range and the same
 * range typed with '-' produce the same key.
 *
 * The variants are written as UTF-8 byte escapes rather than literal characters:
 * literal look-alike characters are easy to lose or corrupt in editors and
 * copy/paste, which silently turns them into empty (no-op) search strings.
 *
 * @param string $raw Text between the square brackets of a citation tag.
 * @return string Normalized key ('' for empty input).
 */
private function normalizeRefRawKey($raw)
{
    $variants = [
        "\xEF\xBC\x8C" => ',', // U+FF0C FULLWIDTH COMMA
        "\xE2\x80\x93" => '-', // U+2013 EN DASH
        "\xE2\x80\x94" => '-', // U+2014 EM DASH
        "\xE2\x88\x92" => '-', // U+2212 MINUS SIGN
        "\xE2\x80\x90" => '-', // U+2010 HYPHEN
        "\xE2\x80\x91" => '-', // U+2011 NON-BREAKING HYPHEN
        ' ' => '',
    ];

    return strtolower(strtr(trim($raw), $variants));
}
/**
 * Produce the preview HTML of one section with bad citations highlighted.
 *
 * Two passes are made over the content:
 *  1) every <blue>[...]</blue> tag matched by BLUE_TAG_REGEX is rewritten as
 *     <blue class="ref-cite-tag" ...>; numbers inside it that are listed as bad
 *     for that tag are wrapped in <span class="ref-no-error">;
 *  2) every span in 'contexts' is wrapped in <span class="ref-context-error">.
 *
 * The content is returned unchanged (no issues) when it is empty or when the
 * index has neither 'by_raw' nor 'contexts' entries.
 *
 * @param string $content  Section content.
 * @param int    $amId     Section id, copied into each reported issue.
 * @param array  $badIndex Per-section structure produced by indexBadResults
 *                         ('by_raw' and 'contexts').
 * @return array ['html' => marked content, 'issues' => list of issue arrays,
 *               'issue_count' => number of numbers marked in pass 1]
 */
private function markContentForPreview($content, $amId, $badIndex)
{
$badByRaw = isset($badIndex['by_raw']) ? $badIndex['by_raw'] : array();
$contexts = isset($badIndex['contexts']) ? $badIndex['contexts'] : array();
$issues = array();
$issueCount = 0;
if ($content === '' || (empty($badByRaw) && empty($contexts))) {
return array('html' => $content, 'issues' => array(), 'issue_count' => 0);
}
$html = $content;
// 1) Mark the individual numbers inside each blue tag first (working on the
//    original text; for [70-73] only the unreasonable ones, e.g. 70 and 71, are wrapped).
preg_match_all(
self::BLUE_TAG_REGEX,
$html,
$matches,
PREG_OFFSET_CAPTURE
);
// Length change caused by each rewritten tag, used in pass 2 to shift offsets.
$citeDeltas = [];
if (!empty($matches[0])) {
$replacements = [];
foreach ($matches[0] as $idx => $match) {
$fullTag = $match[0];
$tagStart = $match[1];
$tagEnd = $tagStart + strlen($fullTag);
$inner = $matches[1][$idx][0];
// Bad rows for this tag are looked up by the normalized tag text.
$rawKey = $this->normalizeRefRawKey($inner);
$badNums = isset($badByRaw[$rawKey]) ? $badByRaw[$rawKey] : array();
// Wrap each digit run that is a key of $badNums; other numbers are left as they are.
$innerMarked = preg_replace_callback(
'/\d+/',
function ($numMatch) use ($badNums, &$issues, &$issueCount, $amId, $inner) {
$num = intval($numMatch[0]);
if (!isset($badNums[$num])) {
return $numMatch[0];
}
$row = $badNums[$num];
$rowReason = isset($row['reason']) ? $row['reason'] : '';
$issueCount++;
$issues[] = array(
'am_id' => $amId,
'check_id' => $this->resolveCheckRowId($row),
'reference_no' => $num,
'reference_raw' => $inner,
'reason' => $rowReason,
'confidence' => floatval(isset($row['confidence']) ? $row['confidence'] : 0),
);
$title = htmlspecialchars(
'引用[' . $num . ']不合理: ' . $rowReason,
ENT_QUOTES,
'UTF-8'
);
return '<span class="ref-no-error" data-check-id="' . $this->resolveCheckRowId($row)
. '" data-ref-no="' . $num . '" title="' . $title . '">'
. $numMatch[0] . '</span>';
},
$inner
);
$tagClass = !empty($badNums) ? ' ref-cite-error' : '';
$groupIds = !empty($badNums)
? implode(',', array_map(function ($row) {
return (int) $this->resolveCheckRowId($row);
}, $badNums))
: '';
// Every matched tag is rewritten, including tags without bad numbers
// (those get no error class and an empty data-check-ids).
$newHtml = '<blue class="ref-cite-tag' . $tagClass . '" data-ref-raw="' . htmlspecialchars($inner, ENT_QUOTES, 'UTF-8')
. '" data-check-ids="' . $groupIds . '">[' . $innerMarked . ']</blue>';
$replacements[] = [
'start' => $tagStart,
'end' => $tagEnd,
'html' => $newHtml,
'delta' => strlen($newHtml) - ($tagEnd - $tagStart),
];
}
// Apply from the last tag to the first so the offsets of earlier tags stay valid.
usort($replacements, function ($a, $b) {
return $b['start'] - $a['start'];
});
foreach ($replacements as $rep) {
$html = substr($html, 0, $rep['start']) . $rep['html'] . substr($html, $rep['end']);
$citeDeltas[] = ['start' => $rep['start'], 'delta' => $rep['delta']];
}
}
// Translate an offset in the original content into an offset in $html after
// pass 1 by adding the length change of every tag that started before it.
$shiftByCite = function ($pos) use ($citeDeltas) {
$d = 0;
foreach ($citeDeltas as $cd) {
if ($cd['start'] < $pos) {
$d += $cd['delta'];
}
}
return $pos + $d;
};
// 2) Then wrap the citing sentences, working from the end of the text backwards.
// NOTE(review): text_start/text_end are used with substr()/strlen(), so they are
// presumably byte offsets into the original content — confirm against the code
// that produces them.
if (!empty($contexts)) {
$spans = array_values($contexts);
usort($spans, function ($a, $b) {
return $b['text_start'] - $a['text_start'];
});
foreach ($spans as $span) {
$start = $span['text_start'];
$end = $span['text_end'];
// Skip spans with a negative start or a non-positive length.
if ($start < 0 || $end <= $start) {
continue;
}
$s = $shiftByCite($start);
$e = $shiftByCite($end);
// Clamp the end to the current length of $html.
if ($e > strlen($html)) {
$e = strlen($html);
}
$checkIds = array_values(array_unique($span['check_ids']));
$refNos = array_values(array_unique($span['ref_nos']));
sort($refNos);
// Tooltip text: one "[n] reason" part per reference number that has a reason.
$reasonParts = [];
foreach ($refNos as $rn) {
if (!empty($span['reasons'][$rn])) {
$reasonParts[] = '[' . $rn . '] ' . $span['reasons'][$rn];
}
}
$title = htmlspecialchars(
'引用句可能不合理: ' . implode('; ', $reasonParts),
ENT_QUOTES,
'UTF-8'
);
$open = '<span class="ref-context-error" data-check-ids="' . implode(',', $checkIds)
. '" data-ref-nos="' . implode(',', $refNos) . '" title="' . $title . '">';
$close = '</span>';
$html = substr($html, 0, $s) . $open . substr($html, $s, $e - $s) . $close . substr($html, $e);
}
}
return ['html' => $html, 'issues' => $issues, 'issue_count' => $issueCount];
}
/**
 * Load the reference rows (state = 0) of a production article, keyed by their `index` column.
 *
 * Rows are read in ascending `index` order; if two rows share an index, the one
 * read later replaces the earlier one in the map.
 *
 * @param int $pArticleId production_article_refer.p_article_id
 * @return array<int, array> intval(row['index']) => row; empty when $pArticleId <= 0
 */
public function loadReferMapByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        return [];
    }
    $rows = Db::name('production_article_refer')
        ->where('p_article_id', $pArticleId)
        ->where('state', 0)
        // Field and direction are passed separately so the query builder can quote
        // the column name: `index` is a reserved word in MySQL.
        ->order('index', 'asc')
        ->select();
    $map = [];
    foreach ($rows as $row) {
        $map[intval($row['index'])] = $row;
    }
    return $map;
}
/**
 * Render a reference row as the plain-text block handed to the LLM.
 *
 * One "Label: value" line is emitted per non-empty bibliographic field (the label
 * is the field name with its first letter upper-cased), followed by a single
 * "Reference: ..." line taken from refer_frag, or from refer_content when
 * refer_frag is empty.
 *
 * @param mixed $refer reference row, read through arrGet()
 * @return string lines joined with "\n"; empty string when every field is blank
 */
public function formatReferForLlm($refer)
{
    $lines = [];
    $bibliographicFields = ['title', 'author', 'joura', 'dateno', 'refer_doi', 'doilink'];
    foreach ($bibliographicFields as $field) {
        $value = trim((string)$this->arrGet($refer, $field, ''));
        if ($value === '') {
            continue;
        }
        $lines[] = ucfirst($field) . ': ' . $value;
    }
    $fragment = trim((string)$this->arrGet($refer, 'refer_frag', ''));
    $fullText = trim((string)$this->arrGet($refer, 'refer_content', ''));
    // The fragment wins over the full reference text when both are present.
    $referenceLine = $fragment !== '' ? $fragment : $fullText;
    if ($referenceLine !== '') {
        $lines[] = 'Reference: ' . $referenceLine;
    }
    return implode("\n", $lines);
}
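/*
 * NOTE: this comment describes recheckByRefer(), which is defined further below
 * (after enqueueRecheckByPReferId()); it does not describe the method that follows.
 *
 * Re-check after the front end has edited a reference: only existing check records
 * are processed. Their refer_text is refreshed, their result is reset and each one
 * is checked again inside the request. When no record exists the method returns
 * immediately.
 *
 * @param int $articleId
 * @param int $pReferId    t_production_article_refer.p_refer_id (takes precedence)
 * @param int $referenceNo reference number, i.e. index + 1 (used when p_refer_id is not given)
 * @return array
 */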
/**
 * After a reference has been edited, asynchronously re-check every check record
 * that belongs to it (matched by p_article_id + p_refer_id).
 *
 * Steps: refresh refer_text / refer_index / reference_no -> reset
 * status / is_match / can_support / confidence / reason / error_msg ->
 * flag every affected section (am_id) as AM_STATUS_RUNNING -> push the records
 * onto the ReferenceCheck queue.
 *
 * Unlike recheckByRefer(), the LLM is not called inside the request: this method
 * only enqueues and returns. Progress can be polled via getProgressByPArticleId.
 *
 * @param int $pReferId   t_production_article_refer.p_refer_id (required)
 * @param int $pArticleId optional; when > 0 it is used instead of the reference row's p_article_id
 * @return array{p_refer_id:int, p_article_id:int, reference_no:int, reset:int, queued:int, check_ids:int[], queue:string}
 * @throws \InvalidArgumentException when $pReferId is not positive
 * @throws \RuntimeException when the reference row (state = 0) or its p_article_id cannot be found
 */
public function enqueueRecheckByPReferId($pReferId, $pArticleId = 0)
{
    $pReferId = intval($pReferId);
    if ($pReferId <= 0) {
        throw new \InvalidArgumentException('p_refer_id is required');
    }
    $refer = Db::name('production_article_refer')
        ->where('p_refer_id', $pReferId)
        ->where('state', 0)
        ->find();
    if (empty($refer)) {
        throw new \RuntimeException('production_article_refer not found, p_refer_id=' . $pReferId);
    }
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        $pArticleId = intval($this->arrGet($refer, 'p_article_id', 0));
    }
    if ($pArticleId <= 0) {
        throw new \RuntimeException('p_article_id is missing for p_refer_id=' . $pReferId);
    }
    // The stored `index` is zero-based; the number cited in the text is index + 1.
    $referenceNo = intval($this->arrGet($refer, 'index', 0)) + 1;
    $rows = Db::name('article_reference_check_result')
        ->where('p_article_id', $pArticleId)
        ->where('p_refer_id', $pReferId)
        ->select();
    if (empty($rows)) {
        return [
            'p_refer_id' => $pReferId,
            'p_article_id' => $pArticleId,
            'reference_no' => $referenceNo,
            'reset' => 0,
            'queued' => 0,
            'check_ids' => [],
            'queue' => self::QUEUE_NAME,
        ];
    }
    $resetFields = [
        'refer_text' => $this->formatReferForLlm($refer),
        'refer_index' => $referenceNo,
        'reference_no' => $referenceNo,
        'status' => 0,
        'is_match' => 0,
        'can_support' => 0,
        'confidence' => 0,
        'reason' => '',
        'error_msg' => '',
        'updated_at' => date('Y-m-d H:i:s'),
    ];
    $rowIds = [];
    $pendingJobs = [];
    $amIds = [];
    foreach ($rows as $row) {
        $checkId = $this->resolveCheckRowId($row);
        $amId = intval($this->arrGet($row, 'am_id', 0));
        $rowIds[] = $checkId;
        $pendingJobs[] = [
            'check_id' => $checkId,
            'reference_no' => $referenceNo,
            'am_id' => $amId,
            'text_start' => intval($this->arrGet($row, 'text_start', 0)),
        ];
        if ($amId > 0) {
            $amIds[$amId] = true;
        }
    }
    // Every row receives the same payload, so one UPDATE replaces one query per row.
    Db::name('article_reference_check_result')->whereIn('id', $rowIds)->update($resetFields);
    // Stale queue "done" markers must be removed, otherwise re-pushing the same
    // check_id would be silently dropped by the job's lock acquisition.
    foreach ($rowIds as $checkId) {
        $this->clearReferenceCheckQueueLock($checkId);
    }
    foreach (array_keys($amIds) as $amId) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
    }
    $checkIds = $this->pushJobsSortedByReferenceNo($pendingJobs);
    return [
        'p_refer_id' => $pReferId,
        'p_article_id' => $pArticleId,
        'reference_no' => $referenceNo,
        'reset' => count($rows),
        'queued' => count($checkIds),
        'check_ids' => $checkIds,
        'queue' => self::QUEUE_NAME,
    ];
}
/**
 * Synchronously re-check the existing check records of one reference of an article.
 *
 * The reference is resolved by resolveReferForRecheck(). Records of the article
 * whose p_refer_id equals the resolved id OR whose reference_no equals the
 * resolved number are reset, their sections are flagged AM_STATUS_RUNNING, and
 * runReferenceCheckOnce() is called for each of them inside this request.
 * A failing record is logged and collected instead of aborting the batch;
 * afterwards every touched section is re-synchronised with syncAmRefCheckStatus().
 *
 * @param int $articleId   article id (required, > 0)
 * @param int $pReferId    p_refer_id of the reference
 * @param int $referenceNo reference number (index + 1)
 * @return array with no matching record: article_id, p_refer_id, reference_no, reset=0,
 *               queued=0, check_ids=[], queue; otherwise: article_id, p_refer_id,
 *               reference_no, reset, checked, failed, check_ids, results, errors
 * @throws \InvalidArgumentException when $articleId is not positive
 */
public function recheckByRefer($articleId, $pReferId = 0, $referenceNo = 0)
{
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }
    $resolved = $this->resolveReferForRecheck($articleId, intval($pReferId), intval($referenceNo));
    $pReferId = $resolved['p_refer_id'];
    $referenceNo = $resolved['reference_no'];
    $pArticleId = $resolved['p_article_id'];
    $referText = $this->formatReferForLlm($resolved['refer']);
    $now = date('Y-m-d H:i:s');

    $records = Db::name('article_reference_check_result')
        ->where('article_id', $articleId)
        ->where(function ($query) use ($pReferId, $referenceNo) {
            $query->where('p_refer_id', $pReferId)->whereOr('reference_no', $referenceNo);
        })
        ->select();
    if (empty($records)) {
        return [
            'article_id' => $articleId,
            'p_refer_id' => $pReferId,
            'reference_no' => $referenceNo,
            'reset' => 0,
            'queued' => 0,
            'check_ids' => [],
            'queue' => self::QUEUE_NAME,
        ];
    }

    $resetFields = [
        'refer_text' => $referText,
        'p_refer_id' => $pReferId,
        'p_article_id' => $pArticleId,
        'refer_index' => $referenceNo,
        'status' => 0,
        'is_match' => 0,
        'can_support' => 0,
        'confidence' => 0,
        'reason' => '',
        'error_msg' => '',
        'updated_at' => $now,
    ];
    $jobs = [];
    $touchedSections = [];
    foreach ($records as $record) {
        $checkId = $this->resolveCheckRowId($record);
        Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields);
        $sectionId = intval($record['am_id']);
        $jobs[] = [
            'check_id' => $checkId,
            'reference_no' => $referenceNo,
            'am_id' => $sectionId,
            'text_start' => intval(isset($record['text_start']) ? $record['text_start'] : 0),
        ];
        if ($sectionId > 0) {
            $touchedSections[$sectionId] = true;
        }
    }
    $sectionIds = array_keys($touchedSections);
    foreach ($sectionIds as $sectionId) {
        $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
    }

    // Order: reference number, then section, then position inside the section.
    usort($jobs, function ($left, $right) {
        foreach (['reference_no', 'am_id', 'text_start'] as $key) {
            if ($left[$key] !== $right[$key]) {
                return $left[$key] - $right[$key];
            }
        }
        return 0;
    });

    $checkIds = [];
    $results = [];
    $errors = [];
    foreach ($jobs as $job) {
        $checkId = intval($job['check_id']);
        $checkIds[] = $checkId;
        // Drop any leftover queue marker for this record before running it.
        $this->clearReferenceCheckQueueLock($checkId);
        try {
            $results[] = $this->runReferenceCheckOnce($checkId);
        } catch (\Exception $e) {
            $errors[] = [
                'check_id' => $checkId,
                'error' => $e->getMessage(),
            ];
            \think\Log::error('recheckByRefer check_id=' . $checkId . ' ' . $e->getMessage());
        }
    }
    foreach ($sectionIds as $sectionId) {
        $this->syncAmRefCheckStatus($sectionId);
    }

    return [
        'article_id' => $articleId,
        'p_refer_id' => $pReferId,
        'reference_no' => $referenceNo,
        'reset' => count($records),
        'checked' => count($results),
        'failed' => count($errors),
        'check_ids' => $checkIds,
        'results' => $results,
        'errors' => $errors,
    ];
}
/**
 * Delete the Redis markers kept by the queue for one check record, so that a
 * re-pushed job for the same check_id is not silently discarded by the job lock.
 *
 * For both job classes (ReferenceCheck and ReferenceCheckTwo) the base key and
 * its ":status" companion are removed. A failure is logged as a warning and
 * otherwise ignored.
 *
 * @param int $checkId article_reference_check_result id; values <= 0 are ignored
 * @return void
 */
public function clearReferenceCheckQueueLock($checkId)
{
    $checkId = intval($checkId);
    if ($checkId <= 0) {
        return;
    }
    // Key prefix => job class name embedded in the Redis key.
    $jobClassByPrefix = [
        'queue_job' => 'app\\api\\job\\ReferenceCheck',
        'queue_job_two' => 'app\\api\\job\\ReferenceCheckTwo',
    ];
    try {
        $keys = [];
        foreach ($jobClassByPrefix as $prefix => $jobClass) {
            $lockKey = $prefix . ':' . $jobClass . ':' . $checkId;
            array_push($keys, $lockKey, $lockKey . ':status');
        }
        QueueRedis::getInstance()->deleteRedisKeys($keys);
    } catch (\Exception $e) {
        \think\Log::warning('clearReferenceCheckQueueLock id=' . $checkId . ' ' . $e->getMessage());
    }
}
/**
 * Run one synchronous LLM reference check and write the outcome back to
 * article_reference_check_result.
 *
 * Content A is the section text from resolveMainContentForJob(). Content B is the
 * reference text: re-formatted from production_article_refer (state = 0) when the
 * record's p_refer_id resolves to a row, otherwise the record's stored refer_text.
 *
 * @param int $checkId article_reference_check_result.id
 * @return array{check_id:int, can_support:int, is_match:int, confidence:float, reason:mixed}
 * @throws \RuntimeException when the record does not exist, when either content is
 *                           empty (status 2 is written first), or when the LLM result
 *                           carries request_failed (status 2 + error_msg are written first)
 */
public function runReferenceCheckOnce($checkId)
{
    $checkId = intval($checkId);
    $record = Db::name('article_reference_check_result')->where('id', $checkId)->find();
    if (empty($record)) {
        throw new \RuntimeException('article_reference_check_result not found, id=' . $checkId);
    }

    $sectionText = $this->resolveMainContentForJob($record);
    $referId = intval($record['p_refer_id']);
    $refer = null;
    if ($referId > 0) {
        $refer = Db::name('production_article_refer')
            ->where('p_refer_id', $referId)
            ->where('state', 0)
            ->find();
    }
    $referText = $refer
        ? $this->formatReferForLlm($refer)
        : trim((string)$this->arrGet($record, 'refer_text', ''));

    if ($sectionText === '' || $referText === '') {
        $missing = 'Missing article_main.content or refer_text';
        $this->updateCheckResult($checkId, [
            'status' => 2,
            'error_msg' => $missing,
        ]);
        throw new \RuntimeException($missing);
    }

    $llmResult = (new LLMService())->checkReference($sectionText, $referText, false);
    $requestFailed = !empty($llmResult['request_failed']);
    $supportFlag = $this->parseLlmCanSupport($llmResult) ? 1 : 0;
    $confidence = floatval(isset($llmResult['confidence']) ? $llmResult['confidence'] : 0);
    $reason = isset($llmResult['reason']) ? $llmResult['reason'] : '';

    if ($requestFailed) {
        // Communication failure: persist status 2 plus the error text, then throw.
        // Per the original author's note the queue worker is expected to release and
        // retry the job on this exception (not verifiable from this method alone).
        $this->updateCheckResult($checkId, [
            'confidence' => $confidence,
            'reason' => $reason,
            'status' => 2,
            'error_msg' => $reason,
        ]);
        $this->clearReferenceCheckQueueLock($checkId);
        throw new \RuntimeException($reason !== '' ? $reason : 'LLM request failed');
    }

    $this->updateCheckResult($checkId, [
        'can_support' => $supportFlag,
        'is_match' => $supportFlag,
        'confidence' => $confidence,
        'reason' => $reason,
        'status' => 1,
        'error_msg' => '',
    ]);
    $this->clearReferenceCheckQueueLock($checkId);
    // A low-confidence result may schedule the DOI-based second pass.
    $this->maybeEnqueueSecondPass($checkId, $confidence);

    return [
        'check_id' => $checkId,
        'can_support' => $supportFlag,
        'is_match' => $supportFlag,
        'confidence' => $confidence,
        'reason' => $reason,
    ];
}
/**
 * Resolve the reference row targeted by a re-check request.
 *
 * The production article is looked up by article_id (state 0 or 2). The reference
 * is then found by p_refer_id when that is positive, otherwise by reference number
 * (index = referenceNo - 1) within that production article.
 *
 * @param int $articleId
 * @param int $pReferId
 * @param int $referenceNo
 * @return array{refer: array, p_article_id: int, p_refer_id: int, reference_no: int}
 * @throws \RuntimeException when the production article or the reference row is not found
 * @throws \InvalidArgumentException when neither $pReferId nor $referenceNo is positive
 */
private function resolveReferForRecheck($articleId, $pReferId, $referenceNo)
{
    $production = Db::name('production_article')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->find();
    if (empty($production)) {
        throw new \RuntimeException('production_article not found for article_id=' . $articleId);
    }
    $pArticleId = intval($production['p_article_id']);

    if ($pReferId <= 0 && $referenceNo <= 0) {
        throw new \InvalidArgumentException('p_refer_id or reference_no is required');
    }
    if ($pReferId > 0) {
        $refer = Db::name('production_article_refer')
            ->where('p_refer_id', $pReferId)
            ->where('p_article_id', $pArticleId)
            ->where('state', 0)
            ->find();
    } else {
        $byIndex = $this->loadReferMapByPArticleId($pArticleId);
        $slot = $referenceNo - 1;
        $refer = isset($byIndex[$slot]) ? $byIndex[$slot] : null;
    }
    if (empty($refer)) {
        throw new \RuntimeException('production_article_refer not found');
    }

    // Id and number are always taken from the row that was actually found.
    return [
        'refer' => $refer,
        'p_article_id' => $pArticleId,
        'p_refer_id' => intval($refer['p_refer_id']),
        'reference_no' => intval($refer['index']) + 1,
    ];
}
/**
 * Extract a DOI using only the refer_doi field (used for the second-pass
 * Crossref abstract lookup).
 *
 * @param mixed $refer reference row
 * @return string first DOI found in refer_doi; '' when $refer is not an array, the
 *                field is blank or contains "not available", or no DOI is found
 */
public function extractReferDoiOnly($refer)
{
    if (!is_array($refer)) {
        return '';
    }
    $raw = trim((string)$this->arrGet($refer, 'refer_doi', ''));
    $unusable = ($raw === '') || (stripos($raw, 'not available') !== false);
    if ($unusable) {
        return '';
    }
    $found = $this->extractDoisFromString($raw);
    if (empty($found)) {
        return '';
    }
    return $found[0];
}
/**
 * Fetch the Crossref block for the DOI stored in refer_doi (second pass only).
 *
 * @param mixed $refer reference row
 * @return array{text:string, has_abstract:bool, doi:string} text/has_abstract stay
 *         ''/false when no DOI can be extracted or Crossref yields no block; doi is
 *         the extracted DOI, or '' when none was found
 */
public function fetchCrossrefAbstractByReferDoi($refer)
{
    $outcome = ['text' => '', 'has_abstract' => false, 'doi' => ''];
    $doi = $this->extractReferDoiOnly($refer);
    if ($doi === '') {
        return $outcome;
    }
    $outcome['doi'] = $doi;

    $client = new CrossrefService([
        'mailto' => trim((string)Env::get('crossref_mailto', '')),
    ]);
    $block = $this->extractCrossrefBlock($doi, $client);
    if ($block !== null) {
        $outcome['text'] = $block['text'];
        $outcome['has_abstract'] = !empty($block['has_abstract']);
    }
    return $outcome;
}
/**
 * Read the "can the reference support the text" verdict from an LLM result.
 *
 * The can_support key is used whenever it is present (even when null); otherwise
 * is_match is used, and false when that is not set either. The raw value is
 * normalised by parseLlmIsMatch().
 *
 * @param mixed $llmResult decoded LLM result
 * @return mixed result of parseLlmIsMatch(); false when $llmResult is not an array
 */
public function parseLlmCanSupport($llmResult)
{
    if (!is_array($llmResult)) {
        return false;
    }
    if (array_key_exists('can_support', $llmResult)) {
        $verdict = $llmResult['can_support'];
    } elseif (isset($llmResult['is_match'])) {
        $verdict = $llmResult['is_match'];
    } else {
        $verdict = false;
    }
    return $this->parseLlmIsMatch($verdict);
}
/**
 * First-pass input: the plain text of the whole section (article_main.content)
 * that a check record belongs to.
 *
 * Citation tags are reduced to "[n]", remaining markup is stripped, HTML entities
 * are decoded, whitespace runs are collapsed to one space, and the result is cut
 * to $maxChars characters (never fewer than 500) with "..." appended when cut.
 *
 * @param array $row      check record; only am_id is read
 * @param int   $maxChars maximum number of characters to keep
 * @return string '' when am_id is missing, the section is not found or its content is blank
 */
public function resolveMainContentForJob(array $row, $maxChars = 8000)
{
    $amId = intval($this->arrGet($row, 'am_id', 0));
    if ($amId <= 0) {
        return '';
    }
    $main = Db::name('article_main')
        ->field('content')
        ->where('am_id', $amId)
        ->find();
    if (empty($main)) {
        return '';
    }
    $text = trim((string)$this->arrGet($main, 'content', ''));
    if ($text === '') {
        return '';
    }
    // preg_replace() returns null on a PCRE error (for example malformed UTF-8 under
    // the /u modifier). Keep the previous text in that case instead of letting the
    // whole section silently collapse to an empty string.
    $unwrapped = preg_replace(self::BLUE_TAG_REGEX, '[$1]', $text);
    if ($unwrapped !== null) {
        $text = $unwrapped;
    }
    $text = strip_tags($text);
    $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    $collapsed = preg_replace('/\s+/u', ' ', $text);
    if ($collapsed === null) {
        // Byte-wise fallback when the text is not valid UTF-8.
        $collapsed = preg_replace('/\s+/', ' ', $text);
    }
    $text = trim((string)$collapsed);
    $maxChars = max(500, intval($maxChars));
    // The text was decoded as UTF-8 above, so measure and cut it as UTF-8 explicitly.
    if (mb_strlen($text, 'UTF-8') > $maxChars) {
        $text = mb_substr($text, 0, $maxChars, 'UTF-8') . '...';
    }
    return $text;
}
/**
 * Local context around a citation, as stored on the check record.
 *
 * @param array $row check record
 * @return string trimmed origin_text, or trimmed content_a when origin_text is blank
 *                ('' when both are blank)
 */
public function resolveCitationContextForJob(array $row)
{
    foreach (['origin_text', 'content_a'] as $column) {
        $candidate = trim((string)$this->arrGet($row, $column, ''));
        if ($candidate !== '') {
            return $candidate;
        }
    }
    return '';
}
/**
 * Extract one standard DOI (10.xxxx/...) from a reference row.
 *
 * Returns the first entry of extractAllDoiCandidatesFromRefer(), whose field
 * priority is refer_content > refer_doi > doi > doilink.
 *
 * @param mixed $refer reference row
 * @return string the highest-priority DOI, or '' when none is found
 */
public function extractDoiFromRefer($refer)
{
    $candidates = $this->extractAllDoiCandidatesFromRefer($refer);
    if (empty($candidates)) {
        return '';
    }
    return $candidates[0];
}
/**
 * Collect every DOI a reference row may correspond to, de-duplicated and in
 * priority order.
 *
 * Fields are scanned in the order refer_content, refer_doi, doi, doilink, so a DOI
 * quoted in the original citation text comes before the metadata DOI when the two
 * differ.
 *
 * @param mixed $refer reference row
 * @return string[] empty when $refer is not an array or no DOI is found
 */
public function extractAllDoiCandidatesFromRefer($refer)
{
    if (!is_array($refer)) {
        return [];
    }
    $unique = [];
    foreach (['refer_content', 'refer_doi', 'doi', 'doilink'] as $field) {
        $raw = (string)$this->arrGet($refer, $field, '');
        foreach ($this->extractDoisFromString($raw) as $doi) {
            if (in_array($doi, $unique, true)) {
                continue;
            }
            $unique[] = $doi;
        }
    }
    return $unique;
}
/**
 * Pull every DOI of the form 10.xxxx/yyy out of arbitrary text.
 *
 * Two passes are made: DOIs that follow "doi.org/", then bare "10.NNNN/..."
 * tokens. Each candidate has trailing punctuation trimmed and must pass
 * isValidDoi(). Text that is blank or contains "not available" yields nothing.
 *
 * @param mixed $text
 * @return string[] de-duplicated DOIs in order of first appearance across the passes
 */
private function extractDoisFromString($text)
{
    $text = trim((string)$text);
    if ($text === '' || stripos($text, 'not available') !== false) {
        return [];
    }
    $patterns = [
        '~doi\.org/([^\s?#"\'<>]+)~i',
        '~\b(10\.\d{3,9}/[^\s?#"\'<>]+)~i',
    ];
    $dois = [];
    foreach ($patterns as $pattern) {
        if (!preg_match_all($pattern, $text, $hits)) {
            continue;
        }
        foreach ($hits[1] as $candidate) {
            $candidate = $this->trimDoiTail(trim($candidate));
            if ($this->isValidDoi($candidate)) {
                $dois[] = $candidate;
            }
        }
    }
    // Last resort: the whole text starts with "10." and is itself a valid DOI.
    if ($dois === [] && strpos($text, '10.') === 0) {
        $whole = $this->trimDoiTail($text);
        if ($this->isValidDoi($whole)) {
            $dois[] = $whole;
        }
    }
    return array_values(array_unique($dois));
}
/**
 * Remove trailing punctuation, quotes, closing brackets, backslashes and
 * whitespace that tend to stick to a DOI copied out of running text.
 *
 * @param string $doi
 * @return string
 */
private function trimDoiTail($doi)
{
    $trailingNoise = ".,;:)]}>\"'\\ \t\n\r";
    return rtrim($doi, $trailingNoise);
}
/**
 * Check that a string has the shape "10.<3-9 digits>/<non-space suffix>".
 *
 * @param mixed $doi
 * @return bool
 */
private function isValidDoi($doi)
{
    $subject = (string)$doi;
    return preg_match('~^10\.\d{3,9}/[^\s]+$~i', $subject) === 1;
}
/**
 * Fetch the literature text behind a reference's DOI via PubMed / Crossref.
 * (Original author's note: the local LLM cannot open web pages, so the content
 * has to be fetched up front.)
 *
 * Behaviour:
 *  - every DOI candidate of the row is tried in priority order
 *    (refer_content > refer_doi > doi > doilink);
 *  - the first candidate that yields an abstract wins;
 *  - otherwise the first candidate that yielded any block is used;
 *  - '' is returned when nothing could be fetched.
 *
 * @param mixed $refer reference row
 * @return string text block, or ''
 */
public function fetchDoiLiteratureBlock($refer)
{
    $candidates = $this->extractAllDoiCandidatesFromRefer($refer);
    if (empty($candidates)) {
        return '';
    }
    $pubmed = new PubmedService([
        'email' => trim((string)Env::get('pubmed_email', '')),
        'tool' => trim((string)Env::get('pubmed_tool', 'tmrjournals')),
    ]);
    $crossref = new CrossrefService([
        'mailto' => trim((string)Env::get('crossref_mailto', '')),
    ]);

    $firstUsable = null;
    foreach ($candidates as $doi) {
        $block = $this->buildDoiBlockFromSources($doi, $pubmed, $crossref);
        if ($block === null) {
            continue;
        }
        if (!empty($block['has_abstract'])) {
            // An abstract is the best outcome available: stop searching.
            return $block['text'];
        }
        if ($firstUsable === null) {
            $firstUsable = $block;
        }
    }
    if ($firstUsable === null) {
        return '';
    }
    return $firstUsable['text'];
}
/**
 * Fetch the content behind a single DOI.
 *
 * PubMed is asked first. When it returns a record with an abstract or a title, a
 * text block is built from it; if that record has no abstract, a Crossref block
 * that does have one is appended. When PubMed returns nothing usable, the
 * Crossref block alone is returned.
 *
 * @param mixed           $doi
 * @param PubmedService   $pubmed
 * @param CrossrefService $crossref
 * @return array|null ['text' => string, 'has_abstract' => bool], or null when the DOI
 *                    is blank or Crossref yields no block
 */
private function buildDoiBlockFromSources($doi, PubmedService $pubmed, CrossrefService $crossref)
{
    $doi = trim((string)$doi);
    if ($doi === '') {
        return null;
    }
    $pub = $pubmed->fetchByDoi($doi);
    $isRecord = is_array($pub);
    $pubAbstract = $isRecord ? trim((string)$this->arrGet($pub, 'abstract', '')) : '';
    $pubTitle = $isRecord ? trim((string)$this->arrGet($pub, 'title', '')) : '';
    if (!$isRecord || ($pubAbstract === '' && $pubTitle === '')) {
        return $this->extractCrossrefBlock($doi, $crossref);
    }

    $lines = ['Source: PubMed (DOI ' . $doi . ')'];
    $scalarLabels = [
        'title' => 'Actual Title: ',
        'journal' => 'Journal: ',
        'year' => 'Year: ',
    ];
    foreach ($scalarLabels as $key => $label) {
        if (!empty($pub[$key])) {
            $lines[] = $label . trim((string)$pub[$key]);
        }
    }
    $listLabels = [
        'publication_types' => 'Publication Types: ',
        'mesh_terms' => 'MeSH: ',
    ];
    foreach ($listLabels as $key => $label) {
        if (!empty($pub[$key])) {
            $lines[] = $label . implode('; ', (array)$pub[$key]);
        }
    }

    if ($pubAbstract !== '') {
        $lines[] = 'Abstract: ' . $this->truncate($pubAbstract, 3500);
        return ['text' => implode("\n", $lines), 'has_abstract' => true];
    }

    // PubMed knows the paper but has no abstract: try to supplement from Crossref.
    $supplement = $this->extractCrossrefBlock($doi, $crossref);
    if ($supplement !== null && $supplement['has_abstract']) {
        $lines[] = "\n--- Crossref 补充 ---\n" . $supplement['text'];
        return ['text' => implode("\n", $lines), 'has_abstract' => true];
    }
    return ['text' => implode("\n", $lines), 'has_abstract' => false];
}
/**
 * Build a text block (title / journal / authors / publication / DOI link /
 * retraction / abstract) from Crossref for one DOI. The abstract is passed through
 * cleanCrossrefAbstract() because Crossref usually wraps it in JATS XML.
 *
 * @param string          $doi
 * @param CrossrefService $crossref
 * @return array|null ['text' => string, 'has_abstract' => bool]; null when fetchWork()
 *                    does not return an array
 */
private function extractCrossrefBlock($doi, CrossrefService $crossref)
{
    $work = $crossref->fetchWork($doi);
    if (!is_array($work)) {
        return null;
    }
    $summary = $crossref->fetchWorkSummary($doi);
    if (!is_array($summary)) {
        $summary = [];
    }

    $lines = ['Source: Crossref api.crossref.org/works/' . rawurlencode($doi)];

    // Prefer the title of the raw work record; fall back to the summary's title.
    if (isset($work['title'][0])) {
        $title = trim((string)$work['title'][0]);
    } else {
        $title = trim((string)$this->arrGet($summary, 'title', ''));
    }
    if ($title !== '') {
        $lines[] = 'Actual Title: ' . $title;
    }

    $summaryLabels = [
        'joura' => 'Journal: ',
        'author_str' => 'Authors: ',
        'dateno' => 'Publication: ',
        'doilink' => 'DOI Link: ',
    ];
    foreach ($summaryLabels as $key => $label) {
        if (!empty($summary[$key])) {
            $lines[] = $label . trim((string)$summary[$key]);
        }
    }
    if (!empty($summary['is_retracted'])) {
        $lines[] = 'Retraction: yes - ' . trim((string)$this->arrGet($summary, 'retract_reason', ''));
    }

    $abstract = $this->cleanCrossrefAbstract((string)$this->arrGet($work, 'abstract', ''));
    $hasAbstract = ($abstract !== '');
    $lines[] = $hasAbstract
        ? 'Abstract: ' . $this->truncate($abstract, 3500)
        : 'Note: Crossref 未返回摘要,请结合标题/期刊/作者与正文谨慎判断。';

    return ['text' => implode("\n", $lines), 'has_abstract' => $hasAbstract];
}
/**
 * Turn a Crossref abstract (usually JATS XML) into plain text.
 *
 * <jats:title> elements are dropped with their content, each <jats:p> opens a new
 * line, all other tags are removed, runs of spaces/tabs collapse to one space,
 * line endings become "\n" and blank lines are squeezed out.
 *
 * @param mixed $raw
 * @return string cleaned abstract; '' for blank input
 */
private function cleanCrossrefAbstract($raw)
{
    $text = trim((string)$raw);
    if ($text === '') {
        return '';
    }
    // [pattern, replacement] pairs, applied in order.
    $jatsRules = [
        ['~<jats:title[^>]*>.*?</jats:title>~is', ''],
        ['~<jats:p[^>]*>~i', "\n"],
        ['~</jats:p>~i', ''],
        ['~</?jats:[^>]+>~i', ''],
    ];
    foreach ($jatsRules as $rule) {
        $text = preg_replace($rule[0], $rule[1], $text);
    }
    $text = strip_tags($text);
    $whitespaceRules = [
        ['/[ \t]+/u', ' '],
        ["/\r\n|\r/u", "\n"],
        ["/\n{2,}/u", "\n"],
    ];
    foreach ($whitespaceRules as $rule) {
        $text = preg_replace($rule[0], $rule[1], $text);
    }
    return trim($text);
}
/**
 * Cut text to at most $max characters, appending "..." when something was removed.
 *
 * @param mixed $text
 * @param int   $max maximum number of characters (as counted by mb_strlen)
 * @return string
 */
private function truncate($text, $max)
{
    $text = (string)$text;
    return mb_strlen($text) > $max
        ? mb_substr($text, 0, $max) . '...'
        : $text;
}
/**
 * Prepare the data for the second (DOI-based) check: the bibliographic text plus
 * the content actually fetched from Crossref.
 *
 * @param mixed  $refer     reference row
 * @param string $referText bibliographic text to use; when blank it is rebuilt with
 *                          formatReferForLlm()
 * @return array{refer_text:string, doi_block:string, has_abstract:bool, doi_used:string}
 */
public function prepareRecheckPayload($refer, $referText = '')
{
    $bibliography = trim($referText);
    if ($bibliography === '') {
        $bibliography = $this->formatReferForLlm($refer);
    }
    $fetched = $this->fetchCrossrefAbstractByReferDoi($refer);
    return [
        'refer_text' => $bibliography,
        'doi_block' => $fetched['text'],
        'has_abstract' => $fetched['has_abstract'],
        'doi_used' => $fetched['doi'],
    ];
}
/**
 * Legacy interface: the second-pass payload joined into one text block. Kept for
 * backward compatibility; callers are advised to use prepareRecheckPayload().
 *
 * @param mixed  $refer
 * @param string $referText
 * @return string bibliographic text followed either by the fetched Crossref block
 *                or by a notice that nothing could be fetched
 */
public function formatReferForDoiRecheck($refer, $referText = '')
{
    $payload = $this->prepareRecheckPayload($refer, $referText);
    $appendix = $payload['doi_block'] === ''
        ? "\n\n【DOI 文献真实内容】\n未能从 PubMed/Crossref 获取该 DOI 的摘要或元数据,请依据书目条目与正文谨慎判断。"
        : "\n\n【Crossref 摘要(依据 Refer_doi 从 api.crossref.org/works 获取)】\n" . $payload['doi_block'];
    return $payload['refer_text'] . $appendix;
}
/**
 * Schedule the second (DOI-based) check when the first pass was not confident
 * and real content for the DOI can be fetched.
 *
 * The second pass is skipped (false is returned) when
 *  - the check id is invalid or the first-pass confidence is above
 *    PASS_CONFIDENCE_THRESHOLD,
 *  - the check record or its reference row (state = 0) does not exist,
 *  - refer_doi holds no DOI or Crossref returns no abstract,
 * because re-running without new material would only repeat the first verdict.
 *
 * @param int   $checkId    article_reference_check_result.id
 * @param float $confidence confidence reported by the first pass
 * @return bool true when a ReferenceCheckTwo job was pushed (delayed by 5 seconds)
 */
public function maybeEnqueueSecondPass($checkId, $confidence)
{
    $checkId = intval($checkId);
    $confidence = floatval($confidence);
    // Uses the class constant instead of repeating its value (0.65) as a literal,
    // so the two thresholds cannot drift apart.
    if ($checkId <= 0 || $confidence > self::PASS_CONFIDENCE_THRESHOLD) {
        return false;
    }
    $row = Db::name('article_reference_check_result')->where('id', $checkId)->find();
    if (empty($row)) {
        return false;
    }
    $referId = intval($row['p_refer_id']);
    if ($referId <= 0) {
        return false;
    }
    $refer = Db::name('production_article_refer')
        ->where('p_refer_id', $referId)
        ->where('state', 0)
        ->find();
    if (empty($refer)) {
        return false;
    }
    // fetchCrossrefAbstractByReferDoi() reports has_abstract = false when refer_doi
    // holds no DOI, so no separate DOI check is needed before calling it.
    $fetched = $this->fetchCrossrefAbstractByReferDoi($refer);
    if (empty($fetched['has_abstract'])) {
        return false;
    }
    $this->clearReferenceCheckQueueLock($checkId);
    $this->pushJob2($checkId, 5);
    return true;
}
/**
 * Extract every citation tag from a section's HTML together with its local context.
 *
 * Tags are found with BLUE_TAG_REGEX. A tag is skipped when its context is not
 * meaningful or when its inner text expands to no reference number.
 *
 * @param string $content article_main.content
 * @return array[] one entry per kept tag: reference_raw, reference_numbers,
 *                 original_text, reference_start, reference_end, text_start, text_end
 *                 (all offsets are byte offsets into $content)
 */
public function extractReferences($content)
{
    preg_match_all(self::BLUE_TAG_REGEX, $content, $matches, PREG_OFFSET_CAPTURE);
    if (empty($matches[0])) {
        return [];
    }
    // Byte span of every tag; needed to bound the context of neighbouring citations.
    $tagSpans = [];
    foreach ($matches[0] as $i => $hit) {
        $tagSpans[] = [
            'start' => $hit[1],
            'end' => $hit[1] + strlen($hit[0]),
            'index' => $i,
        ];
    }

    $references = [];
    foreach ($tagSpans as $span) {
        $rawRef = trim($matches[1][$span['index']][0]);
        $numbers = $this->expandReferenceNumbers($rawRef);
        $context = $this->extractLocalCitationContext(
            $content,
            $span['start'],
            $span['end'],
            $tagSpans
        );
        $contextText = $context[2];
        if (empty($numbers) || !$this->isMeaningfulCitationContext($contextText)) {
            continue;
        }
        $references[] = [
            'reference_raw' => $rawRef,
            'reference_numbers' => $numbers,
            'original_text' => $contextText,
            'reference_start' => $span['start'],
            'reference_end' => $span['end'],
            'text_start' => $context[0],
            'text_end' => $context[1],
        ];
    }
    return $references;
}
/**
 * Cut the local context of one citation tag out of the section content.
 *
 * The text in front of the tag is preferred. For the first citation of a
 * paragraph the context runs from the (capped) paragraph start to the tag; for a
 * later citation in the same paragraph it runs from the start of the current
 * sentence to the tag. Only for a paragraph's first citation, and only when the
 * text in front is not meaningful or very short, the text after the tag is used
 * instead. If the result is still not meaningful, the bounds are widened.
 *
 * @param string $content  section content (HTML)
 * @param int    $tagStart byte offset where the citation tag starts
 * @param int    $tagEnd   byte offset just past the citation tag
 * @param array  $tagSpans spans of all citation tags, each with 'start' and 'end'
 * @return array [int $localStart, int $localEnd, string $originalText]
 */
private function extractLocalCitationContext($content, $tagStart, $tagEnd, array $tagSpans)
{
$paragraphStart = $this->findParagraphStart($content, $tagStart);
$sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd);
// Closest tag ending before this one (within the paragraph) and closest tag
// starting after it (within the sentence).
$prevTagEnd = $paragraphStart;
$nextTagStart = $sentenceEnd;
foreach ($tagSpans as $span) {
if ($span['end'] <= $tagStart && $span['end'] > $prevTagEnd) {
$prevTagEnd = $span['end'];
}
if ($span['start'] > $tagEnd && $span['start'] < $nextTagStart) {
$nextTagStart = $span['start'];
}
}
$hasPriorCiteInParagraph = ($prevTagEnd > $paragraphStart);
$sentenceStart = $this->findSentenceStart($content, $tagStart);
// First citation in the paragraph: the whole paragraph up to the tag. Later
// citations: the start of the current sentence (which may lie before the previous
// tag), so the context is not reduced to a fragment such as "and external
// environment" that would then wrongly fall back to the text after the tag.
if ($hasPriorCiteInParagraph) {
$localStart = max($paragraphStart, $sentenceStart);
} else {
$localStart = $this->capContextStartBeforeTag($content, $tagStart, $paragraphStart);
}
// Default: the statement in front of the citation tag.
$localEnd = $tagStart;
$originalText = $this->buildCitationContextText($content, $localStart, $localEnd);
// The text after the tag is used only for the first citation of a paragraph whose
// leading text is very short (e.g. a sentence ending in "ICU nurses [14]"). With
// several citations in one paragraph it is never used, because it would pick up
// the next sentence.
$allowTrailing = !$hasPriorCiteInParagraph;
if ($allowTrailing && (
!$this->isMeaningfulCitationContext($originalText)
|| $this->shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)
)) {
// Stop at the next citation tag if one starts before the sentence ends.
$trailEnd = ($nextTagStart < $sentenceEnd) ? $nextTagStart : $sentenceEnd;
$trailText = $this->buildCitationContextText($content, $tagEnd, $trailEnd);
if ($this->isMeaningfulCitationContext($trailText)) {
$localStart = $tagEnd;
$localEnd = $trailEnd;
$originalText = $trailText;
}
}
// Last resort: widen the bounds by a sentence on each side.
if (!$this->isMeaningfulCitationContext($originalText)) {
list($localStart, $localEnd) = $this->widenCitationContextBounds(
$content,
$tagStart,
$tagEnd,
$localStart,
$localEnd
);
$originalText = $this->buildCitationContextText($content, $localStart, $localEnd);
}
return [$localStart, $localEnd, $originalText];
}
/**
 * Decide whether the text after the tag should replace the text in front of it:
 * true when the leading text is not meaningful or shorter than 25 characters
 * (for example only an author abbreviation).
 *
 * @param string $content
 * @param int    $localStart byte offset where the leading context starts
 * @param int    $tagStart   byte offset where the citation tag starts
 * @param int    $tagEnd     not used by this method
 * @return bool
 */
private function shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)
{
    $lead = $this->buildCitationContextText($content, $localStart, $tagStart);
    return !$this->isMeaningfulCitationContext($lead) || mb_strlen($lead) < 25;
}
/**
 * Expand the inner text of a citation tag (e.g. "1,3-5") into reference numbers.
 *
 * Typographic variants of the comma and the hyphen are first normalised to ASCII.
 * Parts are separated by commas; "a-b" expands to the inclusive range when
 * a <= b, a plain number is taken as-is, and anything else is ignored.
 *
 * @param string $refStr inner text of the tag, without the square brackets
 * @return int[] unique numbers in order of first appearance
 */
public function expandReferenceNumbers($refStr)
{
    // The variants are written as UTF-8 byte escapes so that they cannot be lost or
    // altered when the file is re-encoded, copied or rendered: an empty search
    // string makes str_replace() skip the entry without any warning.
    $typographic = [
        "\xEF\xBC\x8C", // U+FF0C FULLWIDTH COMMA
        "\xE2\x80\x93", // U+2013 EN DASH
        "\xE2\x80\x94", // U+2014 EM DASH
        "\xE2\x88\x92", // U+2212 MINUS SIGN
        "\xE2\x80\x90", // U+2010 HYPHEN
        "\xE2\x80\x91", // U+2011 NON-BREAKING HYPHEN
    ];
    $ascii = [',', '-', '-', '-', '-', '-'];
    $refStr = str_replace($typographic, $ascii, trim($refStr));

    $numbers = [];
    foreach (explode(',', $refStr) as $part) {
        $part = trim($part);
        if ($part === '') {
            continue;
        }
        if (preg_match('/^(\d+)\s*-\s*(\d+)$/', $part, $m)) {
            $start = intval($m[1]);
            $end = intval($m[2]);
            // A reversed range such as "5-2" is ignored.
            if ($start <= $end) {
                $numbers = array_merge($numbers, range($start, $end));
            }
        } elseif (ctype_digit($part)) {
            $numbers[] = intval($part);
        }
    }
    return array_values(array_unique($numbers));
}
/**
 * Byte offset just past the UTF-8 code point that starts at $bytePos, i.e. where
 * the next character begins.
 *
 * @param string $content
 * @param int    $bytePos byte offset of the first byte of a character
 * @return int for an offset outside the string, $bytePos + 1 clamped to [0, strlen($content)]
 */
private function utf8CharEnd($content, $bytePos)
{
    $total = strlen($content);
    $inRange = ($bytePos >= 0 && $bytePos < $total);
    if (!$inRange) {
        return max(0, min($total, $bytePos + 1));
    }
    // Skip the continuation bytes (bit pattern 10xxxxxx) of the current character.
    for ($cursor = $bytePos + 1; $cursor < $total; $cursor++) {
        if ((ord($content[$cursor]) & 0xC0) !== 0x80) {
            break;
        }
    }
    return $cursor;
}
/**
 * Cut $content by byte offsets [$start, $end), consistent with strpos()/strlen().
 *
 * mb_substr() must not be used here: it counts characters, so with multi-byte text
 * in front of the span the byte offsets would point at the wrong place.
 *
 * @param string $content
 * @param int    $start byte offset of the first byte
 * @param int    $end   byte offset just past the last byte
 * @return string '' when $end <= $start
 */
private function byteSubstr($content, $start, $end)
{
    $span = $end - $start;
    if ($span <= 0) {
        return '';
    }
    return (string)mb_strcut($content, $start, $span, 'UTF-8');
}
/**
 * Turn the byte span [$start, $end) of the content into plain context text:
 * citation tags are removed, markup is stripped, whitespace runs collapse to one
 * space and a leading UTF-8 byte order mark is dropped.
 *
 * @param string $content
 * @param int    $start byte offset of the first byte
 * @param int    $end   byte offset just past the last byte
 * @return string
 */
private function buildCitationContextText($content, $start, $end)
{
    $text = $this->byteSubstr($content, $start, $end);
    $text = preg_replace(self::BLUE_TAG_REGEX, '', $text);
    $text = trim(strip_tags($text));
    $text = (string)preg_replace('/\s+/u', ' ', $text);
    // Remove the BOM only as a complete 3-byte sequence. ltrim() with "\xEF\xBB\xBF"
    // treats the argument as a SET of bytes, so it also ate the leading 0xEF of any
    // character encoded EF xx xx (e.g. fullwidth punctuation), leaving invalid UTF-8.
    $bom = "\xEF\xBB\xBF";
    while (strncmp($text, $bom, 3) === 0) {
        $text = (string)substr($text, 3);
    }
    return $text;
}
/**
 * Reject contexts that carry no information: blank text, punctuation/symbols
 * only, no letter or digit at all, or fewer than 2 characters (for example the
 * lone "." left after a tag was removed).
 *
 * @param string $text
 * @return bool
 */
private function isMeaningfulCitationContext($text)
{
    $text = trim($text);
    $hasSubstance = $text !== ''
        && !$this->isOnlyPunctuationOrSpace($text)
        && preg_match('/[\p{L}\p{N}]/u', $text);
    if (!$hasSubstance) {
        return false;
    }
    return mb_strlen($text) >= 2;
}
/**
 * True when the text is non-empty and consists solely of whitespace, punctuation
 * (\p{P}) and symbols (\p{S}).
 *
 * @param string $text
 * @return bool
 */
private function isOnlyPunctuationOrSpace($text)
{
    $matched = preg_match('/^[\s\p{P}\p{S}]+$/u', $text);
    return 1 === $matched;
}
/**
 * Widen a too-short context by one sentence on each side, limited to a span of
 * 2000 bytes.
 *
 * When the widened span exceeds the limit, it is replaced by a 2000-byte window
 * centred on the middle of the citation tag and clamped to the content.
 *
 * @param string $content
 * @param int    $tagStart byte offset where the citation tag starts
 * @param int    $tagEnd   byte offset just past the citation tag
 * @param int    $start    current context start (byte offset)
 * @param int    $end      current context end (byte offset)
 * @return array [int $start, int $end]
 */
private function widenCitationContextBounds($content, $tagStart, $tagEnd, $start, $end)
{
    $total = strlen($content);
    $budget = 2000;

    if ($start > 0) {
        // Start of the sentence that precedes the current start, if it is earlier.
        $start = min($start, $this->findSentenceStart($content, max(0, $start - 1)));
    }
    $candidateEnd = $this->findSentenceEnd($content, $end, $tagEnd);
    if ($candidateEnd > $end && $candidateEnd <= $total) {
        $end = $candidateEnd;
    }

    if ($end - $start <= $budget) {
        return [$start, $end];
    }
    $centre = (int)floor(($tagStart + $tagEnd) / 2);
    $start = max(0, $centre - (int)floor($budget / 2));
    $end = min($total, $start + $budget);
    return [$start, $end];
}
/**
 * Decide whether the delimiter found at byte offset $pos really ends a sentence.
 *
 * Only '.' can be rejected; any other delimiter, or an out-of-range $pos, is
 * accepted. A period is NOT a sentence boundary when it is
 *  - a decimal point (ASCII digit on both sides),
 *  - the period that follows one of the abbreviations
 *    et al / e.g / i.e / vs / etc / fig / no,
 *  - the inner period of "e.g." or "i.e.",
 *  - followed (after optional whitespace) by a "<blue>[" citation tag.
 *
 * @param string $content   text being scanned
 * @param int    $pos       byte offset of the delimiter in $content
 * @param string $delimiter the delimiter found at $pos
 * @return bool true when $pos may be used as a sentence boundary
 */
private function isSentenceDelimiterAt($content, $pos, $delimiter)
{
    $len = strlen($content);
    if ($delimiter !== '.' || $pos < 0 || $pos >= $len) {
        return true;
    }
    $isDecimalPoint = $pos > 0 && $pos + 1 < $len
        && ctype_digit($content[$pos - 1])
        && ctype_digit($content[$pos + 1]);
    if ($isDecimalPoint) {
        return false;
    }
    // Up to 12 bytes in front of the period and up to 24 bytes behind it.
    $before = (string)substr($content, max(0, $pos - 12), min(12, $pos));
    $after = (string)substr($content, $pos + 1, 24);
    if (preg_match('/\b(et\s+al|e\.g|i\.e|vs|etc|fig|no)\s*\.?\s*$/i', $before)) {
        return false;
    }
    // Inner period of "e.g." / "i.e.": a stand-alone "e" or "i" in front and
    // "g." / "e." right behind. Without this check the abbreviation was split in
    // two and the context started or ended in the middle of it.
    if (preg_match('/(?:^|[^A-Za-z0-9_])([ei])$/i', $before, $lead)) {
        $expectedTail = strtolower($lead[1]) === 'e' ? 'g.' : 'e.';
        if (strtolower((string)substr($after, 0, 2)) === $expectedTail) {
            return false;
        }
    }
    // A citation directly after the period still belongs to the sentence before it.
    if (preg_match('/^\s*<blue>\s*\[/', $after)) {
        return false;
    }
    return true;
}
/**
 * Byte offset where the paragraph containing the tag starts.
 *
 * The latest of these boundaries in front of the tag is used: an opening <p...>
 * tag, a closing </p> (plus trailing whitespace), a <br> (plus trailing
 * whitespace), a blank line, or a single line break. Using the paragraph instead
 * of the last period keeps multi-sentence English paragraphs from shrinking to
 * their final sentence.
 *
 * @param string $content
 * @param int    $tagStart byte offset where the citation tag starts
 * @return int 0 when no boundary is found
 */
private function findParagraphStart($content, $tagStart)
{
    $head = substr($content, 0, max(0, $tagStart));
    if ($head === '') {
        return 0;
    }
    $boundary = 0;
    $markupPatterns = ['/<p[^>]*>/i', '/<\/p>\s*/i', '/<br\s*\/?>\s*/i'];
    foreach ($markupPatterns as $pattern) {
        if (preg_match_all($pattern, $head, $found, PREG_OFFSET_CAPTURE)) {
            // Only the last occurrence matters; the boundary lies just behind it.
            $last = end($found[0]);
            $boundary = max($boundary, $last[1] + strlen($last[0]));
        }
    }
    foreach (["\n\n", "\n"] as $lineBreak) {
        $at = strrpos($head, $lineBreak);
        if ($at !== false) {
            $boundary = max($boundary, $at + strlen($lineBreak));
        }
    }
    return $boundary;
}
/**
 * Limit how far in front of the tag the context may start, so that one very long
 * paragraph does not blow up a single LLM request.
 *
 * When the paragraph start lies within $maxBytes of the tag it is returned
 * unchanged. Otherwise the window is cut $maxBytes in front of the tag and moved
 * forward to just behind the first sentence end (punctuation + whitespace) inside
 * that window, if there is one.
 *
 * @param string $content
 * @param int    $tagStart       byte offset where the citation tag starts
 * @param int    $paragraphStart byte offset where the paragraph starts
 * @param int    $maxBytes       maximum distance in bytes between start and tag
 * @return int byte offset where the context should start
 */
private function capContextStartBeforeTag($content, $tagStart, $paragraphStart, $maxBytes = 2500)
{
    if ($tagStart - $paragraphStart <= $maxBytes) {
        return $paragraphStart;
    }
    $len = strlen($content);
    $start = $tagStart - $maxBytes;
    // The cut is a raw byte offset and may land inside a multi-byte character.
    // Move it to the next character boundary: a slice starting with continuation
    // bytes is invalid UTF-8 and makes the /u regex below fail, which skipped the
    // sentence alignment for most non-ASCII text.
    while ($start < $tagStart && $start < $len && (ord($content[$start]) & 0xC0) === 0x80) {
        $start++;
    }
    $slice = substr($content, $start, $tagStart - $start);
    if (preg_match('/[.!?。!?]\s+/u', $slice, $m, PREG_OFFSET_CAPTURE)) {
        // Offsets reported by PREG_OFFSET_CAPTURE are byte offsets into $slice.
        return $start + $m[0][1] + strlen($m[0][0]);
    }
    return max($paragraphStart, $start);
}
/**
 * Byte offset where the sentence containing $position starts: just behind the
 * closest real sentence delimiter ('.', '。', '!', '?' or a line break) in front
 * of $position, or 0 when there is none.
 *
 * For each delimiter the search walks backwards until it finds an occurrence
 * that isSentenceDelimiterAt() accepts. Previously only the last occurrence was
 * tested, so a rejected period (decimal point, "et al.", ...) ended the search and
 * the sentence start fell back to 0.
 *
 * @param string $content
 * @param int    $position byte offset inside the sentence
 * @return int
 */
private function findSentenceStart($content, $position)
{
    $head = (string)substr($content, 0, $position);
    $start = 0;
    foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
        $scope = $head;
        while ($scope !== '') {
            $pos = strrpos($scope, $delimiter);
            if ($pos === false) {
                break;
            }
            if ($this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
                $start = max($start, $this->utf8CharEnd($content, $pos));
                break;
            }
            // Rejected: skip the whole whitespace-delimited token that contains it,
            // so other periods of the same token ("e.g.", "3.14") are not mistaken
            // for a boundary either.
            $tokenStart = $pos;
            while ($tokenStart > 0 && !ctype_space($content[$tokenStart - 1])) {
                $tokenStart--;
            }
            $scope = (string)substr($head, 0, $tokenStart);
        }
    }
    return $start;
}
/**
 * Byte offset just past the end of the sentence, searching forward from
 * $searchFrom; strlen($content) when no delimiter is found.
 *
 * For each delimiter the first occurrence accepted by isSentenceDelimiterAt() is
 * used. Previously a rejected first occurrence (decimal point, "et al.", ...)
 * removed that delimiter from consideration altogether, which could stretch the
 * sentence to the next line break or to the end of the content.
 *
 * A sentence end that is separated from the citation tag only by punctuation or
 * whitespace (such as the lone period right after "</blue>") is skipped and the
 * search continues behind it.
 *
 * @param string $content
 * @param int    $searchFrom byte offset from which to look for the sentence end
 * @param int    $tagEnd     byte offset just past the citation tag; 0 disables the
 *                           lone-punctuation skip
 * @return int
 */
private function findSentenceEnd($content, $searchFrom, $tagEnd = 0)
{
    $length = strlen($content);
    $minPos = max(0, $searchFrom);
    while ($minPos < $length) {
        $endPositions = [];
        foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
            $from = $minPos;
            while ($from < $length) {
                $pos = strpos($content, $delimiter, $from);
                if ($pos === false) {
                    break;
                }
                if ($this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
                    $endPositions[] = $this->utf8CharEnd($content, $pos);
                    break;
                }
                // Rejected occurrence: keep looking behind it.
                $from = $pos + 1;
            }
        }
        if (empty($endPositions)) {
            return $length;
        }
        $end = min($endPositions);
        if ($tagEnd <= 0 || $end <= $tagEnd) {
            return $end;
        }
        // Text between the tag and the candidate end, without tags and markup.
        $gap = substr($content, $tagEnd, $end - $tagEnd);
        $gapText = trim(strip_tags(preg_replace(self::BLUE_TAG_REGEX, '', $gap)));
        if ($gapText !== '' && !$this->isOnlyPunctuationOrSpace($gapText)) {
            return $end;
        }
        // Only punctuation so far: continue behind this delimiter.
        $minPos = $end;
    }
    return $length;
}
/**
 * Push already stored check records onto the queue in ascending reference number
 * (ties broken by am_id, then by position in the text).
 *
 * The n-th job (counting from 0) is pushed with a delay of n seconds.
 *
 * @param array $rows each element has check_id and reference_no, optionally am_id and text_start
 * @return int[] check ids in the order they were pushed
 */
private function pushJobsSortedByReferenceNo(array $rows)
{
    if (empty($rows)) {
        return [];
    }
    $intOrZero = function ($row, $key) {
        return isset($row[$key]) ? intval($row[$key]) : 0;
    };
    usort($rows, function ($left, $right) use ($intOrZero) {
        if ($left['reference_no'] !== $right['reference_no']) {
            return $left['reference_no'] - $right['reference_no'];
        }
        $sectionDiff = $intOrZero($left, 'am_id') - $intOrZero($right, 'am_id');
        if ($sectionDiff !== 0) {
            return $sectionDiff;
        }
        return $intOrZero($left, 'text_start') - $intOrZero($right, 'text_start');
    });

    $checkIds = [];
    // usort() re-indexes from 0, so the position doubles as the delay in seconds.
    foreach ($rows as $delay => $row) {
        $checkId = intval($row['check_id']);
        $checkIds[] = $checkId;
        $this->pushJob($checkId, $delay);
    }
    return $checkIds;
}
/**
 * Push one first-pass job (ReferenceCheck@fire) onto the ReferenceCheck queue,
 * after clearing any leftover queue lock for the record.
 *
 * @param int $checkId      article_reference_check_result id
 * @param int $delaySeconds delay before the job becomes available; 0 pushes immediately
 * @return void
 * @throws \Exception re-thrown after logging when the queue rejects the job
 */
private function pushJob($checkId, $delaySeconds = 0)
{
    $checkId = intval($checkId);
    $this->clearReferenceCheckQueueLock($checkId);
    $handler = 'app\api\job\ReferenceCheck@fire';
    $payload = ['check_id' => $checkId];
    try {
        if ($delaySeconds > 0) {
            Queue::later($delaySeconds, $handler, $payload, self::QUEUE_NAME);
            return;
        }
        Queue::push($handler, $payload, self::QUEUE_NAME);
    } catch (\Exception $e) {
        \think\Log::error('ReferenceCheck pushJob failed check_id=' . $checkId . ' ' . $e->getMessage());
        throw $e;
    }
}
/**
 * Push one second-pass job (ReferenceCheckTwo@fire) onto the ReferenceCheck queue.
 * Unlike pushJob(), the check id is passed through as given and no queue lock is
 * cleared here.
 *
 * @param mixed $checkId      article_reference_check_result id
 * @param int   $delaySeconds delay before the job becomes available; 0 pushes immediately
 * @return void
 * @throws \Exception re-thrown after logging when the queue rejects the job
 */
private function pushJob2($checkId, $delaySeconds = 0)
{
    $handler = 'app\api\job\ReferenceCheckTwo@fire';
    $payload = ['check_id' => $checkId];
    try {
        if ($delaySeconds > 0) {
            Queue::later($delaySeconds, $handler, $payload, self::QUEUE_NAME);
            return;
        }
        Queue::push($handler, $payload, self::QUEUE_NAME);
    } catch (\Exception $e) {
        \think\Log::error('ReferenceCheckTwo pushJob failed check_id=' . $checkId . ' ' . $e->getMessage());
        throw $e;
    }
}
}