= 该值视为"通过" */ const PASS_CONFIDENCE_THRESHOLD = 0.65; /** * 正文引用标签两种排版(带 /u): * 1) [8, 9][13-15] —— 方括号在 blue 内 * 2) [13-15] —— 方括号包裹 blue * * 捕获组均为序号串(可含逗号、区间连字符及排版变体)。 */ const BLUE_TAG_REGEX = '/\[([\d,,\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)\]<\/blue>/u'; const BLUE_TAG_REGEX_BRACKET_OUTSIDE = '/\[([\d,,\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)<\/blue>\]/u'; /** * 兼容无 ?? 的 PHP 版本 */ private function arrGet($arr, $key, $default = '') { return isset($arr[$key]) ? $arr[$key] : $default; } /** * 合并匹配两种 blue 引用排版,按在正文中的起始位置排序。 * * @return array{0: array, 1: array} 同 preg_match_all 的完整匹配与捕获组 1 */ private function collectBlueTagMatches($content) { $merged = []; foreach ([self::BLUE_TAG_REGEX, self::BLUE_TAG_REGEX_BRACKET_OUTSIDE] as $pattern) { if (!preg_match_all($pattern, $content, $m, PREG_OFFSET_CAPTURE)) { continue; } $count = count($m[0]); for ($i = 0; $i < $count; $i++) { $merged[] = ['full' => $m[0][$i], 'inner' => $m[1][$i]]; } } usort($merged, function ($a, $b) { return $a['full'][1] - $b['full'][1]; }); $matches = [[], []]; foreach ($merged as $item) { $matches[0][] = $item['full']; $matches[1][] = $item['inner']; } return $matches; } /** 对两种 blue 引用排版执行 preg_replace */ private function pregReplaceBlueTags($subject, $replacement) { $subject = preg_replace(self::BLUE_TAG_REGEX, $replacement, $subject); $subject = preg_replace(self::BLUE_TAG_REGEX_BRACKET_OUTSIDE, $replacement, $subject); return $subject; } /** * 单条入队(可手工指定正文与文献文本) */ public function enqueue($contentA, $contentB, array $extra = []) { $contentA = trim($contentA); if ($contentA === '') { throw new \InvalidArgumentException('content_a is required'); } $now = date('Y-m-d H:i:s'); $checkId = Db::name('article_reference_check_result')->insertGetId([ 'article_id' => intval($this->arrGet($extra, 'article_id', 0)), 'am_id' => intval($this->arrGet($extra, 'am_id', 0)), 'p_article_id' => intval($this->arrGet($extra, 'p_article_id', 0)), 'p_refer_id' => intval($this->arrGet($extra, 
'p_refer_id', 0)),
            'refer_index' => intval($this->arrGet($extra, 'refer_index', 0)),
            'reference_no' => intval($this->arrGet($extra, 'reference_no', 0)),
            'reference_raw' => (string)$this->arrGet($extra, 'reference_raw', ''),
            'cite_tag_start' => intval($this->arrGet($extra, 'cite_tag_start', 0)),
            'cite_tag_end' => intval($this->arrGet($extra, 'cite_tag_end', 0)),
            'text_start' => intval($this->arrGet($extra, 'text_start', 0)),
            'text_end' => intval($this->arrGet($extra, 'text_end', 0)),
            'content_a' => $contentA,
            'content_b' => trim($contentB),
            'status' => 0,
            'created_at' => $now,
            'updated_at' => $now,
        ]);

        // A row tied to a section flips that section to "running" right away.
        $amId = intval($this->arrGet($extra, 'am_id', 0));
        if ($amId > 0) {
            $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
        }
        $this->pushJob(intval($checkId), intval($this->arrGet($extra, 'queue_delay', 0)));

        return ['check_id' => $checkId, 'queued' => 1];
    }

    /**
     * Enqueue reference checks for one article_main section.
     *
     * When the caller passes a partial row (no `type`, or a table-type row
     * without a positive `amt_id`), the row is re-read from article_main and
     * the caller's values are layered on top of the stored ones.
     *
     * One check row is inserted and one job is pushed per cited reference
     * number that exists in the refer map; numbers without a map entry are
     * ignored. Jobs are pushed with a delay that grows by 1 per job.
     *
     * Section status handling:
     *  - no citations found         -> AM_STATUS_NONE
     *  - at least one job queued    -> AM_STATUS_RUNNING
     *  - citations but nothing queued -> status left untouched (same as
     *    enqueueByPArticle), so the section cannot get stuck in "running"
     *    with no job to finish it.
     *
     * @param array $main article_main row (at least am_id / article_id)
     * @return void
     * @throws \RuntimeException when no production_article (state = 0) matches
     */
    public function enqueueByArticleMain($main)
    {
        $amId = intval($this->arrGet($main, 'am_id', 0));

        $needsReload = !isset($main['type'])
            || (intval($main['type']) === self::MAIN_TYPE_TABLE
                && intval($this->arrGet($main, 'amt_id', 0)) <= 0);
        if ($amId > 0 && $needsReload) {
            $dbMain = Db::name('article_main')
                ->field('am_id,content,article_id,type,amt_id')
                ->where('am_id', $amId)
                ->whereIn('state', [0, 2])
                ->find();
            if (!empty($dbMain)) {
                // Caller-supplied keys win over the stored row.
                $main = array_merge($dbMain, $main);
            }
        }

        $citations = $this->extractReferencesForArticleMain($main);
        if (empty($citations)) {
            $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
            return;
        }

        // A missing article_id becomes 0, which matches no row and therefore
        // ends in the same RuntimeException as an unknown article.
        $articleId = intval($this->arrGet($main, 'article_id', 0));
        $prod = Db::name('production_article')
            ->where('article_id', $articleId)
            ->where('state', 0)
            ->find();
        if (empty($prod)) {
            throw new \RuntimeException('production_article not found for article_id=' . $articleId);
        }

        $pArticleId = intval($prod['p_article_id']);
        $referMap = $this->loadReferMapByPArticleId($pArticleId);

        $queued = 0;
        $delay = 0;
        foreach ($citations as $cite) {
            // reference_numbers is expected to hold already-expanded numbers
            // (e.g. a [70-73] tag -> 70, 71, 72, 73): one row per number.
            foreach ($cite['reference_numbers'] as $refNo) {
                $referIndex = $refNo - 1; // refer map is keyed by 0-based index
                if ($referIndex < 0 || !isset($referMap[$referIndex])) {
                    continue;
                }
                $refer = $referMap[$referIndex];
                $now = date('Y-m-d H:i:s');

                $checkId = Db::name('article_reference_check_result')->insertGetId([
                    'article_id' => $articleId,
                    'p_article_id' => $pArticleId,
                    'am_id' => $amId,
                    'reference_no' => $refNo,
                    'refer_index' => $refNo,
                    'origin_text' => $cite['original_text'],
                    'refer_text' => $this->formatReferForLlm($refer),
                    'p_refer_id' => $refer['p_refer_id'],
                    'text_start' => $cite['text_start'],
                    'text_end' => $cite['text_end'],
                    'created_at' => $now,
                    'updated_at' => $now,
                ]);

                $this->pushJob(intval($checkId), $delay);
                $delay += 1;
                $queued++;
            }
        }

        if ($queued > 0) {
            $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
        }
    }

    /**
     * Manual trigger: offer completed rows of an article whose confidence is
     * at or below PASS_CONFIDENCE_THRESHOLD to the second-pass review
     * (described as the "DOI second round" by the original author).
     *
     * NOTE(review): rows are picked at random and capped at 2 per call; this
     * looks like a sampling/debug limit - confirm it is intended.
     * NOTE(review): a confidence exactly equal to the threshold counts as
     * "pass" elsewhere (>=) but is still selected here (<=).
     *
     * @param int $articleId
     * @return array{article_id:int, check_ids2:int[], queued:int}
     * @throws \InvalidArgumentException when article_id is not positive
     */
    public function enqueueSecondPassByArticle($articleId)
    {
        $articleId = intval($articleId);
        if ($articleId <= 0) {
            throw new \InvalidArgumentException('article_id is required');
        }

        $rows = Db::name('article_reference_check_result')
            ->where('article_id', $articleId)
            ->where('status', self::RECORD_COMPLETED)
            ->where('confidence', '<=', self::PASS_CONFIDENCE_THRESHOLD)
            ->orderRaw('rand()')
            ->limit(2)
            ->select();

        $checkIds2 = [];
        foreach ($rows as $checkLog) {
            $rowId = $this->resolveCheckRowId($checkLog);
            if ($this->maybeEnqueueSecondPass($rowId, floatval($checkLog['confidence']))) {
                $checkIds2[] = $rowId;
            }
        }

        return [
            'article_id' => $articleId,
            'check_ids2' => $checkIds2,
            'queued' => count($checkIds2),
        ];
    }

    public function
enqueueByPArticle($prod){ if (empty($prod)) { throw new \RuntimeException('production_article not found'); } $pArticleId = intval($prod['p_article_id']); $articleId = intval($prod['article_id']); $referMap = $this->loadReferMapByPArticleId($pArticleId); $mains = Db::name('article_main') ->field('am_id,content,article_id,type,amt_id') ->where('article_id', $articleId) ->whereIn('state', [0, 2]) ->order('sort asc') ->select(); if (empty($mains)) { throw new \RuntimeException('article_main is empty'); } $queued = 0; $skipped = 0; $pendingJobs = []; $amIdsWithJobs = []; $now = date('Y-m-d H:i:s'); foreach ($mains as $main) { $amId = intval($main['am_id']); $citations = $this->extractReferencesForArticleMain($main); if (empty($citations)) { $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE); continue; } foreach ($citations as $cite) { foreach ($cite['reference_numbers'] as $refNo) { $referIndex = $refNo - 1; if ($referIndex < 0 || !isset($referMap[$referIndex])) { $skipped++; continue; } $refer = $referMap[$referIndex]; $referText = $this->formatReferForLlm($refer); // [70-73] 展开为 reference_no=70,71,72,73 共 4 条记录;先入队表,再按文献号正序校对 $checkId = Db::name('article_reference_check_result')->insertGetId([ 'article_id' => $main['article_id'], 'p_article_id' => $pArticleId, 'am_id' => $amId, 'reference_no' => $refNo, 'refer_index' => $refNo, 'origin_text' => $cite['original_text'], 'refer_text' => $referText, 'p_refer_id' => $referMap[$referIndex]['p_refer_id'], 'text_start' => $cite['text_start'], 'text_end' => $cite['text_end'], 'created_at' => $now, 'updated_at' => $now, ]); $pendingJobs[] = [ 'check_id' => intval($checkId), 'reference_no' => intval($refNo), 'am_id' => $amId, 'text_start' => intval($cite['text_start']), ]; $queued++; $amIdsWithJobs[$amId] = true; } } } $checkIds = $this->pushJobsSortedByReferenceNo($pendingJobs); foreach (array_keys($amIdsWithJobs) as $amId) { $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING); } return [ 'article_id' => $articleId, 
'p_article_id' => $pArticleId,
            'queued' => $queued,
            'skipped' => $skipped,
            'check_ids' => $checkIds,
            'queue' => self::QUEUE_NAME,
        ];
    }

    /**
     * Enqueue reference checks for every section of an article, looked up by
     * article_id.
     *
     * Resolves the production_article row (state 0 or 2) and hands it to
     * enqueueByPArticle(), which performs the per-section extraction, row
     * insertion and job push. This method used to carry a verbatim copy of
     * that logic.
     *
     * @param int $articleId
     * @return array same shape as enqueueByPArticle(): article_id,
     *               p_article_id, queued, skipped, check_ids, queue
     * @throws \InvalidArgumentException when article_id is not positive
     * @throws \RuntimeException when the production_article row is missing
     *                           (or whatever enqueueByPArticle() throws)
     */
    public function enqueueByArticle($articleId)
    {
        $articleId = intval($articleId);
        if ($articleId <= 0) {
            throw new \InvalidArgumentException('article_id is required');
        }

        $prod = Db::name('production_article')
            ->where('article_id', $articleId)
            ->whereIn('state', [0, 2])
            ->find();
        if (empty($prod)) {
            throw new \RuntimeException('production_article not found for article_id=' . $articleId);
        }

        return $this->enqueueByPArticle($prod);
    }

    /**
     * Recompute a section's ref_check_status from all of its check rows and
     * store it on article_main.
     *
     * Resolution order:
     *  - no rows                               -> AM_STATUS_NONE
     *  - any row still pending                 -> AM_STATUS_RUNNING
     *  - any failed row, or a completed row
     *    with is_match = 0                     -> AM_STATUS_FAIL
     *  - every row completed                   -> AM_STATUS_PASS
     *  - anything else (unknown row status)    -> AM_STATUS_FAIL
     *
     * @param int $amId
     * @return int the AM_STATUS_* value that was resolved
     */
    public function syncAmRefCheckStatus($amId)
    {
        $amId = intval($amId);
        if ($amId <= 0) {
            return self::AM_STATUS_NONE;
        }

        $rows = Db::name('article_reference_check_result')->where('am_id', $amId)->select();
        if (empty($rows)) {
            $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
            return self::AM_STATUS_NONE;
        }

        $pending = 0;
        $done = 0;
        $hasFail = false;
        foreach ($rows as $row) {
            $st = intval($row['status']);
            if ($st === self::RECORD_PENDING) {
                $pending++;
                continue;
            }
            if ($st === self::RECORD_COMPLETED) {
                $done++;
                if (intval($row['is_match']) === 0) {
                    $hasFail = true;
                }
            } elseif ($st === self::RECORD_FAILED) {
                $hasFail = true;
            }
        }

        if ($pending > 0) {
            $status = self::AM_STATUS_RUNNING;
        } elseif (!$hasFail && $done === count($rows)) {
            $status = self::AM_STATUS_PASS;
        } else {
            $status = self::AM_STATUS_FAIL;
        }

        $this->setAmRefCheckStatus($amId, $status);

        return $status;
    }

    /**
     * Whether the article_main table already has the ref_check_status column.
     * Writes are skipped when it does not (schema not migrated yet) to avoid
     * "fields not exists" errors. The answer is cached in a static property.
     */
    private function hasAmRefCheckStatusColumn()
    {
        if (self::$amRefCheckStatusColumnExists !== null) {
            return self::$amRefCheckStatusColumnExists;
        }
        try {
            $table = Db::name('article_main')->getTable();
            // Backticks in the table name are doubled before it is embedded.
            $rows = Db::query('SHOW COLUMNS FROM `' . str_replace('`', '``', $table) .
'` LIKE \'ref_check_status\''); self::$amRefCheckStatusColumnExists = !empty($rows); } catch (\Exception $e) { self::$amRefCheckStatusColumnExists = false; } return self::$amRefCheckStatusColumnExists; } public function setAmRefCheckStatus($amId, $status) { if ($amId <= 0 || !$this->hasAmRefCheckStatusColumn()) { return; } Db::name('article_main')->where('am_id', $amId)->update([ 'ref_check_status' => $status, ]); } /** * 按 p_article_id 清空整篇文章的引用校对明细 + 重置节级 ref_check_status。 * * 用于新增/删除文献后,旧的 reference_no 全部错位、原校对结果失效的场景: * 物理删除后,整篇状态查询自然回到 ARTICLE_PROGRESS_NONE(未校对)。 * * @return int 被删除的明细条数 */ public function clearArticleChecksByPArticleId($pArticleId) { $pArticleId = intval($pArticleId); if ($pArticleId <= 0) { return 0; } // 先反查 article_id(用于重置 article_main.ref_check_status 节级状态) $articleId = intval(Db::name('production_article') ->where('p_article_id', $pArticleId) ->whereIn('state', [0, 2]) ->value('article_id')); // 先清掉旧记录对应的队列 Redis 锁,避免在途 worker 写回数据 $oldIds = Db::name('article_reference_check_result') ->where('p_article_id', $pArticleId) ->column('id'); foreach ($oldIds as $oldId) { $this->clearReferenceCheckQueueLock(intval($oldId)); } $deleted = Db::name('article_reference_check_result') ->where('p_article_id', $pArticleId) ->delete(); if ($articleId > 0 && $this->hasAmRefCheckStatusColumn()) { Db::name('article_main') ->where('article_id', $articleId) ->whereIn('state', [0, 2]) ->update(['ref_check_status' => self::AM_STATUS_NONE]); } return intval($deleted); } public function clearArticleChecks($articleId) { $articleId = intval($articleId); if ($articleId <= 0) { return 0; } // 先清掉旧记录对应的队列 Redis 锁,否则同 check_id 在 TTL 内不会再次执行 $oldIds = Db::name('article_reference_check_result') ->where('article_id', $articleId) ->column('id'); foreach ($oldIds as $oldId) { $this->clearReferenceCheckQueueLock(intval($oldId)); } $deleted = Db::name('article_reference_check_result')->where('article_id', $articleId)->delete(); if ($this->hasAmRefCheckStatusColumn()) { 
Db::name('article_main') ->where('article_id', $articleId) ->whereIn('state', [0, 2]) ->update(['ref_check_status' => self::AM_STATUS_NONE]); } return intval($deleted); } /** * 文献列表局部挪动后,仅刷新指定 p_refer_id 对应的校对明细 reference_no / refer_index。 * * 读 production_article_refer 的最新 index 来算新序号(index + 1),避免外部传入过期值。 * 仅更新受影响的两条左右记录,降低与并发挪动互相覆盖的风险。 * * @param int[] $pReferIds 受影响的 p_refer_id(一般为 2 个:被挪条目 + 其相邻条目) * @param int $pArticleId 可选:附加 p_article_id 限定,进一步缩小行锁范围 * @return array{p_refer_ids:int[], affected_rows:int, changes:array} */ public function syncReferenceNoByPReferIds(array $pReferIds, $pArticleId = 0) { $pReferIds = array_values(array_unique(array_filter(array_map('intval', $pReferIds)))); $pArticleId = intval($pArticleId); if (empty($pReferIds)) { return [ 'p_refer_ids' => [], 'affected_rows' => 0, 'changes' => [], ]; } $referQuery = Db::name('production_article_refer') ->field('p_refer_id,p_article_id,index') ->whereIn('p_refer_id', $pReferIds) ->where('state', 0); if ($pArticleId > 0) { $referQuery->where('p_article_id', $pArticleId); } $refers = $referQuery->select(); if (empty($refers)) { return [ 'p_refer_ids' => $pReferIds, 'affected_rows' => 0, 'changes' => [], ]; } $now = date('Y-m-d H:i:s'); $affected = 0; $changes = []; foreach ($refers as $refer) { $pReferId = intval($refer['p_refer_id']); $newNo = intval($refer['index']) + 1; $updateQuery = Db::name('article_reference_check_result') ->where('p_refer_id', $pReferId) ->where('reference_no', '<>', $newNo); if ($pArticleId > 0) { $updateQuery->where('p_article_id', $pArticleId); } $rows = $updateQuery->update([ 'reference_no' => $newNo, 'refer_index' => $newNo, 'updated_at' => $now, ]); if ($rows > 0) { $affected += intval($rows); $changes[] = [ 'p_refer_id' => $pReferId, 'new_ref_no' => $newNo, 'affected_rows' => intval($rows), ]; } } return [ 'p_refer_ids' => $pReferIds, 'affected_rows' => $affected, 'changes' => $changes, ]; } /** * 重置整篇稿件的引用校对:删除旧明细 + 清理队列锁 + 全文重新入队校对 * * @return array */ /** * 按 
p_article_id 查整篇文章的引用校对总状态。 * * 统计维度是"参考文献"(按 reference_no 分组),不是单条校对明细行。 * 例如 50 条参考文献、底层明细 111 条时,total 返回 50。 * * 返回 status 数值含义(整篇): * 0 = ARTICLE_PROGRESS_NONE 一条校对记录都没有 * 1 = ARTICLE_PROGRESS_RUNNING 至少 1 条参考文献仍有未跑完的明细 * 2 = ARTICLE_PROGRESS_COMPLETED 所有参考文献的全部明细都已结束 * * 每条参考文献按其明细 status 分布落桶(互斥): * pending —— 组内任一明细 status=0(含部分跑完的"校对中"也归此桶) * done —— 组内全部明细 status=2(完成) * failed —— 组内全部明细已结束、至少 1 条 status=3(失败) * * pending + done + failed = total;progress_percent = (done + failed) / total。 * 分组明细请走 getProgressByPArticleId(控制器 referenceCheckProgressAI)。 * * @return array{p_article_id:int, status:int, total:int, pending:int, done:int, failed:int, progress_percent:float} */ public function getArticleProgressStatusByPArticleId($pArticleId) { $pArticleId = intval($pArticleId); if ($pArticleId <= 0) { throw new \InvalidArgumentException('p_article_id is required'); } // 一条 SQL 按 reference_no 聚合,组内 status 分布一并算出来; // 50 条参考文献 → 返回 50 行,PHP 走一次循环分桶即可 $rows = Db::name('article_reference_check_result') ->field('reference_no' . ', SUM(CASE WHEN status = ' . self::RECORD_PENDING . ' THEN 1 ELSE 0 END) AS pending_cnt' . ', SUM(CASE WHEN status = ' . self::RECORD_FAILED . ' THEN 1 ELSE 0 END) AS failed_cnt') ->where('p_article_id', $pArticleId) ->group('reference_no') ->select(); if (empty($rows)) { return [ 'p_article_id' => $pArticleId, 'status' => self::ARTICLE_PROGRESS_NONE, 'total' => 0, 'pending' => 0, 'done' => 0, 'failed' => 0, 'progress_percent' => 0, ]; } $pending = 0; $done = 0; $failed = 0; foreach ($rows as $row) { $pendingCnt = intval($this->arrGet($row, 'pending_cnt', 0)); $failedCnt = intval($this->arrGet($row, 'failed_cnt', 0)); if ($pendingCnt > 0) { $pending++; } elseif ($failedCnt > 0) { $failed++; } else { $done++; } } $total = count($rows); $articleStatus = $pending > 0 ? 
self::ARTICLE_PROGRESS_RUNNING : self::ARTICLE_PROGRESS_COMPLETED; $finished = $done + $failed; $progressPercent = round($finished / $total * 100, 1); return [ 'p_article_id' => $pArticleId, 'status' => $articleStatus, 'total' => $total, 'pending' => $pending, 'done' => $done, 'failed' => $failed, 'progress_percent' => $progressPercent, ]; } /** * 多篇文章并行校对时,查询指定文章前面还有几篇在排队。 * * 「正在校对」= 该文至少还有 1 条明细 status=待校验(0)。 * 排队顺序:按各文章最早一条待校验明细的 id 升序(与全局入队先后一致)。 * * @return array{ * p_article_id:int, * running_total:int, * ahead:int, * position:int, * in_queue:bool, * status:int * } */ public function getArticleCheckQueuePositionByPArticleId($pArticleId) { $pArticleId = intval($pArticleId); if ($pArticleId <= 0) { throw new \InvalidArgumentException('p_article_id is required'); } $rows = Db::name('article_reference_check_result') ->field('p_article_id, MIN(id) AS queue_anchor') ->where('status', self::RECORD_PENDING) ->group('p_article_id') ->order('queue_anchor', 'asc') ->select(); $runningIds = []; foreach ($rows as $row) { $aid = intval($this->arrGet($row, 'p_article_id', 0)); if ($aid > 0) { $runningIds[] = $aid; } } $runningTotal = count($runningIds); $ahead = 0; $position = 0; $inQueue = false; foreach ($runningIds as $idx => $aid) { if ($aid === $pArticleId) { $ahead = $idx; $position = $idx + 1; $inQueue = true; break; } } $articleStatus = $this->getArticleProgressStatusByPArticleId($pArticleId); return [ 'p_article_id' => $pArticleId, 'running_total' => $runningTotal, 'ahead' => $inQueue ? $ahead : 0, 'position' => $inQueue ? 
$position : 0, 'in_queue' => $inQueue, 'status' => intval($this->arrGet($articleStatus, 'status', self::ARTICLE_PROGRESS_NONE)), ]; } /** * 按 p_article_id 查整篇引用校对进度,按 reference_no 分组聚合状态,并展开每条明细。 * * 状态映射统一遵循"生命周期顺序"(PROGRESS_* / RECORD_* 取值一致): * 0 = 待校验 1 = 校对中(仅分组层) 2 = 校对完成 3 = 校对失败 * * 分组(reference_no)状态返回字段 progress_status: * - 0 = PROGRESS_PENDING 分组内全部明细 status=0 * - 1 = PROGRESS_CHECKING 分组内部分明细已结束、部分仍为 0(明细不会出现此值) * - 2 = PROGRESS_COMPLETED 分组内全部明细 status=2 * - 3 = PROGRESS_FAILED 分组内全部明细已结束、且至少 1 条 status=3 * * records[i] 字段: * - status 0=待校验 2=完成 3=失败(与分组同一套数值含义,不会出现 1) * - confidence LLM 评分 * - is_pass confidence >= PASS_CONFIDENCE_THRESHOLD 视为通过 * * @return array{p_article_id:int, total_groups:int, summary:array, list:array} */ public function getProgressByPArticleId($pArticleId) { $pArticleId = intval($pArticleId); if ($pArticleId <= 0) { throw new \InvalidArgumentException('p_article_id is required'); } $rows = Db::name('article_reference_check_result') ->field('id,p_refer_id,reference_no,am_id,status,confidence,is_match,reason,text_start,text_end,updated_at') ->where('p_article_id', $pArticleId) ->order('reference_no asc, id asc') ->select(); // summary 用字符串键,避免数值下标看不出含义;同时保留数值键和 PROGRESS_* 常量对照 $summary = [ 'pending' => 0, // PROGRESS_PENDING = 0 'checking' => 0, // PROGRESS_CHECKING = 1 'completed' => 0, // PROGRESS_COMPLETED = 2 'failed' => 0, // PROGRESS_FAILED = 3 ]; if (empty($rows)) { return [ 'p_article_id' => $pArticleId, 'total_groups' => 0, 'summary' => $summary, 'list' => [], ]; } $groups = []; foreach ($rows as $row) { $refNo = intval($this->arrGet($row, 'reference_no', 0)); $pReferId = intval($this->arrGet($row, 'p_refer_id', 0)); if (!isset($groups[$refNo])) { $groups[$refNo] = [ 'reference_no' => $refNo, 'p_refer_id' => $pReferId, 'total' => 0, 'pending' => 0, 'done' => 0, 'failed' => 0, 'pass' => 0, 'last_updated_at' => '', 'records' => [], ]; } // 同一 reference_no 理论上只对应一个 p_refer_id;如果出现混淆,保留首次出现的非空 id if 
($groups[$refNo]['p_refer_id'] <= 0 && $pReferId > 0) { $groups[$refNo]['p_refer_id'] = $pReferId; } $groups[$refNo]['total']++; $st = intval($this->arrGet($row, 'status', 0)); // record 仅存 {0=待校验, 2=完成, 3=失败};不会出现 1(校对中) if ($st === self::RECORD_PENDING) { $groups[$refNo]['pending']++; } elseif ($st === self::RECORD_COMPLETED) { $groups[$refNo]['done']++; } elseif ($st === self::RECORD_FAILED) { $groups[$refNo]['failed']++; } $upd = (string)$this->arrGet($row, 'updated_at', ''); if ($upd > $groups[$refNo]['last_updated_at']) { $groups[$refNo]['last_updated_at'] = $upd; } $confidence = floatval($this->arrGet($row, 'confidence', 0)); $isPass = $confidence >= self::PASS_CONFIDENCE_THRESHOLD; if ($isPass) { $groups[$refNo]['pass']++; } $groups[$refNo]['records'][] = [ 'check_id' => intval($this->arrGet($row, 'id', 0)), 'am_id' => intval($this->arrGet($row, 'am_id', 0)), 'status' => $st, 'confidence' => $confidence, 'is_pass' => $isPass, 'is_match' => intval($this->arrGet($row, 'is_match', 0)), 'reason' => (string)$this->arrGet($row, 'reason', ''), 'text_start' => intval($this->arrGet($row, 'text_start', 0)), 'text_end' => intval($this->arrGet($row, 'text_end', 0)), 'last_updated_at' => $upd, ]; } $list = []; foreach ($groups as $g) { $total = $g['total']; $pending = $g['pending']; $failed = $g['failed']; $pass = $g['pass']; if ($pending === $total) { $progressStatus = self::PROGRESS_PENDING; } elseif ($pending === 0) { $progressStatus = $failed > 0 ? 
self::PROGRESS_FAILED : self::PROGRESS_COMPLETED; } else { $progressStatus = self::PROGRESS_CHECKING; } // 整体通过校验:分组已全部完成(无 pending、无 failed),且每条 confidence >= 0.65 $g['is_pass'] = ( $progressStatus === self::PROGRESS_COMPLETED && $total > 0 && $pass === $total ); switch ($progressStatus) { case self::PROGRESS_PENDING: $summary['pending']++; break; case self::PROGRESS_CHECKING: $summary['checking']++; break; case self::PROGRESS_COMPLETED: $summary['completed']++; break; case self::PROGRESS_FAILED: $summary['failed']++; break; } $g['progress_status'] = $progressStatus; $list[] = $g; } usort($list, function ($a, $b) { return $a['reference_no'] - $b['reference_no']; }); return [ 'p_article_id' => $pArticleId, 'total_groups' => count($list), 'summary' => $summary, 'list' => $list, ]; } /** * 按 p_refer_id 查这条参考文献的校对明细与分组进度。 * * 分组进度(与 referenceCheckProgressAI 单条 list 项口径一致): * progress_status 0待校验 1校对中 2完成 3失败 * pending/done/failed/pass、is_pass、progress_percent * * list 每项:check_id、am_id、status、confidence、reason、is_match、is_pass * * @param int $pReferId production_article_refer.p_refer_id * @return array */ public function getCheckDetailsByPReferId($pReferId) { $pReferId = intval($pReferId); if ($pReferId <= 0) { throw new \InvalidArgumentException('p_refer_id is required'); } $rows = Db::name('article_reference_check_result') ->field('id,p_article_id,reference_no,am_id,status,confidence,is_match,reason,updated_at') ->where('p_refer_id', $pReferId) ->order('id asc') ->select(); $list = []; $pArticleId = 0; $referenceNo = 0; $pending = 0; $done = 0; $failed = 0; $pass = 0; $lastUpdatedAt = ''; foreach ($rows as $row) { if ($pArticleId <= 0) { $pArticleId = intval($this->arrGet($row, 'p_article_id', 0)); } if ($referenceNo <= 0) { $referenceNo = intval($this->arrGet($row, 'reference_no', 0)); } $st = intval($this->arrGet($row, 'status', 0)); if ($st === self::RECORD_PENDING) { $pending++; } elseif ($st === self::RECORD_COMPLETED) { $done++; } elseif ($st === 
self::RECORD_FAILED) { $failed++; } $upd = (string)$this->arrGet($row, 'updated_at', ''); if ($upd > $lastUpdatedAt) { $lastUpdatedAt = $upd; } $confidence = floatval($this->arrGet($row, 'confidence', 0)); $isPass = $confidence >= self::PASS_CONFIDENCE_THRESHOLD; if ($isPass) { $pass++; } $list[] = [ 'check_id' => intval($this->arrGet($row, 'id', 0)), 'am_id' => intval($this->arrGet($row, 'am_id', 0)), 'status' => $st, 'confidence' => $confidence, 'reason' => (string)$this->arrGet($row, 'reason', ''), 'is_match' => intval($this->arrGet($row, 'is_match', 0)), 'is_pass' => $isPass, ]; } if ($referenceNo <= 0) { $refer = Db::name('production_article_refer') ->where('p_refer_id', $pReferId) ->where('state', 0) ->find(); if (!empty($refer)) { if ($pArticleId <= 0) { $pArticleId = intval($this->arrGet($refer, 'p_article_id', 0)); } $referenceNo = intval($this->arrGet($refer, 'index', 0)) + 1; } } $total = count($list); if ($total === 0) { $progressStatus = self::PROGRESS_PENDING; $progressPercent = 0; $isPassGroup = false; } elseif ($pending === $total) { $progressStatus = self::PROGRESS_PENDING; $progressPercent = 0; $isPassGroup = false; } elseif ($pending === 0) { $progressStatus = $failed > 0 ? 
self::PROGRESS_FAILED : self::PROGRESS_COMPLETED; $progressPercent = 100; $isPassGroup = ( $progressStatus === self::PROGRESS_COMPLETED && $pass === $total ); } else { $progressStatus = self::PROGRESS_CHECKING; $finished = $done + $failed; $progressPercent = round($finished / $total * 100, 1); $isPassGroup = false; } return [ 'p_refer_id' => $pReferId, 'p_article_id' => $pArticleId, 'reference_no' => $referenceNo, 'total' => $total, 'pending' => $pending, 'done' => $done, 'failed' => $failed, 'pass' => $pass, 'progress_status' => $progressStatus, 'progress_percent' => $progressPercent, 'is_pass' => $isPassGroup, 'last_updated_at' => $lastUpdatedAt, 'list' => $list, ]; } public function resetAndRecheckByArticle($aProductionArticle) { if (empty($aProductionArticle) || !is_array($aProductionArticle)) { throw new \InvalidArgumentException('production_article is required'); } $pArticleId = intval($this->arrGet($aProductionArticle, 'p_article_id', 0)); $articleId = intval($this->arrGet($aProductionArticle, 'article_id', 0)); if ($pArticleId <= 0 || $articleId <= 0) { throw new \InvalidArgumentException('production_article requires both p_article_id and article_id'); } $existing = Db::name('article_reference_check_result') ->where('p_article_id', $pArticleId) ->count(); if (intval($existing) <= 0) { throw new \RuntimeException('no existing reference check records for p_article_id=' . $pArticleId); } $cleared = $this->clearArticleChecks($articleId); $enqueueResult = $this->enqueueByArticle($articleId); if (!is_array($enqueueResult)) { $enqueueResult = []; } $enqueueResult['cleared'] = $cleared; $enqueueResult['reset'] = 1; return $enqueueResult; } public static function amStatusLabel($status) { $map = [ self::AM_STATUS_NONE => 'none', self::AM_STATUS_PASS => 'pass', self::AM_STATUS_FAIL => 'fail', self::AM_STATUS_RUNNING => 'running', ]; return isset($map[$status]) ? 
$map[$status] : 'unknown'; } /** * 表主键为 id(对外 API 参数名仍叫 check_id) */ public function resolveCheckRowId($row) { if (!is_array($row)) { return 0; } if (isset($row['id']) && intval($row['id']) > 0) { return intval($row['id']); } if (isset($row['check_id']) && intval($row['check_id']) > 0) { return intval($row['check_id']); } return 0; } /** * 解析 LLM 返回的 is_match(兼容 bool / 0|1 / "true"|"false" 字符串) */ public function parseLlmIsMatch($value) { if (is_bool($value)) { return $value; } if (is_int($value) || is_float($value)) { return intval($value) === 1; } $s = strtolower(trim((string)$value)); return in_array($s, ['1', 'true', 'yes', 'match', 'matched'], true); } /** * 写入单条校对结果(统一截断 reason/error_msg,避免 varchar(512) 导致 UPDATE 失败) * * @throws \RuntimeException */ public function updateCheckResult($checkId, array $fields) { $checkId = intval($checkId); if ($checkId <= 0) { throw new \InvalidArgumentException('invalid check id'); } if (isset($fields['reason'])) { $fields['reason'] = mb_substr(trim((string)$fields['reason']), 0, 512); } if (isset($fields['error_msg'])) { $fields['error_msg'] = mb_substr(trim((string)$fields['error_msg']), 0, 512); } $fields['updated_at'] = date('Y-m-d H:i:s'); $exists = Db::name('article_reference_check_result')->where('id', $checkId)->find(); if (empty($exists)) { throw new \RuntimeException('article_reference_check_result not found, id=' . $checkId); } $affected = Db::name('article_reference_check_result')->where('id', $checkId)->update($fields); if ($affected === false) { throw new \RuntimeException('article_reference_check_result update failed, id=' . $checkId); } \think\Log::info('updateCheckResult id=' . $checkId . ' affected=' . 
intval($affected));

        return intval($affected);
    }

    /**
     * Fetch one check row by primary key.
     *
     * @param int $checkId
     * @return array|null the row, or null when the id is not positive / not found
     */
    public function getResult($checkId)
    {
        if ($checkId <= 0) {
            return null;
        }
        $row = Db::name('article_reference_check_result')->where('id', $checkId)->find();

        return $row ?: null;
    }

    /**
     * List the check rows of an article.
     *
     * @param int  $articleId
     * @param int  $status       filter on this status when >= 0; -1 means no filter
     * @param bool $onlyMismatch additionally keep only completed rows with is_match = 0
     * @return array rows ordered by am_id, cite_tag_start, reference_no
     */
    public function listByArticle($articleId, $status = -1, $onlyMismatch = false)
    {
        $q = Db::name('article_reference_check_result')->where('article_id', $articleId);
        if ($status >= 0) {
            $q->where('status', $status);
        }
        if ($onlyMismatch) {
            $q->where('status', self::RECORD_COMPLETED)->where('is_match', 0);
        }

        return $q->order('am_id asc, cite_tag_start asc, reference_no asc')->select();
    }

    /**
     * Article preview: mark questionable reference numbers and citing
     * sentences on each section's content.
     *
     * All check rows of the article are loaded once. They feed both the
     * counters in `stats` and indexBadResults(), which itself keeps only
     * completed rows that are not a match. (The previous code pre-filtered
     * the rows with status = 1, a value the record table is documented never
     * to hold, so nothing was ever marked.)
     *
     * `stats` always covers the whole article, even when $amId narrows the
     * sections that are rendered.
     *
     * @param int $articleId
     * @param int $amId optional: render only this section when > 0
     * @return array{article_id: int, article_ref_check_pass: bool|null, sections: array, issues: array, stats: array}
     */
    public function buildArticlePreview($articleId, $amId = 0)
    {
        $fields = 'am_id,content,sort,type,amt_id';
        if ($this->hasAmRefCheckStatusColumn()) {
            $fields .= ',ref_check_status';
        }
        $q = Db::name('article_main')
            ->field($fields)
            ->where('article_id', $articleId)
            ->whereIn('state', [0, 2]);
        if ($amId > 0) {
            $q->where('am_id', $amId);
        }
        $mains = $q->order('sort asc')->select();

        $rows = $this->listByArticle($articleId, -1);
        $badByAm = $this->indexBadResults($rows);

        $stats = ['total' => 0, 'mismatch' => 0, 'match' => 0, 'pending' => 0];
        foreach ($rows as $r) {
            $stats['total']++;
            if (intval($r['status']) === self::RECORD_PENDING) {
                $stats['pending']++;
            } elseif (intval($r['is_match']) === 1) {
                $stats['match']++;
            } else {
                // Includes failed rows as well as completed non-matches.
                $stats['mismatch']++;
            }
        }

        $sections = [];
        $issues = [];
        foreach ($mains as $main) {
            $id = intval($main['am_id']);
            $content = $this->resolveArticleMainCheckContent($main);
            $badIndex = isset($badByAm[$id]) ? $badByAm[$id] : array();
            $marked = $this->markContentForPreview($content, $id, $badIndex);
            $amStatus = intval($this->arrGet($main, 'ref_check_status', 0));

            $sections[] = [
                'am_id' => $id,
                'ref_check_status' => $amStatus,
                'ref_check_pass' => $amStatus === self::AM_STATUS_PASS,
                'ref_check_label' => self::amStatusLabel($amStatus),
                'content' => $content,
                'content_marked' => $marked['html'],
                'issue_count' => $marked['issue_count'],
            ];
            foreach ($marked['issues'] as $issue) {
                $issues[] = $issue;
            }
        }

        return [
            'article_id' => $articleId,
            'article_ref_check_pass' => $this->resolveArticlePass($sections),
            'sections' => $sections,
            'issues' => $issues,
            'stats' => $stats,
        ];
    }

    /**
     * Whole-article verdict derived from the section statuses.
     *
     * Sections with AM_STATUS_NONE are ignored. Returns false as soon as a
     * remaining section is not AM_STATUS_PASS, true when every remaining
     * section passed, and null when no section had a status at all.
     *
     * @param array $sections entries carrying `ref_check_status`
     * @return bool|null
     */
    private function resolveArticlePass($sections)
    {
        $hasChecked = false;
        foreach ($sections as $sec) {
            $st = intval($this->arrGet($sec, 'ref_check_status', 0));
            if ($st === self::AM_STATUS_NONE) {
                continue;
            }
            $hasChecked = true;
            if ($st !== self::AM_STATUS_PASS) {
                return false;
            }
        }

        return $hasChecked ? true : null;
    }

    /**
     * Index the "bad" check rows per section.
     *
     * Rows that are not completed, or that have is_match = 1, are skipped
     * here, so callers may pass unfiltered rows.
     *
     * @param array $rows check result rows
     * @return array am_id => indexed bad map
     */
    private function indexBadResults($rows)
    {
        $byAm = [];
        foreach ($rows as $row) {
            if (intval($row['status']) !== self::RECORD_COMPLETED || intval($row['is_match']) === 1) {
                continue;
            }
            $amId = intval($row['am_id']);
            $refNo = intval($row['reference_no']);
            if ($amId <= 0 || $refNo <= 0) {
                continue;
            }
            if (!isset($byAm[$amId])) {
                $byAm[$amId] = ['by_raw' => [], 'contexts' => []];
            }
            $rawKey = $this->normalizeRefRawKey((string)$this->arrGet($row, 'reference_raw', ''));
            if ($rawKey !== '') {
                $byAm[$amId]['by_raw'][$rawKey][$refNo] = $row;
            }
            $ctxKey = intval($row['text_start']) . '_' .
intval($row['text_end']);
            if (!isset($byAm[$amId]['contexts'][$ctxKey])) {
                $byAm[$amId]['contexts'][$ctxKey] = [
                    'text_start' => intval($row['text_start']),
                    'text_end' => intval($row['text_end']),
                    'check_ids' => [],
                    'reasons' => [],
                    'ref_nos' => [],
                ];
            }
            $byAm[$amId]['contexts'][$ctxKey]['check_ids'][] = $this->resolveCheckRowId($row);
            $byAm[$amId]['contexts'][$ctxKey]['ref_nos'][] = $refNo;
            $reason = trim((string)$this->arrGet($row, 'reason', ''));
            if ($reason !== '') {
                $byAm[$amId]['contexts'][$ctxKey]['reasons'][$refNo] = $reason;
            }
        }
        return $byAm;
    }

    /**
     * Turn a raw citation number string (e.g. "8, 9" or "13-15" with typographic
     * dashes) into a canonical lookup key: typographic comma/dash variants become
     * ASCII ',' / '-', ASCII spaces are removed, and the result is lower-cased.
     *
     * The first search entry used to be an ASCII comma mapped onto itself (a no-op);
     * it is now the fullwidth comma U+FF0C. The look-alike characters are written
     * as UTF-8 byte escapes so they cannot be confused or silently normalised in
     * the source file.
     *
     * @param string $raw Raw number list taken from a citation tag or a stored reference_raw
     * @return string Normalised key (may be empty)
     */
    private function normalizeRefRawKey($raw)
    {
        $raw = str_replace(
            [
                "\xEF\xBC\x8C", // U+FF0C fullwidth comma
                "\xE2\x80\x93", // U+2013 en dash
                "\xE2\x80\x94", // U+2014 em dash
                "\xE2\x88\x92", // U+2212 minus sign
                "\xE2\x80\x90", // U+2010 hyphen
                "\xE2\x80\x91", // U+2011 non-breaking hyphen
                ' ',
            ],
            [',', '-', '-', '-', '-', '-', ''],
            trim($raw)
        );
        return strtolower($raw);
    }

    /**
     * @param array $badIndex Per-section structure produced by indexBadResults
     */
    private function markContentForPreview($content, $amId, $badIndex)
    {
        $badByRaw = isset($badIndex['by_raw']) ? $badIndex['by_raw'] : array();
        $contexts = isset($badIndex['contexts']) ? $badIndex['contexts'] : array();
        $issues = array();
        $issueCount = 0;
        if ($content === '' || (empty($badByRaw) && empty($contexts))) {
            return array('html' => $content, 'issues' => array(), 'issue_count' => 0);
        }
        $html = $content;
        // 1) First mark the individual numbers inside each blue tag (working on the
        //    original text; for [70-73] only the unreasonable ones such as 70, 71 are marked)
        $matches = $this->collectBlueTagMatches($html);
        $citeDeltas = [];
        if (!empty($matches[0])) {
            $replacements = [];
            foreach ($matches[0] as $idx => $match) {
                $fullTag = $match[0];
                $tagStart = $match[1];
                $tagEnd = $tagStart + strlen($fullTag);
                $inner = $matches[1][$idx][0];
                $rawKey = $this->normalizeRefRawKey($inner);
                $badNums = isset($badByRaw[$rawKey]) ? $badByRaw[$rawKey] : array();
                $innerMarked = preg_replace_callback(
                    '/\d+/',
                    function ($numMatch) use ($badNums, &$issues, &$issueCount, $amId, $inner) {
                        $num = intval($numMatch[0]);
                        if (!isset($badNums[$num])) {
                            return $numMatch[0];
                        }
                        $row = $badNums[$num];
                        $rowReason = isset($row['reason']) ?
$row['reason'] : ''; $issueCount++; $issues[] = array( 'am_id' => $amId, 'check_id' => $this->resolveCheckRowId($row), 'reference_no' => $num, 'reference_raw' => $inner, 'reason' => $rowReason, 'confidence' => floatval(isset($row['confidence']) ? $row['confidence'] : 0), ); $title = htmlspecialchars( '引用[' . $num . ']不合理: ' . $rowReason, ENT_QUOTES, 'UTF-8' ); return '' . $numMatch[0] . ''; }, $inner ); $tagClass = !empty($badNums) ? ' ref-cite-error' : ''; $groupIds = !empty($badNums) ? implode(',', array_map(function ($row) { return (int) $this->resolveCheckRowId($row); }, $badNums)) : ''; $newHtml = '[' . $innerMarked . ']'; $replacements[] = [ 'start' => $tagStart, 'end' => $tagEnd, 'html' => $newHtml, 'delta' => strlen($newHtml) - ($tagEnd - $tagStart), ]; } usort($replacements, function ($a, $b) { return $b['start'] - $a['start']; }); foreach ($replacements as $rep) { $html = substr($html, 0, $rep['start']) . $rep['html'] . substr($html, $rep['end']); $citeDeltas[] = ['start' => $rep['start'], 'delta' => $rep['delta']]; } } $shiftByCite = function ($pos) use ($citeDeltas) { $d = 0; foreach ($citeDeltas as $cd) { if ($cd['start'] < $pos) { $d += $cd['delta']; } } return $pos + $d; }; // 2) 再标记引用句(从后往前) if (!empty($contexts)) { $spans = array_values($contexts); usort($spans, function ($a, $b) { return $b['text_start'] - $a['text_start']; }); foreach ($spans as $span) { $start = $span['text_start']; $end = $span['text_end']; if ($start < 0 || $end <= $start) { continue; } $s = $shiftByCite($start); $e = $shiftByCite($end); if ($e > strlen($html)) { $e = strlen($html); } $checkIds = array_values(array_unique($span['check_ids'])); $refNos = array_values(array_unique($span['ref_nos'])); sort($refNos); $reasonParts = []; foreach ($refNos as $rn) { if (!empty($span['reasons'][$rn])) { $reasonParts[] = '[' . $rn . '] ' . $span['reasons'][$rn]; } } $title = htmlspecialchars( '引用句可能不合理: ' . 
implode('; ', $reasonParts), ENT_QUOTES, 'UTF-8' ); $open = ''; $close = ''; $html = substr($html, 0, $s) . $open . substr($html, $s, $e - $s) . $close . substr($html, $e); } } return ['html' => $html, 'issues' => $issues, 'issue_count' => $issueCount]; } /** * @return array refer_index => row */ public function loadReferMapByPArticleId($pArticleId) { $map = []; if ($pArticleId <= 0) { return $map; } $rows = Db::name('production_article_refer') ->where('p_article_id', $pArticleId) ->where('state', 0) ->order('index asc') ->select(); foreach ($rows as $row) { $map[intval($row['index'])] = $row; } return $map; } public function formatReferForLlm($refer) { $parts = []; foreach (['title', 'author', 'joura', 'dateno', 'refer_doi', 'doilink'] as $f) { $v = trim((string)$this->arrGet($refer, $f, '')); if ($v !== '') { $parts[] = ucfirst($f) . ': ' . $v; } } $frag = trim((string)$this->arrGet($refer, 'refer_frag', '')); $content = trim((string)$this->arrGet($refer, 'refer_content', '')); if ($frag !== '') { $parts[] = 'Reference: ' . $frag; } elseif ($content !== '') { $parts[] = 'Reference: ' . 
$content; } return implode("\n", $parts); } /** * 编辑某条文献内容后,按 p_refer_id 异步重新校对该文献对应的全部 check 明细 * * 流程:刷新 refer_text/refer_index → 重置 status/is_match/confidence/reason * → 设节级 ref_check_status=RUNNING → 投递到 ReferenceCheck 队列 * * 与 recheckByRefer 的差异:本方法**不**在请求内同步跑 LLM,仅入队,立即返回。 * 前端可调 getProgressByPArticleId 轮询进度。 * * @param int $pReferId t_production_article_refer.p_refer_id(必填) * @param int $pArticleId 可选:传入跳过 refer 表二次查表 * @return array{p_refer_id:int, p_article_id:int, reference_no:int, reset:int, queued:int, check_ids:int[], queue:string} */ public function enqueueRecheckByPReferId($pReferId, $pArticleId = 0) { $pReferId = intval($pReferId); if ($pReferId <= 0) { throw new \InvalidArgumentException('p_refer_id is required'); } $refer = Db::name('production_article_refer') ->where('p_refer_id', $pReferId) ->where('state', 0) ->find(); if (empty($refer)) { throw new \RuntimeException('production_article_refer not found, p_refer_id=' . $pReferId); } $pArticleId = intval($pArticleId); if ($pArticleId <= 0) { $pArticleId = intval($this->arrGet($refer, 'p_article_id', 0)); } if ($pArticleId <= 0) { throw new \RuntimeException('p_article_id is missing for p_refer_id=' . 
$pReferId); } $referenceNo = intval($this->arrGet($refer, 'index', 0)) + 1; $referText = $this->formatReferForLlm($refer); $now = date('Y-m-d H:i:s'); $rows = Db::name('article_reference_check_result') ->where('p_article_id', $pArticleId) ->where('p_refer_id', $pReferId) ->select(); if (empty($rows)) { return [ 'p_refer_id' => $pReferId, 'p_article_id' => $pArticleId, 'reference_no' => $referenceNo, 'reset' => 0, 'queued' => 0, 'check_ids' => [], 'queue' => self::QUEUE_NAME, ]; } $resetFields = [ 'refer_text' => $referText, 'refer_index' => $referenceNo, 'reference_no' => $referenceNo, 'status' => self::RECORD_PENDING, 'is_match' => 0, 'can_support' => 0, 'confidence' => 0, 'reason' => '', 'error_msg' => '', 'updated_at' => $now, ]; $pendingJobs = []; $amIds = []; foreach ($rows as $row) { $checkId = $this->resolveCheckRowId($row); Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields); $this->clearReferenceCheckQueueLock($checkId); $pendingJobs[] = [ 'check_id' => $checkId, 'reference_no' => $referenceNo, 'am_id' => intval($this->arrGet($row, 'am_id', 0)), 'text_start' => intval($this->arrGet($row, 'text_start', 0)), ]; $amId = intval($this->arrGet($row, 'am_id', 0)); if ($amId > 0) { $amIds[$amId] = true; } } foreach (array_keys($amIds) as $amId) { $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING); } $checkIds = $this->pushJobsSortedByReferenceNo($pendingJobs); return [ 'p_refer_id' => $pReferId, 'p_article_id' => $pArticleId, 'reference_no' => $referenceNo, 'reset' => count($rows), 'queued' => count($checkIds), 'check_ids' => $checkIds, 'queue' => self::QUEUE_NAME, ]; } /** * 某条参考文献下「校对失败」的明细重新校对(仅 status=RECORD_FAILED,异步入队) * * 不刷新 refer_text / reference_no,沿用记录内已有正文与文献快照,只重置结果字段后入队。 * * @param int $pReferId t_production_article_refer.p_refer_id(必填) * @param int $pArticleId 可选,进一步限定文章 * @return array{p_refer_id:int, p_article_id:int, reset:int, queued:int, check_ids:int[], queue:string} */ public function 
enqueueRecheckFailedByPReferId($pReferId, $pArticleId = 0)
    {
        $pReferId = intval($pReferId);
        if ($pReferId <= 0) {
            throw new \InvalidArgumentException('p_refer_id is required');
        }
        $q = Db::name('article_reference_check_result')
            ->where('p_refer_id', $pReferId)
            ->where('status', self::RECORD_FAILED);
        $pArticleId = intval($pArticleId);
        if ($pArticleId > 0) {
            $q->where('p_article_id', $pArticleId);
        }
        $rows = $q->select();
        if (empty($rows)) {
            return [
                'p_refer_id' => $pReferId,
                'p_article_id' => $pArticleId,
                'reset' => 0,
                'queued' => 0,
                'check_ids' => [],
                'queue' => self::QUEUE_NAME,
            ];
        }
        if ($pArticleId <= 0) {
            $pArticleId = intval($this->arrGet($rows[0], 'p_article_id', 0));
        }
        $now = date('Y-m-d H:i:s');
        $resetFields = [
            'status' => self::RECORD_PENDING,
            'is_match' => 0,
            'can_support' => 0,
            'confidence' => 0,
            'reason' => '',
            'error_msg' => '',
            'updated_at' => $now,
        ];
        $pendingJobs = [];
        $amIds = [];
        foreach ($rows as $row) {
            $checkId = $this->resolveCheckRowId($row);
            Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields);
            $this->clearReferenceCheckQueueLock($checkId);
            $pendingJobs[] = [
                'check_id' => $checkId,
                'reference_no' => intval($this->arrGet($row, 'reference_no', 0)),
                'am_id' => intval($this->arrGet($row, 'am_id', 0)),
                'text_start' => intval($this->arrGet($row, 'text_start', 0)),
            ];
            $amId = intval($this->arrGet($row, 'am_id', 0));
            if ($amId > 0) {
                $amIds[$amId] = true;
            }
        }
        foreach (array_keys($amIds) as $amId) {
            $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
        }
        $checkIds = $this->pushJobsSortedByReferenceNo($pendingJobs);
        return [
            'p_refer_id' => $pReferId,
            'p_article_id' => $pArticleId,
            'reset' => count($rows),
            'queued' => count($checkIds),
            'check_ids' => $checkIds,
            'queue' => self::QUEUE_NAME,
        ];
    }

    /**
     * Re-check, synchronously inside the current request, every check row of an
     * article that belongs to one reference (matched by p_refer_id OR reference_no).
     *
     * Steps: resolve the reference, reset the matching rows (refreshing refer_text,
     * p_refer_id, p_article_id, refer_index), set the affected sections to
     * AM_STATUS_RUNNING, run runReferenceCheckOnce() per row in
     * (reference_no, am_id, text_start) order, then call syncAmRefCheckStatus()
     * for each affected section.
     *
     * Both return paths expose the same key set, so callers can read any key
     * without first checking which path was taken. Nothing is queued here, hence
     * 'queued' is always 0.
     *
     * @param int $articleId   Article id (required, must be > 0)
     * @param int $pReferId    production_article_refer.p_refer_id, or 0 to resolve by $referenceNo
     * @param int $referenceNo 1-based reference number, used when $pReferId is 0
     * @return array{article_id:int, p_refer_id:int, reference_no:int, reset:int, queued:int,
     *               checked:int, failed:int, check_ids:int[], results:array, errors:array, queue:string}
     * @throws \InvalidArgumentException when $articleId is not positive
     */
    public function recheckByRefer($articleId, $pReferId = 0, $referenceNo = 0)
    {
        $articleId = intval($articleId);
        if ($articleId <= 0) {
            throw new \InvalidArgumentException('article_id is required');
        }
        $ctx = $this->resolveReferForRecheck($articleId, intval($pReferId), intval($referenceNo));
        $refer = $ctx['refer'];
        $pReferId = $ctx['p_refer_id'];
        $referenceNo = $ctx['reference_no'];
        $pArticleId = $ctx['p_article_id'];
        $referText = $this->formatReferForLlm($refer);
        $now = date('Y-m-d H:i:s');
        $rows = Db::name('article_reference_check_result')
            ->where('article_id', $articleId)
            ->where(function ($query) use ($pReferId, $referenceNo) {
                $query->where('p_refer_id', $pReferId)->whereOr('reference_no', $referenceNo);
            })
            ->select();
        if (empty($rows)) {
            return [
                'article_id' => $articleId,
                'p_refer_id' => $pReferId,
                'reference_no' => $referenceNo,
                'reset' => 0,
                'queued' => 0,
                'checked' => 0,
                'failed' => 0,
                'check_ids' => [],
                'results' => [],
                'errors' => [],
                'queue' => self::QUEUE_NAME,
            ];
        }
        $resetFields = [
            'refer_text' => $referText,
            'p_refer_id' => $pReferId,
            'p_article_id' => $pArticleId,
            'refer_index' => $referenceNo,
            'status' => 0,
            'is_match' => 0,
            'can_support' => 0,
            'confidence' => 0,
            'reason' => '',
            'error_msg' => '',
            'updated_at' => $now,
        ];
        $pendingJobs = [];
        $amIds = [];
        foreach ($rows as $row) {
            $checkId = $this->resolveCheckRowId($row);
            Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields);
            $pendingJobs[] = [
                'check_id' => $checkId,
                'reference_no' => $referenceNo,
                'am_id' => intval($row['am_id']),
                'text_start' => intval(isset($row['text_start']) ? $row['text_start'] : 0),
            ];
            $amId = intval($row['am_id']);
            if ($amId > 0) {
                $amIds[$amId] = true;
            }
        }
        foreach (array_keys($amIds) as $amId) {
            $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
        }
        usort($pendingJobs, function ($a, $b) {
            if ($a['reference_no'] !== $b['reference_no']) {
                return $a['reference_no'] - $b['reference_no'];
            }
            if ($a['am_id'] !== $b['am_id']) {
                return $a['am_id'] - $b['am_id'];
            }
            return $a['text_start'] - $b['text_start'];
        });
        $checkIds = [];
        $results = [];
        $failed = [];
        foreach ($pendingJobs as $job) {
            $checkId = intval($job['check_id']);
            $checkIds[] = $checkId;
            $this->clearReferenceCheckQueueLock($checkId);
            try {
                $results[] = $this->runReferenceCheckOnce($checkId);
            } catch (\Exception $e) {
                // One failing row must not abort the remaining rows; collect and log it.
                $failed[] = [
                    'check_id' => $checkId,
                    'error' => $e->getMessage(),
                ];
                \think\Log::error('recheckByRefer check_id=' . $checkId . ' ' . $e->getMessage());
            }
        }
        foreach (array_keys($amIds) as $amId) {
            $this->syncAmRefCheckStatus($amId);
        }
        return [
            'article_id' => $articleId,
            'p_refer_id' => $pReferId,
            'reference_no' => $referenceNo,
            'reset' => count($rows),
            'queued' => 0,
            'checked' => count($results),
            'failed' => count($failed),
            'check_ids' => $checkIds,
            'results' => $results,
            'errors' => $failed,
            'queue' => self::QUEUE_NAME,
        ];
    }

    /**
     * Clear the queue's Redis completion markers so a re-check job is not silently dropped by acquireLock
     */
    public function clearReferenceCheckQueueLock($checkId)
    {
        $checkId = intval($checkId);
        if ($checkId <= 0) {
            return;
        }
        try {
            $keys = [];
            foreach (['queue_job', 'queue_job_two'] as $prefix) {
                $class = $prefix === 'queue_job_two'
                    ? 'app\\api\\job\\ReferenceCheckTwo'
                    : 'app\\api\\job\\ReferenceCheck';
                $base = $prefix . ':' . $class . ':' . $checkId;
                $keys[] = $base;
                $keys[] = $base . ':status';
            }
            QueueRedis::getInstance()->deleteRedisKeys($keys);
        } catch (\Exception $e) {
            \think\Log::warning('clearReferenceCheckQueueLock id=' . $checkId . ' ' .
$e->getMessage()); } } /** * 执行一次引用 LLM 校对(同步,写回 article_reference_check_result) */ public function runReferenceCheckOnce($checkId) { $checkId = intval($checkId); $row = Db::name('article_reference_check_result')->where('id', $checkId)->find(); if (empty($row)) { throw new \RuntimeException('article_reference_check_result not found, id=' . $checkId); } $contentA = $this->resolveMainContentForJob($row); $refer = null; if (intval($row['p_refer_id']) > 0) { $refer = Db::name('production_article_refer') ->where('p_refer_id', intval($row['p_refer_id'])) ->where('state', 0) ->find(); } if ($refer) { $contentB = $this->formatReferForLlm($refer); } else { $contentB = trim((string)$this->arrGet($row, 'refer_text', '')); } if ($contentA === '' || $contentB === '') { $this->updateCheckResult($checkId, [ 'status' => self::RECORD_FAILED, 'error_msg' => 'Missing section content (text/table) or refer_text', ]); throw new \RuntimeException('Missing section content (text/table) or refer_text'); } $llmResult = (new LLMService())->checkReference($contentA, $contentB, false); $requestFailed = !empty($llmResult['request_failed']); $canSupport = $this->parseLlmCanSupport($llmResult); $confidence = floatval(isset($llmResult['confidence']) ? $llmResult['confidence'] : 0); $reason = isset($llmResult['reason']) ? $llmResult['reason'] : ''; // LLM 通讯失败:写 status=RECORD_FAILED(3) + error_msg,抛异常让队列 worker 走 release(30) 重试; // 重试 3 次后 ReferenceCheck::markFailed 会保持 status=3 收尾 if ($requestFailed) { $this->updateCheckResult($checkId, [ 'confidence' => $confidence, 'reason' => $reason, 'status' => self::RECORD_FAILED, 'error_msg' => $reason, ]); $this->clearReferenceCheckQueueLock($checkId); throw new \RuntimeException($reason !== '' ? $reason : 'LLM request failed'); } $this->updateCheckResult($checkId, [ 'can_support' => $canSupport ? 1 : 0, 'is_match' => $canSupport ? 
1 : 0, 'confidence' => $confidence, 'reason' => $reason, 'status' => self::RECORD_COMPLETED, 'error_msg' => '', ]); $this->clearReferenceCheckQueueLock($checkId); $this->maybeEnqueueSecondPass($checkId, $confidence); return [ 'check_id' => $checkId, 'can_support' => $canSupport ? 1 : 0, 'is_match' => $canSupport ? 1 : 0, 'confidence' => $confidence, 'reason' => $reason, ]; } /** * @return array{refer: array, p_article_id: int, p_refer_id: int, reference_no: int} */ private function resolveReferForRecheck($articleId, $pReferId, $referenceNo) { $prod = Db::name('production_article') ->where('article_id', $articleId) ->whereIn('state', [0, 2]) ->find(); if (empty($prod)) { throw new \RuntimeException('production_article not found for article_id=' . $articleId); } $pArticleId = intval($prod['p_article_id']); $refer = null; if ($pReferId > 0) { $refer = Db::name('production_article_refer') ->where('p_refer_id', $pReferId) ->where('p_article_id', $pArticleId) ->where('state', 0) ->find(); } elseif ($referenceNo > 0) { $referMap = $this->loadReferMapByPArticleId($pArticleId); $referIndex = $referenceNo - 1; if (isset($referMap[$referIndex])) { $refer = $referMap[$referIndex]; $pReferId = intval($refer['p_refer_id']); } } else { throw new \InvalidArgumentException('p_refer_id or reference_no is required'); } if (empty($refer)) { throw new \RuntimeException('production_article_refer not found'); } return [ 'refer' => $refer, 'p_article_id' => $pArticleId, 'p_refer_id' => intval($refer['p_refer_id']), 'reference_no' => intval($refer['index']) + 1, ]; } /** * 仅使用 refer_doi 字段(二次 Crossref 摘要用) */ public function extractReferDoiOnly($refer) { if (!is_array($refer)) { return ''; } $raw = trim((string)$this->arrGet($refer, 'refer_doi', '')); if ($raw === '' || stripos($raw, 'not available') !== false) { return ''; } $dois = $this->extractDoisFromString($raw); return empty($dois) ? 
'' : $dois[0]; } /** * 根据 refer_doi 调用 Crossref works API 获取摘要(二次校对专用) * * @return array{text:string, has_abstract:bool, doi:string} */ public function fetchCrossrefAbstractByReferDoi($refer) { $doi = $this->extractReferDoiOnly($refer); if ($doi === '') { return ['text' => '', 'has_abstract' => false, 'doi' => '']; } $crossref = new CrossrefService([ 'mailto' => trim((string)Env::get('crossref_mailto', '')), ]); $block = $this->extractCrossrefBlock($doi, $crossref); if ($block === null) { return ['text' => '', 'has_abstract' => false, 'doi' => $doi]; } return [ 'text' => $block['text'], 'has_abstract' => !empty($block['has_abstract']), 'doi' => $doi, ]; } /** * 解析 LLM 返回的 can_support */ public function parseLlmCanSupport($llmResult) { if (!is_array($llmResult)) { return false; } if (array_key_exists('can_support', $llmResult)) { return $this->parseLlmIsMatch($llmResult['can_support']); } return $this->parseLlmIsMatch(isset($llmResult['is_match']) ? $llmResult['is_match'] : false); } /** * 第一次校对:正文取 article_main.content;表格(type=2)取 article_main_table.table_data 等 */ public function resolveMainContentForJob(array $row, $maxChars = 8000) { $amId = intval($this->arrGet($row, 'am_id', 0)); if ($amId <= 0) { return ''; } $main = Db::name('article_main') ->field('content,type,amt_id,article_id') ->where('am_id', $amId) ->find(); if (empty($main)) { return ''; } $raw = trim($this->resolveArticleMainCheckContent($main)); if ($raw === '') { return ''; } return $this->normalizeCheckContentForLlm($raw, $maxChars); } /** * 是否为表格节:type=2、有 amt_id,或 content 为 <table tableId='…'/> 占位 */ private function isArticleMainTableSection(array $main) { if (intval($this->arrGet($main, 'type', self::MAIN_TYPE_TEXT)) === self::MAIN_TYPE_TABLE) { return true; } if (intval($this->arrGet($main, 'amt_id', 0)) > 0) { return true; } $content = (string)$this->arrGet($main, 'content', ''); return stripos($content, 'arrGet($main, 'amt_id', 0)); if ($amtId > 0) { return $amtId; } $content = 
(string)$this->arrGet($main, 'content', '');
        if (preg_match('/tableId\s*=\s*[\'"]?(\d+)/i', $content, $m)) {
            return intval($m[1]);
        }
        return 0;
    }

    /**
     * Load the article_main_table row (table_data, title, note) that backs a table section.
     *
     * @param array $main article_main row (or a compatible array)
     * @return array|null The table row, or null when no table id can be resolved or nothing is found
     */
    private function loadArticleMainTableRow(array $main)
    {
        $tableId = $this->resolveArticleMainTableAmtId($main);
        if ($tableId <= 0) {
            return null;
        }

        $query = Db::name('article_main_table')
            ->where('amt_id', $tableId)
            ->whereIn('state', [0, 2])
            ->field('table_data,title,note');

        // Narrow by article only when the caller supplied one.
        $ownerArticleId = intval($this->arrGet($main, 'article_id', 0));
        if ($ownerArticleId > 0) {
            $query->where('article_id', $ownerArticleId);
        }

        $found = $query->find();
        if (empty($found)) {
            return null;
        }
        return $found;
    }

    /**
     * Extract citations for one section: text sections scan `content`; table sections
     * scan title/note first and then the table rows, each row joined into a single line
     * so that a cell holding only "[n]" still carries the context of its row.
     *
     * @param array $main article_main row
     * @return array List of citation entries as produced by extractReferences()
     */
    public function extractReferencesForArticleMain(array $main)
    {
        if (!$this->isArticleMainTableSection($main)) {
            $body = (string)$this->arrGet($main, 'content', '');
            return $this->extractReferences($body);
        }

        $tableRow = $this->loadArticleMainTableRow($main);
        if (empty($tableRow)) {
            return [];
        }

        $title = trim((string)$this->arrGet($tableRow, 'title', ''));
        $note = trim((string)$this->arrGet($tableRow, 'note', ''));
        $prefix = array_values(array_filter([$title, $note], function ($piece) {
            return $piece !== '';
        }));

        $json = (string)$this->arrGet($tableRow, 'table_data', '');
        return $this->extractReferencesFromTableDataJson($json, $prefix);
    }

    /**
     * Extract citations from table_data row by row; $prefixChunks (title, note, ...)
     * are scanned before the rows.
     *
     * All offsets in the returned entries are shifted so that they index into the
     * text obtained by joining the non-empty chunks/lines with a one-byte separator.
     * When table_data is not decodable JSON the raw string is scanned as one chunk.
     *
     * @param string $tableDataJson table_data column value
     * @param array  $prefixChunks  Text chunks that precede the table rows
     * @return array List of citation entries with shifted text_* / reference_* offsets
     */
    public function extractReferencesFromTableDataJson($tableDataJson, array $prefixChunks = [])
    {
        $citations = [];
        $cursor = 0;

        // Scan one chunk and append its citations, moved by the current cursor.
        $collect = function ($text) use (&$citations, &$cursor) {
            foreach ($this->extractReferences($text) as $cite) {
                foreach (['text_start', 'text_end', 'reference_start', 'reference_end'] as $key) {
                    $cite[$key] = intval($cite[$key]) + $cursor;
                }
                $citations[] = $cite;
            }
        };

        foreach ($prefixChunks as $piece) {
            $piece = trim((string)$piece);
            if ($piece !== '') {
                $collect($piece);
                $cursor += strlen($piece) + 1; // +1 for the separator between chunks
            }
        }

        $json = trim((string)$tableDataJson);
        if ($json === '') {
            return $citations;
        }

        $tableRows = $this->decodeTableDataJsonToArray($json);
        if ($tableRows === null) {
            $collect($json);
            return $citations;
        }

        foreach ($tableRows as $tableRow) {
            $line = $this->buildTableRowCheckLine($tableRow);
            if ($line !== '') {
                $collect($line);
                $cursor += strlen($line) + 1;
            }
        }
        return $citations;
    }

    /**
     * Raw HTML used for queueing / the LLM: `content` for text sections; for table
     * sections the title, note and flattened table_data joined by newlines.
     *
     * @param array $main article_main row
     * @return string Section text, '' when a table section has no loadable table row
     */
    public function resolveArticleMainCheckContent(array $main)
    {
        if (!$this->isArticleMainTableSection($main)) {
            return (string)$this->arrGet($main, 'content', '');
        }

        $tableRow = $this->loadArticleMainTableRow($main);
        if (empty($tableRow)) {
            return '';
        }

        $pieces = [
            trim((string)$this->arrGet($tableRow, 'title', '')),
            trim((string)$this->arrGet($tableRow, 'note', '')),
            $this->flattenTableDataJsonToCheckContent((string)$this->arrGet($tableRow, 'table_data', '')),
        ];
        $kept = [];
        foreach ($pieces as $piece) {
            if ($piece !== '') {
                $kept[] = $piece;
            }
        }
        return implode("\n", $kept);
    }

    /**
     * One table row: the non-empty `text` of each cell joined with " | ".
     *
     * @param mixed $row Decoded row; anything that is not an array yields ''
     * @return string
     */
    private function buildTableRowCheckLine($row)
    {
        if (!is_array($row)) {
            return '';
        }
        $texts = [];
        foreach ($row as $cell) {
            $value = is_array($cell) ? trim((string)$this->arrGet($cell, 'text', '')) : '';
            if ($value === '') {
                continue;
            }
            $texts[] = $value;
        }
        return implode(' | ', $texts);
    }

    /**
     * Flatten table_data row by row (for the LLM / preview); invalid JSON is handled as one raw string
     */
    private function
flattenTableDataJsonToCheckContent($tableDataJson)
    {
        $tableDataJson = trim((string)$tableDataJson);
        if ($tableDataJson === '') {
            return '';
        }
        $decoded = $this->decodeTableDataJsonToArray($tableDataJson);
        if ($decoded === null) {
            return $tableDataJson;
        }
        $lines = [];
        foreach ($decoded as $row) {
            $line = $this->buildTableRowCheckLine($row);
            if ($line !== '') {
                $lines[] = $line;
            }
        }
        return implode("\n", $lines);
    }

    /**
     * Decode a table_data JSON string into an array.
     *
     * A leading UTF-8 BOM is removed first. A JSON value that is itself a string
     * holding JSON (double-encoded) is decoded a second time.
     *
     * @param mixed $raw JSON text
     * @return array|null Decoded array, or null for empty input, invalid JSON or a non-array value
     */
    private function decodeTableDataJsonToArray($raw)
    {
        $raw = trim((string)$raw);
        if ($raw === '') {
            return null;
        }
        if (preg_match('/^\xEF\xBB\xBF/', $raw)) {
            $raw = substr($raw, 3);
        }
        $decoded = json_decode($raw, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            return null;
        }
        if (is_array($decoded)) {
            return $decoded;
        }
        if (is_string($decoded)) {
            $decoded2 = json_decode($decoded, true);
            if (json_last_error() === JSON_ERROR_NONE && is_array($decoded2)) {
                return $decoded2;
            }
        }
        return null;
    }

    /**
     * Reduce section HTML to plain text for the LLM: citation tags become "[n]",
     * remaining tags are stripped, entities decoded, whitespace collapsed, and the
     * result is cut to $maxChars characters (minimum 500) with a trailing "...".
     *
     * Invalid UTF-8 in the input used to make the /u regexes fail, which silently
     * turned the whole section into '' (reported upstream as missing content).
     * The input is now re-encoded first so invalid sequences are substituted, and
     * regex failures fall back to the unmodified text. Character counting and
     * cutting name UTF-8 explicitly instead of relying on mbstring's internal encoding.
     *
     * @param string $raw      Section HTML
     * @param int    $maxChars Maximum number of characters to keep (values below 500 are raised to 500)
     * @return string Normalised text, '' when nothing remains
     */
    private function normalizeCheckContentForLlm($raw, $maxChars = 8000)
    {
        $raw = (string)$raw;
        if (!mb_check_encoding($raw, 'UTF-8')) {
            // Replaces invalid byte sequences so the /u patterns below can run.
            $raw = mb_convert_encoding($raw, 'UTF-8', 'UTF-8');
        }

        $text = $this->pregReplaceBlueTags($raw, '[$1]');
        // A successful replacement never empties a non-empty input (each tag becomes
        // "[...]"), so null/'' here means the regex failed: keep the original text.
        if ($text === null || ($text === '' && $raw !== '')) {
            $text = $raw;
        }

        $text = strip_tags($text);
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        $collapsed = preg_replace('/\s+/u', ' ', $text);
        if ($collapsed === null) {
            // Byte-wise fallback if the Unicode-mode pattern could not run.
            $collapsed = preg_replace('/\s+/', ' ', $text);
        }
        $text = trim((string)$collapsed);
        if ($text === '') {
            return '';
        }

        $maxChars = max(500, intval($maxChars));
        if (mb_strlen($text, 'UTF-8') > $maxChars) {
            $text = mb_substr($text, 0, $maxChars, 'UTF-8') . '...';
        }
        return $text;
    }

    /**
     * Local context around a citation (origin_text, falling back to content_a), for use by other callers.
     *
     * @param array $row Check row
     * @return string Trimmed context text, possibly ''
     */
    public function resolveCitationContextForJob(array $row)
    {
        $text = trim((string)$this->arrGet($row, 'origin_text', ''));
        if ($text === '') {
            $text = trim((string)$this->arrGet($row, 'content_a', ''));
        }
        return $text;
    }

    /**
     * Extract a standard DOI (10.xxxx/...) from a refer row.
     *
     * Priority: refer_content (the DOI in the raw citation text is closest to the
     * work actually cited) > refer_doi > doi > doilink
     */
    public function extractDoiFromRefer($refer)
    {
        $list = $this->extractAllDoiCandidatesFromRefer($refer);
        return empty($list) ?
'' : $list[0];
    }

    /**
     * Collect every DOI candidate a refer row may point to, de-duplicated and in
     * priority order: refer_content, refer_doi, doi, doilink.
     *
     * Used by the second-pass DOI review: when the metadata refer_doi disagrees
     * with the DOI printed in the raw citation text, the raw-text DOI is tried first.
     *
     * @param mixed $refer Refer row; anything that is not an array yields []
     * @return string[]
     */
    public function extractAllDoiCandidatesFromRefer($refer)
    {
        if (!is_array($refer)) {
            return [];
        }

        $found = [];
        foreach (['refer_content', 'refer_doi', 'doi', 'doilink'] as $field) {
            $text = (string)$this->arrGet($refer, $field, '');
            foreach ($this->extractDoisFromString($text) as $doi) {
                if (in_array($doi, $found, true)) {
                    continue;
                }
                $found[] = $doi;
            }
        }
        return $found;
    }

    /**
     * Pull every DOI of the form 10.xxxx/yyy out of arbitrary text.
     *
     * Text that is empty or contains "not available" (case-insensitive) yields [].
     * doi.org links are scanned first, then bare DOIs; if neither pattern produced
     * a result and the text itself starts with "10.", the whole text is tried.
     *
     * @param mixed $text
     * @return string[] Unique DOIs in order of first appearance
     */
    private function extractDoisFromString($text)
    {
        $haystack = trim((string)$text);
        if ($haystack === '') {
            return [];
        }
        if (stripos($haystack, 'not available') !== false) {
            return [];
        }

        $patterns = [
            '~doi\.org/([^\s?#"\'<>]+)~i',
            '~\b(10\.\d{3,9}/[^\s?#"\'<>]+)~i',
        ];
        $found = [];
        foreach ($patterns as $pattern) {
            if (!preg_match_all($pattern, $haystack, $hits)) {
                continue;
            }
            foreach ($hits[1] as $candidate) {
                $candidate = $this->trimDoiTail(trim($candidate));
                if ($this->isValidDoi($candidate)) {
                    $found[] = $candidate;
                }
            }
        }

        if ($found === [] && strpos($haystack, '10.') === 0) {
            $candidate = $this->trimDoiTail($haystack);
            if ($this->isValidDoi($candidate)) {
                $found[] = $candidate;
            }
        }

        return array_values(array_unique($found));
    }

    /**
     * Strip trailing punctuation, quotes, closing brackets, backslashes and
     * whitespace that commonly trail a DOI in running text.
     */
    private function trimDoiTail($doi)
    {
        $trailing = ".,;:)]}>\"'\\ \t\n\r";
        return rtrim($doi, $trailing);
    }

    /**
     * True when the value has the shape "10.<3-9 digits>/<non-whitespace>".
     */
    private function isValidDoi($doi)
    {
        $candidate = (string)$doi;
        return preg_match('~^10\.\d{3,9}/[^\s]+$~i', $candidate) === 1;
    }

    /**
     * Fetch the literature content behind a DOI via PubMed / Crossref (the local
     * LLM cannot open web pages, so it must be fetched in advance).
     *
     * Behaviour:
     * - try every DOI candidate of the refer row (refer_content > refer_doi > doi > doilink)
     * - prefer the first DOI for which an abstract can be obtained
     * - when PubMed has no abstract, fall back to parsing the Crossref raw abstract (JATS tags cleaned)
     * - return an empty string when everything fails (callers then skip the second review)
     */
    public function fetchDoiLiteratureBlock($refer)
    {
        $candidates = $this->extractAllDoiCandidatesFromRefer($refer);
        if
(empty($candidates)) { return ''; } $pubmed = new PubmedService([ 'email' => trim((string)Env::get('pubmed_email', '')), 'tool' => trim((string)Env::get('pubmed_tool', 'tmrjournals')), ]); $crossref = new CrossrefService([ 'mailto' => trim((string)Env::get('crossref_mailto', '')), ]); $best = null; $fallback = null; foreach ($candidates as $doi) { $block = $this->buildDoiBlockFromSources($doi, $pubmed, $crossref); if ($block === null) { continue; } if (!empty($block['has_abstract'])) { $best = $block; break; } if ($fallback === null) { $fallback = $block; } } $chosen = $best ?: $fallback; if ($chosen === null) { return ''; } return $chosen['text']; } /** * 拉单个 DOI 的真实内容,返回 ['text' => string, 'has_abstract' => bool] 或 null */ private function buildDoiBlockFromSources($doi, PubmedService $pubmed, CrossrefService $crossref) { $doi = trim((string)$doi); if ($doi === '') { return null; } $pub = $pubmed->fetchByDoi($doi); $pubAbstract = is_array($pub) ? trim((string)$this->arrGet($pub, 'abstract', '')) : ''; if (is_array($pub) && ($pubAbstract !== '' || trim((string)$this->arrGet($pub, 'title', '')) !== '')) { $lines = ['Source: PubMed (DOI ' . $doi . ')']; if (!empty($pub['title'])) { $lines[] = 'Actual Title: ' . trim((string)$pub['title']); } if (!empty($pub['journal'])) { $lines[] = 'Journal: ' . trim((string)$pub['journal']); } if (!empty($pub['year'])) { $lines[] = 'Year: ' . trim((string)$pub['year']); } if (!empty($pub['publication_types'])) { $lines[] = 'Publication Types: ' . implode('; ', (array)$pub['publication_types']); } if (!empty($pub['mesh_terms'])) { $lines[] = 'MeSH: ' . implode('; ', (array)$pub['mesh_terms']); } if ($pubAbstract !== '') { $lines[] = 'Abstract: ' . $this->truncate($pubAbstract, 3500); } if ($pubAbstract === '') { $cr = $this->extractCrossrefBlock($doi, $crossref); if ($cr !== null && $cr['has_abstract']) { $lines[] = "\n--- Crossref 补充 ---\n" . 
$cr['text']; return ['text' => implode("\n", $lines), 'has_abstract' => true]; } } return ['text' => implode("\n", $lines), 'has_abstract' => $pubAbstract !== '']; } return $this->extractCrossrefBlock($doi, $crossref); } /** * 从 Crossref 拉取标题/期刊/作者/摘要(abstract 通常包裹 JATS XML,需清洗) * @return array|null ['text' => string, 'has_abstract' => bool] */ private function extractCrossrefBlock($doi, CrossrefService $crossref) { $msg = $crossref->fetchWork($doi); if (!is_array($msg)) { return null; } $summary = $crossref->fetchWorkSummary($doi); if (!is_array($summary)) { $summary = []; } $lines = ['Source: Crossref api.crossref.org/works/' . rawurlencode($doi)]; $title = isset($msg['title'][0]) ? trim((string)$msg['title'][0]) : trim((string)$this->arrGet($summary, 'title', '')); if ($title !== '') { $lines[] = 'Actual Title: ' . $title; } if (!empty($summary['joura'])) { $lines[] = 'Journal: ' . trim((string)$summary['joura']); } if (!empty($summary['author_str'])) { $lines[] = 'Authors: ' . trim((string)$summary['author_str']); } if (!empty($summary['dateno'])) { $lines[] = 'Publication: ' . trim((string)$summary['dateno']); } if (!empty($summary['doilink'])) { $lines[] = 'DOI Link: ' . trim((string)$summary['doilink']); } if (!empty($summary['is_retracted'])) { $lines[] = 'Retraction: yes - ' . trim((string)$this->arrGet($summary, 'retract_reason', '')); } $abstract = $this->cleanCrossrefAbstract((string)$this->arrGet($msg, 'abstract', '')); $hasAbstract = $abstract !== ''; if ($hasAbstract) { $lines[] = 'Abstract: ' . 
$this->truncate($abstract, 3500); } else { $lines[] = 'Note: Crossref 未返回摘要,请结合标题/期刊/作者与正文谨慎判断。'; } return ['text' => implode("\n", $lines), 'has_abstract' => $hasAbstract]; } private function cleanCrossrefAbstract($raw) { $raw = trim((string)$raw); if ($raw === '') { return ''; } $raw = preg_replace('~]*>.*?~is', '', $raw); $raw = preg_replace('~]*>~i', "\n", $raw); $raw = preg_replace('~~i', '', $raw); $raw = preg_replace('~]+>~i', '', $raw); $raw = strip_tags($raw); $raw = preg_replace('/[ \t]+/u', ' ', $raw); $raw = preg_replace("/\r\n|\r/u", "\n", $raw); $raw = preg_replace("/\n{2,}/u", "\n", $raw); return trim($raw); } private function truncate($text, $max) { $text = (string)$text; if (mb_strlen($text) <= $max) { return $text; } return mb_substr($text, 0, $max) . '...'; } /** * 第二次 DOI 复核数据准备:返回书目信息 + 真实抓取内容 * * @return array{refer_text:string, doi_block:string, has_abstract:bool, doi_used:string} */ public function prepareRecheckPayload($refer, $referText = '') { $base = trim($referText) !== '' ? trim($referText) : $this->formatReferForLlm($refer); $cr = $this->fetchCrossrefAbstractByReferDoi($refer); return [ 'refer_text' => $base, 'doi_block' => $cr['text'], 'has_abstract' => $cr['has_abstract'], 'doi_used' => $cr['doi'], ]; } /** * 旧接口:拼接成单块文本(向后兼容,建议调用方改用 prepareRecheckPayload) */ public function formatReferForDoiRecheck($refer, $referText = '') { $payload = $this->prepareRecheckPayload($refer, $referText); if ($payload['doi_block'] === '') { return $payload['refer_text'] . "\n\n【DOI 文献真实内容】\n未能从 PubMed/Crossref 获取该 DOI 的摘要或元数据,请依据书目条目与正文谨慎判断。"; } return $payload['refer_text'] . "\n\n【Crossref 摘要(依据 Refer_doi 从 api.crossref.org/works 获取)】\n" . 
$payload['doi_block'];
    }

    /**
     * After the first pass, queue the delayed second-pass review when the confidence
     * is not above PASS_CONFIDENCE_THRESHOLD and real DOI content can be fetched.
     *
     * Skipped (returns false) to avoid re-running with an identical outcome when:
     * - $checkId is invalid or the first-pass confidence is above the threshold
     * - the check row or its refer row does not exist
     * - refer_doi is empty or Crossref returns no abstract
     *
     * The threshold comes from the class constant rather than a repeated literal,
     * so the pass rule and the second-pass trigger cannot drift apart.
     *
     * @param int   $checkId    article_reference_check_result.id
     * @param float $confidence First-pass confidence
     * @return bool true when a second-pass job was pushed
     */
    public function maybeEnqueueSecondPass($checkId, $confidence)
    {
        $checkId = intval($checkId);
        $confidence = floatval($confidence);
        if ($checkId <= 0 || $confidence > self::PASS_CONFIDENCE_THRESHOLD) {
            return false;
        }
        $row = Db::name('article_reference_check_result')->where('id', $checkId)->find();
        if (empty($row)) {
            return false;
        }
        $refer = null;
        if (intval($row['p_refer_id']) > 0) {
            $refer = Db::name('production_article_refer')
                ->where('p_refer_id', intval($row['p_refer_id']))
                ->where('state', 0)
                ->find();
        }
        if (empty($refer) || $this->extractReferDoiOnly($refer) === '') {
            return false;
        }
        $cr = $this->fetchCrossrefAbstractByReferDoi($refer);
        if (empty($cr['has_abstract'])) {
            return false;
        }
        // Drop any completion marker so the new job is not discarded as already done.
        $this->clearReferenceCheckQueueLock($checkId);
        $this->pushJob2($checkId, 5);
        return true;
    }

    /**
     * Extract blue citation tags from section HTML or from flattened table HTML
     */
    public function extractReferences($content)
    {
        $result = [];
        $matches = $this->collectBlueTagMatches($content);
        if (empty($matches[0])) {
            return [];
        }
        $tagSpans = [];
        foreach ($matches[0] as $index => $match) {
            $tagSpans[] = [
                'start' => $match[1],
                'end' => $match[1] + strlen($match[0]),
                'index' => $index,
            ];
        }
        foreach ($matches[0] as $index => $match) {
            $fullTag = $match[0];
            $tagStart = $match[1];
            $tagEnd = $tagStart + strlen($fullTag);
            $rawRef = trim($matches[1][$index][0]);
            $referenceNumbers = $this->expandReferenceNumbers($rawRef);
            list($localStart, $localEnd, $originalText) = $this->extractLocalCitationContext(
                $content,
                $tagStart,
                $tagEnd,
                $tagSpans
            );
            if (!$this->isMeaningfulCitationContext($originalText) || empty($referenceNumbers)) {
                continue;
            }
            $result[] = [
                'reference_raw' => $rawRef,
                'reference_numbers' => $referenceNumbers,
                'original_text' => $originalText,
                'reference_start' => $tagStart,
                'reference_end' => $tagEnd,
                'text_start' => $localStart,
                'text_end' => $localEnd,
            ];
        }
        return
$result; } /** * 按引用位置截取局部上下文:优先取标签前叙述;同句多引时后续引用从上一标签后开始。 */ private function extractLocalCitationContext($content, $tagStart, $tagEnd, array $tagSpans) { $paragraphStart = $this->findParagraphStart($content, $tagStart); $sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd); $prevTagEnd = $paragraphStart; $nextTagStart = $sentenceEnd; foreach ($tagSpans as $span) { if ($span['end'] <= $tagStart && $span['end'] > $prevTagEnd) { $prevTagEnd = $span['end']; } if ($span['start'] > $tagEnd && $span['start'] < $nextTagStart) { $nextTagStart = $span['start']; } } $hasPriorCiteInParagraph = ($prevTagEnd > $paragraphStart); $sentenceStart = $this->findSentenceStart($content, $tagStart); // 段内首个引用:整段到标签前;后续引用:取「本句」起点(可早于上一标签),避免只剩 “and external environment” 再误用标签后文本 if ($hasPriorCiteInParagraph) { $localStart = max($paragraphStart, $sentenceStart); } else { $localStart = $this->capContextStartBeforeTag($content, $tagStart, $paragraphStart); } // 默认:引用标签前的论述 $localEnd = $tagStart; $originalText = $this->buildCitationContextText($content, $localStart, $localEnd); // 仅段内首个引用、且标签前极短(如句末 ICU nurses [14])时,才改用标签后片段;同段多引禁止标签后截取(会错取下一句) $allowTrailing = !$hasPriorCiteInParagraph; if ($allowTrailing && ( !$this->isMeaningfulCitationContext($originalText) || $this->shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd) )) { $trailEnd = ($nextTagStart < $sentenceEnd) ? 
$nextTagStart : $sentenceEnd; $trailText = $this->buildCitationContextText($content, $tagEnd, $trailEnd); if ($this->isMeaningfulCitationContext($trailText)) { $localStart = $tagEnd; $localEnd = $trailEnd; $originalText = $trailText; } } if (!$this->isMeaningfulCitationContext($originalText)) { list($localStart, $localEnd) = $this->widenCitationContextBounds( $content, $tagStart, $tagEnd, $localStart, $localEnd ); $originalText = $this->buildCitationContextText($content, $localStart, $localEnd); } return [$localStart, $localEnd, $originalText]; } /** * 标签前仅有作者缩写等极短片段时,改用标签后上下文 */ private function shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd) { $before = $this->buildCitationContextText($content, $localStart, $tagStart); if (!$this->isMeaningfulCitationContext($before)) { return true; } return mb_strlen($before) < 25; } public function expandReferenceNumbers($refStr) { $refStr = str_replace( [',', '–', '—', '−', '‐', '‑'], [',', '-', '-', '-', '-', '-'], trim($refStr) ); $numbers = []; foreach (explode(',', $refStr) as $part) { $part = trim($part); if ($part === '') { continue; } if (preg_match('/^(\d+)\s*-\s*(\d+)$/', $part, $m)) { $start = intval($m[1]); $end = intval($m[2]); if ($start <= $end) { $numbers = array_merge($numbers, range($start, $end)); } } elseif (ctype_digit($part)) { $numbers[] = intval($part); } } return array_values(array_unique($numbers)); } /** * 返回 $bytePos 处 UTF-8 码点占用的最后一字节之后的位置(下一字符起始) */ private function utf8CharEnd($content, $bytePos) { $len = strlen($content); if ($bytePos < 0 || $bytePos >= $len) { return max(0, min($len, $bytePos + 1)); } $next = $bytePos + 1; while ($next < $len && (ord($content[$next]) & 0xC0) === 0x80) { $next++; } return $next; } /** * 按字节偏移截取(与 strpos/strlen 一致);勿用 mb_substr,否则遇中文前缀会截断英文词头 */ private function byteSubstr($content, $start, $end) { $length = max(0, $end - $start); if ($length === 0) { return ''; } return (string)mb_strcut($content, $start, $length, 'UTF-8'); } private 
function buildCitationContextText($content, $start, $end) { $text = $this->byteSubstr($content, $start, $end); $text = $this->pregReplaceBlueTags($text, ''); $text = trim(strip_tags($text)); $text = preg_replace('/\s+/u', ' ', $text); $text = ltrim($text, "\xEF\xBB\xBF"); return $text; } /** * 过滤仅标点、过短或无字母/汉字的上下文(如去掉标签后只剩 ".") */ private function isMeaningfulCitationContext($text) { $text = trim($text); if ($text === '') { return false; } if ($this->isOnlyPunctuationOrSpace($text)) { return false; } if (!preg_match('/[\p{L}\p{N}]/u', $text)) { return false; } return mb_strlen($text) >= 2; } private function isOnlyPunctuationOrSpace($text) { return preg_match('/^[\s\p{P}\p{S}]+$/u', $text) === 1; } /** * 首句过短时向前后各扩展一句(上限约 2000 字符) */ private function widenCitationContextBounds($content, $tagStart, $tagEnd, $start, $end) { $len = strlen($content); $maxSpan = 2000; if ($start > 0) { $prevStart = $this->findSentenceStart($content, max(0, $start - 1)); if ($prevStart < $start) { $start = $prevStart; } } $nextEnd = $this->findSentenceEnd($content, $end, $tagEnd); if ($nextEnd > $end && $nextEnd <= $len) { $end = $nextEnd; } if ($end - $start > $maxSpan) { $half = (int)floor($maxSpan / 2); $mid = (int)floor(($tagStart + $tagEnd) / 2); $start = max(0, $mid - $half); $end = min($len, $start + $maxSpan); } return [$start, $end]; } /** * 句号是否可作为句界(排除小数点、et al. 等缩写) */ private function isSentenceDelimiterAt($content, $pos, $delimiter) { $len = strlen($content); if ($delimiter !== '.' 
|| $pos < 0 || $pos >= $len) { return true; } if ($pos > 0 && $pos + 1 < $len && ctype_digit($content[$pos - 1]) && ctype_digit($content[$pos + 1]) ) { return false; } $before = substr($content, max(0, $pos - 12), min(12, $pos)); if (preg_match('/\b(et\s+al|e\.g|i\.e|vs|etc|fig|no)\s*\.?\s*$/i', $before)) { return false; } $after = substr($content, $pos + 1, 24); if (preg_match('/^\s*\s*\[/', $after)) { return false; } return true; } /** * 段落起始(HTML / 换行),避免英文多句段落只取到最后一个句号后的一句 */ private function findParagraphStart($content, $tagStart) { $search = substr($content, 0, max(0, $tagStart)); if ($search === '') { return 0; } $best = 0; if (preg_match_all('/]*>/i', $search, $m, PREG_OFFSET_CAPTURE)) { $last = end($m[0]); $best = max($best, $last[1] + strlen($last[0])); } if (preg_match_all('/<\/p>\s*/i', $search, $m, PREG_OFFSET_CAPTURE)) { $last = end($m[0]); $best = max($best, $last[1] + strlen($last[0])); } if (preg_match_all('/\s*/i', $search, $m, PREG_OFFSET_CAPTURE)) { $last = end($m[0]); $best = max($best, $last[1] + strlen($last[0])); } $pos = strrpos($search, "\n\n"); if ($pos !== false) { $best = max($best, $pos + 2); } $pos = strrpos($search, "\n"); if ($pos !== false) { $best = max($best, $pos + 1); } return $best; } /** * 段落过长时从引用处向前截取上限,避免单次 LLM 上下文过大 */ private function capContextStartBeforeTag($content, $tagStart, $paragraphStart, $maxBytes = 2500) { if ($tagStart - $paragraphStart <= $maxBytes) { return $paragraphStart; } $start = $tagStart - $maxBytes; $slice = substr($content, $start, $tagStart - $start); if (preg_match('/[.!?。!?]\s+/u', $slice, $m, PREG_OFFSET_CAPTURE)) { $rel = $m[0][1] + strlen($m[0][0]); return $start + $rel; } return max($paragraphStart, $start); } private function findSentenceStart($content, $position) { $start = 0; foreach (['.', '。', '!', '?', "\n"] as $delimiter) { $pos = strrpos(substr($content, 0, $position), $delimiter); if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) { $start = max($start, 
$this->utf8CharEnd($content, $pos)); } } return $start; } /** * @param int $searchFrom 从该字节位置起查找句末 * @param int $tagEnd 引用标签结束位置;用于跳过 后紧跟的孤立句号 */ private function findSentenceEnd($content, $searchFrom, $tagEnd = 0) { $length = strlen($content); $minPos = max(0, $searchFrom); while ($minPos < $length) { $endPositions = []; foreach (['.', '。', '!', '?', "\n"] as $delimiter) { $pos = strpos($content, $delimiter, $minPos); if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) { $endPositions[] = $this->utf8CharEnd($content, $pos); } } if (empty($endPositions)) { return $length; } $end = min($endPositions); if ($tagEnd <= 0 || $end <= $tagEnd) { return $end; } $gap = substr($content, $tagEnd, $end - $tagEnd); $gapText = trim(strip_tags($this->pregReplaceBlueTags($gap, ''))); if ($gapText !== '' && !$this->isOnlyPunctuationOrSpace($gapText)) { return $end; } $minPos = $end; } return $length; } /** * 已入库记录按文献编号正序入队(同号按 am_id、正文位置稳定排序) * * @param array $rows 元素含 check_id、reference_no,可选 am_id、text_start */ private function pushJobsSortedByReferenceNo(array $rows) { if (empty($rows)) { return []; } usort($rows, function ($a, $b) { if ($a['reference_no'] !== $b['reference_no']) { return $a['reference_no'] - $b['reference_no']; } $amA = isset($a['am_id']) ? intval($a['am_id']) : 0; $amB = isset($b['am_id']) ? intval($b['am_id']) : 0; if ($amA !== $amB) { return $amA - $amB; } $posA = isset($a['text_start']) ? intval($a['text_start']) : 0; $posB = isset($b['text_start']) ? 
intval($b['text_start']) : 0; return $posA - $posB; }); $checkIds = []; $delay = 0; foreach ($rows as $row) { $checkId = intval($row['check_id']); $checkIds[] = $checkId; $this->pushJob($checkId, $delay); $delay++; } return $checkIds; } private function pushJob($checkId, $delaySeconds = 0) { $checkId = intval($checkId); $this->clearReferenceCheckQueueLock($checkId); $jobClass = 'app\api\job\ReferenceCheck@fire'; $data = ['check_id' => $checkId]; try { if ($delaySeconds > 0) { $jobId = Queue::later($delaySeconds, $jobClass, $data, self::QUEUE_NAME); } else { $jobId = Queue::push($jobClass, $data, self::QUEUE_NAME); } } catch (\Exception $e) { \think\Log::error('ReferenceCheck pushJob failed check_id=' . $checkId . ' ' . $e->getMessage()); throw $e; } } private function pushJob2($checkId, $delaySeconds = 0) { $jobClass = 'app\api\job\ReferenceCheckTwo@fire'; $data = ['check_id' => $checkId]; try { if ($delaySeconds > 0) { $jobId = Queue::later($delaySeconds, $jobClass, $data, self::QUEUE_NAME); } else { $jobId = Queue::push($jobClass, $data, self::QUEUE_NAME); } } catch (\Exception $e) { \think\Log::error('ReferenceCheckTwo pushJob failed check_id=' . $checkId . ' ' . $e->getMessage()); throw $e; } } }