3119 lines
112 KiB
PHP
3119 lines
112 KiB
PHP
<?php
|
||
|
||
namespace app\common;
|
||
|
||
use think\Db;
|
||
use think\Env;
|
||
use think\Queue;
|
||
use app\common\service\LLMService;
|
||
|
||
/**
|
||
* 正文 <blue>[n]</blue> 引用与 t_production_article_refer(index+1=n)相关性校对。
|
||
* LLM 配置与 PromotionLlmService 相同;单条任务走 ReferenceCheck 队列。
|
||
*/
|
||
class ReferenceCheckService
|
||
{
|
||
const QUEUE_NAME = 'ReferenceCheck';

/** Values of t_article_main.type. */
const MAIN_TYPE_TEXT = 0;
const MAIN_TYPE_IMAGE = 1;
const MAIN_TYPE_TABLE = 2;

/** Values of t_article_main.ref_check_status (the column comes from sql/article_main_ref_check_status.sql). */
const AM_STATUS_NONE = 0;
const AM_STATUS_PASS = 1;
const AM_STATUS_FAIL = 2;
const AM_STATUS_RUNNING = 3;

/** @var bool|null Whether t_article_main has the ref_check_status column (null = not probed yet). */
private static $amRefCheckStatusColumnExists = null;

/**
 * Reference-check progress states, in lifecycle order:
 * 0 -> 1 -> 2 -> 3 = pending -> checking -> completed -> failed.
 *
 * The same numbers are used at two levels:
 * - a single detail row (article_reference_check_result.status) only takes {0, 2, 3};
 *   a row is never "checking";
 * - a group (rows aggregated by reference_no, exposed as progress_status) uses all four;
 *   1 means some rows have finished while others are still 0.
 */
const PROGRESS_PENDING = 0;   // pending: row status=0; group: every row is 0
const PROGRESS_CHECKING = 1;  // checking: group level only - some rows finished, some still 0
const PROGRESS_COMPLETED = 2; // completed: row status=2; group: every row is 2
const PROGRESS_FAILED = 3;    // failed: row status=3; group: all rows finished, at least one is 3

/** Whole-article check state (the outward-facing state used to choose between "start" and "reset"). */
const ARTICLE_PROGRESS_NONE = 0;      // no check records at all
const ARTICLE_PROGRESS_RUNNING = 1;   // at least one detail row has status=0 (still queued)
const ARTICLE_PROGRESS_COMPLETED = 2; // no detail row has status=0 (all completed or failed)

/**
 * Status of a single detail row (DB column article_reference_check_result.status).
 *
 * Only the three values actually written to the DB are listed. "Checking" (1) exists
 * at group level only. The numbers line up with PROGRESS_* so both can be mixed.
 */
const RECORD_PENDING = 0;   // queued, not yet picked up by a worker
const RECORD_COMPLETED = 2; // check completed
const RECORD_FAILED = 3;    // check failed

/** LLM confidence threshold: a score >= this value counts as "pass". */
const PASS_CONFIDENCE_THRESHOLD = 0.65;

/**
 * The two layouts of in-text citation tags (both patterns use /u):
 *  1) <blue>[8, 9]</blue>, <blue>[13-15]</blue> - brackets inside the blue tag
 *  2) [<blue>13-15</blue>]                      - brackets wrapping the blue tag
 *
 * Capture group 1 is the raw number list (digits, ASCII/full-width commas,
 * hyphen and dash variants, whitespace).
 */
const BLUE_TAG_REGEX = '/<blue>\[([\d,,\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)\]<\/blue>/u';
const BLUE_TAG_REGEX_BRACKET_OUTSIDE = '/\[<blue>([\d,,\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)<\/blue>\]/u';
|
||
|
||
/**
 * Read a key from an array with a fallback, for PHP versions that lack the `??` operator.
 *
 * @param array      $arr     Array to read from.
 * @param int|string $key     Key to look up.
 * @param mixed      $default Returned when the key is absent or its value is null.
 * @return mixed
 */
private function arrGet($arr, $key, $default = '')
{
    if (!isset($arr[$key])) {
        return $default;
    }

    return $arr[$key];
}
|
||
|
||
/**
 * Find citation tags in both supported layouts and return them ordered by their
 * starting offset in the text.
 *
 * Matching uses PREG_OFFSET_CAPTURE, so every entry is a [text, offset] pair.
 *
 * @param string $content Text to scan.
 * @return array{0: array, 1: array} Index 0 holds the full matches, index 1 the
 *                                   matching capture-group-1 entries (same order).
 */
private function collectBlueTagMatches($content)
{
    $hits = [];
    foreach ([self::BLUE_TAG_REGEX, self::BLUE_TAG_REGEX_BRACKET_OUTSIDE] as $regex) {
        $found = [];
        if (preg_match_all($regex, $content, $found, PREG_OFFSET_CAPTURE)) {
            foreach ($found[0] as $k => $whole) {
                $hits[] = [$whole, $found[1][$k]];
            }
        }
    }

    // Order the combined hits by where the full match starts.
    usort($hits, function ($left, $right) {
        return $left[0][1] - $right[0][1];
    });

    $fullMatches = [];
    $innerMatches = [];
    foreach ($hits as $hit) {
        $fullMatches[] = $hit[0];
        $innerMatches[] = $hit[1];
    }

    return [$fullMatches, $innerMatches];
}
|
||
|
||
/**
 * Apply preg_replace() for both citation-tag layouts, one after the other.
 *
 * preg_replace() returns null when PCRE fails (for example on invalid UTF-8 with
 * the /u modifier). In that case the step is skipped and the text produced so far
 * is kept, instead of handing null to the next step and losing the content.
 *
 * @param string|array $subject     Text to process.
 * @param string|array $replacement Replacement passed straight to preg_replace().
 * @return string|array The subject with every successful replacement applied.
 */
private function pregReplaceBlueTags($subject, $replacement)
{
    foreach ([self::BLUE_TAG_REGEX, self::BLUE_TAG_REGEX_BRACKET_OUTSIDE] as $pattern) {
        $replaced = preg_replace($pattern, $replacement, $subject);
        if ($replaced !== null) {
            $subject = $replaced;
        }
    }

    return $subject;
}
|
||
|
||
/**
 * Enqueue a single check with caller-supplied body text and reference text.
 *
 * Inserts one pending row into article_reference_check_result, marks the section
 * as running when an am_id is given, then pushes the queue job.
 *
 * @param string $contentA Body text; must not be empty after trimming.
 * @param string $contentB Reference text (stored trimmed).
 * @param array  $extra    Optional integer fields (article_id, am_id, p_article_id, p_refer_id,
 *                         refer_index, reference_no, cite_tag_start, cite_tag_end, text_start,
 *                         text_end), reference_raw, and queue_delay for the job.
 * @return array{check_id: mixed, queued: int}
 * @throws \InvalidArgumentException when $contentA is empty.
 */
public function enqueue($contentA, $contentB, array $extra = [])
{
    $contentA = trim($contentA);
    if ($contentA === '') {
        throw new \InvalidArgumentException('content_a is required');
    }

    $sectionId = intval($this->arrGet($extra, 'am_id', 0));
    $timestamp = date('Y-m-d H:i:s');

    $record = [
        'article_id' => intval($this->arrGet($extra, 'article_id', 0)),
        'am_id' => $sectionId,
        'p_article_id' => intval($this->arrGet($extra, 'p_article_id', 0)),
        'p_refer_id' => intval($this->arrGet($extra, 'p_refer_id', 0)),
        'refer_index' => intval($this->arrGet($extra, 'refer_index', 0)),
        'reference_no' => intval($this->arrGet($extra, 'reference_no', 0)),
        'reference_raw' => (string)$this->arrGet($extra, 'reference_raw', ''),
        'cite_tag_start' => intval($this->arrGet($extra, 'cite_tag_start', 0)),
        'cite_tag_end' => intval($this->arrGet($extra, 'cite_tag_end', 0)),
        'text_start' => intval($this->arrGet($extra, 'text_start', 0)),
        'text_end' => intval($this->arrGet($extra, 'text_end', 0)),
        'content_a' => $contentA,
        'content_b' => trim($contentB),
        'status' => self::RECORD_PENDING,
        'created_at' => $timestamp,
        'updated_at' => $timestamp,
    ];
    $checkId = Db::name('article_reference_check_result')->insertGetId($record);

    if ($sectionId > 0) {
        $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
    }

    $queueDelay = intval($this->arrGet($extra, 'queue_delay', 0));
    $this->pushJob(intval($checkId), $queueDelay);

    return ['check_id' => $checkId, 'queued' => 1];
}
|
||
/**
 * Enqueue reference checks for one article_main section.
 *
 * When the caller passes a partial row (no `type`, or a table row without a usable
 * `amt_id`), the stored row is loaded and merged underneath the caller's values.
 * Every citation found is expanded to one check record per cited reference number
 * (e.g. [70-73] yields reference_no 70, 71, 72 and 73). One job is pushed per record,
 * with an increasing delay argument (0, 1, 2, ...).
 *
 * Section status written here: NONE when the section has no citations, RUNNING once
 * at least one job has been queued. When every cited number is missing from the refer
 * map nothing is queued and the status is left untouched (as enqueueByPArticle() does),
 * so a section is never left "running" with no job to finish it.
 *
 * @param array $main article_main row or partial row; am_id, article_id, type and amt_id are read here.
 * @return void
 * @throws \RuntimeException when no production_article in state 0 exists for the article.
 */
public function enqueueByArticleMain($main)
{
    $amId = intval($this->arrGet($main, 'am_id', 0));

    $needsStoredRow = !isset($main['type'])
        || (intval($main['type']) === self::MAIN_TYPE_TABLE && intval($this->arrGet($main, 'amt_id', 0)) <= 0);
    if ($amId > 0 && $needsStoredRow) {
        $dbMain = Db::name('article_main')
            ->field('am_id,content,article_id,type,amt_id')
            ->where('am_id', $amId)
            ->whereIn('state', [0, 2])
            ->find();
        if (!empty($dbMain)) {
            // Caller-supplied values win over the stored ones.
            $main = array_merge($dbMain, $main);
        }
    }

    $citations = $this->extractReferencesForArticleMain($main);
    if (empty($citations)) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
        return;
    }

    // Read through arrGet so a row without article_id does not raise an undefined-index notice.
    $articleId = intval($this->arrGet($main, 'article_id', 0));
    $prod = Db::name('production_article')
        ->where('article_id', $articleId)
        ->where('state', 0)
        ->find();
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found for article_id=' . $articleId);
    }

    $pArticleId = intval($prod['p_article_id']);
    $referMap = $this->loadReferMapByPArticleId($pArticleId);

    $queued = 0;
    $delay = 0;
    foreach ($citations as $cite) {
        foreach ($cite['reference_numbers'] as $refNo) {
            // The refer map is keyed by reference number minus one.
            $referIndex = $refNo - 1;
            if ($referIndex < 0 || !isset($referMap[$referIndex])) {
                continue;
            }
            $refer = $referMap[$referIndex];
            $referText = $this->formatReferForLlm($refer);

            $now = date('Y-m-d H:i:s');
            $checkId = Db::name('article_reference_check_result')->insertGetId([
                'article_id' => $articleId,
                'p_article_id' => $pArticleId,
                'am_id' => $amId,
                'reference_no' => $refNo,
                'refer_index' => $refNo,
                'origin_text' => $cite['original_text'],
                'refer_text' => $referText,
                'p_refer_id' => $refer['p_refer_id'],
                'text_start' => $cite['text_start'],
                'text_end' => $cite['text_end'],
                'created_at' => $now,
                'updated_at' => $now,
            ]);
            $this->pushJob(intval($checkId), $delay);
            $delay += 1;
            $queued++;
        }
    }

    if ($queued > 0) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
    }
}
|
||
/**
 * Manual trigger: offer completed, low-confidence records of an article to the
 * second-pass (DOI) re-check.
 *
 * Candidates are completed rows whose confidence is <= PASS_CONFIDENCE_THRESHOLD.
 * Each one is handed to maybeEnqueueSecondPass(); only rows it accepts are reported.
 *
 * NOTE(review): candidates are drawn with ORDER BY rand() LIMIT 2, so at most two
 * randomly chosen rows are considered per call. Confirm this cap is intended and not
 * a leftover from testing.
 *
 * @param int $articleId Article id; must be > 0.
 * @return array{article_id:int, check_ids2:int[], queued:int}
 * @throws \InvalidArgumentException when $articleId is not positive.
 */
public function enqueueSecondPassByArticle($articleId)
{
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }

    $rows = Db::name('article_reference_check_result')
        ->where('article_id', $articleId)
        ->where('status', self::RECORD_COMPLETED)
        ->where('confidence', '<=', self::PASS_CONFIDENCE_THRESHOLD)
        ->orderRaw('rand()')
        ->limit(2)
        ->select();

    $checkIds2 = [];
    foreach ($rows as $checkLog) {
        $rowId = $this->resolveCheckRowId($checkLog);
        if ($this->maybeEnqueueSecondPass($rowId, floatval($checkLog['confidence']))) {
            $checkIds2[] = $rowId;
        }
    }

    return [
        'article_id' => $articleId,
        'check_ids2' => $checkIds2,
        'queued' => count($checkIds2),
    ];
}
|
||
/**
 * Enqueue reference checks for every section of the article behind a
 * production_article row.
 *
 * Each citation is expanded to one check record per cited reference number
 * (e.g. [70-73] yields reference_no 70, 71, 72 and 73). All records are inserted
 * first; the jobs are then handed to pushJobsSortedByReferenceNo(). Cited numbers
 * with no entry in the refer map are counted as skipped. Sections without citations
 * are set to NONE; sections that received at least one job are set to RUNNING.
 *
 * @param array $prod production_article row; p_article_id and article_id are read.
 * @return array Keys: article_id, p_article_id, queued, skipped, check_ids
 *               (as returned by pushJobsSortedByReferenceNo()), queue.
 * @throws \RuntimeException when $prod is empty or no article_main row in state 0/2 exists.
 */
public function enqueueByPArticle($prod)
{
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found');
    }

    $pArticleId = intval($prod['p_article_id']);
    $articleId = intval($prod['article_id']);
    $referMap = $this->loadReferMapByPArticleId($pArticleId);

    $sections = Db::name('article_main')
        ->field('am_id,content,article_id,type,amt_id')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->order('sort asc')
        ->select();
    if (empty($sections)) {
        throw new \RuntimeException('article_main is empty');
    }

    $timestamp = date('Y-m-d H:i:s');
    $jobs = [];
    $sectionsToMark = [];
    $skippedCount = 0;

    foreach ($sections as $section) {
        $sectionId = intval($section['am_id']);
        $cites = $this->extractReferencesForArticleMain($section);
        if (empty($cites)) {
            $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_NONE);
            continue;
        }

        foreach ($cites as $cite) {
            foreach ($cite['reference_numbers'] as $refNo) {
                // The refer map is keyed by reference number minus one.
                $mapKey = $refNo - 1;
                if ($mapKey < 0 || !isset($referMap[$mapKey])) {
                    $skippedCount++;
                    continue;
                }
                $refer = $referMap[$mapKey];

                $newId = Db::name('article_reference_check_result')->insertGetId([
                    'article_id' => $section['article_id'],
                    'p_article_id' => $pArticleId,
                    'am_id' => $sectionId,
                    'reference_no' => $refNo,
                    'refer_index' => $refNo,
                    'origin_text' => $cite['original_text'],
                    'refer_text' => $this->formatReferForLlm($refer),
                    'p_refer_id' => $refer['p_refer_id'],
                    'text_start' => $cite['text_start'],
                    'text_end' => $cite['text_end'],
                    'created_at' => $timestamp,
                    'updated_at' => $timestamp,
                ]);

                $jobs[] = [
                    'check_id' => intval($newId),
                    'reference_no' => intval($refNo),
                    'am_id' => $sectionId,
                    'text_start' => intval($cite['text_start']),
                ];
                $sectionsToMark[$sectionId] = true;
            }
        }
    }

    $checkIds = $this->pushJobsSortedByReferenceNo($jobs);
    foreach (array_keys($sectionsToMark) as $sectionId) {
        $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
    }

    return [
        'article_id' => $articleId,
        'p_article_id' => $pArticleId,
        'queued' => count($jobs),
        'skipped' => $skippedCount,
        'check_ids' => $checkIds,
        'queue' => self::QUEUE_NAME,
    ];
}
|
||
/**
 * Enqueue reference checks for a whole article, identified by article_id.
 *
 * Looks up the article's production_article row (state 0 or 2) and delegates to
 * enqueueByPArticle(), which performs the per-section extraction, record insertion
 * and job push. The body used to be a copy of that method.
 *
 * @param int $articleId Article id; must be > 0.
 * @return array Same shape as enqueueByPArticle(): article_id, p_article_id, queued,
 *               skipped, check_ids, queue.
 * @throws \InvalidArgumentException when $articleId is not positive.
 * @throws \RuntimeException when no production_article row is found, or the article has
 *                           no article_main rows (raised by enqueueByPArticle()).
 */
public function enqueueByArticle($articleId)
{
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }

    $prod = Db::name('production_article')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->find();
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found for article_id=' . $articleId);
    }

    return $this->enqueueByPArticle($prod);
}
|
||
|
||
/**
 * Recompute t_article_main.ref_check_status for one section from all of its detail rows.
 *
 * Rules, in order:
 *  - no rows                                   -> AM_STATUS_NONE
 *  - any row still pending                     -> AM_STATUS_RUNNING
 *  - any failed row, or a completed row whose
 *    is_match is 0                             -> AM_STATUS_FAIL
 *  - every row completed                       -> AM_STATUS_PASS
 *  - anything else (unexpected status values)  -> AM_STATUS_FAIL
 *
 * Only the status and is_match columns are loaded; nothing else is read here.
 *
 * @param int $amId article_main id.
 * @return int The AM_STATUS_* value written (AM_STATUS_NONE when $amId is not positive;
 *             nothing is written in that case).
 */
public function syncAmRefCheckStatus($amId)
{
    $amId = intval($amId);
    if ($amId <= 0) {
        return self::AM_STATUS_NONE;
    }

    $rows = Db::name('article_reference_check_result')
        ->field('status,is_match')
        ->where('am_id', $amId)
        ->select();
    if (empty($rows)) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
        return self::AM_STATUS_NONE;
    }

    $pending = 0;
    $done = 0;
    $hasFail = false;
    foreach ($rows as $row) {
        $st = intval($row['status']);
        if ($st === self::RECORD_PENDING) {
            $pending++;
            continue;
        }
        if ($st === self::RECORD_FAILED) {
            $hasFail = true;
        } elseif ($st === self::RECORD_COMPLETED) {
            $done++;
            // A completed check that judged the citation a mismatch fails the section.
            if (intval($row['is_match']) === 0) {
                $hasFail = true;
            }
        }
    }

    if ($pending > 0) {
        $status = self::AM_STATUS_RUNNING;
    } elseif (!$hasFail && $done === count($rows)) {
        $status = self::AM_STATUS_PASS;
    } else {
        $status = self::AM_STATUS_FAIL;
    }

    $this->setAmRefCheckStatus($amId, $status);
    return $status;
}
|
||
|
||
/**
 * Tell whether t_article_main has the ref_check_status column, so writes can be
 * skipped on databases where the migration has not been applied.
 *
 * A successful probe is memoised in a static for the rest of the process. A probe
 * that throws is NOT memoised: the error says nothing about the schema, and caching
 * `false` would switch status writes off until the process restarts. This call
 * reports false and the next call probes again.
 *
 * @return bool
 */
private function hasAmRefCheckStatusColumn()
{
    if (self::$amRefCheckStatusColumnExists !== null) {
        return self::$amRefCheckStatusColumnExists;
    }

    try {
        $table = Db::name('article_main')->getTable();
        // Backticks in the table name are doubled so the identifier stays quoted.
        $rows = Db::query('SHOW COLUMNS FROM `' . str_replace('`', '``', $table) . '` LIKE \'ref_check_status\'');
    } catch (\Exception $e) {
        return false;
    }

    self::$amRefCheckStatusColumnExists = !empty($rows);

    return self::$amRefCheckStatusColumnExists;
}
|
||
|
||
/**
 * Write ref_check_status for one article_main row.
 *
 * Does nothing when $amId is not positive or the ref_check_status column is
 * reported missing by hasAmRefCheckStatusColumn().
 *
 * @param int $amId   article_main id.
 * @param int $status One of the AM_STATUS_* values.
 * @return void
 */
public function setAmRefCheckStatus($amId, $status)
{
    if ($amId <= 0) {
        return;
    }
    if (!$this->hasAmRefCheckStatusColumn()) {
        return;
    }

    $changes = ['ref_check_status' => $status];
    Db::name('article_main')->where('am_id', $amId)->update($changes);
}
|
||
|
||
/**
 * Delete every reference-check detail row of an article, addressed by p_article_id,
 * and reset the section-level ref_check_status.
 *
 * Meant for the case where references were added or removed, so existing
 * reference_no values no longer line up and earlier results are void. Once the rows
 * are gone the whole-article status query reports ARTICLE_PROGRESS_NONE.
 *
 * @param int $pArticleId production_article id.
 * @return int Number of detail rows deleted (0 when $pArticleId is not positive).
 */
public function clearArticleChecksByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        return 0;
    }

    // Resolve article_id first; it is needed to reset the sections afterwards.
    $articleId = Db::name('production_article')
        ->where('p_article_id', $pArticleId)
        ->whereIn('state', [0, 2])
        ->value('article_id');
    $articleId = intval($articleId);

    // Release the queue lock of each old record before deleting it
    // (per the original note: keeps in-flight workers from writing results back).
    $staleIds = Db::name('article_reference_check_result')
        ->where('p_article_id', $pArticleId)
        ->column('id');
    foreach ($staleIds as $staleId) {
        $this->clearReferenceCheckQueueLock(intval($staleId));
    }

    $removed = Db::name('article_reference_check_result')
        ->where('p_article_id', $pArticleId)
        ->delete();

    $canResetSections = $articleId > 0 && $this->hasAmRefCheckStatusColumn();
    if ($canResetSections) {
        Db::name('article_main')
            ->where('article_id', $articleId)
            ->whereIn('state', [0, 2])
            ->update(['ref_check_status' => self::AM_STATUS_NONE]);
    }

    return intval($removed);
}
|
||
|
||
/**
 * Delete every reference-check detail row of an article, addressed by article_id,
 * and reset ref_check_status of its sections (state 0 or 2) to AM_STATUS_NONE.
 *
 * @param int $articleId Article id.
 * @return int Number of detail rows deleted (0 when $articleId is not positive).
 */
public function clearArticleChecks($articleId)
{
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        return 0;
    }

    // Release the queue lock of each old record first
    // (per the original note: otherwise the same check_id will not run again within the lock TTL).
    $staleIds = Db::name('article_reference_check_result')
        ->where('article_id', $articleId)
        ->column('id');
    foreach ($staleIds as $staleId) {
        $this->clearReferenceCheckQueueLock(intval($staleId));
    }

    $removed = Db::name('article_reference_check_result')
        ->where('article_id', $articleId)
        ->delete();

    if ($this->hasAmRefCheckStatusColumn()) {
        Db::name('article_main')
            ->where('article_id', $articleId)
            ->whereIn('state', [0, 2])
            ->update(['ref_check_status' => self::AM_STATUS_NONE]);
    }

    return intval($removed);
}
|
||
|
||
/**
 * After references were reordered, refresh reference_no / refer_index on the check
 * rows of the given p_refer_id values only.
 *
 * The new number is read from production_article_refer (index + 1) rather than taken
 * from the caller, so a stale value cannot be written. Only rows whose reference_no
 * differs from the new number are updated.
 *
 * @param int[] $pReferIds  Affected p_refer_id values (typically the moved entry and its neighbour).
 * @param int   $pArticleId Optional: additionally restrict both queries to this p_article_id.
 * @return array{p_refer_ids:int[], affected_rows:int, changes:array}
 */
public function syncReferenceNoByPReferIds(array $pReferIds, $pArticleId = 0)
{
    $pArticleId = intval($pArticleId);
    // Normalise: integers only, zeroes dropped, duplicates removed, keys re-indexed.
    $pReferIds = array_values(array_unique(array_filter(array_map('intval', $pReferIds))));

    $result = [
        'p_refer_ids' => $pReferIds,
        'affected_rows' => 0,
        'changes' => [],
    ];
    if (empty($pReferIds)) {
        return $result;
    }

    $restrictToArticle = $pArticleId > 0;

    $lookup = Db::name('production_article_refer')
        ->field('p_refer_id,p_article_id,index')
        ->whereIn('p_refer_id', $pReferIds)
        ->where('state', 0);
    if ($restrictToArticle) {
        $lookup->where('p_article_id', $pArticleId);
    }
    $refers = $lookup->select();
    if (empty($refers)) {
        return $result;
    }

    $timestamp = date('Y-m-d H:i:s');
    foreach ($refers as $refer) {
        $referId = intval($refer['p_refer_id']);
        $targetNo = intval($refer['index']) + 1;

        $update = Db::name('article_reference_check_result')
            ->where('p_refer_id', $referId)
            ->where('reference_no', '<>', $targetNo);
        if ($restrictToArticle) {
            $update->where('p_article_id', $pArticleId);
        }
        $touched = $update->update([
            'reference_no' => $targetNo,
            'refer_index' => $targetNo,
            'updated_at' => $timestamp,
        ]);

        if ($touched > 0) {
            $result['affected_rows'] += intval($touched);
            $result['changes'][] = [
                'p_refer_id' => $referId,
                'new_ref_no' => $targetNo,
                'affected_rows' => intval($touched),
            ];
        }
    }

    return $result;
}
|
||
|
||
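// NOTE: the description below belongs to resetAndRecheckByArticle(), defined further
// down in this class; it was left here, detached from its method.
//
// Reset the reference check of a whole manuscript: delete the old detail rows,
// clear the queue locks, then re-enqueue the full text for checking. Returns an array.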
|
||
/**
 * Overall reference-check status of an article, addressed by p_article_id.
 *
 * The unit counted is the reference (rows grouped by reference_no), not the detail
 * row: 50 references backed by 111 detail rows give total = 50.
 *
 * Returned `status`:
 *   0 = ARTICLE_PROGRESS_NONE       no check rows at all
 *   1 = ARTICLE_PROGRESS_RUNNING    at least one reference still has a pending row
 *   2 = ARTICLE_PROGRESS_COMPLETED  no reference has a pending row
 *
 * Each reference lands in exactly one bucket:
 *   pending - at least one of its rows has status 0 (partially finished groups included)
 *   failed  - no pending row and at least one row with status 3
 *   done    - everything else
 *
 * pending + done + failed = total; progress_percent = (done + failed) / total * 100,
 * rounded to one decimal. Per-group details are served by getProgressByPArticleId().
 *
 * @param int $pArticleId production_article id; must be > 0.
 * @return array{p_article_id:int, status:int, total:int, pending:int, done:int, failed:int, progress_percent:float}
 * @throws \InvalidArgumentException when $pArticleId is not positive.
 */
public function getArticleProgressStatusByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        throw new \InvalidArgumentException('p_article_id is required');
    }

    // One aggregate query: one result row per reference_no with its pending/failed counts.
    $fields = sprintf(
        'reference_no, SUM(CASE WHEN status = %d THEN 1 ELSE 0 END) AS pending_cnt, SUM(CASE WHEN status = %d THEN 1 ELSE 0 END) AS failed_cnt',
        self::RECORD_PENDING,
        self::RECORD_FAILED
    );
    $groups = Db::name('article_reference_check_result')
        ->field($fields)
        ->where('p_article_id', $pArticleId)
        ->group('reference_no')
        ->select();

    if (empty($groups)) {
        return [
            'p_article_id' => $pArticleId,
            'status' => self::ARTICLE_PROGRESS_NONE,
            'total' => 0,
            'pending' => 0,
            'done' => 0,
            'failed' => 0,
            'progress_percent' => 0,
        ];
    }

    $buckets = ['pending' => 0, 'done' => 0, 'failed' => 0];
    foreach ($groups as $group) {
        if (intval($this->arrGet($group, 'pending_cnt', 0)) > 0) {
            $buckets['pending']++;
            continue;
        }
        if (intval($this->arrGet($group, 'failed_cnt', 0)) > 0) {
            $buckets['failed']++;
            continue;
        }
        $buckets['done']++;
    }

    $total = count($groups);
    if ($buckets['pending'] > 0) {
        $overall = self::ARTICLE_PROGRESS_RUNNING;
    } else {
        $overall = self::ARTICLE_PROGRESS_COMPLETED;
    }
    $finished = $buckets['done'] + $buckets['failed'];

    return [
        'p_article_id' => $pArticleId,
        'status' => $overall,
        'total' => $total,
        'pending' => $buckets['pending'],
        'done' => $buckets['done'],
        'failed' => $buckets['failed'],
        'progress_percent' => round($finished / $total * 100, 1),
    ];
}
|
||
|
||
/**
 * When several articles are being checked at once, report how many are ahead of
 * the given one.
 *
 * An article counts as "in the queue" while it has at least one pending (status 0)
 * detail row. Articles are ordered by the smallest id among their pending rows,
 * ascending.
 *
 * @param int $pArticleId production_article id; must be > 0.
 * @return array{
 *   p_article_id:int,
 *   running_total:int,
 *   ahead:int,
 *   position:int,
 *   in_queue:bool,
 *   status:int
 * } ahead/position are 0 when the article is not in the queue; status is the
 *   whole-article status from getArticleProgressStatusByPArticleId().
 * @throws \InvalidArgumentException when $pArticleId is not positive.
 */
public function getArticleCheckQueuePositionByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        throw new \InvalidArgumentException('p_article_id is required');
    }

    $anchors = Db::name('article_reference_check_result')
        ->field('p_article_id, MIN(id) AS queue_anchor')
        ->where('status', self::RECORD_PENDING)
        ->group('p_article_id')
        ->order('queue_anchor', 'asc')
        ->select();

    // Article ids in queue order, ignoring rows without a usable p_article_id.
    $queue = [];
    foreach ($anchors as $anchor) {
        $queuedId = intval($this->arrGet($anchor, 'p_article_id', 0));
        if ($queuedId > 0) {
            $queue[] = $queuedId;
        }
    }

    $slot = array_search($pArticleId, $queue, true);
    $inQueue = $slot !== false;

    $progress = $this->getArticleProgressStatusByPArticleId($pArticleId);

    return [
        'p_article_id' => $pArticleId,
        'running_total' => count($queue),
        'ahead' => $inQueue ? $slot : 0,
        'position' => $inQueue ? $slot + 1 : 0,
        'in_queue' => $inQueue,
        'status' => intval($this->arrGet($progress, 'status', self::ARTICLE_PROGRESS_NONE)),
    ];
}
|
||
|
||
/**
 * Reference-check progress of an article, grouped by reference_no, with every
 * detail row listed under its group.
 *
 * Status numbers follow the lifecycle order shared by PROGRESS_* and RECORD_*:
 *   0 = pending, 1 = checking (group level only), 2 = completed, 3 = failed.
 *
 * Group field progress_status:
 *   - 0 PROGRESS_PENDING    every row of the group is pending
 *   - 1 PROGRESS_CHECKING   some rows finished, some still pending
 *   - 2 PROGRESS_COMPLETED  no row pending and none failed
 *   - 3 PROGRESS_FAILED     no row pending and at least one failed
 *
 * Fields of records[i]:
 *   - status      0 = pending, 2 = completed, 3 = failed
 *   - confidence  LLM score
 *   - is_pass     confidence >= PASS_CONFIDENCE_THRESHOLD
 *
 * A group's is_pass is true only when the group is completed and every row passes.
 *
 * @param int $pArticleId production_article id; must be > 0.
 * @return array{p_article_id:int, total_groups:int, summary:array, list:array}
 * @throws \InvalidArgumentException when $pArticleId is not positive.
 */
public function getProgressByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        throw new \InvalidArgumentException('p_article_id is required');
    }

    $rows = Db::name('article_reference_check_result')
        ->field('id,p_refer_id,reference_no,am_id,status,confidence,is_match,reason,text_start,text_end,updated_at')
        ->where('p_article_id', $pArticleId)
        ->order('reference_no asc, id asc')
        ->select();

    // Group counters keyed by name; each key corresponds to one PROGRESS_* value.
    $summary = [
        'pending' => 0,
        'checking' => 0,
        'completed' => 0,
        'failed' => 0,
    ];
    if (empty($rows)) {
        return [
            'p_article_id' => $pArticleId,
            'total_groups' => 0,
            'summary' => $summary,
            'list' => [],
        ];
    }

    // Pass 1: fold the detail rows into one accumulator per reference_no.
    $byRefNo = [];
    foreach ($rows as $row) {
        $refNo = intval($this->arrGet($row, 'reference_no', 0));
        $rowReferId = intval($this->arrGet($row, 'p_refer_id', 0));

        if (isset($byRefNo[$refNo])) {
            $group = $byRefNo[$refNo];
        } else {
            $group = [
                'reference_no' => $refNo,
                'p_refer_id' => $rowReferId,
                'total' => 0,
                'pending' => 0,
                'done' => 0,
                'failed' => 0,
                'pass' => 0,
                'last_updated_at' => '',
                'records' => [],
            ];
        }

        // A reference_no should map to one p_refer_id; keep the first positive id seen.
        if ($group['p_refer_id'] <= 0 && $rowReferId > 0) {
            $group['p_refer_id'] = $rowReferId;
        }

        $group['total']++;
        $rowStatus = intval($this->arrGet($row, 'status', 0));
        if ($rowStatus === self::RECORD_PENDING) {
            $group['pending']++;
        } elseif ($rowStatus === self::RECORD_COMPLETED) {
            $group['done']++;
        } elseif ($rowStatus === self::RECORD_FAILED) {
            $group['failed']++;
        }

        $updatedAt = (string)$this->arrGet($row, 'updated_at', '');
        if ($updatedAt > $group['last_updated_at']) {
            $group['last_updated_at'] = $updatedAt;
        }

        $confidence = floatval($this->arrGet($row, 'confidence', 0));
        $rowPasses = $confidence >= self::PASS_CONFIDENCE_THRESHOLD;
        if ($rowPasses) {
            $group['pass']++;
        }

        $group['records'][] = [
            'check_id' => intval($this->arrGet($row, 'id', 0)),
            'am_id' => intval($this->arrGet($row, 'am_id', 0)),
            'status' => $rowStatus,
            'confidence' => $confidence,
            'is_pass' => $rowPasses,
            'is_match' => intval($this->arrGet($row, 'is_match', 0)),
            'reason' => (string)$this->arrGet($row, 'reason', ''),
            'text_start' => intval($this->arrGet($row, 'text_start', 0)),
            'text_end' => intval($this->arrGet($row, 'text_end', 0)),
            'last_updated_at' => $updatedAt,
        ];

        $byRefNo[$refNo] = $group;
    }

    // Pass 2: classify each group and count it in the summary.
    $summaryKeyByStatus = [
        self::PROGRESS_PENDING => 'pending',
        self::PROGRESS_CHECKING => 'checking',
        self::PROGRESS_COMPLETED => 'completed',
        self::PROGRESS_FAILED => 'failed',
    ];
    $list = [];
    foreach ($byRefNo as $group) {
        $rowCount = $group['total'];

        if ($group['pending'] === $rowCount) {
            $groupStatus = self::PROGRESS_PENDING;
        } elseif ($group['pending'] !== 0) {
            $groupStatus = self::PROGRESS_CHECKING;
        } elseif ($group['failed'] > 0) {
            $groupStatus = self::PROGRESS_FAILED;
        } else {
            $groupStatus = self::PROGRESS_COMPLETED;
        }

        $group['is_pass'] = $groupStatus === self::PROGRESS_COMPLETED
            && $rowCount > 0
            && $group['pass'] === $rowCount;

        $summary[$summaryKeyByStatus[$groupStatus]]++;
        $group['progress_status'] = $groupStatus;
        $list[] = $group;
    }

    usort($list, function ($left, $right) {
        return $left['reference_no'] - $right['reference_no'];
    });

    return [
        'p_article_id' => $pArticleId,
        'total_groups' => count($list),
        'summary' => $summary,
        'list' => $list,
    ];
}
|
||
|
||
/**
 * Check rows and group-level progress for a single reference, addressed by p_refer_id.
 *
 * Group progress uses the same rules as one list entry of getProgressByPArticleId():
 *   progress_status 0 = pending, 1 = checking, 2 = completed, 3 = failed,
 *   plus pending / done / failed / pass counters, is_pass and progress_percent.
 * With no rows at all the group is reported as pending with 0 percent.
 *
 * Each `list` entry carries: check_id, am_id, status, confidence, reason, is_match, is_pass.
 *
 * When the rows yield no positive reference_no, it is derived from the active
 * production_article_refer row (index + 1), which also supplies p_article_id if
 * that is still unknown.
 *
 * @param int $pReferId production_article_refer.p_refer_id; must be > 0.
 * @return array
 * @throws \InvalidArgumentException when $pReferId is not positive.
 */
public function getCheckDetailsByPReferId($pReferId)
{
    $pReferId = intval($pReferId);
    if ($pReferId <= 0) {
        throw new \InvalidArgumentException('p_refer_id is required');
    }

    $rows = Db::name('article_reference_check_result')
        ->field('id,p_article_id,reference_no,am_id,status,confidence,is_match,reason,updated_at')
        ->where('p_refer_id', $pReferId)
        ->order('id asc')
        ->select();

    $pArticleId = 0;
    $referenceNo = 0;
    $counts = ['pending' => 0, 'done' => 0, 'failed' => 0, 'pass' => 0];
    $latestUpdate = '';
    $list = [];

    foreach ($rows as $row) {
        // Take p_article_id / reference_no from the first row that has a positive value.
        if ($pArticleId <= 0) {
            $pArticleId = intval($this->arrGet($row, 'p_article_id', 0));
        }
        if ($referenceNo <= 0) {
            $referenceNo = intval($this->arrGet($row, 'reference_no', 0));
        }

        $rowStatus = intval($this->arrGet($row, 'status', 0));
        if ($rowStatus === self::RECORD_PENDING) {
            $counts['pending']++;
        } elseif ($rowStatus === self::RECORD_COMPLETED) {
            $counts['done']++;
        } elseif ($rowStatus === self::RECORD_FAILED) {
            $counts['failed']++;
        }

        $updatedAt = (string)$this->arrGet($row, 'updated_at', '');
        if ($updatedAt > $latestUpdate) {
            $latestUpdate = $updatedAt;
        }

        $confidence = floatval($this->arrGet($row, 'confidence', 0));
        $rowPasses = $confidence >= self::PASS_CONFIDENCE_THRESHOLD;
        if ($rowPasses) {
            $counts['pass']++;
        }

        $list[] = [
            'check_id' => intval($this->arrGet($row, 'id', 0)),
            'am_id' => intval($this->arrGet($row, 'am_id', 0)),
            'status' => $rowStatus,
            'confidence' => $confidence,
            'reason' => (string)$this->arrGet($row, 'reason', ''),
            'is_match' => intval($this->arrGet($row, 'is_match', 0)),
            'is_pass' => $rowPasses,
        ];
    }

    if ($referenceNo <= 0) {
        $refer = Db::name('production_article_refer')
            ->where('p_refer_id', $pReferId)
            ->where('state', 0)
            ->find();
        if (!empty($refer)) {
            if ($pArticleId <= 0) {
                $pArticleId = intval($this->arrGet($refer, 'p_article_id', 0));
            }
            $referenceNo = intval($this->arrGet($refer, 'index', 0)) + 1;
        }
    }

    $total = count($list);

    // Defaults cover both "no rows" and "every row still pending".
    $progressStatus = self::PROGRESS_PENDING;
    $progressPercent = 0;
    $groupPasses = false;
    if ($total !== 0 && $counts['pending'] !== $total) {
        if ($counts['pending'] === 0) {
            $progressStatus = $counts['failed'] > 0 ? self::PROGRESS_FAILED : self::PROGRESS_COMPLETED;
            $progressPercent = 100;
            $groupPasses = $progressStatus === self::PROGRESS_COMPLETED && $counts['pass'] === $total;
        } else {
            $progressStatus = self::PROGRESS_CHECKING;
            $progressPercent = round(($counts['done'] + $counts['failed']) / $total * 100, 1);
        }
    }

    return [
        'p_refer_id' => $pReferId,
        'p_article_id' => $pArticleId,
        'reference_no' => $referenceNo,
        'total' => $total,
        'pending' => $counts['pending'],
        'done' => $counts['done'],
        'failed' => $counts['failed'],
        'pass' => $counts['pass'],
        'progress_status' => $progressStatus,
        'progress_percent' => $progressPercent,
        'is_pass' => $groupPasses,
        'last_updated_at' => $latestUpdate,
        'list' => $list,
    ];
}
|
||
|
||
/**
 * Reset an article's reference checks and start them over.
 *
 * Validates the production-article row, requires that at least one
 * article_reference_check_result row already exists for its p_article_id,
 * then delegates to clearArticleChecks() and enqueueByArticle().
 *
 * @param array $aProductionArticle Row carrying positive p_article_id and article_id.
 * @return array Result of enqueueByArticle() (or [] when that is not an array) with
 *               'cleared' (return value of clearArticleChecks()) and 'reset' => 1 added.
 * @throws \InvalidArgumentException When the row is empty, not an array, or lacks either id.
 * @throws \RuntimeException When no check record exists yet for the p_article_id.
 */
public function resetAndRecheckByArticle($aProductionArticle)
{
    $hasRow = is_array($aProductionArticle) && !empty($aProductionArticle);
    if (!$hasRow) {
        throw new \InvalidArgumentException('production_article is required');
    }

    $pArticleId = intval($this->arrGet($aProductionArticle, 'p_article_id', 0));
    $articleId = intval($this->arrGet($aProductionArticle, 'article_id', 0));
    if (!($pArticleId > 0 && $articleId > 0)) {
        throw new \InvalidArgumentException('production_article requires both p_article_id and article_id');
    }

    // A "reset" only makes sense when a previous run left records behind.
    $recordCount = Db::name('article_reference_check_result')
        ->where('p_article_id', $pArticleId)
        ->count();
    if (intval($recordCount) <= 0) {
        throw new \RuntimeException('no existing reference check records for p_article_id=' . $pArticleId);
    }

    $cleared = $this->clearArticleChecks($articleId);
    $summary = $this->enqueueByArticle($articleId);
    if (!is_array($summary)) {
        $summary = [];
    }

    $summary['cleared'] = $cleared;
    $summary['reset'] = 1;

    return $summary;
}
|
||
|
||
/**
 * Translate a section-level ref_check_status value into its short label.
 *
 * @param mixed $status Expected to be one of the AM_STATUS_* constants.
 * @return string 'none', 'pass', 'fail' or 'running'; 'unknown' for any other value.
 */
public static function amStatusLabel($status)
{
    $labels = [
        self::AM_STATUS_NONE    => 'none',
        self::AM_STATUS_PASS    => 'pass',
        self::AM_STATUS_FAIL    => 'fail',
        self::AM_STATUS_RUNNING => 'running',
    ];

    if (!isset($labels[$status])) {
        return 'unknown';
    }

    return $labels[$status];
}
|
||
|
||
/**
 * Extract the primary key of a check-result row.
 *
 * The table's primary key column is `id`; the external API calls the same
 * value `check_id`, so both keys are accepted, `id` first.
 *
 * @param mixed $row A result row (array); anything else yields 0.
 * @return int The first positive id found, or 0 when none is present.
 */
public function resolveCheckRowId($row)
{
    if (!is_array($row)) {
        return 0;
    }

    foreach (['id', 'check_id'] as $key) {
        if (!isset($row[$key])) {
            continue;
        }
        $candidate = intval($row[$key]);
        if ($candidate > 0) {
            return $candidate;
        }
    }

    return 0;
}
|
||
|
||
/**
 * Interpret the is_match value returned by the LLM.
 *
 * Accepts booleans as-is, numbers (true only when their integer part is 1),
 * and case-insensitive strings '1', 'true', 'yes', 'match', 'matched'.
 *
 * @param mixed $value Raw value from the LLM response.
 * @return bool
 */
public function parseLlmIsMatch($value)
{
    if (is_bool($value)) {
        return $value;
    }

    $isNumber = is_int($value) || is_float($value);
    if ($isNumber) {
        return 1 === intval($value);
    }

    $token = strtolower(trim((string)$value));
    foreach (['1', 'true', 'yes', 'match', 'matched'] as $accepted) {
        if ($token === $accepted) {
            return true;
        }
    }

    return false;
}
|
||
|
||
/**
 * Persist the outcome of one check into article_reference_check_result.
 *
 * `reason` and `error_msg` are normalised to trimmed strings and cut to 512
 * characters so an over-long value cannot make the UPDATE fail on the
 * varchar(512) columns. `updated_at` is always set to the current time.
 *
 * @param int   $checkId Primary key (`id`) of the row to update.
 * @param array $fields  Column => value pairs to write.
 * @return int Number of affected rows as reported by the query builder.
 * @throws \InvalidArgumentException When $checkId is not a positive integer.
 * @throws \RuntimeException When the row does not exist or the update reports failure.
 */
public function updateCheckResult($checkId, array $fields)
{
    $checkId = intval($checkId);
    if ($checkId <= 0) {
        throw new \InvalidArgumentException('invalid check id');
    }

    foreach (['reason', 'error_msg'] as $textField) {
        if (!isset($fields[$textField])) {
            continue;
        }
        $raw = $fields[$textField];
        // A non-scalar value (e.g. the LLM returned a list of reasons) cannot be
        // cast to string without an "Array to string conversion" error.
        $text = is_scalar($raw) ? (string)$raw : (string)json_encode($raw, JSON_UNESCAPED_UNICODE);
        // Explicit UTF-8 so truncation counts characters regardless of mb_internal_encoding.
        $fields[$textField] = mb_substr(trim($text), 0, 512, 'UTF-8');
    }
    $fields['updated_at'] = date('Y-m-d H:i:s');

    $exists = Db::name('article_reference_check_result')->where('id', $checkId)->find();
    if (empty($exists)) {
        throw new \RuntimeException('article_reference_check_result not found, id=' . $checkId);
    }

    $affected = Db::name('article_reference_check_result')->where('id', $checkId)->update($fields);
    if ($affected === false) {
        throw new \RuntimeException('article_reference_check_result update failed, id=' . $checkId);
    }

    \think\Log::info('updateCheckResult id=' . $checkId . ' affected=' . intval($affected));

    return intval($affected);
}
|
||
|
||
/**
 * Fetch one check-result row by primary key.
 *
 * @param int $checkId Value of the `id` column.
 * @return array|null The row, or null when $checkId is not positive or nothing is found.
 */
public function getResult($checkId)
{
    if ($checkId > 0) {
        $record = Db::name('article_reference_check_result')->where('id', $checkId)->find();
        if ($record) {
            return $record;
        }
    }

    return null;
}
|
||
|
||
/**
 * List the check-result rows of an article.
 *
 * @param int  $articleId    article_id to filter on.
 * @param int  $status       Row status to filter on; a negative value disables the filter.
 * @param bool $onlyMismatch When true, additionally restrict to completed rows with is_match = 0.
 * @return mixed Result of the query builder's select(), ordered by
 *               am_id, cite_tag_start, reference_no (all ascending).
 */
public function listByArticle($articleId, $status = -1, $onlyMismatch = false)
{
    $query = Db::name('article_reference_check_result')->where('article_id', $articleId);

    $filterByStatus = $status >= 0;
    if ($filterByStatus) {
        $query->where('status', $status);
    }

    if ($onlyMismatch) {
        $query->where('status', self::RECORD_COMPLETED)->where('is_match', 0);
    }

    $ordering = 'am_id asc, cite_tag_start asc, reference_no asc';

    return $query->order($ordering)->select();
}
|
||
|
||
/**
 * Build the manuscript preview: each section's content with unreasonable
 * citation numbers and citing sentences marked up.
 *
 * Check results are loaded once and reused for both the statistics and the
 * bad-citation index. indexBadResults() itself keeps only rows with
 * status = RECORD_COMPLETED and is_match != 1, so no status pre-filter is
 * needed. (The previous pre-filter used the literal status 1, a value detail
 * rows never hold, which left the preview without any marks.)
 *
 * @param int $articleId Article to preview.
 * @param int $amId      Optional: restrict the sections to this am_id.
 * @return array{article_id: mixed, article_ref_check_pass: bool|null, sections: array, issues: array, stats: array}
 */
public function buildArticlePreview($articleId, $amId = 0)
{
    $fields = 'am_id,content,sort,type,amt_id';
    if ($this->hasAmRefCheckStatusColumn()) {
        $fields .= ',ref_check_status';
    }
    $q = Db::name('article_main')
        ->field($fields)
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2]);
    if ($amId > 0) {
        $q->where('am_id', $amId);
    }
    $mains = $q->order('sort asc')->select();

    // Single fetch of every check row of the article (status filter disabled).
    $allRows = $this->listByArticle($articleId, -1);
    $badByAm = $this->indexBadResults($allRows);

    $sections = [];
    $issues = [];
    $stats = ['total' => 0, 'mismatch' => 0, 'match' => 0, 'pending' => 0];

    foreach ($allRows as $r) {
        $stats['total']++;
        if (intval($r['status']) === self::RECORD_PENDING) {
            $stats['pending']++;
        } elseif (intval($r['is_match']) === 1) {
            $stats['match']++;
        } else {
            // Every non-pending row that is not a match lands here.
            $stats['mismatch']++;
        }
    }

    foreach ($mains as $main) {
        $id = intval($main['am_id']);
        $content = $this->resolveArticleMainCheckContent($main);
        $badIndex = isset($badByAm[$id]) ? $badByAm[$id] : array();
        $marked = $this->markContentForPreview($content, $id, $badIndex);
        $amStatus = intval($this->arrGet($main, 'ref_check_status', 0));
        $sections[] = [
            'am_id' => $id,
            'ref_check_status' => $amStatus,
            'ref_check_pass' => $amStatus === self::AM_STATUS_PASS,
            'ref_check_label' => self::amStatusLabel($amStatus),
            'content' => $content,
            'content_marked' => $marked['html'],
            'issue_count' => $marked['issue_count'],
        ];
        foreach ($marked['issues'] as $issue) {
            $issues[] = $issue;
        }
    }

    $articlePass = $this->resolveArticlePass($sections);

    return [
        'article_id' => $articleId,
        'article_ref_check_pass' => $articlePass,
        'sections' => $sections,
        'issues' => $issues,
        'stats' => $stats,
    ];
}
|
||
|
||
/**
 * Decide whether the whole article passes the reference check.
 *
 * Sections whose ref_check_status is AM_STATUS_NONE are ignored. Any other
 * status than AM_STATUS_PASS (i.e. fail or running) makes the article fail.
 *
 * @param array $sections Section entries carrying 'ref_check_status'.
 * @return bool|null true when at least one section was checked and all checked
 *                   sections passed; false on the first non-pass section;
 *                   null when no section has been checked at all.
 */
private function resolveArticlePass($sections)
{
    $checkedSections = 0;

    foreach ($sections as $section) {
        $state = intval($this->arrGet($section, 'ref_check_status', 0));

        if ($state === self::AM_STATUS_NONE) {
            continue;
        }
        if ($state !== self::AM_STATUS_PASS) {
            return false;
        }
        $checkedSections++;
    }

    if ($checkedSections === 0) {
        return null;
    }

    return true;
}
|
||
|
||
/**
 * Index the "bad" check results (completed but not matching) by section.
 *
 * Rows that are not RECORD_COMPLETED, that have is_match = 1, or that lack a
 * positive am_id / reference_no are skipped.
 *
 * @param array $rows Check-result rows.
 * @return array<int, array> am_id => [
 *     'by_raw'   => normalized reference_raw => reference_no => row,
 *     'contexts' => "textStart_textEnd" => [text_start, text_end, check_ids, reasons, ref_nos],
 * ]
 */
private function indexBadResults($rows)
{
    $grouped = [];

    foreach ($rows as $record) {
        if (intval($record['status']) !== self::RECORD_COMPLETED) {
            continue;
        }
        if (intval($record['is_match']) === 1) {
            continue;
        }

        $sectionId = intval($record['am_id']);
        $citedNo = intval($record['reference_no']);
        if ($sectionId <= 0 || $citedNo <= 0) {
            continue;
        }

        if (!isset($grouped[$sectionId])) {
            $grouped[$sectionId] = ['by_raw' => [], 'contexts' => []];
        }

        // Index by the normalized citation tag text so a tag can be looked up later.
        $rawKey = $this->normalizeRefRawKey((string)$this->arrGet($record, 'reference_raw', ''));
        if ($rawKey !== '') {
            $grouped[$sectionId]['by_raw'][$rawKey][$citedNo] = $record;
        }

        // Group by citing-sentence span; several references may share one span.
        $spanStart = intval($record['text_start']);
        $spanEnd = intval($record['text_end']);
        $spanKey = $spanStart . '_' . $spanEnd;
        if (!isset($grouped[$sectionId]['contexts'][$spanKey])) {
            $grouped[$sectionId]['contexts'][$spanKey] = [
                'text_start' => $spanStart,
                'text_end' => $spanEnd,
                'check_ids' => [],
                'reasons' => [],
                'ref_nos' => [],
            ];
        }

        $span = &$grouped[$sectionId]['contexts'][$spanKey];
        $span['check_ids'][] = $this->resolveCheckRowId($record);
        $span['ref_nos'][] = $citedNo;
        $why = trim((string)$this->arrGet($record, 'reason', ''));
        if ($why !== '') {
            $span['reasons'][$citedNo] = $why;
        }
        unset($span);
    }

    return $grouped;
}
|
||
|
||
/**
 * Normalize the inner text of a citation tag into a lookup key.
 *
 * Trims the text, maps the full-width comma and several dash variants to
 * their ASCII forms, removes spaces, and lower-cases the result.
 *
 * @param string $raw Inner text of a citation tag.
 * @return string
 */
private function normalizeRefRawKey($raw)
{
    $substitutions = [
        ',' => ',',
        '–' => '-',
        '—' => '-',
        '−' => '-',
        '‐' => '-',
        '‑' => '-',
        ' ' => '',
    ];

    $cleaned = str_replace(
        array_keys($substitutions),
        array_values($substitutions),
        trim($raw)
    );

    return strtolower($cleaned);
}
|
||
|
||
/**
 * Mark up one section's content for the preview.
 *
 * Pass 1 rewrites every citation tag returned by collectBlueTagMatches():
 * each number inside the tag that has a bad result is wrapped in
 * <span class="ref-no-error">, and the tag itself gets class/data attributes.
 * Pass 2 wraps every bad citing-sentence span in <span class="ref-context-error">,
 * translating the stored offsets by the length changes made in pass 1.
 *
 * NOTE(review): pass 2 processes spans from the last to the first and does not
 * compensate for wrappers it has already inserted, so it presumably expects
 * spans that do not overlap — confirm against how text_start/text_end are produced.
 *
 * @param string $content  Section content.
 * @param int    $amId     Section id, copied into each issue entry.
 * @param array  $badIndex One section's entry of the indexBadResults() structure.
 * @return array ['html' => string, 'issues' => array, 'issue_count' => int]
 */
private function markContentForPreview($content, $amId, $badIndex)
{
    $badByRaw = isset($badIndex['by_raw']) ? $badIndex['by_raw'] : array();
    $contexts = isset($badIndex['contexts']) ? $badIndex['contexts'] : array();
    $issues = array();
    $issueCount = 0;

    if ($content === '' || (empty($badByRaw) && empty($contexts))) {
        return array('html' => $content, 'issues' => array(), 'issue_count' => 0);
    }

    $html = $content;

    // Pass 1: mark the individual numbers inside each citation tag. Only numbers
    // that literally appear in the tag text can be wrapped.
    $tagMatches = $this->collectBlueTagMatches($html);
    $citeDeltas = array();
    if (!empty($tagMatches[0])) {
        $replacements = array();
        foreach ($tagMatches[0] as $idx => $tagMatch) {
            $fullTag = $tagMatch[0];
            $tagStart = $tagMatch[1];
            $tagEnd = $tagStart + strlen($fullTag);
            $inner = $tagMatches[1][$idx][0];
            $rawKey = $this->normalizeRefRawKey($inner);
            $badNums = isset($badByRaw[$rawKey]) ? $badByRaw[$rawKey] : array();

            $innerMarked = preg_replace_callback(
                '/\d+/',
                function ($numMatch) use ($badNums, &$issues, &$issueCount, $amId, $inner) {
                    $num = intval($numMatch[0]);
                    if (!isset($badNums[$num])) {
                        return $numMatch[0];
                    }
                    $row = $badNums[$num];
                    $checkId = $this->resolveCheckRowId($row);
                    $rowReason = isset($row['reason']) ? $row['reason'] : '';
                    $issueCount++;
                    $issues[] = array(
                        'am_id' => $amId,
                        'check_id' => $checkId,
                        'reference_no' => $num,
                        'reference_raw' => $inner,
                        'reason' => $rowReason,
                        'confidence' => floatval(isset($row['confidence']) ? $row['confidence'] : 0),
                    );
                    $title = htmlspecialchars(
                        '引用[' . $num . ']不合理: ' . $rowReason,
                        ENT_QUOTES,
                        'UTF-8'
                    );
                    return '<span class="ref-no-error" data-check-id="' . $checkId
                        . '" data-ref-no="' . $num . '" title="' . $title . '">'
                        . $numMatch[0] . '</span>';
                },
                $inner
            );

            $tagClass = !empty($badNums) ? ' ref-cite-error' : '';
            $groupIds = !empty($badNums)
                ? implode(',', array_map(function ($row) {
                    return (int) $this->resolveCheckRowId($row);
                }, $badNums))
                : '';
            $newHtml = '<blue class="ref-cite-tag' . $tagClass . '" data-ref-raw="' . htmlspecialchars($inner, ENT_QUOTES, 'UTF-8')
                . '" data-check-ids="' . $groupIds . '">[' . $innerMarked . ']</blue>';
            $replacements[] = array(
                'start' => $tagStart,
                'end' => $tagEnd,
                'html' => $newHtml,
                'delta' => strlen($newHtml) - ($tagEnd - $tagStart),
            );
        }

        // Apply from the last tag to the first so earlier offsets stay valid.
        usort($replacements, function ($a, $b) {
            return $b['start'] - $a['start'];
        });
        foreach ($replacements as $rep) {
            $html = substr($html, 0, $rep['start']) . $rep['html'] . substr($html, $rep['end']);
            $citeDeltas[] = array('start' => $rep['start'], 'delta' => $rep['delta']);
        }
    }

    // Translate an offset in the original content to the content after pass 1.
    $shiftByCite = function ($pos) use ($citeDeltas) {
        $d = 0;
        foreach ($citeDeltas as $cd) {
            if ($cd['start'] < $pos) {
                $d += $cd['delta'];
            }
        }
        return $pos + $d;
    };

    // Pass 2: wrap the citing sentences, last span first.
    if (!empty($contexts)) {
        $spans = array_values($contexts);
        usort($spans, function ($a, $b) {
            return $b['text_start'] - $a['text_start'];
        });
        foreach ($spans as $span) {
            $start = $span['text_start'];
            $end = $span['text_end'];
            if ($start < 0 || $end <= $start) {
                continue;
            }
            $s = $shiftByCite($start);
            $e = $shiftByCite($end);
            if ($e > strlen($html)) {
                $e = strlen($html);
            }
            // Stored offsets can point past the current content; without this guard
            // an empty wrapper would be emitted at the end of the HTML.
            if ($s >= $e) {
                continue;
            }
            $checkIds = array_values(array_unique($span['check_ids']));
            $refNos = array_values(array_unique($span['ref_nos']));
            sort($refNos);
            $reasonParts = array();
            foreach ($refNos as $rn) {
                if (!empty($span['reasons'][$rn])) {
                    $reasonParts[] = '[' . $rn . '] ' . $span['reasons'][$rn];
                }
            }
            $title = htmlspecialchars(
                '引用句可能不合理: ' . implode('; ', $reasonParts),
                ENT_QUOTES,
                'UTF-8'
            );
            $open = '<span class="ref-context-error" data-check-ids="' . implode(',', $checkIds)
                . '" data-ref-nos="' . implode(',', $refNos) . '" title="' . $title . '">';
            $close = '</span>';
            $html = substr($html, 0, $s) . $open . substr($html, $s, $e - $s) . $close . substr($html, $e);
        }
    }

    return array('html' => $html, 'issues' => $issues, 'issue_count' => $issueCount);
}
|
||
|
||
/**
 * Load the active (state = 0) references of a production article, keyed by
 * their `index` column.
 *
 * @param int $pArticleId production article id; non-positive yields an empty map.
 * @return array<int, array> refer index => row
 */
public function loadReferMapByPArticleId($pArticleId)
{
    if ($pArticleId <= 0) {
        return [];
    }

    $referRows = Db::name('production_article_refer')
        ->where('p_article_id', $pArticleId)
        ->where('state', 0)
        ->order('index asc')
        ->select();

    $byIndex = [];
    foreach ($referRows as $referRow) {
        $position = intval($referRow['index']);
        $byIndex[$position] = $referRow;
    }

    return $byIndex;
}
|
||
/**
 * Render a reference row as the plain-text block sent to the LLM.
 *
 * Emits one "Label: value" line per non-empty metadata field (label is the
 * field name with its first letter upper-cased), followed by a "Reference:"
 * line holding refer_frag, or refer_content when refer_frag is empty.
 *
 * @param mixed $refer Reference row.
 * @return string Lines joined with "\n"; empty string when nothing is set.
 */
public function formatReferForLlm($refer)
{
    $lines = [];

    $metaFields = ['title', 'author', 'joura', 'dateno', 'refer_doi', 'doilink'];
    foreach ($metaFields as $field) {
        $value = trim((string)$this->arrGet($refer, $field, ''));
        if ($value === '') {
            continue;
        }
        $lines[] = ucfirst($field) . ': ' . $value;
    }

    $fragment = trim((string)$this->arrGet($refer, 'refer_frag', ''));
    $fullText = trim((string)$this->arrGet($refer, 'refer_content', ''));
    $body = $fragment !== '' ? $fragment : $fullText;
    if ($body !== '') {
        $lines[] = 'Reference: ' . $body;
    }

    return implode("\n", $lines);
}
|
||
|
||
/**
 * After a reference has been edited, asynchronously re-check every check
 * record that belongs to it.
 *
 * Steps: refresh refer_text / refer_index / reference_no from the current
 * reference row, reset the result columns to "pending", clear the queue lock
 * of each record, set the affected sections to AM_STATUS_RUNNING, then hand
 * the jobs to pushJobsSortedByReferenceNo(). Unlike recheckByRefer() this
 * method does not run the LLM itself.
 *
 * @param int $pReferId   production_article_refer.p_refer_id (required).
 * @param int $pArticleId Optional; when not positive it is read from the reference row.
 * @return array{p_refer_id:int, p_article_id:int, reference_no:int, reset:int, queued:int, check_ids:int[], queue:string}
 * @throws \InvalidArgumentException When $pReferId is not positive.
 * @throws \RuntimeException When the reference row or its p_article_id cannot be found.
 */
public function enqueueRecheckByPReferId($pReferId, $pArticleId = 0)
{
    $pReferId = intval($pReferId);
    if ($pReferId <= 0) {
        throw new \InvalidArgumentException('p_refer_id is required');
    }

    $refer = Db::name('production_article_refer')
        ->where('p_refer_id', $pReferId)
        ->where('state', 0)
        ->find();
    if (empty($refer)) {
        throw new \RuntimeException('production_article_refer not found, p_refer_id=' . $pReferId);
    }

    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        $pArticleId = intval($this->arrGet($refer, 'p_article_id', 0));
    }
    if ($pArticleId <= 0) {
        throw new \RuntimeException('p_article_id is missing for p_refer_id=' . $pReferId);
    }

    // Citation numbers are 1-based while the stored index is 0-based.
    $referenceNo = intval($this->arrGet($refer, 'index', 0)) + 1;
    $referText = $this->formatReferForLlm($refer);
    $now = date('Y-m-d H:i:s');

    $records = Db::name('article_reference_check_result')
        ->where('p_article_id', $pArticleId)
        ->where('p_refer_id', $pReferId)
        ->select();

    $summary = [
        'p_refer_id' => $pReferId,
        'p_article_id' => $pArticleId,
        'reference_no' => $referenceNo,
        'reset' => 0,
        'queued' => 0,
        'check_ids' => [],
        'queue' => self::QUEUE_NAME,
    ];
    if (empty($records)) {
        return $summary;
    }

    $resetFields = [
        'refer_text' => $referText,
        'refer_index' => $referenceNo,
        'reference_no' => $referenceNo,
        'status' => self::RECORD_PENDING,
        'is_match' => 0,
        'can_support' => 0,
        'confidence' => 0,
        'reason' => '',
        'error_msg' => '',
        'updated_at' => $now,
    ];

    $jobs = [];
    $touchedSections = [];
    foreach ($records as $record) {
        $checkId = $this->resolveCheckRowId($record);
        Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields);
        $this->clearReferenceCheckQueueLock($checkId);

        $sectionId = intval($this->arrGet($record, 'am_id', 0));
        $jobs[] = [
            'check_id' => $checkId,
            'reference_no' => $referenceNo,
            'am_id' => $sectionId,
            'text_start' => intval($this->arrGet($record, 'text_start', 0)),
        ];
        if ($sectionId > 0) {
            $touchedSections[$sectionId] = true;
        }
    }

    foreach (array_keys($touchedSections) as $sectionId) {
        $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
    }

    $checkIds = $this->pushJobsSortedByReferenceNo($jobs);

    $summary['reset'] = count($records);
    $summary['queued'] = count($checkIds);
    $summary['check_ids'] = $checkIds;

    return $summary;
}
|
||
|
||
/**
 * Re-check only the failed records (status = RECORD_FAILED) of one reference,
 * asynchronously.
 *
 * refer_text and reference_no are left untouched; only the result columns
 * are reset before the jobs are handed to pushJobsSortedByReferenceNo().
 *
 * @param int $pReferId   production_article_refer.p_refer_id (required).
 * @param int $pArticleId Optional extra filter; when not positive and rows exist,
 *                        the first row's p_article_id is reported back.
 * @return array{p_refer_id:int, p_article_id:int, reset:int, queued:int, check_ids:int[], queue:string}
 * @throws \InvalidArgumentException When $pReferId is not positive.
 */
public function enqueueRecheckFailedByPReferId($pReferId, $pArticleId = 0)
{
    $pReferId = intval($pReferId);
    if ($pReferId <= 0) {
        throw new \InvalidArgumentException('p_refer_id is required');
    }

    $query = Db::name('article_reference_check_result')
        ->where('p_refer_id', $pReferId)
        ->where('status', self::RECORD_FAILED);
    $pArticleId = intval($pArticleId);
    if ($pArticleId > 0) {
        $query->where('p_article_id', $pArticleId);
    }

    $failedRecords = $query->select();

    if (empty($failedRecords)) {
        return [
            'p_refer_id' => $pReferId,
            'p_article_id' => $pArticleId,
            'reset' => 0,
            'queued' => 0,
            'check_ids' => [],
            'queue' => self::QUEUE_NAME,
        ];
    }

    if ($pArticleId <= 0) {
        $pArticleId = intval($this->arrGet($failedRecords[0], 'p_article_id', 0));
    }

    $resetFields = [
        'status' => self::RECORD_PENDING,
        'is_match' => 0,
        'can_support' => 0,
        'confidence' => 0,
        'reason' => '',
        'error_msg' => '',
        'updated_at' => date('Y-m-d H:i:s'),
    ];

    $jobs = [];
    $touchedSections = [];
    foreach ($failedRecords as $record) {
        $checkId = $this->resolveCheckRowId($record);
        Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields);
        $this->clearReferenceCheckQueueLock($checkId);

        $sectionId = intval($this->arrGet($record, 'am_id', 0));
        $jobs[] = [
            'check_id' => $checkId,
            'reference_no' => intval($this->arrGet($record, 'reference_no', 0)),
            'am_id' => $sectionId,
            'text_start' => intval($this->arrGet($record, 'text_start', 0)),
        ];
        if ($sectionId > 0) {
            $touchedSections[$sectionId] = true;
        }
    }

    foreach (array_keys($touchedSections) as $sectionId) {
        $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
    }

    $checkIds = $this->pushJobsSortedByReferenceNo($jobs);

    return [
        'p_refer_id' => $pReferId,
        'p_article_id' => $pArticleId,
        'reset' => count($failedRecords),
        'queued' => count($checkIds),
        'check_ids' => $checkIds,
        'queue' => self::QUEUE_NAME,
    ];
}
|
||
|
||
/**
 * Synchronously re-check every record of an article that cites one reference.
 *
 * The reference is identified by p_refer_id or, failing that, by its 1-based
 * reference number. Matching records (same p_refer_id OR same reference_no
 * within the article) are reset to pending, then runReferenceCheckOnce() is
 * called for each of them inside this request, ordered by reference_no,
 * am_id, text_start. Per-record failures are collected, logged, and do not
 * stop the loop. Finally the affected sections' status is re-synchronised.
 *
 * Both return paths expose the same result keys (checked / failed / results /
 * errors); the empty path additionally keeps its historical queued / queue keys.
 *
 * @param int $articleId   Article id (required).
 * @param int $pReferId    production_article_refer.p_refer_id, or 0.
 * @param int $referenceNo 1-based citation number, used when $pReferId is 0.
 * @return array Keys: article_id, p_refer_id, reference_no, reset, checked,
 *               failed, check_ids, results, errors.
 * @throws \InvalidArgumentException When $articleId is not positive (resolveReferForRecheck()
 *                                   also throws it when neither identifier is given).
 * @throws \RuntimeException When resolveReferForRecheck() cannot find the production
 *                           article or the reference.
 */
public function recheckByRefer($articleId, $pReferId = 0, $referenceNo = 0)
{
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }

    $ctx = $this->resolveReferForRecheck($articleId, intval($pReferId), intval($referenceNo));
    $refer = $ctx['refer'];
    $pReferId = $ctx['p_refer_id'];
    $referenceNo = $ctx['reference_no'];
    $pArticleId = $ctx['p_article_id'];
    $referText = $this->formatReferForLlm($refer);
    $now = date('Y-m-d H:i:s');

    $rows = Db::name('article_reference_check_result')
        ->where('article_id', $articleId)
        ->where(function ($query) use ($pReferId, $referenceNo) {
            $query->where('p_refer_id', $pReferId)->whereOr('reference_no', $referenceNo);
        })
        ->select();

    if (empty($rows)) {
        return [
            'article_id' => $articleId,
            'p_refer_id' => $pReferId,
            'reference_no' => $referenceNo,
            'reset' => 0,
            'queued' => 0,
            'check_ids' => [],
            'queue' => self::QUEUE_NAME,
            // Same keys as the non-empty result so callers can read them unconditionally.
            'checked' => 0,
            'failed' => 0,
            'results' => [],
            'errors' => [],
        ];
    }

    $resetFields = [
        'refer_text' => $referText,
        'p_refer_id' => $pReferId,
        'p_article_id' => $pArticleId,
        'refer_index' => $referenceNo,
        'status' => self::RECORD_PENDING,
        'is_match' => 0,
        'can_support' => 0,
        'confidence' => 0,
        'reason' => '',
        'error_msg' => '',
        'updated_at' => $now,
    ];

    $pendingJobs = [];
    $amIds = [];
    foreach ($rows as $row) {
        $checkId = $this->resolveCheckRowId($row);
        Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields);
        $amId = intval($row['am_id']);
        $pendingJobs[] = [
            'check_id' => $checkId,
            'reference_no' => $referenceNo,
            'am_id' => $amId,
            'text_start' => intval(isset($row['text_start']) ? $row['text_start'] : 0),
        ];
        if ($amId > 0) {
            $amIds[$amId] = true;
        }
    }

    foreach (array_keys($amIds) as $amId) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
    }

    usort($pendingJobs, function ($a, $b) {
        if ($a['reference_no'] !== $b['reference_no']) {
            return $a['reference_no'] - $b['reference_no'];
        }
        if ($a['am_id'] !== $b['am_id']) {
            return $a['am_id'] - $b['am_id'];
        }
        return $a['text_start'] - $b['text_start'];
    });

    $checkIds = [];
    $results = [];
    $failed = [];
    foreach ($pendingJobs as $job) {
        $checkId = intval($job['check_id']);
        $checkIds[] = $checkId;
        $this->clearReferenceCheckQueueLock($checkId);
        try {
            $results[] = $this->runReferenceCheckOnce($checkId);
        } catch (\Exception $e) {
            // One failing record must not abort the remaining ones.
            $failed[] = [
                'check_id' => $checkId,
                'error' => $e->getMessage(),
            ];
            \think\Log::error('recheckByRefer check_id=' . $checkId . ' ' . $e->getMessage());
        }
    }

    foreach (array_keys($amIds) as $amId) {
        $this->syncAmRefCheckStatus($amId);
    }

    return [
        'article_id' => $articleId,
        'p_refer_id' => $pReferId,
        'reference_no' => $referenceNo,
        'reset' => count($rows),
        'checked' => count($results),
        'failed' => count($failed),
        'check_ids' => $checkIds,
        'results' => $results,
        'errors' => $failed,
    ];
}
|
||
|
||
/**
 * Delete the Redis keys the queue keeps for a check id, for both the first-pass
 * (ReferenceCheck) and second-pass (ReferenceCheckTwo) jobs.
 *
 * Per the original note this prevents a re-check job from being silently
 * dropped by the job's lock acquisition. Failures are logged as warnings and
 * never propagate.
 *
 * @param int $checkId Check record id; non-positive values are ignored.
 * @return void
 */
public function clearReferenceCheckQueueLock($checkId)
{
    $checkId = intval($checkId);
    if ($checkId <= 0) {
        return;
    }

    $jobClassByPrefix = [
        'queue_job' => 'app\\api\\job\\ReferenceCheck',
        'queue_job_two' => 'app\\api\\job\\ReferenceCheckTwo',
    ];

    try {
        $redisKeys = [];
        foreach ($jobClassByPrefix as $prefix => $jobClass) {
            $lockKey = $prefix . ':' . $jobClass . ':' . $checkId;
            $redisKeys[] = $lockKey;
            $redisKeys[] = $lockKey . ':status';
        }
        QueueRedis::getInstance()->deleteRedisKeys($redisKeys);
    } catch (\Exception $e) {
        \think\Log::warning('clearReferenceCheckQueueLock id=' . $checkId . ' ' . $e->getMessage());
    }
}
|
||
|
||
/**
 * Run one LLM reference check synchronously and write the outcome back to
 * article_reference_check_result.
 *
 * The section text comes from resolveMainContentForJob(); the reference text
 * comes from the live production_article_refer row when it still exists
 * (state = 0), otherwise from the refer_text stored on the record.
 *
 * @param int $checkId Check record id.
 * @return array{check_id:int, can_support:int, is_match:int, confidence:float, reason:mixed}
 * @throws \RuntimeException When the record is missing, when either text is empty,
 *                           or when the LLM reports a failed request. In the last
 *                           two cases the record is first marked RECORD_FAILED.
 */
public function runReferenceCheckOnce($checkId)
{
    $checkId = intval($checkId);
    $record = Db::name('article_reference_check_result')->where('id', $checkId)->find();
    if (empty($record)) {
        throw new \RuntimeException('article_reference_check_result not found, id=' . $checkId);
    }

    $contentA = $this->resolveMainContentForJob($record);

    $liveRefer = null;
    $referId = intval($record['p_refer_id']);
    if ($referId > 0) {
        $liveRefer = Db::name('production_article_refer')
            ->where('p_refer_id', $referId)
            ->where('state', 0)
            ->find();
    }

    // Prefer the current reference row; fall back to the snapshot on the record.
    $contentB = $liveRefer
        ? $this->formatReferForLlm($liveRefer)
        : trim((string)$this->arrGet($record, 'refer_text', ''));

    if ($contentA === '' || $contentB === '') {
        $missing = 'Missing section content (text/table) or refer_text';
        $this->updateCheckResult($checkId, [
            'status' => self::RECORD_FAILED,
            'error_msg' => $missing,
        ]);
        throw new \RuntimeException($missing);
    }

    $llmResult = (new LLMService())->checkReference($contentA, $contentB, false);
    $requestFailed = !empty($llmResult['request_failed']);
    $canSupport = $this->parseLlmCanSupport($llmResult);
    $confidence = floatval(isset($llmResult['confidence']) ? $llmResult['confidence'] : 0);
    $reason = isset($llmResult['reason']) ? $llmResult['reason'] : '';

    // LLM communication failure: record RECORD_FAILED plus the error text, then
    // throw. Per the original note the queue worker is expected to retry the job
    // and a final failure handler keeps the failed status.
    if ($requestFailed) {
        $this->updateCheckResult($checkId, [
            'confidence' => $confidence,
            'reason' => $reason,
            'status' => self::RECORD_FAILED,
            'error_msg' => $reason,
        ]);
        $this->clearReferenceCheckQueueLock($checkId);
        throw new \RuntimeException($reason !== '' ? $reason : 'LLM request failed');
    }

    // can_support and is_match are stored with the same verdict.
    $verdict = $canSupport ? 1 : 0;
    $this->updateCheckResult($checkId, [
        'can_support' => $verdict,
        'is_match' => $verdict,
        'confidence' => $confidence,
        'reason' => $reason,
        'status' => self::RECORD_COMPLETED,
        'error_msg' => '',
    ]);

    $this->clearReferenceCheckQueueLock($checkId);
    $this->maybeEnqueueSecondPass($checkId, $confidence);

    return [
        'check_id' => $checkId,
        'can_support' => $verdict,
        'is_match' => $verdict,
        'confidence' => $confidence,
        'reason' => $reason,
    ];
}
|
||
|
||
/**
 * Locate the reference row a re-check refers to.
 *
 * The production article (state 0 or 2) of the article is looked up first;
 * the reference is then found by p_refer_id, or by 1-based reference number
 * through loadReferMapByPArticleId().
 *
 * @param int $articleId   Article id.
 * @param int $pReferId    Reference id, or 0.
 * @param int $referenceNo 1-based reference number, used when $pReferId is 0.
 * @return array{refer: array, p_article_id: int, p_refer_id: int, reference_no: int}
 * @throws \InvalidArgumentException When neither $pReferId nor $referenceNo is positive.
 * @throws \RuntimeException When the production article or the reference is not found.
 */
private function resolveReferForRecheck($articleId, $pReferId, $referenceNo)
{
    $production = Db::name('production_article')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->find();
    if (empty($production)) {
        throw new \RuntimeException('production_article not found for article_id=' . $articleId);
    }

    $pArticleId = intval($production['p_article_id']);

    if ($pReferId <= 0 && $referenceNo <= 0) {
        throw new \InvalidArgumentException('p_refer_id or reference_no is required');
    }

    $refer = null;
    if ($pReferId > 0) {
        $refer = Db::name('production_article_refer')
            ->where('p_refer_id', $pReferId)
            ->where('p_article_id', $pArticleId)
            ->where('state', 0)
            ->find();
    } else {
        // Reference numbers are 1-based; the map is keyed by the 0-based index.
        $referMap = $this->loadReferMapByPArticleId($pArticleId);
        $slot = $referenceNo - 1;
        if (isset($referMap[$slot])) {
            $refer = $referMap[$slot];
        }
    }

    if (empty($refer)) {
        throw new \RuntimeException('production_article_refer not found');
    }

    return [
        'refer' => $refer,
        'p_article_id' => $pArticleId,
        'p_refer_id' => intval($refer['p_refer_id']),
        'reference_no' => intval($refer['index']) + 1,
    ];
}
|
||
|
||
/**
 * Extract a DOI using only the refer_doi field (used by the second-pass
 * Crossref abstract lookup).
 *
 * @param mixed $refer Reference row.
 * @return string First DOI found by extractDoisFromString(); '' when $refer is not
 *                an array, the field is empty or contains "not available"
 *                (case-insensitive), or no DOI is found.
 */
public function extractReferDoiOnly($refer)
{
    if (!is_array($refer)) {
        return '';
    }

    $doiField = trim((string)$this->arrGet($refer, 'refer_doi', ''));
    $unusable = $doiField === '' || stripos($doiField, 'not available') !== false;
    if ($unusable) {
        return '';
    }

    $found = $this->extractDoisFromString($doiField);
    if (empty($found)) {
        return '';
    }

    return $found[0];
}
|
||
|
||
/**
 * Fetch abstract text for a reference via Crossref, based on its refer_doi
 * (second-pass check only).
 *
 * The lookup itself is delegated to extractCrossrefBlock() with a
 * CrossrefService configured from the `crossref_mailto` env value.
 *
 * @param mixed $refer Reference row.
 * @return array{text:string, has_abstract:bool, doi:string} Empty text and
 *               has_abstract = false when no DOI is available or nothing is returned.
 */
public function fetchCrossrefAbstractByReferDoi($refer)
{
    $result = ['text' => '', 'has_abstract' => false, 'doi' => ''];

    $doi = $this->extractReferDoiOnly($refer);
    if ($doi === '') {
        return $result;
    }
    $result['doi'] = $doi;

    $client = new CrossrefService([
        'mailto' => trim((string)Env::get('crossref_mailto', '')),
    ]);
    $block = $this->extractCrossrefBlock($doi, $client);
    if ($block === null) {
        return $result;
    }

    return [
        'text' => $block['text'],
        'has_abstract' => !empty($block['has_abstract']),
        'doi' => $doi,
    ];
}
|
||
|
||
/**
 * Read the "can the reference support the text" verdict from an LLM result.
 *
 * Uses the can_support key when it is present (even if null); otherwise falls
 * back to is_match, and to false when neither is available. The value is
 * interpreted by parseLlmIsMatch().
 *
 * @param mixed $llmResult Decoded LLM response.
 * @return bool false when $llmResult is not an array.
 */
public function parseLlmCanSupport($llmResult)
{
    if (!is_array($llmResult)) {
        return false;
    }

    if (array_key_exists('can_support', $llmResult)) {
        $rawVerdict = $llmResult['can_support'];
    } elseif (isset($llmResult['is_match'])) {
        $rawVerdict = $llmResult['is_match'];
    } else {
        $rawVerdict = false;
    }

    return $this->parseLlmIsMatch($rawVerdict);
}
|
||
|
||
/**
 * Resolve the section text used for the first-pass check of a record.
 *
 * Loads the article_main row of the record's am_id, obtains its checkable
 * content through resolveArticleMainCheckContent(), and passes it through
 * normalizeCheckContentForLlm() together with $maxChars.
 *
 * @param array $row      Check record; only am_id is read.
 * @param int   $maxChars Forwarded to normalizeCheckContentForLlm().
 * @return string '' when am_id is missing, the section is not found, or its content is blank.
 */
public function resolveMainContentForJob(array $row, $maxChars = 8000)
{
    $sectionId = intval($this->arrGet($row, 'am_id', 0));
    if ($sectionId <= 0) {
        return '';
    }

    $section = Db::name('article_main')
        ->field('content,type,amt_id,article_id')
        ->where('am_id', $sectionId)
        ->find();
    if (empty($section)) {
        return '';
    }

    $text = trim($this->resolveArticleMainCheckContent($section));

    return $text === '' ? '' : $this->normalizeCheckContentForLlm($text, $maxChars);
}
|
||
|
||
/**
 * Tell whether an article_main row is a table section.
 *
 * True when type is MAIN_TYPE_TABLE, when amt_id is positive, or when the
 * content contains a "<table" placeholder carrying a numeric tableId attribute.
 *
 * @param array $main article_main row.
 * @return bool
 */
private function isArticleMainTableSection(array $main)
{
    $type = intval($this->arrGet($main, 'type', self::MAIN_TYPE_TEXT));
    if ($type === self::MAIN_TYPE_TABLE) {
        return true;
    }

    if (intval($this->arrGet($main, 'amt_id', 0)) > 0) {
        return true;
    }

    $content = (string)$this->arrGet($main, 'content', '');
    if (stripos($content, '<table') === false) {
        return false;
    }

    return (bool)preg_match('/tableId\s*=\s*[\'"]?\d+/i', $content);
}
|
||
|
||
/**
 * Resolve the amt_id of a table section, either from the row's amt_id column
 * or from a tableId attribute found in its content placeholder.
 *
 * @param array $main article_main row.
 * @return int The amt_id, or 0 when neither source provides one.
 */
private function resolveArticleMainTableAmtId(array $main)
{
    $fromColumn = intval($this->arrGet($main, 'amt_id', 0));
    if ($fromColumn > 0) {
        return $fromColumn;
    }

    $content = (string)$this->arrGet($main, 'content', '');
    $found = preg_match('/tableId\s*=\s*[\'"]?(\d+)/i', $content, $captures);

    return $found ? intval($captures[1]) : 0;
}
|
||
|
||
/**
 * Load the article_main_table row (table_data, title, note) behind a table section.
 *
 * Only rows with state 0 or 2 are considered; when the section row carries a
 * positive article_id the lookup is restricted to that article as well.
 *
 * @param array $main article_main row.
 * @return array|null The table row, or null when no amt_id can be resolved or no row matches.
 */
private function loadArticleMainTableRow(array $main)
{
    $amtId = $this->resolveArticleMainTableAmtId($main);
    if ($amtId <= 0) {
        return null;
    }

    $query = Db::name('article_main_table')
        ->where('amt_id', $amtId)
        ->whereIn('state', [0, 2])
        ->field('table_data,title,note');

    $ownerArticleId = intval($this->arrGet($main, 'article_id', 0));
    if ($ownerArticleId > 0) {
        $query->where('article_id', $ownerArticleId);
    }

    $tableRow = $query->find();
    if (empty($tableRow)) {
        return null;
    }

    return $tableRow;
}
|
||
|
||
/**
 * Extract the citations of one section.
 *
 * Text sections are scanned through extractReferences() on their content.
 * Table sections are scanned through extractReferencesFromTableDataJson(),
 * which receives the table's table_data plus its non-empty title and note
 * as prefix chunks.
 *
 * @param array $main article_main row.
 * @return array Citations found; [] for a table section whose table row cannot be loaded.
 */
public function extractReferencesForArticleMain(array $main)
{
    if (!$this->isArticleMainTableSection($main)) {
        $content = (string)$this->arrGet($main, 'content', '');
        return $this->extractReferences($content);
    }

    $tableRow = $this->loadArticleMainTableRow($main);
    if (empty($tableRow)) {
        return [];
    }

    $prefixChunks = [];
    foreach (['title', 'note'] as $column) {
        $text = trim((string)$this->arrGet($tableRow, $column, ''));
        if ($text === '') {
            continue;
        }
        $prefixChunks[] = $text;
    }

    $tableData = (string)$this->arrGet($tableRow, 'table_data', '');

    return $this->extractReferencesFromTableDataJson($tableData, $prefixChunks);
}
|
||
|
||
/**
 * Extract citations from a table_data JSON string, one table row at a time.
 *
 * $prefixChunks (title / note etc.) are scanned before the rows. Every scanned
 * chunk advances a running byte offset by its length plus one, and that offset
 * is added to the four position fields of each citation found in later chunks.
 * When the JSON cannot be decoded, the raw string is scanned as a single chunk.
 *
 * @param string $tableDataJson raw table_data value
 * @param array  $prefixChunks  text chunks scanned ahead of the table rows
 * @return array list of citation records with shifted offsets
 */
public function extractReferencesFromTableDataJson($tableDataJson, array $prefixChunks = [])
{
    $collected = [];
    $cursor = 0;
    $positionKeys = ['text_start', 'text_end', 'reference_start', 'reference_end'];

    // Scan one chunk and append its citations, shifted by $base bytes.
    $scan = function ($segment, $base) use (&$collected, $positionKeys) {
        foreach ($this->extractReferences($segment) as $citation) {
            foreach ($positionKeys as $key) {
                $citation[$key] = intval($citation[$key]) + $base;
            }
            $collected[] = $citation;
        }
    };

    foreach ($prefixChunks as $piece) {
        $piece = trim((string)$piece);
        if ($piece === '') {
            continue;
        }
        $scan($piece, $cursor);
        $cursor += strlen($piece) + 1;
    }

    $json = trim((string)$tableDataJson);
    if ($json === '') {
        return $collected;
    }

    $rows = $this->decodeTableDataJsonToArray($json);
    if ($rows === null) {
        // Not valid JSON: treat the whole string as one chunk.
        $scan($json, $cursor);

        return $collected;
    }

    foreach ($rows as $row) {
        $line = $this->buildTableRowCheckLine($row);
        if ($line === '') {
            continue;
        }
        $scan($line, $cursor);
        $cursor += strlen($line) + 1;
    }

    return $collected;
}
|
||
|
||
/**
 * Raw HTML used for queueing / LLM input.
 *
 * Text sections return their content unchanged. Table sections return the
 * non-empty title, note and row-flattened table_data joined by newlines.
 *
 * @param array $main article_main row
 * @return string raw content, or '' when a table section has no table row
 */
public function resolveArticleMainCheckContent(array $main)
{
    if (!$this->isArticleMainTableSection($main)) {
        return (string)$this->arrGet($main, 'content', '');
    }

    $tableRow = $this->loadArticleMainTableRow($main);
    if (empty($tableRow)) {
        return '';
    }

    $pieces = [];
    foreach (['title', 'note'] as $column) {
        $value = trim((string)$this->arrGet($tableRow, $column, ''));
        if ($value === '') {
            continue;
        }
        $pieces[] = $value;
    }

    $flattened = $this->flattenTableDataJsonToCheckContent(
        (string)$this->arrGet($tableRow, 'table_data', '')
    );
    if ($flattened !== '') {
        $pieces[] = $flattened;
    }

    return implode("\n", $pieces);
}
|
||
|
||
/**
 * Flatten one table row into a single line.
 *
 * The trimmed `text` of every array-typed cell is collected (empty ones are
 * dropped) and the results are joined with " | ", so a citation keeps the
 * other cells of its row as context.
 *
 * @param mixed $row decoded table row; anything that is not an array yields ''
 * @return string
 */
private function buildTableRowCheckLine($row)
{
    if (!is_array($row)) {
        return '';
    }

    $texts = [];
    foreach ($row as $cell) {
        if (is_array($cell)) {
            $value = trim((string)$this->arrGet($cell, 'text', ''));
            if ($value !== '') {
                $texts[] = $value;
            }
        }
    }

    return implode(' | ', $texts);
}
|
||
|
||
/**
 * Flatten table_data into newline-separated row lines (for LLM input / preview).
 *
 * When the value cannot be decoded as JSON, the trimmed raw string is returned
 * unchanged.
 *
 * @param string $tableDataJson raw table_data value
 * @return string
 */
private function flattenTableDataJsonToCheckContent($tableDataJson)
{
    $json = trim((string)$tableDataJson);
    if ($json === '') {
        return '';
    }

    $rows = $this->decodeTableDataJsonToArray($json);
    if ($rows === null) {
        return $json;
    }

    $rendered = [];
    foreach ($rows as $row) {
        $line = $this->buildTableRowCheckLine($row);
        if ($line === '') {
            continue;
        }
        $rendered[] = $line;
    }

    return implode("\n", $rendered);
}
|
||
|
||
/**
 * Decode a table_data string into an array.
 *
 * A leading UTF-8 BOM is removed first. If the first decode yields a string
 * (double-encoded JSON), that string is decoded once more.
 *
 * @param mixed $raw raw JSON text
 * @return array|null decoded array, or null when empty, invalid, or not an array
 */
private function decodeTableDataJsonToArray($raw)
{
    $payload = trim((string)$raw);
    if ($payload === '') {
        return null;
    }
    if (strncmp($payload, "\xEF\xBB\xBF", 3) === 0) {
        $payload = substr($payload, 3);
    }

    // At most two passes: the plain value, then one level of double encoding.
    for ($pass = 0; $pass < 2; $pass++) {
        $value = json_decode($payload, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            return null;
        }
        if (is_array($value)) {
            return $value;
        }
        if (!is_string($value)) {
            return null;
        }
        $payload = $value;
    }

    return null;
}
|
||
|
||
/**
 * Turn raw section HTML into plain text for the LLM.
 *
 * Blue citation tags are rewritten via pregReplaceBlueTags() with the
 * replacement '[$1]', remaining tags are stripped, HTML entities decoded and
 * whitespace runs collapsed. Text longer than the limit is cut and suffixed
 * with '...'.
 *
 * @param string $raw      raw HTML
 * @param int    $maxChars character limit; values below 500 are raised to 500
 * @return string
 */
private function normalizeCheckContentForLlm($raw, $maxChars = 8000)
{
    $plain = strip_tags($this->pregReplaceBlueTags($raw, '[$1]'));
    $plain = html_entity_decode($plain, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    $plain = trim(preg_replace('/\s+/u', ' ', $plain));
    if ($plain === '') {
        return '';
    }

    $limit = max(500, intval($maxChars));
    if (mb_strlen($plain) <= $limit) {
        return $plain;
    }

    return mb_substr($plain, 0, $limit) . '...';
}
|
||
|
||
/**
 * Local context of a citation for a check record.
 *
 * Returns the trimmed origin_text, falling back to the trimmed content_a when
 * origin_text is empty.
 *
 * @param array $row check record
 * @return string
 */
public function resolveCitationContextForJob(array $row)
{
    $context = trim((string)$this->arrGet($row, 'origin_text', ''));
    if ($context !== '') {
        return $context;
    }

    return trim((string)$this->arrGet($row, 'content_a', ''));
}
|
||
|
||
/**
 * Extract one DOI (10.xxxx/...) from a refer row.
 *
 * Returns the first entry of extractAllDoiCandidatesFromRefer(), whose field
 * priority is refer_content > refer_doi > doi > doilink.
 *
 * @param mixed $refer refer row
 * @return string the DOI, or '' when none is found
 */
public function extractDoiFromRefer($refer)
{
    $candidates = $this->extractAllDoiCandidatesFromRefer($refer);
    if (empty($candidates)) {
        return '';
    }

    return $candidates[0];
}
|
||
|
||
/**
 * Collect every DOI candidate of a refer row, de-duplicated and in priority order.
 *
 * Fields are scanned in the order refer_content, refer_doi, doi, doilink, so a
 * DOI found in the original citation text ranks ahead of the metadata fields.
 *
 * @param mixed $refer refer row; anything that is not an array yields []
 * @return string[]
 */
public function extractAllDoiCandidatesFromRefer($refer)
{
    if (!is_array($refer)) {
        return [];
    }

    $unique = [];
    foreach (['refer_content', 'refer_doi', 'doi', 'doilink'] as $field) {
        $rawValue = (string)$this->arrGet($refer, $field, '');
        foreach ($this->extractDoisFromString($rawValue) as $doi) {
            if (in_array($doi, $unique, true)) {
                continue;
            }
            $unique[] = $doi;
        }
    }

    return $unique;
}
|
||
|
||
/**
 * Extract every DOI of the form 10.xxxx/yyy from arbitrary text.
 *
 * Text that is empty or contains "not available" yields no DOI. doi.org URLs
 * are matched first, then bare DOIs; if neither gives a result and the text
 * itself starts with "10.", the whole text is tried as one DOI.
 *
 * @param mixed $text
 * @return string[] de-duplicated DOIs in order of discovery
 */
private function extractDoisFromString($text)
{
    $haystack = trim((string)$text);
    if ($haystack === '') {
        return [];
    }
    if (stripos($haystack, 'not available') !== false) {
        return [];
    }

    $found = [];
    $patterns = [
        '~doi\.org/([^\s?#"\'<>]+)~i',
        '~\b(10\.\d{3,9}/[^\s?#"\'<>]+)~i',
    ];
    foreach ($patterns as $pattern) {
        if (!preg_match_all($pattern, $haystack, $hits)) {
            continue;
        }
        foreach ($hits[1] as $candidate) {
            $candidate = $this->trimDoiTail(trim($candidate));
            if ($this->isValidDoi($candidate)) {
                $found[] = $candidate;
            }
        }
    }

    // Last resort: the whole text is itself a DOI.
    if (empty($found) && strpos($haystack, '10.') === 0) {
        $whole = $this->trimDoiTail($haystack);
        if ($this->isValidDoi($whole)) {
            $found[] = $whole;
        }
    }

    return array_values(array_unique($found));
}
|
||
|
||
/**
 * Strip trailing punctuation that was captured along with a DOI.
 *
 * Sentence punctuation, quotes, '>', backslashes and whitespace are always
 * removed from the end. A trailing ')', ']' or '}' is removed only when it has
 * no matching opener inside the DOI, so a DOI that legitimately ends in a
 * balanced bracket (e.g. "10.1002/(SICI)1097(1998)") is kept intact while
 * "(doi:10.1000/abc)" still loses its stray ')'.
 *
 * @param string $doi
 * @return string
 */
private function trimDoiTail($doi)
{
    $doi = (string)$doi;
    $openerFor = [')' => '(', ']' => '[', '}' => '{'];

    while (true) {
        $doi = rtrim($doi, ".,;:>\"'\\ \t\n\r");
        if ($doi === '') {
            return '';
        }

        $last = $doi[strlen($doi) - 1];
        if (!isset($openerFor[$last])) {
            return $doi;
        }
        // Balanced closer: part of the DOI itself, keep it.
        if (substr_count($doi, $last) <= substr_count($doi, $openerFor[$last])) {
            return $doi;
        }

        // Unmatched closer: drop it, then re-check what is now at the end.
        $doi = (string)substr($doi, 0, -1);
    }
}
|
||
|
||
/**
 * Check that a string is shaped like a DOI: "10." followed by 3 to 9 digits,
 * a slash, and at least one non-whitespace character, with nothing else around.
 *
 * @param mixed $doi
 * @return bool
 */
private function isValidDoi($doi)
{
    $candidate = (string)$doi;

    return preg_match('~^10\.\d{3,9}/[^\s]+$~i', $candidate) === 1;
}
|
||
|
||
/**
 * Fetch literature content for the DOI(s) of a refer row via PubMed / Crossref.
 *
 * The content is fetched up front because the local LLM cannot open web pages.
 *
 * Behaviour:
 *  - every DOI candidate of the row is tried in priority order
 *    (refer_content > refer_doi > doi > doilink);
 *  - the first candidate whose block has an abstract wins immediately;
 *  - otherwise the first candidate that produced any block is used;
 *  - when nothing could be fetched, '' is returned.
 *
 * @param mixed $refer refer row
 * @return string text block, or ''
 */
public function fetchDoiLiteratureBlock($refer)
{
    $doiList = $this->extractAllDoiCandidatesFromRefer($refer);
    if (empty($doiList)) {
        return '';
    }

    $pubmed = new PubmedService([
        'email' => trim((string)Env::get('pubmed_email', '')),
        'tool' => trim((string)Env::get('pubmed_tool', 'tmrjournals')),
    ]);
    $crossref = new CrossrefService([
        'mailto' => trim((string)Env::get('crossref_mailto', '')),
    ]);

    $firstUsable = null;
    foreach ($doiList as $doi) {
        $block = $this->buildDoiBlockFromSources($doi, $pubmed, $crossref);
        if ($block === null) {
            continue;
        }
        if (!empty($block['has_abstract'])) {
            // A block with an abstract is the best possible result; stop here.
            return $block['text'];
        }
        if ($firstUsable === null) {
            $firstUsable = $block;
        }
    }

    if ($firstUsable === null) {
        return '';
    }

    return $firstUsable['text'];
}
|
||
|
||
/**
 * Fetch the content of a single DOI.
 *
 * PubMed is asked first. When it returns a title or an abstract, a PubMed
 * block is built; if that block lacks an abstract and Crossref has one, the
 * Crossref block is appended. When PubMed yields neither title nor abstract,
 * the Crossref block alone is returned.
 *
 * @param string          $doi
 * @param PubmedService   $pubmed
 * @param CrossrefService $crossref
 * @return array|null ['text' => string, 'has_abstract' => bool], or null
 */
private function buildDoiBlockFromSources($doi, PubmedService $pubmed, CrossrefService $crossref)
{
    $doi = trim((string)$doi);
    if ($doi === '') {
        return null;
    }

    $pub = $pubmed->fetchByDoi($doi);
    if (!is_array($pub)) {
        return $this->extractCrossrefBlock($doi, $crossref);
    }

    $abstract = trim((string)$this->arrGet($pub, 'abstract', ''));
    if ($abstract === '' && trim((string)$this->arrGet($pub, 'title', '')) === '') {
        return $this->extractCrossrefBlock($doi, $crossref);
    }

    $lines = ['Source: PubMed (DOI ' . $doi . ')'];

    $scalarLabels = [
        'title' => 'Actual Title: ',
        'journal' => 'Journal: ',
        'year' => 'Year: ',
    ];
    foreach ($scalarLabels as $key => $label) {
        if (!empty($pub[$key])) {
            $lines[] = $label . trim((string)$pub[$key]);
        }
    }

    $listLabels = [
        'publication_types' => 'Publication Types: ',
        'mesh_terms' => 'MeSH: ',
    ];
    foreach ($listLabels as $key => $label) {
        if (!empty($pub[$key])) {
            $lines[] = $label . implode('; ', (array)$pub[$key]);
        }
    }

    if ($abstract !== '') {
        $lines[] = 'Abstract: ' . $this->truncate($abstract, 3500);

        return ['text' => implode("\n", $lines), 'has_abstract' => true];
    }

    // PubMed had no abstract: try to supplement it from Crossref.
    $supplement = $this->extractCrossrefBlock($doi, $crossref);
    if ($supplement !== null && $supplement['has_abstract']) {
        $lines[] = "\n--- Crossref 补充 ---\n" . $supplement['text'];

        return ['text' => implode("\n", $lines), 'has_abstract' => true];
    }

    return ['text' => implode("\n", $lines), 'has_abstract' => false];
}
|
||
|
||
/**
 * Build a text block (title / journal / authors / abstract) from Crossref.
 *
 * The abstract is passed through cleanCrossrefAbstract() because Crossref
 * usually wraps it in JATS XML.
 *
 * @param string          $doi
 * @param CrossrefService $crossref
 * @return array|null ['text' => string, 'has_abstract' => bool], or null when
 *                    fetchWork() does not return an array
 */
private function extractCrossrefBlock($doi, CrossrefService $crossref)
{
    $work = $crossref->fetchWork($doi);
    if (!is_array($work)) {
        return null;
    }

    $meta = $crossref->fetchWorkSummary($doi);
    if (!is_array($meta)) {
        $meta = [];
    }

    $lines = ['Source: Crossref api.crossref.org/works/' . rawurlencode($doi)];

    // Prefer the title from the work record, then the one from the summary.
    if (isset($work['title'][0])) {
        $title = trim((string)$work['title'][0]);
    } else {
        $title = trim((string)$this->arrGet($meta, 'title', ''));
    }
    if ($title !== '') {
        $lines[] = 'Actual Title: ' . $title;
    }

    $summaryLabels = [
        'joura' => 'Journal: ',
        'author_str' => 'Authors: ',
        'dateno' => 'Publication: ',
        'doilink' => 'DOI Link: ',
    ];
    foreach ($summaryLabels as $key => $label) {
        if (!empty($meta[$key])) {
            $lines[] = $label . trim((string)$meta[$key]);
        }
    }

    if (!empty($meta['is_retracted'])) {
        $lines[] = 'Retraction: yes - ' . trim((string)$this->arrGet($meta, 'retract_reason', ''));
    }

    $abstract = $this->cleanCrossrefAbstract((string)$this->arrGet($work, 'abstract', ''));
    $hasAbstract = ($abstract !== '');
    $lines[] = $hasAbstract
        ? 'Abstract: ' . $this->truncate($abstract, 3500)
        : 'Note: Crossref 未返回摘要,请结合标题/期刊/作者与正文谨慎判断。';

    return ['text' => implode("\n", $lines), 'has_abstract' => $hasAbstract];
}
|
||
|
||
/**
 * Reduce a Crossref (JATS XML) abstract to plain text.
 *
 * <jats:title> elements are dropped with their content, each <jats:p> opening
 * tag becomes a newline, all other tags are removed, runs of spaces/tabs are
 * collapsed to one space and consecutive newlines to one newline.
 *
 * @param mixed $raw
 * @return string
 */
private function cleanCrossrefAbstract($raw)
{
    $text = trim((string)$raw);
    if ($text === '') {
        return '';
    }

    // JATS handling first, while the jats: tags are still present.
    $text = preg_replace(
        [
            '~<jats:title[^>]*>.*?</jats:title>~is',
            '~<jats:p[^>]*>~i',
            '~</jats:p>~i',
            '~</?jats:[^>]+>~i',
        ],
        ['', "\n", '', ''],
        $text
    );
    $text = strip_tags($text);

    // Whitespace normalisation on the remaining plain text.
    $text = preg_replace(
        ['/[ \t]+/u', "/\r\n|\r/u", "/\n{2,}/u"],
        [' ', "\n", "\n"],
        $text
    );

    return trim($text);
}
|
||
|
||
/**
 * Cut text to at most $max characters, appending '...' when it was cut.
 *
 * @param mixed $text
 * @param int   $max
 * @return string
 */
private function truncate($text, $max)
{
    $value = (string)$text;

    return mb_strlen($value) > $max
        ? mb_substr($value, 0, $max) . '...'
        : $value;
}
|
||
|
||
/**
 * Prepare the data for the second (DOI) review pass: the bibliographic text
 * plus the content fetched for the refer row's DOI.
 *
 * @param mixed  $refer     refer row
 * @param string $referText pre-formatted bibliographic text; when blank it is
 *                          produced by formatReferForLlm()
 * @return array{refer_text:string, doi_block:string, has_abstract:bool, doi_used:string}
 */
public function prepareRecheckPayload($refer, $referText = '')
{
    $bibliography = trim($referText);
    if ($bibliography === '') {
        $bibliography = $this->formatReferForLlm($refer);
    }

    $fetched = $this->fetchCrossrefAbstractByReferDoi($refer);

    return [
        'refer_text' => $bibliography,
        'doi_block' => $fetched['text'],
        'has_abstract' => $fetched['has_abstract'],
        'doi_used' => $fetched['doi'],
    ];
}
|
||
|
||
/**
 * Legacy interface: returns the recheck payload as one text block.
 *
 * Kept for backward compatibility; callers are encouraged to use
 * prepareRecheckPayload() instead.
 *
 * @param mixed  $refer
 * @param string $referText
 * @return string
 */
public function formatReferForDoiRecheck($refer, $referText = '')
{
    $payload = $this->prepareRecheckPayload($refer, $referText);

    if ($payload['doi_block'] !== '') {
        return $payload['refer_text']
            . "\n\n【Crossref 摘要(依据 Refer_doi 从 api.crossref.org/works 获取)】\n"
            . $payload['doi_block'];
    }

    // Nothing could be fetched: say so explicitly in the block.
    return $payload['refer_text']
        . "\n\n【DOI 文献真实内容】\n未能从 PubMed/Crossref 获取该 DOI 的摘要或元数据,请依据书目条目与正文谨慎判断。";
}
|
||
|
||
/**
 * Queue the second (DOI) review pass for a check record, delayed by 5 seconds,
 * when the first-pass confidence is at most 0.65 and an abstract can be fetched.
 *
 * Nothing is queued (and false is returned) when:
 *  - check_id is not positive, or the confidence is above 0.65;
 *  - the check record or its active (state = 0) refer row does not exist;
 *  - extractReferDoiOnly() yields no DOI for the refer row;
 *  - fetchCrossrefAbstractByReferDoi() reports no abstract.
 *
 * @param int   $checkId
 * @param float $confidence first-pass confidence
 * @return bool true when a second-pass job was queued
 */
public function maybeEnqueueSecondPass($checkId, $confidence)
{
    $id = intval($checkId);
    $score = floatval($confidence);
    if ($id <= 0) {
        return false;
    }
    if ($score > 0.65) {
        return false;
    }

    $record = Db::name('article_reference_check_result')->where('id', $id)->find();
    if (empty($record)) {
        return false;
    }

    $refer = null;
    if (intval($record['p_refer_id']) > 0) {
        $refer = Db::name('production_article_refer')
            ->where('p_refer_id', intval($record['p_refer_id']))
            ->where('state', 0)
            ->find();
    }
    if (empty($refer)) {
        return false;
    }
    if ($this->extractReferDoiOnly($refer) === '') {
        return false;
    }

    $fetched = $this->fetchCrossrefAbstractByReferDoi($refer);
    if (empty($fetched['has_abstract'])) {
        return false;
    }

    $this->clearReferenceCheckQueueLock($id);
    $this->pushJob2($id, 5);

    return true;
}
|
||
|
||
/**
 * Extract blue-tag citations from body HTML (or from flattened table HTML).
 *
 * Each returned record holds the raw tag label, the expanded reference numbers,
 * the local context text, and the byte offsets of both the tag and the context.
 * Tags with no reference number or no meaningful context are skipped.
 *
 * @param string $content
 * @return array list of citation records
 */
public function extractReferences($content)
{
    $hits = $this->collectBlueTagMatches($content);
    if (empty($hits[0])) {
        return [];
    }

    // First pass: byte span of every tag, needed to delimit neighbouring citations.
    $spans = [];
    foreach ($hits[0] as $i => $hit) {
        $spans[] = [
            'start' => $hit[1],
            'end' => $hit[1] + strlen($hit[0]),
            'index' => $i,
        ];
    }

    // Second pass: build one record per usable tag.
    $citations = [];
    foreach ($hits[0] as $i => $hit) {
        $from = $hit[1];
        $to = $from + strlen($hit[0]);
        $label = trim($hits[1][$i][0]);
        $numbers = $this->expandReferenceNumbers($label);

        list($ctxStart, $ctxEnd, $ctxText) = $this->extractLocalCitationContext(
            $content,
            $from,
            $to,
            $spans
        );

        if (empty($numbers) || !$this->isMeaningfulCitationContext($ctxText)) {
            continue;
        }

        $citations[] = [
            'reference_raw' => $label,
            'reference_numbers' => $numbers,
            'original_text' => $ctxText,
            'reference_start' => $from,
            'reference_end' => $to,
            'text_start' => $ctxStart,
            'text_end' => $ctxEnd,
        ];
    }

    return $citations;
}
|
||
|
||
/**
 * Cut the local context of one citation tag.
 *
 * The text before the tag is preferred. For the first citation of a paragraph
 * the context runs from the (length-capped) paragraph start to the tag; for
 * later citations it runs from the start of the current sentence. Only for a
 * paragraph's first citation may the text after the tag be used instead, and
 * only when the text before it is too short or not meaningful. If the result
 * is still not meaningful, the bounds are widened by neighbouring sentences.
 *
 * @param string $content
 * @param int    $tagStart byte offset where the tag begins
 * @param int    $tagEnd   byte offset just past the tag
 * @param array  $tagSpans spans of all tags (each with 'start' and 'end')
 * @return array [start offset, end offset, context text]
 */
private function extractLocalCitationContext($content, $tagStart, $tagEnd, array $tagSpans)
{
    $paraStart = $this->findParagraphStart($content, $tagStart);
    $sentEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd);

    // Nearest tag ending before this one (within the paragraph) and nearest
    // tag starting after it (within the sentence).
    $nearestPrevEnd = $paraStart;
    $nearestNextStart = $sentEnd;
    foreach ($tagSpans as $span) {
        $spanEnd = $span['end'];
        $spanStart = $span['start'];
        if ($spanEnd <= $tagStart && $spanEnd > $nearestPrevEnd) {
            $nearestPrevEnd = $spanEnd;
        }
        if ($spanStart > $tagEnd && $spanStart < $nearestNextStart) {
            $nearestNextStart = $spanStart;
        }
    }

    $isLaterCitation = ($nearestPrevEnd > $paraStart);
    $sentStart = $this->findSentenceStart($content, $tagStart);

    // Later citations start at the current sentence (which may begin before the
    // previous tag); the first citation takes the paragraph up to the tag.
    $ctxStart = $isLaterCitation
        ? max($paraStart, $sentStart)
        : $this->capContextStartBeforeTag($content, $tagStart, $paraStart);

    // Default: the statement in front of the tag.
    $ctxEnd = $tagStart;
    $ctxText = $this->buildCitationContextText($content, $ctxStart, $ctxEnd);

    // Trailing text is allowed only for the first citation of a paragraph;
    // for later ones it would pick up the next sentence by mistake.
    if (!$isLaterCitation) {
        $leadTooWeak = !$this->isMeaningfulCitationContext($ctxText)
            || $this->shouldUseTrailingCitationContext($content, $ctxStart, $tagStart, $tagEnd);
        if ($leadTooWeak) {
            $tailEnd = min($nearestNextStart, $sentEnd);
            $tailText = $this->buildCitationContextText($content, $tagEnd, $tailEnd);
            if ($this->isMeaningfulCitationContext($tailText)) {
                $ctxStart = $tagEnd;
                $ctxEnd = $tailEnd;
                $ctxText = $tailText;
            }
        }
    }

    if (!$this->isMeaningfulCitationContext($ctxText)) {
        list($ctxStart, $ctxEnd) = $this->widenCitationContextBounds(
            $content,
            $tagStart,
            $tagEnd,
            $ctxStart,
            $ctxEnd
        );
        $ctxText = $this->buildCitationContextText($content, $ctxStart, $ctxEnd);
    }

    return [$ctxStart, $ctxEnd, $ctxText];
}
|
||
|
||
/**
 * Whether the text after the tag should replace the text before it.
 *
 * True when the text between $localStart and the tag is not meaningful or is
 * shorter than 25 characters (e.g. only an author abbreviation).
 *
 * @param string $content
 * @param int    $localStart
 * @param int    $tagStart
 * @param int    $tagEnd     not used by this check
 * @return bool
 */
private function shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)
{
    $lead = $this->buildCitationContextText($content, $localStart, $tagStart);

    return !$this->isMeaningfulCitationContext($lead) || mb_strlen($lead) < 25;
}
|
||
|
||
/**
 * Expand a citation label such as "1,3-5" into its reference numbers.
 *
 * Full-width commas and the various dash characters are normalised first.
 * Ascending ranges are expanded, descending ranges and non-numeric parts are
 * ignored, and duplicates are removed while keeping first-seen order.
 *
 * @param string $refStr
 * @return int[]
 */
public function expandReferenceNumbers($refStr)
{
    $normalized = str_replace(
        [',', '–', '—', '−', '‐', '‑'],
        [',', '-', '-', '-', '-', '-'],
        trim($refStr)
    );

    $expanded = [];
    foreach (explode(',', $normalized) as $token) {
        $token = trim($token);
        if ($token === '') {
            continue;
        }
        if (ctype_digit($token)) {
            $expanded[] = intval($token);
            continue;
        }
        if (!preg_match('/^(\d+)\s*-\s*(\d+)$/', $token, $bounds)) {
            continue;
        }

        $low = intval($bounds[1]);
        $high = intval($bounds[2]);
        if ($low <= $high) {
            foreach (range($low, $high) as $number) {
                $expanded[] = $number;
            }
        }
    }

    return array_values(array_unique($expanded));
}
|
||
|
||
/**
 * Byte offset just past the UTF-8 character that starts at $bytePos
 * (i.e. where the next character begins).
 *
 * For an offset outside the string, $bytePos + 1 clamped to [0, length] is
 * returned.
 *
 * @param string $content
 * @param int    $bytePos
 * @return int
 */
private function utf8CharEnd($content, $bytePos)
{
    $total = strlen($content);
    if ($bytePos < 0 || $bytePos >= $total) {
        return max(0, min($total, $bytePos + 1));
    }

    // Skip continuation bytes (10xxxxxx) that belong to the same character.
    for ($cursor = $bytePos + 1; $cursor < $total; $cursor++) {
        if ((ord($content[$cursor]) & 0xC0) !== 0x80) {
            break;
        }
    }

    return $cursor;
}
|
||
|
||
/**
 * Cut $content between two byte offsets (the same unit strpos / strlen use).
 *
 * mb_substr must not be used here: it counts characters, so with a multi-byte
 * prefix the byte offsets would land in the wrong place and clip word starts.
 *
 * @param string $content
 * @param int    $start byte offset, inclusive
 * @param int    $end   byte offset, exclusive
 * @return string '' when $end is not greater than $start
 */
private function byteSubstr($content, $start, $end)
{
    $byteCount = $end - $start;
    if ($byteCount <= 0) {
        return '';
    }

    return (string)mb_strcut($content, $start, $byteCount, 'UTF-8');
}
|
||
|
||
/**
 * Build the plain-text context between two byte offsets of $content.
 *
 * Blue citation tags are removed via pregReplaceBlueTags(), remaining HTML is
 * stripped, whitespace runs are collapsed to single spaces and a leading UTF-8
 * BOM is dropped.
 *
 * The BOM is removed only as a complete 3-byte prefix. ltrim() with
 * "\xEF\xBB\xBF" would treat the argument as a set of single bytes and strip a
 * lone leading 0xEF, which is also the first byte of every U+F000–U+FFFF
 * character (e.g. full-width "("), leaving invalid UTF-8 behind.
 *
 * @param string $content
 * @param int    $start byte offset, inclusive
 * @param int    $end   byte offset, exclusive
 * @return string
 */
private function buildCitationContextText($content, $start, $end)
{
    $text = $this->byteSubstr($content, $start, $end);
    $text = $this->pregReplaceBlueTags($text, '');
    $text = trim(strip_tags($text));
    // preg_replace() yields null on malformed UTF-8; normalise that to ''.
    $text = (string)preg_replace('/\s+/u', ' ', $text);

    $bom = "\xEF\xBB\xBF";
    while (strncmp($text, $bom, 3) === 0) {
        // Whitespace that followed the BOM was hidden from the trim() above.
        $text = ltrim((string)substr($text, 3));
    }

    return $text;
}
|
||
|
||
/**
 * Whether a context text is usable.
 *
 * Rejects text that is empty, consists only of punctuation / symbols / spaces
 * (e.g. a lone "." left after removing the tag), contains no letter or digit,
 * or is shorter than two characters.
 *
 * @param string $text
 * @return bool
 */
private function isMeaningfulCitationContext($text)
{
    $candidate = trim($text);
    if ($candidate === '' || $this->isOnlyPunctuationOrSpace($candidate)) {
        return false;
    }
    if (!preg_match('/[\p{L}\p{N}]/u', $candidate)) {
        return false;
    }

    return mb_strlen($candidate) >= 2;
}
|
||
|
||
/**
 * Whether the text is non-empty and made up solely of whitespace,
 * punctuation (\p{P}) and symbol (\p{S}) characters.
 *
 * @param string $text
 * @return bool
 */
private function isOnlyPunctuationOrSpace($text)
{
    $matched = preg_match('/^[\s\p{P}\p{S}]+$/u', $text);

    return $matched === 1;
}
|
||
|
||
/**
 * Widen context bounds by one sentence on each side when the context is too short.
 *
 * If the widened span exceeds 2000 bytes, it is replaced by a 2000-byte window
 * centred on the tag.
 *
 * @param string $content
 * @param int    $tagStart
 * @param int    $tagEnd
 * @param int    $start current context start (byte offset)
 * @param int    $end   current context end (byte offset)
 * @return array [start, end]
 */
private function widenCitationContextBounds($content, $tagStart, $tagEnd, $start, $end)
{
    $total = strlen($content);
    $limit = 2000;
    $left = $start;
    $right = $end;

    if ($left > 0) {
        // Start of the sentence that ends just before the current start.
        $earlier = $this->findSentenceStart($content, max(0, $left - 1));
        if ($earlier < $left) {
            $left = $earlier;
        }
    }

    $later = $this->findSentenceEnd($content, $right, $tagEnd);
    if ($later > $right && $later <= $total) {
        $right = $later;
    }

    if ($right - $left > $limit) {
        $center = (int)floor(($tagStart + $tagEnd) / 2);
        $left = max(0, $center - (int)floor($limit / 2));
        $right = min($total, $left + $limit);
    }

    return [$left, $right];
}
|
||
|
||
/**
 * Whether the delimiter at $pos really ends a sentence.
 *
 * Only '.' is ever rejected; every other delimiter, and any out-of-range
 * position, counts as a boundary. A period is not a boundary when it is
 *  - a decimal point (digit on both sides),
 *  - the end of a listed abbreviation (et al, e.g, i.e, vs, etc, fig, no), or
 *  - directly followed by a <blue>[ citation tag.
 *
 * @param string $content
 * @param int    $pos       byte offset of the delimiter
 * @param string $delimiter
 * @return bool
 */
private function isSentenceDelimiterAt($content, $pos, $delimiter)
{
    if ($delimiter !== '.') {
        return true;
    }

    $total = strlen($content);
    if ($pos < 0 || $pos >= $total) {
        return true;
    }

    $hasBothNeighbours = ($pos > 0 && $pos + 1 < $total);
    if ($hasBothNeighbours && ctype_digit($content[$pos - 1]) && ctype_digit($content[$pos + 1])) {
        return false;
    }

    // Up to 12 bytes in front of the period are checked for an abbreviation.
    $lead = substr($content, max(0, $pos - 12), min(12, $pos));
    if (preg_match('/\b(et\s+al|e\.g|i\.e|vs|etc|fig|no)\s*\.?\s*$/i', $lead)) {
        return false;
    }

    // A citation right after the period still belongs to the sentence before it.
    $tail = substr($content, $pos + 1, 24);

    return !preg_match('/^\s*<blue>\s*\[/', $tail);
}
|
||
|
||
/**
 * Byte offset where the paragraph containing $tagStart begins.
 *
 * The latest boundary in front of the tag wins: the end of the last <p…> tag,
 * of the last </p> or <br> (each with trailing whitespace), or the position
 * after the last newline. Using the paragraph keeps multi-sentence English
 * paragraphs from being reduced to the text after their last period.
 *
 * @param string $content
 * @param int    $tagStart
 * @return int 0 when no boundary is found
 */
private function findParagraphStart($content, $tagStart)
{
    $prefix = substr($content, 0, max(0, $tagStart));
    if ($prefix === '') {
        return 0;
    }

    $boundary = 0;

    $tagPatterns = ['/<p[^>]*>/i', '/<\/p>\s*/i', '/<br\s*\/?>\s*/i'];
    foreach ($tagPatterns as $pattern) {
        if (preg_match_all($pattern, $prefix, $hits, PREG_OFFSET_CAPTURE)) {
            $lastHit = end($hits[0]);
            $boundary = max($boundary, $lastHit[1] + strlen($lastHit[0]));
        }
    }

    foreach (["\n\n", "\n"] as $needle) {
        $at = strrpos($prefix, $needle);
        if ($at !== false) {
            $boundary = max($boundary, $at + strlen($needle));
        }
    }

    return $boundary;
}
|
||
|
||
/**
 * Cap how far before the tag the context may start, so one LLM call does not
 * receive an over-long paragraph.
 *
 * When the paragraph start lies within $maxBytes of the tag it is returned
 * unchanged. Otherwise the start is moved to $maxBytes before the tag and then
 * forward to the first sentence boundary inside that window, if there is one.
 *
 * The cut position is first advanced to a UTF-8 character boundary. Without
 * this, a cut inside a multi-byte character makes the /u regex fail on the
 * malformed slice, so sentence alignment is silently skipped for CJK text and
 * a mid-character offset is returned.
 *
 * @param string $content
 * @param int    $tagStart       byte offset of the tag
 * @param int    $paragraphStart byte offset of the paragraph start
 * @param int    $maxBytes       maximum context length in bytes
 * @return int byte offset of the context start
 */
private function capContextStartBeforeTag($content, $tagStart, $paragraphStart, $maxBytes = 2500)
{
    if ($tagStart - $paragraphStart <= $maxBytes) {
        return $paragraphStart;
    }

    $start = $tagStart - $maxBytes;
    // Skip continuation bytes (10xxxxxx) so the window starts on a whole character.
    while ($start < $tagStart && (ord($content[$start]) & 0xC0) === 0x80) {
        $start++;
    }

    $slice = substr($content, $start, $tagStart - $start);
    if (preg_match('/[.!?。!?]\s+/u', $slice, $m, PREG_OFFSET_CAPTURE)) {
        // Begin right after the first sentence terminator and its whitespace.
        return $start + $m[0][1] + strlen($m[0][0]);
    }

    return max($paragraphStart, $start);
}
|
||
|
||
/**
 * Byte offset where the sentence containing $position begins.
 *
 * For each delimiter the search walks backwards from $position until it finds
 * an occurrence that isSentenceDelimiterAt() accepts; the latest accepted
 * boundary across all delimiters wins. Testing only the last occurrence is not
 * enough: when that period is a decimal point, an abbreviation or sits right
 * before a citation tag, every earlier period would be ignored and the start
 * would fall back to 0.
 *
 * @param string $content
 * @param int    $position byte offset to search backwards from
 * @return int 0 when no sentence boundary precedes $position
 */
private function findSentenceStart($content, $position)
{
    $start = 0;
    foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
        $limit = $position;
        while ($limit > 0) {
            $pos = strrpos(substr($content, 0, $limit), $delimiter);
            if ($pos === false) {
                break;
            }
            if ($this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
                $start = max($start, $this->utf8CharEnd($content, $pos));
                break;
            }
            // Rejected occurrence: continue with the text in front of it.
            $limit = $pos;
        }
    }

    return $start;
}
|
||
|
||
/**
 * Byte offset just past the end of the sentence found from $searchFrom onwards.
 *
 * For each delimiter the search walks forward until it finds an occurrence
 * that isSentenceDelimiterAt() accepts; the earliest accepted boundary wins.
 * Testing only the first occurrence is not enough: a rejected period (decimal
 * point, abbreviation, period before a citation tag) would hide every later
 * period and push the sentence end to the end of the content.
 *
 * When $tagEnd is positive and only punctuation / whitespace lies between the
 * tag and the boundary found (e.g. a stray period right after </blue>), that
 * boundary is skipped and the search continues behind it.
 *
 * @param string $content
 * @param int    $searchFrom byte offset to start searching from
 * @param int    $tagEnd     byte offset just past the citation tag; 0 disables
 *                           the stray-punctuation skip
 * @return int strlen($content) when no boundary is found
 */
private function findSentenceEnd($content, $searchFrom, $tagEnd = 0)
{
    $length = strlen($content);
    $minPos = max(0, $searchFrom);

    while ($minPos < $length) {
        $endPositions = [];
        foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
            $from = $minPos;
            while (($pos = strpos($content, $delimiter, $from)) !== false) {
                if ($this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
                    $endPositions[] = $this->utf8CharEnd($content, $pos);
                    break;
                }
                // Rejected occurrence: look at the next one of the same kind.
                $from = $pos + 1;
            }
        }
        if (empty($endPositions)) {
            return $length;
        }

        $end = min($endPositions);
        if ($tagEnd <= 0 || $end <= $tagEnd) {
            return $end;
        }

        $gap = substr($content, $tagEnd, $end - $tagEnd);
        $gapText = trim(strip_tags($this->pregReplaceBlueTags($gap, '')));
        if ($gapText !== '' && !$this->isOnlyPunctuationOrSpace($gapText)) {
            return $end;
        }

        // Nothing but punctuation between the tag and this boundary: keep going.
        $minPos = $end;
    }

    return $length;
}
|
||
|
||
/**
 * Queue already-stored check records in ascending reference number order.
 *
 * Ties are broken by am_id and then by text_start, so records of the same
 * reference keep a stable order. Each job is delayed by one more second than
 * the one before it, starting at 0.
 *
 * @param array $rows each element has check_id and reference_no, and
 *                    optionally am_id and text_start
 * @return int[] the check ids in the order they were queued
 */
private function pushJobsSortedByReferenceNo(array $rows)
{
    if (empty($rows)) {
        return [];
    }

    usort($rows, function ($left, $right) {
        if ($left['reference_no'] !== $right['reference_no']) {
            return $left['reference_no'] - $right['reference_no'];
        }
        foreach (['am_id', 'text_start'] as $key) {
            $leftValue = isset($left[$key]) ? intval($left[$key]) : 0;
            $rightValue = isset($right[$key]) ? intval($right[$key]) : 0;
            if ($leftValue !== $rightValue) {
                return $leftValue - $rightValue;
            }
        }

        return 0;
    });

    // usort() re-indexes from 0, so the index doubles as the delay in seconds.
    $queuedIds = [];
    foreach ($rows as $delay => $row) {
        $id = intval($row['check_id']);
        $queuedIds[] = $id;
        $this->pushJob($id, $delay);
    }

    return $queuedIds;
}
|
||
|
||
/**
 * Queue a first-pass ReferenceCheck job for one check record.
 *
 * The record's queue lock is cleared first. A positive delay queues the job
 * for later, otherwise it is pushed immediately. Queue failures are logged
 * and re-thrown.
 *
 * @param int $checkId
 * @param int $delaySeconds
 * @return void
 * @throws \Exception when the queue rejects the job
 */
private function pushJob($checkId, $delaySeconds = 0)
{
    $checkId = intval($checkId);
    $this->clearReferenceCheckQueueLock($checkId);

    $handler = 'app\api\job\ReferenceCheck@fire';
    $payload = ['check_id' => $checkId];

    try {
        if ($delaySeconds > 0) {
            Queue::later($delaySeconds, $handler, $payload, self::QUEUE_NAME);

            return;
        }
        Queue::push($handler, $payload, self::QUEUE_NAME);
    } catch (\Exception $e) {
        \think\Log::error('ReferenceCheck pushJob failed check_id=' . $checkId . ' ' . $e->getMessage());
        throw $e;
    }
}
|
||
/**
 * Queue a second-pass ReferenceCheckTwo job for one check record.
 *
 * Unlike pushJob(), this method does not clear the queue lock itself. A
 * positive delay queues the job for later, otherwise it is pushed immediately.
 * Queue failures are logged and re-thrown.
 *
 * @param int $checkId
 * @param int $delaySeconds
 * @return void
 * @throws \Exception when the queue rejects the job
 */
private function pushJob2($checkId, $delaySeconds = 0)
{
    $handler = 'app\api\job\ReferenceCheckTwo@fire';
    $payload = ['check_id' => $checkId];

    try {
        if ($delaySeconds > 0) {
            Queue::later($delaySeconds, $handler, $payload, self::QUEUE_NAME);

            return;
        }
        Queue::push($handler, $payload, self::QUEUE_NAME);
    } catch (\Exception $e) {
        \think\Log::error('ReferenceCheckTwo pushJob failed check_id=' . $checkId . ' ' . $e->getMessage());
        throw $e;
    }
}
|
||
}
|