Files
tougao/application/common/ReferenceCheckService.php
2026-06-02 09:57:51 +08:00

3212 lines
116 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use think\Db;
use think\Env;
use app\common\service\LLMService;
use app\common\mq\ReferenceCheckMqPublisher;
/**
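* Relevance check between in-text <blue>[n]</blue> citations and the matching t_production_article_refer row (index + 1 = n).
* LLM configuration is the same as PromotionLlmService; asynchronous jobs go through RabbitMQ (one message per article).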
*/
class ReferenceCheckService
{
/** Asynchronous transport name returned by the API (RabbitMQ, article batches). */
const TRANSPORT_RABBITMQ = 'rabbitmq';
/** Values of t_article_main.type. */
const MAIN_TYPE_TEXT = 0;
const MAIN_TYPE_IMAGE = 1;
const MAIN_TYPE_TABLE = 2;
/** Values of t_article_main.ref_check_status (requires running sql/article_main_ref_check_status.sql). */
const AM_STATUS_NONE = 0;
const AM_STATUS_PASS = 1;
const AM_STATUS_FAIL = 2;
const AM_STATUS_RUNNING = 3;
/** @var bool|null Whether t_article_main already has the ref_check_status column (null = not probed yet). */
private static $amRefCheckStatusColumnExists = null;
/** Maximum number of retries for a single job (not counting the first execution). */
const QUEUE_MAX_RETRY = 1;
/**
 * Reference-check status in lifecycle order (0 -> 1 -> 2 -> 3 = pending -> checking -> completed -> failed).
 *
 * The same constants are shared by two dimensions:
 * - a single detail row (article_reference_check_result.status) only takes {0, 2, 3}; a row is never "checking";
 * - a group (progress_status, aggregated by reference_no) uses all four values; 1 = some rows finished, some still 0.
 */
const PROGRESS_PENDING = 0; // pending: row status=0 / every row in the group has status=0
const PROGRESS_CHECKING = 1; // checking: group level only; some rows finished, some still 0
const PROGRESS_COMPLETED = 2; // completed: row status=2 / every row in the group has status=2
const PROGRESS_FAILED = 3; // failed: row status=3 / all rows in the group finished and at least one has status=3
/** Reference-check status of the whole article (the externally visible overall state, used to route the "start/reset" button). */
const ARTICLE_PROGRESS_NONE = 0; // no check records exist yet
const ARTICLE_PROGRESS_RUNNING = 1; // at least one row has status=0 (the queue still has unfinished work)
const ARTICLE_PROGRESS_COMPLETED = 2; // every row has status != 0 (all completed or failed)
/**
 * Status of a single check detail row (DB column article_reference_check_result.status).
 *
 * Only the three values actually written to the DB are listed. "Checking" (value 1) is group-level only
 * and never appears on a row. The numbers line up with PROGRESS_* (same semantics) so that front end and
 * back end can use them interchangeably.
 */
const RECORD_PENDING = 0; // pending: enqueued but not yet picked up by a worker
const RECORD_COMPLETED = 2; // check completed
const RECORD_FAILED = 3; // check failed
/** Queue execution status (queue_status). */
const QUEUE_PENDING = 0; // enqueued, waiting to run
const QUEUE_RUNNING = 1; // a worker is executing it
const QUEUE_COMPLETED = 2; // execution finished
const QUEUE_FAILED = 3; // final failure (retries exhausted)
/** Pass threshold for the LLM score (confidence): a value >= this counts as "pass". */
const PASS_CONFIDENCE_THRESHOLD = 0.65;
/**
 * The two layouts of in-text citation tags (both patterns use the /u modifier):
 * 1) <blue>[8, 9]</blue>, <blue>[13-15]</blue> -- square brackets inside the blue tag
 * 2) [<blue>13-15</blue>] -- square brackets wrapping the blue tag
 *
 * In both patterns capture group 1 is the number list (may contain commas, range hyphens and their
 * typographic variants).
 */
const BLUE_TAG_REGEX = '/<blue>\[([\d,\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)\]<\/blue>/u';
const BLUE_TAG_REGEX_BRACKET_OUTSIDE = '/\[<blue>([\d,\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)<\/blue>\]/u';
/** @var string Log file path, assembled in the constructor. */
private $logFile;
/**
 * Points $logFile at plagiarism_task.log inside the "runtime" directory under ROOT_PATH.
 */
public function __construct()
{
    $runtimeDir = ROOT_PATH . 'runtime' . DS;
    $this->logFile = $runtimeDir . 'plagiarism_task.log';
}
/**
 * Reads $arr[$key], falling back to $default when the key is missing or its value is null.
 *
 * Stand-in for the `??` operator so the class also runs on PHP versions that lack it.
 *
 * @param array|mixed $arr     Container to read from.
 * @param int|string  $key     Key to look up.
 * @param mixed       $default Value returned when the key is not set.
 * @return mixed
 */
private function arrGet($arr, $key, $default = '')
{
    if (isset($arr[$key])) {
        return $arr[$key];
    }
    return $default;
}
/**
 * Adds the initial queue columns to the field set of a new or reset check record.
 *
 * @param array $fields      Column => value pairs of the record.
 * @param int   $queueStatus Initial queue_status (defaults to QUEUE_PENDING).
 * @param int   $retryCount  Initial retry_count; negative values are clamped to 0.
 * @return array $fields with queue_status and retry_count set (existing values are overwritten).
 */
private function newCheckRecordFields(array $fields, $queueStatus = self::QUEUE_PENDING, $retryCount = 0)
{
    $retries = intval($retryCount);
    if ($retries < 0) {
        $retries = 0;
    }
    $fields['queue_status'] = intval($queueStatus);
    $fields['retry_count'] = $retries;
    return $fields;
}
/**
 * Updates the queue columns of one check record.
 *
 * @param int      $checkId     Primary key (id) in article_reference_check_result.
 * @param int      $queueStatus New queue_status value.
 * @param int|null $retryCount  New retry_count (clamped to >= 0); null leaves the column untouched.
 * @return int Result of the update() call, or 0 when $checkId is not a positive integer.
 */
public function markQueueRuntime($checkId, $queueStatus, $retryCount = null)
{
    $id = intval($checkId);
    if ($id <= 0) {
        return 0;
    }
    $changes = ['queue_status' => intval($queueStatus)];
    if (null !== $retryCount) {
        $changes['retry_count'] = max(0, intval($retryCount));
    }
    $query = Db::name('article_reference_check_result')->where('id', $id);
    return $query->update($changes);
}
/**
 * Matches both blue citation layouts and merges the hits, ordered by their start offset in the text.
 *
 * Every entry is an [text, offset] pair as produced by PREG_OFFSET_CAPTURE.
 *
 * @param string $content Text to scan.
 * @return array{0: array, 1: array} Index 0: full matches, index 1: capture group 1 (same layout as preg_match_all).
 */
private function collectBlueTagMatches($content)
{
    $hits = [];
    $patterns = [self::BLUE_TAG_REGEX, self::BLUE_TAG_REGEX_BRACKET_OUTSIDE];
    foreach ($patterns as $regex) {
        $found = preg_match_all($regex, $content, $captured, PREG_OFFSET_CAPTURE);
        if (!$found) {
            // 0 matches or a PCRE failure: nothing to collect for this layout.
            continue;
        }
        foreach ($captured[0] as $idx => $fullMatch) {
            $hits[] = ['full' => $fullMatch, 'inner' => $captured[1][$idx]];
        }
    }
    // Order by the offset of the full match.
    usort($hits, function ($left, $right) {
        return $left['full'][1] - $right['full'][1];
    });
    $fullMatches = [];
    $innerMatches = [];
    foreach ($hits as $hit) {
        $fullMatches[] = $hit['full'];
        $innerMatches[] = $hit['inner'];
    }
    return [$fullMatches, $innerMatches];
}
/**
 * Runs preg_replace for both blue citation layouts, one after the other.
 *
 * preg_replace() returns null when PCRE fails (for example on invalid UTF-8 with the /u modifier).
 * In that case the text is kept as it was before the failing pass instead of being replaced by null,
 * so a PCRE failure can no longer silently wipe the content.
 *
 * @param string|array $subject     Text to process.
 * @param string|array $replacement Replacement passed to preg_replace.
 * @return string|array The processed text.
 */
private function pregReplaceBlueTags($subject, $replacement)
{
    $patterns = [self::BLUE_TAG_REGEX, self::BLUE_TAG_REGEX_BRACKET_OUTSIDE];
    foreach ($patterns as $pattern) {
        $replaced = preg_replace($pattern, $replacement, $subject);
        if ($replaced === null) {
            // PCRE error: keep the text from before this pass.
            continue;
        }
        $subject = $replaced;
    }
    return $subject;
}
/**
 * Enqueues a single check; body text and reference text are supplied by the caller.
 *
 * NOTE(review): this method writes content_a / content_b / reference_raw / cite_tag_* columns, while the
 * other enqueue* methods in this class write origin_text / refer_text. Confirm against the table schema.
 *
 * @param string $contentA Body text (required; trimmed).
 * @param string $contentB Reference text (trimmed).
 * @param array  $extra    Optional ids and positions: article_id, am_id, p_article_id, p_refer_id,
 *                         refer_index, reference_no, reference_raw, cite_tag_start, cite_tag_end,
 *                         text_start, text_end.
 * @return array{check_id: mixed, queued: int}
 * @throws \InvalidArgumentException When $contentA is empty after trimming.
 */
public function enqueue($contentA, $contentB, array $extra = [])
{
    $contentA = trim($contentA);
    if ('' === $contentA) {
        throw new \InvalidArgumentException('content_a is required');
    }
    $amId = intval($this->arrGet($extra, 'am_id', 0));
    $pArticleId = intval($this->arrGet($extra, 'p_article_id', 0));
    $timestamp = date('Y-m-d H:i:s');
    $record = [
        'article_id' => intval($this->arrGet($extra, 'article_id', 0)),
        'am_id' => $amId,
        'p_article_id' => $pArticleId,
        'p_refer_id' => intval($this->arrGet($extra, 'p_refer_id', 0)),
        'refer_index' => intval($this->arrGet($extra, 'refer_index', 0)),
        'reference_no' => intval($this->arrGet($extra, 'reference_no', 0)),
        'reference_raw' => (string)$this->arrGet($extra, 'reference_raw', ''),
        'cite_tag_start' => intval($this->arrGet($extra, 'cite_tag_start', 0)),
        'cite_tag_end' => intval($this->arrGet($extra, 'cite_tag_end', 0)),
        'text_start' => intval($this->arrGet($extra, 'text_start', 0)),
        'text_end' => intval($this->arrGet($extra, 'text_end', 0)),
        'content_a' => $contentA,
        'content_b' => trim($contentB),
        'status' => self::RECORD_PENDING,
        'created_at' => $timestamp,
        'updated_at' => $timestamp,
    ];
    $checkId = Db::name('article_reference_check_result')
        ->insertGetId($this->newCheckRecordFields($record));
    if ($amId > 0) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
    }
    $this->startArticleCheckQueue([intval($checkId)], $pArticleId, 'enqueue');
    return ['check_id' => $checkId, 'queued' => 1];
}
/**
 * Enqueues reference checks for one article_main section.
 *
 * When $main carries an am_id but no type (or is a table section without amt_id), the row is reloaded
 * from article_main and merged underneath the caller-supplied values. One check record is inserted per
 * (citation, reference number) pair whose reference exists in the article's reference map.
 *
 * Section status afterwards:
 * - no citations in the section      -> AM_STATUS_NONE
 * - at least one record was inserted -> AM_STATUS_RUNNING
 * - citations exist but every one was skipped -> status is recomputed from the rows that already exist
 *   (previously the section was marked RUNNING although nothing had been enqueued for it).
 *
 * NOTE(review): production_article is looked up with state = 0 here, while enqueueByArticle() uses
 * state IN (0, 2) -- confirm which one is intended.
 *
 * @param array $main article_main row (am_id, article_id, content, type, amt_id).
 * @return void
 * @throws \RuntimeException When no production_article exists for the section's article.
 */
public function enqueueByArticleMain($main){
    $amId = intval($this->arrGet($main, 'am_id', 0));
    $needsReload = !isset($main['type'])
        || (intval($main['type']) === self::MAIN_TYPE_TABLE && intval($this->arrGet($main, 'amt_id', 0)) <= 0);
    if ($amId > 0 && $needsReload) {
        $dbMain = Db::name('article_main')
            ->field('am_id,content,article_id,type,amt_id')
            ->where('am_id', $amId)
            ->whereIn('state', [0, 2])
            ->find();
        if (!empty($dbMain)) {
            // Caller-supplied values win over what is stored.
            $main = array_merge($dbMain, $main);
        }
    }
    $citations = $this->extractReferencesForArticleMain($main);
    if (empty($citations)) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
        return;
    }
    // Read defensively: a missing article_id now leads to the "not found" exception below
    // instead of an undefined-index notice.
    $articleId = $this->arrGet($main, 'article_id', 0);
    $prod = Db::name('production_article')
        ->where('article_id', $articleId)
        ->where('state', 0)
        ->find();
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found for article_id=' . $articleId);
    }
    $pArticleId = intval($prod['p_article_id']);
    $referMap = $this->loadReferMapByPArticleId($pArticleId);
    $pendingJobs = [];
    $now = date('Y-m-d H:i:s');
    foreach ($citations as $cite) {
        foreach ($cite['reference_numbers'] as $refNo) {
            // Reference number n maps to index n - 1 in the reference map.
            $referIndex = $refNo - 1;
            if ($referIndex < 0 || !isset($referMap[$referIndex])) {
                continue;
            }
            $refer = $referMap[$referIndex];
            $checkId = Db::name('article_reference_check_result')->insertGetId($this->newCheckRecordFields([
                'article_id' => $articleId,
                'p_article_id' => $pArticleId,
                'am_id' => $amId,
                'reference_no' => $refNo,
                'refer_index' => $refNo,
                'origin_text' => $cite['original_text'],
                'refer_text' => $this->formatReferForLlm($refer),
                'p_refer_id' => $refer['p_refer_id'],
                'text_start' => $cite['text_start'],
                'text_end' => $cite['text_end'],
                'status' => self::RECORD_PENDING,
                'created_at' => $now,
                'updated_at' => $now,
            ]));
            $pendingJobs[] = [
                'check_id' => intval($checkId),
                'reference_no' => intval($refNo),
                'am_id' => $amId,
                'text_start' => intval($cite['text_start']),
            ];
        }
    }
    $this->enqueueChecksSortedByReferenceNo($pendingJobs, $pArticleId, 'enqueue');
    if (!empty($pendingJobs)) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
    } else {
        // Nothing was enqueued: derive the status from existing rows rather than reporting RUNNING.
        $this->syncAmRefCheckStatus($amId);
    }
}
/**
 * Manual trigger: synchronously runs the second-pass review (Crossref, per the original note) on
 * completed records whose confidence is at or below PASS_CONFIDENCE_THRESHOLD.
 *
 * At most 2 records are processed per call, picked in random order.
 *
 * NOTE(review): the filter is "confidence <= threshold" while a record passes with
 * "confidence >= threshold", so a record sitting exactly on the threshold is both passing and
 * eligible for a second pass -- confirm this is intended.
 *
 * @param int $articleId
 * @return array{article_id: int, check_ids2: int[], queued: int}
 * @throws \InvalidArgumentException When $articleId is not a positive integer.
 */
public function enqueueSecondPassByArticle($articleId)
{
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }
    $rows = Db::name('article_reference_check_result')
        ->where('article_id', $articleId)
        ->where('status', self::RECORD_COMPLETED)
        ->where('confidence', '<=', self::PASS_CONFIDENCE_THRESHOLD)
        ->orderRaw('rand()')
        ->limit(2)
        ->select();
    $checkIds2 = [];
    foreach ($rows as $checkLog) {
        $rowId = $this->resolveCheckRowId($checkLog);
        if ($rowId <= 0) {
            // No usable primary key on this row: nothing to review.
            continue;
        }
        if ($this->runSecondPassIfNeeded($rowId, floatval($checkLog['confidence']))) {
            $checkIds2[] = $rowId;
        }
    }
    return [
        'article_id' => $articleId,
        'check_ids2' => $checkIds2,
        'queued' => count($checkIds2),
    ];
}
/**
 * Enqueues reference checks for every section of the article behind a production_article row.
 *
 * Each citation is expanded into one check record per reference number (e.g. [70-73] becomes four
 * records); reference numbers missing from the reference map are counted as skipped. The records are
 * inserted first and then handed to enqueueChecksSortedByReferenceNo(). Sections without citations are
 * set to AM_STATUS_NONE; sections that received at least one record are set to AM_STATUS_RUNNING.
 *
 * @param array $prod production_article row (needs p_article_id and article_id).
 * @return array{article_id: int, p_article_id: int, queued: int, skipped: int, check_ids: mixed, queue: string}
 * @throws \RuntimeException When $prod is empty or the article has no article_main rows.
 */
public function enqueueByPArticle($prod){
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found');
    }
    $pArticleId = intval($prod['p_article_id']);
    $articleId = intval($prod['article_id']);
    $referMap = $this->loadReferMapByPArticleId($pArticleId);
    $sections = Db::name('article_main')
        ->field('am_id,content,article_id,type,amt_id')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->order('sort asc')
        ->select();
    if (empty($sections)) {
        throw new \RuntimeException('article_main is empty');
    }
    $queuedCount = 0;
    $skippedCount = 0;
    $jobs = [];
    $sectionsWithJobs = [];
    $timestamp = date('Y-m-d H:i:s');
    foreach ($sections as $section) {
        $amId = intval($section['am_id']);
        $citations = $this->extractReferencesForArticleMain($section);
        if (empty($citations)) {
            $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
            continue;
        }
        foreach ($citations as $citation) {
            foreach ($citation['reference_numbers'] as $refNo) {
                // Reference number n maps to index n - 1 in the reference map.
                $mapIndex = $refNo - 1;
                $isKnown = $mapIndex >= 0 && isset($referMap[$mapIndex]);
                if (!$isKnown) {
                    $skippedCount++;
                    continue;
                }
                $refer = $referMap[$mapIndex];
                $referText = $this->formatReferForLlm($refer);
                $record = $this->newCheckRecordFields([
                    'article_id' => $section['article_id'],
                    'p_article_id' => $pArticleId,
                    'am_id' => $amId,
                    'reference_no' => $refNo,
                    'refer_index' => $refNo,
                    'origin_text' => $citation['original_text'],
                    'refer_text' => $referText,
                    'p_refer_id' => $refer['p_refer_id'],
                    'text_start' => $citation['text_start'],
                    'text_end' => $citation['text_end'],
                    'status' => self::RECORD_PENDING,
                    'created_at' => $timestamp,
                    'updated_at' => $timestamp,
                ]);
                $checkId = Db::name('article_reference_check_result')->insertGetId($record);
                $jobs[] = [
                    'check_id' => intval($checkId),
                    'reference_no' => intval($refNo),
                    'am_id' => $amId,
                    'text_start' => intval($citation['text_start']),
                ];
                $queuedCount++;
                $sectionsWithJobs[$amId] = true;
            }
        }
    }
    $checkIds = $this->enqueueChecksSortedByReferenceNo($jobs, $pArticleId, 'enqueue');
    foreach (array_keys($sectionsWithJobs) as $runningAmId) {
        $this->setAmRefCheckStatus($runningAmId, self::AM_STATUS_RUNNING);
    }
    return [
        'article_id' => $articleId,
        'p_article_id' => $pArticleId,
        'queued' => $queuedCount,
        'skipped' => $skippedCount,
        'check_ids' => $checkIds,
        'queue' => self::TRANSPORT_RABBITMQ,
    ];
}
/**
 * Enqueues reference checks for a whole article, identified by article_id.
 *
 * Looks up the production_article row (state 0 or 2) and delegates to enqueueByPArticle(), which
 * performs the same per-section expansion and enqueueing this method used to duplicate line for line.
 *
 * @param int $articleId
 * @return array{article_id: int, p_article_id: int, queued: int, skipped: int, check_ids: mixed, queue: string}
 * @throws \InvalidArgumentException When $articleId is not a positive integer.
 * @throws \RuntimeException When no production_article exists, or the article has no article_main rows.
 */
public function enqueueByArticle($articleId){
    // Normalise first, as the other id-taking methods of this class do.
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }
    $prod = Db::name('production_article')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->find();
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found for article_id=' . $articleId);
    }
    return $this->enqueueByPArticle($prod);
}
/**
 * Recomputes t_article_main.ref_check_status for a section from all of its check records.
 *
 * Rules, evaluated in order:
 * - no records                                              -> AM_STATUS_NONE
 * - any record still pending                                -> AM_STATUS_RUNNING
 * - any failed record, or a completed one with is_match = 0 -> AM_STATUS_FAIL
 * - every record completed                                  -> AM_STATUS_PASS
 * - anything else (unexpected status values)                -> AM_STATUS_FAIL
 *
 * Only the status and is_match columns are fetched; the long text columns are not needed here.
 *
 * @param int $amId
 * @return int The AM_STATUS_* value that was computed (AM_STATUS_NONE for a non-positive $amId).
 */
public function syncAmRefCheckStatus($amId)
{
    $amId = intval($amId);
    if ($amId <= 0) {
        return self::AM_STATUS_NONE;
    }
    $rows = Db::name('article_reference_check_result')
        ->field('status,is_match')
        ->where('am_id', $amId)
        ->select();
    if (empty($rows)) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
        return self::AM_STATUS_NONE;
    }
    $pending = 0;
    $hasFail = false;
    $done = 0;
    foreach ($rows as $row) {
        $st = intval($row['status']);
        if ($st === self::RECORD_PENDING) {
            $pending++;
            continue;
        }
        if ($st === self::RECORD_FAILED || ($st === self::RECORD_COMPLETED && intval($row['is_match']) === 0)) {
            $hasFail = true;
        }
        if ($st === self::RECORD_COMPLETED) {
            $done++;
        }
    }
    if ($pending > 0) {
        $status = self::AM_STATUS_RUNNING;
    } elseif (!$hasFail && $done === count($rows)) {
        $status = self::AM_STATUS_PASS;
    } else {
        $status = self::AM_STATUS_FAIL;
    }
    $this->setAmRefCheckStatus($amId, $status);
    return $status;
}
/**
 * Tells whether t_article_main already has the ref_check_status column, so that writes can be skipped
 * on databases that have not been migrated (avoids "fields not exists" errors).
 *
 * A successful probe is cached in a static property. A probe that throws is NOT cached: caching it
 * would switch off all status writes for the rest of the process after a single transient DB error.
 *
 * @return bool
 */
private function hasAmRefCheckStatusColumn()
{
    if (self::$amRefCheckStatusColumnExists !== null) {
        return self::$amRefCheckStatusColumnExists;
    }
    try {
        $table = Db::name('article_main')->getTable();
        $rows = Db::query('SHOW COLUMNS FROM `' . str_replace('`', '``', $table) . '` LIKE \'ref_check_status\'');
    } catch (\Exception $e) {
        // Treat as "not available" for this call only; the next call probes again.
        return false;
    }
    self::$amRefCheckStatusColumnExists = !empty($rows);
    return self::$amRefCheckStatusColumnExists;
}
/**
 * Writes ref_check_status for one article_main row.
 *
 * Does nothing when $amId is not positive or the ref_check_status column is not available.
 *
 * @param int $amId
 * @param int $status One of the AM_STATUS_* values.
 * @return void
 */
public function setAmRefCheckStatus($amId, $status)
{
    if ($amId > 0 && $this->hasAmRefCheckStatusColumn()) {
        $query = Db::name('article_main')->where('am_id', $amId);
        $query->update(['ref_check_status' => $status]);
    }
}
/**
 * Deletes every check record of an article (by p_article_id) and resets the section-level
 * ref_check_status.
 *
 * Intended for the case where references were added or removed, so the old reference_no values are
 * shifted and the previous results are no longer valid. After the physical delete, the article-level
 * status query reports ARTICLE_PROGRESS_NONE again.
 *
 * @param int $pArticleId
 * @return int Number of deleted records (0 for a non-positive $pArticleId).
 */
public function clearArticleChecksByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        return 0;
    }
    // Resolve article_id first; it is needed to reset article_main.ref_check_status.
    $articleIdRaw = Db::name('production_article')
        ->where('p_article_id', $pArticleId)
        ->whereIn('state', [0, 2])
        ->value('article_id');
    $articleId = intval($articleIdRaw);
    $deletedCount = Db::name('article_reference_check_result')
        ->where('p_article_id', $pArticleId)
        ->delete();
    $canReset = $articleId > 0 && $this->hasAmRefCheckStatusColumn();
    if ($canReset) {
        Db::name('article_main')
            ->where('article_id', $articleId)
            ->whereIn('state', [0, 2])
            ->update(['ref_check_status' => self::AM_STATUS_NONE]);
    }
    return intval($deletedCount);
}
/**
 * Deletes every check record of an article (by article_id) and resets ref_check_status to
 * AM_STATUS_NONE on its article_main rows (state 0 or 2).
 *
 * @param int $articleId
 * @return int Number of deleted records (0 for a non-positive $articleId).
 */
public function clearArticleChecks($articleId)
{
    $id = intval($articleId);
    if ($id <= 0) {
        return 0;
    }
    $deletedCount = Db::name('article_reference_check_result')
        ->where('article_id', $id)
        ->delete();
    if ($this->hasAmRefCheckStatusColumn()) {
        $sections = Db::name('article_main')
            ->where('article_id', $id)
            ->whereIn('state', [0, 2]);
        $sections->update(['ref_check_status' => self::AM_STATUS_NONE]);
    }
    return intval($deletedCount);
}
/**
 * After a local move inside the reference list, refreshes reference_no / refer_index of the check
 * records that belong to the given p_refer_id values.
 *
 * The new number is read from production_article_refer (index + 1) rather than taken from the caller,
 * so stale values cannot be written. Only rows whose reference_no actually differs are updated.
 *
 * @param int[] $pReferIds  Affected p_refer_id values (usually two: the moved entry and its neighbour).
 * @param int   $pArticleId Optional extra p_article_id filter applied to both queries.
 * @return array{p_refer_ids:int[], affected_rows:int, changes:array}
 */
public function syncReferenceNoByPReferIds(array $pReferIds, $pArticleId = 0)
{
    // Normalise: integers only, zeros dropped, duplicates removed, keys re-indexed.
    $ids = array_values(array_unique(array_filter(array_map('intval', $pReferIds))));
    $pArticleId = intval($pArticleId);
    $result = [
        'p_refer_ids' => $ids,
        'affected_rows' => 0,
        'changes' => [],
    ];
    if (empty($ids)) {
        return $result;
    }
    $scopedToArticle = $pArticleId > 0;
    $referQuery = Db::name('production_article_refer')
        ->field('p_refer_id,p_article_id,index')
        ->whereIn('p_refer_id', $ids)
        ->where('state', 0);
    if ($scopedToArticle) {
        $referQuery->where('p_article_id', $pArticleId);
    }
    $refers = $referQuery->select();
    if (empty($refers)) {
        return $result;
    }
    $timestamp = date('Y-m-d H:i:s');
    foreach ($refers as $refer) {
        $pReferId = intval($refer['p_refer_id']);
        $newNo = intval($refer['index']) + 1;
        $updateQuery = Db::name('article_reference_check_result')
            ->where('p_refer_id', $pReferId)
            ->where('reference_no', '<>', $newNo);
        if ($scopedToArticle) {
            $updateQuery->where('p_article_id', $pArticleId);
        }
        $updated = $updateQuery->update([
            'reference_no' => $newNo,
            'refer_index' => $newNo,
            'updated_at' => $timestamp,
        ]);
        if ($updated <= 0) {
            continue;
        }
        $result['affected_rows'] += intval($updated);
        $result['changes'][] = [
            'p_refer_id' => $pReferId,
            'new_ref_no' => $newNo,
            'affected_rows' => intval($updated),
        ];
    }
    return $result;
}
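/**
* NOTE: detached doc comment. The method it describes, resetAndRecheckByArticle(), is defined further down in this class.
* Resets the reference check of a whole manuscript: delete the old detail records, clear the queue lock,
* then re-enqueue the full text for checking.
*
* @return array
*/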
/**
 * Overall reference-check status of an article, looked up by p_article_id.
 *
 * The unit counted is the reference (records grouped by reference_no), not the individual check
 * record: with 50 references and 111 records, total is 50.
 *
 * Returned status (whole article):
 *   0 = ARTICLE_PROGRESS_NONE       no check records at all
 *   1 = ARTICLE_PROGRESS_RUNNING    at least one reference still has pending records
 *   2 = ARTICLE_PROGRESS_COMPLETED  no reference has pending records
 *
 * Each reference falls into exactly one bucket:
 *   pending -- any record of the group has status = 0 (partially finished groups land here too)
 *   failed  -- no pending record and at least one record with status = 3
 *   done    -- neither of the above
 *
 * pending + done + failed = total; progress_percent = (done + failed) / total * 100, one decimal.
 * Per-group details are provided by getProgressByPArticleId().
 *
 * @param int $pArticleId
 * @return array{p_article_id:int, status:int, total:int, pending:int, done:int, failed:int, progress_percent:float}
 * @throws \InvalidArgumentException When $pArticleId is not a positive integer.
 */
public function getArticleProgressStatusByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        throw new \InvalidArgumentException('p_article_id is required');
    }
    // One aggregated query: one row per reference_no with its pending and failed record counts.
    $columns = 'reference_no'
        . ', SUM(CASE WHEN status = ' . self::RECORD_PENDING . ' THEN 1 ELSE 0 END) AS pending_cnt'
        . ', SUM(CASE WHEN status = ' . self::RECORD_FAILED . ' THEN 1 ELSE 0 END) AS failed_cnt';
    $groups = Db::name('article_reference_check_result')
        ->field($columns)
        ->where('p_article_id', $pArticleId)
        ->group('reference_no')
        ->select();
    if (empty($groups)) {
        return [
            'p_article_id' => $pArticleId,
            'status' => self::ARTICLE_PROGRESS_NONE,
            'total' => 0,
            'pending' => 0,
            'done' => 0,
            'failed' => 0,
            'progress_percent' => 0,
        ];
    }
    $counts = ['pending' => 0, 'done' => 0, 'failed' => 0];
    foreach ($groups as $group) {
        if (intval($this->arrGet($group, 'pending_cnt', 0)) > 0) {
            $bucket = 'pending';
        } elseif (intval($this->arrGet($group, 'failed_cnt', 0)) > 0) {
            $bucket = 'failed';
        } else {
            $bucket = 'done';
        }
        $counts[$bucket]++;
    }
    $total = count($groups);
    if ($counts['pending'] > 0) {
        $articleStatus = self::ARTICLE_PROGRESS_RUNNING;
    } else {
        $articleStatus = self::ARTICLE_PROGRESS_COMPLETED;
    }
    $finished = $counts['done'] + $counts['failed'];
    return [
        'p_article_id' => $pArticleId,
        'status' => $articleStatus,
        'total' => $total,
        'pending' => $counts['pending'],
        'done' => $counts['done'],
        'failed' => $counts['failed'],
        'progress_percent' => round($finished / $total * 100, 1),
    ];
}
/**
 * When several articles are being checked at once, reports how many articles are queued ahead of
 * the given one.
 *
 * An article counts as "being checked" while it has at least one record with status = pending (0).
 * Queue order is the ascending id of each article's earliest pending record.
 *
 * @param int $pArticleId
 * @return array{
 *   p_article_id:int,
 *   running_total:int,
 *   ahead:int,
 *   position:int,
 *   in_queue:bool,
 *   status:int
 * }
 * @throws \InvalidArgumentException When $pArticleId is not a positive integer.
 */
public function getArticleCheckQueuePositionByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        throw new \InvalidArgumentException('p_article_id is required');
    }
    $anchors = Db::name('article_reference_check_result')
        ->field('p_article_id, MIN(id) AS queue_anchor')
        ->where('status', self::RECORD_PENDING)
        ->group('p_article_id')
        ->order('queue_anchor', 'asc')
        ->select();
    // Article ids in queue order; rows without a usable id are ignored.
    $queue = [];
    foreach ($anchors as $anchor) {
        $queuedId = intval($this->arrGet($anchor, 'p_article_id', 0));
        if ($queuedId > 0) {
            $queue[] = $queuedId;
        }
    }
    $index = array_search($pArticleId, $queue, true);
    $inQueue = $index !== false;
    $progress = $this->getArticleProgressStatusByPArticleId($pArticleId);
    return [
        'p_article_id' => $pArticleId,
        'running_total' => count($queue),
        'ahead' => $inQueue ? $index : 0,
        'position' => $inQueue ? $index + 1 : 0,
        'in_queue' => $inQueue,
        'status' => intval($this->arrGet($progress, 'status', self::ARTICLE_PROGRESS_NONE)),
    ];
}
/**
 * Reference-check progress of an article (by p_article_id): records are grouped by reference_no,
 * each group gets an aggregated state, and every record is listed inside its group.
 *
 * Status values follow the lifecycle order (PROGRESS_* and RECORD_* share the numbers):
 *   0 = pending   1 = checking (group level only)   2 = completed   3 = failed
 *
 * Group state (field progress_status):
 *   - 0 = PROGRESS_PENDING    every record of the group is pending
 *   - 1 = PROGRESS_CHECKING   some records pending, some not
 *   - 2 = PROGRESS_COMPLETED  no pending and no failed record
 *   - 3 = PROGRESS_FAILED     no pending record and at least one failed record
 *
 * Fields of records[i]:
 *   - status      record status as stored
 *   - confidence  LLM score
 *   - is_pass     confidence >= PASS_CONFIDENCE_THRESHOLD
 *
 * A group's is_pass is true only when the group is PROGRESS_COMPLETED and every record passes.
 *
 * @param int $pArticleId
 * @return array{p_article_id:int, total_groups:int, summary:array, list:array}
 * @throws \InvalidArgumentException When $pArticleId is not a positive integer.
 */
public function getProgressByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        throw new \InvalidArgumentException('p_article_id is required');
    }
    $rows = Db::name('article_reference_check_result')
        ->field('id,p_refer_id,reference_no,am_id,status,confidence,is_match,reason,text_start,text_end,updated_at')
        ->where('p_article_id', $pArticleId)
        ->order('reference_no asc, id asc')
        ->select();
    // Group counters keyed by name; the matching PROGRESS_* value is noted next to each key.
    $summary = [
        'pending' => 0, // PROGRESS_PENDING = 0
        'checking' => 0, // PROGRESS_CHECKING = 1
        'completed' => 0, // PROGRESS_COMPLETED = 2
        'failed' => 0, // PROGRESS_FAILED = 3
    ];
    if (empty($rows)) {
        return [
            'p_article_id' => $pArticleId,
            'total_groups' => 0,
            'summary' => $summary,
            'list' => [],
        ];
    }
    // Pass 1: bucket the records by reference_no and count their statuses.
    $groups = [];
    foreach ($rows as $row) {
        $refNo = intval($this->arrGet($row, 'reference_no', 0));
        $pReferId = intval($this->arrGet($row, 'p_refer_id', 0));
        if (!isset($groups[$refNo])) {
            $groups[$refNo] = [
                'reference_no' => $refNo,
                'p_refer_id' => $pReferId,
                'total' => 0,
                'pending' => 0,
                'done' => 0,
                'failed' => 0,
                'pass' => 0,
                'last_updated_at' => '',
                'records' => [],
            ];
        }
        $group = $groups[$refNo];
        // One reference_no should map to one p_refer_id; keep the first positive id seen.
        if ($group['p_refer_id'] <= 0 && $pReferId > 0) {
            $group['p_refer_id'] = $pReferId;
        }
        $group['total']++;
        $recordStatus = intval($this->arrGet($row, 'status', 0));
        if (self::RECORD_PENDING === $recordStatus) {
            $group['pending']++;
        } elseif (self::RECORD_COMPLETED === $recordStatus) {
            $group['done']++;
        } elseif (self::RECORD_FAILED === $recordStatus) {
            $group['failed']++;
        }
        $updatedAt = (string)$this->arrGet($row, 'updated_at', '');
        if ($updatedAt > $group['last_updated_at']) {
            $group['last_updated_at'] = $updatedAt;
        }
        $confidence = floatval($this->arrGet($row, 'confidence', 0));
        $recordPasses = $confidence >= self::PASS_CONFIDENCE_THRESHOLD;
        if ($recordPasses) {
            $group['pass']++;
        }
        $group['records'][] = [
            'check_id' => intval($this->arrGet($row, 'id', 0)),
            'am_id' => intval($this->arrGet($row, 'am_id', 0)),
            'status' => $recordStatus,
            'confidence' => $confidence,
            'is_pass' => $recordPasses,
            'is_match' => intval($this->arrGet($row, 'is_match', 0)),
            'reason' => (string)$this->arrGet($row, 'reason', ''),
            'text_start' => intval($this->arrGet($row, 'text_start', 0)),
            'text_end' => intval($this->arrGet($row, 'text_end', 0)),
            'last_updated_at' => $updatedAt,
        ];
        $groups[$refNo] = $group;
    }
    // Pass 2: derive each group's state and feed the summary.
    $summaryKeyByStatus = [
        self::PROGRESS_PENDING => 'pending',
        self::PROGRESS_CHECKING => 'checking',
        self::PROGRESS_COMPLETED => 'completed',
        self::PROGRESS_FAILED => 'failed',
    ];
    $list = [];
    foreach ($groups as $group) {
        $groupTotal = $group['total'];
        $groupPending = $group['pending'];
        if ($groupPending === $groupTotal) {
            $progressStatus = self::PROGRESS_PENDING;
        } elseif ($groupPending !== 0) {
            $progressStatus = self::PROGRESS_CHECKING;
        } elseif ($group['failed'] > 0) {
            $progressStatus = self::PROGRESS_FAILED;
        } else {
            $progressStatus = self::PROGRESS_COMPLETED;
        }
        // Group passes only when it is fully completed and every record reached the threshold.
        $group['is_pass'] = (
            $progressStatus === self::PROGRESS_COMPLETED
            && $groupTotal > 0
            && $group['pass'] === $groupTotal
        );
        $summary[$summaryKeyByStatus[$progressStatus]]++;
        $group['progress_status'] = $progressStatus;
        $list[] = $group;
    }
    usort($list, function ($left, $right) {
        return $left['reference_no'] - $right['reference_no'];
    });
    return [
        'p_article_id' => $pArticleId,
        'total_groups' => count($list),
        'summary' => $summary,
        'list' => $list,
    ];
}
/**
 * Check records and aggregated progress of one reference, looked up by p_refer_id.
 *
 * Aggregated fields (same rules as a single list item of getProgressByPArticleId()):
 *   progress_status  0 pending, 1 checking, 2 completed, 3 failed
 *   pending / done / failed / pass, is_pass, progress_percent
 *
 * Each list item carries: check_id, am_id, status, confidence, reason, is_match, is_pass.
 *
 * When no record supplies a positive reference_no, it is taken from production_article_refer
 * (index + 1), and p_article_id is filled from the same row if still unknown.
 *
 * @param int $pReferId production_article_refer.p_refer_id
 * @return array
 * @throws \InvalidArgumentException When $pReferId is not a positive integer.
 */
public function getCheckDetailsByPReferId($pReferId)
{
    $pReferId = intval($pReferId);
    if ($pReferId <= 0) {
        throw new \InvalidArgumentException('p_refer_id is required');
    }
    $records = Db::name('article_reference_check_result')
        ->field('id,p_article_id,reference_no,am_id,status,confidence,is_match,reason,updated_at')
        ->where('p_refer_id', $pReferId)
        ->order('id asc')
        ->select();
    $list = [];
    $pArticleId = 0;
    $referenceNo = 0;
    $counts = ['pending' => 0, 'done' => 0, 'failed' => 0, 'pass' => 0];
    $lastUpdatedAt = '';
    foreach ($records as $record) {
        // Take both ids from the first record that provides a positive value.
        if ($pArticleId <= 0) {
            $pArticleId = intval($this->arrGet($record, 'p_article_id', 0));
        }
        if ($referenceNo <= 0) {
            $referenceNo = intval($this->arrGet($record, 'reference_no', 0));
        }
        $recordStatus = intval($this->arrGet($record, 'status', 0));
        if (self::RECORD_PENDING === $recordStatus) {
            $counts['pending']++;
        } elseif (self::RECORD_COMPLETED === $recordStatus) {
            $counts['done']++;
        } elseif (self::RECORD_FAILED === $recordStatus) {
            $counts['failed']++;
        }
        $updatedAt = (string)$this->arrGet($record, 'updated_at', '');
        if ($updatedAt > $lastUpdatedAt) {
            $lastUpdatedAt = $updatedAt;
        }
        $confidence = floatval($this->arrGet($record, 'confidence', 0));
        $recordPasses = $confidence >= self::PASS_CONFIDENCE_THRESHOLD;
        if ($recordPasses) {
            $counts['pass']++;
        }
        $list[] = [
            'check_id' => intval($this->arrGet($record, 'id', 0)),
            'am_id' => intval($this->arrGet($record, 'am_id', 0)),
            'status' => $recordStatus,
            'confidence' => $confidence,
            'reason' => (string)$this->arrGet($record, 'reason', ''),
            'is_match' => intval($this->arrGet($record, 'is_match', 0)),
            'is_pass' => $recordPasses,
        ];
    }
    if ($referenceNo <= 0) {
        $refer = Db::name('production_article_refer')
            ->where('p_refer_id', $pReferId)
            ->where('state', 0)
            ->find();
        if (!empty($refer)) {
            if ($pArticleId <= 0) {
                $pArticleId = intval($this->arrGet($refer, 'p_article_id', 0));
            }
            $referenceNo = intval($this->arrGet($refer, 'index', 0)) + 1;
        }
    }
    $total = count($list);
    $isPassGroup = false;
    if ($counts['pending'] === $total) {
        // Covers both "no records at all" (0 === 0) and "every record still pending".
        $progressStatus = self::PROGRESS_PENDING;
        $progressPercent = 0;
    } elseif ($counts['pending'] !== 0) {
        $progressStatus = self::PROGRESS_CHECKING;
        $progressPercent = round(($counts['done'] + $counts['failed']) / $total * 100, 1);
    } else {
        $progressStatus = $counts['failed'] > 0 ? self::PROGRESS_FAILED : self::PROGRESS_COMPLETED;
        $progressPercent = 100;
        $isPassGroup = (
            $progressStatus === self::PROGRESS_COMPLETED
            && $counts['pass'] === $total
        );
    }
    return [
        'p_refer_id' => $pReferId,
        'p_article_id' => $pArticleId,
        'reference_no' => $referenceNo,
        'total' => $total,
        'pending' => $counts['pending'],
        'done' => $counts['done'],
        'failed' => $counts['failed'],
        'pass' => $counts['pass'],
        'progress_status' => $progressStatus,
        'progress_percent' => $progressPercent,
        'is_pass' => $isPassGroup,
        'last_updated_at' => $lastUpdatedAt,
        'list' => $list,
    ];
}
/**
 * Resets the reference check of a whole manuscript: deletes the existing check records and
 * re-enqueues the full article.
 *
 * Only allowed when the article already has check records for its p_article_id.
 *
 * @param array $aProductionArticle production_article row; needs positive p_article_id and article_id.
 * @return array Result of enqueueByArticle() extended with 'cleared' (deleted record count) and 'reset' => 1.
 * @throws \InvalidArgumentException When the row is empty, not an array, or lacks either id.
 * @throws \RuntimeException When no check records exist yet for the p_article_id.
 */
public function resetAndRecheckByArticle($aProductionArticle)
{
    $isUsable = !empty($aProductionArticle) && is_array($aProductionArticle);
    if (!$isUsable) {
        throw new \InvalidArgumentException('production_article is required');
    }
    $pArticleId = intval($this->arrGet($aProductionArticle, 'p_article_id', 0));
    $articleId = intval($this->arrGet($aProductionArticle, 'article_id', 0));
    if ($pArticleId <= 0 || $articleId <= 0) {
        throw new \InvalidArgumentException('production_article requires both p_article_id and article_id');
    }
    $existingCount = Db::name('article_reference_check_result')
        ->where('p_article_id', $pArticleId)
        ->count();
    if (intval($existingCount) <= 0) {
        throw new \RuntimeException('no existing reference check records for p_article_id=' . $pArticleId);
    }
    $clearedCount = $this->clearArticleChecks($articleId);
    $result = $this->enqueueByArticle($articleId);
    if (!is_array($result)) {
        $result = [];
    }
    $result['cleared'] = $clearedCount;
    $result['reset'] = 1;
    return $result;
}
/**
 * Maps an AM_STATUS_* value to its short label.
 *
 * @param int $status
 * @return string 'none', 'pass', 'fail' or 'running'; 'unknown' for any other value.
 */
public static function amStatusLabel($status)
{
    $labels = [];
    $labels[self::AM_STATUS_NONE] = 'none';
    $labels[self::AM_STATUS_PASS] = 'pass';
    $labels[self::AM_STATUS_FAIL] = 'fail';
    $labels[self::AM_STATUS_RUNNING] = 'running';
    if (!isset($labels[$status])) {
        return 'unknown';
    }
    return $labels[$status];
}
/**
 * Extracts the record id from a row.
 *
 * The table's primary key is `id`, while the public API still calls the parameter `check_id`;
 * `id` is tried first, then `check_id`.
 *
 * @param mixed $row
 * @return int The first positive id found, or 0 when $row is not an array or holds no positive id.
 */
public function resolveCheckRowId($row)
{
    if (!is_array($row)) {
        return 0;
    }
    foreach (['id', 'check_id'] as $key) {
        if (!isset($row[$key])) {
            continue;
        }
        $candidate = intval($row[$key]);
        if ($candidate > 0) {
            return $candidate;
        }
    }
    return 0;
}
/**
 * Interprets the is_match value returned by the LLM.
 *
 * Accepts booleans (returned unchanged), numbers (true only when the integer part is exactly 1), and
 * strings: after trimming and lower-casing, '1', 'true', 'yes', 'match' and 'matched' mean true.
 *
 * @param mixed $value
 * @return bool
 */
public function parseLlmIsMatch($value)
{
    if (is_bool($value)) {
        return $value;
    }
    if (is_int($value) || is_float($value)) {
        return 1 === intval($value);
    }
    $affirmative = [
        '1' => true,
        'true' => true,
        'yes' => true,
        'match' => true,
        'matched' => true,
    ];
    $normalized = strtolower(trim((string)$value));
    return isset($affirmative[$normalized]);
}
/**
 * Writes the result of a single check.
 *
 * reason and error_msg are trimmed and cut to 512 characters so that a varchar(512) column cannot
 * make the UPDATE fail. The cut is done explicitly in UTF-8 (assumes the strings are UTF-8 -- TODO
 * confirm) so it does not depend on the ambient mb_internal_encoding() setting. updated_at is always
 * set to the current time.
 *
 * @param int   $checkId Primary key (id) in article_reference_check_result.
 * @param array $fields  Column => value pairs to write.
 * @return int Number of affected rows as reported by update().
 * @throws \InvalidArgumentException When $checkId is not a positive integer.
 * @throws \RuntimeException When the record does not exist or update() reports failure.
 */
public function updateCheckResult($checkId, array $fields)
{
    $checkId = intval($checkId);
    if ($checkId <= 0) {
        throw new \InvalidArgumentException('invalid check id');
    }
    foreach (['reason', 'error_msg'] as $textColumn) {
        if (isset($fields[$textColumn])) {
            $fields[$textColumn] = mb_substr(trim((string)$fields[$textColumn]), 0, 512, 'UTF-8');
        }
    }
    $fields['updated_at'] = date('Y-m-d H:i:s');
    // Existence check only: fetch just the key instead of the whole row with its long text columns.
    $exists = Db::name('article_reference_check_result')->field('id')->where('id', $checkId)->find();
    if (empty($exists)) {
        throw new \RuntimeException('article_reference_check_result not found, id=' . $checkId);
    }
    $affected = Db::name('article_reference_check_result')->where('id', $checkId)->update($fields);
    if ($affected === false) {
        throw new \RuntimeException('article_reference_check_result update failed, id=' . $checkId);
    }
    \think\Log::info('updateCheckResult id=' . $checkId . ' affected=' . intval($affected));
    return intval($affected);
}
/**
 * Fetch one check-result row by primary key.
 *
 * @param int $checkId primary key of article_reference_check_result
 * @return array|null the row, or null when the id is not positive or no row is found
 */
public function getResult($checkId)
{
    if ($checkId <= 0) {
        return null;
    }
    $record = Db::name('article_reference_check_result')
        ->where('id', $checkId)
        ->find();
    if (!$record) {
        return null;
    }
    return $record;
}
/**
 * List the check-result rows of an article, ordered by section, citation tag position
 * and reference number.
 *
 * @param int  $articleId    article_id to filter on
 * @param int  $status       when >= 0, only rows with exactly this status are returned
 * @param bool $onlyMismatch when true, additionally restricts to completed rows with is_match = 0
 * @return mixed the result of the query builder's select() call
 */
public function listByArticle($articleId, $status = -1, $onlyMismatch = false)
{
    $query = Db::name('article_reference_check_result')->where('article_id', $articleId);

    if ($status >= 0) {
        $query->where('status', $status);
    }
    if ($onlyMismatch) {
        $query->where('status', self::RECORD_COMPLETED);
        $query->where('is_match', 0);
    }

    $query->order('am_id asc, cite_tag_start asc, reference_no asc');
    return $query->select();
}
/**
 * Manuscript preview: mark unreasonable citation numbers and their citing sentences
 * on each section's content.
 *
 * Fix over the previous version: the "bad result" index used to be built from
 * listByArticle($articleId, 1). Detail rows never carry status 1 (that value is reserved
 * for the group-level "checking" state), so the index was always empty and nothing was
 * ever highlighted. All rows are now loaded once and handed to indexBadResults(), which
 * itself keeps only completed rows that are not a match. This also saves one query.
 *
 * @param int $articleId article whose sections are previewed
 * @param int $amId      optional; when > 0 only that section is returned (stats still cover the whole article)
 * @return array{article_id: int, article_ref_check_pass: bool|null, sections: array, issues: array, stats: array}
 */
public function buildArticlePreview($articleId, $amId = 0)
{
    $fields = 'am_id,content,sort,type,amt_id';
    if ($this->hasAmRefCheckStatusColumn()) {
        $fields .= ',ref_check_status';
    }
    $q = Db::name('article_main')
        ->field($fields)
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2]);
    if ($amId > 0) {
        $q->where('am_id', $amId);
    }
    $mains = $q->order('sort asc')->select();

    // One query feeds both the statistics and the per-section index of bad results.
    $allRows = $this->listByArticle($articleId, -1);
    $badByAm = $this->indexBadResults($allRows);

    $sections = [];
    $issues = [];
    $stats = ['total' => 0, 'mismatch' => 0, 'match' => 0, 'pending' => 0];
    foreach ($allRows as $r) {
        $stats['total']++;
        if (intval($r['status']) === self::RECORD_PENDING) {
            $stats['pending']++;
        } elseif (intval($r['is_match']) === 1) {
            $stats['match']++;
        } else {
            // Every non-pending row that is not a match lands here, failed rows included.
            $stats['mismatch']++;
        }
    }

    foreach ($mains as $main) {
        $id = intval($main['am_id']);
        $content = $this->resolveArticleMainCheckContent($main);
        $badIndex = isset($badByAm[$id]) ? $badByAm[$id] : [];
        $marked = $this->markContentForPreview($content, $id, $badIndex);
        $amStatus = intval($this->arrGet($main, 'ref_check_status', 0));
        $sections[] = [
            'am_id' => $id,
            'ref_check_status' => $amStatus,
            'ref_check_pass' => $amStatus === self::AM_STATUS_PASS,
            'ref_check_label' => self::amStatusLabel($amStatus),
            'content' => $content,
            'content_marked' => $marked['html'],
            'issue_count' => $marked['issue_count'],
        ];
        foreach ($marked['issues'] as $issue) {
            $issues[] = $issue;
        }
    }

    $articlePass = $this->resolveArticlePass($sections);
    return [
        'article_id' => $articleId,
        'article_ref_check_pass' => $articlePass,
        'sections' => $sections,
        'issues' => $issues,
        'stats' => $stats,
    ];
}
/**
 * Decide whether the whole article passed the reference check.
 *
 * Sections whose status is AM_STATUS_NONE are ignored. Any other non-pass status
 * (running / fail) makes the article fail immediately.
 *
 * @param array $sections section arrays carrying a 'ref_check_status' entry
 * @return bool|null true when at least one section was checked and all checked ones passed,
 *                   false when a checked section did not pass, null when nothing was checked
 */
private function resolveArticlePass($sections)
{
    $passedCount = 0;
    foreach ($sections as $section) {
        $status = intval($this->arrGet($section, 'ref_check_status', 0));
        if ($status === self::AM_STATUS_NONE) {
            continue;
        }
        if ($status !== self::AM_STATUS_PASS) {
            return false;
        }
        $passedCount++;
    }
    if ($passedCount === 0) {
        return null;
    }
    return true;
}
/**
 * Index the "bad" check results (completed, but not a match) by section.
 *
 * Rows that are not completed, that are a match, or that lack a positive am_id /
 * reference_no are skipped. For each section the result holds:
 *  - 'by_raw':   normalized reference_raw => [reference_no => row]
 *  - 'contexts': "<text_start>_<text_end>" => span info with the check ids, reference
 *                numbers and non-empty reasons collected for that span
 *
 * @param array $rows check-result rows
 * @return array<int, array> am_id => index structure described above
 */
private function indexBadResults($rows)
{
    $index = [];
    foreach ($rows as $row) {
        $isCompleted = intval($row['status']) === self::RECORD_COMPLETED;
        if (!$isCompleted || intval($row['is_match']) === 1) {
            continue;
        }

        $sectionId = intval($row['am_id']);
        $refNo = intval($row['reference_no']);
        if ($sectionId <= 0 || $refNo <= 0) {
            continue;
        }

        if (!isset($index[$sectionId])) {
            $index[$sectionId] = ['by_raw' => [], 'contexts' => []];
        }

        $rawKey = $this->normalizeRefRawKey((string)$this->arrGet($row, 'reference_raw', ''));
        if ($rawKey !== '') {
            $index[$sectionId]['by_raw'][$rawKey][$refNo] = $row;
        }

        $spanStart = intval($row['text_start']);
        $spanEnd = intval($row['text_end']);
        $spanKey = $spanStart . '_' . $spanEnd;
        if (!isset($index[$sectionId]['contexts'][$spanKey])) {
            $index[$sectionId]['contexts'][$spanKey] = [
                'text_start' => $spanStart,
                'text_end' => $spanEnd,
                'check_ids' => [],
                'reasons' => [],
                'ref_nos' => [],
            ];
        }

        $index[$sectionId]['contexts'][$spanKey]['check_ids'][] = $this->resolveCheckRowId($row);
        $index[$sectionId]['contexts'][$spanKey]['ref_nos'][] = $refNo;

        $reason = trim((string)$this->arrGet($row, 'reason', ''));
        if ($reason !== '') {
            $index[$sectionId]['contexts'][$spanKey]['reasons'][$refNo] = $reason;
        }
    }
    return $index;
}
/**
 * Normalize the raw inner text of a citation tag into a lookup key.
 *
 * The input is trimmed, a fixed list of punctuation variants is replaced by ',' / '-'
 * (and one variant removed), and the result is lower-cased.
 *
 * NOTE(review): several entries of the search list render as empty strings in this view;
 * they are presumably full-width / dash-like Unicode characters that are not displayed.
 * Confirm against the raw bytes of the file before editing this list.
 *
 * @param string $raw inner text of a citation tag, or a stored reference_raw value
 * @return string normalized key
 */
private function normalizeRefRawKey($raw)
{
$raw = str_replace(
['', '', '—', '', '', '', ' '],
[',', '-', '-', '-', '-', '-', ''],
trim($raw)
);
return strtolower($raw);
}
/**
 * Decorate one section's content for the preview.
 *
 * Two passes are applied:
 *  1. every citation tag returned by collectBlueTagMatches() is rebuilt as a
 *     <blue class="ref-cite-tag ..."> element; numbers found in the bad index are wrapped
 *     in <span class="ref-no-error"> and reported as issues;
 *  2. every context span of the bad index is wrapped in <span class="ref-context-error">.
 *
 * NOTE(review): all positions are handled with strlen()/substr(), i.e. as byte offsets;
 * this assumes text_start/text_end were recorded as byte offsets into the same content
 * that is passed in here — confirm against the code that writes them.
 *
 * @param string $content  section content; returned unchanged when empty or when nothing is bad
 * @param int    $amId     section id copied into every reported issue
 * @param array  $badIndex the single-section structure produced by indexBadResults
 * @return array with keys 'html' (decorated content), 'issues' (list) and 'issue_count' (int)
 */
private function markContentForPreview($content, $amId, $badIndex)
{
$badByRaw = isset($badIndex['by_raw']) ? $badIndex['by_raw'] : array();
$contexts = isset($badIndex['contexts']) ? $badIndex['contexts'] : array();
$issues = array();
$issueCount = 0;
// Nothing to mark: empty content, or no bad numbers and no bad context spans.
if ($content === '' || (empty($badByRaw) && empty($contexts))) {
return array('html' => $content, 'issues' => array(), 'issue_count' => 0);
}
$html = $content;
// 1) First mark the individual numbers inside each blue tag (working on the original text;
//    for a range such as [70-73] only the unreasonable numbers, e.g. 70 and 71, are marked).
// The match structure is used as [0][i] = [full tag, offset] and [1][i][0] = inner text,
// presumably a preg_match_all() result with PREG_OFFSET_CAPTURE.
$matches = $this->collectBlueTagMatches($html);
$citeDeltas = [];
if (!empty($matches[0])) {
$replacements = [];
foreach ($matches[0] as $idx => $match) {
$fullTag = $match[0];
$tagStart = $match[1];
$tagEnd = $tagStart + strlen($fullTag);
$inner = $matches[1][$idx][0];
$rawKey = $this->normalizeRefRawKey($inner);
// reference_no => row, for the bad results recorded under this tag's normalized text.
$badNums = isset($badByRaw[$rawKey]) ? $badByRaw[$rawKey] : array();
// Wrap each bad number of the inner text; numbers without a bad result stay untouched.
$innerMarked = preg_replace_callback(
'/\d+/',
function ($numMatch) use ($badNums, &$issues, &$issueCount, $amId, $inner) {
$num = intval($numMatch[0]);
if (!isset($badNums[$num])) {
return $numMatch[0];
}
$row = $badNums[$num];
$rowReason = isset($row['reason']) ? $row['reason'] : '';
$issueCount++;
$issues[] = array(
'am_id' => $amId,
'check_id' => $this->resolveCheckRowId($row),
'reference_no' => $num,
'reference_raw' => $inner,
'reason' => $rowReason,
'confidence' => floatval(isset($row['confidence']) ? $row['confidence'] : 0),
);
// The reason ends up in an HTML attribute, hence the escaping.
$title = htmlspecialchars(
'引用[' . $num . ']不合理: ' . $rowReason,
ENT_QUOTES,
'UTF-8'
);
return '<span class="ref-no-error" data-check-id="' . $this->resolveCheckRowId($row)
. '" data-ref-no="' . $num . '" title="' . $title . '">'
. $numMatch[0] . '</span>';
},
$inner
);
// The tag gets the error class and the id list only when it has at least one bad number.
$tagClass = !empty($badNums) ? ' ref-cite-error' : '';
$groupIds = !empty($badNums)
? implode(',', array_map(function ($row) {
return (int) $this->resolveCheckRowId($row);
}, $badNums))
: '';
$newHtml = '<blue class="ref-cite-tag' . $tagClass . '" data-ref-raw="' . htmlspecialchars($inner, ENT_QUOTES, 'UTF-8')
. '" data-check-ids="' . $groupIds . '">[' . $innerMarked . ']</blue>';
// 'delta' is how many bytes the rebuilt tag is longer than the original one.
$replacements[] = [
'start' => $tagStart,
'end' => $tagEnd,
'html' => $newHtml,
'delta' => strlen($newHtml) - ($tagEnd - $tagStart),
];
}
// Apply from the last tag to the first so the recorded offsets of earlier tags stay valid.
usort($replacements, function ($a, $b) {
return $b['start'] - $a['start'];
});
foreach ($replacements as $rep) {
$html = substr($html, 0, $rep['start']) . $rep['html'] . substr($html, $rep['end']);
$citeDeltas[] = ['start' => $rep['start'], 'delta' => $rep['delta']];
}
}
// Translate a position in the original content into a position in the rewritten HTML by
// adding the growth of every replaced tag that starts before that position.
$shiftByCite = function ($pos) use ($citeDeltas) {
$d = 0;
foreach ($citeDeltas as $cd) {
if ($cd['start'] < $pos) {
$d += $cd['delta'];
}
}
return $pos + $d;
};
// 2) Then mark the citing sentences (processed from the end of the text towards the start).
if (!empty($contexts)) {
$spans = array_values($contexts);
usort($spans, function ($a, $b) {
return $b['text_start'] - $a['text_start'];
});
foreach ($spans as $span) {
$start = $span['text_start'];
$end = $span['text_end'];
// Skip spans with a negative start or a non-positive length.
if ($start < 0 || $end <= $start) {
continue;
}
$s = $shiftByCite($start);
$e = $shiftByCite($end);
// Clamp the end to the current length of the HTML.
if ($e > strlen($html)) {
$e = strlen($html);
}
$checkIds = array_values(array_unique($span['check_ids']));
$refNos = array_values(array_unique($span['ref_nos']));
sort($refNos);
// Build "[n] reason" fragments for the reference numbers that carry a reason.
$reasonParts = [];
foreach ($refNos as $rn) {
if (!empty($span['reasons'][$rn])) {
$reasonParts[] = '[' . $rn . '] ' . $span['reasons'][$rn];
}
}
$title = htmlspecialchars(
'引用句可能不合理: ' . implode('; ', $reasonParts),
ENT_QUOTES,
'UTF-8'
);
$open = '<span class="ref-context-error" data-check-ids="' . implode(',', $checkIds)
. '" data-ref-nos="' . implode(',', $refNos) . '" title="' . $title . '">';
$close = '</span>';
$html = substr($html, 0, $s) . $open . substr($html, $s, $e - $s) . $close . substr($html, $e);
}
}
return ['html' => $html, 'issues' => $issues, 'issue_count' => $issueCount];
}
/**
 * Load the active (state = 0) reference rows of a production article, keyed by their index.
 *
 * @param int $pArticleId p_article_id of the production article
 * @return array<int, array> refer index => row; empty when $pArticleId is not positive
 */
public function loadReferMapByPArticleId($pArticleId)
{
    if ($pArticleId <= 0) {
        return [];
    }

    $referRows = Db::name('production_article_refer')
        ->where('p_article_id', $pArticleId)
        ->where('state', 0)
        ->order('index asc')
        ->select();

    $byIndex = [];
    foreach ($referRows as $referRow) {
        $byIndex[intval($referRow['index'])] = $referRow;
    }
    return $byIndex;
}
/**
 * Render a reference row as the plain-text block that is sent to the LLM.
 *
 * Each non-empty metadata field becomes one "Label: value" line, followed by a
 * "Reference: ..." line taken from refer_frag or, when that is empty, refer_content.
 *
 * @param array $refer row of production_article_refer
 * @return string the lines joined with "\n" (empty string when every field is empty)
 */
public function formatReferForLlm($refer)
{
    // Labels are the field names with an upper-cased first letter.
    $labels = [
        'title' => 'Title',
        'author' => 'Author',
        'joura' => 'Joura',
        'dateno' => 'Dateno',
        'refer_doi' => 'Refer_doi',
        'doilink' => 'Doilink',
    ];

    $lines = [];
    foreach ($labels as $field => $label) {
        $value = trim((string)$this->arrGet($refer, $field, ''));
        if ($value === '') {
            continue;
        }
        $lines[] = $label . ': ' . $value;
    }

    $fragment = trim((string)$this->arrGet($refer, 'refer_frag', ''));
    $fullText = trim((string)$this->arrGet($refer, 'refer_content', ''));
    $referenceBody = $fragment !== '' ? $fragment : $fullText;
    if ($referenceBody !== '') {
        $lines[] = 'Reference: ' . $referenceBody;
    }

    return implode("\n", $lines);
}
/**
 * After a reference has been edited, asynchronously re-check every check row that belongs
 * to that reference (selected by p_refer_id).
 *
 * Steps: refresh refer_text / refer_index / reference_no, reset status / is_match /
 * can_support / confidence / reason / error_msg, set the affected sections to
 * AM_STATUS_RUNNING, then hand the jobs to enqueueChecksSortedByReferenceNo().
 *
 * Unlike recheckByRefer(), this method does not run the LLM inside the request; it only
 * enqueues and returns. The front end can poll getProgressByPArticleId for progress.
 *
 * @param int $pReferId   t_production_article_refer.p_refer_id (required)
 * @param int $pArticleId optional; when > 0 it is used instead of the reference row's p_article_id
 * @return array{p_refer_id:int, p_article_id:int, reference_no:int, reset:int, queued:int, check_ids:int[], queue:string}
 * @throws \InvalidArgumentException when $pReferId is not positive
 * @throws \RuntimeException when the reference row or its p_article_id cannot be resolved
 */
public function enqueueRecheckByPReferId($pReferId, $pArticleId = 0)
{
    $pReferId = intval($pReferId);
    if ($pReferId <= 0) {
        throw new \InvalidArgumentException('p_refer_id is required');
    }

    $refer = Db::name('production_article_refer')
        ->where('p_refer_id', $pReferId)
        ->where('state', 0)
        ->find();
    if (empty($refer)) {
        throw new \RuntimeException('production_article_refer not found, p_refer_id=' . $pReferId);
    }

    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        $pArticleId = intval($this->arrGet($refer, 'p_article_id', 0));
        if ($pArticleId <= 0) {
            throw new \RuntimeException('p_article_id is missing for p_refer_id=' . $pReferId);
        }
    }

    // Reference numbers are the stored index plus one.
    $referenceNo = intval($this->arrGet($refer, 'index', 0)) + 1;
    $referText = $this->formatReferForLlm($refer);
    $now = date('Y-m-d H:i:s');

    $resultTable = 'article_reference_check_result';
    $rows = Db::name($resultTable)
        ->where('p_article_id', $pArticleId)
        ->where('p_refer_id', $pReferId)
        ->select();

    $summary = [
        'p_refer_id' => $pReferId,
        'p_article_id' => $pArticleId,
        'reference_no' => $referenceNo,
        'reset' => 0,
        'queued' => 0,
        'check_ids' => [],
        'queue' => self::TRANSPORT_RABBITMQ,
    ];
    if (empty($rows)) {
        return $summary;
    }

    $resetFields = $this->newCheckRecordFields([
        'refer_text' => $referText,
        'refer_index' => $referenceNo,
        'reference_no' => $referenceNo,
        'status' => self::RECORD_PENDING,
        'is_match' => 0,
        'can_support' => 0,
        'confidence' => 0,
        'reason' => '',
        'error_msg' => '',
        'updated_at' => $now,
    ], self::QUEUE_PENDING, 0);

    $pendingJobs = [];
    $touchedSections = [];
    foreach ($rows as $row) {
        $checkId = $this->resolveCheckRowId($row);
        $sectionId = intval($this->arrGet($row, 'am_id', 0));

        Db::name($resultTable)->where('id', $checkId)->update($resetFields);

        $pendingJobs[] = [
            'check_id' => $checkId,
            'reference_no' => $referenceNo,
            'am_id' => $sectionId,
            'text_start' => intval($this->arrGet($row, 'text_start', 0)),
        ];
        if ($sectionId > 0) {
            $touchedSections[$sectionId] = true;
        }
    }

    foreach ($touchedSections as $sectionId => $unused) {
        $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
    }

    $checkIds = $this->enqueueChecksSortedByReferenceNo($pendingJobs, $pArticleId, 'enqueue');

    $summary['reset'] = count($rows);
    $summary['queued'] = count($checkIds);
    $summary['check_ids'] = $checkIds;
    return $summary;
}
/**
 * Re-check, asynchronously, the rows of one reference whose check failed
 * (status = RECORD_FAILED only).
 *
 * refer_text and reference_no are not refreshed: the text and reference snapshot already
 * stored on each row are kept, only the result fields are reset before enqueueing.
 *
 * @param int $pReferId   t_production_article_refer.p_refer_id (required)
 * @param int $pArticleId optional; when > 0 the rows are additionally limited to that article
 * @return array{p_refer_id:int, p_article_id:int, reset:int, queued:int, check_ids:int[], queue:string}
 * @throws \InvalidArgumentException when $pReferId is not positive
 */
public function enqueueRecheckFailedByPReferId($pReferId, $pArticleId = 0)
{
    $pReferId = intval($pReferId);
    if ($pReferId <= 0) {
        throw new \InvalidArgumentException('p_refer_id is required');
    }
    $pArticleId = intval($pArticleId);

    $resultTable = 'article_reference_check_result';
    $failedQuery = Db::name($resultTable)
        ->where('p_refer_id', $pReferId)
        ->where('status', self::RECORD_FAILED);
    if ($pArticleId > 0) {
        $failedQuery->where('p_article_id', $pArticleId);
    }
    $rows = $failedQuery->select();

    if (empty($rows)) {
        return [
            'p_refer_id' => $pReferId,
            'p_article_id' => $pArticleId,
            'reset' => 0,
            'queued' => 0,
            'check_ids' => [],
            'queue' => self::TRANSPORT_RABBITMQ,
        ];
    }

    // Without an explicit article id, take it from the first failed row.
    if ($pArticleId <= 0) {
        $pArticleId = intval($this->arrGet($rows[0], 'p_article_id', 0));
    }

    $resetFields = $this->newCheckRecordFields([
        'status' => self::RECORD_PENDING,
        'is_match' => 0,
        'can_support' => 0,
        'confidence' => 0,
        'reason' => '',
        'error_msg' => '',
        'updated_at' => date('Y-m-d H:i:s'),
    ], self::QUEUE_PENDING, 0);

    $pendingJobs = [];
    $touchedSections = [];
    foreach ($rows as $row) {
        $checkId = $this->resolveCheckRowId($row);
        $sectionId = intval($this->arrGet($row, 'am_id', 0));

        Db::name($resultTable)->where('id', $checkId)->update($resetFields);

        $pendingJobs[] = [
            'check_id' => $checkId,
            'reference_no' => intval($this->arrGet($row, 'reference_no', 0)),
            'am_id' => $sectionId,
            'text_start' => intval($this->arrGet($row, 'text_start', 0)),
        ];
        if ($sectionId > 0) {
            $touchedSections[$sectionId] = true;
        }
    }

    foreach ($touchedSections as $sectionId => $unused) {
        $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
    }

    $checkIds = $this->enqueueChecksSortedByReferenceNo($pendingJobs, $pArticleId, 'recheck_failed');

    return [
        'p_refer_id' => $pReferId,
        'p_article_id' => $pArticleId,
        'reset' => count($rows),
        'queued' => count($checkIds),
        'check_ids' => $checkIds,
        'queue' => self::TRANSPORT_RABBITMQ,
    ];
}
/**
 * Synchronously re-check every check row of an article that points at one reference.
 *
 * The reference is resolved from $pReferId or, failing that, from $referenceNo. Matching
 * rows (same p_refer_id OR same reference_no) are reset to pending with the refreshed
 * reference text, the affected sections are flagged as running, each row is checked with
 * runReferenceCheckOnce() inside this request, and finally the section statuses are
 * re-synchronized.
 *
 * Changes over the previous version:
 *  - the "no rows" return now also carries checked / failed / results / errors so that
 *    callers always receive the keys of the synchronous result (the legacy queued / queue
 *    keys of that early return are kept for backward compatibility);
 *  - the reset status uses self::RECORD_PENDING instead of a literal 0, matching the
 *    enqueueRecheck* methods.
 *
 * @param int $articleId   article_id (required)
 * @param int $pReferId    p_refer_id of the reference; takes precedence when > 0
 * @param int $referenceNo 1-based reference number, used when $pReferId is not given
 * @return array article_id, p_refer_id, reference_no, reset, checked, failed, check_ids, results, errors
 * @throws \InvalidArgumentException when $articleId is not positive (or no reference selector is given)
 * @throws \RuntimeException when the production article or the reference cannot be found
 */
public function recheckByRefer($articleId, $pReferId = 0, $referenceNo = 0)
{
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }

    $ctx = $this->resolveReferForRecheck($articleId, intval($pReferId), intval($referenceNo));
    $refer = $ctx['refer'];
    $pReferId = $ctx['p_refer_id'];
    $referenceNo = $ctx['reference_no'];
    $pArticleId = $ctx['p_article_id'];
    $referText = $this->formatReferForLlm($refer);
    $now = date('Y-m-d H:i:s');

    $rows = Db::name('article_reference_check_result')
        ->where('article_id', $articleId)
        ->where(function ($query) use ($pReferId, $referenceNo) {
            $query->where('p_refer_id', $pReferId)->whereOr('reference_no', $referenceNo);
        })
        ->select();

    if (empty($rows)) {
        return [
            'article_id' => $articleId,
            'p_refer_id' => $pReferId,
            'reference_no' => $referenceNo,
            'reset' => 0,
            'checked' => 0,
            'failed' => 0,
            'check_ids' => [],
            'results' => [],
            'errors' => [],
            // Legacy keys of this early return, kept so existing callers do not break.
            'queued' => 0,
            'queue' => self::TRANSPORT_RABBITMQ,
        ];
    }

    $resetFields = $this->newCheckRecordFields([
        'refer_text' => $referText,
        'p_refer_id' => $pReferId,
        'p_article_id' => $pArticleId,
        'refer_index' => $referenceNo,
        'status' => self::RECORD_PENDING,
        'is_match' => 0,
        'can_support' => 0,
        'confidence' => 0,
        'reason' => '',
        'error_msg' => '',
        'updated_at' => $now,
    ], self::QUEUE_PENDING, 0);

    $pendingJobs = [];
    $amIds = [];
    foreach ($rows as $row) {
        $checkId = $this->resolveCheckRowId($row);
        Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields);
        $amId = intval($row['am_id']);
        $pendingJobs[] = [
            'check_id' => $checkId,
            'reference_no' => $referenceNo,
            'am_id' => $amId,
            'text_start' => intval(isset($row['text_start']) ? $row['text_start'] : 0),
        ];
        if ($amId > 0) {
            $amIds[$amId] = true;
        }
    }

    foreach (array_keys($amIds) as $amId) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
    }

    // Deterministic processing order: reference number, then section, then position in the text.
    usort($pendingJobs, function ($a, $b) {
        if ($a['reference_no'] !== $b['reference_no']) {
            return $a['reference_no'] - $b['reference_no'];
        }
        if ($a['am_id'] !== $b['am_id']) {
            return $a['am_id'] - $b['am_id'];
        }
        return $a['text_start'] - $b['text_start'];
    });

    $checkIds = [];
    $results = [];
    $failed = [];
    foreach ($pendingJobs as $job) {
        $checkId = intval($job['check_id']);
        $checkIds[] = $checkId;
        try {
            $results[] = $this->runReferenceCheckOnce($checkId);
        } catch (\Exception $e) {
            // One failing row must not abort the remaining rows; it is reported in 'errors'.
            $failed[] = [
                'check_id' => $checkId,
                'error' => $e->getMessage(),
            ];
            \think\Log::error('recheckByRefer check_id=' . $checkId . ' ' . $e->getMessage());
        }
    }

    foreach (array_keys($amIds) as $amId) {
        $this->syncAmRefCheckStatus($amId);
    }

    return [
        'article_id' => $articleId,
        'p_refer_id' => $pReferId,
        'reference_no' => $referenceNo,
        'reset' => count($rows),
        'checked' => count($results),
        'failed' => count($failed),
        'check_ids' => $checkIds,
        'results' => $results,
        'errors' => $failed,
    ];
}
/**
 * Run one synchronous LLM reference check and write the outcome back to
 * article_reference_check_result.
 *
 * The section content is resolved via resolveMainContentForJob(); the reference text is
 * rebuilt from the live reference row when p_refer_id still resolves, otherwise the
 * refer_text stored on the check row is used. When the first-pass confidence is not above
 * PASS_CONFIDENCE_THRESHOLD a second, DOI-based pass is attempted.
 *
 * Change over the previous version: when the second pass overwrote the stored verdict,
 * the method still returned the first-pass values. The row is now re-read after a
 * successful second pass so the returned verdict matches what was persisted.
 *
 * @param int $checkId primary key of article_reference_check_result
 * @return array{check_id:int, can_support:int, is_match:int, confidence:float, reason:string}
 * @throws \RuntimeException when the row is missing, the content or reference text is empty,
 *                           or the LLM request failed (the row is marked RECORD_FAILED first)
 */
public function runReferenceCheckOnce($checkId)
{
    $checkId = intval($checkId);
    $row = Db::name('article_reference_check_result')->where('id', $checkId)->find();
    if (empty($row)) {
        throw new \RuntimeException('article_reference_check_result not found, id=' . $checkId);
    }

    $contentA = $this->resolveMainContentForJob($row);

    $refer = null;
    if (intval($row['p_refer_id']) > 0) {
        $refer = Db::name('production_article_refer')
            ->where('p_refer_id', intval($row['p_refer_id']))
            ->where('state', 0)
            ->find();
    }
    if ($refer) {
        $contentB = $this->formatReferForLlm($refer);
    } else {
        // Reference row gone or not linked: fall back to the snapshot stored on the check row.
        $contentB = trim((string)$this->arrGet($row, 'refer_text', ''));
    }

    if ($contentA === '' || $contentB === '') {
        $this->updateCheckResult($checkId, [
            'status' => self::RECORD_FAILED,
            'error_msg' => 'Missing section content (text/table) or refer_text',
        ]);
        throw new \RuntimeException('Missing section content (text/table) or refer_text');
    }

    $llmResult = (new LLMService())->checkReference($contentA, $contentB, false);
    $requestFailed = !empty($llmResult['request_failed']);
    $canSupport = $this->parseLlmCanSupport($llmResult);
    $confidence = floatval(isset($llmResult['confidence']) ? $llmResult['confidence'] : 0);
    $reason = isset($llmResult['reason']) ? $llmResult['reason'] : '';

    // LLM communication failure: persist RECORD_FAILED plus error_msg, then throw so the
    // caller (e.g. the MQ worker) can retry.
    if ($requestFailed) {
        $this->updateCheckResult($checkId, [
            'confidence' => $confidence,
            'reason' => $reason,
            'status' => self::RECORD_FAILED,
            'error_msg' => $reason,
        ]);
        throw new \RuntimeException($reason !== '' ? $reason : 'LLM request failed');
    }

    $this->updateCheckResult($checkId, [
        'can_support' => $canSupport ? 1 : 0,
        'is_match' => $canSupport ? 1 : 0,
        'confidence' => $confidence,
        'reason' => $reason,
        'status' => self::RECORD_COMPLETED,
        'error_msg' => '',
    ]);

    $outcome = [
        'check_id' => $checkId,
        'can_support' => $canSupport ? 1 : 0,
        'is_match' => $canSupport ? 1 : 0,
        'confidence' => $confidence,
        'reason' => $reason,
    ];

    if ($confidence <= self::PASS_CONFIDENCE_THRESHOLD) {
        $secondPassStored = $this->runSecondPassBlocking($checkId, $row, $contentA, $refer, $contentB);
        if ($secondPassStored) {
            // The second pass replaced the stored verdict; report what is now in the table.
            $latest = Db::name('article_reference_check_result')->where('id', $checkId)->find();
            if (!empty($latest)) {
                $outcome['can_support'] = intval(isset($latest['can_support']) ? $latest['can_support'] : $outcome['can_support']);
                $outcome['is_match'] = intval(isset($latest['is_match']) ? $latest['is_match'] : $outcome['is_match']);
                $outcome['confidence'] = floatval(isset($latest['confidence']) ? $latest['confidence'] : $outcome['confidence']);
                $outcome['reason'] = isset($latest['reason']) ? $latest['reason'] : $outcome['reason'];
            }
        }
    }

    return $outcome;
}
/**
 * Second-pass DOI review for a low-confidence result (synchronous, blocking).
 *
 * The pass is skipped (returns false) when the prepared payload has no abstract or an
 * empty doi_block. Otherwise the LLM is called up to two times: a failed request or an
 * exception on the first attempt triggers one retry; a failure on the second attempt
 * marks the row RECORD_FAILED. Whenever the row is written, the owning section's status
 * is re-synchronized.
 *
 * @param int        $checkId   primary key of article_reference_check_result
 * @param array      $row       the check row (only am_id is read here)
 * @param string     $contentA  section content sent to the LLM
 * @param array|null $refer     reference row, passed on to prepareRecheckPayload()
 * @param string     $referText reference text sent to the LLM
 * @return bool true when a second-pass verdict was stored, false otherwise
 */
public function runSecondPassBlocking($checkId, array $row, $contentA, $refer, $referText)
{
    $checkId = intval($checkId);
    if ($checkId <= 0) {
        return false;
    }

    $payload = $this->prepareRecheckPayload(is_array($refer) ? $refer : [], trim((string)$referText));
    $hasAbstract = !empty($payload['has_abstract']);
    if (!$hasAbstract || trim((string)$payload['doi_block']) === '') {
        return false;
    }

    $sectionId = intval(isset($row['am_id']) ? $row['am_id'] : 0);
    $lastAttempt = 1;
    $lastError = '';

    for ($attempt = 0; $attempt <= $lastAttempt; $attempt++) {
        try {
            $llmResult = (new LLMService())->checkReference($contentA, trim((string)$referText), true, $payload['doi_block']);
            $requestFailed = !empty($llmResult['request_failed']);
            $canSupport = $this->parseLlmCanSupport($llmResult);
            $confidence = floatval(isset($llmResult['confidence']) ? $llmResult['confidence'] : 0);

            // Prefix the stored reason so second-pass verdicts are recognizable.
            $doiUsed = trim((string)$payload['doi_used']);
            $tag = '[Crossref复核' . ($doiUsed !== '' ? (' ' . $doiUsed) : '') . ']';
            $reason = $tag . ' ' . (isset($llmResult['reason']) ? $llmResult['reason'] : '');

            if ($requestFailed) {
                $lastError = isset($llmResult['reason']) ? (string)$llmResult['reason'] : 'LLM request failed';
                if ($attempt < $lastAttempt) {
                    continue;
                }
                $this->updateCheckResult($checkId, [
                    'confidence' => $confidence,
                    'reason' => $reason,
                    'status' => self::RECORD_FAILED,
                    'error_msg' => $lastError,
                ]);
                if ($sectionId > 0) {
                    $this->syncAmRefCheckStatus($sectionId);
                }
                return false;
            }

            $verdict = $canSupport ? 1 : 0;
            $this->updateCheckResult($checkId, [
                'can_support' => $verdict,
                'is_match' => $verdict,
                'confidence' => $confidence,
                'reason' => $reason,
                'status' => self::RECORD_COMPLETED,
                'error_msg' => '',
            ]);
            if ($sectionId > 0) {
                $this->syncAmRefCheckStatus($sectionId);
            }
            return true;
        } catch (\Exception $e) {
            $lastError = $e->getMessage();
            if ($attempt < $lastAttempt) {
                continue;
            }
            $this->updateCheckResult($checkId, [
                'status' => self::RECORD_FAILED,
                'error_msg' => $lastError,
            ]);
            if ($sectionId > 0) {
                $this->syncAmRefCheckStatus($sectionId);
            }
            return false;
        }
    }

    return false;
}
/**
 * Resolve the reference row targeted by a re-check request.
 *
 * The production article is looked up by article_id (state 0 or 2). The reference is then
 * selected by p_refer_id when that is positive, otherwise by the 1-based reference number.
 *
 * @param int $articleId   article_id
 * @param int $pReferId    p_refer_id, takes precedence when > 0
 * @param int $referenceNo 1-based reference number, used when $pReferId is not positive
 * @return array{refer: array, p_article_id: int, p_refer_id: int, reference_no: int}
 * @throws \RuntimeException when the production article or the reference is not found
 * @throws \InvalidArgumentException when neither selector is positive
 */
private function resolveReferForRecheck($articleId, $pReferId, $referenceNo)
{
    $production = Db::name('production_article')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->find();
    if (empty($production)) {
        throw new \RuntimeException('production_article not found for article_id=' . $articleId);
    }
    $pArticleId = intval($production['p_article_id']);

    if ($pReferId <= 0 && $referenceNo <= 0) {
        throw new \InvalidArgumentException('p_refer_id or reference_no is required');
    }

    $refer = null;
    if ($pReferId > 0) {
        $refer = Db::name('production_article_refer')
            ->where('p_refer_id', $pReferId)
            ->where('p_article_id', $pArticleId)
            ->where('state', 0)
            ->find();
    } else {
        // Reference numbers are 1-based, the stored index is 0-based.
        $referMap = $this->loadReferMapByPArticleId($pArticleId);
        $zeroBasedIndex = $referenceNo - 1;
        if (isset($referMap[$zeroBasedIndex])) {
            $refer = $referMap[$zeroBasedIndex];
        }
    }

    if (empty($refer)) {
        throw new \RuntimeException('production_article_refer not found');
    }

    return [
        'refer' => $refer,
        'p_article_id' => $pArticleId,
        'p_refer_id' => intval($refer['p_refer_id']),
        'reference_no' => intval($refer['index']) + 1,
    ];
}
/**
 * Extract a DOI from the refer_doi field only (used for the second-pass Crossref abstract).
 *
 * @param mixed $refer reference row; anything that is not an array yields ''
 * @return string the first DOI found in refer_doi, or '' when the field is empty,
 *                contains "not available", or holds no DOI
 */
public function extractReferDoiOnly($refer)
{
    if (!is_array($refer)) {
        return '';
    }

    $fieldValue = trim((string)$this->arrGet($refer, 'refer_doi', ''));
    $isPlaceholder = stripos($fieldValue, 'not available') !== false;
    if ($fieldValue === '' || $isPlaceholder) {
        return '';
    }

    $found = $this->extractDoisFromString($fieldValue);
    if (empty($found)) {
        return '';
    }
    return $found[0];
}
/**
 * Fetch the Crossref abstract for the DOI stored in refer_doi (second-pass check only).
 *
 * @param mixed $refer reference row
 * @return array{text:string, has_abstract:bool, doi:string} text/has_abstract stay empty/false
 *               when no DOI is available or no Crossref block could be extracted
 */
public function fetchCrossrefAbstractByReferDoi($refer)
{
    $result = ['text' => '', 'has_abstract' => false, 'doi' => ''];

    $doi = $this->extractReferDoiOnly($refer);
    if ($doi === '') {
        return $result;
    }
    $result['doi'] = $doi;

    $crossref = new CrossrefService([
        'mailto' => trim((string)Env::get('crossref_mailto', '')),
    ]);
    $block = $this->extractCrossrefBlock($doi, $crossref);
    if ($block !== null) {
        $result['text'] = $block['text'];
        $result['has_abstract'] = !empty($block['has_abstract']);
    }

    return $result;
}
/**
 * Interpret the verdict of an LLM response.
 *
 * The can_support key wins whenever it is present (even with a null value); otherwise
 * is_match is used, defaulting to false.
 *
 * @param mixed $llmResult decoded LLM response; anything that is not an array yields false
 * @return bool
 */
public function parseLlmCanSupport($llmResult)
{
    if (!is_array($llmResult)) {
        return false;
    }

    if (array_key_exists('can_support', $llmResult)) {
        $verdict = $llmResult['can_support'];
    } elseif (isset($llmResult['is_match'])) {
        $verdict = $llmResult['is_match'];
    } else {
        $verdict = false;
    }

    return $this->parseLlmIsMatch($verdict);
}
/**
 * First-pass content for a check job: the section referenced by the row's am_id, resolved
 * through resolveArticleMainCheckContent() (plain content, or table data for table sections)
 * and normalized for the LLM.
 *
 * @param array $row      check row; only am_id is read
 * @param int   $maxChars maximum length handed to normalizeCheckContentForLlm()
 * @return string normalized content, or '' when the section is missing or empty
 */
public function resolveMainContentForJob(array $row, $maxChars = 8000)
{
    $sectionId = intval($this->arrGet($row, 'am_id', 0));
    if ($sectionId <= 0) {
        return '';
    }

    $section = Db::name('article_main')
        ->field('content,type,amt_id,article_id')
        ->where('am_id', $sectionId)
        ->find();
    if (empty($section)) {
        return '';
    }

    $rawContent = trim($this->resolveArticleMainCheckContent($section));
    return $rawContent === ''
        ? ''
        : $this->normalizeCheckContentForLlm($rawContent, $maxChars);
}
/**
 * Tell whether a section is a table section: type = MAIN_TYPE_TABLE, a positive amt_id,
 * or a content placeholder of the form <table tableId='…'/>.
 *
 * @param array $main article_main row
 * @return bool
 */
private function isArticleMainTableSection(array $main)
{
    $type = intval($this->arrGet($main, 'type', self::MAIN_TYPE_TEXT));
    if ($type === self::MAIN_TYPE_TABLE) {
        return true;
    }
    if (intval($this->arrGet($main, 'amt_id', 0)) > 0) {
        return true;
    }

    $content = (string)$this->arrGet($main, 'content', '');
    if (stripos($content, '<table') === false) {
        return false;
    }
    return (bool)preg_match('/tableId\s*=\s*[\'"]?\d+/i', $content);
}
/**
 * Resolve the amt_id of a table section, from the row's amt_id column or, failing that,
 * from a tableId=… placeholder inside its content.
 *
 * @param array $main article_main row
 * @return int the amt_id, or 0 when none can be determined
 */
private function resolveArticleMainTableAmtId(array $main)
{
    $columnValue = intval($this->arrGet($main, 'amt_id', 0));
    if ($columnValue > 0) {
        return $columnValue;
    }

    $content = (string)$this->arrGet($main, 'content', '');
    $found = preg_match('/tableId\s*=\s*[\'"]?(\d+)/i', $content, $placeholder);
    if (!$found) {
        return 0;
    }
    return intval($placeholder[1]);
}
/**
 * Load the article_main_table row (table_data, title, note) behind a table section.
 *
 * Only rows in state 0 or 2 are considered; when the section carries a positive
 * article_id the lookup is additionally limited to that article.
 *
 * @param array $main article_main row
 * @return array|null the table row, or null when no amt_id resolves or nothing is found
 */
private function loadArticleMainTableRow(array $main)
{
    $amtId = $this->resolveArticleMainTableAmtId($main);
    if ($amtId <= 0) {
        return null;
    }

    $tableQuery = Db::name('article_main_table')
        ->where('amt_id', $amtId)
        ->whereIn('state', [0, 2])
        ->field('table_data,title,note');

    $articleId = intval($this->arrGet($main, 'article_id', 0));
    if ($articleId > 0) {
        $tableQuery->where('article_id', $articleId);
    }

    $tableRow = $tableQuery->find();
    if (empty($tableRow)) {
        return null;
    }
    return $tableRow;
}
/**
 * Extract the citations of one section.
 *
 * Text sections are scanned on their content. Table sections are scanned on the table's
 * title and note followed by each table row flattened into one line, so that a cell that
 * holds only "[n]" still carries the context of its row.
 *
 * @param array $main article_main row
 * @return array list of citations (empty when a table section has no table row)
 */
public function extractReferencesForArticleMain(array $main)
{
    if (!$this->isArticleMainTableSection($main)) {
        $content = (string)$this->arrGet($main, 'content', '');
        return $this->extractReferences($content);
    }

    $tableRow = $this->loadArticleMainTableRow($main);
    if (empty($tableRow)) {
        return [];
    }

    // Title and note are scanned ahead of the table rows.
    $prefixChunks = [];
    foreach (['title', 'note'] as $column) {
        $value = trim((string)$this->arrGet($tableRow, $column, ''));
        if ($value === '') {
            continue;
        }
        $prefixChunks[] = $value;
    }

    $tableData = (string)$this->arrGet($tableRow, 'table_data', '');
    return $this->extractReferencesFromTableDataJson($tableData, $prefixChunks);
}
/**
 * Extract citations from a table: first from the prefix chunks (title / note, scanned
 * before the table rows), then from table_data row by row.
 *
 * Every scanned piece of text is treated as one line of a virtual document: the positions
 * reported for a citation are shifted by the length of all previously scanned lines plus
 * one separator byte each. When table_data cannot be decoded as JSON it is scanned as one
 * raw string.
 *
 * @param string $tableDataJson table_data column value
 * @param array  $prefixChunks  texts scanned ahead of the table rows
 * @return array list of citations with shifted text_start / text_end / reference_start / reference_end
 */
public function extractReferencesFromTableDataJson($tableDataJson, array $prefixChunks = [])
{
    $result = [];
    $offset = 0;

    // Scan one line and append its citations, shifted by the current offset.
    $scanLine = function ($line) use (&$result, &$offset) {
        foreach ($this->extractReferences($line) as $cite) {
            foreach (['text_start', 'text_end', 'reference_start', 'reference_end'] as $positionKey) {
                $cite[$positionKey] = intval($cite[$positionKey]) + $offset;
            }
            $result[] = $cite;
        }
    };

    foreach ($prefixChunks as $chunk) {
        $chunk = trim((string)$chunk);
        if ($chunk === '') {
            continue;
        }
        $scanLine($chunk);
        $offset += strlen($chunk) + 1;
    }

    $tableDataJson = trim((string)$tableDataJson);
    if ($tableDataJson === '') {
        return $result;
    }

    $tableRows = $this->decodeTableDataJsonToArray($tableDataJson);
    if ($tableRows === null) {
        // Not valid JSON: treat the whole value as a single line.
        $scanLine($tableDataJson);
        return $result;
    }

    foreach ($tableRows as $tableRow) {
        $line = $this->buildTableRowCheckLine($tableRow);
        if ($line === '') {
            continue;
        }
        $scanLine($line);
        $offset += strlen($line) + 1;
    }

    return $result;
}
/**
 * Raw content of a section as used for enqueueing / the LLM: the content column for
 * non-table sections; for table sections the table's title, note and table_data flattened
 * row by row, joined with "\n".
 *
 * @param array $main article_main row
 * @return string raw content ('' when a table section has no table row)
 */
public function resolveArticleMainCheckContent(array $main)
{
    if (!$this->isArticleMainTableSection($main)) {
        return (string)$this->arrGet($main, 'content', '');
    }

    $tableRow = $this->loadArticleMainTableRow($main);
    if (empty($tableRow)) {
        return '';
    }

    $pieces = [];
    foreach (['title', 'note'] as $column) {
        $value = trim((string)$this->arrGet($tableRow, $column, ''));
        if ($value === '') {
            continue;
        }
        $pieces[] = $value;
    }

    $flattened = $this->flattenTableDataJsonToCheckContent((string)$this->arrGet($tableRow, 'table_data', ''));
    if ($flattened !== '') {
        $pieces[] = $flattened;
    }

    return implode("\n", $pieces);
}
/**
 * Flatten one table row into a single line: the trimmed, non-empty `text` of every cell,
 * joined with " | " (keeps the other cells of the row next to a citation-only cell).
 *
 * @param mixed $row list of cell arrays; non-array rows and non-array cells are ignored
 * @return string the joined line, '' when the row yields no text
 */
private function buildTableRowCheckLine($row)
{
    if (!is_array($row)) {
        return '';
    }

    $texts = [];
    foreach ($row as $cell) {
        if (is_array($cell)) {
            $cellText = trim((string)$this->arrGet($cell, 'text', ''));
            if ($cellText !== '') {
                $texts[] = $cellText;
            }
        }
    }

    return implode(' | ', $texts);
}
/**
 * Flatten table_data row by row (for the LLM and the preview).
 *
 * @param string $tableDataJson table_data column value
 * @return string one line per non-empty row joined with "\n"; the trimmed input itself
 *                when it cannot be decoded as JSON; '' for empty input
 */
private function flattenTableDataJsonToCheckContent($tableDataJson)
{
    $json = trim((string)$tableDataJson);
    if ($json === '') {
        return '';
    }

    $tableRows = $this->decodeTableDataJsonToArray($json);
    if ($tableRows === null) {
        return $json;
    }

    $flattened = [];
    foreach ($tableRows as $tableRow) {
        $line = $this->buildTableRowCheckLine($tableRow);
        if ($line === '') {
            continue;
        }
        $flattened[] = $line;
    }

    return implode("\n", $flattened);
}
/**
 * Decode a table_data value into an array.
 *
 * A leading UTF-8 byte order mark is removed first. Double-encoded values (a JSON string
 * that itself contains JSON) are decoded a second time.
 *
 * @param mixed $raw table_data value
 * @return array|null the decoded array, or null for empty input, invalid JSON, or JSON
 *                    that does not decode to an array
 */
private function decodeTableDataJsonToArray($raw)
{
    $json = trim((string)$raw);
    if ($json === '') {
        return null;
    }

    if (strncmp($json, "\xEF\xBB\xBF", 3) === 0) {
        $json = substr($json, 3);
    }

    $firstPass = json_decode($json, true);
    if (json_last_error() !== JSON_ERROR_NONE) {
        return null;
    }
    if (is_array($firstPass)) {
        return $firstPass;
    }
    if (!is_string($firstPass)) {
        return null;
    }

    $secondPass = json_decode($firstPass, true);
    $secondPassOk = json_last_error() === JSON_ERROR_NONE && is_array($secondPass);
    return $secondPassOk ? $secondPass : null;
}
/**
 * Turn raw section HTML into the plain text that is sent to the LLM.
 *
 * Citation tags are rewritten to "[…]" via pregReplaceBlueTags(), remaining tags are
 * stripped, HTML entities are decoded, whitespace runs are collapsed to single spaces and
 * the text is cut to $maxChars characters (at least 500), with "..." appended when cut.
 *
 * Changes over the previous version:
 *  - preg_replace() with the /u modifier returns null when the subject is not valid UTF-8,
 *    which silently turned the whole section into ''. Invalid byte sequences are now
 *    scrubbed and the collapse is retried, with a byte-wise collapse as the last resort;
 *  - mb_strlen() / mb_substr() receive the encoding explicitly instead of relying on the
 *    process-wide mb_internal_encoding().
 *
 * @param string $raw      raw section content
 * @param int    $maxChars maximum number of characters to keep (values below 500 are raised to 500)
 * @return string normalized text, '' when nothing remains
 */
private function normalizeCheckContentForLlm($raw, $maxChars = 8000)
{
    $text = $this->pregReplaceBlueTags($raw, '[$1]');
    $text = strip_tags((string)$text);
    $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

    $collapsed = preg_replace('/\s+/u', ' ', $text);
    if ($collapsed === null) {
        // Invalid UTF-8: replace the offending sequences, then try the Unicode-aware collapse again.
        $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
        $collapsed = preg_replace('/\s+/u', ' ', $text);
        if ($collapsed === null) {
            $collapsed = preg_replace('/\s+/', ' ', $text);
        }
    }

    $text = trim((string)$collapsed);
    if ($text === '') {
        return '';
    }

    $maxChars = max(500, intval($maxChars));
    if (mb_strlen($text, 'UTF-8') > $maxChars) {
        $text = mb_substr($text, 0, $maxChars, 'UTF-8') . '...';
    }
    return $text;
}
/**
 * Local context of a citation (for callers that need the citing text rather than the
 * whole section): origin_text, or content_a when origin_text is empty.
 *
 * @param array $row check row
 * @return string trimmed context text, '' when both fields are empty
 */
public function resolveCitationContextForJob(array $row)
{
    foreach (['origin_text', 'content_a'] as $column) {
        $candidate = trim((string)$this->arrGet($row, $column, ''));
        if ($candidate !== '') {
            return $candidate;
        }
    }
    return '';
}
/**
 * Extract the preferred standard DOI (10.xxxx/...) of a reference row.
 *
 * Priority: refer_content (the DOI inside the original citation text is closest to the
 * work that was actually cited) > refer_doi > doi > doilink.
 *
 * @param mixed $refer reference row
 * @return string the highest-priority DOI, or '' when none is found
 */
public function extractDoiFromRefer($refer)
{
    $candidates = $this->extractAllDoiCandidatesFromRefer($refer);
    if (empty($candidates)) {
        return '';
    }
    return $candidates[0];
}
/**
 * Collect every DOI candidate of a reference row, de-duplicated and ordered by priority
 * (refer_content, refer_doi, doi, doilink).
 *
 * Used by the second-pass DOI review: when the refer_doi metadata disagrees with the DOI
 * inside the original citation text (data drift), the DOI from the citation text is tried
 * first to fetch the real abstract.
 *
 * @param mixed $refer reference row; anything that is not an array yields []
 * @return string[]
 */
public function extractAllDoiCandidatesFromRefer($refer)
{
    if (!is_array($refer)) {
        return [];
    }

    $candidates = [];
    foreach (['refer_content', 'refer_doi', 'doi', 'doilink'] as $column) {
        $columnText = (string)$this->arrGet($refer, $column, '');
        foreach ($this->extractDoisFromString($columnText) as $doi) {
            if (in_array($doi, $candidates, true)) {
                continue;
            }
            $candidates[] = $doi;
        }
    }
    return $candidates;
}
/**
 * Extract every DOI of the form 10.xxxx/yyy from arbitrary text.
 *
 * doi.org links are matched first, then bare DOIs. When neither pattern produced a DOI
 * and the text itself starts with "10.", the whole text is tried as a DOI. Trailing
 * punctuation is removed from each candidate and only valid DOIs are kept.
 *
 * @param mixed $text text to scan
 * @return string[] unique DOIs in order of first appearance; [] for empty text or text
 *                  containing "not available"
 */
private function extractDoisFromString($text)
{
    $text = trim((string)$text);
    if ($text === '' || stripos($text, 'not available') !== false) {
        return [];
    }

    $found = [];
    $patterns = [
        '~doi\.org/([^\s?#"\'<>]+)~i',
        '~\b(10\.\d{3,9}/[^\s?#"\'<>]+)~i',
    ];
    foreach ($patterns as $pattern) {
        if (!preg_match_all($pattern, $text, $hits)) {
            continue;
        }
        foreach ($hits[1] as $hit) {
            $candidate = $this->trimDoiTail(trim($hit));
            if ($this->isValidDoi($candidate)) {
                $found[] = $candidate;
            }
        }
    }

    // Last resort: the text as a whole is a DOI that the patterns above could not capture.
    if (count($found) === 0 && strpos($text, '10.') === 0) {
        $candidate = $this->trimDoiTail($text);
        if ($this->isValidDoi($candidate)) {
            $found[] = $candidate;
        }
    }

    return array_values(array_unique($found));
}
/**
 * Strip trailing punctuation, quotes, closing brackets, backslashes and whitespace
 * from a DOI candidate.
 *
 * @param string $doi DOI candidate
 * @return string the candidate without the trailing characters
 */
private function trimDoiTail($doi)
{
    $trailingNoise = ".,;:)]}>\"'\\ \t\n\r";
    return rtrim($doi, $trailingNoise);
}
/**
 * Check that a string has the DOI shape "10.<3-9 digits>/<non-whitespace suffix>".
 *
 * @param mixed $doi
 * @return bool
 */
private function isValidDoi($doi)
{
    $matched = preg_match('~^10\.\d{3,9}/[^\s]+$~i', (string)$doi);
    return (bool)$matched;
}
/**
 * Fetch the literature content behind a refer row's DOI via PubMed / Crossref.
 * (Per the original note, the local LLM cannot open web pages, so the content is
 * fetched up front.)
 *
 * Behaviour:
 *  - every DOI candidate of the row is tried in priority order
 *    (refer_content > refer_doi > doi > doilink);
 *  - the first candidate whose block carries an abstract wins;
 *  - otherwise the first block obtained without an abstract is used;
 *  - '' is returned when no candidate produced a block.
 *
 * @param mixed $refer refer row
 * @return string text block, or ''
 */
public function fetchDoiLiteratureBlock($refer)
{
    $candidates = $this->extractAllDoiCandidatesFromRefer($refer);
    if (empty($candidates)) {
        return '';
    }
    $pubmed = new PubmedService([
        'email' => trim((string)Env::get('pubmed_email', '')),
        'tool' => trim((string)Env::get('pubmed_tool', 'tmrjournals')),
    ]);
    $crossref = new CrossrefService([
        'mailto' => trim((string)Env::get('crossref_mailto', '')),
    ]);
    $firstWithoutAbstract = null;
    foreach ($candidates as $doi) {
        $block = $this->buildDoiBlockFromSources($doi, $pubmed, $crossref);
        if ($block === null) {
            continue;
        }
        if (!empty($block['has_abstract'])) {
            // An abstract-bearing block ends the search immediately.
            return $block['text'];
        }
        if ($firstWithoutAbstract === null) {
            $firstWithoutAbstract = $block;
        }
    }
    if ($firstWithoutAbstract === null) {
        return '';
    }
    return $firstWithoutAbstract['text'];
}
/**
 * Build the content block for a single DOI.
 *
 * PubMed is asked first. When it returns a title or an abstract, a PubMed block is
 * built; if that block has no abstract, Crossref is consulted and its block is
 * appended only when it does carry an abstract. When PubMed returns neither title
 * nor abstract, the Crossref block (or null) is returned as-is.
 *
 * @param mixed           $doi
 * @param PubmedService   $pubmed
 * @param CrossrefService $crossref
 * @return array|null ['text' => string, 'has_abstract' => bool], or null
 */
private function buildDoiBlockFromSources($doi, PubmedService $pubmed, CrossrefService $crossref)
{
    $doi = trim((string)$doi);
    if ($doi === '') {
        return null;
    }
    $pub = $pubmed->fetchByDoi($doi);
    $pubAbstract = '';
    $pubTitle = '';
    if (is_array($pub)) {
        $pubAbstract = trim((string)$this->arrGet($pub, 'abstract', ''));
        $pubTitle = trim((string)$this->arrGet($pub, 'title', ''));
    }
    if ($pubAbstract === '' && $pubTitle === '') {
        // PubMed gave nothing usable: rely on Crossref alone.
        return $this->extractCrossrefBlock($doi, $crossref);
    }

    $lines = ['Source: PubMed (DOI ' . $doi . ')'];
    $scalarLabels = [
        'title' => 'Actual Title: ',
        'journal' => 'Journal: ',
        'year' => 'Year: ',
    ];
    foreach ($scalarLabels as $key => $label) {
        if (!empty($pub[$key])) {
            $lines[] = $label . trim((string)$pub[$key]);
        }
    }
    $listLabels = [
        'publication_types' => 'Publication Types: ',
        'mesh_terms' => 'MeSH: ',
    ];
    foreach ($listLabels as $key => $label) {
        if (!empty($pub[$key])) {
            $lines[] = $label . implode('; ', (array)$pub[$key]);
        }
    }

    if ($pubAbstract !== '') {
        $lines[] = 'Abstract: ' . $this->truncate($pubAbstract, 3500);
        return ['text' => implode("\n", $lines), 'has_abstract' => true];
    }

    // PubMed has no abstract: append the Crossref block only if it supplies one.
    $cr = $this->extractCrossrefBlock($doi, $crossref);
    $crossrefHasAbstract = ($cr !== null && $cr['has_abstract']);
    if ($crossrefHasAbstract) {
        $lines[] = "\n--- Crossref 补充 ---\n" . $cr['text'];
    }
    return ['text' => implode("\n", $lines), 'has_abstract' => $crossrefHasAbstract];
}
/**
 * Build a text block (title / journal / authors / abstract ...) from Crossref.
 *
 * The raw work message supplies the title and the abstract (cleaned through
 * cleanCrossrefAbstract()); the work summary supplies the remaining fields.
 *
 * @param string          $doi
 * @param CrossrefService $crossref
 * @return array|null ['text' => string, 'has_abstract' => bool]; null when
 *                    fetchWork() does not return an array
 */
private function extractCrossrefBlock($doi, CrossrefService $crossref)
{
    $msg = $crossref->fetchWork($doi);
    if (!is_array($msg)) {
        return null;
    }
    $summary = $crossref->fetchWorkSummary($doi);
    if (!is_array($summary)) {
        $summary = [];
    }

    $lines = ['Source: Crossref api.crossref.org/works/' . rawurlencode($doi)];

    // Title: raw message first, summary as a fallback.
    if (isset($msg['title'][0])) {
        $title = trim((string)$msg['title'][0]);
    } else {
        $title = trim((string)$this->arrGet($summary, 'title', ''));
    }
    if ($title !== '') {
        $lines[] = 'Actual Title: ' . $title;
    }

    $summaryLabels = [
        'joura' => 'Journal: ',
        'author_str' => 'Authors: ',
        'dateno' => 'Publication: ',
        'doilink' => 'DOI Link: ',
    ];
    foreach ($summaryLabels as $key => $label) {
        if (!empty($summary[$key])) {
            $lines[] = $label . trim((string)$summary[$key]);
        }
    }
    if (!empty($summary['is_retracted'])) {
        $lines[] = 'Retraction: yes - ' . trim((string)$this->arrGet($summary, 'retract_reason', ''));
    }

    $abstract = $this->cleanCrossrefAbstract((string)$this->arrGet($msg, 'abstract', ''));
    $hasAbstract = ($abstract !== '');
    $lines[] = $hasAbstract
        ? 'Abstract: ' . $this->truncate($abstract, 3500)
        : 'Note: Crossref 未返回摘要请结合标题/期刊/作者与正文谨慎判断。';

    return ['text' => implode("\n", $lines), 'has_abstract' => $hasAbstract];
}
/**
 * Turn a Crossref abstract (a JATS XML fragment) into plain text.
 *
 * Steps: drop <jats:title> elements, turn each <jats:p> into a line break, remove
 * every remaining tag, decode character entities, then normalise whitespace
 * (runs of spaces/tabs -> one space, CR/CRLF -> LF, consecutive LFs -> one LF).
 *
 * Entities are decoded AFTER strip_tags() so that a decoded "<" (e.g. from
 * "p &lt; 0.05") is never mistaken for the start of a tag and swallowed.
 *
 * @param mixed $raw raw abstract markup
 * @return string plain-text abstract, '' when there is nothing left
 */
private function cleanCrossrefAbstract($raw)
{
    $raw = trim((string)$raw);
    if ($raw === '') {
        return '';
    }
    $raw = preg_replace('~<jats:title[^>]*>.*?</jats:title>~is', '', $raw);
    $raw = preg_replace('~<jats:p[^>]*>~i', "\n", $raw);
    $raw = preg_replace('~</jats:p>~i', '', $raw);
    $raw = preg_replace('~</?jats:[^>]+>~i', '', $raw);
    $raw = strip_tags((string)$raw);
    // XML-escaped characters (&lt; &gt; &amp; numeric references ...) would otherwise
    // reach the LLM as literal entities.
    $raw = html_entity_decode($raw, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    $raw = preg_replace('/[ \t]+/u', ' ', $raw);
    $raw = preg_replace("/\r\n|\r/u", "\n", $raw);
    $raw = preg_replace("/\n{2,}/u", "\n", $raw);
    return trim((string)$raw);
}
/**
 * Shorten text to at most $max characters (mb_* character count), appending
 * "..." when something was cut off.
 *
 * @param mixed $text
 * @param int   $max
 * @return string
 */
private function truncate($text, $max)
{
    $text = (string)$text;
    return mb_strlen($text) > $max
        ? mb_substr($text, 0, $max) . '...'
        : $text;
}
/**
 * Prepare the data for the second (DOI) recheck: bibliographic text plus the
 * content actually fetched for the DOI.
 *
 * A non-blank $referText is used (trimmed) as the bibliographic text; otherwise
 * the refer row is formatted through formatReferForLlm().
 *
 * @param mixed  $refer     refer row
 * @param string $referText optional pre-formatted bibliographic text
 * @return array{refer_text:string, doi_block:string, has_abstract:bool, doi_used:string}
 */
public function prepareRecheckPayload($refer, $referText = '')
{
    $bibliography = trim($referText);
    if ($bibliography === '') {
        $bibliography = $this->formatReferForLlm($refer);
    }
    $fetched = $this->fetchCrossrefAbstractByReferDoi($refer);
    return [
        'refer_text' => $bibliography,
        'doi_block' => $fetched['text'],
        'has_abstract' => $fetched['has_abstract'],
        'doi_used' => $fetched['doi'],
    ];
}
/**
 * Legacy interface: concatenate the recheck payload into one text block.
 * Kept for backward compatibility; callers are advised to use
 * prepareRecheckPayload() instead.
 *
 * @param mixed  $refer
 * @param string $referText
 * @return string bibliographic text followed by either the fetched DOI block or a
 *                notice that nothing could be fetched
 */
public function formatReferForDoiRecheck($refer, $referText = '')
{
    $payload = $this->prepareRecheckPayload($refer, $referText);
    $bibliography = $payload['refer_text'];
    $doiBlock = $payload['doi_block'];
    if ($doiBlock !== '') {
        return $bibliography
            . "\n\n【Crossref 摘要依据 Refer_doi 从 api.crossref.org/works 获取】\n"
            . $doiBlock;
    }
    return $bibliography
        . "\n\n【DOI 文献真实内容】\n未能从 PubMed/Crossref 获取该 DOI 的摘要或元数据请依据书目条目与正文谨慎判断。";
}
/**
 * Try a synchronous Crossref second pass for a finished, low-confidence record
 * (manual entry points such as enqueueSecondPassByArticle, per the original note).
 *
 * Returns false without running anything when: the id is not positive, the
 * confidence is above PASS_CONFIDENCE_THRESHOLD, the check row or its active
 * (state = 0) refer row is missing, the refer row has no DOI, or no abstract could
 * be fetched for it.
 *
 * @param mixed $checkId    article_reference_check_result.id
 * @param mixed $confidence confidence of the first pass
 * @return mixed false when skipped, otherwise the result of runSecondPassBlocking()
 */
public function runSecondPassIfNeeded($checkId, $confidence)
{
    $checkId = intval($checkId);
    $confidence = floatval($confidence);
    if ($checkId <= 0) {
        return false;
    }
    if ($confidence > self::PASS_CONFIDENCE_THRESHOLD) {
        return false;
    }

    $row = Db::name('article_reference_check_result')->where('id', $checkId)->find();
    if (empty($row)) {
        return false;
    }

    $referId = intval($row['p_refer_id']);
    $refer = null;
    if ($referId > 0) {
        $refer = Db::name('production_article_refer')
            ->where('p_refer_id', $referId)
            ->where('state', 0)
            ->find();
    }
    if (empty($refer) || $this->extractReferDoiOnly($refer) === '') {
        return false;
    }

    $fetched = $this->fetchCrossrefAbstractByReferDoi($refer);
    if (empty($fetched['has_abstract'])) {
        return false;
    }

    $contentA = $this->resolveMainContentForJob($row);
    $referText = trim((string)$this->arrGet($row, 'refer_text', ''));
    if ($referText === '' && is_array($refer)) {
        // No stored bibliographic text: format it from the refer row.
        $referText = $this->formatReferForLlm($refer);
    }
    return $this->runSecondPassBlocking($checkId, $row, $contentA, $refer, $referText);
}
/**
 * Extract the blue citation tags from body HTML (or flattened table HTML).
 *
 * Each returned entry describes one tag: the raw bracket content, the expanded
 * reference numbers, the local context text, and the byte offsets of the tag and
 * of the context. Tags whose context is not meaningful, or whose bracket content
 * expands to no numbers, are skipped.
 *
 * @param string $content
 * @return array[] entries with keys reference_raw, reference_numbers, original_text,
 *                 reference_start, reference_end, text_start, text_end
 */
public function extractReferences($content)
{
    $matches = $this->collectBlueTagMatches($content);
    if (empty($matches[0])) {
        return [];
    }

    // First pass: byte span of every tag; the context extraction needs all of them.
    $tagSpans = [];
    foreach ($matches[0] as $index => $match) {
        $tagSpans[] = [
            'start' => $match[1],
            'end' => $match[1] + strlen($match[0]),
            'index' => $index,
        ];
    }

    // Second pass: one entry per usable tag.
    $references = [];
    foreach ($tagSpans as $span) {
        $tagStart = $span['start'];
        $tagEnd = $span['end'];
        $rawRef = trim($matches[1][$span['index']][0]);
        $numbers = $this->expandReferenceNumbers($rawRef);
        list($contextStart, $contextEnd, $contextText) = $this->extractLocalCitationContext(
            $content,
            $tagStart,
            $tagEnd,
            $tagSpans
        );
        if (empty($numbers) || !$this->isMeaningfulCitationContext($contextText)) {
            continue;
        }
        $references[] = [
            'reference_raw' => $rawRef,
            'reference_numbers' => $numbers,
            'original_text' => $contextText,
            'reference_start' => $tagStart,
            'reference_end' => $tagEnd,
            'text_start' => $contextStart,
            'text_end' => $contextEnd,
        ];
    }
    return $references;
}
/**
 * Cut the local context that belongs to one citation tag.
 *
 * The text in front of the tag is preferred. For the first citation of a paragraph
 * the context runs from the (capped) paragraph start to the tag; for later citations
 * in the same paragraph it starts at the beginning of the current sentence. Text
 * after the tag is only used for a paragraph's first citation, and only when the
 * text in front is missing or very short. If the result is still not meaningful the
 * bounds are widened through widenCitationContextBounds().
 *
 * @param string $content  full HTML/text
 * @param int    $tagStart byte offset where the tag starts
 * @param int    $tagEnd   byte offset just past the tag
 * @param array  $tagSpans spans of all tags; each has 'start' and 'end' byte offsets
 * @return array [localStart, localEnd, originalText]
 */
private function extractLocalCitationContext($content, $tagStart, $tagEnd, array $tagSpans)
{
$paragraphStart = $this->findParagraphStart($content, $tagStart);
$sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd);
// prevTagEnd: furthest end of a tag that finishes at/before this tag and after the paragraph start.
// nextTagStart: nearest start of a tag that begins after this tag and before the sentence end.
$prevTagEnd = $paragraphStart;
$nextTagStart = $sentenceEnd;
foreach ($tagSpans as $span) {
if ($span['end'] <= $tagStart && $span['end'] > $prevTagEnd) {
$prevTagEnd = $span['end'];
}
if ($span['start'] > $tagEnd && $span['start'] < $nextTagStart) {
$nextTagStart = $span['start'];
}
}
$hasPriorCiteInParagraph = ($prevTagEnd > $paragraphStart);
$sentenceStart = $this->findSentenceStart($content, $tagStart);
// First citation in the paragraph: whole paragraph up to the tag. Later citations: start of the
// current sentence (which may lie before the previous tag), so the context is not reduced to a
// fragment such as "and external environment" that would then wrongly fall back to trailing text.
if ($hasPriorCiteInParagraph) {
$localStart = max($paragraphStart, $sentenceStart);
} else {
$localStart = $this->capContextStartBeforeTag($content, $tagStart, $paragraphStart);
}
// Default: the statement in front of the citation tag.
$localEnd = $tagStart;
$originalText = $this->buildCitationContextText($content, $localStart, $localEnd);
// Trailing text is used only for the paragraph's first citation, and only when the text in front
// is extremely short (e.g. a sentence-final "ICU nurses [14]"). With several citations in one
// paragraph it is forbidden, because it would pick up the following sentence.
$allowTrailing = !$hasPriorCiteInParagraph;
if ($allowTrailing && (
!$this->isMeaningfulCitationContext($originalText)
|| $this->shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)
)) {
// The trailing fragment stops at the next tag or at the sentence end, whichever comes first.
$trailEnd = ($nextTagStart < $sentenceEnd) ? $nextTagStart : $sentenceEnd;
$trailText = $this->buildCitationContextText($content, $tagEnd, $trailEnd);
if ($this->isMeaningfulCitationContext($trailText)) {
$localStart = $tagEnd;
$localEnd = $trailEnd;
$originalText = $trailText;
}
}
// Still nothing usable: widen the bounds and rebuild the text.
if (!$this->isMeaningfulCitationContext($originalText)) {
list($localStart, $localEnd) = $this->widenCitationContextBounds(
$content,
$tagStart,
$tagEnd,
$localStart,
$localEnd
);
$originalText = $this->buildCitationContextText($content, $localStart, $localEnd);
}
return [$localStart, $localEnd, $originalText];
}
/**
 * Decide whether the text after the tag should replace the text in front of it:
 * true when the leading text is not meaningful or is shorter than 25 characters
 * (e.g. only an author abbreviation precedes the tag).
 *
 * @param string $content
 * @param int    $localStart
 * @param int    $tagStart
 * @param int    $tagEnd     unused here; kept for the call signature
 * @return bool
 */
private function shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)
{
    $leading = $this->buildCitationContextText($content, $localStart, $tagStart);
    return !$this->isMeaningfulCitationContext($leading) || mb_strlen($leading) < 25;
}
/**
 * Expand the content of a citation bracket ("1, 3-5") into a list of numbers.
 *
 * Typographic separators are normalised first: full-width / ideographic commas
 * become "," and the common dash variants (hyphen, non-breaking hyphen, figure
 * dash, en dash, em dash, minus sign, full-width hyphen) become "-". The table is
 * written with byte escapes on purpose: literal look-alike characters are easily
 * lost or altered by editors and tooling, and an empty search string is silently
 * ignored, which disables that normalisation.
 *
 * Parts that are neither a number nor an ascending "a-b" range are ignored.
 *
 * @param string $refStr raw bracket content
 * @return int[] unique numbers in order of first appearance
 */
public function expandReferenceNumbers($refStr)
{
    $separators = [
        "\xEF\xBC\x8C" => ',', // U+FF0C full-width comma
        "\xE3\x80\x81" => ',', // U+3001 ideographic comma
        "\xE2\x80\x90" => '-', // U+2010 hyphen
        "\xE2\x80\x91" => '-', // U+2011 non-breaking hyphen
        "\xE2\x80\x92" => '-', // U+2012 figure dash
        "\xE2\x80\x93" => '-', // U+2013 en dash
        "\xE2\x80\x94" => '-', // U+2014 em dash
        "\xE2\x88\x92" => '-', // U+2212 minus sign
        "\xEF\xBC\x8D" => '-', // U+FF0D full-width hyphen-minus
    ];
    $refStr = strtr(trim((string)$refStr), $separators);

    $numbers = [];
    foreach (explode(',', $refStr) as $part) {
        $part = trim($part);
        if ($part === '') {
            continue;
        }
        if (preg_match('/^(\d+)\s*-\s*(\d+)$/', $part, $m)) {
            $start = intval($m[1]);
            $end = intval($m[2]);
            // Descending ranges are dropped rather than reversed.
            if ($start <= $end) {
                $numbers = array_merge($numbers, range($start, $end));
            }
        } elseif (ctype_digit($part)) {
            $numbers[] = intval($part);
        }
    }
    return array_values(array_unique($numbers));
}
/**
 * Return the byte offset just past the UTF-8 code point located at $bytePos,
 * i.e. where the next character starts.
 *
 * For an out-of-range $bytePos the result is $bytePos + 1 clamped to
 * [0, strlen($content)].
 *
 * @param string $content
 * @param int    $bytePos
 * @return int
 */
private function utf8CharEnd($content, $bytePos)
{
    $len = strlen($content);
    if ($bytePos < 0 || $bytePos >= $len) {
        return max(0, min($len, $bytePos + 1));
    }
    // Skip continuation bytes (bit pattern 10xxxxxx) that follow the byte at $bytePos.
    for ($cursor = $bytePos + 1; $cursor < $len; $cursor++) {
        if ((ord($content[$cursor]) & 0xC0) !== 0x80) {
            break;
        }
    }
    return $cursor;
}
/**
 * Cut $content between two byte offsets (the same unit strpos/strlen use).
 *
 * mb_strcut() is used instead of mb_substr(): the offsets are bytes, and
 * mb_substr() would count characters, cutting into the leading letters of an
 * English word whenever multi-byte text precedes it.
 *
 * @param string $content
 * @param int    $start byte offset, inclusive
 * @param int    $end   byte offset, exclusive
 * @return string '' when $end <= $start
 */
private function byteSubstr($content, $start, $end)
{
    $byteLength = $end - $start;
    if ($byteLength <= 0) {
        return '';
    }
    return (string)mb_strcut($content, $start, $byteLength, 'UTF-8');
}
/**
 * Build the plain-text context for the byte range [$start, $end) of $content:
 * blue citation tags and all other markup are removed, whitespace runs collapse
 * to one space, and any leading UTF-8 BOM is dropped.
 *
 * The BOM is removed as the 3-byte sequence EF BB BF. ltrim() with that string as
 * its character list would treat it as a set of single bytes and strip the lead
 * byte 0xEF from any text that begins with a U+F000-U+FFFF character (for example
 * full-width punctuation), leaving invalid UTF-8 behind.
 *
 * @param string $content
 * @param int    $start byte offset, inclusive
 * @param int    $end   byte offset, exclusive
 * @return string
 */
private function buildCitationContextText($content, $start, $end)
{
    $text = $this->byteSubstr($content, $start, $end);
    $text = $this->pregReplaceBlueTags($text, '');
    $text = trim(strip_tags($text));
    $text = (string)preg_replace('/\s+/u', ' ', $text);
    while (strncmp($text, "\xEF\xBB\xBF", 3) === 0) {
        $text = (string)substr($text, 3);
    }
    return $text;
}
/**
 * Reject contexts that carry no information: blank text, text made only of
 * punctuation/symbols/whitespace (e.g. a lone "." left after removing the tag),
 * text without any letter or digit, and text shorter than 2 characters.
 *
 * @param string $text
 * @return bool
 */
private function isMeaningfulCitationContext($text)
{
    $candidate = trim($text);
    if ($candidate === '' || $this->isOnlyPunctuationOrSpace($candidate)) {
        return false;
    }
    $hasLetterOrDigit = preg_match('/[\p{L}\p{N}]/u', $candidate);
    if (!$hasLetterOrDigit) {
        return false;
    }
    return mb_strlen($candidate) >= 2;
}
/**
 * True when the text is non-empty and consists solely of whitespace,
 * punctuation (\p{P}) and symbol (\p{S}) characters.
 *
 * @param string $text
 * @return bool
 */
private function isOnlyPunctuationOrSpace($text)
{
    $hit = preg_match('/^[\s\p{P}\p{S}]+$/u', $text);
    return $hit === 1;
}
/**
 * Widen a too-short context by one sentence in each direction, capped at a
 * 2000-byte span (offsets are byte offsets).
 *
 * When the widened span exceeds the cap, a window of that size is placed around
 * the middle of the tag instead.
 *
 * @param string $content
 * @param int    $tagStart
 * @param int    $tagEnd
 * @param int    $start current context start
 * @param int    $end   current context end
 * @return int[] [start, end]
 */
private function widenCitationContextBounds($content, $tagStart, $tagEnd, $start, $end)
{
    $maxSpan = 2000;
    $contentLength = strlen($content);

    // Backwards: start of the sentence that precedes the current start.
    if ($start > 0) {
        $start = min($start, $this->findSentenceStart($content, max(0, $start - 1)));
    }

    // Forwards: end of the sentence that follows the current end.
    $sentenceEnd = $this->findSentenceEnd($content, $end, $tagEnd);
    if ($sentenceEnd > $end && $sentenceEnd <= $contentLength) {
        $end = $sentenceEnd;
    }

    if ($end - $start <= $maxSpan) {
        return [$start, $end];
    }

    $center = (int)floor(($tagStart + $tagEnd) / 2);
    $start = max(0, $center - (int)floor($maxSpan / 2));
    $end = min($contentLength, $start + $maxSpan);
    return [$start, $end];
}
/**
 * Decide whether the delimiter at byte offset $pos really ends a sentence.
 *
 * Only "." is ever rejected; any other delimiter, or an out-of-range position,
 * counts as a boundary. A period is NOT a boundary when it is
 *  - a decimal point (digit on both sides),
 *  - the period after an abbreviation: et al, e.g, i.e, vs, etc, fig, no,
 *  - the inner period of a dotted abbreviation such as "e.g." / "i.e." / "U.S."
 *    (a single letter before it, and a letter plus "." right after it),
 *  - directly followed by a blue citation tag, which belongs to the sentence.
 *
 * @param string $content
 * @param int    $pos       byte offset of the delimiter
 * @param string $delimiter the delimiter found at $pos
 * @return bool
 */
private function isSentenceDelimiterAt($content, $pos, $delimiter)
{
    $len = strlen($content);
    if ($delimiter !== '.' || $pos < 0 || $pos >= $len) {
        return true;
    }
    if ($pos > 0 && $pos + 1 < $len
        && ctype_digit($content[$pos - 1])
        && ctype_digit($content[$pos + 1])
    ) {
        return false;
    }
    // Up to 12 bytes in front of the period are enough to recognise the abbreviations.
    $before = (string)substr($content, max(0, $pos - 12), min(12, $pos));
    if (preg_match('/\b(et\s+al|e\.g|i\.e|vs|etc|fig|no)\s*\.?\s*$/i', $before)) {
        return false;
    }
    $after = (string)substr($content, $pos + 1, 24);
    // Inner period of "x.y.": without this the first period of "e.g." split the sentence,
    // because the text in front of it ends with just "e".
    if (preg_match('/(?:^|[^A-Za-z])[A-Za-z]$/', $before) && preg_match('/^[A-Za-z]\./', $after)) {
        return false;
    }
    if (preg_match('/^\s*<blue>\s*\[/', $after)) {
        return false;
    }
    return true;
}
/**
 * Find the byte offset where the paragraph containing the tag starts.
 *
 * The latest of the following, searched in the text before $tagStart, wins: the end
 * of the last opening <p ...> tag, of the last </p> or <br> (each including
 * trailing whitespace), or the position after the last line break. 0 when none is
 * found. Working on paragraphs keeps a multi-sentence English paragraph from being
 * reduced to the text after its last period.
 *
 * @param string $content
 * @param int    $tagStart
 * @return int
 */
private function findParagraphStart($content, $tagStart)
{
    $head = substr($content, 0, max(0, $tagStart));
    if ($head === '') {
        return 0;
    }
    $best = 0;

    // HTML boundaries: the end offset of the last match of each pattern.
    foreach (['/<p[^>]*>/i', '/<\/p>\s*/i', '/<br\s*\/?>\s*/i'] as $pattern) {
        if (preg_match_all($pattern, $head, $hits, PREG_OFFSET_CAPTURE)) {
            $last = end($hits[0]);
            $best = max($best, $last[1] + strlen($last[0]));
        }
    }

    // Plain-text boundaries: blank line, then single line break.
    foreach (["\n\n", "\n"] as $lineBreak) {
        $at = strrpos($head, $lineBreak);
        if ($at !== false) {
            $best = max($best, $at + strlen($lineBreak));
        }
    }
    return $best;
}
/**
 * Cap how far back the context may reach from the citation tag, so that a very
 * long paragraph does not produce an oversized LLM context.
 *
 * When the paragraph start is within $maxBytes of the tag it is returned
 * unchanged. Otherwise the window of the last $maxBytes bytes before the tag is
 * used, and the start is moved to just after the first sentence terminator
 * (followed by whitespace) inside that window, if there is one.
 *
 * The window start is first advanced past UTF-8 continuation bytes. A window that
 * begins in the middle of a multi-byte character is invalid UTF-8, which makes the
 * /u pattern fail and silently skips the sentence alignment.
 *
 * @param string $content
 * @param int    $tagStart       byte offset of the tag
 * @param int    $paragraphStart byte offset of the paragraph start
 * @param int    $maxBytes       maximum context size in bytes
 * @return int byte offset at which the context should start
 */
private function capContextStartBeforeTag($content, $tagStart, $paragraphStart, $maxBytes = 2500)
{
    if ($tagStart - $paragraphStart <= $maxBytes) {
        return $paragraphStart;
    }
    $start = $tagStart - $maxBytes;
    // Align to the next character boundary (continuation bytes look like 10xxxxxx).
    while ($start < $tagStart && isset($content[$start]) && (ord($content[$start]) & 0xC0) === 0x80) {
        $start++;
    }
    $slice = substr($content, $start, $tagStart - $start);
    if (preg_match('/[.!?。!?]\s+/u', $slice, $m, PREG_OFFSET_CAPTURE)) {
        $rel = $m[0][1] + strlen($m[0][0]);
        return $start + $rel;
    }
    return max($paragraphStart, $start);
}
/**
 * Find the byte offset at which the sentence containing $position starts: the
 * position just after the nearest preceding sentence delimiter, or 0 when there
 * is none.
 *
 * For each delimiter the search walks backwards until it reaches an occurrence
 * that isSentenceDelimiterAt() accepts. Looking only at the last occurrence is not
 * enough: when that one is rejected (decimal point, "et al.", ...), the earlier
 * real sentence boundaries would be ignored altogether.
 *
 * @param string $content
 * @param int    $position byte offset; only text before it is searched
 * @return int
 */
private function findSentenceStart($content, $position)
{
    $head = (string)substr($content, 0, $position);
    $start = 0;
    foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
        $pos = strrpos($head, $delimiter);
        while ($pos !== false && !$this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
            // Rejected occurrence: continue with the text in front of it.
            $pos = ($pos > 0) ? strrpos((string)substr($head, 0, $pos), $delimiter) : false;
        }
        if ($pos !== false) {
            $start = max($start, $this->utf8CharEnd($content, $pos));
        }
    }
    return $start;
}
/**
 * Find the byte offset just past the end of the sentence, searching forward.
 *
 * For each delimiter the search skips occurrences that isSentenceDelimiterAt()
 * rejects (decimal point, "et al.", ...) and continues with the next one. Stopping
 * at the first occurrence would drop that delimiter for the whole pass and push
 * the sentence end to an unrelated later delimiter or to the end of the content.
 *
 * A candidate end that lies after $tagEnd is accepted only if the text between the
 * tag and that end contains more than punctuation/whitespace; otherwise (e.g. a
 * lone period right after the closing blue tag) the search continues from there.
 *
 * @param string $content
 * @param int    $searchFrom byte offset from which to look for the sentence end
 * @param int    $tagEnd     byte offset just past the citation tag; 0 disables the
 *                           lone-punctuation skip
 * @return int byte offset just past the sentence end, or strlen($content)
 */
private function findSentenceEnd($content, $searchFrom, $tagEnd = 0)
{
    $length = strlen($content);
    $minPos = max(0, $searchFrom);
    while ($minPos < $length) {
        $endPositions = [];
        foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
            $pos = strpos($content, $delimiter, $minPos);
            while ($pos !== false && !$this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
                // Rejected occurrence: try the next one of the same delimiter.
                $pos = strpos($content, $delimiter, $pos + 1);
            }
            if ($pos !== false) {
                $endPositions[] = $this->utf8CharEnd($content, $pos);
            }
        }
        if (empty($endPositions)) {
            return $length;
        }
        $end = min($endPositions);
        if ($tagEnd <= 0 || $end <= $tagEnd) {
            return $end;
        }
        $gap = substr($content, $tagEnd, $end - $tagEnd);
        $gapText = trim(strip_tags($this->pregReplaceBlueTags($gap, '')));
        if ($gapText !== '' && !$this->isOnlyPunctuationOrSpace($gapText)) {
            return $end;
        }
        // Only punctuation between the tag and this end: keep looking further on.
        $minPos = $end;
    }
    return $length;
}
/**
 * After a batch of check records has been stored, create the article batch and
 * hand it to startArticleCheckQueue() (which delivers it to RabbitMQ).
 *
 * Note: no sorting happens in this method; the ids keep the order of $rows.
 *
 * @param array  $rows       elements containing check_id
 * @param int    $pArticleId
 * @param string $trigger    enqueue|recheck_failed|manual
 * @return int[] the positive check_id values found in $rows
 */
private function enqueueChecksSortedByReferenceNo(array $rows, $pArticleId = 0, $trigger = 'enqueue')
{
    $ids = array_map(function ($row) {
        return intval($row['check_id']);
    }, $rows);
    $checkIds = array_values(array_filter($ids, function ($id) {
        return $id > 0;
    }));
    if ($checkIds) {
        $this->startArticleCheckQueue($checkIds, intval($pArticleId), $trigger);
    }
    return $checkIds;
}
/**
 * Create an article batch row. The batch is published to MQ right away only when
 * no earlier batch is still waiting and no batch is running; otherwise it stays
 * queued until the preceding batches have finished.
 *
 * When $pArticleId is not positive, it is looked up from the first check record.
 *
 * @param int[]  $checkIds
 * @param int    $pArticleId
 * @param string $trigger
 * @return int[] the normalised (integer, non-zero) check ids; [] when none were given
 * @throws \RuntimeException when no article id can be determined
 */
public function startArticleCheckQueue(array $checkIds, $pArticleId = 0, $trigger = 'enqueue')
{
    $checkIds = array_values(array_filter(array_map('intval', $checkIds)));
    if (empty($checkIds)) {
        return [];
    }

    $articleId = intval($pArticleId);
    if ($articleId <= 0) {
        $firstRow = Db::name('article_reference_check_result')->where('id', $checkIds[0])->find();
        $articleId = empty($firstRow) ? 0 : intval($this->arrGet($firstRow, 'p_article_id', 0));
    }
    if ($articleId <= 0) {
        throw new \RuntimeException('p_article_id is required for reference check queue');
    }

    $timestamp = date('Y-m-d H:i:s');
    $batchRow = [
        'p_article_id' => $articleId,
        'batch_status' => 0,
        'total_count' => count($checkIds),
        'done_count' => 0,
        'failed_count' => 0,
        'trigger' => (string)$trigger,
        'created_at' => $timestamp,
        'updated_at' => $timestamp,
    ];
    $batchId = Db::name('article_reference_check_batch')->insertGetId($batchRow);

    $mustWait = $this->hasEarlierWaitingBatch($batchId) || $this->hasRunningReferenceCheckBatch();
    if ($mustWait) {
        $this->log('startArticleCheckQueue queued batch_id=' . $batchId . ' p_article_id=' . $articleId);
        return $checkIds;
    }

    (new ReferenceCheckMqPublisher())->publishArticleStart($articleId, intval($batchId), $trigger);
    $this->log('startArticleCheckQueue publish p_article_id=' . $articleId . ' batch_id=' . $batchId);
    return $checkIds;
}
/**
 * Whether any row of article_reference_check_batch has batch_status = 1.
 *
 * @return bool
 */
private function hasRunningReferenceCheckBatch()
{
    $running = Db::name('article_reference_check_batch')
        ->where('batch_status', 1)
        ->count();
    return $running > 0;
}
/**
 * Whether a batch with batch_status = 0 and an id lower than $batchId exists.
 *
 * @param mixed $batchId
 * @return bool
 */
private function hasEarlierWaitingBatch($batchId)
{
    $earlier = Db::name('article_reference_check_batch')
        ->where('batch_status', 0)
        ->where('id', '<', intval($batchId))
        ->count();
    return $earlier > 0;
}
/**
 * Append one timestamped line to the service log file.
 * Write errors are suppressed on purpose (best-effort logging).
 *
 * @param string $msg
 */
public function log($msg)
{
    $timestamp = date('Y-m-d H:i:s');
    @file_put_contents($this->logFile, $timestamp . ' ' . $msg . PHP_EOL, FILE_APPEND);
}
}