文献校对功能完善
This commit is contained in:
@@ -15,12 +15,20 @@ class ReferenceCheckService
|
||||
{
|
||||
const QUEUE_NAME = 'ReferenceCheck';
|
||||
|
||||
/** t_article_main.ref_check_status */
|
||||
/** t_article_main.type */
|
||||
const MAIN_TYPE_TEXT = 0;
|
||||
const MAIN_TYPE_IMAGE = 1;
|
||||
const MAIN_TYPE_TABLE = 2;
|
||||
|
||||
/** t_article_main.ref_check_status(需执行 sql/article_main_ref_check_status.sql) */
|
||||
const AM_STATUS_NONE = 0;
|
||||
const AM_STATUS_PASS = 1;
|
||||
const AM_STATUS_FAIL = 2;
|
||||
const AM_STATUS_RUNNING = 3;
|
||||
|
||||
/** @var bool|null t_article_main 是否已有 ref_check_status 列 */
|
||||
private static $amRefCheckStatusColumnExists = null;
|
||||
|
||||
/**
|
||||
* 引用校对状态(生命周期顺序:0→1→2→3 = 待→进行→完成→失败)
|
||||
*
|
||||
@@ -52,20 +60,14 @@ class ReferenceCheckService
|
||||
const PASS_CONFIDENCE_THRESHOLD = 0.65;
|
||||
|
||||
/**
|
||||
* <blue>[...]</blue> 引用标签内允许的字符类(带 /u 修饰符使用)。
|
||||
* 正文引用标签两种排版(带 /u):
|
||||
* 1) <blue>[8, 9]</blue>、<blue>[13-15]</blue> —— 方括号在 blue 内
|
||||
* 2) [<blue>13-15</blue>] —— 方括号包裹 blue
|
||||
*
|
||||
* 除 ASCII 数字、半角逗号、半角连字符、空白外,还兼容常见排版变体:
|
||||
* , U+FF0C 全角逗号
|
||||
* – U+2013 EN DASH
|
||||
* — U+2014 EM DASH
|
||||
* − U+2212 MINUS SIGN
|
||||
* ‐ U+2010 HYPHEN
|
||||
* ‑ U+2011 NON-BREAKING HYPHEN
|
||||
*
|
||||
* 若不支持变体连字符,会导致 [19–21] 这种区间引用整段被 preg 漏掉,
|
||||
* 进而丢失对应的 reference_no 校对记录。
|
||||
* 捕获组均为序号串(可含逗号、区间连字符及排版变体)。
|
||||
*/
|
||||
const BLUE_TAG_REGEX = '/<blue>\[([\d,,\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)\]<\/blue>/u';
|
||||
const BLUE_TAG_REGEX_BRACKET_OUTSIDE = '/\[<blue>([\d,,\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)<\/blue>\]/u';
|
||||
|
||||
/**
|
||||
* 兼容无 ?? 的 PHP 版本
|
||||
@@ -75,6 +77,46 @@ class ReferenceCheckService
|
||||
return isset($arr[$key]) ? $arr[$key] : $default;
|
||||
}
|
||||
|
||||
/**
 * Run both blue-citation patterns against the content and merge the hits,
 * ordered by the byte offset at which each full match starts.
 *
 * Both patterns are executed with PREG_OFFSET_CAPTURE, so every returned
 * entry is a [matchedText, byteOffset] pair.
 *
 * @param string $content text to scan
 * @return array{0: array, 1: array} index 0 holds the full matches, index 1 holds
 *                                   capture group 1 (same layout as preg_match_all)
 */
private function collectBlueTagMatches($content)
{
    $patterns = [self::BLUE_TAG_REGEX, self::BLUE_TAG_REGEX_BRACKET_OUTSIDE];

    $hits = [];
    foreach ($patterns as $regex) {
        $found = preg_match_all($regex, $content, $groups, PREG_OFFSET_CAPTURE);
        if ($found) {
            // A pattern that matched nothing (or failed) contributes no hits.
            foreach ($groups[0] as $idx => $whole) {
                $hits[] = ['full' => $whole, 'inner' => $groups[1][$idx]];
            }
        }
    }

    // Order by the start offset of the full match.
    usort($hits, function ($left, $right) {
        return $left['full'][1] - $right['full'][1];
    });

    $fullMatches = [];
    $innerMatches = [];
    foreach ($hits as $hit) {
        $fullMatches[] = $hit['full'];
        $innerMatches[] = $hit['inner'];
    }

    return [$fullMatches, $innerMatches];
}
|
||||
|
||||
/**
 * Apply preg_replace for both blue-citation layouts, one after the other.
 *
 * preg_replace() returns null when the PCRE engine reports an error (for
 * example malformed UTF-8 while the /u modifier is active). In that case the
 * text from before the failed pass is kept, so a failing pattern can never
 * wipe out the subject.
 *
 * @param string $subject     text to process
 * @param string $replacement replacement string handed to preg_replace
 * @return string text after both passes; a failed pass leaves the text unchanged
 */
private function pregReplaceBlueTags($subject, $replacement)
{
    foreach ([self::BLUE_TAG_REGEX, self::BLUE_TAG_REGEX_BRACKET_OUTSIDE] as $pattern) {
        $replaced = preg_replace($pattern, $replacement, $subject);
        if ($replaced !== null) {
            $subject = $replaced;
        }
    }

    return $subject;
}
|
||||
|
||||
/**
|
||||
* 单条入队(可手工指定正文与文献文本)
|
||||
*/
|
||||
@@ -115,14 +157,18 @@ class ReferenceCheckService
|
||||
return ['check_id' => $checkId, 'queued' => 1];
|
||||
}
|
||||
public function enqueueByArticleMain($main){
|
||||
$amId = $main['am_id'];
|
||||
// $main = Db::name('article_main')
|
||||
// ->field('am_id,content,article_id')
|
||||
// ->where('am_id', $amId)
|
||||
// ->whereIn('state', [0, 2])
|
||||
// ->find();
|
||||
$citations = $this->extractReferences((string)$main['content']);
|
||||
// return $citations;
|
||||
$amId = intval($this->arrGet($main, 'am_id', 0));
|
||||
if ($amId > 0 && (!isset($main['type']) || (intval($main['type']) === self::MAIN_TYPE_TABLE && intval($this->arrGet($main, 'amt_id', 0)) <= 0))) {
|
||||
$dbMain = Db::name('article_main')
|
||||
->field('am_id,content,article_id,type,amt_id')
|
||||
->where('am_id', $amId)
|
||||
->whereIn('state', [0, 2])
|
||||
->find();
|
||||
if (!empty($dbMain)) {
|
||||
$main = array_merge($dbMain, $main);
|
||||
}
|
||||
}
|
||||
$citations = $this->extractReferencesForArticleMain($main);
|
||||
if (empty($citations)) {
|
||||
$this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
|
||||
return;
|
||||
@@ -222,7 +268,7 @@ class ReferenceCheckService
|
||||
$referMap = $this->loadReferMapByPArticleId($pArticleId);
|
||||
|
||||
$mains = Db::name('article_main')
|
||||
->field('am_id,content,article_id')
|
||||
->field('am_id,content,article_id,type,amt_id')
|
||||
->where('article_id', $articleId)
|
||||
->whereIn('state', [0, 2])
|
||||
->order('sort asc')
|
||||
@@ -237,7 +283,7 @@ class ReferenceCheckService
|
||||
$now = date('Y-m-d H:i:s');
|
||||
foreach ($mains as $main) {
|
||||
$amId = intval($main['am_id']);
|
||||
$citations = $this->extractReferences((string)$main['content']);
|
||||
$citations = $this->extractReferencesForArticleMain($main);
|
||||
if (empty($citations)) {
|
||||
$this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
|
||||
continue;
|
||||
@@ -309,7 +355,7 @@ class ReferenceCheckService
|
||||
$referMap = $this->loadReferMapByPArticleId($pArticleId);
|
||||
|
||||
$mains = Db::name('article_main')
|
||||
->field('am_id,content,article_id')
|
||||
->field('am_id,content,article_id,type,amt_id')
|
||||
->where('article_id', $articleId)
|
||||
->whereIn('state', [0, 2])
|
||||
->order('sort asc')
|
||||
@@ -324,7 +370,7 @@ class ReferenceCheckService
|
||||
$now = date('Y-m-d H:i:s');
|
||||
foreach ($mains as $main) {
|
||||
$amId = intval($main['am_id']);
|
||||
$citations = $this->extractReferences((string)$main['content']);
|
||||
$citations = $this->extractReferencesForArticleMain($main);
|
||||
if (empty($citations)) {
|
||||
$this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
|
||||
continue;
|
||||
@@ -429,9 +475,27 @@ class ReferenceCheckService
|
||||
return $status;
|
||||
}
|
||||
|
||||
/**
 * Whether the article_main table already has the ref_check_status column.
 *
 * Callers use this to skip writes when the migration has not been applied.
 * A definite answer is memoised in a static property for the rest of the
 * process. If the probe itself throws, this call reports false but nothing
 * is cached, so a transient database error cannot switch off status writes
 * for the whole lifetime of the process.
 *
 * @return bool true when the column exists; false when it is missing or the probe failed
 */
private function hasAmRefCheckStatusColumn()
{
    if (self::$amRefCheckStatusColumnExists !== null) {
        return self::$amRefCheckStatusColumnExists;
    }

    try {
        $table = Db::name('article_main')->getTable();
        // Backticks in the table name are doubled so the identifier stays quoted.
        $rows = Db::query('SHOW COLUMNS FROM `' . str_replace('`', '``', $table) . '` LIKE \'ref_check_status\'');
    } catch (\Exception $e) {
        // Unknown state: answer conservatively and probe again on the next call.
        return false;
    }

    self::$amRefCheckStatusColumnExists = !empty($rows);

    return self::$amRefCheckStatusColumnExists;
}
|
||||
|
||||
public function setAmRefCheckStatus($amId, $status)
|
||||
{
|
||||
if ($amId <= 0) {
|
||||
if ($amId <= 0 || !$this->hasAmRefCheckStatusColumn()) {
|
||||
return;
|
||||
}
|
||||
Db::name('article_main')->where('am_id', $amId)->update([
|
||||
@@ -472,7 +536,7 @@ class ReferenceCheckService
|
||||
->where('p_article_id', $pArticleId)
|
||||
->delete();
|
||||
|
||||
if ($articleId > 0) {
|
||||
if ($articleId > 0 && $this->hasAmRefCheckStatusColumn()) {
|
||||
Db::name('article_main')
|
||||
->where('article_id', $articleId)
|
||||
->whereIn('state', [0, 2])
|
||||
@@ -498,10 +562,12 @@ class ReferenceCheckService
|
||||
}
|
||||
|
||||
$deleted = Db::name('article_reference_check_result')->where('article_id', $articleId)->delete();
|
||||
Db::name('article_main')
|
||||
->where('article_id', $articleId)
|
||||
->whereIn('state', [0, 2])
|
||||
->update(['ref_check_status' => self::AM_STATUS_NONE]);
|
||||
if ($this->hasAmRefCheckStatusColumn()) {
|
||||
Db::name('article_main')
|
||||
->where('article_id', $articleId)
|
||||
->whereIn('state', [0, 2])
|
||||
->update(['ref_check_status' => self::AM_STATUS_NONE]);
|
||||
}
|
||||
|
||||
return intval($deleted);
|
||||
}
|
||||
@@ -669,6 +735,68 @@ class ReferenceCheckService
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Report how many articles are queued ahead of the given one when several
 * articles are being checked at once.
 *
 * An article counts as queued while it has at least one detail row whose
 * status is RECORD_PENDING. Articles are ordered by the smallest id among
 * their pending rows, ascending.
 *
 * @param int $pArticleId
 * @return array{
 *   p_article_id:int,
 *   running_total:int,
 *   ahead:int,
 *   position:int,
 *   in_queue:bool,
 *   status:int
 * } running_total = number of queued articles; ahead/position are 0 when the
 *   article is not queued; status comes from getArticleProgressStatusByPArticleId
 * @throws \InvalidArgumentException when $pArticleId is not a positive integer
 */
public function getArticleCheckQueuePositionByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        throw new \InvalidArgumentException('p_article_id is required');
    }

    $anchors = Db::name('article_reference_check_result')
        ->field('p_article_id, MIN(id) AS queue_anchor')
        ->where('status', self::RECORD_PENDING)
        ->group('p_article_id')
        ->order('queue_anchor', 'asc')
        ->select();

    // Single pass: count the valid queued articles and remember the slot of ours.
    $queuedCount = 0;
    $slot = -1;
    foreach ($anchors as $anchor) {
        $queuedId = intval($this->arrGet($anchor, 'p_article_id', 0));
        if ($queuedId <= 0) {
            continue;
        }
        if ($slot < 0 && $queuedId === $pArticleId) {
            $slot = $queuedCount;
        }
        $queuedCount++;
    }
    $found = $slot >= 0;

    $progress = $this->getArticleProgressStatusByPArticleId($pArticleId);

    return [
        'p_article_id' => $pArticleId,
        'running_total' => $queuedCount,
        'ahead' => $found ? $slot : 0,
        'position' => $found ? $slot + 1 : 0,
        'in_queue' => $found,
        'status' => intval($this->arrGet($progress, 'status', self::ARTICLE_PROGRESS_NONE)),
    ];
}
|
||||
|
||||
/**
|
||||
* 按 p_article_id 查整篇引用校对进度,按 reference_no 分组聚合状态,并展开每条明细。
|
||||
*
|
||||
@@ -820,17 +948,16 @@ class ReferenceCheckService
|
||||
}
|
||||
|
||||
/**
|
||||
* 按 p_refer_id 查这条参考文献的所有校对明细。
|
||||
* 按 p_refer_id 查这条参考文献的校对明细与分组进度。
|
||||
*
|
||||
* 每条 record 返回:
|
||||
* - am_id 命中的 article_main 主键
|
||||
* - confidence 匹配置信度(0~1)
|
||||
* - reason LLM 给出的判定理由
|
||||
* - is_match 是否匹配(来自 article_reference_check_result.is_match)
|
||||
* - is_pass 是否通过校验(confidence >= PASS_CONFIDENCE_THRESHOLD)
|
||||
* 分组进度(与 referenceCheckProgressAI 单条 list 项口径一致):
|
||||
* progress_status 0待校验 1校对中 2完成 3失败
|
||||
* pending/done/failed/pass、is_pass、progress_percent
|
||||
*
|
||||
* list 每项:check_id、am_id、status、confidence、reason、is_match、is_pass
|
||||
*
|
||||
* @param int $pReferId production_article_refer.p_refer_id
|
||||
* @return array{p_refer_id:int, p_article_id:int, reference_no:int, total:int, list:array}
|
||||
* @return array
|
||||
*/
|
||||
public function getCheckDetailsByPReferId($pReferId)
|
||||
{
|
||||
@@ -840,7 +967,7 @@ class ReferenceCheckService
|
||||
}
|
||||
|
||||
$rows = Db::name('article_reference_check_result')
|
||||
->field('id,p_article_id,reference_no,am_id,confidence,is_match,reason')
|
||||
->field('id,p_article_id,reference_no,am_id,status,confidence,is_match,reason,updated_at')
|
||||
->where('p_refer_id', $pReferId)
|
||||
->order('id asc')
|
||||
->select();
|
||||
@@ -848,8 +975,13 @@ class ReferenceCheckService
|
||||
$list = [];
|
||||
$pArticleId = 0;
|
||||
$referenceNo = 0;
|
||||
$pending = 0;
|
||||
$done = 0;
|
||||
$failed = 0;
|
||||
$pass = 0;
|
||||
$lastUpdatedAt = '';
|
||||
|
||||
foreach ($rows as $row) {
|
||||
// 取首条出现的 p_article_id / reference_no 作为该 refer 的上下文
|
||||
if ($pArticleId <= 0) {
|
||||
$pArticleId = intval($this->arrGet($row, 'p_article_id', 0));
|
||||
}
|
||||
@@ -857,22 +989,87 @@ class ReferenceCheckService
|
||||
$referenceNo = intval($this->arrGet($row, 'reference_no', 0));
|
||||
}
|
||||
|
||||
$st = intval($this->arrGet($row, 'status', 0));
|
||||
if ($st === self::RECORD_PENDING) {
|
||||
$pending++;
|
||||
} elseif ($st === self::RECORD_COMPLETED) {
|
||||
$done++;
|
||||
} elseif ($st === self::RECORD_FAILED) {
|
||||
$failed++;
|
||||
}
|
||||
|
||||
$upd = (string)$this->arrGet($row, 'updated_at', '');
|
||||
if ($upd > $lastUpdatedAt) {
|
||||
$lastUpdatedAt = $upd;
|
||||
}
|
||||
|
||||
$confidence = floatval($this->arrGet($row, 'confidence', 0));
|
||||
$isPass = $confidence >= self::PASS_CONFIDENCE_THRESHOLD;
|
||||
if ($isPass) {
|
||||
$pass++;
|
||||
}
|
||||
|
||||
$list[] = [
|
||||
'check_id' => intval($this->arrGet($row, 'id', 0)),
|
||||
'am_id' => intval($this->arrGet($row, 'am_id', 0)),
|
||||
'status' => $st,
|
||||
'confidence' => $confidence,
|
||||
'reason' => (string)$this->arrGet($row, 'reason', ''),
|
||||
'is_match' => intval($this->arrGet($row, 'is_match', 0)),
|
||||
'is_pass' => $confidence >= self::PASS_CONFIDENCE_THRESHOLD,
|
||||
'is_pass' => $isPass,
|
||||
];
|
||||
}
|
||||
|
||||
if ($referenceNo <= 0) {
|
||||
$refer = Db::name('production_article_refer')
|
||||
->where('p_refer_id', $pReferId)
|
||||
->where('state', 0)
|
||||
->find();
|
||||
if (!empty($refer)) {
|
||||
if ($pArticleId <= 0) {
|
||||
$pArticleId = intval($this->arrGet($refer, 'p_article_id', 0));
|
||||
}
|
||||
$referenceNo = intval($this->arrGet($refer, 'index', 0)) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
$total = count($list);
|
||||
if ($total === 0) {
|
||||
$progressStatus = self::PROGRESS_PENDING;
|
||||
$progressPercent = 0;
|
||||
$isPassGroup = false;
|
||||
} elseif ($pending === $total) {
|
||||
$progressStatus = self::PROGRESS_PENDING;
|
||||
$progressPercent = 0;
|
||||
$isPassGroup = false;
|
||||
} elseif ($pending === 0) {
|
||||
$progressStatus = $failed > 0 ? self::PROGRESS_FAILED : self::PROGRESS_COMPLETED;
|
||||
$progressPercent = 100;
|
||||
$isPassGroup = (
|
||||
$progressStatus === self::PROGRESS_COMPLETED
|
||||
&& $pass === $total
|
||||
);
|
||||
} else {
|
||||
$progressStatus = self::PROGRESS_CHECKING;
|
||||
$finished = $done + $failed;
|
||||
$progressPercent = round($finished / $total * 100, 1);
|
||||
$isPassGroup = false;
|
||||
}
|
||||
|
||||
return [
|
||||
'p_refer_id' => $pReferId,
|
||||
'p_article_id' => $pArticleId,
|
||||
'reference_no' => $referenceNo,
|
||||
'total' => count($list),
|
||||
'list' => $list,
|
||||
'p_refer_id' => $pReferId,
|
||||
'p_article_id' => $pArticleId,
|
||||
'reference_no' => $referenceNo,
|
||||
'total' => $total,
|
||||
'pending' => $pending,
|
||||
'done' => $done,
|
||||
'failed' => $failed,
|
||||
'pass' => $pass,
|
||||
'progress_status' => $progressStatus,
|
||||
'progress_percent' => $progressPercent,
|
||||
'is_pass' => $isPassGroup,
|
||||
'last_updated_at' => $lastUpdatedAt,
|
||||
'list' => $list,
|
||||
];
|
||||
}
|
||||
|
||||
@@ -1010,8 +1207,12 @@ class ReferenceCheckService
|
||||
*/
|
||||
public function buildArticlePreview($articleId, $amId = 0)
|
||||
{
|
||||
$fields = 'am_id,content,sort,type,amt_id';
|
||||
if ($this->hasAmRefCheckStatusColumn()) {
|
||||
$fields .= ',ref_check_status';
|
||||
}
|
||||
$q = Db::name('article_main')
|
||||
->field('am_id,content,sort,ref_check_status')
|
||||
->field($fields)
|
||||
->where('article_id', $articleId)
|
||||
->whereIn('state', [0, 2]);
|
||||
if ($amId > 0) {
|
||||
@@ -1039,7 +1240,7 @@ class ReferenceCheckService
|
||||
|
||||
foreach ($mains as $main) {
|
||||
$id = intval($main['am_id']);
|
||||
$content = (string)$main['content'];
|
||||
$content = $this->resolveArticleMainCheckContent($main);
|
||||
$badIndex = isset($badByAm[$id]) ? $badByAm[$id] : array();
|
||||
$marked = $this->markContentForPreview($content, $id, $badIndex);
|
||||
$amStatus = intval($this->arrGet($main, 'ref_check_status', 0));
|
||||
@@ -1158,12 +1359,7 @@ class ReferenceCheckService
|
||||
$html = $content;
|
||||
|
||||
// 1) 先标记 blue 内各序号(在原文上操作,[70-73] 仅标不合理者如 70、71)
|
||||
preg_match_all(
|
||||
self::BLUE_TAG_REGEX,
|
||||
$html,
|
||||
$matches,
|
||||
PREG_OFFSET_CAPTURE
|
||||
);
|
||||
$matches = $this->collectBlueTagMatches($html);
|
||||
$citeDeltas = [];
|
||||
if (!empty($matches[0])) {
|
||||
$replacements = [];
|
||||
@@ -1318,14 +1514,6 @@ class ReferenceCheckService
|
||||
return implode("\n", $parts);
|
||||
}
|
||||
|
||||
/**
|
||||
* 前端修改参考文献后重新校对:仅处理已有校对记录,刷新 refer_text、重置结果并入队;无记录直接返回
|
||||
*
|
||||
* @param int $articleId
|
||||
* @param int $pReferId t_production_article_refer.p_refer_id(优先)
|
||||
* @param int $referenceNo 文献序号 index+1(无 p_refer_id 时用)
|
||||
* @return array
|
||||
*/
|
||||
/**
|
||||
* 编辑某条文献内容后,按 p_refer_id 异步重新校对该文献对应的全部 check 明细
|
||||
*
|
||||
@@ -1387,7 +1575,7 @@ class ReferenceCheckService
|
||||
'refer_text' => $referText,
|
||||
'refer_index' => $referenceNo,
|
||||
'reference_no' => $referenceNo,
|
||||
'status' => 0,
|
||||
'status' => self::RECORD_PENDING,
|
||||
'is_match' => 0,
|
||||
'can_support' => 0,
|
||||
'confidence' => 0,
|
||||
@@ -1401,7 +1589,6 @@ class ReferenceCheckService
|
||||
foreach ($rows as $row) {
|
||||
$checkId = $this->resolveCheckRowId($row);
|
||||
Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields);
|
||||
// 旧的队列完成标记必须清掉,否则同 check_id 再次投递会被 acquireLock 静默丢弃
|
||||
$this->clearReferenceCheckQueueLock($checkId);
|
||||
$pendingJobs[] = [
|
||||
'check_id' => $checkId,
|
||||
@@ -1432,6 +1619,92 @@ class ReferenceCheckService
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Re-queue the failed check rows (status RECORD_FAILED) that belong to one
 * reference entry.
 *
 * Only the result fields are reset (status, is_match, can_support,
 * confidence, reason, error_msg, updated_at); refer_text and reference_no
 * stored on each row are left as they are.
 *
 * @param int $pReferId   p_refer_id of the reference entry (required)
 * @param int $pArticleId optional extra filter on p_article_id; when omitted it is
 *                        taken from the first matching row
 * @return array{p_refer_id:int, p_article_id:int, reset:int, queued:int, check_ids:int[], queue:string}
 * @throws \InvalidArgumentException when $pReferId is not a positive integer
 */
public function enqueueRecheckFailedByPReferId($pReferId, $pArticleId = 0)
{
    $pReferId = intval($pReferId);
    $pArticleId = intval($pArticleId);
    if ($pReferId <= 0) {
        throw new \InvalidArgumentException('p_refer_id is required');
    }

    $query = Db::name('article_reference_check_result')
        ->where('p_refer_id', $pReferId)
        ->where('status', self::RECORD_FAILED);
    if ($pArticleId > 0) {
        $query->where('p_article_id', $pArticleId);
    }
    $failedRows = $query->select();

    $resetCount = 0;
    $queuedIds = [];

    if (!empty($failedRows)) {
        if ($pArticleId <= 0) {
            $pArticleId = intval($this->arrGet($failedRows[0], 'p_article_id', 0));
        }

        $blankResult = [
            'status' => self::RECORD_PENDING,
            'is_match' => 0,
            'can_support' => 0,
            'confidence' => 0,
            'reason' => '',
            'error_msg' => '',
            'updated_at' => date('Y-m-d H:i:s'),
        ];

        $jobs = [];
        $touchedSections = [];
        foreach ($failedRows as $failedRow) {
            $checkId = $this->resolveCheckRowId($failedRow);
            $sectionId = intval($this->arrGet($failedRow, 'am_id', 0));

            Db::name('article_reference_check_result')->where('id', $checkId)->update($blankResult);
            // NOTE(review): presumably removes a stale queue lock so the same
            // check_id can be delivered again - confirm in clearReferenceCheckQueueLock.
            $this->clearReferenceCheckQueueLock($checkId);

            $jobs[] = [
                'check_id' => $checkId,
                'reference_no' => intval($this->arrGet($failedRow, 'reference_no', 0)),
                'am_id' => $sectionId,
                'text_start' => intval($this->arrGet($failedRow, 'text_start', 0)),
            ];
            if ($sectionId > 0) {
                // Keyed by id so each section is flagged only once.
                $touchedSections[$sectionId] = true;
            }
        }

        foreach (array_keys($touchedSections) as $sectionId) {
            $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
        }

        $queuedIds = $this->pushJobsSortedByReferenceNo($jobs);
        $resetCount = count($failedRows);
    }

    return [
        'p_refer_id' => $pReferId,
        'p_article_id' => $pArticleId,
        'reset' => $resetCount,
        'queued' => count($queuedIds),
        'check_ids' => $queuedIds,
        'queue' => self::QUEUE_NAME,
    ];
}
|
||||
|
||||
public function recheckByRefer($articleId, $pReferId = 0, $referenceNo = 0)
|
||||
{
|
||||
$articleId = intval($articleId);
|
||||
@@ -1600,9 +1873,9 @@ class ReferenceCheckService
|
||||
if ($contentA === '' || $contentB === '') {
|
||||
$this->updateCheckResult($checkId, [
|
||||
'status' => self::RECORD_FAILED,
|
||||
'error_msg' => 'Missing article_main.content or refer_text',
|
||||
'error_msg' => 'Missing section content (text/table) or refer_text',
|
||||
]);
|
||||
throw new \RuntimeException('Missing article_main.content or refer_text');
|
||||
throw new \RuntimeException('Missing section content (text/table) or refer_text');
|
||||
}
|
||||
|
||||
$llmResult = (new LLMService())->checkReference($contentA, $contentB, false);
|
||||
@@ -1748,7 +2021,7 @@ class ReferenceCheckService
|
||||
}
|
||||
|
||||
/**
|
||||
* 第一次校对:取 article_main.content(整节正文)
|
||||
* 第一次校对:正文取 article_main.content;表格(type=2)取 article_main_table.table_data 等
|
||||
*/
|
||||
public function resolveMainContentForJob(array $row, $maxChars = 8000)
|
||||
{
|
||||
@@ -1757,23 +2030,280 @@ class ReferenceCheckService
|
||||
return '';
|
||||
}
|
||||
$main = Db::name('article_main')
|
||||
->field('content')
|
||||
->field('content,type,amt_id,article_id')
|
||||
->where('am_id', $amId)
|
||||
->find();
|
||||
if (empty($main)) {
|
||||
return '';
|
||||
}
|
||||
|
||||
$text = trim((string)$this->arrGet($main, 'content', ''));
|
||||
if ($text === '') {
|
||||
$raw = trim($this->resolveArticleMainCheckContent($main));
|
||||
if ($raw === '') {
|
||||
return '';
|
||||
}
|
||||
|
||||
$text = preg_replace(self::BLUE_TAG_REGEX, '[$1]', $text);
|
||||
return $this->normalizeCheckContentForLlm($raw, $maxChars);
|
||||
}
|
||||
|
||||
/**
 * Decide whether an article_main row is a table section.
 *
 * A row is a table section when its type equals MAIN_TYPE_TABLE, when it
 * has a positive amt_id, or when its content holds a "<table" tag together
 * with a numeric tableId attribute.
 *
 * @param array $main article_main row (type / amt_id / content are read)
 * @return bool
 */
private function isArticleMainTableSection(array $main)
{
    $type = intval($this->arrGet($main, 'type', self::MAIN_TYPE_TEXT));
    $amtId = intval($this->arrGet($main, 'amt_id', 0));
    if ($type === self::MAIN_TYPE_TABLE || $amtId > 0) {
        return true;
    }

    $content = (string)$this->arrGet($main, 'content', '');
    if (stripos($content, '<table') === false) {
        return false;
    }

    return (bool)preg_match('/tableId\s*=\s*[\'"]?\d+/i', $content);
}
|
||||
|
||||
/**
 * Resolve the amt_id for an article_main row.
 *
 * A positive amt_id on the row wins; otherwise the number is read from a
 * tableId attribute inside the row's content.
 *
 * @param array $main article_main row
 * @return int the amt_id, or 0 when none can be determined
 */
private function resolveArticleMainTableAmtId(array $main)
{
    $explicit = intval($this->arrGet($main, 'amt_id', 0));
    if ($explicit > 0) {
        return $explicit;
    }

    $found = preg_match(
        '/tableId\s*=\s*[\'"]?(\d+)/i',
        (string)$this->arrGet($main, 'content', ''),
        $captured
    );

    return $found ? intval($captured[1]) : 0;
}
|
||||
|
||||
/**
 * Load the article_main_table row (table_data, title, note) that belongs to
 * an article_main row.
 *
 * The lookup is by amt_id with state in (0, 2); when the row carries a
 * positive article_id the query is narrowed to that article as well.
 *
 * @param array $main article_main row
 * @return array|null the table row, or null when no amt_id resolves or nothing is found
 */
private function loadArticleMainTableRow(array $main)
{
    $tableId = $this->resolveArticleMainTableAmtId($main);
    if ($tableId <= 0) {
        return null;
    }

    $query = Db::name('article_main_table')
        ->where('amt_id', $tableId)
        ->whereIn('state', [0, 2])
        ->field('table_data,title,note');

    $ownerArticleId = intval($this->arrGet($main, 'article_id', 0));
    if ($ownerArticleId > 0) {
        $query->where('article_id', $ownerArticleId);
    }

    $tableRow = $query->find();
    if (empty($tableRow)) {
        return null;
    }

    return $tableRow;
}
|
||||
|
||||
/**
 * Extract citations for one article_main row.
 *
 * Text sections are scanned through their content. Table sections are
 * scanned through the linked table row: its non-empty title and note first,
 * then table_data row by row.
 *
 * @param array $main article_main row
 * @return array citations; empty when a table section has no loadable table row
 */
public function extractReferencesForArticleMain(array $main)
{
    if ($this->isArticleMainTableSection($main)) {
        $tableRow = $this->loadArticleMainTableRow($main);
        if (empty($tableRow)) {
            return [];
        }

        // Title and note are scanned ahead of the table body.
        $leading = [];
        foreach (['title', 'note'] as $column) {
            $value = trim((string)$this->arrGet($tableRow, $column, ''));
            if ($value !== '') {
                $leading[] = $value;
            }
        }

        $tableJson = (string)$this->arrGet($tableRow, 'table_data', '');

        return $this->extractReferencesFromTableDataJson($tableJson, $leading);
    }

    return $this->extractReferences((string)$this->arrGet($main, 'content', ''));
}
|
||||
|
||||
/**
 * Extract citations from a table's JSON payload, scanning it fragment by
 * fragment.
 *
 * Fragments are processed in this order: each non-empty prefix chunk (for
 * example title / note), then each non-empty table row line. A running byte
 * offset (fragment length + 1 per fragment) is added to text_start,
 * text_end, reference_start and reference_end of every citation found in a
 * fragment. When table_data cannot be decoded it is scanned as one raw
 * string instead.
 *
 * @param string $tableDataJson raw table_data value
 * @param array  $prefixChunks  text fragments scanned before the table rows
 * @return array citations with shifted offsets
 */
public function extractReferencesFromTableDataJson($tableDataJson, array $prefixChunks = [])
{
    $collected = [];
    $cursor = 0;
    $shiftKeys = ['text_start', 'text_end', 'reference_start', 'reference_end'];

    // Scan one fragment and append its citations, shifted by $base bytes.
    $scan = function ($fragment, $base) use (&$collected, $shiftKeys) {
        foreach ($this->extractReferences($fragment) as $cite) {
            foreach ($shiftKeys as $key) {
                $cite[$key] = intval($cite[$key]) + $base;
            }
            $collected[] = $cite;
        }
    };

    foreach ($prefixChunks as $piece) {
        $piece = trim((string)$piece);
        if ($piece !== '') {
            $scan($piece, $cursor);
            // +1 for the separator that follows this fragment.
            $cursor += strlen($piece) + 1;
        }
    }

    $json = trim((string)$tableDataJson);
    if ($json === '') {
        return $collected;
    }

    $rowsData = $this->decodeTableDataJsonToArray($json);
    if ($rowsData === null) {
        // Not decodable: treat the whole payload as a single fragment.
        $scan($json, $cursor);

        return $collected;
    }

    foreach ($rowsData as $rowData) {
        $line = $this->buildTableRowCheckLine($rowData);
        if ($line !== '') {
            $scan($line, $cursor);
            $cursor += strlen($line) + 1;
        }
    }

    return $collected;
}
|
||||
|
||||
/**
 * Build the raw text that stands for an article_main row during checking.
 *
 * Text sections return their content unchanged. Table sections return the
 * non-empty title, note and flattened table_data of the linked table row,
 * joined with newlines.
 *
 * @param array $main article_main row
 * @return string raw content; '' when a table section has no loadable table row
 */
public function resolveArticleMainCheckContent(array $main)
{
    if (!$this->isArticleMainTableSection($main)) {
        return (string)$this->arrGet($main, 'content', '');
    }

    $tableRow = $this->loadArticleMainTableRow($main);
    if (empty($tableRow)) {
        return '';
    }

    $pieces = [
        trim((string)$this->arrGet($tableRow, 'title', '')),
        trim((string)$this->arrGet($tableRow, 'note', '')),
        $this->flattenTableDataJsonToCheckContent((string)$this->arrGet($tableRow, 'table_data', '')),
    ];

    $kept = [];
    foreach ($pieces as $piece) {
        if ($piece !== '') {
            $kept[] = $piece;
        }
    }

    return implode("\n", $kept);
}
|
||||
|
||||
/**
 * Turn one table row into a single line: the trimmed, non-empty "text"
 * value of every cell, joined with " | ".
 *
 * @param mixed $row expected to be a list of cell arrays; anything else yields ''
 * @return string the joined line, or '' when no cell has text
 */
private function buildTableRowCheckLine($row)
{
    if (!is_array($row)) {
        return '';
    }

    $texts = [];
    foreach ($row as $cell) {
        // Non-array cells are ignored.
        $value = is_array($cell) ? trim((string)$this->arrGet($cell, 'text', '')) : '';
        if ($value !== '') {
            $texts[] = $value;
        }
    }

    return implode(' | ', $texts);
}
|
||||
|
||||
/**
 * Flatten table_data into newline-separated row lines.
 *
 * When the payload cannot be decoded, the trimmed raw string is returned
 * as is.
 *
 * @param string $tableDataJson raw table_data value
 * @return string one line per non-empty row, or '' for an empty payload
 */
private function flattenTableDataJsonToCheckContent($tableDataJson)
{
    $json = trim((string)$tableDataJson);
    if ($json === '') {
        return '';
    }

    $rowsData = $this->decodeTableDataJsonToArray($json);
    if ($rowsData === null) {
        return $json;
    }

    $rowLines = [];
    foreach ($rowsData as $rowData) {
        $rowLine = $this->buildTableRowCheckLine($rowData);
        if ($rowLine === '') {
            continue;
        }
        $rowLines[] = $rowLine;
    }

    return implode("\n", $rowLines);
}
|
||||
|
||||
/**
 * Decode a table_data payload into an array.
 *
 * A leading UTF-8 byte order mark is removed first. A payload that decodes
 * to a JSON string is decoded once more, which covers double-encoded
 * values.
 *
 * @param string $raw raw payload
 * @return array|null decoded array, or null when the payload is empty, invalid,
 *                    or does not decode to an array
 */
private function decodeTableDataJsonToArray($raw)
{
    $payload = trim((string)$raw);
    if ($payload === '') {
        return null;
    }

    // Drop a UTF-8 BOM so json_decode does not reject the payload.
    if (strncmp($payload, "\xEF\xBB\xBF", 3) === 0) {
        $payload = substr($payload, 3);
    }

    $first = json_decode($payload, true);
    if (json_last_error() !== JSON_ERROR_NONE) {
        return null;
    }
    if (is_array($first)) {
        return $first;
    }
    if (!is_string($first)) {
        return null;
    }

    // The payload was a JSON string: try decoding its contents as JSON.
    $second = json_decode($first, true);
    $usable = json_last_error() === JSON_ERROR_NONE && is_array($second);

    return $usable ? $second : null;
}
|
||||
|
||||
private function normalizeCheckContentForLlm($raw, $maxChars = 8000)
|
||||
{
|
||||
$text = $this->pregReplaceBlueTags($raw, '[$1]');
|
||||
$text = strip_tags($text);
|
||||
$text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
|
||||
$text = preg_replace('/\s+/u', ' ', $text);
|
||||
$text = trim($text);
|
||||
if ($text === '') {
|
||||
return '';
|
||||
}
|
||||
|
||||
$maxChars = max(500, intval($maxChars));
|
||||
if (mb_strlen($text) > $maxChars) {
|
||||
@@ -2134,12 +2664,12 @@ class ReferenceCheckService
|
||||
}
|
||||
|
||||
/**
|
||||
* 从 article_main.content 提取 blue 引用
|
||||
* 从正文 HTML 或表格展平后的 HTML 提取 blue 引用
|
||||
*/
|
||||
public function extractReferences($content)
|
||||
{
|
||||
$result = [];
|
||||
preg_match_all(self::BLUE_TAG_REGEX, $content, $matches, PREG_OFFSET_CAPTURE);
|
||||
$matches = $this->collectBlueTagMatches($content);
|
||||
if (empty($matches[0])) {
|
||||
return [];
|
||||
}
|
||||
@@ -2319,7 +2849,7 @@ class ReferenceCheckService
|
||||
private function buildCitationContextText($content, $start, $end)
|
||||
{
|
||||
$text = $this->byteSubstr($content, $start, $end);
|
||||
$text = preg_replace(self::BLUE_TAG_REGEX, '', $text);
|
||||
$text = $this->pregReplaceBlueTags($text, '');
|
||||
$text = trim(strip_tags($text));
|
||||
$text = preg_replace('/\s+/u', ' ', $text);
|
||||
$text = ltrim($text, "\xEF\xBB\xBF");
|
||||
@@ -2505,7 +3035,7 @@ class ReferenceCheckService
|
||||
}
|
||||
|
||||
$gap = substr($content, $tagEnd, $end - $tagEnd);
|
||||
$gapText = trim(strip_tags(preg_replace(self::BLUE_TAG_REGEX, '', $gap)));
|
||||
$gapText = trim(strip_tags($this->pregReplaceBlueTags($gap, '')));
|
||||
if ($gapText !== '' && !$this->isOnlyPunctuationOrSpace($gapText)) {
|
||||
return $end;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user