文献校对功能完善

This commit is contained in:
wyn
2026-05-27 16:09:23 +08:00
parent 1fcd6a129d
commit 94b212fe7c
4 changed files with 681 additions and 80 deletions

View File

@@ -898,7 +898,17 @@ class Preaccept extends Base
return jsonSuccess($re);
}
/**
 * Look up one record through article_main_obj by the posted `am_id`.
 *
 * Responds with jsonError() carrying the validator message when `am_id` is
 * missing from the POST data; otherwise responds with jsonSuccess() wrapping
 * whatever find() returned for that am_id.
 */
public function getArticleMainById()
{
    $post = $this->request->post();
    $validator = new Validate([
        "am_id" => "require"
    ]);
    if ($validator->check($post)) {
        $row = $this->article_main_obj->where("am_id", $post['am_id'])->find();
        return jsonSuccess($row);
    }
    return jsonError($validator->getError());
}
public function changeH1(){
$data = $this->request->post();

View File

@@ -11,6 +11,7 @@ use think\Validate;
use think\Db;
use think\Env;
use think\Queue;
use app\common\ReferenceCheckService;
/**
* @title 参考文献
* @description 相关方法汇总
@@ -1499,12 +1500,72 @@ class References extends Base
}
/**
 * Report where an article sits among the articles whose reference check is
 * still running, i.e. how many are queued ahead of it.
 *
 * Request (POST; falls back to the merged request params when POST is empty):
 *   p_article_id  required
 *
 * Example: 5 articles are being checked and this one is third
 *   -> ahead=2, position=3, running_total=5.
 *
 * Response data: running_total, ahead, position, in_queue,
 * status (whole-article check status 0/1/2).
 */
public function referenceCheckPendingCountAI()
{
    $params = $this->request->post();
    if (empty($params)) {
        $params = $this->request->param();
    }

    $pArticleId = 0;
    if (!empty($params['p_article_id'])) {
        $pArticleId = intval($params['p_article_id']);
    }
    if ($pArticleId <= 0) {
        return json_encode(array('status' => 2, 'msg' => 'Please select an article'));
    }

    try {
        $service = new ReferenceCheckService();
        return jsonSuccess($service->getArticleCheckQueuePositionByPArticleId($pArticleId));
    } catch (\Exception $e) {
        // Service failures are reported to the client as a JSON error.
        return jsonError($e->getMessage());
    }
}
/**
 * Re-run (asynchronously) the failed check records of one reference.
 *
 * Request (POST; falls back to the merged request params when POST is empty):
 *   p_refer_id    required
 *   p_article_id  optional, narrows the re-check to one article
 *
 * Only records in the "check failed" state are re-run; refer_text is left
 * untouched, the result fields are reset and the records are pushed to the
 * ReferenceCheck queue again.
 *
 * Response data: p_refer_id, p_article_id, reset, queued, check_ids, queue.
 */
public function referenceCheckRecheckFailedAI()
{
    $aParam = $this->request->post();
    if (empty($aParam)) {
        $aParam = $this->request->param();
    }
    $iPReferId = empty($aParam['p_refer_id']) ? 0 : intval($aParam['p_refer_id']);
    if ($iPReferId <= 0) {
        return json_encode(array('status' => 2, 'msg' => 'Please select a reference'));
    }
    $iPArticleId = empty($aParam['p_article_id']) ? 0 : intval($aParam['p_article_id']);
    try {
        $result = (new ReferenceCheckService())->enqueueRecheckFailedByPReferId($iPReferId, $iPArticleId);
        // Hand the service summary back to the caller; previously an empty
        // array was returned and the documented fields were lost.
        return jsonSuccess($result);
    } catch (\Exception $e) {
        return jsonError($e->getMessage());
    }
}
/**
* 按 p_refer_id 查单条参考文献的校对明细与进度
*
* POST/GET: p_refer_id必填
*
* 返回 list 中每项含am_id、confidence、reason、is_match、is_pass
* 同时附带上下文p_refer_id、p_article_id、reference_no、total
* 分组进度progress_status(0待/1中/2完成/3失败)、pending、done、failed、pass
* is_pass、progress_percent、last_updated_at
* list 每项check_id、am_id、status、confidence、reason、is_match、is_pass
*/
public function referenceCheckDetailsAI()
{

View File

@@ -15,12 +15,20 @@ class ReferenceCheckService
{
const QUEUE_NAME = 'ReferenceCheck';
/** t_article_main.ref_check_status */
/** t_article_main.type */
const MAIN_TYPE_TEXT = 0;
const MAIN_TYPE_IMAGE = 1;
const MAIN_TYPE_TABLE = 2;
/** t_article_main.ref_check_status需执行 sql/article_main_ref_check_status.sql */
const AM_STATUS_NONE = 0;
const AM_STATUS_PASS = 1;
const AM_STATUS_FAIL = 2;
const AM_STATUS_RUNNING = 3;
/** @var bool|null t_article_main 是否已有 ref_check_status 列 */
private static $amRefCheckStatusColumnExists = null;
/**
* 引用校对状态生命周期顺序0→1→2→3 = 待→进行→完成→失败)
*
@@ -52,20 +60,14 @@ class ReferenceCheckService
const PASS_CONFIDENCE_THRESHOLD = 0.65;
/**
* <blue>[...]</blue> 引用标签内允许的字符类(带 /u 修饰符使用)。
* 正文引用标签两种排版(带 /u
* 1) <blue>[8, 9]</blue>、<blue>[13-15]</blue> —— 方括号在 blue 内
* 2) [<blue>13-15</blue>] —— 方括号包裹 blue
*
* 除 ASCII 数字、半角逗号、半角连字符、空白外,还兼容常见排版变体
* U+FF0C 全角逗号
* U+2013 EN DASH
* — U+2014 EM DASH
* U+2212 MINUS SIGN
* U+2010 HYPHEN
* U+2011 NON-BREAKING HYPHEN
*
* 若不支持变体连字符,会导致 [1921] 这种区间引用整段被 preg 漏掉,
* 进而丢失对应的 reference_no 校对记录。
* 捕获组均为序号串(可含逗号、区间连字符排版变体)。
*/
const BLUE_TAG_REGEX = '/<blue>\[([\d,\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)\]<\/blue>/u';
const BLUE_TAG_REGEX_BRACKET_OUTSIDE = '/\[<blue>([\d,\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)<\/blue>\]/u';
/**
* 兼容无 ?? 的 PHP 版本
@@ -75,6 +77,46 @@ class ReferenceCheckService
return isset($arr[$key]) ? $arr[$key] : $default;
}
/**
 * Find citation tags in both supported layouts (BLUE_TAG_REGEX and
 * BLUE_TAG_REGEX_BRACKET_OUTSIDE) and return them ordered by where they
 * start in $content.
 *
 * Matching uses PREG_OFFSET_CAPTURE, so every returned entry is a
 * [matched text, byte offset] pair.
 *
 * @param string $content text to scan
 * @return array{0: array, 1: array} index 0 holds the full matches, index 1
 *         holds capture group 1 (same shape as preg_match_all output)
 */
private function collectBlueTagMatches($content)
{
    $hits = [];
    $patterns = [self::BLUE_TAG_REGEX, self::BLUE_TAG_REGEX_BRACKET_OUTSIDE];
    foreach ($patterns as $regex) {
        $found = preg_match_all($regex, $content, $groups, PREG_OFFSET_CAPTURE);
        if (!$found) {
            // No match (0) or a PCRE failure (false): nothing to collect.
            continue;
        }
        foreach ($groups[0] as $i => $whole) {
            $hits[] = ['full' => $whole, 'inner' => $groups[1][$i]];
        }
    }

    // Interleave the hits of both patterns by their starting byte offset.
    usort($hits, function ($left, $right) {
        return $left['full'][1] - $right['full'][1];
    });

    $fullMatches = [];
    $innerMatches = [];
    foreach ($hits as $hit) {
        $fullMatches[] = $hit['full'];
        $innerMatches[] = $hit['inner'];
    }
    return [$fullMatches, $innerMatches];
}
/**
 * Run preg_replace over $subject for both citation tag layouts, first
 * BLUE_TAG_REGEX and then BLUE_TAG_REGEX_BRACKET_OUTSIDE, with the same
 * replacement.
 */
private function pregReplaceBlueTags($subject, $replacement)
{
    $patterns = [self::BLUE_TAG_REGEX, self::BLUE_TAG_REGEX_BRACKET_OUTSIDE];
    foreach ($patterns as $regex) {
        $subject = preg_replace($regex, $replacement, $subject);
    }
    return $subject;
}
/**
* 单条入队(可手工指定正文与文献文本)
*/
@@ -115,14 +157,18 @@ class ReferenceCheckService
return ['check_id' => $checkId, 'queued' => 1];
}
public function enqueueByArticleMain($main){
$amId = $main['am_id'];
// $main = Db::name('article_main')
// ->field('am_id,content,article_id')
// ->where('am_id', $amId)
// ->whereIn('state', [0, 2])
// ->find();
$citations = $this->extractReferences((string)$main['content']);
// return $citations;
$amId = intval($this->arrGet($main, 'am_id', 0));
if ($amId > 0 && (!isset($main['type']) || (intval($main['type']) === self::MAIN_TYPE_TABLE && intval($this->arrGet($main, 'amt_id', 0)) <= 0))) {
$dbMain = Db::name('article_main')
->field('am_id,content,article_id,type,amt_id')
->where('am_id', $amId)
->whereIn('state', [0, 2])
->find();
if (!empty($dbMain)) {
$main = array_merge($dbMain, $main);
}
}
$citations = $this->extractReferencesForArticleMain($main);
if (empty($citations)) {
$this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
return;
@@ -222,7 +268,7 @@ class ReferenceCheckService
$referMap = $this->loadReferMapByPArticleId($pArticleId);
$mains = Db::name('article_main')
->field('am_id,content,article_id')
->field('am_id,content,article_id,type,amt_id')
->where('article_id', $articleId)
->whereIn('state', [0, 2])
->order('sort asc')
@@ -237,7 +283,7 @@ class ReferenceCheckService
$now = date('Y-m-d H:i:s');
foreach ($mains as $main) {
$amId = intval($main['am_id']);
$citations = $this->extractReferences((string)$main['content']);
$citations = $this->extractReferencesForArticleMain($main);
if (empty($citations)) {
$this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
continue;
@@ -309,7 +355,7 @@ class ReferenceCheckService
$referMap = $this->loadReferMapByPArticleId($pArticleId);
$mains = Db::name('article_main')
->field('am_id,content,article_id')
->field('am_id,content,article_id,type,amt_id')
->where('article_id', $articleId)
->whereIn('state', [0, 2])
->order('sort asc')
@@ -324,7 +370,7 @@ class ReferenceCheckService
$now = date('Y-m-d H:i:s');
foreach ($mains as $main) {
$amId = intval($main['am_id']);
$citations = $this->extractReferences((string)$main['content']);
$citations = $this->extractReferencesForArticleMain($main);
if (empty($citations)) {
$this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
continue;
@@ -429,9 +475,27 @@ class ReferenceCheckService
return $status;
}
/**
 * Tell whether the article_main table already has a `ref_check_status`
 * column, so callers can skip writes on a database that has not been
 * migrated yet instead of hitting a "fields not exists" error.
 *
 * A successful probe is memoised in a static property for the rest of the
 * process. A probe that throws is NOT memoised: it answers false for this
 * call only, so a transient database error in a long-running process does
 * not switch status writes off permanently.
 *
 * @return bool true when the column exists; false when it is missing or the
 *              probe failed
 */
private function hasAmRefCheckStatusColumn()
{
    if (self::$amRefCheckStatusColumnExists !== null) {
        return self::$amRefCheckStatusColumnExists;
    }
    try {
        $table = Db::name('article_main')->getTable();
        // Backticks in the table name are doubled so the identifier stays quoted.
        $rows = Db::query('SHOW COLUMNS FROM `' . str_replace('`', '``', $table) . '` LIKE \'ref_check_status\'');
    } catch (\Exception $e) {
        // Best effort: treat as "not available" now, probe again next time.
        return false;
    }
    self::$amRefCheckStatusColumnExists = !empty($rows);
    return self::$amRefCheckStatusColumnExists;
}
public function setAmRefCheckStatus($amId, $status)
{
if ($amId <= 0) {
if ($amId <= 0 || !$this->hasAmRefCheckStatusColumn()) {
return;
}
Db::name('article_main')->where('am_id', $amId)->update([
@@ -472,7 +536,7 @@ class ReferenceCheckService
->where('p_article_id', $pArticleId)
->delete();
if ($articleId > 0) {
if ($articleId > 0 && $this->hasAmRefCheckStatusColumn()) {
Db::name('article_main')
->where('article_id', $articleId)
->whereIn('state', [0, 2])
@@ -498,10 +562,12 @@ class ReferenceCheckService
}
$deleted = Db::name('article_reference_check_result')->where('article_id', $articleId)->delete();
Db::name('article_main')
->where('article_id', $articleId)
->whereIn('state', [0, 2])
->update(['ref_check_status' => self::AM_STATUS_NONE]);
if ($this->hasAmRefCheckStatusColumn()) {
Db::name('article_main')
->where('article_id', $articleId)
->whereIn('state', [0, 2])
->update(['ref_check_status' => self::AM_STATUS_NONE]);
}
return intval($deleted);
}
@@ -669,6 +735,68 @@ class ReferenceCheckService
];
}
/**
 * Work out where an article stands among all articles that still have
 * reference-check records waiting to be processed.
 *
 * An article counts as "running" while it has at least one record with
 * status = RECORD_PENDING. Running articles are ordered by the smallest id
 * among their pending records (ascending), which is taken to reflect the
 * order in which they were enqueued.
 *
 * @param int $pArticleId
 * @return array{
 *   p_article_id:int,
 *   running_total:int,
 *   ahead:int,
 *   position:int,
 *   in_queue:bool,
 *   status:int
 * } ahead/position are 0 when the article is not in the queue
 * @throws \InvalidArgumentException when $pArticleId is not a positive integer
 */
public function getArticleCheckQueuePositionByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        throw new \InvalidArgumentException('p_article_id is required');
    }

    $anchors = Db::name('article_reference_check_result')
        ->field('p_article_id, MIN(id) AS queue_anchor')
        ->where('status', self::RECORD_PENDING)
        ->group('p_article_id')
        ->order('queue_anchor', 'asc')
        ->select();

    // Ordered list of article ids that still have pending records.
    $queue = [];
    foreach ($anchors as $anchor) {
        $queuedId = intval($this->arrGet($anchor, 'p_article_id', 0));
        if ($queuedId <= 0) {
            continue;
        }
        $queue[] = $queuedId;
    }

    // Zero-based slot of the requested article, or false when it is not queued.
    $slot = array_search($pArticleId, $queue, true);
    $inQueue = ($slot !== false);

    $progress = $this->getArticleProgressStatusByPArticleId($pArticleId);

    return [
        'p_article_id'  => $pArticleId,
        'running_total' => count($queue),
        'ahead'         => $inQueue ? $slot : 0,
        'position'      => $inQueue ? $slot + 1 : 0,
        'in_queue'      => $inQueue,
        'status'        => intval($this->arrGet($progress, 'status', self::ARTICLE_PROGRESS_NONE)),
    ];
}
/**
* 按 p_article_id 查整篇引用校对进度,按 reference_no 分组聚合状态,并展开每条明细。
*
@@ -820,17 +948,16 @@ class ReferenceCheckService
}
/**
* 按 p_refer_id 查这条参考文献的所有校对明细。
* 按 p_refer_id 查这条参考文献的校对明细与分组进度
*
* 每条 record 返回
* - am_id 命中的 article_main 主键
* - confidence 匹配置信度0~1
* - reason LLM 给出的判定理由
* - is_match 是否匹配(来自 article_reference_check_result.is_match
* - is_pass 是否通过校验confidence >= PASS_CONFIDENCE_THRESHOLD
* 分组进度(与 referenceCheckProgressAI 单条 list 项口径一致)
* progress_status 0待校验 1校对中 2完成 3失败
* pending/done/failed/pass、is_pass、progress_percent
*
* list 每项check_id、am_id、status、confidence、reason、is_match、is_pass
*
* @param int $pReferId production_article_refer.p_refer_id
* @return array{p_refer_id:int, p_article_id:int, reference_no:int, total:int, list:array}
* @return array
*/
public function getCheckDetailsByPReferId($pReferId)
{
@@ -840,7 +967,7 @@ class ReferenceCheckService
}
$rows = Db::name('article_reference_check_result')
->field('id,p_article_id,reference_no,am_id,confidence,is_match,reason')
->field('id,p_article_id,reference_no,am_id,status,confidence,is_match,reason,updated_at')
->where('p_refer_id', $pReferId)
->order('id asc')
->select();
@@ -848,8 +975,13 @@ class ReferenceCheckService
$list = [];
$pArticleId = 0;
$referenceNo = 0;
$pending = 0;
$done = 0;
$failed = 0;
$pass = 0;
$lastUpdatedAt = '';
foreach ($rows as $row) {
// 取首条出现的 p_article_id / reference_no 作为该 refer 的上下文
if ($pArticleId <= 0) {
$pArticleId = intval($this->arrGet($row, 'p_article_id', 0));
}
@@ -857,22 +989,87 @@ class ReferenceCheckService
$referenceNo = intval($this->arrGet($row, 'reference_no', 0));
}
$st = intval($this->arrGet($row, 'status', 0));
if ($st === self::RECORD_PENDING) {
$pending++;
} elseif ($st === self::RECORD_COMPLETED) {
$done++;
} elseif ($st === self::RECORD_FAILED) {
$failed++;
}
$upd = (string)$this->arrGet($row, 'updated_at', '');
if ($upd > $lastUpdatedAt) {
$lastUpdatedAt = $upd;
}
$confidence = floatval($this->arrGet($row, 'confidence', 0));
$isPass = $confidence >= self::PASS_CONFIDENCE_THRESHOLD;
if ($isPass) {
$pass++;
}
$list[] = [
'check_id' => intval($this->arrGet($row, 'id', 0)),
'am_id' => intval($this->arrGet($row, 'am_id', 0)),
'status' => $st,
'confidence' => $confidence,
'reason' => (string)$this->arrGet($row, 'reason', ''),
'is_match' => intval($this->arrGet($row, 'is_match', 0)),
'is_pass' => $confidence >= self::PASS_CONFIDENCE_THRESHOLD,
'is_pass' => $isPass,
];
}
if ($referenceNo <= 0) {
$refer = Db::name('production_article_refer')
->where('p_refer_id', $pReferId)
->where('state', 0)
->find();
if (!empty($refer)) {
if ($pArticleId <= 0) {
$pArticleId = intval($this->arrGet($refer, 'p_article_id', 0));
}
$referenceNo = intval($this->arrGet($refer, 'index', 0)) + 1;
}
}
$total = count($list);
if ($total === 0) {
$progressStatus = self::PROGRESS_PENDING;
$progressPercent = 0;
$isPassGroup = false;
} elseif ($pending === $total) {
$progressStatus = self::PROGRESS_PENDING;
$progressPercent = 0;
$isPassGroup = false;
} elseif ($pending === 0) {
$progressStatus = $failed > 0 ? self::PROGRESS_FAILED : self::PROGRESS_COMPLETED;
$progressPercent = 100;
$isPassGroup = (
$progressStatus === self::PROGRESS_COMPLETED
&& $pass === $total
);
} else {
$progressStatus = self::PROGRESS_CHECKING;
$finished = $done + $failed;
$progressPercent = round($finished / $total * 100, 1);
$isPassGroup = false;
}
return [
'p_refer_id' => $pReferId,
'p_article_id' => $pArticleId,
'reference_no' => $referenceNo,
'total' => count($list),
'list' => $list,
'p_refer_id' => $pReferId,
'p_article_id' => $pArticleId,
'reference_no' => $referenceNo,
'total' => $total,
'pending' => $pending,
'done' => $done,
'failed' => $failed,
'pass' => $pass,
'progress_status' => $progressStatus,
'progress_percent' => $progressPercent,
'is_pass' => $isPassGroup,
'last_updated_at' => $lastUpdatedAt,
'list' => $list,
];
}
@@ -1010,8 +1207,12 @@ class ReferenceCheckService
*/
public function buildArticlePreview($articleId, $amId = 0)
{
$fields = 'am_id,content,sort,type,amt_id';
if ($this->hasAmRefCheckStatusColumn()) {
$fields .= ',ref_check_status';
}
$q = Db::name('article_main')
->field('am_id,content,sort,ref_check_status')
->field($fields)
->where('article_id', $articleId)
->whereIn('state', [0, 2]);
if ($amId > 0) {
@@ -1039,7 +1240,7 @@ class ReferenceCheckService
foreach ($mains as $main) {
$id = intval($main['am_id']);
$content = (string)$main['content'];
$content = $this->resolveArticleMainCheckContent($main);
$badIndex = isset($badByAm[$id]) ? $badByAm[$id] : array();
$marked = $this->markContentForPreview($content, $id, $badIndex);
$amStatus = intval($this->arrGet($main, 'ref_check_status', 0));
@@ -1158,12 +1359,7 @@ class ReferenceCheckService
$html = $content;
// 1) 先标记 blue 内各序号(在原文上操作,[70-73] 仅标不合理者如 70、71
preg_match_all(
self::BLUE_TAG_REGEX,
$html,
$matches,
PREG_OFFSET_CAPTURE
);
$matches = $this->collectBlueTagMatches($html);
$citeDeltas = [];
if (!empty($matches[0])) {
$replacements = [];
@@ -1318,14 +1514,6 @@ class ReferenceCheckService
return implode("\n", $parts);
}
/**
* 前端修改参考文献后重新校对:仅处理已有校对记录,刷新 refer_text、重置结果并入队无记录直接返回
*
* @param int $articleId
* @param int $pReferId t_production_article_refer.p_refer_id优先
* @param int $referenceNo 文献序号 index+1无 p_refer_id 时用)
* @return array
*/
/**
* 编辑某条文献内容后,按 p_refer_id 异步重新校对该文献对应的全部 check 明细
*
@@ -1387,7 +1575,7 @@ class ReferenceCheckService
'refer_text' => $referText,
'refer_index' => $referenceNo,
'reference_no' => $referenceNo,
'status' => 0,
'status' => self::RECORD_PENDING,
'is_match' => 0,
'can_support' => 0,
'confidence' => 0,
@@ -1401,7 +1589,6 @@ class ReferenceCheckService
foreach ($rows as $row) {
$checkId = $this->resolveCheckRowId($row);
Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields);
// 旧的队列完成标记必须清掉,否则同 check_id 再次投递会被 acquireLock 静默丢弃
$this->clearReferenceCheckQueueLock($checkId);
$pendingJobs[] = [
'check_id' => $checkId,
@@ -1432,6 +1619,92 @@ class ReferenceCheckService
];
}
/**
 * Re-run the failed check records of one reference (status = RECORD_FAILED
 * only) by resetting them and pushing them to the queue again.
 *
 * refer_text / reference_no are not refreshed: each record keeps the
 * snapshot it already has, only the result fields are reset.
 *
 * @param int $pReferId   p_refer_id of the reference (required)
 * @param int $pArticleId optional, additionally restricts the records to one
 *                        article; when omitted it is taken from the first
 *                        failed record for the returned summary
 * @return array{p_refer_id:int, p_article_id:int, reset:int, queued:int, check_ids:int[], queue:string}
 * @throws \InvalidArgumentException when $pReferId is not a positive integer
 */
public function enqueueRecheckFailedByPReferId($pReferId, $pArticleId = 0)
{
    $pReferId = intval($pReferId);
    if ($pReferId <= 0) {
        throw new \InvalidArgumentException('p_refer_id is required');
    }
    $pArticleId = intval($pArticleId);

    $query = Db::name('article_reference_check_result')
        ->where('p_refer_id', $pReferId)
        ->where('status', self::RECORD_FAILED);
    if ($pArticleId > 0) {
        $query->where('p_article_id', $pArticleId);
    }
    $failedRows = $query->select();

    $resetCount = 0;
    $checkIds = [];

    if (!empty($failedRows)) {
        if ($pArticleId <= 0) {
            $pArticleId = intval($this->arrGet($failedRows[0], 'p_article_id', 0));
        }

        // Values written to every failed record to put it back to "pending".
        $resetFields = [
            'status'      => self::RECORD_PENDING,
            'is_match'    => 0,
            'can_support' => 0,
            'confidence'  => 0,
            'reason'      => '',
            'error_msg'   => '',
            'updated_at'  => date('Y-m-d H:i:s'),
        ];

        $jobs = [];
        $touchedAmIds = [];
        foreach ($failedRows as $failedRow) {
            $checkId = $this->resolveCheckRowId($failedRow);
            Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields);
            // NOTE(review): presumably a stale queue lock would make the re-pushed job be skipped - confirm.
            $this->clearReferenceCheckQueueLock($checkId);

            $sectionId = intval($this->arrGet($failedRow, 'am_id', 0));
            $jobs[] = [
                'check_id'     => $checkId,
                'reference_no' => intval($this->arrGet($failedRow, 'reference_no', 0)),
                'am_id'        => $sectionId,
                'text_start'   => intval($this->arrGet($failedRow, 'text_start', 0)),
            ];
            if ($sectionId > 0) {
                $touchedAmIds[$sectionId] = true;
            }
        }

        // Flag every affected section as running before the jobs are pushed.
        foreach (array_keys($touchedAmIds) as $sectionId) {
            $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
        }

        $checkIds = $this->pushJobsSortedByReferenceNo($jobs);
        $resetCount = count($failedRows);
    }

    return [
        'p_refer_id'   => $pReferId,
        'p_article_id' => $pArticleId,
        'reset'        => $resetCount,
        'queued'       => count($checkIds),
        'check_ids'    => $checkIds,
        'queue'        => self::QUEUE_NAME,
    ];
}
public function recheckByRefer($articleId, $pReferId = 0, $referenceNo = 0)
{
$articleId = intval($articleId);
@@ -1600,9 +1873,9 @@ class ReferenceCheckService
if ($contentA === '' || $contentB === '') {
$this->updateCheckResult($checkId, [
'status' => self::RECORD_FAILED,
'error_msg' => 'Missing article_main.content or refer_text',
'error_msg' => 'Missing section content (text/table) or refer_text',
]);
throw new \RuntimeException('Missing article_main.content or refer_text');
throw new \RuntimeException('Missing section content (text/table) or refer_text');
}
$llmResult = (new LLMService())->checkReference($contentA, $contentB, false);
@@ -1748,7 +2021,7 @@ class ReferenceCheckService
}
/**
* 第一次校对:取 article_main.content(整节正文)
* 第一次校对:正文取 article_main.content;表格(type=2)取 article_main_table.table_data 等
*/
public function resolveMainContentForJob(array $row, $maxChars = 8000)
{
@@ -1757,23 +2030,280 @@ class ReferenceCheckService
return '';
}
$main = Db::name('article_main')
->field('content')
->field('content,type,amt_id,article_id')
->where('am_id', $amId)
->find();
if (empty($main)) {
return '';
}
$text = trim((string)$this->arrGet($main, 'content', ''));
if ($text === '') {
$raw = trim($this->resolveArticleMainCheckContent($main));
if ($raw === '') {
return '';
}
$text = preg_replace(self::BLUE_TAG_REGEX, '[$1]', $text);
return $this->normalizeCheckContentForLlm($raw, $maxChars);
}
/**
 * Decide whether an article_main row is a table section.
 *
 * It is one when its type is MAIN_TYPE_TABLE, when it carries a positive
 * amt_id, or when its content contains a "<table" placeholder with a numeric
 * tableId attribute.
 *
 * @param array $main article_main row (type / amt_id / content are read)
 * @return bool
 */
private function isArticleMainTableSection(array $main)
{
    $type = intval($this->arrGet($main, 'type', self::MAIN_TYPE_TEXT));
    $amtId = intval($this->arrGet($main, 'amt_id', 0));
    if ($type === self::MAIN_TYPE_TABLE || $amtId > 0) {
        return true;
    }

    $html = (string)$this->arrGet($main, 'content', '');
    if (stripos($html, '<table') === false) {
        return false;
    }
    return (bool)preg_match('/tableId\s*=\s*[\'"]?\d+/i', $html);
}
/**
 * Resolve the amt_id of a table section: the row's own amt_id when it is
 * positive, otherwise the number of the tableId attribute found in the
 * content placeholder.
 *
 * @param array $main article_main row (amt_id / content are read)
 * @return int the amt_id, or 0 when none can be determined
 */
private function resolveArticleMainTableAmtId(array $main)
{
    $explicit = intval($this->arrGet($main, 'amt_id', 0));
    if ($explicit > 0) {
        return $explicit;
    }

    $found = [];
    $html = (string)$this->arrGet($main, 'content', '');
    $matched = preg_match('/tableId\s*=\s*[\'"]?(\d+)/i', $html, $found);
    return $matched ? intval($found[1]) : 0;
}
/**
 * Load table_data / title / note of the article_main_table row that belongs
 * to a table section.
 *
 * The row is looked up by the resolved amt_id with state in (0, 2), and is
 * additionally restricted to the section's article_id when that is positive.
 *
 * @param array $main article_main row
 * @return array|null the table row, or null when no amt_id can be resolved
 *                    or nothing is found
 */
private function loadArticleMainTableRow(array $main)
{
    $amtId = $this->resolveArticleMainTableAmtId($main);
    if ($amtId <= 0) {
        return null;
    }

    $query = Db::name('article_main_table')
        ->field('table_data,title,note')
        ->where('amt_id', $amtId)
        ->whereIn('state', [0, 2]);

    $articleId = intval($this->arrGet($main, 'article_id', 0));
    if ($articleId > 0) {
        $query->where('article_id', $articleId);
    }

    $tableRow = $query->find();
    if (empty($tableRow)) {
        return null;
    }
    return $tableRow;
}
/**
 * Extract citations from one article_main section.
 *
 * Text sections are scanned through their content. Table sections are
 * scanned through the linked table row: title and note first (when not
 * empty), then table_data row by row, so a cell that holds only "[n]" is
 * scanned together with the other cells of its row.
 *
 * @param array $main article_main row
 * @return array citations; empty when a table section has no loadable table row
 */
public function extractReferencesForArticleMain(array $main)
{
    if ($this->isArticleMainTableSection($main)) {
        $tableRow = $this->loadArticleMainTableRow($main);
        if (empty($tableRow)) {
            return [];
        }

        $prefixChunks = [];
        foreach (['title', 'note'] as $column) {
            $value = trim((string)$this->arrGet($tableRow, $column, ''));
            if ($value === '') {
                continue;
            }
            $prefixChunks[] = $value;
        }

        $tableData = (string)$this->arrGet($tableRow, 'table_data', '');
        return $this->extractReferencesFromTableDataJson($tableData, $prefixChunks);
    }

    return $this->extractReferences((string)$this->arrGet($main, 'content', ''));
}
/**
 * Extract citations from a table: first from the prefix chunks (title / note
 * etc.), then from table_data row by row.
 *
 * Every scanned segment is treated as one line of a text whose lines are
 * separated by a single character: the positions reported by
 * extractReferences() for a segment are shifted by the byte length of all
 * earlier segments plus one per segment. When table_data cannot be decoded
 * as JSON, the raw string is scanned as a single segment.
 *
 * @param string $tableDataJson table_data column value
 * @param array  $prefixChunks  texts scanned before the table rows
 * @return array citations with text_start / text_end / reference_start /
 *               reference_end shifted as described above
 */
public function extractReferencesFromTableDataJson($tableDataJson, array $prefixChunks = [])
{
    // 1) Collect the segments to scan, in order, skipping empty ones.
    $segments = [];
    foreach ($prefixChunks as $chunk) {
        $chunk = trim((string)$chunk);
        if ($chunk !== '') {
            $segments[] = $chunk;
        }
    }

    $tableDataJson = trim((string)$tableDataJson);
    if ($tableDataJson !== '') {
        $tableRows = $this->decodeTableDataJsonToArray($tableDataJson);
        if ($tableRows === null) {
            // Not decodable: fall back to scanning the raw string.
            $segments[] = $tableDataJson;
        } else {
            foreach ($tableRows as $tableRow) {
                $line = $this->buildTableRowCheckLine($tableRow);
                if ($line !== '') {
                    $segments[] = $line;
                }
            }
        }
    }

    // 2) Scan each segment and shift its positions by the running offset.
    $positionKeys = ['text_start', 'text_end', 'reference_start', 'reference_end'];
    $citations = [];
    $offset = 0;
    foreach ($segments as $segment) {
        foreach ($this->extractReferences($segment) as $cite) {
            foreach ($positionKeys as $key) {
                $cite[$key] = intval($cite[$key]) + $offset;
            }
            $citations[] = $cite;
        }
        // +1 accounts for the separator between two segments.
        $offset += strlen($segment) + 1;
    }
    return $citations;
}
/**
 * Build the raw text of a section that is used for checking.
 *
 * Text sections yield their content as is. Table sections yield the
 * non-empty pieces of title, note and the flattened table_data (one line per
 * table row) joined by "\n", or '' when the table row cannot be loaded.
 *
 * @param array $main article_main row
 * @return string
 */
public function resolveArticleMainCheckContent(array $main)
{
    if (!$this->isArticleMainTableSection($main)) {
        return (string)$this->arrGet($main, 'content', '');
    }

    $tableRow = $this->loadArticleMainTableRow($main);
    if (empty($tableRow)) {
        return '';
    }

    $pieces = [
        trim((string)$this->arrGet($tableRow, 'title', '')),
        trim((string)$this->arrGet($tableRow, 'note', '')),
        $this->flattenTableDataJsonToCheckContent((string)$this->arrGet($tableRow, 'table_data', '')),
    ];

    $kept = [];
    foreach ($pieces as $piece) {
        if ($piece !== '') {
            $kept[] = $piece;
        }
    }
    return implode("\n", $kept);
}
/**
 * Turn one table row into a single line: the trimmed, non-empty `text` of
 * each cell joined with " | ", so the cells of a row stay together.
 *
 * @param mixed $row expected to be a list of cell arrays; anything that is
 *                   not an array yields ''
 * @return string
 */
private function buildTableRowCheckLine($row)
{
    if (!is_array($row)) {
        return '';
    }

    $texts = [];
    foreach ($row as $cell) {
        if (is_array($cell)) {
            $cellText = trim((string)$this->arrGet($cell, 'text', ''));
            if ($cellText !== '') {
                $texts[] = $cellText;
            }
        }
    }
    return implode(' | ', $texts);
}
/**
 * Flatten table_data into text with one line per non-empty table row.
 * When the value cannot be decoded as JSON, the trimmed raw string is
 * returned unchanged.
 *
 * @param string $tableDataJson table_data column value
 * @return string '' for empty input
 */
private function flattenTableDataJsonToCheckContent($tableDataJson)
{
    $json = trim((string)$tableDataJson);
    if ($json === '') {
        return '';
    }

    $tableRows = $this->decodeTableDataJsonToArray($json);
    if ($tableRows === null) {
        return $json;
    }

    $rowLines = [];
    foreach ($tableRows as $tableRow) {
        $rowLine = $this->buildTableRowCheckLine($tableRow);
        if ($rowLine === '') {
            continue;
        }
        $rowLines[] = $rowLine;
    }
    return implode("\n", $rowLines);
}
/**
 * Decode a table_data value into an array.
 *
 * A leading UTF-8 BOM is removed before decoding. A value that decodes to a
 * JSON string is decoded once more, which covers double-encoded JSON.
 *
 * @param mixed $raw
 * @return array|null the decoded array, or null for empty input, invalid
 *                    JSON, or JSON that does not describe an array
 */
private function decodeTableDataJsonToArray($raw)
{
    $json = trim((string)$raw);
    if ($json === '') {
        return null;
    }
    if (strncmp($json, "\xEF\xBB\xBF", 3) === 0) {
        $json = substr($json, 3);
    }

    $value = json_decode($json, true);
    if (json_last_error() !== JSON_ERROR_NONE) {
        return null;
    }

    if (is_string($value)) {
        // The payload was a JSON string: try to decode its content as JSON too.
        $value = json_decode($value, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            return null;
        }
    }

    return is_array($value) ? $value : null;
}
private function normalizeCheckContentForLlm($raw, $maxChars = 8000)
{
$text = $this->pregReplaceBlueTags($raw, '[$1]');
$text = strip_tags($text);
$text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
$text = preg_replace('/\s+/u', ' ', $text);
$text = trim($text);
if ($text === '') {
return '';
}
$maxChars = max(500, intval($maxChars));
if (mb_strlen($text) > $maxChars) {
@@ -2134,12 +2664,12 @@ class ReferenceCheckService
}
/**
* 从 article_main.content 提取 blue 引用
* 从正文 HTML 或表格展平后的 HTML 提取 blue 引用
*/
public function extractReferences($content)
{
$result = [];
preg_match_all(self::BLUE_TAG_REGEX, $content, $matches, PREG_OFFSET_CAPTURE);
$matches = $this->collectBlueTagMatches($content);
if (empty($matches[0])) {
return [];
}
@@ -2319,7 +2849,7 @@ class ReferenceCheckService
private function buildCitationContextText($content, $start, $end)
{
$text = $this->byteSubstr($content, $start, $end);
$text = preg_replace(self::BLUE_TAG_REGEX, '', $text);
$text = $this->pregReplaceBlueTags($text, '');
$text = trim(strip_tags($text));
$text = preg_replace('/\s+/u', ' ', $text);
$text = ltrim($text, "\xEF\xBB\xBF");
@@ -2505,7 +3035,7 @@ class ReferenceCheckService
}
$gap = substr($content, $tagEnd, $end - $tagEnd);
$gapText = trim(strip_tags(preg_replace(self::BLUE_TAG_REGEX, '', $gap)));
$gapText = trim(strip_tags($this->pregReplaceBlueTags($gap, '')));
if ($gapText !== '' && !$this->isOnlyPunctuationOrSpace($gapText)) {
return $end;
}

View File

@@ -17,7 +17,7 @@ return [
'hostname' => 'localhost',
// 'hostname' => 'ec2-13-229-30-239.ap-southeast-1.compute.amazonaws.com',
// 数据库名
'database' => 'tougao',
'database' => 'tougao2',
// 用户名
// 'username' => 'tmradmin',
'username' => 'root',