From 94b212fe7c6ec47113eeff7ab2125e0e1636d328 Mon Sep 17 00:00:00 2001 From: wyn <1074145239@qq.com> Date: Wed, 27 May 2026 16:09:23 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=87=E7=8C=AE=E6=A0=A1=E5=AF=B9=E5=8A=9F?= =?UTF-8?q?=E8=83=BD=E5=AE=8C=E5=96=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- application/api/controller/Preaccept.php | 12 +- application/api/controller/References.php | 67 +- application/common/ReferenceCheckService.php | 680 +++++++++++++++++-- application/database.php | 2 +- 4 files changed, 681 insertions(+), 80 deletions(-) diff --git a/application/api/controller/Preaccept.php b/application/api/controller/Preaccept.php index 166af09f..79794434 100644 --- a/application/api/controller/Preaccept.php +++ b/application/api/controller/Preaccept.php @@ -898,7 +898,17 @@ class Preaccept extends Base return jsonSuccess($re); } - + public function getArticleMainById(){ + $data = $this->request->post(); + $rule = new Validate([ + "am_id"=>"require" + ]); + if(!$rule->check($data)){ + return jsonError($rule->getError()); + } + $am_info = $this->article_main_obj->where("am_id",$data['am_id'])->find(); + return jsonSuccess($am_info); + } public function changeH1(){ $data = $this->request->post(); diff --git a/application/api/controller/References.php b/application/api/controller/References.php index 659c12b6..fbc6b6be 100644 --- a/application/api/controller/References.php +++ b/application/api/controller/References.php @@ -11,6 +11,7 @@ use think\Validate; use think\Db; use think\Env; use think\Queue; +use app\common\ReferenceCheckService; /** * @title 参考文献 * @description 相关方法汇总 @@ -1499,12 +1500,72 @@ class References extends Base } /** - * 按 p_refer_id 查单条参考文献的校对明细 + * 多篇文章并行校对时,查询指定文章前面还有几篇在排队 + * + * POST/GET: p_article_id(必填) + * + * 例:当前 5 篇文章正在校对,该文排在第 3 → ahead=2, position=3, running_total=5。 + * 返回:running_total、ahead、position、in_queue、status(整篇校对状态 0/1/2) + */ + public function referenceCheckPendingCountAI() + { + $aParam = $this->request->post(); + if (empty($aParam)) { + $aParam = $this->request->param(); + } + + $iPArticleId = empty($aParam['p_article_id']) ? 0 : intval($aParam['p_article_id']); + if ($iPArticleId <= 0) { + return json_encode(array('status' => 2, 'msg' => 'Please select an article')); + } + + try { + $result = (new ReferenceCheckService())->getArticleCheckQueuePositionByPArticleId($iPArticleId); + return jsonSuccess($result); + } catch (\Exception $e) { + return jsonError($e->getMessage()); + } + } + + /** + * 某条参考文献下「校对失败」的明细重新校对(异步) + * + * POST/GET: p_refer_id(必填) + * p_article_id(可选) + * + * 仅重跑 status=3(校对失败)的记录;不改动 refer_text,只重置结果字段后入 ReferenceCheck 队列。 + * 返回:p_refer_id、p_article_id、reset、queued、check_ids、queue + */ + public function referenceCheckRecheckFailedAI() + { + $aParam = $this->request->post(); + if (empty($aParam)) { + $aParam = $this->request->param(); + } + + $iPReferId = empty($aParam['p_refer_id']) ? 0 : intval($aParam['p_refer_id']); + if ($iPReferId <= 0) { + return json_encode(array('status' => 2, 'msg' => 'Please select a reference')); + } + + $iPArticleId = empty($aParam['p_article_id']) ? 0 : intval($aParam['p_article_id']); + + try { + $result = (new ReferenceCheckService())->enqueueRecheckFailedByPReferId($iPReferId, $iPArticleId); + return jsonSuccess([]); + } catch (\Exception $e) { + return jsonError($e->getMessage()); + } + } + + /** + * 按 p_refer_id 查单条参考文献的校对明细与进度 * * POST/GET: p_refer_id(必填) * - * 返回 list 中每项含:am_id、confidence、reason、is_match、is_pass - * 同时附带上下文:p_refer_id、p_article_id、reference_no、total + * 分组进度:progress_status(0待/1中/2完成/3失败)、pending、done、failed、pass、 + * is_pass、progress_percent、last_updated_at + * list 每项:check_id、am_id、status、confidence、reason、is_match、is_pass */ public function referenceCheckDetailsAI() { diff --git a/application/common/ReferenceCheckService.php b/application/common/ReferenceCheckService.php index b1d3223f..89ef6b8a 100644 --- a/application/common/ReferenceCheckService.php +++ b/application/common/ReferenceCheckService.php @@ -15,12 +15,20 @@ class ReferenceCheckService { const QUEUE_NAME = 'ReferenceCheck'; - /** t_article_main.ref_check_status */ + /** t_article_main.type */ + const MAIN_TYPE_TEXT = 0; + const MAIN_TYPE_IMAGE = 1; + const MAIN_TYPE_TABLE = 2; + + /** t_article_main.ref_check_status(需执行 sql/article_main_ref_check_status.sql) */ const AM_STATUS_NONE = 0; const AM_STATUS_PASS = 1; const AM_STATUS_FAIL = 2; const AM_STATUS_RUNNING = 3; + /** @var bool|null t_article_main 是否已有 ref_check_status 列 */ + private static $amRefCheckStatusColumnExists = null; + /** * 引用校对状态(生命周期顺序:0→1→2→3 = 待→进行→完成→失败) * @@ -52,20 +60,14 @@ class ReferenceCheckService const PASS_CONFIDENCE_THRESHOLD = 0.65; /** - * [...] 引用标签内允许的字符类(带 /u 修饰符使用)。 + * 正文引用标签两种排版(带 /u): + * 1) [8, 9][13-15] —— 方括号在 blue 内 + * 2) [13-15] —— 方括号包裹 blue * - * 除 ASCII 数字、半角逗号、半角连字符、空白外,还兼容常见排版变体: - * , U+FF0C 全角逗号 - * – U+2013 EN DASH - * — U+2014 EM DASH - * − U+2212 MINUS SIGN - * ‐ U+2010 HYPHEN - * ‑ U+2011 NON-BREAKING HYPHEN - * - * 若不支持变体连字符,会导致 [19–21] 这种区间引用整段被 preg 漏掉, - * 进而丢失对应的 reference_no 校对记录。 + * 捕获组均为序号串(可含逗号、区间连字符及排版变体)。 */ const BLUE_TAG_REGEX = '/\[([\d,,\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)\]<\/blue>/u'; + const BLUE_TAG_REGEX_BRACKET_OUTSIDE = '/\[([\d,,\-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)<\/blue>\]/u'; /** * 兼容无 ?? 的 PHP 版本 @@ -75,6 +77,46 @@ class ReferenceCheckService return isset($arr[$key]) ? $arr[$key] : $default; } + /** + * 合并匹配两种 blue 引用排版,按在正文中的起始位置排序。 + * + * @return array{0: array, 1: array} 同 preg_match_all 的完整匹配与捕获组 1 + */ + private function collectBlueTagMatches($content) + { + $merged = []; + foreach ([self::BLUE_TAG_REGEX, self::BLUE_TAG_REGEX_BRACKET_OUTSIDE] as $pattern) { + if (!preg_match_all($pattern, $content, $m, PREG_OFFSET_CAPTURE)) { + continue; + } + $count = count($m[0]); + for ($i = 0; $i < $count; $i++) { + $merged[] = ['full' => $m[0][$i], 'inner' => $m[1][$i]]; + } + } + + usort($merged, function ($a, $b) { + return $a['full'][1] - $b['full'][1]; + }); + + $matches = [[], []]; + foreach ($merged as $item) { + $matches[0][] = $item['full']; + $matches[1][] = $item['inner']; + } + + return $matches; + } + + /** 对两种 blue 引用排版执行 preg_replace */ + private function pregReplaceBlueTags($subject, $replacement) + { + $subject = preg_replace(self::BLUE_TAG_REGEX, $replacement, $subject); + $subject = preg_replace(self::BLUE_TAG_REGEX_BRACKET_OUTSIDE, $replacement, $subject); + + return $subject; + } + /** * 单条入队(可手工指定正文与文献文本) */ @@ -115,14 +157,18 @@ class ReferenceCheckService return ['check_id' => $checkId, 'queued' => 1]; } public function enqueueByArticleMain($main){ - $amId = $main['am_id']; -// $main = Db::name('article_main') -// ->field('am_id,content,article_id') -// ->where('am_id', $amId) -// ->whereIn('state', [0, 2]) -// ->find(); - $citations = $this->extractReferences((string)$main['content']); -// return $citations; + $amId = intval($this->arrGet($main, 'am_id', 0)); + if ($amId > 0 && (!isset($main['type']) || (intval($main['type']) === self::MAIN_TYPE_TABLE && intval($this->arrGet($main, 'amt_id', 0)) <= 0))) { + $dbMain = Db::name('article_main') + ->field('am_id,content,article_id,type,amt_id') + ->where('am_id', $amId) + ->whereIn('state', [0, 2]) + ->find(); + if (!empty($dbMain)) { + $main = array_merge($dbMain, $main); + } + } + $citations = $this->extractReferencesForArticleMain($main); if (empty($citations)) { $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE); return; @@ -222,7 +268,7 @@ class ReferenceCheckService $referMap = $this->loadReferMapByPArticleId($pArticleId); $mains = Db::name('article_main') - ->field('am_id,content,article_id') + ->field('am_id,content,article_id,type,amt_id') ->where('article_id', $articleId) ->whereIn('state', [0, 2]) ->order('sort asc') @@ -237,7 +283,7 @@ class ReferenceCheckService $now = date('Y-m-d H:i:s'); foreach ($mains as $main) { $amId = intval($main['am_id']); - $citations = $this->extractReferences((string)$main['content']); + $citations = $this->extractReferencesForArticleMain($main); if (empty($citations)) { $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE); continue; @@ -309,7 +355,7 @@ class ReferenceCheckService $referMap = $this->loadReferMapByPArticleId($pArticleId); $mains = Db::name('article_main') - ->field('am_id,content,article_id') + ->field('am_id,content,article_id,type,amt_id') ->where('article_id', $articleId) ->whereIn('state', [0, 2]) ->order('sort asc') @@ -324,7 +370,7 @@ class ReferenceCheckService $now = date('Y-m-d H:i:s'); foreach ($mains as $main) { $amId = intval($main['am_id']); - $citations = $this->extractReferences((string)$main['content']); + $citations = $this->extractReferencesForArticleMain($main); if (empty($citations)) { $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE); continue; @@ -429,9 +475,27 @@ class ReferenceCheckService return $status; } + /** + * t_article_main 是否已加 ref_check_status 列(未迁移时跳过写入,避免 fields not exists) + */ + private function hasAmRefCheckStatusColumn() + { + if (self::$amRefCheckStatusColumnExists !== null) { + return self::$amRefCheckStatusColumnExists; + } + try { + $table = Db::name('article_main')->getTable(); + $rows = Db::query('SHOW COLUMNS FROM `' . str_replace('`', '``', $table) . '` LIKE \'ref_check_status\''); + self::$amRefCheckStatusColumnExists = !empty($rows); + } catch (\Exception $e) { + self::$amRefCheckStatusColumnExists = false; + } + return self::$amRefCheckStatusColumnExists; + } + public function setAmRefCheckStatus($amId, $status) { - if ($amId <= 0) { + if ($amId <= 0 || !$this->hasAmRefCheckStatusColumn()) { return; } Db::name('article_main')->where('am_id', $amId)->update([ @@ -472,7 +536,7 @@ class ReferenceCheckService ->where('p_article_id', $pArticleId) ->delete(); - if ($articleId > 0) { + if ($articleId > 0 && $this->hasAmRefCheckStatusColumn()) { Db::name('article_main') ->where('article_id', $articleId) ->whereIn('state', [0, 2]) @@ -498,10 +562,12 @@ class ReferenceCheckService } $deleted = Db::name('article_reference_check_result')->where('article_id', $articleId)->delete(); - Db::name('article_main') - ->where('article_id', $articleId) - ->whereIn('state', [0, 2]) - ->update(['ref_check_status' => self::AM_STATUS_NONE]); + if ($this->hasAmRefCheckStatusColumn()) { + Db::name('article_main') + ->where('article_id', $articleId) + ->whereIn('state', [0, 2]) + ->update(['ref_check_status' => self::AM_STATUS_NONE]); + } return intval($deleted); } @@ -669,6 +735,68 @@ class ReferenceCheckService ]; } + /** + * 多篇文章并行校对时,查询指定文章前面还有几篇在排队。 + * + * 「正在校对」= 该文至少还有 1 条明细 status=待校验(0)。 + * 排队顺序:按各文章最早一条待校验明细的 id 升序(与全局入队先后一致)。 + * + * @return array{ + * p_article_id:int, + * running_total:int, + * ahead:int, + * position:int, + * in_queue:bool, + * status:int + * } + */ + public function getArticleCheckQueuePositionByPArticleId($pArticleId) + { + $pArticleId = intval($pArticleId); + if ($pArticleId <= 0) { + throw new \InvalidArgumentException('p_article_id is required'); + } + + $rows = Db::name('article_reference_check_result') + ->field('p_article_id, MIN(id) AS queue_anchor') + ->where('status', self::RECORD_PENDING) + ->group('p_article_id') + ->order('queue_anchor', 'asc') + ->select(); + + $runningIds = []; + foreach ($rows as $row) { + $aid = intval($this->arrGet($row, 'p_article_id', 0)); + if ($aid > 0) { + $runningIds[] = $aid; + } + } + + $runningTotal = count($runningIds); + $ahead = 0; + $position = 0; + $inQueue = false; + foreach ($runningIds as $idx => $aid) { + if ($aid === $pArticleId) { + $ahead = $idx; + $position = $idx + 1; + $inQueue = true; + break; + } + } + + $articleStatus = $this->getArticleProgressStatusByPArticleId($pArticleId); + + return [ + 'p_article_id' => $pArticleId, + 'running_total' => $runningTotal, + 'ahead' => $inQueue ? $ahead : 0, + 'position' => $inQueue ? $position : 0, + 'in_queue' => $inQueue, + 'status' => intval($this->arrGet($articleStatus, 'status', self::ARTICLE_PROGRESS_NONE)), + ]; + } + /** * 按 p_article_id 查整篇引用校对进度,按 reference_no 分组聚合状态,并展开每条明细。 * @@ -820,17 +948,16 @@ class ReferenceCheckService } /** - * 按 p_refer_id 查这条参考文献的所有校对明细。 + * 按 p_refer_id 查这条参考文献的校对明细与分组进度。 * - * 每条 record 返回: - * - am_id 命中的 article_main 主键 - * - confidence 匹配置信度(0~1) - * - reason LLM 给出的判定理由 - * - is_match 是否匹配(来自 article_reference_check_result.is_match) - * - is_pass 是否通过校验(confidence >= PASS_CONFIDENCE_THRESHOLD) + * 分组进度(与 referenceCheckProgressAI 单条 list 项口径一致): + * progress_status 0待校验 1校对中 2完成 3失败 + * pending/done/failed/pass、is_pass、progress_percent + * + * list 每项:check_id、am_id、status、confidence、reason、is_match、is_pass * * @param int $pReferId production_article_refer.p_refer_id - * @return array{p_refer_id:int, p_article_id:int, reference_no:int, total:int, list:array} + * @return array */ public function getCheckDetailsByPReferId($pReferId) { @@ -840,7 +967,7 @@ class ReferenceCheckService } $rows = Db::name('article_reference_check_result') - ->field('id,p_article_id,reference_no,am_id,confidence,is_match,reason') + ->field('id,p_article_id,reference_no,am_id,status,confidence,is_match,reason,updated_at') ->where('p_refer_id', $pReferId) ->order('id asc') ->select(); @@ -848,8 +975,13 @@ class ReferenceCheckService $list = []; $pArticleId = 0; $referenceNo = 0; + $pending = 0; + $done = 0; + $failed = 0; + $pass = 0; + $lastUpdatedAt = ''; + foreach ($rows as $row) { - // 取首条出现的 p_article_id / reference_no 作为该 refer 的上下文 if ($pArticleId <= 0) { $pArticleId = intval($this->arrGet($row, 'p_article_id', 0)); } @@ -857,22 +989,87 @@ class ReferenceCheckService $referenceNo = intval($this->arrGet($row, 'reference_no', 0)); } + $st = intval($this->arrGet($row, 'status', 0)); + if ($st === self::RECORD_PENDING) { + $pending++; + } elseif ($st === self::RECORD_COMPLETED) { + $done++; + } elseif ($st === self::RECORD_FAILED) { + $failed++; + } + + $upd = (string)$this->arrGet($row, 'updated_at', ''); + if ($upd > $lastUpdatedAt) { + $lastUpdatedAt = $upd; + } + $confidence = floatval($this->arrGet($row, 'confidence', 0)); + $isPass = $confidence >= self::PASS_CONFIDENCE_THRESHOLD; + if ($isPass) { + $pass++; + } + $list[] = [ + 'check_id' => intval($this->arrGet($row, 'id', 0)), 'am_id' => intval($this->arrGet($row, 'am_id', 0)), + 'status' => $st, 'confidence' => $confidence, 'reason' => (string)$this->arrGet($row, 'reason', ''), 'is_match' => intval($this->arrGet($row, 'is_match', 0)), - 'is_pass' => $confidence >= self::PASS_CONFIDENCE_THRESHOLD, + 'is_pass' => $isPass, ]; } + if ($referenceNo <= 0) { + $refer = Db::name('production_article_refer') + ->where('p_refer_id', $pReferId) + ->where('state', 0) + ->find(); + if (!empty($refer)) { + if ($pArticleId <= 0) { + $pArticleId = intval($this->arrGet($refer, 'p_article_id', 0)); + } + $referenceNo = intval($this->arrGet($refer, 'index', 0)) + 1; + } + } + + $total = count($list); + if ($total === 0) { + $progressStatus = self::PROGRESS_PENDING; + $progressPercent = 0; + $isPassGroup = false; + } elseif ($pending === $total) { + $progressStatus = self::PROGRESS_PENDING; + $progressPercent = 0; + $isPassGroup = false; + } elseif ($pending === 0) { + $progressStatus = $failed > 0 ? self::PROGRESS_FAILED : self::PROGRESS_COMPLETED; + $progressPercent = 100; + $isPassGroup = ( + $progressStatus === self::PROGRESS_COMPLETED + && $pass === $total + ); + } else { + $progressStatus = self::PROGRESS_CHECKING; + $finished = $done + $failed; + $progressPercent = round($finished / $total * 100, 1); + $isPassGroup = false; + } + return [ - 'p_refer_id' => $pReferId, - 'p_article_id' => $pArticleId, - 'reference_no' => $referenceNo, - 'total' => count($list), - 'list' => $list, + 'p_refer_id' => $pReferId, + 'p_article_id' => $pArticleId, + 'reference_no' => $referenceNo, + 'total' => $total, + 'pending' => $pending, + 'done' => $done, + 'failed' => $failed, + 'pass' => $pass, + 'progress_status' => $progressStatus, + 'progress_percent' => $progressPercent, + 'is_pass' => $isPassGroup, + 'last_updated_at' => $lastUpdatedAt, + 'list' => $list, ]; } @@ -1010,8 +1207,12 @@ class ReferenceCheckService */ public function buildArticlePreview($articleId, $amId = 0) { + $fields = 'am_id,content,sort,type,amt_id'; + if ($this->hasAmRefCheckStatusColumn()) { + $fields .= ',ref_check_status'; + } $q = Db::name('article_main') - ->field('am_id,content,sort,ref_check_status') + ->field($fields) ->where('article_id', $articleId) ->whereIn('state', [0, 2]); if ($amId > 0) { @@ -1039,7 +1240,7 @@ class ReferenceCheckService foreach ($mains as $main) { $id = intval($main['am_id']); - $content = (string)$main['content']; + $content = $this->resolveArticleMainCheckContent($main); $badIndex = isset($badByAm[$id]) ? $badByAm[$id] : array(); $marked = $this->markContentForPreview($content, $id, $badIndex); $amStatus = intval($this->arrGet($main, 'ref_check_status', 0)); @@ -1158,12 +1359,7 @@ class ReferenceCheckService $html = $content; // 1) 先标记 blue 内各序号(在原文上操作,[70-73] 仅标不合理者如 70、71) - preg_match_all( - self::BLUE_TAG_REGEX, - $html, - $matches, - PREG_OFFSET_CAPTURE - ); + $matches = $this->collectBlueTagMatches($html); $citeDeltas = []; if (!empty($matches[0])) { $replacements = []; @@ -1318,14 +1514,6 @@ class ReferenceCheckService return implode("\n", $parts); } - /** - * 前端修改参考文献后重新校对:仅处理已有校对记录,刷新 refer_text、重置结果并入队;无记录直接返回 - * - * @param int $articleId - * @param int $pReferId t_production_article_refer.p_refer_id(优先) - * @param int $referenceNo 文献序号 index+1(无 p_refer_id 时用) - * @return array - */ /** * 编辑某条文献内容后,按 p_refer_id 异步重新校对该文献对应的全部 check 明细 * @@ -1387,7 +1575,7 @@ class ReferenceCheckService 'refer_text' => $referText, 'refer_index' => $referenceNo, 'reference_no' => $referenceNo, - 'status' => 0, + 'status' => self::RECORD_PENDING, 'is_match' => 0, 'can_support' => 0, 'confidence' => 0, @@ -1401,7 +1589,6 @@ class ReferenceCheckService foreach ($rows as $row) { $checkId = $this->resolveCheckRowId($row); Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields); - // 旧的队列完成标记必须清掉,否则同 check_id 再次投递会被 acquireLock 静默丢弃 $this->clearReferenceCheckQueueLock($checkId); $pendingJobs[] = [ 'check_id' => $checkId, @@ -1432,6 +1619,92 @@ class ReferenceCheckService ]; } + /** + * 某条参考文献下「校对失败」的明细重新校对(仅 status=RECORD_FAILED,异步入队) + * + * 不刷新 refer_text / reference_no,沿用记录内已有正文与文献快照,只重置结果字段后入队。 + * + * @param int $pReferId t_production_article_refer.p_refer_id(必填) + * @param int $pArticleId 可选,进一步限定文章 + * @return array{p_refer_id:int, p_article_id:int, reset:int, queued:int, check_ids:int[], queue:string} + */ + public function enqueueRecheckFailedByPReferId($pReferId, $pArticleId = 0) + { + $pReferId = intval($pReferId); + if ($pReferId <= 0) { + throw new \InvalidArgumentException('p_refer_id is required'); + } + + $q = Db::name('article_reference_check_result') + ->where('p_refer_id', $pReferId) + ->where('status', self::RECORD_FAILED); + $pArticleId = intval($pArticleId); + if ($pArticleId > 0) { + $q->where('p_article_id', $pArticleId); + } + + $rows = $q->select(); + + if (empty($rows)) { + return [ + 'p_refer_id' => $pReferId, + 'p_article_id' => $pArticleId, + 'reset' => 0, + 'queued' => 0, + 'check_ids' => [], + 'queue' => self::QUEUE_NAME, + ]; + } + + if ($pArticleId <= 0) { + $pArticleId = intval($this->arrGet($rows[0], 'p_article_id', 0)); + } + + $now = date('Y-m-d H:i:s'); + $resetFields = [ + 'status' => self::RECORD_PENDING, + 'is_match' => 0, + 'can_support' => 0, + 'confidence' => 0, + 'reason' => '', + 'error_msg' => '', + 'updated_at' => $now, + ]; + + $pendingJobs = []; + $amIds = []; + foreach ($rows as $row) { + $checkId = $this->resolveCheckRowId($row); + Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields); + $this->clearReferenceCheckQueueLock($checkId); + $pendingJobs[] = [ + 'check_id' => $checkId, + 'reference_no' => intval($this->arrGet($row, 'reference_no', 0)), + 'am_id' => intval($this->arrGet($row, 'am_id', 0)), + 'text_start' => intval($this->arrGet($row, 'text_start', 0)), + ]; + $amId = intval($this->arrGet($row, 'am_id', 0)); + if ($amId > 0) { + $amIds[$amId] = true; + } + } + + foreach (array_keys($amIds) as $amId) { + $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING); + } + + $checkIds = $this->pushJobsSortedByReferenceNo($pendingJobs); + + return [ + 'p_refer_id' => $pReferId, + 'p_article_id' => $pArticleId, + 'reset' => count($rows), + 'queued' => count($checkIds), + 'check_ids' => $checkIds, + 'queue' => self::QUEUE_NAME, + ]; + } + public function recheckByRefer($articleId, $pReferId = 0, $referenceNo = 0) { $articleId = intval($articleId); @@ -1600,9 +1873,9 @@ class ReferenceCheckService if ($contentA === '' || $contentB === '') { $this->updateCheckResult($checkId, [ 'status' => self::RECORD_FAILED, - 'error_msg' => 'Missing article_main.content or refer_text', + 'error_msg' => 'Missing section content (text/table) or refer_text', ]); - throw new \RuntimeException('Missing article_main.content or refer_text'); + throw new \RuntimeException('Missing section content (text/table) or refer_text'); } $llmResult = (new LLMService())->checkReference($contentA, $contentB, false); @@ -1748,7 +2021,7 @@ class ReferenceCheckService } /** - * 第一次校对:取 article_main.content(整节正文) + * 第一次校对:正文取 article_main.content;表格(type=2)取 article_main_table.table_data 等 */ public function resolveMainContentForJob(array $row, $maxChars = 8000) { @@ -1757,23 +2030,280 @@ class ReferenceCheckService return ''; } $main = Db::name('article_main') - ->field('content') + ->field('content,type,amt_id,article_id') ->where('am_id', $amId) ->find(); if (empty($main)) { return ''; } - $text = trim((string)$this->arrGet($main, 'content', '')); - if ($text === '') { + $raw = trim($this->resolveArticleMainCheckContent($main)); + if ($raw === '') { return ''; } - $text = preg_replace(self::BLUE_TAG_REGEX, '[$1]', $text); + return $this->normalizeCheckContentForLlm($raw, $maxChars); + } + + /** + * 是否为表格节:type=2、有 amt_id,或 content 为 <table tableId='…'/> 占位 + */ + private function isArticleMainTableSection(array $main) + { + if (intval($this->arrGet($main, 'type', self::MAIN_TYPE_TEXT)) === self::MAIN_TYPE_TABLE) { + return true; + } + if (intval($this->arrGet($main, 'amt_id', 0)) > 0) { + return true; + } + $content = (string)$this->arrGet($main, 'content', ''); + + return stripos($content, 'arrGet($main, 'amt_id', 0)); + if ($amtId > 0) { + return $amtId; + } + $content = (string)$this->arrGet($main, 'content', ''); + if (preg_match('/tableId\s*=\s*[\'"]?(\d+)/i', $content, $m)) { + return intval($m[1]); + } + + return 0; + } + + /** + * @return array|null + */ + private function loadArticleMainTableRow(array $main) + { + $amtId = $this->resolveArticleMainTableAmtId($main); + if ($amtId <= 0) { + return null; + } + + $q = Db::name('article_main_table') + ->where('amt_id', $amtId) + ->whereIn('state', [0, 2]) + ->field('table_data,title,note'); + $articleId = intval($this->arrGet($main, 'article_id', 0)); + if ($articleId > 0) { + $q->where('article_id', $articleId); + } + $tbl = $q->find(); + + return empty($tbl) ? null : $tbl; + } + + /** + * 按节提取引用:正文走 content;表格按行拼接单元格后扫描(Study 列仅 [n] 时也能带上同行上下文) + */ + public function extractReferencesForArticleMain(array $main) + { + if (!$this->isArticleMainTableSection($main)) { + return $this->extractReferences((string)$this->arrGet($main, 'content', '')); + } + + $tbl = $this->loadArticleMainTableRow($main); + if (empty($tbl)) { + return []; + } + + $extra = []; + foreach (['title', 'note'] as $field) { + $part = trim((string)$this->arrGet($tbl, $field, '')); + if ($part !== '') { + $extra[] = $part; + } + } + + return $this->extractReferencesFromTableDataJson( + (string)$this->arrGet($tbl, 'table_data', ''), + $extra + ); + } + + /** + * table_data 按行提取;$prefixChunks 为 title/note 等(在表格行之前扫描) + */ + public function extractReferencesFromTableDataJson($tableDataJson, array $prefixChunks = []) + { + $result = []; + $offset = 0; + + foreach ($prefixChunks as $chunk) { + $chunk = trim((string)$chunk); + if ($chunk === '') { + continue; + } + foreach ($this->extractReferences($chunk) as $cite) { + $cite['text_start'] = intval($cite['text_start']) + $offset; + $cite['text_end'] = intval($cite['text_end']) + $offset; + $cite['reference_start'] = intval($cite['reference_start']) + $offset; + $cite['reference_end'] = intval($cite['reference_end']) + $offset; + $result[] = $cite; + } + $offset += strlen($chunk) + 1; + } + + $tableDataJson = trim((string)$tableDataJson); + if ($tableDataJson === '') { + return $result; + } + + $decoded = $this->decodeTableDataJsonToArray($tableDataJson); + if ($decoded === null) { + foreach ($this->extractReferences($tableDataJson) as $cite) { + $cite['text_start'] = intval($cite['text_start']) + $offset; + $cite['text_end'] = intval($cite['text_end']) + $offset; + $cite['reference_start'] = intval($cite['reference_start']) + $offset; + $cite['reference_end'] = intval($cite['reference_end']) + $offset; + $result[] = $cite; + } + + return $result; + } + + foreach ($decoded as $row) { + $line = $this->buildTableRowCheckLine($row); + if ($line === '') { + continue; + } + foreach ($this->extractReferences($line) as $cite) { + $cite['text_start'] = intval($cite['text_start']) + $offset; + $cite['text_end'] = intval($cite['text_end']) + $offset; + $cite['reference_start'] = intval($cite['reference_start']) + $offset; + $cite['reference_end'] = intval($cite['reference_end']) + $offset; + $result[] = $cite; + } + $offset += strlen($line) + 1; + } + + return $result; + } + + /** + * 入队/LLM 用的原始 HTML:type=0 为 content;表格为 table_data 按行展平 + */ + public function resolveArticleMainCheckContent(array $main) + { + if (!$this->isArticleMainTableSection($main)) { + return (string)$this->arrGet($main, 'content', ''); + } + + $tbl = $this->loadArticleMainTableRow($main); + if (empty($tbl)) { + return ''; + } + + $chunks = []; + foreach (['title', 'note'] as $field) { + $part = trim((string)$this->arrGet($tbl, $field, '')); + if ($part !== '') { + $chunks[] = $part; + } + } + $flat = $this->flattenTableDataJsonToCheckContent((string)$this->arrGet($tbl, 'table_data', '')); + if ($flat !== '') { + $chunks[] = $flat; + } + + return implode("\n", $chunks); + } + + /** + * 表格一行:各单元格 text 用 " | " 连接(保留同行化学名/部位/Study 列引用) + */ + private function buildTableRowCheckLine($row) + { + if (!is_array($row)) { + return ''; + } + $cells = []; + foreach ($row as $cell) { + if (!is_array($cell)) { + continue; + } + $text = trim((string)$this->arrGet($cell, 'text', '')); + if ($text !== '') { + $cells[] = $text; + } + } + + return implode(' | ', $cells); + } + + /** + * table_data 按行展平(供 LLM / 预览);非法 JSON 时按整串处理 + */ + private function flattenTableDataJsonToCheckContent($tableDataJson) + { + $tableDataJson = trim((string)$tableDataJson); + if ($tableDataJson === '') { + return ''; + } + $decoded = $this->decodeTableDataJsonToArray($tableDataJson); + if ($decoded === null) { + return $tableDataJson; + } + + $lines = []; + foreach ($decoded as $row) { + $line = $this->buildTableRowCheckLine($row); + if ($line !== '') { + $lines[] = $line; + } + } + + return implode("\n", $lines); + } + + /** + * @return array|null + */ + private function decodeTableDataJsonToArray($raw) + { + $raw = trim((string)$raw); + if ($raw === '') { + return null; + } + if (preg_match('/^\xEF\xBB\xBF/', $raw)) { + $raw = substr($raw, 3); + } + $decoded = json_decode($raw, true); + if (json_last_error() !== JSON_ERROR_NONE) { + return null; + } + if (is_array($decoded)) { + return $decoded; + } + if (is_string($decoded)) { + $decoded2 = json_decode($decoded, true); + if (json_last_error() === JSON_ERROR_NONE && is_array($decoded2)) { + return $decoded2; + } + } + + return null; + } + + private function normalizeCheckContentForLlm($raw, $maxChars = 8000) + { + $text = $this->pregReplaceBlueTags($raw, '[$1]'); $text = strip_tags($text); $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8'); $text = preg_replace('/\s+/u', ' ', $text); $text = trim($text); + if ($text === '') { + return ''; + } $maxChars = max(500, intval($maxChars)); if (mb_strlen($text) > $maxChars) { @@ -2134,12 +2664,12 @@ class ReferenceCheckService } /** - * 从 article_main.content 提取 blue 引用 + * 从正文 HTML 或表格展平后的 HTML 提取 blue 引用 */ public function extractReferences($content) { $result = []; - preg_match_all(self::BLUE_TAG_REGEX, $content, $matches, PREG_OFFSET_CAPTURE); + $matches = $this->collectBlueTagMatches($content); if (empty($matches[0])) { return []; } @@ -2319,7 +2849,7 @@ class ReferenceCheckService private function buildCitationContextText($content, $start, $end) { $text = $this->byteSubstr($content, $start, $end); - $text = preg_replace(self::BLUE_TAG_REGEX, '', $text); + $text = $this->pregReplaceBlueTags($text, ''); $text = trim(strip_tags($text)); $text = preg_replace('/\s+/u', ' ', $text); $text = ltrim($text, "\xEF\xBB\xBF"); @@ -2505,7 +3035,7 @@ class ReferenceCheckService } $gap = substr($content, $tagEnd, $end - $tagEnd); - $gapText = trim(strip_tags(preg_replace(self::BLUE_TAG_REGEX, '', $gap))); + $gapText = trim(strip_tags($this->pregReplaceBlueTags($gap, ''))); if ($gapText !== '' && !$this->isOnlyPunctuationOrSpace($gapText)) { return $end; } diff --git a/application/database.php b/application/database.php index 0295739a..d8ef7790 100644 --- a/application/database.php +++ b/application/database.php @@ -17,7 +17,7 @@ return [ 'hostname' => 'localhost', // 'hostname' => 'ec2-13-229-30-239.ap-southeast-1.compute.amazonaws.com', // 数据库名 - 'database' => 'tougao', + 'database' => 'tougao2', // 用户名 // 'username' => 'tmradmin', 'username' => 'root',