Changes

2026-05-21 11:30:46 +08:00
parent 4aab7f5b7e
commit 7e5a087a4e
3 changed files with 494 additions and 53 deletions
--- a/application/common/ReferenceCheckService.php
+++ b/application/common/ReferenceCheckService.php
@@ -75,7 +75,10 @@ class ReferenceCheckService
 //            ->find();
        $citations = $this->extractReferences((string)$main['content']);
 //        return $citations;
-
+        if (empty($citations)) {
+            $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
+            return;
+        }
        $prod = Db::name('production_article')
            ->where('article_id', $main['article_id'])
            ->where('state', 0)
@@ -128,15 +131,10 @@ class ReferenceCheckService

        $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
    }
-    /**
-     * 按 article_id 扫描 t_article_main，为每个 blue 引用 × 文献号入队
-     */
-    public function enqueueByArticle($articleId, $clearPrevious = true)
-    {
+    public function enqueueByArticle($articleId){
        if ($articleId <= 0) {
            throw new \InvalidArgumentException('article_id is required');
        }
-
        $prod = Db::name('production_article')
            ->where('article_id', $articleId)
            ->where('state', 0)
@@ -144,25 +142,18 @@ class ReferenceCheckService
        if (empty($prod)) {
            throw new \RuntimeException('production_article not found for article_id=' . $articleId);
        }
-
        $pArticleId = intval($prod['p_article_id']);
        $referMap = $this->loadReferMapByPArticleId($pArticleId);

        $mains = Db::name('article_main')
-            ->field('am_id,content')
+            ->field('am_id,content,article_id')
            ->where('article_id', $articleId)
            ->whereIn('state', [0, 2])
            ->order('sort asc')
            ->select();
-
        if (empty($mains)) {
            throw new \RuntimeException('article_main is empty');
        }
-
-        if ($clearPrevious) {
-            $this->clearArticleChecks($articleId);
-        }
-
        $queued = 0;
        $skipped = 0;
        $checkIds = [];
@@ -189,20 +180,16 @@ class ReferenceCheckService
                    $now = date('Y-m-d H:i:s');
                    // [70-73] 展开为 reference_no=70,71,72,73 共 4 条记录
                    $checkId = Db::name('article_reference_check_result')->insertGetId([
-                        'article_id'      => $articleId,
-                        'am_id'           => intval($main['am_id']),
+                        'article_id'      => $main['article_id'],
                        'p_article_id'    => $pArticleId,
-                        'p_refer_id'      => intval($refer['p_refer_id']),
-                        'refer_index'     => $referIndex,
+                        'am_id'           => intval($main['am_id']),
                        'reference_no'    => $refNo,
-                        'reference_raw'   => $cite['reference_raw'],
-                        'cite_tag_start'  => intval($cite['reference_start']),
-                        'cite_tag_end'    => intval($cite['reference_end']),
-                        'text_start'      => intval($cite['text_start']),
-                        'text_end'        => intval($cite['text_end']),
-                        'content_a'       => $cite['original_text'],
-                        'content_b'       => $referText,
-                        'status'          => 0,
+                        'refer_index'     => $refNo,
+                        'origin_text'     => $cite['original_text'],
+                        'refer_text'      => $referText,
+                        'p_refer_id'      => $referMap[$referIndex]['p_refer_id'],
+                        'text_start'      => $cite['text_start'],
+                        'text_end'        => $cite['text_end'],
                        'created_at'      => $now,
                        'updated_at'      => $now,
                    ]);
@@ -658,12 +645,21 @@ class ReferenceCheckService
            $referenceNumbers = $this->expandReferenceNumbers($rawRef);

            $sentenceStart = $this->findSentenceStart($content, $tagStart);
-            $sentenceEnd = $this->findSentenceEnd($content, $tagEnd);
-            $originalText = mb_substr($content, $sentenceStart, $sentenceEnd - $sentenceStart);
-            $originalText = preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $originalText);
-            $originalText = trim(strip_tags($originalText));
+            $sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd);
+            $originalText = $this->buildCitationContextText($content, $sentenceStart, $sentenceEnd);

-            if ($originalText === '' || empty($referenceNumbers)) {
+            if (!$this->isMeaningfulCitationContext($originalText)) {
+                list($sentenceStart, $sentenceEnd) = $this->widenCitationContextBounds(
+                    $content,
+                    $tagStart,
+                    $tagEnd,
+                    $sentenceStart,
+                    $sentenceEnd
+                );
+                $originalText = $this->buildCitationContextText($content, $sentenceStart, $sentenceEnd);
+            }
+
+            if (!$this->isMeaningfulCitationContext($originalText) || empty($referenceNumbers)) {
                continue;
            }

@@ -707,29 +703,137 @@ class ReferenceCheckService
        return array_values(array_unique($numbers));
    }

+    /**
+     * Extract the plain-text citation context between two byte offsets of $content.
+     *
+     * The offsets are produced by strpos()/strrpos()/strlen()-based helpers, so they
+     * are byte offsets; the slice is therefore taken with substr(), not mb_substr().
+     * Citation markers such as <blue>[1,2]</blue> and any remaining tags are removed,
+     * and runs of whitespace are collapsed to a single space.
+     *
+     * @param string $content Full paragraph content (may contain markup).
+     * @param int    $start   Byte offset where the context starts.
+     * @param int    $end     Byte offset where the context ends (exclusive).
+     * @return string Cleaned context text; empty string when the span is empty.
+     */
+    private function buildCitationContextText($content, $start, $end)
+    {
+        $content = (string)$content;
+        $len = strlen($content);
+        $start = max(0, min((int)$start, $len));
+        $end = max($start, min((int)$end, $len));
+
+        // A bound may fall inside a multi-byte character (e.g. one byte after the
+        // start of "。", or after clamping a span to a byte budget). Move both bounds
+        // forward past UTF-8 continuation bytes (10xxxxxx) to the next boundary.
+        while ($start < $len && (ord($content[$start]) & 0xC0) === 0x80) {
+            $start++;
+        }
+        while ($end < $len && (ord($content[$end]) & 0xC0) === 0x80) {
+            $end++;
+        }
+
+        $text = substr($content, $start, $end - $start);
+        $text = preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $text);
+        $text = trim(strip_tags((string)$text));
+        $collapsed = preg_replace('/\s+/u', ' ', $text);
+
+        // preg_replace() with /u returns null on invalid UTF-8; keep the uncollapsed text then.
+        return $collapsed === null ? $text : $collapsed;
+    }
+
+    /**
+     * Decide whether a cleaned citation context is worth checking.
+     *
+     * Rejects text that is empty after trimming, consists only of punctuation,
+     * symbols or whitespace (e.g. a lone "." left after tags were stripped),
+     * contains no Unicode letter or digit, or is shorter than 2 characters.
+     *
+     * @param string $text Candidate context text.
+     * @return bool True when the text is usable as citation context.
+     */
+    private function isMeaningfulCitationContext($text)
+    {
+        $candidate = trim($text);
+
+        if ($candidate === '' || $this->isOnlyPunctuationOrSpace($candidate)) {
+            return false;
+        }
+
+        // Require at least one letter or digit before applying the length floor.
+        $hasLetterOrDigit = preg_match('/[\p{L}\p{N}]/u', $candidate);
+
+        return $hasLetterOrDigit ? mb_strlen($candidate) >= 2 : false;
+    }
+
+    /**
+     * Tell whether $text is non-empty and made up solely of whitespace,
+     * Unicode punctuation (\p{P}) and Unicode symbols (\p{S}).
+     *
+     * @param string $text Text to inspect.
+     * @return bool True only when the pattern matches; false for an empty string
+     *              or when the match fails.
+     */
+    private function isOnlyPunctuationOrSpace($text)
+    {
+        $matched = preg_match('/^[\s\p{P}\p{S}]+$/u', $text);
+
+        return $matched === 1;
+    }
+
+    /**
+     * Widen a too-short citation context by one sentence in each direction.
+     *
+     * The start is moved back to the start of the preceding sentence and the end
+     * forward to the next sentence end. Lengths are measured with strlen(), i.e.
+     * in bytes. When the widened span exceeds 2000 bytes it is replaced by a
+     * 2000-byte window positioned around the midpoint of the citation tag.
+     *
+     * @param string $content  Full paragraph content.
+     * @param int    $tagStart Byte offset where the citation tag starts.
+     * @param int    $tagEnd   Byte offset where the citation tag ends.
+     * @param int    $start    Current context start offset.
+     * @param int    $end      Current context end offset.
+     * @return array Two-element list: [start, end].
+     */
+    private function widenCitationContextBounds($content, $tagStart, $tagEnd, $start, $end)
+    {
+        $total = strlen($content);
+        $spanLimit = 2000;
+
+        // Look for the previous sentence start, searching from just before the current start.
+        if ($start > 0) {
+            $candidateStart = $this->findSentenceStart($content, max(0, $start - 1));
+            $start = min($start, $candidateStart);
+        }
+
+        // Accept the next sentence end only when it really moves forward and stays in range.
+        $candidateEnd = $this->findSentenceEnd($content, $end, $tagEnd);
+        $extendsForward = $candidateEnd > $end && $candidateEnd <= $total;
+        if ($extendsForward) {
+            $end = $candidateEnd;
+        }
+
+        if ($end - $start <= $spanLimit) {
+            return [$start, $end];
+        }
+
+        // Too long: fall back to a fixed-size window around the middle of the tag.
+        $halfSpan = (int)floor($spanLimit / 2);
+        $tagCentre = (int)floor(($tagStart + $tagEnd) / 2);
+        $start = max(0, $tagCentre - $halfSpan);
+        $end = min($total, $start + $spanLimit);
+
+        return [$start, $end];
+    }
+
+    /**
+     * Decide whether the delimiter found at byte offset $pos ends a sentence.
+     *
+     * Only the ASCII period is ever rejected: a "." with a digit immediately
+     * before and after it (0.95, 3.14) is treated as a decimal point. Every other
+     * delimiter, and any out-of-range position, is accepted.
+     *
+     * @param string $content   Text being scanned.
+     * @param int    $pos       Byte offset of the delimiter.
+     * @param string $delimiter The delimiter that was matched at $pos.
+     * @return bool False only for a period sitting between two digits.
+     */
+    private function isSentenceDelimiterAt($content, $pos, $delimiter)
+    {
+        if ($delimiter !== '.') {
+            return true;
+        }
+
+        $total = strlen($content);
+        if ($pos < 0 || $pos >= $total) {
+            return true;
+        }
+
+        // Both neighbours must exist before they can be inspected.
+        $hasBothNeighbours = $pos > 0 && $pos + 1 < $total;
+        $isDecimalPoint = $hasBothNeighbours
+            && ctype_digit($content[$pos - 1])
+            && ctype_digit($content[$pos + 1]);
+
+        return !$isDecimalPoint;
+    }
+
    private function findSentenceStart($content, $position)
    {
        $start = 0;
        foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
            $pos = strrpos(substr($content, 0, $position), $delimiter);
-            if ($pos !== false) {
+            if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
                $start = max($start, $pos + 1);
            }
        }
        return $start;
    }

-    private function findSentenceEnd($content, $position)
+    /**
+     * Find the byte offset just past the end of the sentence at or after $searchFrom.
+     *
+     * The nearest accepted delimiter (".", "。", "!", "?" or a newline) ends the
+     * sentence. When $tagEnd is positive and the text between $tagEnd and that
+     * delimiter is empty or punctuation only (an isolated period right after the
+     * closing </blue>), the delimiter is skipped and the search continues.
+     *
+     * @param string $content    Text being scanned.
+     * @param int    $searchFrom Byte offset at which the search starts.
+     * @param int    $tagEnd     Byte offset where the citation tag ends; 0 disables the skip.
+     * @return int Byte offset just past the chosen delimiter, or strlen($content)
+     *             when no delimiter is found.
+     */
+    private function findSentenceEnd($content, $searchFrom, $tagEnd = 0)
     {
         $length = strlen($content);
-        $endPositions = [];
-        foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
-            $pos = strpos($content, $delimiter, $position);
-            if ($pos !== false) {
-                $endPositions[] = $pos + 1;
+        $minPos = max(0, $searchFrom);
+
+        while ($minPos < $length) {
+            $endPositions = [];
+            foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
+                $pos = strpos($content, $delimiter, $minPos);
+                // A rejected hit (decimal point such as 0.95) must not hide later real
+                // delimiters, so keep searching forward until one is accepted.
+                while ($pos !== false && !$this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
+                    $pos = strpos($content, $delimiter, $pos + 1);
+                }
+                if ($pos !== false) {
+                    // Step over the whole delimiter: "。" is 3 bytes in UTF-8.
+                    $endPositions[] = $pos + strlen($delimiter);
+                }
             }
+            if (empty($endPositions)) {
+                return $length;
+            }
+
+            $end = min($endPositions);
+            if ($tagEnd <= 0 || $end <= $tagEnd) {
+                return $end;
+            }
+
+            $gap = substr($content, $tagEnd, $end - $tagEnd);
+            $gapText = trim(strip_tags(preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $gap)));
+            if ($gapText !== '' && !$this->isOnlyPunctuationOrSpace($gapText)) {
+                return $end;
+            }
+
+            $minPos = $end;
         }
-        return empty($endPositions) ? $length : min($endPositions);
+
+        return $length;
     }

    private function pushJob($checkId, $delaySeconds = 0)
--- a/application/common/service/LLMService.php
+++ b/application/common/service/LLMService.php
@@ -93,6 +93,318 @@ class LLMService
            'reason' => $this->cleanReason((string)(isset($parsed['reason']) ? $parsed['reason'] : '')),
        ];
    }
+    /**
+     * Build the system prompt for the citation/reference matching check.
+     *
+     * The prompt is a nowdoc literal returned verbatim (no interpolation). It tells
+     * the model to act as a journal editor judging whether one cited sentence is
+     * supported by one reference entry, and to answer with a single line of
+     * minified JSON holding is_match, confidence (one of nine fixed values) and a
+     * Simplified Chinese reason.
+     *
+     * @return string The system prompt text.
+     */
+    private function buildReferenceCheckSystemPrompt3()
+    {
+        return <<<'PROMPT'
+你是一名护理、医学与科研期刊的资深编辑，专门校对「正文引用句」与「对应参考文献条目」是否匹配。
+
+你的职责是判断：作者在该引用位置引用的观点、数据、结论、方法、定义、理论或证据，是否能够被该条参考文献合理支撑。
+
+你只能依据用户提供的两段文本判断，不得假设已阅读全文，不得联网，不得编造文献中未出现的信息。
+
+【输入内容】
+你将收到：
+
+1. 正文引用句（引用位置附近的一句话或一段话）
+
+2. 当前对应的参考文献条目（仅当前编号，不是整篇参考文献列表）
+
+你必须严格只评估「当前这一条参考文献」与引用句的关系。
+
+====================
+【核心判断目标】
+
+判断：
+正文中的核心论点、事实、数据、定义、护理措施、医学结论、研究发现、理论依据、政策依据、算法方法、统计方法、模型结构等，是否可由该条参考文献合理支撑。
+
+你评估的是“引用是否成立”，不是“句子是否正确”。
+
+====================
+【硬性约束（必须遵守）】
+
+1. 只能依据用户提供的信息判断
+- 不得假设看过全文。
+- 不得联网。
+- 不得根据常识补全文献内容。
+- 不得根据作者、期刊名、热点方向脑补研究结果。
+- 不得把“可能研究了”视为“能够支撑”。
+
+2. 严禁串号判断
+- 仅允许依据「当前引用句」与「当前参考文献条目」判断。
+- 严禁利用其它参考文献编号或上下文内容推断当前文献。
+
+3. 不得关键词硬匹配
+禁止因为出现相同关键词就判匹配，例如：
+“护理”“患者”“治疗”“效果”“心理”“机器学习”“深度学习”“模型”等。
+
+必须重点判断：
+- 对象是否一致
+- 疾病/场景是否一致
+- 人群是否一致
+- 干预方式是否一致
+- 方法学是否一致
+- 关键结论是否一致
+
+4. 医学与科研错引从严
+若出现以下情况，优先判 false：
+
+- 同领域但具体疾病不同
+- 人群不同（儿童 vs 老年）
+- 场景不同（ICU vs 普通病房）
+- 干预方式不同
+- 指标或结局不同
+- 指南、综述、Meta、原始研究混用
+- 文献无法支撑正文中的强结论
+
+例如：
+正文：
+“研究证实显著降低死亡率”
+
+文献：
+“某护理模式应用观察”
+
+不得脑补效果成立，应从严判 false。
+
+5. 特定证据类型必须一致
+若正文明确声明：
+
+- “随机对照研究显示”
+- “Meta分析表明”
+- “系统综述指出”
+- “指南推荐”
+- “专家共识建议”
+
+而文献条目显示证据类型不一致，应从严判 false。
+
+6. 方法学引用必须严格一致（非常重要）
+若正文明确引用某种：
+
+- 算法
+- 模型
+- 聚类方法
+- 分类方法
+- 深度学习架构
+- 统计方法
+- 数学技术
+- 数据处理方法
+
+则文献必须与该方法存在明确合理关联。
+
+例如：
+
+不匹配：
+- fuzzy clustering ≠ deep learning
+- random forest ≠ SVM
+- CNN ≠ LSTM
+- 聚类模型 ≠ 分类模型
+- 回归分析 ≠ 聚类分析
+
+仅属于同一“人工智能/机器学习”大领域，不能视为匹配。
+
+若方法体系明显不同：
+优先判 false + confidence=0.15。
+
+7. 信息不足从严
+若参考文献条目信息过少（仅作者+年份等）：
+
+只有在能够建立明确关联时才可判 true。
+
+无法建立明确关联：
+判 false。
+
+====================
+【评估步骤（按顺序在心里完成）】
+
+第一步：主题域一致性
+判断正文核心主题与文献是否属于同一专业领域，包括：
+
+- 疾病
+- 患者群体
+- 护理问题
+- 医疗场景
+- 干预措施
+- 指标/结局
+- 理论模型
+- 政策/指南
+- 算法/统计方法
+
+第二步：关键断言对齐
+判断正文中的核心断言是否能够被文献合理支撑。
+
+允许：
+- 合理概括
+- 轻度表述扩展
+
+不允许：
+- 张冠李戴
+- 过度推断
+- 用弱证据支撑强结论
+- 用相关性支撑因果性
+- 用观察研究支撑RCT级表述
+- 方法体系不一致
+
+第三步：错引排查
+重点检查：
+
+- 疾病错
+- 人群错
+- 场景错
+- 方法错
+- 指标错
+- 研究类型错
+- 证据层级错
+- 算法体系错
+
+====================
+【最终判定规则】
+
+is_match（二选一）
+
+true：
+满足以下全部条件：
+- 主题明确相关
+- 核心对象基本一致
+- 方法或研究方向合理一致
+- 正文关键论点能够被文献支撑
+- 不存在明显错引风险
+
+false：
+满足任一情况：
+- 主题无关
+- 对象不同
+- 疾病/场景不同
+- 方法体系明显不同
+- 核心结论对不上
+- 文献无法支撑正文强结论
+- 证据类型不一致
+- 无法建立明确合理关联
+- 信息不足无法确认
+
+边界情况从严判 false。
+
+====================
+【confidence 固定评分规则】
+
+只能输出以下固定值之一：
+
+0.98
+0.92
+0.85
+0.78
+0.65
+0.45
+0.35
+0.25
+0.15
+
+禁止输出任何其它数字。
+
+--------------------
+【true 档位】
+
+0.98（几乎完全一致）
+主题、对象、方法、核心结论高度一致。
+
+0.92（高度匹配）
+主题与关键论点明确一致，仅存在轻微概括。
+
+0.85（较匹配）
+主题和核心结论一致，但表述略宽。
+
+0.78（基本匹配）
+大方向一致，但存在轻微泛化或不精确。
+
+0.65（边界匹配）
+存在一定支撑关系，但结论略强或关联较弱。
+
+--------------------
+【false 档位】
+
+0.45（人工复核）
+信息不足、标题过泛、同领域但无法确认。
+
+0.35（较可能错引）
+同领域但对象、场景、结论存在明显偏差。
+
+0.25（明显不匹配）
+主题相关但核心论点明显不一致。
+
+0.15（明确错引）
+以下情况优先使用：
+
+- 主题无关
+- 方法体系明显不同
+- 典型张冠李戴
+- 完全无法支撑正文内容
+
+例如：
+正文讲 fuzzy clustering，
+文献讲 hybrid deep learning，
+应判：
+false + 0.15。
+
+====================
+【硬性规则】
+
+- is_match=true 时：
+confidence 只能是：
+0.65 / 0.78 / 0.85 / 0.92 / 0.98
+
+- is_match=false 时：
+confidence 只能是：
+0.15 / 0.25 / 0.35 / 0.45
+
+禁止违反。
+
+====================
+【评分稳定原则】
+
+- 相同输入必须得到相同结果。
+- 优先依据“主题 + 核心断言”。
+- 不要被单个关键词误导。
+- 一句多引时，仅评价当前这一条文献。
+- 边界情况从严，降低漏报错引风险。
+- 方法学不一致时优先 false。
+
+====================
+【reason 输出要求】
+
+- 使用简体中文。
+- 长度控制在 30~80 字。
+- 只说明两件事：
+  1）主题/对象/方法是否一致；
+  2）核心论点是否能够支撑。
+
+禁止模糊措辞：
+- “可能有关”
+- “看起来一致”
+- “应该支持”
+- “似乎”
+
+应明确表达：
+一致 / 不一致 / 无法支撑。
+
+====================
+【输出格式（绝对严格）】
+
+仅输出一行 minified JSON。
+
+禁止：
+- markdown
+- 代码块
+- 换行
+- 解释说明
+- 前后文字
+
+格式：
+
+{"is_match":true|false,"confidence":0.15|0.25|0.35|0.45|0.65|0.78|0.85|0.92|0.98,"reason":"简体中文原因"}
+
+【示例输出】
+
+{"is_match":false,"confidence":0.15,"reason":"正文讨论改进模糊聚类算法及聚类划分优化，而文献主题为基于步态加速度的糖尿病深度学习检测，研究方法与核心内容明显不符。"}
+PROMPT;
+    }
    private function buildReferenceCheckSystemPrompt()
    {
        return <<<'PROMPT'
@@ -166,6 +478,24 @@ class LLMService
 - 只有在能够建立明确合理关联时才判 true。
 - 无法建立明确关联时，判 false（confidence=0.35）。

+7. 方法学引用严格一致
+若正文明确引用某一算法、模型、统计方法、聚类方法、
+深度学习架构、评估方法或数学技术：
+
+必须要求参考文献与该方法存在明确合理关联。
+
+例如：
+- fuzzy clustering ≠ deep learning
+- random forest ≠ SVM
+- CNN ≠ LSTM
+- 聚类方法 ≠ 分类模型
+
+仅属于同一“机器学习/人工智能”大领域，
+不能视为匹配，应从严判 false。
+
+若方法体系明显不同，优先判：
+confidence=0.15
+
 ====================
 【评估步骤（按顺序在心里完成）】