Files
tougao/application/common/ReferenceCheckService.php

1585 lines
55 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use think\Db;
use think\Env;
use think\Queue;
/**
* Relevance check between body citations <blue>[n]</blue> and the matching
* t_production_article_refer row (index + 1 = n).
* LLM configuration is the same as PromotionLlmService; each task runs on the ReferenceCheck queue.
*/
class ReferenceCheckService
{
const QUEUE_NAME = 'ReferenceCheck';
/** t_article_main.ref_check_status */
const AM_STATUS_NONE = 0;
const AM_STATUS_PASS = 1;
const AM_STATUS_FAIL = 2;
const AM_STATUS_RUNNING = 3;
/**
 * Null-coalescing array lookup for PHP versions that lack the ?? operator.
 *
 * @param mixed $arr     Array (or array-like value) to read from.
 * @param mixed $key     Key to look up.
 * @param mixed $default Returned when the key is not set or its value is null.
 * @return mixed
 */
private function arrGet($arr, $key, $default = '')
{
    if (!isset($arr[$key])) {
        return $default;
    }
    return $arr[$key];
}
/**
 * Queue a single check; the citing text and the reference text may be supplied by hand.
 *
 * @param string $contentA Citing text; required, trimmed before it is stored.
 * @param string $contentB Reference text; trimmed before it is stored.
 * @param array  $extra    Optional values: article_id, am_id, p_article_id, p_refer_id,
 *                         refer_index, reference_no, reference_raw, cite_tag_start,
 *                         cite_tag_end, text_start, text_end and queue_delay.
 * @return array ['check_id' => inserted row id, 'queued' => 1]
 * @throws \InvalidArgumentException when $contentA is empty after trimming
 */
public function enqueue($contentA, $contentB, array $extra = [])
{
    $bodyText = trim($contentA);
    if ($bodyText === '') {
        throw new \InvalidArgumentException('content_a is required');
    }
    $timestamp = date('Y-m-d H:i:s');

    // Build the row in the same column order as before.
    $row = [];
    foreach (['article_id', 'am_id', 'p_article_id', 'p_refer_id', 'refer_index', 'reference_no'] as $idField) {
        $row[$idField] = intval($this->arrGet($extra, $idField, 0));
    }
    $row['reference_raw'] = (string)$this->arrGet($extra, 'reference_raw', '');
    foreach (['cite_tag_start', 'cite_tag_end', 'text_start', 'text_end'] as $offsetField) {
        $row[$offsetField] = intval($this->arrGet($extra, $offsetField, 0));
    }
    $row['content_a'] = $bodyText;
    $row['content_b'] = trim($contentB);
    $row['status'] = 0;
    $row['created_at'] = $timestamp;
    $row['updated_at'] = $timestamp;

    $checkId = Db::name('article_reference_check_result')->insertGetId($row);

    // Flag the owning section as running before the job is pushed.
    if ($row['am_id'] > 0) {
        $this->setAmRefCheckStatus($row['am_id'], self::AM_STATUS_RUNNING);
    }
    $queueDelay = intval($this->arrGet($extra, 'queue_delay', 0));
    $this->pushJob(intval($checkId), $queueDelay);

    return ['check_id' => $checkId, 'queued' => 1];
}
/**
 * Queue reference checks for every citation found in one article_main row.
 *
 * Each <blue>[...]</blue> tag is expanded into its individual reference numbers
 * (e.g. [70-73] becomes four rows); one job is pushed per number, each delayed one
 * unit longer than the previous one. Reference numbers with no matching refer row
 * are skipped.
 *
 * @param array $main article_main row; am_id, article_id and content are read.
 * @return void
 * @throws \RuntimeException when the article has no production_article row with state=0
 */
public function enqueueByArticleMain($main)
{
    $amId = intval($main['am_id']);
    $citations = $this->extractReferences((string)$main['content']);
    if (empty($citations)) {
        // No usable citation in this section: nothing to verify.
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
        return;
    }
    $prod = Db::name('production_article')
        ->where('article_id', $main['article_id'])
        ->where('state', 0)
        ->find();
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found for article_id=' . $main['article_id']);
    }
    $pArticleId = intval($prod['p_article_id']);
    $referMap = $this->loadReferMapByPArticleId($pArticleId);

    $queued = 0;
    $delay = 0;
    foreach ($citations as $cite) {
        foreach ($cite['reference_numbers'] as $refNo) {
            // Citation numbers are 1-based; the refer map is keyed by the 0-based index column.
            $referIndex = $refNo - 1;
            if ($referIndex < 0 || !isset($referMap[$referIndex])) {
                continue;
            }
            $refer = $referMap[$referIndex];
            $now = date('Y-m-d H:i:s');
            $checkId = Db::name('article_reference_check_result')->insertGetId([
                'article_id' => $main['article_id'],
                'p_article_id' => $pArticleId,
                'am_id' => $amId,
                'reference_no' => $refNo,
                // NOTE(review): stores the 1-based number rather than $referIndex - confirm which one readers expect.
                'refer_index' => $refNo,
                // reference_raw is read by indexBadResults() and cite_tag_start by listByArticle();
                // persist them so the preview can match this row to its tag.
                'reference_raw' => $cite['reference_raw'],
                'cite_tag_start' => $cite['reference_start'],
                'cite_tag_end' => $cite['reference_end'],
                'origin_text' => $cite['original_text'],
                'refer_text' => $this->formatReferForLlm($refer),
                'p_refer_id' => $refer['p_refer_id'],
                'text_start' => $cite['text_start'],
                'text_end' => $cite['text_end'],
                'created_at' => $now,
                'updated_at' => $now,
            ]);
            $this->pushJob(intval($checkId), $delay);
            $queued++;
            $delay += 1;
        }
    }

    // Only mark the section as running when a job was actually queued; otherwise
    // nothing would ever move it out of the running state (matches enqueueByArticle()).
    if ($queued > 0) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
    }
}
/**
 * Manual trigger: queue the DOI second-pass recheck for finished rows (status=1)
 * whose confidence is <= 0.65. At most two randomly ordered rows are considered per call.
 *
 * @param int $articleId
 * @return array ['article_id' => int, 'check_ids2' => int[], 'queued' => int]
 * @throws \InvalidArgumentException when $articleId is not positive
 */
public function enqueueSecondPassByArticle($articleId)
{
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }

    $candidates = Db::name('article_reference_check_result')
        ->where('article_id', $articleId)
        ->where('status', 1)
        ->where('confidence', '<=', 0.65)
        ->orderRaw('rand()')
        ->limit(2)
        ->select();

    $queuedIds = [];
    foreach ($candidates as $candidate) {
        $rowId = $this->resolveCheckRowId($candidate);
        $accepted = $this->maybeEnqueueSecondPass($rowId, floatval($candidate['confidence']));
        if (!$accepted) {
            continue;
        }
        $queuedIds[] = $rowId;
    }

    return [
        'article_id' => $articleId,
        'check_ids2' => $queuedIds,
        'queued' => count($queuedIds),
    ];
}
/**
 * Queue reference checks for every section (article_main row) of an article.
 *
 * Sections without usable citations are set to AM_STATUS_NONE; sections that got at
 * least one job are set to AM_STATUS_RUNNING. Reference numbers with no matching
 * refer row are counted as skipped.
 *
 * @param int $articleId
 * @return array article_id, p_article_id, queued, skipped, check_ids, queue
 * @throws \InvalidArgumentException when $articleId is not positive
 * @throws \RuntimeException when no production_article or article_main row is found
 */
public function enqueueByArticle($articleId)
{
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }
    // whereIn() matches the state filter used for article_main elsewhere in this class.
    $prod = Db::name('production_article')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->find();
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found for article_id=' . $articleId);
    }
    $pArticleId = intval($prod['p_article_id']);
    $referMap = $this->loadReferMapByPArticleId($pArticleId);
    $mains = Db::name('article_main')
        ->field('am_id,content,article_id')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->order('sort asc')
        ->select();
    if (empty($mains)) {
        throw new \RuntimeException('article_main is empty');
    }

    $queued = 0;
    $skipped = 0;
    $checkIds = [];
    $delay = 0;
    $amIdsWithJobs = [];
    foreach ($mains as $main) {
        $amId = intval($main['am_id']);
        $citations = $this->extractReferences((string)$main['content']);
        if (empty($citations)) {
            $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
            continue;
        }
        foreach ($citations as $cite) {
            foreach ($cite['reference_numbers'] as $refNo) {
                // Citation numbers are 1-based; the refer map is keyed by the 0-based index column.
                $referIndex = $refNo - 1;
                if ($referIndex < 0 || !isset($referMap[$referIndex])) {
                    $skipped++;
                    continue;
                }
                $refer = $referMap[$referIndex];
                $now = date('Y-m-d H:i:s');
                // A tag such as [70-73] yields one row per number (70, 71, 72, 73).
                $checkId = Db::name('article_reference_check_result')->insertGetId([
                    'article_id' => $main['article_id'],
                    'p_article_id' => $pArticleId,
                    'am_id' => $amId,
                    'reference_no' => $refNo,
                    // NOTE(review): stores the 1-based number rather than $referIndex - confirm which one readers expect.
                    'refer_index' => $refNo,
                    // reference_raw is read by indexBadResults() and cite_tag_start by listByArticle();
                    // persist them so the preview can match this row to its tag.
                    'reference_raw' => $cite['reference_raw'],
                    'cite_tag_start' => $cite['reference_start'],
                    'cite_tag_end' => $cite['reference_end'],
                    'origin_text' => $cite['original_text'],
                    'refer_text' => $this->formatReferForLlm($refer),
                    'p_refer_id' => $refer['p_refer_id'],
                    'text_start' => $cite['text_start'],
                    'text_end' => $cite['text_end'],
                    'created_at' => $now,
                    'updated_at' => $now,
                ]);
                $this->pushJob(intval($checkId), $delay);
                $checkIds[] = $checkId;
                $queued++;
                $delay += 1;
                $amIdsWithJobs[$amId] = true;
            }
        }
    }

    foreach (array_keys($amIdsWithJobs) as $runningAmId) {
        $this->setAmRefCheckStatus($runningAmId, self::AM_STATUS_RUNNING);
    }

    return [
        'article_id' => $articleId,
        'p_article_id' => $pArticleId,
        'queued' => $queued,
        'skipped' => $skipped,
        'check_ids' => $checkIds,
        'queue' => self::QUEUE_NAME,
    ];
}
/**
 * Recompute t_article_main.ref_check_status from all detail rows of one section.
 *
 * Any row with status 0 keeps the section running; otherwise a row with status 2, or a
 * finished row (status 1) with is_match = 0, makes it fail; it passes only when every
 * row has status 1.
 *
 * @param int $amId
 * @return int one of the AM_STATUS_* constants (also written to the section)
 */
public function syncAmRefCheckStatus($amId)
{
    if ($amId <= 0) {
        return self::AM_STATUS_NONE;
    }
    $rows = Db::name('article_reference_check_result')->where('am_id', $amId)->select();
    if (empty($rows)) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
        return self::AM_STATUS_NONE;
    }

    $waiting = 0;
    $finished = 0;
    $failed = false;
    foreach ($rows as $row) {
        $state = intval($row['status']);
        if ($state === 0) {
            $waiting++;
        } elseif ($state === 1) {
            $finished++;
            if (intval($row['is_match']) === 0) {
                $failed = true;
            }
        } elseif ($state === 2) {
            $failed = true;
        }
    }

    if ($waiting > 0) {
        $result = self::AM_STATUS_RUNNING;
    } elseif (!$failed && $finished === count($rows)) {
        $result = self::AM_STATUS_PASS;
    } else {
        $result = self::AM_STATUS_FAIL;
    }

    $this->setAmRefCheckStatus($amId, $result);
    return $result;
}
/**
 * Write ref_check_status on one article_main row; non-positive ids are ignored.
 *
 * @param int $amId
 * @param int $status one of the AM_STATUS_* constants
 * @return void
 */
public function setAmRefCheckStatus($amId, $status)
{
    if ($amId <= 0) {
        return;
    }
    $change = ['ref_check_status' => $status];
    Db::name('article_main')
        ->where('am_id', $amId)
        ->update($change);
}
/**
 * Delete every check row of an article and reset ref_check_status to AM_STATUS_NONE
 * on its article_main rows whose state is 0 or 2.
 *
 * @param int $articleId
 * @return void
 */
public function clearArticleChecks($articleId)
{
    $results = Db::name('article_reference_check_result')->where('article_id', $articleId);
    $results->delete();

    $sections = Db::name('article_main')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2]);
    $sections->update(['ref_check_status' => self::AM_STATUS_NONE]);
}
/**
 * Map an AM_STATUS_* value to its label: none, pass, fail or running.
 *
 * @param int $status
 * @return string 'unknown' for any value that is not one of the four constants
 */
public static function amStatusLabel($status)
{
    $labels = [
        self::AM_STATUS_NONE => 'none',
        self::AM_STATUS_PASS => 'pass',
        self::AM_STATUS_FAIL => 'fail',
        self::AM_STATUS_RUNNING => 'running',
    ];
    if (isset($labels[$status])) {
        return $labels[$status];
    }
    return 'unknown';
}
/**
 * Return the id of a check row. The table's primary key is `id`, while the external
 * API parameter is still called check_id, so both keys are accepted (id wins).
 *
 * @param mixed $row
 * @return int 0 when $row is not an array or carries no positive id
 */
public function resolveCheckRowId($row)
{
    if (is_array($row)) {
        foreach (['id', 'check_id'] as $column) {
            if (isset($row[$column]) && intval($row[$column]) > 0) {
                return intval($row[$column]);
            }
        }
    }
    return 0;
}
/**
 * Interpret the is_match value returned by the LLM.
 *
 * Accepts booleans, numbers (true only when the integer part is 1) and the strings
 * "1", "true", "yes", "match" and "matched" (case-insensitive, surrounding whitespace ignored).
 *
 * @param mixed $value
 * @return bool
 */
public function parseLlmIsMatch($value)
{
    if (is_bool($value)) {
        return $value;
    }
    $isNumber = is_int($value) || is_float($value);
    if ($isNumber) {
        return intval($value) === 1;
    }
    $truthyWords = ['1', 'true', 'yes', 'match', 'matched'];
    $normalized = strtolower(trim((string)$value));
    return in_array($normalized, $truthyWords, true);
}
/**
 * Persist the result of one check row.
 *
 * reason and error_msg are trimmed and cut to 512 characters so an over-long value
 * cannot make the UPDATE fail on a varchar(512) column; updated_at is always refreshed.
 *
 * @param int   $checkId
 * @param array $fields column => value pairs to write
 * @return int number of affected rows as reported by the update
 * @throws \InvalidArgumentException when $checkId is not positive
 * @throws \RuntimeException when the row does not exist or the update reports failure
 */
public function updateCheckResult($checkId, array $fields)
{
    $rowId = intval($checkId);
    if ($rowId <= 0) {
        throw new \InvalidArgumentException('invalid check id');
    }

    foreach (['reason', 'error_msg'] as $textColumn) {
        if (isset($fields[$textColumn])) {
            $fields[$textColumn] = mb_substr(trim((string)$fields[$textColumn]), 0, 512);
        }
    }
    $fields['updated_at'] = date('Y-m-d H:i:s');

    $current = Db::name('article_reference_check_result')->where('id', $rowId)->find();
    if (empty($current)) {
        throw new \RuntimeException('article_reference_check_result not found, id=' . $rowId);
    }

    $affected = Db::name('article_reference_check_result')->where('id', $rowId)->update($fields);
    if ($affected === false) {
        throw new \RuntimeException('article_reference_check_result update failed, id=' . $rowId);
    }

    $affectedCount = intval($affected);
    \think\Log::info('updateCheckResult id=' . $rowId . ' affected=' . $affectedCount);
    return $affectedCount;
}
/**
 * Load one check row by id.
 *
 * @param int $checkId
 * @return array|null null when the id is not positive or no row is found
 */
public function getResult($checkId)
{
    if ($checkId <= 0) {
        return null;
    }
    $found = Db::name('article_reference_check_result')
        ->where('id', $checkId)
        ->find();
    if (empty($found)) {
        return null;
    }
    return $found;
}
/**
 * List the check rows of an article ordered by am_id, cite_tag_start and reference_no.
 *
 * @param int  $articleId
 * @param int  $status       Filter on status when >= 0; a negative value disables the filter.
 * @param bool $onlyMismatch When true, additionally restrict to status = 1 and is_match = 0.
 * @return mixed rows as returned by the query builder's select()
 */
public function listByArticle($articleId, $status = -1, $onlyMismatch = false)
{
    $query = Db::name('article_reference_check_result')->where('article_id', $articleId);
    if ($status >= 0) {
        $query->where('status', $status);
    }
    if ($onlyMismatch) {
        $query->where('status', 1);
        $query->where('is_match', 0);
    }
    $query->order('am_id asc, cite_tag_start asc, reference_no asc');
    return $query->select();
}
/**
 * Manuscript preview: mark questionable reference numbers and their citing sentences
 * on each section's content.
 *
 * @param int $articleId
 * @param int $amId Restrict the sections to this am_id when > 0. Statistics and flagged
 *                  rows are still taken from the whole article.
 * @return array article_id, article_ref_check_pass (true|false|null), sections, issues, stats
 */
public function buildArticlePreview($articleId, $amId = 0)
{
    $q = Db::name('article_main')
        ->field('am_id,content,sort,ref_check_status')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2]);
    if ($amId > 0) {
        $q->where('am_id', $amId);
    }
    $mains = $q->order('sort asc')->select();

    // One fetch serves both the statistics and the bad-result index:
    // indexBadResults() itself ignores every row whose status is not 1.
    $allRows = $this->listByArticle($articleId, -1);
    $badByAm = $this->indexBadResults($allRows);

    $stats = ['total' => 0, 'mismatch' => 0, 'match' => 0, 'pending' => 0];
    foreach ($allRows as $r) {
        $stats['total']++;
        if (intval($r['status']) === 0) {
            $stats['pending']++;
        } elseif (intval($r['is_match']) === 1) {
            $stats['match']++;
        } else {
            $stats['mismatch']++;
        }
    }

    $sections = [];
    $issues = [];
    foreach ($mains as $main) {
        $id = intval($main['am_id']);
        $content = (string)$main['content'];
        $badIndex = isset($badByAm[$id]) ? $badByAm[$id] : [];
        $marked = $this->markContentForPreview($content, $id, $badIndex);
        $amStatus = intval($this->arrGet($main, 'ref_check_status', 0));
        $sections[] = [
            'am_id' => $id,
            'ref_check_status' => $amStatus,
            'ref_check_pass' => $amStatus === self::AM_STATUS_PASS,
            'ref_check_label' => self::amStatusLabel($amStatus),
            'content' => $content,
            'content_marked' => $marked['html'],
            'issue_count' => $marked['issue_count'],
        ];
        foreach ($marked['issues'] as $issue) {
            $issues[] = $issue;
        }
    }

    return [
        'article_id' => $articleId,
        'article_ref_check_pass' => $this->resolveArticlePass($sections),
        'sections' => $sections,
        'issues' => $issues,
        'stats' => $stats,
    ];
}
/**
 * Decide whether the whole article passes: every section that was checked must be
 * AM_STATUS_PASS. Sections with AM_STATUS_NONE (no citations) are ignored.
 *
 * @param array $sections entries carrying ref_check_status
 * @return bool|null false as soon as one checked section is not pass; true when at least
 *                   one section was checked and all passed; null when none was checked
 */
private function resolveArticlePass($sections)
{
    $sawChecked = false;
    foreach ($sections as $section) {
        $state = intval($this->arrGet($section, 'ref_check_status', 0));
        if ($state === self::AM_STATUS_NONE) {
            continue;
        }
        if ($state !== self::AM_STATUS_PASS) {
            return false;
        }
        $sawChecked = true;
    }
    if (!$sawChecked) {
        return null;
    }
    return true;
}
/**
 * Group finished, non-matching rows by section.
 *
 * Rows are kept only when status = 1 and is_match != 1 and both am_id and reference_no
 * are positive. For every section two lookups are built:
 *   - by_raw:   normalized reference_raw => reference_no => row
 *   - contexts: "<text_start>_<text_end>" => text_start, text_end, check_ids, reasons, ref_nos
 *
 * @param array $rows check result rows
 * @return array am_id => ['by_raw' => array, 'contexts' => array]
 */
private function indexBadResults($rows)
{
    $grouped = [];
    foreach ($rows as $row) {
        $isFinished = intval($row['status']) === 1;
        if (!$isFinished || intval($row['is_match']) === 1) {
            continue;
        }
        $amId = intval($row['am_id']);
        $refNo = intval($row['reference_no']);
        if ($amId <= 0 || $refNo <= 0) {
            continue;
        }
        if (!isset($grouped[$amId])) {
            $grouped[$amId] = ['by_raw' => [], 'contexts' => []];
        }

        $rawKey = $this->normalizeRefRawKey((string)$this->arrGet($row, 'reference_raw', ''));
        if ($rawKey !== '') {
            $grouped[$amId]['by_raw'][$rawKey][$refNo] = $row;
        }

        $from = intval($row['text_start']);
        $to = intval($row['text_end']);
        $ctxKey = $from . '_' . $to;
        if (!isset($grouped[$amId]['contexts'][$ctxKey])) {
            $grouped[$amId]['contexts'][$ctxKey] = [
                'text_start' => $from,
                'text_end' => $to,
                'check_ids' => [],
                'reasons' => [],
                'ref_nos' => [],
            ];
        }
        $ctx = &$grouped[$amId]['contexts'][$ctxKey];
        $ctx['check_ids'][] = $this->resolveCheckRowId($row);
        $ctx['ref_nos'][] = $refNo;
        $reason = trim((string)$this->arrGet($row, 'reason', ''));
        if ($reason !== '') {
            $ctx['reasons'][$refNo] = $reason;
        }
        // Drop the reference so the next iteration cannot write through it.
        unset($ctx);
    }
    return $grouped;
}
/**
 * Normalize the inner text of a citation tag so it can be used as a lookup key:
 * Unicode comma and dash variants become ASCII "," and "-", spaces are removed and
 * the result is lower-cased.
 *
 * The variants are written as UTF-8 byte escapes on purpose: literal look-alike
 * characters are easily lost in editors and viewers, and an empty search string
 * is silently ignored by the replacement.
 *
 * @param string $raw
 * @return string
 */
private function normalizeRefRawKey($raw)
{
    $map = [
        "\xEF\xBC\x8C" => ',', // U+FF0C fullwidth comma
        "\xE3\x80\x81" => ',', // U+3001 ideographic comma
        "\xE2\x80\x90" => '-', // U+2010 hyphen
        "\xE2\x80\x91" => '-', // U+2011 non-breaking hyphen
        "\xE2\x80\x92" => '-', // U+2012 figure dash
        "\xE2\x80\x93" => '-', // U+2013 en dash
        "\xE2\x80\x94" => '-', // U+2014 em dash
        "\xE2\x80\x95" => '-', // U+2015 horizontal bar
        "\xE2\x88\x92" => '-', // U+2212 minus sign
        "\xEF\xBC\x8D" => '-', // U+FF0D fullwidth hyphen-minus
        "\xE3\x80\x80" => '',  // U+3000 ideographic space
        ' ' => '',
    ];
    return strtolower(strtr(trim($raw), $map));
}
/**
 * Decorate one section's HTML so the preview can highlight questionable citations.
 *
 * Step 1 rewrites every <blue>[...]</blue> tag and wraps each flagged reference number
 * in a span.ref-no-error (for [70-73] only the flagged numbers, e.g. 70 and 71).
 * Step 2 wraps each flagged context range in a span.ref-context-error; the ranges are
 * byte offsets into the original $content and are shifted by the size changes of step 1.
 *
 * @param string $content  Raw section HTML.
 * @param int    $amId     Section id, copied into every issue entry.
 * @param array  $badIndex Per-section structure built by indexBadResults() (by_raw, contexts).
 * @return array ['html' => string, 'issues' => array, 'issue_count' => int]
 */
private function markContentForPreview($content, $amId, $badIndex)
{
    $badByRaw = isset($badIndex['by_raw']) ? $badIndex['by_raw'] : [];
    $contexts = isset($badIndex['contexts']) ? $badIndex['contexts'] : [];
    $issues = [];
    $issueCount = 0;
    if ($content === '' || (empty($badByRaw) && empty($contexts))) {
        return ['html' => $content, 'issues' => [], 'issue_count' => 0];
    }
    $html = $content;

    // 1) Mark the individual numbers inside each blue tag (offsets refer to the original text).
    preg_match_all(
        '/<blue>\[([\d,\-\s]+)\]<\/blue>/',
        $html,
        $matches,
        PREG_OFFSET_CAPTURE
    );
    $citeDeltas = [];
    if (!empty($matches[0])) {
        $replacements = [];
        foreach ($matches[0] as $idx => $match) {
            $fullTag = $match[0];
            $tagStart = $match[1];
            $tagEnd = $tagStart + strlen($fullTag);
            $inner = $matches[1][$idx][0];
            $rawKey = $this->normalizeRefRawKey($inner);
            $badNums = isset($badByRaw[$rawKey]) ? $badByRaw[$rawKey] : [];
            $innerMarked = preg_replace_callback(
                '/\d+/',
                function ($numMatch) use ($badNums, &$issues, &$issueCount, $amId, $inner) {
                    $num = intval($numMatch[0]);
                    if (!isset($badNums[$num])) {
                        return $numMatch[0];
                    }
                    $row = $badNums[$num];
                    $rowReason = isset($row['reason']) ? $row['reason'] : '';
                    $rowId = $this->resolveCheckRowId($row);
                    $issueCount++;
                    $issues[] = [
                        'am_id' => $amId,
                        'check_id' => $rowId,
                        'reference_no' => $num,
                        'reference_raw' => $inner,
                        'reason' => $rowReason,
                        'confidence' => floatval(isset($row['confidence']) ? $row['confidence'] : 0),
                    ];
                    $title = htmlspecialchars(
                        '引用[' . $num . ']不合理: ' . $rowReason,
                        ENT_QUOTES,
                        'UTF-8'
                    );
                    return '<span class="ref-no-error" data-check-id="' . $rowId
                        . '" data-ref-no="' . $num . '" title="' . $title . '">'
                        . $numMatch[0] . '</span>';
                },
                $inner
            );
            $tagClass = !empty($badNums) ? ' ref-cite-error' : '';
            // Rows are keyed by `id` (check_id is only the API name), so resolve the id the
            // same way as everywhere else instead of reading a check_id column.
            $groupIds = '';
            if (!empty($badNums)) {
                $groupIds = implode(',', array_map([$this, 'resolveCheckRowId'], $badNums));
            }
            $newHtml = '<blue class="ref-cite-tag' . $tagClass . '" data-ref-raw="' . htmlspecialchars($inner, ENT_QUOTES, 'UTF-8')
                . '" data-check-ids="' . $groupIds . '">[' . $innerMarked . ']</blue>';
            $replacements[] = [
                'start' => $tagStart,
                'end' => $tagEnd,
                'html' => $newHtml,
                'delta' => strlen($newHtml) - ($tagEnd - $tagStart),
            ];
        }
        // Replace from the back so earlier offsets stay valid.
        usort($replacements, function ($a, $b) {
            return $b['start'] - $a['start'];
        });
        foreach ($replacements as $rep) {
            $html = substr($html, 0, $rep['start']) . $rep['html'] . substr($html, $rep['end']);
            $citeDeltas[] = ['start' => $rep['start'], 'delta' => $rep['delta']];
        }
    }

    // Translate an offset in the original content into an offset in the rewritten HTML:
    // every tag that starts before the position contributes its size change.
    $shiftByCite = function ($pos) use ($citeDeltas) {
        $d = 0;
        foreach ($citeDeltas as $cd) {
            if ($cd['start'] < $pos) {
                $d += $cd['delta'];
            }
        }
        return $pos + $d;
    };

    // 2) Mark the citing sentences. All opening and closing tags are collected first and
    // inserted in descending position order, so overlapping or nested ranges cannot
    // invalidate each other's offsets.
    if (!empty($contexts)) {
        $htmlLen = strlen($html);
        $inserts = [];
        $seq = 0;
        foreach (array_values($contexts) as $span) {
            $start = $span['text_start'];
            $end = $span['text_end'];
            if ($start < 0 || $end <= $start) {
                continue;
            }
            $s = $shiftByCite($start);
            $e = min($shiftByCite($end), $htmlLen);
            if ($s >= $e) {
                // Range lies outside the HTML: nothing to wrap.
                continue;
            }
            $checkIds = array_values(array_unique($span['check_ids']));
            $refNos = array_values(array_unique($span['ref_nos']));
            sort($refNos);
            $reasonParts = [];
            foreach ($refNos as $rn) {
                if (!empty($span['reasons'][$rn])) {
                    $reasonParts[] = '[' . $rn . '] ' . $span['reasons'][$rn];
                }
            }
            $title = htmlspecialchars(
                '引用句可能不合理: ' . implode('; ', $reasonParts),
                ENT_QUOTES,
                'UTF-8'
            );
            $open = '<span class="ref-context-error" data-check-ids="' . implode(',', $checkIds)
                . '" data-ref-nos="' . implode(',', $refNos) . '" title="' . $title . '">';
            // rank 0 (opening tag) is inserted before rank 1 (closing tag) at the same
            // position, which leaves the closing tag in front of the opening tag in the text.
            $inserts[] = ['pos' => $s, 'rank' => 0, 'seq' => $seq++, 'text' => $open];
            $inserts[] = ['pos' => $e, 'rank' => 1, 'seq' => $seq++, 'text' => '</span>'];
        }
        usort($inserts, function ($a, $b) {
            if ($a['pos'] !== $b['pos']) {
                return $b['pos'] - $a['pos'];
            }
            if ($a['rank'] !== $b['rank']) {
                return $a['rank'] - $b['rank'];
            }
            return $a['seq'] - $b['seq'];
        });
        foreach ($inserts as $ins) {
            $html = substr($html, 0, $ins['pos']) . $ins['text'] . substr($html, $ins['pos']);
        }
    }

    return ['html' => $html, 'issues' => $issues, 'issue_count' => $issueCount];
}
/**
 * Load the refer rows (state = 0) of a production article, keyed by their index column.
 *
 * @param int $pArticleId
 * @return array index => refer row; empty when $pArticleId is not positive
 */
public function loadReferMapByPArticleId($pArticleId)
{
    $byIndex = [];
    if ($pArticleId <= 0) {
        return $byIndex;
    }
    $query = Db::name('production_article_refer')
        ->where('p_article_id', $pArticleId)
        ->where('state', 0)
        ->order('index asc');
    foreach ($query->select() as $referRow) {
        $byIndex[intval($referRow['index'])] = $referRow;
    }
    return $byIndex;
}
/**
 * Render a refer row as "Label: value" lines for the LLM prompt.
 *
 * Non-empty title, author, joura, dateno, refer_doi and doilink are emitted in that
 * order, labelled with the capitalized field name, followed by "Reference: <refer_content>".
 *
 * @param array $refer
 * @return string lines joined with "\n"
 */
public function formatReferForLlm($refer)
{
    $lines = [];
    $fields = ['title', 'author', 'joura', 'dateno', 'refer_doi', 'doilink'];
    foreach ($fields as $field) {
        $value = trim((string)$this->arrGet($refer, $field, ''));
        if ($value === '') {
            continue;
        }
        $lines[] = ucfirst($field) . ': ' . $value;
    }
    $rawReference = trim((string)$this->arrGet($refer, 'refer_content', ''));
    if ($rawReference !== '') {
        $lines[] = 'Reference: ' . $rawReference;
    }
    return implode("\n", $lines);
}
/**
 * Extract a DOI from the refer_doi field only (used for the second-pass Crossref abstract).
 *
 * @param mixed $refer
 * @return string first DOI found; '' when $refer is not an array, the field is empty,
 *                contains "not available", or holds no DOI
 */
public function extractReferDoiOnly($refer)
{
    if (!is_array($refer)) {
        return '';
    }
    $field = trim((string)$this->arrGet($refer, 'refer_doi', ''));
    $unusable = ($field === '') || (stripos($field, 'not available') !== false);
    if ($unusable) {
        return '';
    }
    $found = $this->extractDoisFromString($field);
    if (empty($found)) {
        return '';
    }
    return $found[0];
}
/**
 * Fetch the Crossref block for the DOI held in refer_doi (second-pass check only).
 *
 * @param mixed $refer
 * @return array ['text' => string, 'has_abstract' => bool, 'doi' => string]; text is ''
 *               when no DOI could be extracted or Crossref returned nothing
 */
public function fetchCrossrefAbstractByReferDoi($refer)
{
    $result = [
        'text' => '',
        'has_abstract' => false,
        'doi' => $this->extractReferDoiOnly($refer),
    ];
    if ($result['doi'] === '') {
        return $result;
    }

    $client = new CrossrefService([
        'mailto' => trim((string)Env::get('crossref_mailto', '')),
    ]);
    $block = $this->extractCrossrefBlock($result['doi'], $client);
    if ($block !== null) {
        $result['text'] = $block['text'];
        $result['has_abstract'] = !empty($block['has_abstract']);
    }
    return $result;
}
/**
 * Interpret the can_support flag of an LLM result, falling back to is_match when
 * the can_support key is absent.
 *
 * @param mixed $llmResult
 * @return bool false when $llmResult is not an array
 */
public function parseLlmCanSupport($llmResult)
{
    if (!is_array($llmResult)) {
        return false;
    }
    if (array_key_exists('can_support', $llmResult)) {
        $flag = $llmResult['can_support'];
    } elseif (isset($llmResult['is_match'])) {
        $flag = $llmResult['is_match'];
    } else {
        $flag = false;
    }
    return $this->parseLlmIsMatch($flag);
}
/**
 * First-pass check: load the whole section text from article_main.content as plain text.
 *
 * Citation tags are reduced to "[n]", remaining markup is stripped, entities are decoded
 * and whitespace is collapsed. The text is cut to $maxChars characters (at least 500)
 * with "..." appended when it was longer.
 *
 * @param array $row      check row; am_id is read
 * @param int   $maxChars
 * @return string '' when am_id is missing, the section does not exist or it has no content
 */
public function resolveMainContentForJob(array $row, $maxChars = 8000)
{
    $amId = intval($this->arrGet($row, 'am_id', 0));
    if ($amId <= 0) {
        return '';
    }
    $section = Db::name('article_main')
        ->field('content')
        ->where('am_id', $amId)
        ->find();
    if (empty($section)) {
        return '';
    }
    $plain = trim((string)$this->arrGet($section, 'content', ''));
    if ($plain === '') {
        return '';
    }

    $plain = preg_replace('/<blue>\[([\d,\-\s]+)\]<\/blue>/', '[$1]', $plain);
    $plain = html_entity_decode(strip_tags($plain), ENT_QUOTES | ENT_HTML5, 'UTF-8');
    $plain = trim(preg_replace('/\s+/u', ' ', $plain));

    $limit = max(500, intval($maxChars));
    if (mb_strlen($plain) <= $limit) {
        return $plain;
    }
    return mb_substr($plain, 0, $limit) . '...';
}
/**
 * Local context around the citation: origin_text, or content_a when origin_text is empty.
 *
 * @param array $row check row
 * @return string trimmed text, '' when both fields are empty
 */
public function resolveCitationContextForJob(array $row)
{
    $context = '';
    foreach (['origin_text', 'content_a'] as $column) {
        $context = trim((string)$this->arrGet($row, $column, ''));
        if ($context !== '') {
            break;
        }
    }
    return $context;
}
/**
 * Extract the preferred DOI (10.xxxx/...) from a refer row.
 *
 * Priority: refer_content (the DOI in the raw reference text is closest to the work
 * actually cited) > refer_doi > doi > doilink.
 *
 * @param mixed $refer
 * @return string '' when no DOI is found
 */
public function extractDoiFromRefer($refer)
{
    $candidates = $this->extractAllDoiCandidatesFromRefer($refer);
    if (empty($candidates)) {
        return '';
    }
    return $candidates[0];
}
/**
 * Return every DOI candidate of a refer row, de-duplicated and ordered by priority
 * (refer_content, refer_doi, doi, doilink).
 *
 * Intended for the second-pass DOI recheck: when the refer_doi metadata and the DOI in
 * the raw reference text disagree, the one from the raw text is tried first.
 *
 * @param mixed $refer
 * @return string[] empty when $refer is not an array
 */
public function extractAllDoiCandidatesFromRefer($refer)
{
    if (!is_array($refer)) {
        return [];
    }
    $unique = [];
    foreach (['refer_content', 'refer_doi', 'doi', 'doilink'] as $column) {
        $text = (string)$this->arrGet($refer, $column, '');
        foreach ($this->extractDoisFromString($text) as $doi) {
            if (in_array($doi, $unique, true)) {
                continue;
            }
            $unique[] = $doi;
        }
    }
    return $unique;
}
/**
 * Extract every DOI of the form 10.xxxx/yyy from arbitrary text.
 *
 * doi.org URLs are scanned first, then bare DOIs; when neither yields anything and the
 * text itself starts with "10.", the whole text is tried. Trailing punctuation is
 * trimmed and each candidate is validated.
 *
 * @param mixed $text
 * @return string[] unique DOIs; empty for empty text or text containing "not available"
 */
private function extractDoisFromString($text)
{
    $text = trim((string)$text);
    if ($text === '' || stripos($text, 'not available') !== false) {
        return [];
    }

    $found = [];
    $patterns = [
        '~doi\.org/([^\s?#"\'<>]+)~i',
        '~\b(10\.\d{3,9}/[^\s?#"\'<>]+)~i',
    ];
    foreach ($patterns as $pattern) {
        if (!preg_match_all($pattern, $text, $hits)) {
            continue;
        }
        foreach ($hits[1] as $candidate) {
            $candidate = $this->trimDoiTail(trim($candidate));
            if ($this->isValidDoi($candidate)) {
                $found[] = $candidate;
            }
        }
    }

    if (count($found) === 0 && strpos($text, '10.') === 0) {
        $whole = $this->trimDoiTail($text);
        if ($this->isValidDoi($whole)) {
            $found[] = $whole;
        }
    }
    return array_values(array_unique($found));
}
/**
 * Strip trailing punctuation, closing brackets, quotes, backslashes and whitespace from a DOI.
 *
 * @param string $doi
 * @return string
 */
private function trimDoiTail($doi)
{
    $trailing = ".,;:)]}>\"'\\ \t\n\r";
    return rtrim($doi, $trailing);
}
/**
 * Check that a string is "10.", 3-9 digits, a slash and a whitespace-free suffix.
 *
 * @param mixed $doi
 * @return bool
 */
private function isValidDoi($doi)
{
    $candidate = (string)$doi;
    return preg_match('~^10\.\d{3,9}/[^\s]+$~i', $candidate) === 1;
}
/**
 * Fetch the literature content behind a refer row's DOI through PubMed / Crossref
 * (the local LLM cannot open web pages, so the content has to be fetched beforehand).
 *
 * - every DOI candidate of the row is tried (refer_content > refer_doi > doi > doilink);
 * - the first candidate that yields an abstract wins;
 * - otherwise the first candidate that yielded any block is used;
 * - '' is returned when nothing could be fetched (callers skip the second pass then).
 *
 * @param mixed $refer
 * @return string
 */
public function fetchDoiLiteratureBlock($refer)
{
    $candidates = $this->extractAllDoiCandidatesFromRefer($refer);
    if (empty($candidates)) {
        return '';
    }

    $pubmed = new PubmedService([
        'email' => trim((string)Env::get('pubmed_email', '')),
        'tool' => trim((string)Env::get('pubmed_tool', 'tmrjournals')),
    ]);
    $crossref = new CrossrefService([
        'mailto' => trim((string)Env::get('crossref_mailto', '')),
    ]);

    $firstUsable = null;
    foreach ($candidates as $candidate) {
        $block = $this->buildDoiBlockFromSources($candidate, $pubmed, $crossref);
        if ($block === null) {
            continue;
        }
        if (!empty($block['has_abstract'])) {
            // A block with an abstract always beats metadata-only blocks.
            return $block['text'];
        }
        if ($firstUsable === null) {
            $firstUsable = $block;
        }
    }

    if ($firstUsable === null) {
        return '';
    }
    return $firstUsable['text'];
}
/**
 * Fetch the content behind a single DOI.
 *
 * PubMed is used when it returns an abstract or at least a title; when PubMed has no
 * abstract, a Crossref abstract is appended if available. When PubMed returns nothing
 * usable, the Crossref block is returned instead.
 *
 * @param string          $doi
 * @param PubmedService   $pubmed
 * @param CrossrefService $crossref
 * @return array|null ['text' => string, 'has_abstract' => bool], or null
 */
private function buildDoiBlockFromSources($doi, PubmedService $pubmed, CrossrefService $crossref)
{
    $doi = trim((string)$doi);
    if ($doi === '') {
        return null;
    }

    $pub = $pubmed->fetchByDoi($doi);
    if (!is_array($pub)) {
        return $this->extractCrossrefBlock($doi, $crossref);
    }
    $abstract = trim((string)$this->arrGet($pub, 'abstract', ''));
    $hasAbstract = $abstract !== '';
    if (!$hasAbstract && trim((string)$this->arrGet($pub, 'title', '')) === '') {
        return $this->extractCrossrefBlock($doi, $crossref);
    }

    $lines = ['Source: PubMed (DOI ' . $doi . ')'];
    $scalarLabels = [
        'title' => 'Actual Title: ',
        'journal' => 'Journal: ',
        'year' => 'Year: ',
    ];
    foreach ($scalarLabels as $key => $label) {
        if (!empty($pub[$key])) {
            $lines[] = $label . trim((string)$pub[$key]);
        }
    }
    $listLabels = [
        'publication_types' => 'Publication Types: ',
        'mesh_terms' => 'MeSH: ',
    ];
    foreach ($listLabels as $key => $label) {
        if (!empty($pub[$key])) {
            $lines[] = $label . implode('; ', (array)$pub[$key]);
        }
    }

    if ($hasAbstract) {
        $lines[] = 'Abstract: ' . $this->truncate($abstract, 3500);
        return ['text' => implode("\n", $lines), 'has_abstract' => true];
    }

    // PubMed had no abstract: try to supplement it from Crossref.
    $cr = $this->extractCrossrefBlock($doi, $crossref);
    if ($cr !== null && $cr['has_abstract']) {
        $lines[] = "\n--- Crossref 补充 ---\n" . $cr['text'];
        return ['text' => implode("\n", $lines), 'has_abstract' => true];
    }
    return ['text' => implode("\n", $lines), 'has_abstract' => false];
}
/**
 * Fetch title / journal / authors / abstract from Crossref. The abstract is passed
 * through cleanCrossrefAbstract() because it is usually wrapped in JATS XML.
 *
 * @param string          $doi
 * @param CrossrefService $crossref
 * @return array|null ['text' => string, 'has_abstract' => bool]; null when fetchWork()
 *                    does not return an array
 */
private function extractCrossrefBlock($doi, CrossrefService $crossref)
{
    $work = $crossref->fetchWork($doi);
    if (!is_array($work)) {
        return null;
    }
    $summary = $crossref->fetchWorkSummary($doi);
    if (!is_array($summary)) {
        $summary = [];
    }

    $lines = ['Source: Crossref api.crossref.org/works/' . rawurlencode($doi)];

    if (isset($work['title'][0])) {
        $title = trim((string)$work['title'][0]);
    } else {
        $title = trim((string)$this->arrGet($summary, 'title', ''));
    }
    if ($title !== '') {
        $lines[] = 'Actual Title: ' . $title;
    }

    $summaryLabels = [
        'joura' => 'Journal: ',
        'author_str' => 'Authors: ',
        'dateno' => 'Publication: ',
        'doilink' => 'DOI Link: ',
    ];
    foreach ($summaryLabels as $key => $label) {
        if (!empty($summary[$key])) {
            $lines[] = $label . trim((string)$summary[$key]);
        }
    }
    if (!empty($summary['is_retracted'])) {
        $lines[] = 'Retraction: yes - ' . trim((string)$this->arrGet($summary, 'retract_reason', ''));
    }

    $abstract = $this->cleanCrossrefAbstract((string)$this->arrGet($work, 'abstract', ''));
    $hasAbstract = $abstract !== '';
    if (!$hasAbstract) {
        $lines[] = 'Note: Crossref 未返回摘要,请结合标题/期刊/作者与正文谨慎判断。';
    } else {
        $lines[] = 'Abstract: ' . $this->truncate($abstract, 3500);
    }
    return ['text' => implode("\n", $lines), 'has_abstract' => $hasAbstract];
}
/**
 * Turn a Crossref abstract (usually JATS XML) into plain text.
 *
 * jats:title elements are dropped, each jats:p opening tag becomes a line break, all
 * remaining tags are stripped, runs of spaces/tabs collapse to one space and runs of
 * line breaks collapse to one "\n".
 *
 * @param mixed $raw
 * @return string
 */
private function cleanCrossrefAbstract($raw)
{
    $text = trim((string)$raw);
    if ($text === '') {
        return '';
    }

    // JATS-specific clean-up, applied in this order.
    $jatsPatterns = [
        '~<jats:title[^>]*>.*?</jats:title>~is',
        '~<jats:p[^>]*>~i',
        '~</jats:p>~i',
        '~</?jats:[^>]+>~i',
    ];
    $jatsReplacements = ['', "\n", '', ''];
    $text = preg_replace($jatsPatterns, $jatsReplacements, $text);

    $text = strip_tags($text);

    // Whitespace normalization, applied in this order.
    $spacePatterns = ['/[ \t]+/u', "/\r\n|\r/u", "/\n{2,}/u"];
    $spaceReplacements = [' ', "\n", "\n"];
    $text = preg_replace($spacePatterns, $spaceReplacements, $text);

    return trim($text);
}
/**
 * Cut text to at most $max characters, appending "..." when something was cut off.
 *
 * @param mixed $text
 * @param int   $max
 * @return string
 */
private function truncate($text, $max)
{
    $value = (string)$text;
    $tooLong = mb_strlen($value) > $max;
    if ($tooLong) {
        return mb_substr($value, 0, $max) . '...';
    }
    return $value;
}
/**
 * Prepare the data for the second-pass DOI recheck: the bibliographic text plus the
 * block fetched from Crossref for the row's refer_doi.
 *
 * @param mixed  $refer     refer row
 * @param string $referText Pre-formatted bibliographic text; when empty after trimming,
 *                          it is generated with formatReferForLlm().
 * @return array ['refer_text' => string, 'doi_block' => string, 'has_abstract' => bool, 'doi_used' => string]
 */
public function prepareRecheckPayload($refer, $referText = '')
{
    $bibliography = trim($referText);
    if ($bibliography === '') {
        $bibliography = $this->formatReferForLlm($refer);
    }
    $crossref = $this->fetchCrossrefAbstractByReferDoi($refer);

    $payload = [];
    $payload['refer_text'] = $bibliography;
    $payload['doi_block'] = $crossref['text'];
    $payload['has_abstract'] = $crossref['has_abstract'];
    $payload['doi_used'] = $crossref['doi'];
    return $payload;
}
/**
 * Legacy interface: join the recheck payload into one text block. Kept for backward
 * compatibility; callers are encouraged to use prepareRecheckPayload() instead.
 *
 * @param mixed  $refer
 * @param string $referText
 * @return string bibliographic text followed by either the fetched block or a notice
 *                that nothing could be fetched
 */
public function formatReferForDoiRecheck($refer, $referText = '')
{
    $payload = $this->prepareRecheckPayload($refer, $referText);
    $fetched = $payload['doi_block'];

    if ($fetched !== '') {
        $suffix = "\n\n【Crossref 摘要(依据 Refer_doi 从 api.crossref.org/works 获取)】\n" . $fetched;
    } else {
        $suffix = "\n\n【DOI 文献真实内容】\n未能从 PubMed/Crossref 获取该 DOI 的摘要或元数据,请依据书目条目与正文谨慎判断。";
    }
    return $payload['refer_text'] . $suffix;
}
/**
 * Queue the second-pass recheck (pushJob2 with a delay of 5) when the first-pass
 * confidence is <= 0.65 and a Crossref abstract is available for the row's refer_doi.
 *
 * Skipped (to avoid a pointless rerun that would give the same result) when:
 *   - the check id is invalid or the confidence is above the threshold;
 *   - the check row or its refer row (state = 0) does not exist;
 *   - refer_doi is empty or Crossref returns no abstract.
 *
 * @param int   $checkId
 * @param float $confidence
 * @return bool true when a job was pushed
 */
public function maybeEnqueueSecondPass($checkId, $confidence)
{
    $rowId = intval($checkId);
    $score = floatval($confidence);
    if ($rowId <= 0 || $score > 0.65) {
        return false;
    }

    $checkRow = Db::name('article_reference_check_result')->where('id', $rowId)->find();
    if (empty($checkRow)) {
        return false;
    }

    $referId = intval($checkRow['p_refer_id']);
    if ($referId <= 0) {
        return false;
    }
    $refer = Db::name('production_article_refer')
        ->where('p_refer_id', $referId)
        ->where('state', 0)
        ->find();
    if (empty($refer)) {
        return false;
    }
    if ($this->extractReferDoiOnly($refer) === '') {
        return false;
    }

    $crossref = $this->fetchCrossrefAbstractByReferDoi($refer);
    if (empty($crossref['has_abstract'])) {
        return false;
    }

    $this->pushJob2($rowId, 5);
    return true;
}
/**
 * Extract the <blue>[...]</blue> citations from article_main.content.
 *
 * Tags whose local context is not meaningful, or whose inner text expands to no
 * reference number, are dropped.
 *
 * @param string $content
 * @return array list of entries with reference_raw, reference_numbers, original_text,
 *               reference_start, reference_end (byte offsets of the tag) and
 *               text_start, text_end (byte offsets of the context)
 */
public function extractReferences($content)
{
    preg_match_all('/<blue>\[([\d,\-\s]+)\]<\/blue>/', $content, $matches, PREG_OFFSET_CAPTURE);
    if (empty($matches[0])) {
        return [];
    }

    // Byte span of every tag; the context extraction needs the neighbours of each tag.
    $tagSpans = [];
    foreach ($matches[0] as $i => $hit) {
        $tagSpans[] = [
            'start' => $hit[1],
            'end' => $hit[1] + strlen($hit[0]),
            'index' => $i,
        ];
    }

    $citations = [];
    foreach ($tagSpans as $tag) {
        $rawRef = trim($matches[1][$tag['index']][0]);
        $numbers = $this->expandReferenceNumbers($rawRef);
        list($ctxStart, $ctxEnd, $ctxText) = $this->extractLocalCitationContext(
            $content,
            $tag['start'],
            $tag['end'],
            $tagSpans
        );
        if (!$this->isMeaningfulCitationContext($ctxText) || empty($numbers)) {
            continue;
        }
        $citations[] = [
            'reference_raw' => $rawRef,
            'reference_numbers' => $numbers,
            'original_text' => $ctxText,
            'reference_start' => $tag['start'],
            'reference_end' => $tag['end'],
            'text_start' => $ctxStart,
            'text_end' => $ctxEnd,
        ];
    }
    return $citations;
}
/**
* Cut the local context that belongs to one citation tag.
*
* The text in front of the tag is preferred. Only the first citation of a paragraph
* may fall back to the text behind the tag; if the result is still not meaningful,
* the bounds are widened through widenCitationContextBounds().
*
* @param string $content  Section HTML.
* @param int    $tagStart Byte offset where the citation tag starts.
* @param int    $tagEnd   Byte offset just behind the citation tag.
* @param array  $tagSpans All tag spans of the content, each with 'start' and 'end'.
* @return array [localStart, localEnd, originalText]
*/
private function extractLocalCitationContext($content, $tagStart, $tagEnd, array $tagSpans)
{
$paragraphStart = $this->findParagraphStart($content, $tagStart);
$sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd);
// Nearest tag end before this tag and nearest tag start behind it; they default to
// the paragraph start and the sentence end when there is no such neighbour.
$prevTagEnd = $paragraphStart;
$nextTagStart = $sentenceEnd;
foreach ($tagSpans as $span) {
if ($span['end'] <= $tagStart && $span['end'] > $prevTagEnd) {
$prevTagEnd = $span['end'];
}
if ($span['start'] > $tagEnd && $span['start'] < $nextTagStart) {
$nextTagStart = $span['start'];
}
}
$hasPriorCiteInParagraph = ($prevTagEnd > $paragraphStart);
$sentenceStart = $this->findSentenceStart($content, $tagStart);
// First citation in the paragraph: from the (capped) paragraph start up to the tag.
// Later citations: start at the beginning of the current sentence (which may lie before
// the previous tag), so the context is not reduced to a fragment such as
// "and external environment" and then wrongly replaced by the text behind the tag.
if ($hasPriorCiteInParagraph) {
$localStart = max($paragraphStart, $sentenceStart);
} else {
// NOTE(review): capContextStartBeforeTag() is not visible here; presumably it limits how
// far back the context may start - confirm.
$localStart = $this->capContextStartBeforeTag($content, $tagStart, $paragraphStart);
}
// Default: the statement in front of the citation tag.
$localEnd = $tagStart;
$originalText = $this->buildCitationContextText($content, $localStart, $localEnd);
// Only for the first citation of a paragraph, and only when the text in front of the tag
// is very short (e.g. a sentence ending in "ICU nurses [14]"), the fragment behind the tag
// is used instead. With several citations in one paragraph this is not allowed, because it
// would pick up the next sentence.
$allowTrailing = !$hasPriorCiteInParagraph;
if ($allowTrailing && (
!$this->isMeaningfulCitationContext($originalText)
|| $this->shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)
)) {
// The trailing fragment stops at the next tag or at the sentence end, whichever comes first.
$trailEnd = ($nextTagStart < $sentenceEnd) ? $nextTagStart : $sentenceEnd;
$trailText = $this->buildCitationContextText($content, $tagEnd, $trailEnd);
if ($this->isMeaningfulCitationContext($trailText)) {
$localStart = $tagEnd;
$localEnd = $trailEnd;
$originalText = $trailText;
}
}
// Last resort: widen the bounds when the context is still not meaningful.
if (!$this->isMeaningfulCitationContext($originalText)) {
list($localStart, $localEnd) = $this->widenCitationContextBounds(
$content,
$tagStart,
$tagEnd,
$localStart,
$localEnd
);
$originalText = $this->buildCitationContextText($content, $localStart, $localEnd);
}
return [$localStart, $localEnd, $originalText];
}
/**
 * Tell whether the text in front of a citation tag is too thin to use
 * (for example only an author abbreviation), so that the text after the
 * tag should be tried instead.
 *
 * @param string $content    Full article body.
 * @param int    $localStart Byte offset where the leading context starts.
 * @param int    $tagStart   Byte offset where the citation tag starts.
 * @param int    $tagEnd     Byte offset just past the tag (not used by this check).
 * @return bool True when the leading text is not meaningful or shorter than 25 characters.
 */
private function shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)
{
    $leadingText = $this->buildCitationContextText($content, $localStart, $tagStart);
    $isSubstantial = $this->isMeaningfulCitationContext($leadingText)
        && mb_strlen($leadingText) >= 25;
    return !$isSubstantial;
}
/**
 * Expand the inside of a citation tag (e.g. "1, 3-5") into the individual
 * reference numbers it denotes.
 *
 * Full-width commas and the common Unicode dash variants are normalised to
 * ASCII "," and "-" first. The separators are written as UTF-8 byte escapes
 * so they cannot be lost or altered by editors or viewers that rewrite
 * look-alike characters. Parts that are neither a plain number nor an
 * ascending "a-b" range are ignored.
 *
 * @param string $refStr       Raw text between the brackets.
 * @param int    $maxRangeSize Largest number of entries a single "a-b" range may
 *                             expand to; larger ranges are skipped so that hostile
 *                             input such as "1-999999999" cannot exhaust memory.
 * @return array Unique reference numbers in first-seen order.
 */
public function expandReferenceNumbers($refStr, $maxRangeSize = 1000)
{
    $separatorMap = [
        "\xEF\xBC\x8C" => ',', // U+FF0C fullwidth comma
        "\xE2\x80\x90" => '-', // U+2010 hyphen
        "\xE2\x80\x91" => '-', // U+2011 non-breaking hyphen
        "\xE2\x80\x92" => '-', // U+2012 figure dash
        "\xE2\x80\x93" => '-', // U+2013 en dash
        "\xE2\x80\x94" => '-', // U+2014 em dash
        "\xE2\x88\x92" => '-', // U+2212 minus sign
        "\xEF\xB9\xA3" => '-', // U+FE63 small hyphen-minus
        "\xEF\xBC\x8D" => '-', // U+FF0D fullwidth hyphen-minus
    ];
    $refStr = strtr(trim((string)$refStr), $separatorMap);

    $numbers = [];
    foreach (explode(',', $refStr) as $part) {
        $part = trim($part);
        if ($part === '') {
            continue;
        }
        if (preg_match('/^(\d+)\s*-\s*(\d+)$/', $part, $m)) {
            $start = intval($m[1]);
            $end = intval($m[2]);
            // Descending ranges are ignored; oversized ranges are skipped.
            if ($start <= $end && ($end - $start) < $maxRangeSize) {
                $numbers = array_merge($numbers, range($start, $end));
            }
        } elseif (ctype_digit($part)) {
            $numbers[] = intval($part);
        }
    }
    return array_values(array_unique($numbers));
}
/**
 * Return the byte offset just past the UTF-8 character that begins at
 * $bytePos, i.e. the offset where the next character starts.
 *
 * @param string $content Byte string to inspect.
 * @param int    $bytePos Byte offset of the character's first byte.
 * @return int Offset of the following character; for an out-of-range
 *             $bytePos, $bytePos + 1 clamped into [0, strlen($content)].
 */
private function utf8CharEnd($content, $bytePos)
{
    $total = strlen($content);
    $inRange = ($bytePos >= 0 && $bytePos < $total);
    if (!$inRange) {
        return max(0, min($total, $bytePos + 1));
    }
    // Step over continuation bytes (bit pattern 10xxxxxx).
    for ($cursor = $bytePos + 1; $cursor < $total; $cursor++) {
        if ((ord($content[$cursor]) & 0xC0) !== 0x80) {
            break;
        }
    }
    return $cursor;
}
/**
 * Slice $content between two byte offsets, matching the offsets produced by
 * strpos()/strlen(). mb_substr() must not be used here: it counts
 * characters, so any multi-byte text earlier in the string would shift the
 * slice and clip the start of the wanted words.
 *
 * @param string $content Source text.
 * @param int    $start   Byte offset of the first byte to include.
 * @param int    $end     Byte offset just past the last byte to include.
 * @return string The slice, or '' when $end does not lie after $start.
 */
private function byteSubstr($content, $start, $end)
{
    $byteCount = $end - $start;
    if ($byteCount <= 0) {
        return '';
    }
    $slice = mb_strcut($content, $start, $byteCount, 'UTF-8');
    return (string)$slice;
}
/**
 * Cut the byte range [$start, $end) out of $content and reduce it to plain
 * text: citation tags and HTML are removed, whitespace runs collapse to a
 * single space, and a leading UTF-8 byte-order mark is dropped.
 *
 * The BOM is removed as a whole 3-byte sequence. ltrim() with a character
 * list would strip the bytes EF/BB/BF individually and so cut the lead byte
 * off any text that starts with a character such as a full-width bracket,
 * leaving invalid UTF-8.
 *
 * @param string $content Full article body.
 * @param int    $start   Byte offset of the first byte to include.
 * @param int    $end     Byte offset just past the last byte to include.
 * @return string Cleaned context text (may be empty).
 */
private function buildCitationContextText($content, $start, $end)
{
    $text = $this->byteSubstr($content, $start, $end);
    $text = preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $text);
    $text = trim(strip_tags((string)$text));

    $collapsed = preg_replace('/\s+/u', ' ', $text);
    if ($collapsed === null) {
        // The /u pattern fails on invalid UTF-8; fall back to byte-wise matching
        // instead of silently discarding the whole text.
        $collapsed = preg_replace('/\s+/', ' ', $text);
    }
    $text = (string)$collapsed;

    $bom = "\xEF\xBB\xBF";
    while (strncmp($text, $bom, 3) === 0) {
        $text = (string)substr($text, 3);
    }
    // trim() above could not see whitespace hidden behind the BOM.
    return ltrim($text);
}
/**
 * Reject context snippets that carry no usable information: empty text,
 * text made only of punctuation/symbols/whitespace (for instance the lone
 * "." left after a tag is removed), text without any letter or digit, and
 * text shorter than two characters.
 *
 * @param string $text Candidate context text.
 * @return bool True when the text is worth sending for checking.
 */
private function isMeaningfulCitationContext($text)
{
    $candidate = trim($text);
    if ($candidate === '' || $this->isOnlyPunctuationOrSpace($candidate)) {
        return false;
    }
    $hasWordChar = preg_match('/[\p{L}\p{N}]/u', $candidate);
    return $hasWordChar && mb_strlen($candidate) >= 2;
}
/**
 * Check whether a non-empty string consists solely of whitespace,
 * punctuation (\p{P}) and symbol (\p{S}) characters.
 *
 * @param string $text Text to inspect.
 * @return bool True only when every character is whitespace, punctuation or a symbol.
 */
private function isOnlyPunctuationOrSpace($text)
{
    $matched = preg_match('/^[\s\p{P}\p{S}]+$/u', $text);
    return $matched === 1;
}
/**
 * Grow a too-small context window by one sentence backwards and one
 * sentence forwards. If the result exceeds 2000 bytes, it is replaced by a
 * 2000-byte window centred on the citation tag.
 *
 * @param string $content  Full article body.
 * @param int    $tagStart Byte offset where the citation tag starts.
 * @param int    $tagEnd   Byte offset just past the citation tag.
 * @param int    $start    Current window start (byte offset).
 * @param int    $end      Current window end (byte offset).
 * @return array [newStart, newEnd]
 */
private function widenCitationContextBounds($content, $tagStart, $tagEnd, $start, $end)
{
    $total = strlen($content);
    $spanLimit = 2000;

    if ($start > 0) {
        // Search from just before the current start to reach the previous sentence.
        $earlier = $this->findSentenceStart($content, max(0, $start - 1));
        $start = min($start, $earlier);
    }

    $later = $this->findSentenceEnd($content, $end, $tagEnd);
    $end = ($later > $end && $later <= $total) ? $later : $end;

    if ($end - $start <= $spanLimit) {
        return [$start, $end];
    }

    // Too wide: fall back to a fixed-size window around the middle of the tag.
    $halfSpan = (int)floor($spanLimit / 2);
    $centre = (int)floor(($tagStart + $tagEnd) / 2);
    $start = max(0, $centre - $halfSpan);
    return [$start, min($total, $start + $spanLimit)];
}
/**
 * Decide whether the delimiter found at byte offset $pos really ends a
 * sentence. Only the ASCII period is ever rejected; every other delimiter,
 * and any out-of-range position, is accepted as-is.
 *
 * A period is not a sentence end when it is
 *  - a decimal point (a digit on both sides),
 *  - the end of a known abbreviation (et al, e.g, i.e, vs, etc, fig, no), or
 *  - directly followed by a citation tag, which still belongs to the sentence.
 *
 * @param string $content   Full article body.
 * @param int    $pos       Byte offset of the delimiter.
 * @param string $delimiter The delimiter that was matched.
 * @return bool True when the delimiter may be used as a sentence boundary.
 */
private function isSentenceDelimiterAt($content, $pos, $delimiter)
{
    if ($delimiter !== '.') {
        return true;
    }
    $total = strlen($content);
    if ($pos < 0 || $pos >= $total) {
        return true;
    }

    $digitBefore = $pos > 0 && ctype_digit($content[$pos - 1]);
    $digitAfter = $pos + 1 < $total && ctype_digit($content[$pos + 1]);
    if ($digitBefore && $digitAfter) {
        return false;
    }

    // Up to 12 bytes in front of the period are enough to spot an abbreviation.
    $lead = substr($content, max(0, $pos - 12), min(12, $pos));
    if (preg_match('/\b(et\s+al|e\.g|i\.e|vs|etc|fig|no)\s*\.?\s*$/i', $lead)) {
        return false;
    }

    $tail = substr($content, $pos + 1, 24);
    return !preg_match('/^\s*<blue>\s*\[/', $tail);
}
/**
 * Locate the start of the paragraph that contains $tagStart, using HTML
 * paragraph/line-break tags and newlines as boundaries. Working on the
 * whole paragraph avoids reducing a multi-sentence paragraph to the text
 * after its last period.
 *
 * @param string $content  Full article body.
 * @param int    $tagStart Byte offset of the citation tag.
 * @return int Byte offset just past the closest boundary before the tag, or 0.
 */
private function findParagraphStart($content, $tagStart)
{
    $prefix = substr($content, 0, max(0, $tagStart));
    if ($prefix === '') {
        return 0;
    }

    $boundary = 0;

    // Markup boundaries: take the end of the last match of each pattern.
    $markupPatterns = ['/<p[^>]*>/i', '/<\/p>\s*/i', '/<br\s*\/?>\s*/i'];
    foreach ($markupPatterns as $pattern) {
        if (preg_match_all($pattern, $prefix, $hits, PREG_OFFSET_CAPTURE)) {
            $final = $hits[0][count($hits[0]) - 1];
            $boundary = max($boundary, $final[1] + strlen($final[0]));
        }
    }

    // Plain-text boundaries: blank line, then single newline.
    foreach (["\n\n", "\n"] as $breakSeq) {
        $at = strrpos($prefix, $breakSeq);
        if ($at !== false) {
            $boundary = max($boundary, $at + strlen($breakSeq));
        }
    }

    return $boundary;
}
/**
 * Limit how far back the context of a citation may reach, so that a very
 * long paragraph does not produce an oversized LLM prompt.
 *
 * When the paragraph start is within $maxBytes of the tag it is returned
 * unchanged. Otherwise a window of at most $maxBytes before the tag is
 * examined and the context starts right after the first sentence boundary
 * inside that window, keeping as much text as the cap allows while still
 * beginning on a whole sentence. Without any boundary the window start
 * itself is used.
 *
 * @param string $content        Full article body.
 * @param int    $tagStart       Byte offset of the citation tag.
 * @param int    $paragraphStart Byte offset where the paragraph begins.
 * @param int    $maxBytes       Maximum number of bytes of leading context.
 * @return int Byte offset at which the leading context should start.
 */
private function capContextStartBeforeTag($content, $tagStart, $paragraphStart, $maxBytes = 2500)
{
    if ($tagStart - $paragraphStart <= $maxBytes) {
        return $paragraphStart;
    }

    $windowStart = $tagStart - $maxBytes;
    // Never start inside a multi-byte character: the /u pattern below fails
    // on invalid UTF-8, and a half character would also corrupt the context.
    while ($windowStart < $tagStart
        && isset($content[$windowStart])
        && (ord($content[$windowStart]) & 0xC0) === 0x80
    ) {
        $windowStart++;
    }

    $window = (string)substr($content, $windowStart, $tagStart - $windowStart);
    // Sentence punctuation (ASCII . ! ?, ideographic full stop, full-width ! ?)
    // followed by whitespace. With PREG_OFFSET_CAPTURE, preg_match() yields
    // $m[0] = [matchedText, byteOffset].
    if (preg_match('/[.!?\x{3002}\x{FF01}\x{FF1F}]\s+/u', $window, $m, PREG_OFFSET_CAPTURE)) {
        return $windowStart + $m[0][1] + strlen($m[0][0]);
    }

    return max($paragraphStart, $windowStart);
}
/**
 * Find where the sentence containing $position begins.
 *
 * For every delimiter type the text before $position is scanned backwards
 * until an occurrence is found that isSentenceDelimiterAt() accepts.
 * Rejected occurrences (decimal points, abbreviations, a period directly in
 * front of a citation tag) are skipped rather than ending the search, so a
 * value such as "3.5" inside the sentence does not hide the real boundary
 * in front of it.
 *
 * @param string $content  Full article body.
 * @param int    $position Byte offset; only text before it is searched.
 * @return int Byte offset just past the closest accepted delimiter, or 0 when none exists.
 */
private function findSentenceStart($content, $position)
{
    $prefix = (string)substr($content, 0, $position);
    // "\xE3\x80\x82" is the ideographic full stop (U+3002).
    $delimiters = ['.', "\xE3\x80\x82", '!', '?', "\n"];

    $start = 0;
    foreach ($delimiters as $delimiter) {
        $haystack = $prefix;
        while (($pos = strrpos($haystack, $delimiter)) !== false) {
            if ($this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
                $start = max($start, $this->utf8CharEnd($content, $pos));
                break;
            }
            // Not a real boundary: keep looking further to the left.
            $haystack = (string)substr($haystack, 0, $pos);
        }
    }
    return $start;
}
/**
 * Find where the sentence that is running at $searchFrom ends.
 *
 * For every delimiter type the text is scanned forwards until an occurrence
 * is found that isSentenceDelimiterAt() accepts; rejected occurrences
 * (decimal points, abbreviations, a period directly in front of a citation
 * tag) are skipped instead of removing that delimiter from consideration.
 * The closest accepted delimiter wins.
 *
 * When $tagEnd is given and nothing but punctuation/whitespace lies between
 * the tag and the delimiter found (e.g. the lone "." right after "</blue>"),
 * that delimiter closes the previous sentence and the search continues
 * behind it.
 *
 * @param string $content    Full article body.
 * @param int    $searchFrom Byte offset from which to look for a sentence end.
 * @param int    $tagEnd     Byte offset just past the citation tag; 0 disables
 *                           the lone-punctuation skip.
 * @return int Byte offset just past the sentence-ending delimiter, or
 *             strlen($content) when no delimiter is found.
 */
private function findSentenceEnd($content, $searchFrom, $tagEnd = 0)
{
    $length = strlen($content);
    $minPos = max(0, $searchFrom);
    // "\xE3\x80\x82" is the ideographic full stop (U+3002).
    $delimiters = ['.', "\xE3\x80\x82", '!', '?', "\n"];

    while ($minPos < $length) {
        $endPositions = [];
        foreach ($delimiters as $delimiter) {
            $from = $minPos;
            while ($from <= $length && ($pos = strpos($content, $delimiter, $from)) !== false) {
                if ($this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
                    $endPositions[] = $this->utf8CharEnd($content, $pos);
                    break;
                }
                // Not a real boundary: try the next occurrence of this delimiter.
                $from = $pos + 1;
            }
        }
        if (empty($endPositions)) {
            return $length;
        }

        $end = min($endPositions);
        if ($tagEnd <= 0 || $end <= $tagEnd) {
            return $end;
        }

        // Accept the boundary only if real text lies between the tag and it.
        $gap = substr($content, $tagEnd, $end - $tagEnd);
        $gapText = trim(strip_tags(preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $gap)));
        if ($gapText !== '' && !$this->isOnlyPunctuationOrSpace($gapText)) {
            return $end;
        }
        $minPos = $end;
    }
    return $length;
}
/**
 * Put one check record on the ReferenceCheck queue, immediately or after a delay.
 *
 * @param int $checkId      Value sent to the job as 'check_id'.
 * @param int $delaySeconds Delay before the job becomes available; values <= 0 push immediately.
 * @throws \Exception Re-thrown after logging when the queue call fails.
 */
private function pushJob($checkId, $delaySeconds = 0)
{
    $handler = 'app\api\job\ReferenceCheck@fire';
    $payload = ['check_id' => $checkId];
    try {
        if ($delaySeconds > 0) {
            Queue::later($delaySeconds, $handler, $payload, self::QUEUE_NAME);
            return;
        }
        Queue::push($handler, $payload, self::QUEUE_NAME);
    } catch (\Exception $queueError) {
        // Log for diagnosis, then let the caller decide how to react.
        \think\Log::error('ReferenceCheck pushJob failed check_id=' . $checkId . ' ' . $queueError->getMessage());
        throw $queueError;
    }
}
/**
 * Put one check record on the queue for the ReferenceCheckTwo job,
 * immediately or after a delay. Uses the same queue name as pushJob().
 *
 * @param int $checkId      Value sent to the job as 'check_id'.
 * @param int $delaySeconds Delay before the job becomes available; values <= 0 push immediately.
 * @throws \Exception Re-thrown after logging when the queue call fails.
 */
private function pushJob2($checkId, $delaySeconds = 0)
{
    $handler = 'app\api\job\ReferenceCheckTwo@fire';
    $payload = ['check_id' => $checkId];
    try {
        if ($delaySeconds > 0) {
            Queue::later($delaySeconds, $handler, $payload, self::QUEUE_NAME);
            return;
        }
        Queue::push($handler, $payload, self::QUEUE_NAME);
    } catch (\Exception $queueError) {
        // Log for diagnosis, then let the caller decide how to react.
        \think\Log::error('ReferenceCheckTwo pushJob failed check_id=' . $checkId . ' ' . $queueError->getMessage());
        throw $queueError;
    }
}
}