已经完成一个文章校对了,但换个文章id就报错了,排查前备份

This commit is contained in:
wyn
2026-05-22 16:58:07 +08:00
parent 44f3383887
commit 68cf1867d8
5 changed files with 1755 additions and 616 deletions

View File

@@ -3,6 +3,7 @@
namespace app\common;
use think\Db;
use think\Env;
use think\Queue;
/**
@@ -131,8 +132,39 @@ class ReferenceCheckService
$this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
}
public function checkOne(){
$this->pushJob(intval(724), 0);
/**
 * Manual trigger: enqueue the second-pass DOI recheck for an article's
 * low-confidence check results.
 *
 * Selects rows from article_reference_check_result with status = 1 and
 * confidence <= 0.65 for the given article, takes at most two of them in
 * random order, and offers each one to maybeEnqueueSecondPass().
 *
 * @param int|string $articleId Article id; cast with intval() and must be > 0.
 * @return array ['article_id' => int, 'check_ids2' => int[], 'queued' => int]
 * @throws \InvalidArgumentException When the article id is not positive.
 */
public function enqueueSecondPassByArticle($articleId)
{
    $articleId = intval($articleId);
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }
    $rows = Db::name('article_reference_check_result')
        ->where('article_id', $articleId)
        ->where('status', 1)
        ->where('confidence', '<=', 0.65)
        ->orderRaw('rand()')
        ->limit(2)
        ->select();
    $checkIds2 = [];
    foreach ($rows as $checkLog) {
        $rowId = $this->resolveCheckRowId($checkLog);
        // Only rows that were actually pushed onto the queue are reported back.
        if ($this->maybeEnqueueSecondPass($rowId, floatval($checkLog['confidence']))) {
            $checkIds2[] = $rowId;
        }
    }
    return [
        'article_id' => $articleId,
        'check_ids2' => $checkIds2,
        'queued' => count($checkIds2),
    ];
}
public function enqueueByArticle($articleId){
if ($articleId <= 0) {
@@ -140,7 +172,7 @@ class ReferenceCheckService
}
$prod = Db::name('production_article')
->where('article_id', $articleId)
->where('state', 0)
->where('state', [0, 2])
->find();
if (empty($prod)) {
throw new \RuntimeException('production_article not found for article_id=' . $articleId);
@@ -296,12 +328,78 @@ class ReferenceCheckService
return isset($map[$status]) ? $map[$status] : 'unknown';
}
/**
 * Resolve the numeric row id of a check-result row.
 *
 * The table's primary key is `id`; the external API parameter is still
 * called `check_id`, so both keys are accepted, with `id` taking priority.
 *
 * @param mixed $row Result row (array); anything else yields 0.
 * @return int Positive row id, or 0 when neither key holds a positive integer.
 */
public function resolveCheckRowId($row)
{
    if (is_array($row)) {
        foreach (['id', 'check_id'] as $column) {
            if (isset($row[$column]) && intval($row[$column]) > 0) {
                return intval($row[$column]);
            }
        }
    }
    return 0;
}
/**
 * Normalise the `is_match` value returned by the LLM to a boolean.
 *
 * Accepts real booleans, numbers (true only when intval() is exactly 1) and
 * strings (case-insensitive, trimmed): "1", "true", "yes", "match", "matched".
 *
 * @param mixed $value Raw value from the LLM response.
 * @return bool
 */
public function parseLlmIsMatch($value)
{
    if (is_bool($value)) {
        return $value;
    }
    $isNumber = is_int($value) || is_float($value);
    if ($isNumber) {
        return 1 === intval($value);
    }
    $normalized = strtolower(trim((string)$value));
    $truthy = ['1', 'true', 'yes', 'match', 'matched'];
    return in_array($normalized, $truthy, true);
}
/**
 * Persist the outcome of a single reference check.
 *
 * `reason` and `error_msg` are trimmed and cut to 512 characters before the
 * write so an over-long value cannot make the UPDATE fail on a varchar(512)
 * column. `updated_at` is always set to the current time.
 *
 * @param int|string $checkId Row id in article_reference_check_result.
 * @param array      $fields  Column => value pairs to write.
 * @return int Value returned by the update call, cast to int.
 * @throws \InvalidArgumentException When the id is not positive.
 * @throws \RuntimeException         When the row does not exist or the update returns false.
 */
public function updateCheckResult($checkId, array $fields)
{
    $checkId = intval($checkId);
    if ($checkId <= 0) {
        throw new \InvalidArgumentException('invalid check id');
    }

    foreach (['reason', 'error_msg'] as $textColumn) {
        if (isset($fields[$textColumn])) {
            $fields[$textColumn] = mb_substr(trim((string)$fields[$textColumn]), 0, 512);
        }
    }
    $fields['updated_at'] = date('Y-m-d H:i:s');

    // Look the row up first so "not found" can be told apart from an update that changed nothing.
    $current = Db::name('article_reference_check_result')->where('id', $checkId)->find();
    if (empty($current)) {
        throw new \RuntimeException('article_reference_check_result not found, id=' . $checkId);
    }

    $affected = Db::name('article_reference_check_result')->where('id', $checkId)->update($fields);
    if (false === $affected) {
        throw new \RuntimeException('article_reference_check_result update failed, id=' . $checkId);
    }

    $affected = intval($affected);
    \think\Log::info('updateCheckResult id=' . $checkId . ' affected=' . $affected);
    return $affected;
}
/**
 * Fetch one check-result row by its primary key.
 *
 * The lookup uses the `id` column only. A preceding lookup on `check_id`
 * was dead code: its result was overwritten immediately by the `id` lookup.
 *
 * @param int|string $checkId Row id in article_reference_check_result.
 * @return array|null The row, or null when the id is not positive or nothing matches.
 */
public function getResult($checkId)
{
    if ($checkId <= 0) {
        return null;
    }
    $row = Db::name('article_reference_check_result')->where('id', $checkId)->find();
    return $row ?: null;
}
@@ -435,7 +533,7 @@ class ReferenceCheckService
'ref_nos' => [],
];
}
$byAm[$amId]['contexts'][$ctxKey]['check_ids'][] = intval($row['check_id']);
$byAm[$amId]['contexts'][$ctxKey]['check_ids'][] = $this->resolveCheckRowId($row);
$byAm[$amId]['contexts'][$ctxKey]['ref_nos'][] = $refNo;
$reason = trim((string)$this->arrGet($row, 'reason', ''));
if ($reason !== '') {
@@ -501,7 +599,7 @@ class ReferenceCheckService
$issueCount++;
$issues[] = array(
'am_id' => $amId,
'check_id' => intval($row['check_id']),
'check_id' => $this->resolveCheckRowId($row),
'reference_no' => $num,
'reference_raw' => $inner,
'reason' => $rowReason,
@@ -512,7 +610,7 @@ class ReferenceCheckService
ENT_QUOTES,
'UTF-8'
);
return '<span class="ref-no-error" data-check-id="' . intval($row['check_id'])
return '<span class="ref-no-error" data-check-id="' . $this->resolveCheckRowId($row)
. '" data-ref-no="' . $num . '" title="' . $title . '">'
. $numMatch[0] . '</span>';
},
@@ -627,6 +725,448 @@ class ReferenceCheckService
return implode("\n", $parts);
}
/**
 * Extract a DOI from the `refer_doi` field only (used for the second-pass
 * Crossref abstract lookup).
 *
 * @param mixed $refer Reference row (array); anything else yields ''.
 * @return string First DOI found in refer_doi, or '' when the field is empty,
 *                contains "not available", or holds no valid DOI.
 */
public function extractReferDoiOnly($refer)
{
    if (!is_array($refer)) {
        return '';
    }
    $field = trim((string)$this->arrGet($refer, 'refer_doi', ''));
    $unusable = ($field === '') || (stripos($field, 'not available') !== false);
    if ($unusable) {
        return '';
    }
    $found = $this->extractDoisFromString($field);
    if (empty($found)) {
        return '';
    }
    return $found[0];
}
/**
 * Look up the Crossref record for the row's `refer_doi` (second-pass check only).
 *
 * @param mixed $refer Reference row.
 * @return array{text:string, has_abstract:bool, doi:string}
 *         `doi` is '' when refer_doi holds no usable DOI; `text` is '' and
 *         `has_abstract` is false when no Crossref block could be built.
 */
public function fetchCrossrefAbstractByReferDoi($refer)
{
    $result = ['text' => '', 'has_abstract' => false, 'doi' => ''];

    $doi = $this->extractReferDoiOnly($refer);
    if ($doi === '') {
        return $result;
    }
    $result['doi'] = $doi;

    $client = new CrossrefService([
        'mailto' => trim((string)Env::get('crossref_mailto', '')),
    ]);
    $block = $this->extractCrossrefBlock($doi, $client);
    if ($block !== null) {
        $result['text'] = $block['text'];
        $result['has_abstract'] = !empty($block['has_abstract']);
    }
    return $result;
}
/**
 * Read the `can_support` verdict from an LLM response.
 *
 * Falls back to `is_match` when `can_support` is absent; the chosen value
 * is normalised by parseLlmIsMatch().
 *
 * @param mixed $llmResult Decoded LLM response (array); anything else yields false.
 * @return bool
 */
public function parseLlmCanSupport($llmResult)
{
    if (!is_array($llmResult)) {
        return false;
    }
    if (array_key_exists('can_support', $llmResult)) {
        $flag = $llmResult['can_support'];
    } elseif (isset($llmResult['is_match'])) {
        $flag = $llmResult['is_match'];
    } else {
        $flag = false;
    }
    return $this->parseLlmIsMatch($flag);
}
/**
 * First-pass check: load article_main.content (the whole section body) for
 * the row's am_id and flatten it to plain text.
 *
 * Processing order: unwrap `<blue>[n]</blue>` citation markers to `[n]`,
 * strip remaining tags, decode HTML entities, collapse whitespace, then cut
 * to $maxChars characters (never fewer than 500) and append '...'.
 *
 * @param array $row      Check-result row; only `am_id` is read.
 * @param int   $maxChars Maximum length of the returned text (floor 500).
 * @return string Plain text, or '' when am_id is missing, the section is not
 *                found, or its content is empty.
 */
public function resolveMainContentForJob(array $row, $maxChars = 8000)
{
    $amId = intval($this->arrGet($row, 'am_id', 0));
    if ($amId <= 0) {
        return '';
    }
    $main = Db::name('article_main')
        ->field('content')
        ->where('am_id', $amId)
        ->find();
    if (empty($main)) {
        return '';
    }
    $text = trim((string)$this->arrGet($main, 'content', ''));
    if ($text === '') {
        return '';
    }

    // preg_replace() returns null on a PCRE error; keep the previous text then
    // instead of letting the whole section collapse to an empty string.
    $unwrapped = preg_replace('/<blue>\[([\d,\-\s]+)\]<\/blue>/', '[$1]', $text);
    if ($unwrapped !== null) {
        $text = $unwrapped;
    }
    $text = strip_tags($text);
    $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

    // The /u pattern fails (null) on malformed UTF-8; retry byte-wise so the
    // content is still usable rather than silently dropped.
    $collapsed = preg_replace('/\s+/u', ' ', $text);
    if ($collapsed === null) {
        $collapsed = preg_replace('/\s+/', ' ', $text);
    }
    if ($collapsed !== null) {
        $text = $collapsed;
    }
    $text = trim($text);

    $maxChars = max(500, intval($maxChars));
    if (mb_strlen($text) > $maxChars) {
        $text = mb_substr($text, 0, $maxChars) . '...';
    }
    return $text;
}
/**
 * Local context around the citation: `origin_text`, falling back to
 * `content_a` when that is blank (kept for other call sites).
 *
 * @param array $row Check-result row.
 * @return string Trimmed context text, or '' when both fields are blank.
 */
public function resolveCitationContextForJob(array $row)
{
    foreach (['origin_text', 'content_a'] as $column) {
        $candidate = trim((string)$this->arrGet($row, $column, ''));
        if ($candidate !== '') {
            return $candidate;
        }
    }
    return '';
}
/**
 * Extract the preferred DOI (10.xxxx/...) from a reference row.
 *
 * Priority: refer_content (a DOI in the raw citation text is closest to the
 * work actually cited) > refer_doi > doi > doilink.
 *
 * @param mixed $refer Reference row.
 * @return string Highest-priority DOI, or '' when none is found.
 */
public function extractDoiFromRefer($refer)
{
    $candidates = $this->extractAllDoiCandidatesFromRefer($refer);
    if (empty($candidates)) {
        return '';
    }
    return $candidates[0];
}
/**
 * List every DOI candidate found in a reference row, de-duplicated and in
 * priority order (refer_content, refer_doi, doi, doilink).
 *
 * Used by the second-pass DOI recheck: when the metadata refer_doi and the
 * DOI in the raw citation text disagree, the raw-text DOI is tried first.
 *
 * @param mixed $refer Reference row (array); anything else yields [].
 * @return string[]
 */
public function extractAllDoiCandidatesFromRefer($refer)
{
    if (!is_array($refer)) {
        return [];
    }
    $collected = [];
    foreach (['refer_content', 'refer_doi', 'doi', 'doilink'] as $column) {
        $raw = (string)$this->arrGet($refer, $column, '');
        $collected = array_merge($collected, $this->extractDoisFromString($raw));
    }
    // array_unique keeps the first occurrence, so priority order is preserved.
    return array_values(array_unique($collected));
}
/**
 * Pull every DOI of the form 10.xxxx/yyy out of free text.
 *
 * Matches doi.org URLs first, then bare DOIs; each hit has trailing
 * punctuation trimmed and is validated. If nothing matched and the text
 * itself starts with "10.", the whole text is tried as a single DOI.
 *
 * @param mixed $text Any text; blank input or text containing "not available" yields [].
 * @return string[] Unique DOIs in order of first appearance.
 */
private function extractDoisFromString($text)
{
    $text = trim((string)$text);
    if ($text === '' || stripos($text, 'not available') !== false) {
        return [];
    }

    $patterns = [
        '~doi\.org/([^\s?#"\'<>]+)~i',
        '~\b(10\.\d{3,9}/[^\s?#"\'<>]+)~i',
    ];
    $dois = [];
    foreach ($patterns as $pattern) {
        if (!preg_match_all($pattern, $text, $m)) {
            continue;
        }
        foreach ($m[1] as $hit) {
            $hit = $this->trimDoiTail(trim($hit));
            if ($this->isValidDoi($hit)) {
                $dois[] = $hit;
            }
        }
    }

    if ($dois === [] && strpos($text, '10.') === 0) {
        $whole = $this->trimDoiTail($text);
        if ($this->isValidDoi($whole)) {
            $dois[] = $whole;
        }
    }
    return array_values(array_unique($dois));
}
/**
 * Strip trailing punctuation, quotes, brackets, backslashes and whitespace
 * that commonly follow a DOI in running text.
 *
 * @param string $doi
 * @return string
 */
private function trimDoiTail($doi)
{
    $trailingJunk = ".,;:)]}>\"'\\ \t\n\r";
    return rtrim($doi, $trailingJunk);
}
/**
 * Check that a string has the shape `10.<3-9 digits>/<non-whitespace>`.
 *
 * @param mixed $doi
 * @return bool
 */
private function isValidDoi($doi)
{
    $matched = preg_match('~^10\.\d{3,9}/[^\s]+$~i', (string)$doi);
    return $matched === 1;
}
/**
 * Fetch the literature record behind a reference's DOI from PubMed / Crossref
 * (the local LLM cannot open web pages, so the content is fetched up front).
 *
 * Behaviour:
 * - every DOI candidate of the row is tried in priority order
 *   (refer_content > refer_doi > doi > doilink);
 * - the first candidate that yields an abstract wins;
 * - otherwise the first candidate that yielded any block is used;
 * - '' is returned when nothing could be fetched (callers skip the recheck).
 *
 * @param mixed $refer Reference row.
 * @return string Text block for the LLM prompt, or ''.
 */
public function fetchDoiLiteratureBlock($refer)
{
    $dois = $this->extractAllDoiCandidatesFromRefer($refer);
    if (empty($dois)) {
        return '';
    }

    $pubmed = new PubmedService([
        'email' => trim((string)Env::get('pubmed_email', '')),
        'tool' => trim((string)Env::get('pubmed_tool', 'tmrjournals')),
    ]);
    $crossref = new CrossrefService([
        'mailto' => trim((string)Env::get('crossref_mailto', '')),
    ]);

    $firstWithoutAbstract = null;
    foreach ($dois as $doi) {
        $block = $this->buildDoiBlockFromSources($doi, $pubmed, $crossref);
        if ($block === null) {
            continue;
        }
        if (!empty($block['has_abstract'])) {
            // An abstract is the best evidence available: stop searching.
            return $block['text'];
        }
        if ($firstWithoutAbstract === null) {
            $firstWithoutAbstract = $block;
        }
    }

    return $firstWithoutAbstract === null ? '' : $firstWithoutAbstract['text'];
}
/**
 * Fetch the real content behind a single DOI.
 *
 * PubMed is asked first. When it returns a title or an abstract, a text block
 * is built from its fields; if the abstract is missing, a Crossref block with
 * an abstract is appended when available. When PubMed returns nothing
 * usable, the Crossref block alone is returned.
 *
 * @return array|null ['text' => string, 'has_abstract' => bool], or null
 */
private function buildDoiBlockFromSources($doi, PubmedService $pubmed, CrossrefService $crossref)
{
    $doi = trim((string)$doi);
    if ($doi === '') {
        return null;
    }

    $pub = $pubmed->fetchByDoi($doi);
    $abstract = is_array($pub) ? trim((string)$this->arrGet($pub, 'abstract', '')) : '';
    $usable = is_array($pub)
        && ($abstract !== '' || trim((string)$this->arrGet($pub, 'title', '')) !== '');
    if (!$usable) {
        return $this->extractCrossrefBlock($doi, $crossref);
    }

    $lines = ['Source: PubMed (DOI ' . $doi . ')'];

    $scalarLabels = [
        'title' => 'Actual Title: ',
        'journal' => 'Journal: ',
        'year' => 'Year: ',
    ];
    foreach ($scalarLabels as $key => $label) {
        if (!empty($pub[$key])) {
            $lines[] = $label . trim((string)$pub[$key]);
        }
    }

    $listLabels = [
        'publication_types' => 'Publication Types: ',
        'mesh_terms' => 'MeSH: ',
    ];
    foreach ($listLabels as $key => $label) {
        if (!empty($pub[$key])) {
            $lines[] = $label . implode('; ', (array)$pub[$key]);
        }
    }

    if ($abstract !== '') {
        $lines[] = 'Abstract: ' . $this->truncate($abstract, 3500);
        return ['text' => implode("\n", $lines), 'has_abstract' => true];
    }

    // PubMed had no abstract: try to supplement it from Crossref.
    $cr = $this->extractCrossrefBlock($doi, $crossref);
    if ($cr !== null && $cr['has_abstract']) {
        $lines[] = "\n--- Crossref 补充 ---\n" . $cr['text'];
        return ['text' => implode("\n", $lines), 'has_abstract' => true];
    }
    return ['text' => implode("\n", $lines), 'has_abstract' => false];
}
/**
 * Build a text block (title / journal / authors / abstract) from Crossref.
 * The abstract usually arrives wrapped in JATS XML and is cleaned first.
 *
 * @return array|null ['text' => string, 'has_abstract' => bool], or null
 *                    when fetchWork() does not return an array
 */
private function extractCrossrefBlock($doi, CrossrefService $crossref)
{
    $msg = $crossref->fetchWork($doi);
    if (!is_array($msg)) {
        return null;
    }
    $summary = $crossref->fetchWorkSummary($doi);
    if (!is_array($summary)) {
        $summary = [];
    }

    $lines = ['Source: Crossref api.crossref.org/works/' . rawurlencode($doi)];

    // Prefer the title from the raw work record; fall back to the summary.
    if (isset($msg['title'][0])) {
        $title = trim((string)$msg['title'][0]);
    } else {
        $title = trim((string)$this->arrGet($summary, 'title', ''));
    }
    if ($title !== '') {
        $lines[] = 'Actual Title: ' . $title;
    }

    $summaryLabels = [
        'joura' => 'Journal: ',
        'author_str' => 'Authors: ',
        'dateno' => 'Publication: ',
        'doilink' => 'DOI Link: ',
    ];
    foreach ($summaryLabels as $key => $label) {
        if (!empty($summary[$key])) {
            $lines[] = $label . trim((string)$summary[$key]);
        }
    }
    if (!empty($summary['is_retracted'])) {
        $lines[] = 'Retraction: yes - ' . trim((string)$this->arrGet($summary, 'retract_reason', ''));
    }

    $abstract = $this->cleanCrossrefAbstract((string)$this->arrGet($msg, 'abstract', ''));
    $hasAbstract = $abstract !== '';
    $lines[] = $hasAbstract
        ? 'Abstract: ' . $this->truncate($abstract, 3500)
        : 'Note: Crossref 未返回摘要,请结合标题/期刊/作者与正文谨慎判断。';

    return ['text' => implode("\n", $lines), 'has_abstract' => $hasAbstract];
}
/**
 * Turn a Crossref abstract (usually JATS XML) into plain text.
 *
 * Drops <jats:title> elements, turns each <jats:p> into a line break,
 * removes every other tag, collapses runs of spaces/tabs, normalises line
 * endings to "\n" and squeezes blank lines.
 *
 * The whitespace patterns are pure ASCII, so they run without the /u flag:
 * with /u, preg_replace() returns null on malformed UTF-8 and the whole
 * abstract would be lost.
 *
 * @param mixed $raw Raw abstract string.
 * @return string Cleaned text, or '' when the input is blank.
 */
private function cleanCrossrefAbstract($raw)
{
    $raw = trim((string)$raw);
    if ($raw === '') {
        return '';
    }

    $tagRules = [
        '~<jats:title[^>]*>.*?</jats:title>~is' => '',
        '~<jats:p[^>]*>~i' => "\n",
        '~</jats:p>~i' => '',
        '~</?jats:[^>]+>~i' => '',
    ];
    foreach ($tagRules as $pattern => $replacement) {
        $next = preg_replace($pattern, $replacement, $raw);
        // Keep the previous text if PCRE reports an error (null result).
        if ($next !== null) {
            $raw = $next;
        }
    }

    $raw = strip_tags($raw);

    $whitespaceRules = [
        '/[ \t]+/' => ' ',
        "/\r\n|\r/" => "\n",
        "/\n{2,}/" => "\n",
    ];
    foreach ($whitespaceRules as $pattern => $replacement) {
        $next = preg_replace($pattern, $replacement, $raw);
        if ($next !== null) {
            $raw = $next;
        }
    }
    return trim($raw);
}
/**
 * Cut text to at most $max characters, appending '...' when it was shortened.
 *
 * @param mixed $text Cast to string.
 * @param int   $max  Maximum number of characters kept.
 * @return string
 */
private function truncate($text, $max)
{
    $text = (string)$text;
    return mb_strlen($text) > $max
        ? mb_substr($text, 0, $max) . '...'
        : $text;
}
/**
 * Prepare the data for the second-pass DOI recheck: the bibliographic entry
 * plus the content actually fetched for the DOI.
 *
 * @param mixed  $refer     Reference row.
 * @param string $referText Pre-formatted entry; when blank, formatReferForLlm() is used.
 * @return array{refer_text:string, doi_block:string, has_abstract:bool, doi_used:string}
 */
public function prepareRecheckPayload($refer, $referText = '')
{
    $entry = trim($referText);
    if ($entry === '') {
        $entry = $this->formatReferForLlm($refer);
    }

    $crossref = $this->fetchCrossrefAbstractByReferDoi($refer);

    return [
        'refer_text' => $entry,
        'doi_block' => $crossref['text'],
        'has_abstract' => $crossref['has_abstract'],
        'doi_used' => $crossref['doi'],
    ];
}
/**
 * Legacy interface: join the recheck payload into one text block. Kept for
 * backward compatibility; callers are advised to use prepareRecheckPayload().
 *
 * @param mixed  $refer     Reference row.
 * @param string $referText Optional pre-formatted entry.
 * @return string Bibliographic entry followed by either the fetched DOI
 *                block or a notice that nothing could be fetched.
 */
public function formatReferForDoiRecheck($refer, $referText = '')
{
    $payload = $this->prepareRecheckPayload($refer, $referText);

    if ($payload['doi_block'] !== '') {
        $suffix = "\n\n【Crossref 摘要(依据 Refer_doi 从 api.crossref.org/works 获取)】\n"
            . $payload['doi_block'];
    } else {
        $suffix = "\n\n【DOI 文献真实内容】\n未能从 PubMed/Crossref 获取该 DOI 的摘要或元数据,请依据书目条目与正文谨慎判断。";
    }
    return $payload['refer_text'] . $suffix;
}
/**
 * Queue the delayed second-pass recheck when the first pass scored
 * confidence <= 0.65 and real content for the DOI can be fetched.
 *
 * Skipped (returns false), to avoid re-running with the same outcome, when:
 * - the check id is invalid or the first-pass confidence is above 0.65;
 * - the check row or its reference row (state = 0) does not exist;
 * - refer_doi is empty or Crossref returns no abstract.
 *
 * @param int|string   $checkId    Row id in article_reference_check_result.
 * @param float|string $confidence First-pass confidence.
 * @return bool True when a job was pushed with a 5-second delay.
 */
public function maybeEnqueueSecondPass($checkId, $confidence)
{
    $checkId = intval($checkId);
    $confidence = floatval($confidence);
    if ($checkId <= 0) {
        return false;
    }
    if ($confidence > 0.65) {
        return false;
    }

    $row = Db::name('article_reference_check_result')->where('id', $checkId)->find();
    if (empty($row)) {
        return false;
    }

    $referId = intval($row['p_refer_id']);
    if ($referId <= 0) {
        return false;
    }
    $refer = Db::name('production_article_refer')
        ->where('p_refer_id', $referId)
        ->where('state', 0)
        ->find();
    if (empty($refer)) {
        return false;
    }
    if ($this->extractReferDoiOnly($refer) === '') {
        return false;
    }

    // Without an abstract the second pass has nothing new to judge.
    $crossref = $this->fetchCrossrefAbstractByReferDoi($refer);
    if (empty($crossref['has_abstract'])) {
        return false;
    }

    $this->pushJob2($checkId, 5);
    return true;
}
/**
* 从 article_main.content 提取 blue 引用
*/
@@ -1021,10 +1561,24 @@ class ReferenceCheckService
} else {
$jobId = Queue::push($jobClass, $data, self::QUEUE_NAME);
}
var_dump("=====jobId:".$jobId);
} catch (\Exception $e) {
\think\Log::error('ReferenceCheck pushJob failed check_id=' . $checkId . ' ' . $e->getMessage());
throw $e;
}
}
/**
 * Push a second-pass recheck job (ReferenceCheckTwo@fire) onto the queue.
 *
 * @param int $checkId      Row id carried in the job payload as `check_id`.
 * @param int $delaySeconds Delay before the job becomes available; 0 pushes immediately.
 * @return void
 * @throws \Exception Re-thrown after logging when the queue call fails.
 */
private function pushJob2($checkId, $delaySeconds = 0)
{
    $jobClass = 'app\api\job\ReferenceCheckTwo@fire';
    $data = ['check_id' => $checkId];
    try {
        // The queue's return value was never used, so it is no longer captured.
        if ($delaySeconds > 0) {
            Queue::later($delaySeconds, $jobClass, $data, self::QUEUE_NAME);
        } else {
            Queue::push($jobClass, $data, self::QUEUE_NAME);
        }
    } catch (\Exception $e) {
        \think\Log::error('ReferenceCheckTwo pushJob failed check_id=' . $checkId . ' ' . $e->getMessage());
        throw $e;
    }
}
}

File diff suppressed because it is too large Load Diff