文章引用文献校验

This commit is contained in:
wyn
2026-05-21 10:02:05 +08:00
parent fa878334cd
commit 4aab7f5b7e
4 changed files with 1790 additions and 0 deletions

View File

@@ -10,6 +10,7 @@ use PhpOffice\PhpWord\IOFactory;
use app\common\OpenAi;
use app\common\CrossrefService;
use app\common\PubmedService;
use app\common\ReferenceCheckService;
/**
* @title 文章接口
@@ -6391,4 +6392,417 @@ class Article extends Base
Db::commit();
return json_encode(['status' => 1,'msg' => 'success']);
}
/**
 * Debug endpoint: preview the <blue> citations extracted from an article's
 * article_main rows. Nothing is enqueued.
 *
 * POST: article_id (required), am_id (optional, restricts the preview to one section)
 *
 * @return mixed Whatever jsonSuccess()/jsonError() produce.
 */
public function citationReview()
{
    // Read the ids from the request; they used to be hard-coded debug values,
    // which made the documented article_id parameter a no-op.
    $articleId = intval($this->request->post('article_id', 0));
    if ($articleId <= 0) {
        return jsonError('article_id is required');
    }
    $amId = intval($this->request->post('am_id', 0));

    $query = Db::name('article_main')
        ->field('am_id,content')
        ->where('article_id', $articleId);
    if ($amId > 0) {
        // Optional narrowing to a single section.
        $query = $query->where('am_id', $amId);
    }
    $mains = $query->order('sort asc')->select();

    $svc = new ReferenceCheckService();
    $preview = [];
    // Every selected section is previewed (the old loop stopped after the first row).
    foreach ($mains as $item) {
        $preview[] = [
            'am_id' => $item['am_id'],
            'citations' => $svc->extractReferences((string)$item['content']),
        ];
    }

    return jsonSuccess(['article_id' => $articleId, 'sections' => $preview]);
}
/**
 * Extract every <blue>[...]</blue> citation marker from a content string,
 * together with the sentence surrounding it.
 *
 * Recognised marker forms: <blue>[57]</blue>, <blue>[74-79]</blue>, <blue>[72, 45]</blue>.
 *
 * Every offset in the result is a BYTE offset into $content: the tag offsets
 * come from PREG_OFFSET_CAPTURE and the sentence bounds from the byte-based
 * findSentenceStart()/findSentenceEnd() helpers.
 *
 * @param string $content Raw content.
 * @return array List of entries with keys reference_raw, reference_numbers,
 *               original_text, reference_start, reference_end, text_start, text_end.
 */
function extractReferences($content)
{
    $content = (string)$content;
    $pattern = '/<blue>\[([\d,\-\s]+)\]<\/blue>/';

    // preg_match_all() yields 0 (no marker) or false (regex failure); both mean "nothing to report".
    if (!preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE)) {
        return [];
    }

    $result = [];
    foreach ($matches[0] as $index => $match) {
        // $match is [matched text, byte offset of the match].
        list($fullTag, $tagStart) = $match;
        $tagEnd = $tagStart + strlen($fullTag);

        // Reference numbers exactly as written between the brackets, then expanded.
        $rawRef = trim($matches[1][$index][0]);
        $referenceNumbers = $this->expandReferenceNumbers($rawRef);

        // Bounds of the sentence that contains this marker.
        $sentenceStart = $this->findSentenceStart($content, $tagStart);
        $sentenceEnd = $this->findSentenceEnd($content, $tagEnd);

        // The bounds are byte offsets, so slice with substr(). mb_substr() counts
        // characters and returns the wrong span as soon as multi-byte text
        // precedes the marker.
        $sentence = (string)substr($content, $sentenceStart, $sentenceEnd - $sentenceStart);

        // Drop every blue marker from the quoted sentence.
        $originalText = trim((string)preg_replace($pattern, '', $sentence));

        $result[] = [
            'reference_raw' => $rawRef,
            'reference_numbers' => $referenceNumbers,
            'original_text' => $originalText,
            // Position of the blue tag within the whole content (bytes).
            'reference_start' => $tagStart,
            'reference_end' => $tagEnd,
            // Position of the surrounding sentence (bytes).
            'text_start' => $sentenceStart,
            'text_end' => $sentenceEnd,
        ];
    }

    return $result;
}
/**
 * Expand a citation number list into individual reference numbers.
 *
 *   11-15    => [11,12,13,14,15]
 *   72,45    => [72,45]
 *   74-79,81 => [74,75,76,77,78,79,81]
 *
 * Parts that are not a plain number or a "start-end" pair of plain numbers
 * (for example "-5", "5-" or "1-2-3") are ignored, as are descending ranges
 * and ranges wider than the safety cap below.
 *
 * @param string $refStr Comma-separated numbers and/or ranges.
 * @return array Unique reference numbers in order of first appearance.
 */
function expandReferenceNumbers($refStr)
{
    // Cap on a single range so a malformed marker such as "1-999999999"
    // cannot make range() allocate an enormous array.
    $maxSpan = 10000;

    $numbers = [];
    foreach (explode(',', (string)$refStr) as $part) {
        $part = trim($part);

        // Single number.
        if (preg_match('/^\d+$/', $part)) {
            $numbers[] = intval($part);
            continue;
        }

        // Range: both endpoints must be present. Previously a missing endpoint
        // was read as 0, so "-5" expanded to 0..5.
        if (preg_match('/^(\d+)\s*-\s*(\d+)$/', $part, $bounds)) {
            $start = intval($bounds[1]);
            $end = intval($bounds[2]);
            if ($start <= $end && ($end - $start) <= $maxSpan) {
                $numbers = array_merge($numbers, range($start, $end));
            }
        }
    }

    return array_values(array_unique($numbers));
}
/**
 * Find the byte offset at which the sentence containing $position starts.
 *
 * The sentence is taken to start right after the closest preceding
 * delimiter ('.', '。', '!', '?' or a newline), or at offset 0 when no
 * delimiter precedes $position.
 *
 * @param string $content  Text to search.
 * @param int    $position Byte offset inside $content; only text before it is examined.
 * @return int Byte offset of the first byte after the closest preceding delimiter.
 */
function findSentenceStart($content, $position)
{
    $delimiters = ['.', '。', '!', '?', "\n"];

    // Slice once instead of once per delimiter.
    $before = (string)substr((string)$content, 0, $position);

    $start = 0;
    foreach ($delimiters as $delimiter) {
        $pos = strrpos($before, $delimiter);
        if ($pos !== false) {
            // Skip the whole delimiter. A multi-byte delimiter such as '。' is longer
            // than one byte, so "$pos + 1" would land inside the character.
            $start = max($start, $pos + strlen($delimiter));
        }
    }

    return $start;
}
/**
 * Find the byte offset at which the sentence containing $position ends.
 *
 * The sentence ends right after the first delimiter ('.', '。', '!', '?' or
 * a newline) found at or after $position, or at the end of $content when no
 * delimiter follows.
 *
 * @param string $content  Text to search.
 * @param int    $position Byte offset inside $content to start searching from.
 * @return int Byte offset just past the closing delimiter (exclusive end).
 */
function findSentenceEnd($content, $position)
{
    $content = (string)$content;
    $length = strlen($content);

    // Nothing can follow a position at or beyond the end, and strpos() rejects
    // offsets larger than the string.
    if ($position >= $length) {
        return $length;
    }

    $end = $length;
    foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
        $pos = strpos($content, $delimiter, $position);
        if ($pos !== false) {
            // Include the whole delimiter. A multi-byte delimiter such as '。' is
            // longer than one byte, so "$pos + 1" would cut it in half.
            $end = min($end, $pos + strlen($delimiter));
        }
    }

    return $end;
}
/**
 * Citation relevance: submit a single check to the queue.
 *
 * POST: content_a (required), content_b (optional), article_id,
 *       reference_no (= refer index + 1), am_id
 *
 * When content_b is empty and both article_id and reference_no are given,
 * content_b is built from the matching entry of the reference map of the
 * article's production_article row (state = 0).
 *
 * @return mixed Whatever jsonSuccess()/jsonError() produce.
 */
public function referenceCheckEnqueue()
{
    $post = $this->request->post();

    $textA = trim((string)(isset($post['content_a']) ? $post['content_a'] : ''));
    $textB = trim((string)(isset($post['content_b']) ? $post['content_b'] : ''));
    $articleId = isset($post['article_id']) ? intval($post['article_id']) : 0;
    $refNo = isset($post['reference_no']) ? intval($post['reference_no']) : 0;

    if ($textA === '') {
        return jsonError('content_a is required');
    }

    try {
        $service = new ReferenceCheckService();
        $extra = [
            'reference_no' => $refNo,
            'article_id' => $articleId,
            'am_id' => isset($post['am_id']) ? intval($post['am_id']) : 0,
        ];

        // Only look the reference up when the caller did not supply content_b.
        $production = null;
        if ($textB === '' && $articleId > 0 && $refNo > 0) {
            $production = Db::name('production_article')
                ->where('article_id', $articleId)
                ->where('state', 0)
                ->find();
        }

        if ($production) {
            $pArticleId = intval($production['p_article_id']);
            $referMap = $service->loadReferMapByPArticleId($pArticleId);
            // reference_no is 1-based; the map is indexed from 0.
            $slot = $refNo - 1;
            if (isset($referMap[$slot])) {
                $entry = $referMap[$slot];
                $textB = $service->formatReferForLlm($entry);
                $extra['p_article_id'] = $pArticleId;
                $extra['p_refer_id'] = intval($entry['p_refer_id']);
                $extra['refer_index'] = $slot;
            }
        }

        return jsonSuccess($service->enqueue($textA, $textB, $extra));
    } catch (\Exception $e) {
        return jsonError($e->getMessage());
    }
}
/**
 * Enqueue reference checks section by section: every article_main row of the
 * article with state 0 or 2 is handed to ReferenceCheckService::enqueueByArticleMain().
 *
 * POST: article_id
 *
 * @return mixed Whatever jsonSuccess()/jsonError() produce.
 */
public function referenceCheckEnqueueArticleMain()
{
    $data = $this->request->post();
    $articleId = intval(isset($data['article_id']) ? $data['article_id'] : 0);
    if ($articleId <= 0) {
        return jsonError('article_id is required');
    }

    // Same error handling as the other enqueue endpoints: report failures as
    // a JSON error instead of letting the exception escape.
    try {
        $mainsList = Db::name('article_main')
            ->field('am_id,content,article_id')
            ->where('article_id', $articleId)
            ->whereIn('state', [0, 2])
            ->order('sort asc')
            ->select();

        $svc = new ReferenceCheckService();
        $sectionCount = 0;
        foreach ($mainsList as $mainInfo) {
            $svc->enqueueByArticleMain($mainInfo);
            $sectionCount++;
        }

        // The endpoint used to finish without returning anything, leaving the
        // caller with no response body.
        return jsonSuccess([
            'article_id' => $articleId,
            'section_count' => $sectionCount,
        ]);
    } catch (\Exception $e) {
        return jsonError($e->getMessage());
    }
}
/**
 * Enqueue reference checks for a whole article via
 * ReferenceCheckService::enqueueByArticle().
 *
 * POST: article_id, clear_previous (treated as 1 when omitted; per the
 *       original note, 1 clears the article's previous detail rows before
 *       re-checking)
 *
 * @return mixed Whatever jsonSuccess()/jsonError() produce.
 */
public function referenceCheckEnqueueArticle()
{
    $post = $this->request->post();

    $articleId = isset($post['article_id']) ? intval($post['article_id']) : 0;
    if ($articleId <= 0) {
        return jsonError('article_id is required');
    }

    try {
        // Clearing is the default; it is switched off only by an explicit value other than 1.
        $clearPrevious = true;
        if (isset($post['clear_previous'])) {
            $clearPrevious = intval($post['clear_previous']) === 1;
        }

        $service = new ReferenceCheckService();

        return jsonSuccess($service->enqueueByArticle($articleId, $clearPrevious));
    } catch (\Exception $e) {
        return jsonError($e->getMessage());
    }
}
/**
 * Fetch the result of a single citation relevance check.
 *
 * GET/POST: check_id
 *
 * @return mixed Whatever jsonSuccess()/jsonError() produce.
 */
public function referenceCheckResult()
{
    $id = intval($this->request->param('check_id', 0));
    if ($id <= 0) {
        return jsonError('check_id is required');
    }

    $service = new ReferenceCheckService();
    $record = $service->getResult($id);

    if ($record) {
        return jsonSuccess($this->formatReferenceCheckRow($record));
    }

    return jsonError('result not found');
}
/**
 * Manuscript preview: content carrying markup for unreasonable citations
 * (reference number + citing sentence), as built by
 * ReferenceCheckService::buildArticlePreview().
 *
 * GET/POST: article_id, am_id (optional, preview a single section only)
 *
 * The response is extended with a legend of the CSS classes used by the
 * markup (markup_hint) and a ready-made stylesheet for them (preview_css).
 *
 * @return mixed Whatever jsonSuccess()/jsonError() produce.
 */
public function referenceCheckPreview()
{
    $articleId = intval($this->request->param('article_id', 0));
    if ($articleId <= 0) {
        return jsonError('article_id is required');
    }
    $sectionId = intval($this->request->param('am_id', 0));

    try {
        $service = new ReferenceCheckService();
        $payload = $service->buildArticlePreview($articleId, $sectionId);

        // Legend: which CSS class marks which kind of problem.
        $legend = [
            'ref_no' => '.ref-no-error — 不合理的文献序号(如 70-73 中单独的 70',
            'ref_cite' => '.ref-cite-tag.ref-cite-error — 含不合理序号的 blue 引用块',
            'ref_context' => '.ref-context-error — 不合理的引用句/上下文',
        ];

        // One rule per class, concatenated without separators.
        $stylesheet = implode('', [
            '.ref-no-error{color:#c00;font-weight:bold;border-bottom:2px wavy #c00}',
            '.ref-cite-tag.ref-cite-error{background:#ffecec}',
            '.ref-context-error{background:#fff3cd;outline:1px dashed #e6a700}',
        ]);

        $payload['markup_hint'] = $legend;
        $payload['preview_css'] = $stylesheet;

        return jsonSuccess($payload);
    } catch (\Exception $e) {
        return jsonError($e->getMessage());
    }
}
/**
 * List the citation check results of an article, plus a per-section status
 * summary. Per the original note, a marker such as [70-73] is stored as 4
 * rows with reference_no 70, 71, 72 and 73.
 *
 * GET/POST: article_id, status (optional), only_mismatch=1 (unreasonable ones only)
 *
 * @return mixed Whatever jsonSuccess()/jsonError() produce.
 */
public function referenceCheckList()
{
    $articleId = intval($this->request->param('article_id', 0));
    if ($articleId <= 0) {
        return jsonError('article_id is required');
    }

    // A missing or empty status means "no status filter", passed to the service as -1.
    $rawStatus = $this->request->param('status', '');
    if ($rawStatus === '' || $rawStatus === null) {
        $statusFilter = -1;
    } else {
        $statusFilter = intval($rawStatus);
    }
    $mismatchOnly = intval($this->request->param('only_mismatch', 0)) === 1;

    // Individual check rows.
    $service = new ReferenceCheckService();
    $formatted = [];
    foreach ($service->listByArticle($articleId, $statusFilter, $mismatchOnly) as $checkRow) {
        $formatted[] = $this->formatReferenceCheckRow($checkRow);
    }

    // Per-section check status.
    $sectionRows = Db::name('article_main')
        ->field('am_id,ref_check_status,sort')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->order('sort asc')
        ->select();

    $sectionSummaries = [];
    foreach ($sectionRows as $section) {
        $checkStatus = isset($section['ref_check_status']) ? intval($section['ref_check_status']) : 0;
        $sectionSummaries[] = [
            'am_id' => intval($section['am_id']),
            'ref_check_status' => $checkStatus,
            'ref_check_pass' => $checkStatus === ReferenceCheckService::AM_STATUS_PASS,
            'ref_check_label' => ReferenceCheckService::amStatusLabel($checkStatus),
        ];
    }

    return jsonSuccess([
        'article_id' => $articleId,
        'total' => count($formatted),
        'list' => $formatted,
        'sections' => $sectionSummaries,
    ]);
}
/**
 * Convert a reference-check row into the array returned by the API.
 *
 * Integer fields are normalised with intval(), the numeric status becomes a
 * label (0 pending, 1 done, 2 failed, anything else unknown) and optional
 * columns fall back to 0 or ''. check_id, status, is_match and confidence
 * are read without a fallback.
 *
 * @param array $row Reference-check row.
 * @return array Normalised row.
 */
private function formatReferenceCheckRow($row)
{
    // Optional integer column, 0 when absent.
    $intOrZero = function ($key) use ($row) {
        return isset($row[$key]) ? intval($row[$key]) : 0;
    };
    // Optional column returned as stored, '' when absent.
    $valueOrEmpty = function ($key) use ($row) {
        return isset($row[$key]) ? $row[$key] : '';
    };

    $labels = [0 => 'pending', 1 => 'done', 2 => 'failed'];
    $statusCode = intval($row['status']);
    $statusLabel = isset($labels[$statusCode]) ? $labels[$statusCode] : 'unknown';

    $sectionId = $intOrZero('am_id');
    $tagStart = $intOrZero('cite_tag_start');
    $matchFlag = intval($row['is_match']);

    return [
        'check_id' => intval($row['check_id']),
        'article_id' => $intOrZero('article_id'),
        'am_id' => $sectionId,
        // Groups the rows that come from the same citation tag of the same section.
        'cite_group_key' => $sectionId . '_' . $tagStart,
        'p_refer_id' => $intOrZero('p_refer_id'),
        'refer_index' => $intOrZero('refer_index'),
        'reference_no' => $intOrZero('reference_no'),
        'reference_raw' => $valueOrEmpty('reference_raw'),
        'cite_tag_start' => $tagStart,
        'cite_tag_end' => $intOrZero('cite_tag_end'),
        'text_start' => $intOrZero('text_start'),
        'text_end' => $intOrZero('text_end'),
        'status' => $statusLabel,
        'is_match' => $matchFlag,
        'is_reasonable' => $matchFlag === 1,
        'confidence' => floatval($row['confidence']),
        'reason' => $valueOrEmpty('reason'),
        'error_msg' => $valueOrEmpty('error_msg'),
        'content_a' => $valueOrEmpty('content_a'),
        'content_b' => $valueOrEmpty('content_b'),
        'updated_at' => $valueOrEmpty('updated_at'),
    ];
}
}