文章引用文献校验
This commit is contained in:
751
application/common/ReferenceCheckService.php
Normal file
751
application/common/ReferenceCheckService.php
Normal file
@@ -0,0 +1,751 @@
|
||||
<?php
|
||||
|
||||
namespace app\common;
|
||||
|
||||
use think\Db;
|
||||
use think\Queue;
|
||||
|
||||
/**
|
||||
* 正文 <blue>[n]</blue> 引用与 t_production_article_refer(index+1=n)相关性校对。
|
||||
* LLM 配置与 PromotionLlmService 相同;单条任务走 ReferenceCheck 队列。
|
||||
*/
|
||||
class ReferenceCheckService
|
||||
{
|
||||
const QUEUE_NAME = 'ReferenceCheck';
|
||||
|
||||
/** t_article_main.ref_check_status */
|
||||
const AM_STATUS_NONE = 0;
|
||||
const AM_STATUS_PASS = 1;
|
||||
const AM_STATUS_FAIL = 2;
|
||||
const AM_STATUS_RUNNING = 3;
|
||||
|
||||
/**
 * Read $arr[$key], returning $default when isset() reports the entry as
 * missing (isset() is also false when the stored value is null).
 *
 * Exists so the class does not need the `??` operator.
 *
 * @param array      $arr     Container to read from.
 * @param int|string $key     Key to look up.
 * @param mixed      $default Fallback value; defaults to an empty string.
 * @return mixed
 */
private function arrGet($arr, $key, $default = '')
{
    if (isset($arr[$key])) {
        return $arr[$key];
    }

    return $default;
}
|
||||
|
||||
/**
 * Queue a single check; the body text and the reference text are supplied
 * by the caller.
 *
 * Inserts one row into article_reference_check_result, flags the owning
 * section as RUNNING when $extra['am_id'] is positive, then pushes the job.
 *
 * @param string $contentA Citing text; must not be empty after trimming.
 * @param string $contentB Reference text.
 * @param array  $extra    Optional column values (article_id, am_id, p_article_id,
 *                         p_refer_id, refer_index, reference_no, reference_raw,
 *                         cite_tag_start, cite_tag_end, text_start, text_end)
 *                         plus queue_delay in seconds.
 * @return array ['check_id' => inserted id, 'queued' => 1]
 * @throws \InvalidArgumentException when $contentA is empty after trimming
 */
public function enqueue($contentA, $contentB, array $extra = [])
{
    $textA = trim($contentA);
    if ($textA === '') {
        throw new \InvalidArgumentException('content_a is required');
    }

    $timestamp = date('Y-m-d H:i:s');

    // Assemble the row in the same column order as before: integer ids first,
    // then the raw citation string, the offsets, and finally the payload.
    $record = [];
    foreach (['article_id', 'am_id', 'p_article_id', 'p_refer_id', 'refer_index', 'reference_no'] as $column) {
        $record[$column] = intval($this->arrGet($extra, $column, 0));
    }
    $record['reference_raw'] = (string)$this->arrGet($extra, 'reference_raw', '');
    foreach (['cite_tag_start', 'cite_tag_end', 'text_start', 'text_end'] as $column) {
        $record[$column] = intval($this->arrGet($extra, $column, 0));
    }
    $record['content_a'] = $textA;
    $record['content_b'] = trim($contentB);
    $record['status'] = 0;
    $record['created_at'] = $timestamp;
    $record['updated_at'] = $timestamp;

    $checkId = Db::name('article_reference_check_result')->insertGetId($record);

    $sectionId = $record['am_id'];
    if ($sectionId > 0) {
        $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
    }

    $delaySeconds = intval($this->arrGet($extra, 'queue_delay', 0));
    $this->pushJob(intval($checkId), $delaySeconds);

    return ['check_id' => $checkId, 'queued' => 1];
}
|
||||
/**
 * Queue one reference check per (citation, reference number) pair found in a
 * single article_main row.
 *
 * A citation such as [70-73] is expanded into four rows (reference_no 70..73).
 * Reference numbers without a matching production_article_refer row are
 * skipped. A section without any citation is marked PASS. The section is
 * marked RUNNING only when at least one job was actually queued, so it cannot
 * stay RUNNING with nothing left to finish it.
 *
 * @param array $main article_main row; am_id, article_id and content are read.
 * @return void
 * @throws \RuntimeException when no production_article row with state = 0
 *                           exists for the article
 */
public function enqueueByArticleMain($main)
{
    $amId = intval($main['am_id']);
    $articleId = $main['article_id'];
    $citations = $this->extractReferences((string)$main['content']);

    $prod = Db::name('production_article')
        ->where('article_id', $articleId)
        ->where('state', 0)
        ->find();
    if (empty($prod)) {
        throw new \RuntimeException('production_article not found for article_id=' . $articleId);
    }

    $pArticleId = intval($prod['p_article_id']);
    $referMap = $this->loadReferMapByPArticleId($pArticleId);

    if (empty($citations)) {
        // Nothing to verify in this section.
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_PASS);
        return;
    }

    $queued = 0;
    $delay = 0;
    foreach ($citations as $cite) {
        foreach ($cite['reference_numbers'] as $refNo) {
            // Citation number n maps to the reference row with index n - 1.
            $referIndex = $refNo - 1;
            if ($referIndex < 0 || !isset($referMap[$referIndex])) {
                continue;
            }
            $refer = $referMap[$referIndex];
            $referText = $this->formatReferForLlm($refer);

            $now = date('Y-m-d H:i:s');
            // NOTE(review): enqueue() and enqueueByArticle() store the two texts in
            // content_a / content_b, while this method writes origin_text /
            // refer_text. The table schema is not visible here, so the original
            // column names are kept - confirm which pair the queue job reads.
            $checkId = Db::name('article_reference_check_result')->insertGetId([
                'article_id' => $articleId,
                'p_article_id' => $pArticleId,
                'am_id' => $amId,
                'reference_no' => $refNo,
                // Zero-based index, matching what enqueueByArticle() stores.
                'refer_index' => $referIndex,
                // reference_raw and the tag offsets are what the preview code uses
                // to locate and order flagged citations.
                'reference_raw' => $cite['reference_raw'],
                'cite_tag_start' => intval($cite['reference_start']),
                'cite_tag_end' => intval($cite['reference_end']),
                'origin_text' => $cite['original_text'],
                'refer_text' => $referText,
                'p_refer_id' => intval($refer['p_refer_id']),
                'text_start' => intval($cite['text_start']),
                'text_end' => intval($cite['text_end']),
                'status' => 0,
                'created_at' => $now,
                'updated_at' => $now,
            ]);
            $this->pushJob(intval($checkId), $delay);
            $queued++;
            // Stagger the jobs by one second each.
            $delay += 1;
        }
    }

    if ($queued > 0) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
    }
}
|
||||
/**
 * Scan every article_main section of an article and queue one check per
 * blue citation x reference number.
 *
 * Sections without citations are set to NONE. Sections that got at least one
 * job are set to RUNNING after all jobs have been pushed. Reference numbers
 * without a matching production_article_refer row are counted as skipped.
 *
 * @param int  $articleId     Article to scan; must be positive.
 * @param bool $clearPrevious When true, existing check rows of the article are
 *                            deleted before queuing.
 * @return array article_id, p_article_id, queued, skipped, check_ids, queue
 * @throws \InvalidArgumentException when $articleId is not positive
 * @throws \RuntimeException when the production article or the sections are missing
 */
public function enqueueByArticle($articleId, $clearPrevious = true)
{
    if ($articleId <= 0) {
        throw new \InvalidArgumentException('article_id is required');
    }

    $production = Db::name('production_article')
        ->where('article_id', $articleId)
        ->where('state', 0)
        ->find();
    if (empty($production)) {
        throw new \RuntimeException('production_article not found for article_id=' . $articleId);
    }

    $pArticleId = intval($production['p_article_id']);
    $references = $this->loadReferMapByPArticleId($pArticleId);

    $sections = Db::name('article_main')
        ->field('am_id,content')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->order('sort asc')
        ->select();
    if (empty($sections)) {
        throw new \RuntimeException('article_main is empty');
    }

    if ($clearPrevious) {
        $this->clearArticleChecks($articleId);
    }

    $summary = [
        'article_id' => $articleId,
        'p_article_id' => $pArticleId,
        'queued' => 0,
        'skipped' => 0,
        'check_ids' => [],
        'queue' => self::QUEUE_NAME,
    ];
    $sectionsWithJobs = [];
    $delay = 0;

    foreach ($sections as $section) {
        $sectionId = intval($section['am_id']);
        $cites = $this->extractReferences((string)$section['content']);
        if (empty($cites)) {
            $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_NONE);
            continue;
        }

        foreach ($cites as $cite) {
            // A citation like [70-73] yields four rows: reference_no 70, 71, 72, 73.
            foreach ($cite['reference_numbers'] as $number) {
                $slot = $number - 1;
                if ($slot < 0 || !isset($references[$slot])) {
                    $summary['skipped']++;
                    continue;
                }

                $reference = $references[$slot];
                $referenceText = $this->formatReferForLlm($reference);
                $stamp = date('Y-m-d H:i:s');

                $checkId = Db::name('article_reference_check_result')->insertGetId([
                    'article_id' => $articleId,
                    'am_id' => $sectionId,
                    'p_article_id' => $pArticleId,
                    'p_refer_id' => intval($reference['p_refer_id']),
                    'refer_index' => $slot,
                    'reference_no' => $number,
                    'reference_raw' => $cite['reference_raw'],
                    'cite_tag_start' => intval($cite['reference_start']),
                    'cite_tag_end' => intval($cite['reference_end']),
                    'text_start' => intval($cite['text_start']),
                    'text_end' => intval($cite['text_end']),
                    'content_a' => $cite['original_text'],
                    'content_b' => $referenceText,
                    'status' => 0,
                    'created_at' => $stamp,
                    'updated_at' => $stamp,
                ]);

                $this->pushJob(intval($checkId), $delay);

                $summary['check_ids'][] = $checkId;
                $summary['queued']++;
                $delay += 1;
                $sectionsWithJobs[$sectionId] = true;
            }
        }
    }

    foreach (array_keys($sectionsWithJobs) as $sectionId) {
        $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
    }

    return $summary;
}
|
||||
|
||||
/**
 * Recompute article_main.ref_check_status for one section from all of its
 * detail rows, store it, and return it.
 *
 * Rules, in order: any row with status 0 -> RUNNING; any row with status 2,
 * or status 1 with is_match = 0 -> FAIL; every row has status 1 -> PASS;
 * anything else -> FAIL. A section without rows becomes NONE.
 *
 * @param int $amId Section id; non-positive ids return NONE without touching the DB.
 * @return int One of the AM_STATUS_* constants.
 */
public function syncAmRefCheckStatus($amId)
{
    if ($amId <= 0) {
        return self::AM_STATUS_NONE;
    }

    $details = Db::name('article_reference_check_result')->where('am_id', $amId)->select();
    if (empty($details)) {
        $this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
        return self::AM_STATUS_NONE;
    }

    $pendingCount = 0;
    $doneCount = 0;
    $failed = false;

    foreach ($details as $detail) {
        switch (intval($detail['status'])) {
            case 0:
                $pendingCount++;
                break;
            case 1:
                $doneCount++;
                if (intval($detail['is_match']) === 0) {
                    $failed = true;
                }
                break;
            case 2:
                $failed = true;
                break;
        }
    }

    if ($pendingCount > 0) {
        $result = self::AM_STATUS_RUNNING;
    } elseif (!$failed && $doneCount === count($details)) {
        $result = self::AM_STATUS_PASS;
    } else {
        $result = self::AM_STATUS_FAIL;
    }

    $this->setAmRefCheckStatus($amId, $result);

    return $result;
}
|
||||
|
||||
/**
 * Write ref_check_status on one article_main row.
 *
 * @param int $amId   Section id; non-positive ids are ignored.
 * @param int $status Value to store (an AM_STATUS_* constant).
 * @return void
 */
public function setAmRefCheckStatus($amId, $status)
{
    if ($amId > 0) {
        $fields = ['ref_check_status' => $status];
        Db::name('article_main')->where('am_id', $amId)->update($fields);
    }
}
|
||||
|
||||
/**
 * Delete every check row of an article and reset ref_check_status to NONE
 * on its article_main rows whose state is 0 or 2.
 *
 * @param int $articleId
 * @return void
 */
public function clearArticleChecks($articleId)
{
    $results = Db::name('article_reference_check_result')->where('article_id', $articleId);
    $results->delete();

    $reset = ['ref_check_status' => self::AM_STATUS_NONE];
    Db::name('article_main')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2])
        ->update($reset);
}
|
||||
|
||||
/**
 * Map an AM_STATUS_* value to its short English label.
 *
 * @param int $status
 * @return string 'none', 'pass', 'fail', 'running', or 'unknown' for any other value.
 */
public static function amStatusLabel($status)
{
    $labels = [
        self::AM_STATUS_NONE => 'none',
        self::AM_STATUS_PASS => 'pass',
        self::AM_STATUS_FAIL => 'fail',
        self::AM_STATUS_RUNNING => 'running',
    ];

    if (!isset($labels[$status])) {
        return 'unknown';
    }

    return $labels[$status];
}
|
||||
|
||||
/**
 * Fetch one check row by its id.
 *
 * @param int $checkId
 * @return array|null The row, or null when the id is not positive or nothing was found.
 */
public function getResult($checkId)
{
    if ($checkId <= 0) {
        return null;
    }

    $found = Db::name('article_reference_check_result')->where('check_id', $checkId)->find();
    if (!$found) {
        return null;
    }

    return $found;
}
|
||||
|
||||
/**
 * List the check rows of an article, ordered by section, citation tag
 * position and reference number.
 *
 * @param int  $articleId
 * @param int  $status       Filter on the status column; a negative value disables the filter.
 * @param bool $onlyMismatch When true, additionally require status = 1 and is_match = 0.
 * @return array Matching rows.
 */
public function listByArticle($articleId, $status = -1, $onlyMismatch = false)
{
    $query = Db::name('article_reference_check_result')->where('article_id', $articleId);

    if ($status >= 0) {
        $query->where('status', $status);
    }

    if ($onlyMismatch) {
        $query->where('status', 1)->where('is_match', 0);
    }

    $query->order('am_id asc, cite_tag_start asc, reference_no asc');

    return $query->select();
}
|
||||
|
||||
/**
 * Manuscript preview: mark unreasonable citation numbers and their citing
 * sentences inside each section's content.
 *
 * The check rows of the article are loaded once and used both for the
 * statistics and for the index of bad results; indexBadResults() itself only
 * keeps rows with status = 1 that are not a match.
 *
 * @param int $articleId
 * @param int $amId When positive, only this section is rendered. The statistics
 *                  are still computed over every check row of the article.
 * @return array article_id, article_ref_check_pass (bool|null), sections, issues, stats
 */
public function buildArticlePreview($articleId, $amId = 0)
{
    $q = Db::name('article_main')
        ->field('am_id,content,sort,ref_check_status')
        ->where('article_id', $articleId)
        ->whereIn('state', [0, 2]);
    if ($amId > 0) {
        $q->where('am_id', $amId);
    }
    $mains = $q->order('sort asc')->select();

    // Single query: the previous second, status = 1 only, query returned a
    // subset of exactly these rows in the same order.
    $allRows = $this->listByArticle($articleId, -1);
    $badByAm = $this->indexBadResults($allRows);

    $sections = [];
    $issues = [];
    $stats = ['total' => 0, 'mismatch' => 0, 'match' => 0, 'pending' => 0];

    foreach ($allRows as $r) {
        $stats['total']++;
        if (intval($r['status']) === 0) {
            $stats['pending']++;
        } elseif (intval($r['is_match']) === 1) {
            $stats['match']++;
        } else {
            // Every non-pending row that is not a match is counted here,
            // whatever its status value.
            $stats['mismatch']++;
        }
    }

    foreach ($mains as $main) {
        $id = intval($main['am_id']);
        $content = (string)$main['content'];
        $badIndex = isset($badByAm[$id]) ? $badByAm[$id] : [];
        $marked = $this->markContentForPreview($content, $id, $badIndex);
        $amStatus = intval($this->arrGet($main, 'ref_check_status', 0));
        $sections[] = [
            'am_id' => $id,
            'ref_check_status' => $amStatus,
            'ref_check_pass' => $amStatus === self::AM_STATUS_PASS,
            'ref_check_label' => self::amStatusLabel($amStatus),
            'content' => $content,
            'content_marked' => $marked['html'],
            'issue_count' => $marked['issue_count'],
        ];
        foreach ($marked['issues'] as $issue) {
            $issues[] = $issue;
        }
    }

    $articlePass = $this->resolveArticlePass($sections);

    return [
        'article_id' => $articleId,
        'article_ref_check_pass' => $articlePass,
        'sections' => $sections,
        'issues' => $issues,
        'stats' => $stats,
    ];
}
|
||||
|
||||
/**
 * Decide whether the whole article passes. Sections whose status is NONE
 * (no citations) are ignored.
 *
 * @param array $sections Section arrays carrying a ref_check_status entry.
 * @return bool|null true when every remaining section is PASS, false as soon as
 *                   one is not, null when no section has a status other than NONE.
 */
private function resolveArticlePass($sections)
{
    $checkedStatuses = [];
    foreach ($sections as $section) {
        $status = intval($this->arrGet($section, 'ref_check_status', 0));
        if ($status !== self::AM_STATUS_NONE) {
            $checkedStatuses[] = $status;
        }
    }

    if (empty($checkedStatuses)) {
        return null;
    }

    foreach ($checkedStatuses as $status) {
        if ($status !== self::AM_STATUS_PASS) {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
/**
 * Group finished, non-matching check rows by section.
 *
 * Only rows with status = 1 and is_match != 1 are kept; rows whose am_id or
 * reference_no is not positive are dropped. Each section entry holds:
 *  - by_raw:   normalized reference_raw => reference_no => row
 *  - contexts: "<text_start>_<text_end>" => text_start, text_end, check_ids,
 *              reasons (reference_no => reason), ref_nos
 *
 * @param array $rows Check result rows.
 * @return array am_id => ['by_raw' => array, 'contexts' => array]
 */
private function indexBadResults($rows)
{
    $index = [];

    foreach ($rows as $row) {
        $isFinishedMismatch = intval($row['status']) === 1 && intval($row['is_match']) !== 1;
        if (!$isFinishedMismatch) {
            continue;
        }

        $sectionId = intval($row['am_id']);
        $number = intval($row['reference_no']);
        if ($sectionId <= 0 || $number <= 0) {
            continue;
        }

        if (!isset($index[$sectionId])) {
            $index[$sectionId] = ['by_raw' => [], 'contexts' => []];
        }

        $rawKey = $this->normalizeRefRawKey((string)$this->arrGet($row, 'reference_raw', ''));
        if ($rawKey !== '') {
            $index[$sectionId]['by_raw'][$rawKey][$number] = $row;
        }

        $from = intval($row['text_start']);
        $to = intval($row['text_end']);
        $spanKey = $from . '_' . $to;
        if (!isset($index[$sectionId]['contexts'][$spanKey])) {
            $index[$sectionId]['contexts'][$spanKey] = [
                'text_start' => $from,
                'text_end' => $to,
                'check_ids' => [],
                'reasons' => [],
                'ref_nos' => [],
            ];
        }

        $index[$sectionId]['contexts'][$spanKey]['check_ids'][] = intval($row['check_id']);
        $index[$sectionId]['contexts'][$spanKey]['ref_nos'][] = $number;

        $why = trim((string)$this->arrGet($row, 'reason', ''));
        if ($why !== '') {
            $index[$sectionId]['contexts'][$spanKey]['reasons'][$number] = $why;
        }
    }

    return $index;
}
|
||||
|
||||
/**
 * Normalize the text inside a citation tag so equivalent spellings produce
 * the same lookup key: trim, unify comma and dash variants, remove spaces,
 * lower-case.
 *
 * @param string $raw Text between the square brackets of a citation.
 * @return string
 */
private function normalizeRefRawKey($raw)
{
    // search => replacement, applied in this order
    $replacements = [
        ',' => ',',
        '–' => '-',
        '—' => '-',
        '−' => '-',
        '‐' => '-',
        '‑' => '-',
        ' ' => '',
    ];

    $unified = str_replace(array_keys($replacements), array_values($replacements), trim($raw));

    return strtolower($unified);
}
|
||||
|
||||
/**
 * Decorate one section's HTML for the preview.
 *
 * Step 1 rewrites every <blue>[...]</blue> citation tag: the tag gets class
 * "ref-cite-tag" (plus "ref-cite-error" when bad results exist for its
 * normalized text) and each number literally present in the tag that has a bad
 * result is wrapped in a "ref-no-error" span. Step 2 wraps every flagged
 * citing sentence in a "ref-context-error" span, using the stored byte offsets
 * shifted by the growth caused by step 1.
 *
 * Context offsets that no longer fit the content (for example when the text
 * changed after the check ran) are skipped instead of producing an empty span
 * at the end of the output.
 *
 * @param string $content  Section HTML.
 * @param int    $amId     Section id, copied into every issue entry.
 * @param array  $badIndex Per-section structure built by indexBadResults()
 *                         ('by_raw' and 'contexts').
 * @return array ['html' => string, 'issues' => array, 'issue_count' => int]
 */
private function markContentForPreview($content, $amId, $badIndex)
{
    $badByRaw = isset($badIndex['by_raw']) ? $badIndex['by_raw'] : [];
    $contexts = isset($badIndex['contexts']) ? $badIndex['contexts'] : [];
    $issues = [];
    $issueCount = 0;

    if ($content === '' || (empty($badByRaw) && empty($contexts))) {
        return ['html' => $content, 'issues' => [], 'issue_count' => 0];
    }

    $html = $content;

    // 1) Mark the numbers inside each blue tag first, working on the original
    //    text so the captured byte offsets stay valid.
    preg_match_all(
        '/<blue>\[([\d,\-\s]+)\]<\/blue>/',
        $html,
        $matches,
        PREG_OFFSET_CAPTURE
    );
    $citeDeltas = [];
    if (!empty($matches[0])) {
        $replacements = [];
        foreach ($matches[0] as $idx => $match) {
            $fullTag = $match[0];
            $tagStart = $match[1];
            $tagEnd = $tagStart + strlen($fullTag);
            $inner = $matches[1][$idx][0];
            $rawKey = $this->normalizeRefRawKey($inner);
            $badNums = isset($badByRaw[$rawKey]) ? $badByRaw[$rawKey] : [];

            $innerMarked = preg_replace_callback(
                '/\d+/',
                function ($numMatch) use ($badNums, &$issues, &$issueCount, $amId, $inner) {
                    $num = intval($numMatch[0]);
                    if (!isset($badNums[$num])) {
                        return $numMatch[0];
                    }
                    $row = $badNums[$num];
                    $rowReason = isset($row['reason']) ? $row['reason'] : '';
                    $issueCount++;
                    $issues[] = [
                        'am_id' => $amId,
                        'check_id' => intval($row['check_id']),
                        'reference_no' => $num,
                        'reference_raw' => $inner,
                        'reason' => $rowReason,
                        'confidence' => floatval(isset($row['confidence']) ? $row['confidence'] : 0),
                    ];
                    $title = htmlspecialchars(
                        '引用[' . $num . ']不合理: ' . $rowReason,
                        ENT_QUOTES,
                        'UTF-8'
                    );
                    return '<span class="ref-no-error" data-check-id="' . intval($row['check_id'])
                        . '" data-ref-no="' . $num . '" title="' . $title . '">'
                        . $numMatch[0] . '</span>';
                },
                $inner
            );

            $tagClass = !empty($badNums) ? ' ref-cite-error' : '';
            $groupIds = !empty($badNums)
                ? implode(',', array_map('intval', array_column($badNums, 'check_id')))
                : '';
            $newHtml = '<blue class="ref-cite-tag' . $tagClass . '" data-ref-raw="' . htmlspecialchars($inner, ENT_QUOTES, 'UTF-8')
                . '" data-check-ids="' . $groupIds . '">[' . $innerMarked . ']</blue>';
            $replacements[] = [
                'start' => $tagStart,
                'end' => $tagEnd,
                'html' => $newHtml,
                'delta' => strlen($newHtml) - ($tagEnd - $tagStart),
            ];
        }
        // Apply from the last tag to the first so earlier offsets stay valid.
        usort($replacements, function ($a, $b) {
            return $b['start'] - $a['start'];
        });
        foreach ($replacements as $rep) {
            $html = substr($html, 0, $rep['start']) . $rep['html'] . substr($html, $rep['end']);
            $citeDeltas[] = ['start' => $rep['start'], 'delta' => $rep['delta']];
        }
    }

    // Translate an offset in the original content into the rewritten HTML by
    // adding the growth of every tag that starts before it.
    $shiftByCite = function ($pos) use ($citeDeltas) {
        $d = 0;
        foreach ($citeDeltas as $cd) {
            if ($cd['start'] < $pos) {
                $d += $cd['delta'];
            }
        }
        return $pos + $d;
    };

    // 2) Then wrap the citing sentences, from the last one to the first.
    if (!empty($contexts)) {
        $spans = array_values($contexts);
        usort($spans, function ($a, $b) {
            return $b['text_start'] - $a['text_start'];
        });
        foreach ($spans as $span) {
            $start = $span['text_start'];
            $end = $span['text_end'];
            if ($start < 0 || $end <= $start) {
                continue;
            }
            $s = $shiftByCite($start);
            $e = $shiftByCite($end);
            if ($e > strlen($html)) {
                $e = strlen($html);
            }
            if ($s >= $e) {
                // Offsets point past the end of the content: there is nothing
                // to wrap, so do not emit an empty marker.
                continue;
            }
            $checkIds = array_values(array_unique($span['check_ids']));
            $refNos = array_values(array_unique($span['ref_nos']));
            sort($refNos);
            $reasonParts = [];
            foreach ($refNos as $rn) {
                if (!empty($span['reasons'][$rn])) {
                    $reasonParts[] = '[' . $rn . '] ' . $span['reasons'][$rn];
                }
            }
            $title = htmlspecialchars(
                '引用句可能不合理: ' . implode('; ', $reasonParts),
                ENT_QUOTES,
                'UTF-8'
            );
            $open = '<span class="ref-context-error" data-check-ids="' . implode(',', $checkIds)
                . '" data-ref-nos="' . implode(',', $refNos) . '" title="' . $title . '">';
            $close = '</span>';
            $html = substr($html, 0, $s) . $open . substr($html, $s, $e - $s) . $close . substr($html, $e);
        }
    }

    return ['html' => $html, 'issues' => $issues, 'issue_count' => $issueCount];
}
|
||||
|
||||
/**
 * Load the production_article_refer rows (state = 0) of a production article,
 * keyed by their integer `index` column.
 *
 * @param int $pArticleId
 * @return array index => row; empty when $pArticleId is not positive.
 */
public function loadReferMapByPArticleId($pArticleId)
{
    if ($pArticleId <= 0) {
        return [];
    }

    $referRows = Db::name('production_article_refer')
        ->where('p_article_id', $pArticleId)
        ->where('state', 0)
        ->order('index asc')
        ->select();

    $byIndex = [];
    foreach ($referRows as $referRow) {
        $byIndex[intval($referRow['index'])] = $referRow;
    }

    return $byIndex;
}
|
||||
/**
 * Render a reference row as "Label: value" lines for the LLM prompt.
 *
 * Empty fields (after trimming) are left out; the lines are joined with "\n".
 *
 * @param array $refer production_article_refer row.
 * @return string
 */
public function formatReferForLlm($refer)
{
    // output label => source column, in output order
    $labelled = [
        'Title' => 'title',
        'Author' => 'author',
        'Joura' => 'joura',
        'Dateno' => 'dateno',
        'Refer_doi' => 'refer_doi',
        'Doilink' => 'doilink',
        'Reference' => 'refer_content',
    ];

    $lines = [];
    foreach ($labelled as $label => $column) {
        $value = trim((string)$this->arrGet($refer, $column, ''));
        if ($value === '') {
            continue;
        }
        $lines[] = $label . ': ' . $value;
    }

    return implode("\n", $lines);
}
|
||||
|
||||
/**
 * Extract the blue citation tags from article_main.content.
 *
 * For every <blue>[...]</blue> tag the surrounding sentence is located and
 * returned as plain text (citation tags and other markup removed). All
 * offsets are byte offsets into $content, as produced by preg_match_all()
 * with PREG_OFFSET_CAPTURE and by strpos()/strrpos(), so the sentence is cut
 * with substr(); cutting with a character-based function would return the
 * wrong span whenever multibyte text precedes the citation.
 *
 * Tags whose sentence is empty or whose numbers cannot be parsed are skipped.
 *
 * @param string $content Section HTML.
 * @return array List of arrays with reference_raw, reference_numbers,
 *               original_text, reference_start, reference_end, text_start, text_end.
 */
public function extractReferences($content)
{
    $result = [];
    preg_match_all('/<blue>\[([\d,\-\s]+)\]<\/blue>/', $content, $matches, PREG_OFFSET_CAPTURE);
    if (empty($matches[0])) {
        return [];
    }

    foreach ($matches[0] as $index => $match) {
        $fullTag = $match[0];
        $tagStart = $match[1];
        $tagEnd = $tagStart + strlen($fullTag);
        $rawRef = trim($matches[1][$index][0]);
        $referenceNumbers = $this->expandReferenceNumbers($rawRef);

        $sentenceStart = $this->findSentenceStart($content, $tagStart);
        $sentenceEnd = $this->findSentenceEnd($content, $tagEnd);
        // Byte offsets, hence substr() rather than mb_substr().
        $originalText = substr($content, $sentenceStart, $sentenceEnd - $sentenceStart);
        $originalText = preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $originalText);
        $originalText = trim(strip_tags($originalText));

        if ($originalText === '' || empty($referenceNumbers)) {
            continue;
        }

        $result[] = [
            'reference_raw' => $rawRef,
            'reference_numbers' => $referenceNumbers,
            'original_text' => $originalText,
            'reference_start' => $tagStart,
            'reference_end' => $tagEnd,
            'text_start' => $sentenceStart,
            'text_end' => $sentenceEnd,
        ];
    }

    return $result;
}
|
||||
|
||||
/**
 * Expand the text of a citation tag into a list of distinct reference
 * numbers, in order of first appearance.
 *
 * Comma and dash variants are unified first. "a-b" parts expand to a..b when
 * a <= b (otherwise they contribute nothing); plain digit parts contribute
 * themselves; anything else is ignored.
 *
 * @param string $refStr For example "1,3-5".
 * @return int[]
 */
public function expandReferenceNumbers($refStr)
{
    $normalized = str_replace(
        [',', '–', '—', '−', '‐', '‑'],
        [',', '-', '-', '-', '-', '-'],
        trim($refStr)
    );

    // Numbers are collected as array keys, which removes duplicates while
    // keeping the order in which they were first seen.
    $seen = [];
    foreach (explode(',', $normalized) as $token) {
        $token = trim($token);
        if ($token === '') {
            continue;
        }

        if (ctype_digit($token)) {
            $seen[intval($token)] = true;
            continue;
        }

        if (!preg_match('/^(\d+)\s*-\s*(\d+)$/', $token, $bounds)) {
            continue;
        }

        $low = intval($bounds[1]);
        $high = intval($bounds[2]);
        if ($low > $high) {
            continue;
        }
        foreach (range($low, $high) as $number) {
            $seen[$number] = true;
        }
    }

    return array_keys($seen);
}
|
||||
|
||||
/**
 * Find the byte offset at which the sentence containing $position starts:
 * the position right after the last sentence delimiter located before
 * $position, or 0 when there is none.
 *
 * The offset advances by the delimiter's byte length, so a multibyte
 * delimiter such as '。' is skipped completely rather than cut after its
 * first byte.
 *
 * @param string $content
 * @param int    $position Byte offset into $content.
 * @return int Byte offset.
 */
private function findSentenceStart($content, $position)
{
    // Only the text before $position is searched; slice it once.
    $head = substr($content, 0, $position);
    $start = 0;
    foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
        $pos = strrpos($head, $delimiter);
        if ($pos !== false) {
            $start = max($start, $pos + strlen($delimiter));
        }
    }
    return $start;
}
|
||||
|
||||
/**
 * Find the byte offset at which the sentence running through $position ends:
 * the position right after the first sentence delimiter found at or after
 * $position, or the length of $content when there is none.
 *
 * The offset advances by the delimiter's byte length, so a multibyte
 * delimiter such as '。' is included completely rather than cut after its
 * first byte.
 *
 * @param string $content
 * @param int    $position Byte offset into $content.
 * @return int Byte offset.
 */
private function findSentenceEnd($content, $position)
{
    $length = strlen($content);
    $endPositions = [];
    foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
        $pos = strpos($content, $delimiter, $position);
        if ($pos !== false) {
            $endPositions[] = $pos + strlen($delimiter);
        }
    }
    return empty($endPositions) ? $length : min($endPositions);
}
|
||||
|
||||
/**
 * Push one check job onto the ReferenceCheck queue.
 *
 * The job id is written to the log instead of being dumped to the output,
 * so HTTP responses and worker output are not polluted by debug text.
 *
 * @param int $checkId      Id of the article_reference_check_result row.
 * @param int $delaySeconds When positive, the job is scheduled that many seconds later.
 * @return void
 * @throws \Exception Re-thrown after logging when the queue rejects the job.
 */
private function pushJob($checkId, $delaySeconds = 0)
{
    $jobClass = 'app\api\job\ReferenceCheck@fire';
    $data = ['check_id' => $checkId];
    try {
        if ($delaySeconds > 0) {
            $jobId = Queue::later($delaySeconds, $jobClass, $data, self::QUEUE_NAME);
        } else {
            $jobId = Queue::push($jobClass, $data, self::QUEUE_NAME);
        }
        \think\Log::info('ReferenceCheck pushJob queued check_id=' . $checkId . ' job_id=' . $jobId);
    } catch (\Exception $e) {
        \think\Log::error('ReferenceCheck pushJob failed check_id=' . $checkId . ' ' . $e->getMessage());
        throw $e;
    }
}
|
||||
}
|
||||
Reference in New Issue
Block a user