Files
tougao/application/common/ReferenceCheckService.php

1025 lines
36 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use think\Db;
use think\Queue;
/**
* 正文 &lt;blue&gt;[n]&lt;/blue&gt; 引用与 t_production_article_referindex+1=n相关性校对。
* LLM 配置与 PromotionLlmService 相同;单条任务走 ReferenceCheck 队列。
*/
class ReferenceCheckService
{
const QUEUE_NAME = 'ReferenceCheck';
/** t_article_main.ref_check_status */
const AM_STATUS_NONE = 0;
const AM_STATUS_PASS = 1;
const AM_STATUS_FAIL = 2;
const AM_STATUS_RUNNING = 3;
/**
* 兼容无 ?? 的 PHP 版本
*/
private function arrGet($arr, $key, $default = '')
{
return isset($arr[$key]) ? $arr[$key] : $default;
}
/**
* 单条入队(可手工指定正文与文献文本)
*/
public function enqueue($contentA, $contentB, array $extra = [])
{
$contentA = trim($contentA);
if ($contentA === '') {
throw new \InvalidArgumentException('content_a is required');
}
$now = date('Y-m-d H:i:s');
$checkId = Db::name('article_reference_check_result')->insertGetId([
'article_id' => intval($this->arrGet($extra, 'article_id', 0)),
'am_id' => intval($this->arrGet($extra, 'am_id', 0)),
'p_article_id' => intval($this->arrGet($extra, 'p_article_id', 0)),
'p_refer_id' => intval($this->arrGet($extra, 'p_refer_id', 0)),
'refer_index' => intval($this->arrGet($extra, 'refer_index', 0)),
'reference_no' => intval($this->arrGet($extra, 'reference_no', 0)),
'reference_raw' => (string)$this->arrGet($extra, 'reference_raw', ''),
'cite_tag_start' => intval($this->arrGet($extra, 'cite_tag_start', 0)),
'cite_tag_end' => intval($this->arrGet($extra, 'cite_tag_end', 0)),
'text_start' => intval($this->arrGet($extra, 'text_start', 0)),
'text_end' => intval($this->arrGet($extra, 'text_end', 0)),
'content_a' => $contentA,
'content_b' => trim($contentB),
'status' => 0,
'created_at' => $now,
'updated_at' => $now,
]);
$amId = intval($this->arrGet($extra, 'am_id', 0));
if ($amId > 0) {
$this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
}
$this->pushJob(intval($checkId), intval($this->arrGet($extra, 'queue_delay', 0)));
return ['check_id' => $checkId, 'queued' => 1];
}
public function enqueueByArticleMain($main){
$amId = $main['am_id'];
// $main = Db::name('article_main')
// ->field('am_id,content,article_id')
// ->where('am_id', $amId)
// ->whereIn('state', [0, 2])
// ->find();
$citations = $this->extractReferences((string)$main['content']);
// return $citations;
if (empty($citations)) {
$this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
return;
}
$prod = Db::name('production_article')
->where('article_id', $main['article_id'])
->where('state', 0)
->find();
if (empty($prod)) {
throw new \RuntimeException('production_article not found for article_id=' . $main['article_id']);
}
$pArticleId = intval($prod['p_article_id']);
$referMap = $this->loadReferMapByPArticleId($pArticleId);
if (empty($citations)) {
$this->setAmRefCheckStatus($amId, self::AM_STATUS_PASS);
return;
}
$skipped = 0;
$delay = 0;
foreach ($citations as $cite) {
foreach ($cite['reference_numbers'] as $refNo) {
$referIndex = $refNo - 1;
if ($referIndex < 0 || !isset($referMap[$referIndex])) {
$skipped++;
continue;
}
$refer = $referMap[$referIndex];
$referText = $this->formatReferForLlm($refer);
$now = date('Y-m-d H:i:s');
// [70-73] 展开为 reference_no=70,71,72,73 共 4 条记录
$checkId = Db::name('article_reference_check_result')->insertGetId([
'article_id' => $main['article_id'],
'p_article_id' => $pArticleId,
'am_id' => intval($main['am_id']),
'reference_no' => $refNo,
'refer_index' => $refNo,
'origin_text' => $cite['original_text'],
'refer_text' => $referText,
'p_refer_id' => $referMap[$referIndex]['p_refer_id'],
'text_start' => $cite['text_start'],
'text_end' => $cite['text_end'],
'created_at' => $now,
'updated_at' => $now,
]);
$this->pushJob(intval($checkId), $delay);
$checkIds[] = $checkId;
$delay += 1;
}
}
$this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
}
public function enqueueByArticle($articleId){
if ($articleId <= 0) {
throw new \InvalidArgumentException('article_id is required');
}
$prod = Db::name('production_article')
->where('article_id', $articleId)
->where('state', 0)
->find();
if (empty($prod)) {
throw new \RuntimeException('production_article not found for article_id=' . $articleId);
}
$pArticleId = intval($prod['p_article_id']);
$referMap = $this->loadReferMapByPArticleId($pArticleId);
$mains = Db::name('article_main')
->field('am_id,content,article_id')
->where('article_id', $articleId)
->whereIn('state', [0, 2])
->order('sort asc')
->select();
if (empty($mains)) {
throw new \RuntimeException('article_main is empty');
}
$queued = 0;
$skipped = 0;
$checkIds = [];
$delay = 0;
$amIdsWithJobs = [];
foreach ($mains as $main) {
$amId = intval($main['am_id']);
$citations = $this->extractReferences((string)$main['content']);
if (empty($citations)) {
$this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
continue;
}
foreach ($citations as $cite) {
foreach ($cite['reference_numbers'] as $refNo) {
$referIndex = $refNo - 1;
if ($referIndex < 0 || !isset($referMap[$referIndex])) {
$skipped++;
continue;
}
$refer = $referMap[$referIndex];
$referText = $this->formatReferForLlm($refer);
$now = date('Y-m-d H:i:s');
// [70-73] 展开为 reference_no=70,71,72,73 共 4 条记录
$checkId = Db::name('article_reference_check_result')->insertGetId([
'article_id' => $main['article_id'],
'p_article_id' => $pArticleId,
'am_id' => intval($main['am_id']),
'reference_no' => $refNo,
'refer_index' => $refNo,
'origin_text' => $cite['original_text'],
'refer_text' => $referText,
'p_refer_id' => $referMap[$referIndex]['p_refer_id'],
'text_start' => $cite['text_start'],
'text_end' => $cite['text_end'],
'created_at' => $now,
'updated_at' => $now,
]);
$this->pushJob(intval($checkId), $delay);
$checkIds[] = $checkId;
$queued++;
$delay += 1;
$amIdsWithJobs[$amId] = true;
}
}
}
foreach (array_keys($amIdsWithJobs) as $amId) {
$this->setAmRefCheckStatus($amId, self::AM_STATUS_RUNNING);
}
return [
'article_id' => $articleId,
'p_article_id' => $pArticleId,
'queued' => $queued,
'skipped' => $skipped,
'check_ids' => $checkIds,
'queue' => self::QUEUE_NAME,
];
}
/**
* 根据该节全部明细行汇总更新 t_article_main.ref_check_status
*/
public function syncAmRefCheckStatus($amId)
{
if ($amId <= 0) {
return self::AM_STATUS_NONE;
}
$rows = Db::name('article_reference_check_result')->where('am_id', $amId)->select();
if (empty($rows)) {
$this->setAmRefCheckStatus($amId, self::AM_STATUS_NONE);
return self::AM_STATUS_NONE;
}
$pending = 0;
$hasFail = false;
$done = 0;
foreach ($rows as $row) {
$st = intval($row['status']);
if ($st === 0) {
$pending++;
continue;
}
if ($st === 2 || ($st === 1 && intval($row['is_match']) === 0)) {
$hasFail = true;
}
if ($st === 1) {
$done++;
}
}
if ($pending > 0) {
$status = self::AM_STATUS_RUNNING;
} elseif ($hasFail) {
$status = self::AM_STATUS_FAIL;
} elseif ($done === count($rows)) {
$status = self::AM_STATUS_PASS;
} else {
$status = self::AM_STATUS_FAIL;
}
$this->setAmRefCheckStatus($amId, $status);
return $status;
}
public function setAmRefCheckStatus($amId, $status)
{
if ($amId <= 0) {
return;
}
Db::name('article_main')->where('am_id', $amId)->update([
'ref_check_status' => $status,
]);
}
public function clearArticleChecks($articleId)
{
Db::name('article_reference_check_result')->where('article_id', $articleId)->delete();
Db::name('article_main')
->where('article_id', $articleId)
->whereIn('state', [0, 2])
->update(['ref_check_status' => self::AM_STATUS_NONE]);
}
public static function amStatusLabel($status)
{
$map = [
self::AM_STATUS_NONE => 'none',
self::AM_STATUS_PASS => 'pass',
self::AM_STATUS_FAIL => 'fail',
self::AM_STATUS_RUNNING => 'running',
];
return isset($map[$status]) ? $map[$status] : 'unknown';
}
public function getResult($checkId)
{
if ($checkId <= 0) {
return null;
}
$row = Db::name('article_reference_check_result')->where('check_id', $checkId)->find();
return $row ?: null;
}
public function listByArticle($articleId, $status = -1, $onlyMismatch = false)
{
$q = Db::name('article_reference_check_result')->where('article_id', $articleId);
if ($status >= 0) {
$q->where('status', $status);
}
if ($onlyMismatch) {
$q->where('status', 1)->where('is_match', 0);
}
return $q->order('am_id asc, cite_tag_start asc, reference_no asc')->select();
}
/**
* 稿件预览:在 content 上标记不合理引用序号与引用句
*
* @return array{sections: array, issues: array, stats: array}
*/
public function buildArticlePreview($articleId, $amId = 0)
{
$q = Db::name('article_main')
->field('am_id,content,sort,ref_check_status')
->where('article_id', $articleId)
->whereIn('state', [0, 2]);
if ($amId > 0) {
$q->where('am_id', $amId);
}
$mains = $q->order('sort asc')->select();
$rows = $this->listByArticle($articleId, 1);
$badByAm = $this->indexBadResults($rows);
$sections = [];
$issues = [];
$stats = ['total' => 0, 'mismatch' => 0, 'match' => 0, 'pending' => 0];
foreach ($this->listByArticle($articleId, -1) as $r) {
$stats['total']++;
if (intval($r['status']) === 0) {
$stats['pending']++;
} elseif (intval($r['is_match']) === 1) {
$stats['match']++;
} else {
$stats['mismatch']++;
}
}
foreach ($mains as $main) {
$id = intval($main['am_id']);
$content = (string)$main['content'];
$badIndex = isset($badByAm[$id]) ? $badByAm[$id] : array();
$marked = $this->markContentForPreview($content, $id, $badIndex);
$amStatus = intval($this->arrGet($main, 'ref_check_status', 0));
$sections[] = [
'am_id' => $id,
'ref_check_status' => $amStatus,
'ref_check_pass' => $amStatus === self::AM_STATUS_PASS,
'ref_check_label' => self::amStatusLabel($amStatus),
'content' => $content,
'content_marked' => $marked['html'],
'issue_count' => $marked['issue_count'],
];
foreach ($marked['issues'] as $issue) {
$issues[] = $issue;
}
}
$articlePass = $this->resolveArticlePass($sections);
return [
'article_id' => $articleId,
'article_ref_check_pass' => $articlePass,
'sections' => $sections,
'issues' => $issues,
'stats' => $stats,
];
}
/**
* 全文是否通过:各节均为 pass且无 running/fail无引用节忽略
*/
private function resolveArticlePass($sections)
{
$hasChecked = false;
foreach ($sections as $sec) {
$st = intval($this->arrGet($sec, 'ref_check_status', 0));
if ($st === self::AM_STATUS_NONE) {
continue;
}
$hasChecked = true;
if ($st !== self::AM_STATUS_PASS) {
return false;
}
}
return $hasChecked ? true : null;
}
/**
* @param array $rows status=1 的检测结果
* @return array<int, array> am_id => indexed bad map
*/
private function indexBadResults($rows)
{
$byAm = [];
foreach ($rows as $row) {
if (intval($row['status']) !== 1 || intval($row['is_match']) === 1) {
continue;
}
$amId = intval($row['am_id']);
$refNo = intval($row['reference_no']);
if ($amId <= 0 || $refNo <= 0) {
continue;
}
if (!isset($byAm[$amId])) {
$byAm[$amId] = ['by_raw' => [], 'contexts' => []];
}
$rawKey = $this->normalizeRefRawKey((string)$this->arrGet($row, 'reference_raw', ''));
if ($rawKey !== '') {
$byAm[$amId]['by_raw'][$rawKey][$refNo] = $row;
}
$ctxKey = intval($row['text_start']) . '_' . intval($row['text_end']);
if (!isset($byAm[$amId]['contexts'][$ctxKey])) {
$byAm[$amId]['contexts'][$ctxKey] = [
'text_start' => intval($row['text_start']),
'text_end' => intval($row['text_end']),
'check_ids' => [],
'reasons' => [],
'ref_nos' => [],
];
}
$byAm[$amId]['contexts'][$ctxKey]['check_ids'][] = intval($row['check_id']);
$byAm[$amId]['contexts'][$ctxKey]['ref_nos'][] = $refNo;
$reason = trim((string)$this->arrGet($row, 'reason', ''));
if ($reason !== '') {
$byAm[$amId]['contexts'][$ctxKey]['reasons'][$refNo] = $reason;
}
}
return $byAm;
}
private function normalizeRefRawKey($raw)
{
$raw = str_replace(
['', '', '—', '', '', '', ' '],
[',', '-', '-', '-', '-', '-', ''],
trim($raw)
);
return strtolower($raw);
}
/**
* @param array $badIndex indexBadResults 中单 am 的结构
*/
private function markContentForPreview($content, $amId, $badIndex)
{
$badByRaw = isset($badIndex['by_raw']) ? $badIndex['by_raw'] : array();
$contexts = isset($badIndex['contexts']) ? $badIndex['contexts'] : array();
$issues = array();
$issueCount = 0;
if ($content === '' || (empty($badByRaw) && empty($contexts))) {
return array('html' => $content, 'issues' => array(), 'issue_count' => 0);
}
$html = $content;
// 1) 先标记 blue 内各序号(在原文上操作,[70-73] 仅标不合理者如 70、71
preg_match_all(
'/<blue>\[([\d,\-\s]+)\]<\/blue>/',
$html,
$matches,
PREG_OFFSET_CAPTURE
);
$citeDeltas = [];
if (!empty($matches[0])) {
$replacements = [];
foreach ($matches[0] as $idx => $match) {
$fullTag = $match[0];
$tagStart = $match[1];
$tagEnd = $tagStart + strlen($fullTag);
$inner = $matches[1][$idx][0];
$rawKey = $this->normalizeRefRawKey($inner);
$badNums = isset($badByRaw[$rawKey]) ? $badByRaw[$rawKey] : array();
$innerMarked = preg_replace_callback(
'/\d+/',
function ($numMatch) use ($badNums, &$issues, &$issueCount, $amId, $inner) {
$num = intval($numMatch[0]);
if (!isset($badNums[$num])) {
return $numMatch[0];
}
$row = $badNums[$num];
$rowReason = isset($row['reason']) ? $row['reason'] : '';
$issueCount++;
$issues[] = array(
'am_id' => $amId,
'check_id' => intval($row['check_id']),
'reference_no' => $num,
'reference_raw' => $inner,
'reason' => $rowReason,
'confidence' => floatval(isset($row['confidence']) ? $row['confidence'] : 0),
);
$title = htmlspecialchars(
'引用[' . $num . ']不合理: ' . $rowReason,
ENT_QUOTES,
'UTF-8'
);
return '<span class="ref-no-error" data-check-id="' . intval($row['check_id'])
. '" data-ref-no="' . $num . '" title="' . $title . '">'
. $numMatch[0] . '</span>';
},
$inner
);
$tagClass = !empty($badNums) ? ' ref-cite-error' : '';
$groupIds = !empty($badNums)
? implode(',', array_map('intval', array_column($badNums, 'check_id')))
: '';
$newHtml = '<blue class="ref-cite-tag' . $tagClass . '" data-ref-raw="' . htmlspecialchars($inner, ENT_QUOTES, 'UTF-8')
. '" data-check-ids="' . $groupIds . '">[' . $innerMarked . ']</blue>';
$replacements[] = [
'start' => $tagStart,
'end' => $tagEnd,
'html' => $newHtml,
'delta' => strlen($newHtml) - ($tagEnd - $tagStart),
];
}
usort($replacements, function ($a, $b) {
return $b['start'] - $a['start'];
});
foreach ($replacements as $rep) {
$html = substr($html, 0, $rep['start']) . $rep['html'] . substr($html, $rep['end']);
$citeDeltas[] = ['start' => $rep['start'], 'delta' => $rep['delta']];
}
}
$shiftByCite = function ($pos) use ($citeDeltas) {
$d = 0;
foreach ($citeDeltas as $cd) {
if ($cd['start'] < $pos) {
$d += $cd['delta'];
}
}
return $pos + $d;
};
// 2) 再标记引用句(从后往前)
if (!empty($contexts)) {
$spans = array_values($contexts);
usort($spans, function ($a, $b) {
return $b['text_start'] - $a['text_start'];
});
foreach ($spans as $span) {
$start = $span['text_start'];
$end = $span['text_end'];
if ($start < 0 || $end <= $start) {
continue;
}
$s = $shiftByCite($start);
$e = $shiftByCite($end);
if ($e > strlen($html)) {
$e = strlen($html);
}
$checkIds = array_values(array_unique($span['check_ids']));
$refNos = array_values(array_unique($span['ref_nos']));
sort($refNos);
$reasonParts = [];
foreach ($refNos as $rn) {
if (!empty($span['reasons'][$rn])) {
$reasonParts[] = '[' . $rn . '] ' . $span['reasons'][$rn];
}
}
$title = htmlspecialchars(
'引用句可能不合理: ' . implode('; ', $reasonParts),
ENT_QUOTES,
'UTF-8'
);
$open = '<span class="ref-context-error" data-check-ids="' . implode(',', $checkIds)
. '" data-ref-nos="' . implode(',', $refNos) . '" title="' . $title . '">';
$close = '</span>';
$html = substr($html, 0, $s) . $open . substr($html, $s, $e - $s) . $close . substr($html, $e);
}
}
return ['html' => $html, 'issues' => $issues, 'issue_count' => $issueCount];
}
/**
* @return array<int, array> refer_index => row
*/
public function loadReferMapByPArticleId($pArticleId)
{
$map = [];
if ($pArticleId <= 0) {
return $map;
}
$rows = Db::name('production_article_refer')
->where('p_article_id', $pArticleId)
->where('state', 0)
->order('index asc')
->select();
foreach ($rows as $row) {
$map[intval($row['index'])] = $row;
}
return $map;
}
public function formatReferForLlm($refer)
{
$parts = [];
foreach (['title', 'author', 'joura', 'dateno', 'refer_doi', 'doilink'] as $f) {
$v = trim((string)$this->arrGet($refer, $f, ''));
if ($v !== '') {
$parts[] = ucfirst($f) . ': ' . $v;
}
}
$content = trim((string)$this->arrGet($refer, 'refer_content', ''));
if ($content !== '') {
$parts[] = 'Reference: ' . $content;
}
return implode("\n", $parts);
}
/**
* 从 article_main.content 提取 blue 引用
*/
public function extractReferences($content)
{
$result = [];
preg_match_all('/<blue>\[([\d,\-\s]+)\]<\/blue>/', $content, $matches, PREG_OFFSET_CAPTURE);
if (empty($matches[0])) {
return [];
}
$tagSpans = [];
foreach ($matches[0] as $index => $match) {
$tagSpans[] = [
'start' => $match[1],
'end' => $match[1] + strlen($match[0]),
'index' => $index,
];
}
foreach ($matches[0] as $index => $match) {
$fullTag = $match[0];
$tagStart = $match[1];
$tagEnd = $tagStart + strlen($fullTag);
$rawRef = trim($matches[1][$index][0]);
$referenceNumbers = $this->expandReferenceNumbers($rawRef);
list($localStart, $localEnd, $originalText) = $this->extractLocalCitationContext(
$content,
$tagStart,
$tagEnd,
$tagSpans
);
if (!$this->isMeaningfulCitationContext($originalText) || empty($referenceNumbers)) {
continue;
}
$result[] = [
'reference_raw' => $rawRef,
'reference_numbers' => $referenceNumbers,
'original_text' => $originalText,
'reference_start' => $tagStart,
'reference_end' => $tagEnd,
'text_start' => $localStart,
'text_end' => $localEnd,
];
}
return $result;
}
/**
* 按引用位置截取局部上下文:优先取标签前叙述;同句多引时后续引用从上一标签后开始。
*/
private function extractLocalCitationContext($content, $tagStart, $tagEnd, array $tagSpans)
{
$paragraphStart = $this->findParagraphStart($content, $tagStart);
$sentenceEnd = $this->findSentenceEnd($content, $tagEnd, $tagEnd);
$prevTagEnd = $paragraphStart;
$nextTagStart = $sentenceEnd;
foreach ($tagSpans as $span) {
if ($span['end'] <= $tagStart && $span['end'] > $prevTagEnd) {
$prevTagEnd = $span['end'];
}
if ($span['start'] > $tagEnd && $span['start'] < $nextTagStart) {
$nextTagStart = $span['start'];
}
}
$hasPriorCiteInParagraph = ($prevTagEnd > $paragraphStart);
// 同段后续引用:从上一 <blue> 后开始;段内首个引用:从段落开头到本标签前(非仅最后一句)
if ($hasPriorCiteInParagraph) {
$localStart = $prevTagEnd;
} else {
$localStart = $this->capContextStartBeforeTag($content, $tagStart, $paragraphStart);
}
// 默认:引用标签前的论述(如 Yin et al. [13] → 含 “higher than … Yin et al.”)
$localEnd = $tagStart;
$originalText = $this->buildCitationContextText($content, $localStart, $localEnd);
// 标签前几乎无正文(如句末 … ICU nurses [14])→ 改用标签后至下一引用或句末
if (!$this->isMeaningfulCitationContext($originalText)
|| $this->shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)
) {
$trailEnd = ($nextTagStart < $sentenceEnd) ? $nextTagStart : $sentenceEnd;
$trailText = $this->buildCitationContextText($content, $tagEnd, $trailEnd);
if ($this->isMeaningfulCitationContext($trailText)) {
$localStart = $tagEnd;
$localEnd = $trailEnd;
$originalText = $trailText;
}
}
if (!$this->isMeaningfulCitationContext($originalText)) {
list($localStart, $localEnd) = $this->widenCitationContextBounds(
$content,
$tagStart,
$tagEnd,
$localStart,
$localEnd
);
$originalText = $this->buildCitationContextText($content, $localStart, $localEnd);
}
return [$localStart, $localEnd, $originalText];
}
/**
* 标签前仅有作者缩写等极短片段时,改用标签后上下文
*/
private function shouldUseTrailingCitationContext($content, $localStart, $tagStart, $tagEnd)
{
$before = $this->buildCitationContextText($content, $localStart, $tagStart);
if (!$this->isMeaningfulCitationContext($before)) {
return true;
}
return mb_strlen($before) < 25;
}
public function expandReferenceNumbers($refStr)
{
$refStr = str_replace(
['', '', '—', '', '', ''],
[',', '-', '-', '-', '-', '-'],
trim($refStr)
);
$numbers = [];
foreach (explode(',', $refStr) as $part) {
$part = trim($part);
if ($part === '') {
continue;
}
if (preg_match('/^(\d+)\s*-\s*(\d+)$/', $part, $m)) {
$start = intval($m[1]);
$end = intval($m[2]);
if ($start <= $end) {
$numbers = array_merge($numbers, range($start, $end));
}
} elseif (ctype_digit($part)) {
$numbers[] = intval($part);
}
}
return array_values(array_unique($numbers));
}
/**
* 返回 $bytePos 处 UTF-8 码点占用的最后一字节之后的位置(下一字符起始)
*/
private function utf8CharEnd($content, $bytePos)
{
$len = strlen($content);
if ($bytePos < 0 || $bytePos >= $len) {
return max(0, min($len, $bytePos + 1));
}
$next = $bytePos + 1;
while ($next < $len && (ord($content[$next]) & 0xC0) === 0x80) {
$next++;
}
return $next;
}
/**
* 按字节偏移截取(与 strpos/strlen 一致);勿用 mb_substr否则遇中文前缀会截断英文词头
*/
private function byteSubstr($content, $start, $end)
{
$length = max(0, $end - $start);
if ($length === 0) {
return '';
}
return (string)mb_strcut($content, $start, $length, 'UTF-8');
}
private function buildCitationContextText($content, $start, $end)
{
$text = $this->byteSubstr($content, $start, $end);
$text = preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $text);
$text = trim(strip_tags($text));
$text = preg_replace('/\s+/u', ' ', $text);
$text = ltrim($text, "\xEF\xBB\xBF");
return $text;
}
/**
* 过滤仅标点、过短或无字母/汉字的上下文(如去掉标签后只剩 "."
*/
private function isMeaningfulCitationContext($text)
{
$text = trim($text);
if ($text === '') {
return false;
}
if ($this->isOnlyPunctuationOrSpace($text)) {
return false;
}
if (!preg_match('/[\p{L}\p{N}]/u', $text)) {
return false;
}
return mb_strlen($text) >= 2;
}
private function isOnlyPunctuationOrSpace($text)
{
return preg_match('/^[\s\p{P}\p{S}]+$/u', $text) === 1;
}
/**
* 首句过短时向前后各扩展一句(上限约 2000 字符)
*/
private function widenCitationContextBounds($content, $tagStart, $tagEnd, $start, $end)
{
$len = strlen($content);
$maxSpan = 2000;
if ($start > 0) {
$prevStart = $this->findSentenceStart($content, max(0, $start - 1));
if ($prevStart < $start) {
$start = $prevStart;
}
}
$nextEnd = $this->findSentenceEnd($content, $end, $tagEnd);
if ($nextEnd > $end && $nextEnd <= $len) {
$end = $nextEnd;
}
if ($end - $start > $maxSpan) {
$half = (int)floor($maxSpan / 2);
$mid = (int)floor(($tagStart + $tagEnd) / 2);
$start = max(0, $mid - $half);
$end = min($len, $start + $maxSpan);
}
return [$start, $end];
}
/**
* 句号是否可作为句界排除小数点、et al. 等缩写)
*/
private function isSentenceDelimiterAt($content, $pos, $delimiter)
{
$len = strlen($content);
if ($delimiter !== '.' || $pos < 0 || $pos >= $len) {
return true;
}
if ($pos > 0 && $pos + 1 < $len
&& ctype_digit($content[$pos - 1])
&& ctype_digit($content[$pos + 1])
) {
return false;
}
$before = substr($content, max(0, $pos - 12), min(12, $pos));
if (preg_match('/\b(et\s+al|e\.g|i\.e|vs|etc|fig|no)\s*\.?\s*$/i', $before)) {
return false;
}
$after = substr($content, $pos + 1, 24);
if (preg_match('/^\s*<blue>\s*\[/', $after)) {
return false;
}
return true;
}
/**
* 段落起始HTML / 换行),避免英文多句段落只取到最后一个句号后的一句
*/
private function findParagraphStart($content, $tagStart)
{
$search = substr($content, 0, max(0, $tagStart));
if ($search === '') {
return 0;
}
$best = 0;
if (preg_match_all('/<p[^>]*>/i', $search, $m, PREG_OFFSET_CAPTURE)) {
$last = end($m[0]);
$best = max($best, $last[1] + strlen($last[0]));
}
if (preg_match_all('/<\/p>\s*/i', $search, $m, PREG_OFFSET_CAPTURE)) {
$last = end($m[0]);
$best = max($best, $last[1] + strlen($last[0]));
}
if (preg_match_all('/<br\s*\/?>\s*/i', $search, $m, PREG_OFFSET_CAPTURE)) {
$last = end($m[0]);
$best = max($best, $last[1] + strlen($last[0]));
}
$pos = strrpos($search, "\n\n");
if ($pos !== false) {
$best = max($best, $pos + 2);
}
$pos = strrpos($search, "\n");
if ($pos !== false) {
$best = max($best, $pos + 1);
}
return $best;
}
/**
* 段落过长时从引用处向前截取上限,避免单次 LLM 上下文过大
*/
private function capContextStartBeforeTag($content, $tagStart, $paragraphStart, $maxBytes = 2500)
{
if ($tagStart - $paragraphStart <= $maxBytes) {
return $paragraphStart;
}
$start = $tagStart - $maxBytes;
$slice = substr($content, $start, $tagStart - $start);
if (preg_match('/[.!?。!?]\s+/u', $slice, $m, PREG_OFFSET_CAPTURE)) {
$last = end($m[0]);
$rel = $last[1] + strlen($last[0]);
return $start + $rel;
}
return max($paragraphStart, $start);
}
private function findSentenceStart($content, $position)
{
$start = 0;
foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
$pos = strrpos(substr($content, 0, $position), $delimiter);
if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
$start = max($start, $this->utf8CharEnd($content, $pos));
}
}
return $start;
}
/**
* @param int $searchFrom 从该字节位置起查找句末
* @param int $tagEnd 引用标签结束位置;用于跳过 </blue> 后紧跟的孤立句号
*/
private function findSentenceEnd($content, $searchFrom, $tagEnd = 0)
{
$length = strlen($content);
$minPos = max(0, $searchFrom);
while ($minPos < $length) {
$endPositions = [];
foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
$pos = strpos($content, $delimiter, $minPos);
if ($pos !== false && $this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
$endPositions[] = $this->utf8CharEnd($content, $pos);
}
}
if (empty($endPositions)) {
return $length;
}
$end = min($endPositions);
if ($tagEnd <= 0 || $end <= $tagEnd) {
return $end;
}
$gap = substr($content, $tagEnd, $end - $tagEnd);
$gapText = trim(strip_tags(preg_replace('/<blue>\[[\d,\-\s]+\]<\/blue>/', '', $gap)));
if ($gapText !== '' && !$this->isOnlyPunctuationOrSpace($gapText)) {
return $end;
}
$minPos = $end;
}
return $length;
}
private function pushJob($checkId, $delaySeconds = 0)
{
$jobClass = 'app\api\job\ReferenceCheck@fire';
$data = ['check_id' => $checkId];
try {
if ($delaySeconds > 0) {
$jobId = Queue::later($delaySeconds, $jobClass, $data, self::QUEUE_NAME);
} else {
$jobId = Queue::push($jobClass, $data, self::QUEUE_NAME);
}
var_dump("=====jobId:".$jobId);
} catch (\Exception $e) {
\think\Log::error('ReferenceCheck pushJob failed check_id=' . $checkId . ' ' . $e->getMessage());
throw $e;
}
}
}