\s*\[/', $after)) {
return false;
}
return true;
}
/**
 * Locate the byte offset at which the paragraph containing $tagStart begins.
 *
 * Paragraph boundaries are taken from HTML markers (<p ...>, </p>, <br>) and
 * from line breaks. Using the paragraph rather than the last full stop keeps
 * a multi-sentence English paragraph from being reduced to the single
 * sentence that follows its final period.
 *
 * @param string $content  Full text being scanned.
 * @param int    $tagStart Byte offset of the reference tag; only text before it is searched.
 * @return int Byte offset just after the closest preceding boundary, or 0 when none exists.
 */
private function findParagraphStart($content, $tagStart)
{
    $search = (string) substr($content, 0, max(0, $tagStart));
    if ($search === '') {
        return 0;
    }

    // Only block-level markers count as boundaries. The \b after "p" keeps
    // tags such as <pre> or <param> from being treated as <p>; whitespace
    // that trails </p> or <br> is skipped so the result points at real text.
    $boundaryPatterns = [
        '/<p\b[^>]*>/i',
        '/<\/p>\s*/i',
        '/<br\s*\/?>\s*/i',
    ];

    $best = 0;
    foreach ($boundaryPatterns as $pattern) {
        if (preg_match_all($pattern, $search, $m, PREG_OFFSET_CAPTURE)) {
            // The last match is the one nearest to the tag.
            $last = end($m[0]);
            $best = max($best, $last[1] + strlen($last[0]));
        }
    }

    // A blank line ("\n\n") needs no separate probe: the offset after the
    // last "\n" is always at or beyond the offset after the last "\n\n".
    $pos = strrpos($search, "\n");
    if ($pos !== false) {
        $best = max($best, $pos + 1);
    }

    return $best;
}
/**
 * Cap how far back the context may start when the paragraph is very long,
 * so that a single LLM request does not receive an oversized context.
 *
 * When the distance from $paragraphStart to $tagStart exceeds $maxBytes, the
 * start is moved to at most $maxBytes before the tag and then advanced to
 * the first sentence boundary (sentence mark followed by whitespace) found
 * in that window, if any.
 *
 * @param string $content        Full text.
 * @param int    $tagStart       Byte offset of the reference tag.
 * @param int    $paragraphStart Byte offset where the paragraph begins.
 * @param int    $maxBytes       Maximum number of bytes to keep before the tag.
 * @return int Byte offset at which the context should start.
 */
private function capContextStartBeforeTag($content, $tagStart, $paragraphStart, $maxBytes = 2500)
{
    if ($tagStart - $paragraphStart <= $maxBytes) {
        return $paragraphStart;
    }

    $start = $tagStart - $maxBytes;

    // A raw byte offset can fall inside a multibyte UTF-8 character. Skip
    // continuation bytes (10xxxxxx) so the slice is valid UTF-8; otherwise
    // the /u match below fails outright and the fallback would hand back an
    // offset that cuts a character in half.
    while ($start < $tagStart && isset($content[$start]) && (ord($content[$start]) & 0xC0) === 0x80) {
        $start++;
    }

    $slice = substr($content, $start, $tagStart - $start);

    // ASCII . ! ? plus ideographic full stop (U+3002) and fullwidth ! ?
    // (U+FF01, U+FF1F), each followed by whitespace.
    if (preg_match('/[.!?\x{3002}\x{FF01}\x{FF1F}]\s+/u', $slice, $m, PREG_OFFSET_CAPTURE)) {
        // Offsets reported by PREG_OFFSET_CAPTURE are byte offsets into $slice.
        return $start + $m[0][1] + strlen($m[0][0]);
    }

    return max($paragraphStart, $start);
}
/**
 * Find the byte offset where the sentence containing $position begins.
 *
 * For every delimiter the text before $position is searched backwards for
 * the nearest occurrence that isSentenceDelimiterAt() accepts; the sentence
 * starts right after the closest accepted delimiter. Occurrences that are
 * rejected are skipped and the search continues further back, so a rejected
 * mark does not hide an earlier, genuine sentence boundary.
 *
 * @param string $content  Full text.
 * @param int    $position Byte offset inside the sentence of interest.
 * @return int Byte offset of the sentence start, or 0 when no delimiter precedes $position.
 */
private function findSentenceStart($content, $position)
{
    // The searched prefix is the same for every delimiter; build it once.
    $prefix = (string) substr($content, 0, $position);
    $prefixLength = strlen($prefix);

    $start = 0;
    foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
        $pos = strrpos($prefix, $delimiter);
        while ($pos !== false) {
            if ($this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
                $start = max($start, $this->utf8CharEnd($content, $pos));
                break;
            }
            if ($pos === 0) {
                break;
            }
            // Negative offset: the next match may begin no later than
            // byte $pos - 1, i.e. strictly before the rejected occurrence.
            $pos = strrpos($prefix, $delimiter, $pos - 1 - $prefixLength);
        }
    }

    return $start;
}
/**
 * Find the byte offset just past the end of the sentence that follows $searchFrom.
 *
 * The earliest delimiter accepted by isSentenceDelimiterAt() ends the
 * sentence. When $tagEnd is given and the text between the tag end and that
 * delimiter holds nothing but markup, punctuation or whitespace, the
 * delimiter is treated as a stray mark trailing the tag and the search
 * continues after it.
 *
 * @param string $content    Full text.
 * @param int    $searchFrom Byte offset from which to look for the sentence end.
 * @param int    $tagEnd     Byte offset where the reference tag ends; 0 disables
 *                           the stray-mark handling described above.
 * @return int Byte offset just after the sentence-ending delimiter, or
 *             strlen($content) when no delimiter is found.
 */
private function findSentenceEnd($content, $searchFrom, $tagEnd = 0)
{
    $length = strlen($content);
    $minPos = max(0, $searchFrom);

    while ($minPos < $length) {
        $end = null;
        foreach (['.', '。', '!', '?', "\n"] as $delimiter) {
            $pos = strpos($content, $delimiter, $minPos);
            // Step over occurrences that are not real sentence ends instead
            // of giving up on this delimiter after the first rejection.
            while ($pos !== false && !$this->isSentenceDelimiterAt($content, $pos, $delimiter)) {
                $next = $pos + strlen($delimiter);
                $pos = $next < $length ? strpos($content, $delimiter, $next) : false;
            }
            if ($pos !== false) {
                $candidate = $this->utf8CharEnd($content, $pos);
                if ($end === null || $candidate < $end) {
                    $end = $candidate;
                }
            }
        }

        if ($end === null) {
            return $length;
        }
        if ($tagEnd <= 0 || $end <= $tagEnd) {
            return $end;
        }

        // Text between the tag end and the candidate sentence end.
        // NOTE(review): pregReplaceBlueTags() is assumed to remove the
        // citation markup from the gap - confirm against its definition.
        $gap = substr($content, $tagEnd, $end - $tagEnd);
        $gapText = trim(strip_tags($this->pregReplaceBlueTags($gap, '')));
        if ($gapText !== '' && !$this->isOnlyPunctuationOrSpace($gapText)) {
            return $end;
        }

        // Nothing meaningful in the gap: keep looking after this delimiter.
        // The max() guarantees forward progress even if $end did not advance.
        $minPos = max($end, $minPos + 1);
    }

    return $length;
}
/**
 * Once a set of check records has been stored, create the article batch for
 * them and hand it to the queue (RabbitMQ) via startArticleCheckQueue().
 *
 * Ids are kept in the order in which they appear in $rows; this method does
 * not sort them itself. Entries whose check_id is not a positive integer are
 * ignored, and nothing is queued when no valid id remains.
 *
 * @param array  $rows       Each element carries a check_id.
 * @param int    $pArticleId
 * @param string $trigger    enqueue|recheck_failed|manual
 * @return int[] List of check_id values that were queued.
 */
private function enqueueChecksSortedByReferenceNo(array $rows, $pArticleId = 0, $trigger = 'enqueue')
{
    $candidateIds = array_map(
        function ($row) {
            return intval($row['check_id']);
        },
        $rows
    );
    $checkIds = array_values(array_filter(
        $candidateIds,
        function ($id) {
            return $id > 0;
        }
    ));

    if ($checkIds === []) {
        return $checkIds;
    }

    $this->startArticleCheckQueue($checkIds, intval($pArticleId), $trigger);

    return $checkIds;
}
/**
 * Create a reference-check batch for an article. The batch is published to
 * MQ immediately only when no earlier batch is still waiting and no batch is
 * currently running; otherwise it stays queued until the batches ahead of it
 * have finished.
 *
 * Ids are normalised to positive, distinct integers before the batch is
 * created, matching the "> 0" rule used by enqueueChecksSortedByReferenceNo()
 * and keeping total_count equal to the number of distinct checks.
 *
 * @param int[]  $checkIds
 * @param int    $pArticleId When not positive, it is looked up from the first check record.
 * @param string $trigger
 * @return int[] The normalised check ids (empty when none were valid).
 * @throws \RuntimeException When no p_article_id can be determined.
 */
public function startArticleCheckQueue(array $checkIds, $pArticleId = 0, $trigger = 'enqueue')
{
    $checkIds = array_values(array_unique(array_filter(
        array_map('intval', $checkIds),
        function ($id) {
            return $id > 0;
        }
    )));
    if (empty($checkIds)) {
        return [];
    }

    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        // Fall back to the article recorded on the first check row.
        $firstRow = Db::name('article_reference_check_result')->where('id', $checkIds[0])->find();
        $pArticleId = empty($firstRow) ? 0 : intval($this->arrGet($firstRow, 'p_article_id', 0));
    }
    if ($pArticleId <= 0) {
        throw new \RuntimeException('p_article_id is required for reference check queue');
    }

    $now = date('Y-m-d H:i:s');
    $batchId = Db::name('article_reference_check_batch')->insertGetId([
        'p_article_id' => $pArticleId,
        'batch_status' => 0,
        'total_count' => count($checkIds),
        'done_count' => 0,
        'failed_count' => 0,
        'trigger' => (string)$trigger,
        'created_at' => $now,
        'updated_at' => $now,
    ]);

    // Publish right away only if this batch is at the head of the queue.
    $shouldPublish = !$this->hasEarlierWaitingBatch($batchId) && !$this->hasRunningReferenceCheckBatch();
    if ($shouldPublish) {
        (new ReferenceCheckMqPublisher())->publishArticleStart($pArticleId, intval($batchId), $trigger);
        $this->log('startArticleCheckQueue publish p_article_id=' . $pArticleId . ' batch_id=' . $batchId);
    } else {
        $this->log('startArticleCheckQueue queued batch_id=' . $batchId . ' p_article_id=' . $pArticleId);
    }

    return $checkIds;
}
/**
 * Tell whether any reference-check batch currently has batch_status = 1,
 * the value this class treats as "running".
 *
 * @return bool
 */
private function hasRunningReferenceCheckBatch()
{
    $runningCount = Db::name('article_reference_check_batch')
        ->where('batch_status', 1)
        ->count();

    return $runningCount > 0;
}
/**
 * Tell whether a batch with a smaller id than $batchId still has
 * batch_status = 0, the value this class treats as "waiting".
 *
 * @param int $batchId
 * @return bool
 */
private function hasEarlierWaitingBatch($batchId)
{
    $currentId = intval($batchId);
    $waitingAhead = Db::name('article_reference_check_batch')
        ->where('batch_status', 0)
        ->where('id', '<', $currentId)
        ->count();

    return $waitingAhead > 0;
}
/**
 * Append one timestamped line to the file named by $this->logFile.
 *
 * Logging is best effort: a failed write is suppressed on purpose so that
 * it can never interrupt the caller.
 *
 * @param string $msg Message text; timestamp and line ending are added here.
 * @return void
 */
public function log($msg)
{
    $entry = sprintf('%s %s%s', date('Y-m-d H:i:s'), $msg, PHP_EOL);
    @file_put_contents($this->logFile, $entry, FILE_APPEND);
}
}