@@ -15,12 +15,20 @@ class ReferenceCheckService
{
/** Queue name reported alongside enqueued reference-check jobs. */
const QUEUE_NAME = 'ReferenceCheck' ;
/** Values of t_article_main.type: 0 = text, 1 = image, 2 = table. */
const MAIN_TYPE_TEXT = 0 ;
const MAIN_TYPE_IMAGE = 1 ;
const MAIN_TYPE_TABLE = 2 ;
/** Values of t_article_main.ref_check_status (requires running sql/article_main_ref_check_status.sql). */
const AM_STATUS_NONE = 0 ;
const AM_STATUS_PASS = 1 ;
const AM_STATUS_FAIL = 2 ;
const AM_STATUS_RUNNING = 3 ;
/** @var bool|null Whether t_article_main already has the ref_check_status column; null until it has been probed. */
private static $amRefCheckStatusColumnExists = null ;
/**
* 引用校对状态( 生命周期顺序: 0→1→2→3 = 待→进行→完成→失败)
*
@@ -52,20 +60,14 @@ class ReferenceCheckService
const PASS_CONFIDENCE_THRESHOLD = 0.65 ;
/**
* <blue>[...]</blue> 引用标签内允许的字符类(带 /u 修饰符使用)。
* 正文引用标签两种排版(带 /u) :
* 1) <blue>[8, 9]</blue>、<blue>[13-15]</blue> —— 方括号在 blue 内
* 2) [<blue>13-15</blue>] —— 方括号包裹 blue
*
* 除 ASCII 数字、半角 逗号、半角 连字符、空白外,还兼容常见 排版变体:
* , U+FF0C 全角逗号
* – U+2013 EN DASH
* — U+2014 EM DASH
* − U+2212 MINUS SIGN
* ‐ U+2010 HYPHEN
* ‑ U+2011 NON-BREAKING HYPHEN
*
* 若不支持变体连字符,会导致 [19– 21] 这种区间引用整段被 preg 漏掉,
* 进而丢失对应的 reference_no 校对记录。
* 捕获组均为序号串(可含 逗号、区间 连字符及 排版变体)。
*/
const BLUE_TAG_REGEX = '/<blue>\[([\d,, \-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)\]<\/blue>/u' ;
const BLUE_TAG_REGEX_BRACKET_OUTSIDE = '/\[<blue>([\d,, \-\x{2013}\x{2014}\x{2212}\x{2010}\x{2011}\s]+)<\/blue>\]/u' ;
/**
* 兼容无 ?? 的 PHP 版本
@@ -75,6 +77,46 @@ class ReferenceCheckService
return isset ( $arr [ $key ]) ? $arr [ $key ] : $default ;
}
/**
 * Scan $content with both blue-citation patterns and merge the hits,
 * ordered by the offset at which each full match starts in the text.
 *
 * @param string $content Text to scan.
 * @return array{0: array, 1: array} Same layout as preg_match_all with PREG_OFFSET_CAPTURE:
 *                                   index 0 holds the full matches, index 1 the first capture group.
 */
private function collectBlueTagMatches($content)
{
    $patterns = [self::BLUE_TAG_REGEX, self::BLUE_TAG_REGEX_BRACKET_OUTSIDE];

    $hits = [];
    foreach ($patterns as $regex) {
        $found = preg_match_all($regex, $content, $groups, PREG_OFFSET_CAPTURE);
        if ($found) {
            foreach ($groups[0] as $idx => $whole) {
                $hits[] = ['full' => $whole, 'inner' => $groups[1][$idx]];
            }
        }
    }

    // Order by the captured start offset of the full match.
    usort($hits, function ($left, $right) {
        return $left['full'][1] - $right['full'][1];
    });

    $fulls = [];
    $inners = [];
    foreach ($hits as $hit) {
        $fulls[] = $hit['full'];
        $inners[] = $hit['inner'];
    }

    return [$fulls, $inners];
}
/**
 * Apply preg_replace with both blue-citation patterns, one after the other.
 *
 * preg_replace returns null when PCRE fails (for example on invalid UTF-8
 * while the /u modifier is active). In that case the text from before the
 * failing pass is kept instead of being replaced by null.
 *
 * @param string|array $subject     Text to rewrite.
 * @param string       $replacement Replacement expression; may reference capture group 1.
 * @return string|array The rewritten subject, or the unchanged subject if a pass failed.
 */
private function pregReplaceBlueTags($subject, $replacement)
{
    foreach ([self::BLUE_TAG_REGEX, self::BLUE_TAG_REGEX_BRACKET_OUTSIDE] as $pattern) {
        $replaced = preg_replace($pattern, $replacement, $subject);
        // null signals a PCRE error; do not let it wipe out the caller's text
        if ($replaced !== null) {
            $subject = $replaced;
        }
    }
    return $subject;
}
/**
* 单条入队(可手工指定正文与文献文本)
*/
@@ -115,14 +157,18 @@ class ReferenceCheckService
return [ 'check_id' => $checkId , 'queued' => 1 ];
}
public function enqueueByArticleMain ( $main ){
$amId = $main [ 'am_id' ] ;
// $main = Db::name('article_main')
// ->field('am_id,content,article_id')
// ->where('am_id', $amId)
// ->whereIn('state', [0, 2])
// ->find();
$citations = $this -> extractReferences (( string ) $main [ 'content' ] );
// return $citations;
$amId = intval ( $this -> arrGet ( $main , 'am_id' , 0 )) ;
if ( $amId > 0 && ( ! isset ( $main [ 'type' ]) || ( intval ( $main [ 'type' ]) === self :: MAIN_TYPE_TABLE && intval ( $this -> arrGet ( $main , 'amt_id' , 0 )) <= 0 ))) {
$dbMain = Db :: name ( 'article_main' )
-> field ( 'am_id,content,article_id,type,amt_id' )
-> where( 'am_id' , $amId )
-> whereIn ( 'state' , [ 0 , 2 ])
-> find ( );
if ( ! empty ( $dbMain )) {
$main = array_merge ( $dbMain , $main );
}
}
$citations = $this -> extractReferencesForArticleMain ( $main );
if ( empty ( $citations )) {
$this -> setAmRefCheckStatus ( $amId , self :: AM_STATUS_NONE );
return ;
@@ -222,7 +268,7 @@ class ReferenceCheckService
$referMap = $this -> loadReferMapByPArticleId ( $pArticleId );
$mains = Db :: name ( 'article_main' )
-> field ( 'am_id,content,article_id' )
-> field ( 'am_id,content,article_id,type,amt_id ' )
-> where ( 'article_id' , $articleId )
-> whereIn ( 'state' , [ 0 , 2 ])
-> order ( 'sort asc' )
@@ -237,7 +283,7 @@ class ReferenceCheckService
$now = date ( 'Y-m-d H:i:s' );
foreach ( $mains as $main ) {
$amId = intval ( $main [ 'am_id' ]);
$citations = $this -> extractReferences(( str ing ) $main [ 'content' ] );
$citations = $this -> extractReferencesForArticleMa in ( $main );
if ( empty ( $citations )) {
$this -> setAmRefCheckStatus ( $amId , self :: AM_STATUS_NONE );
continue ;
@@ -309,7 +355,7 @@ class ReferenceCheckService
$referMap = $this -> loadReferMapByPArticleId ( $pArticleId );
$mains = Db :: name ( 'article_main' )
-> field ( 'am_id,content,article_id' )
-> field ( 'am_id,content,article_id,type,amt_id ' )
-> where ( 'article_id' , $articleId )
-> whereIn ( 'state' , [ 0 , 2 ])
-> order ( 'sort asc' )
@@ -324,7 +370,7 @@ class ReferenceCheckService
$now = date ( 'Y-m-d H:i:s' );
foreach ( $mains as $main ) {
$amId = intval ( $main [ 'am_id' ]);
$citations = $this -> extractReferences(( str ing ) $main [ 'content' ] );
$citations = $this -> extractReferencesForArticleMa in ( $main );
if ( empty ( $citations )) {
$this -> setAmRefCheckStatus ( $amId , self :: AM_STATUS_NONE );
continue ;
@@ -429,9 +475,27 @@ class ReferenceCheckService
return $status ;
}
/**
 * Whether t_article_main already has the ref_check_status column.
 *
 * Callers skip their writes when the migration has not been applied, which
 * avoids "fields not exists" errors. A successful probe is cached in a static
 * property. A probe that throws is reported as false for this call only and
 * is NOT cached, so a transient database error cannot disable status writes
 * for the rest of the process.
 *
 * @return bool
 */
private function hasAmRefCheckStatusColumn()
{
    if (self::$amRefCheckStatusColumnExists !== null) {
        return self::$amRefCheckStatusColumnExists;
    }
    try {
        $table = Db::name('article_main')->getTable();
        // Backticks in the table name are doubled before it is embedded in the statement.
        $rows = Db::query('SHOW COLUMNS FROM `' . str_replace('`', '``', $table) . '` LIKE \'ref_check_status\'');
    } catch (\Exception $e) {
        // Leave the cache unset so the next call probes again.
        return false;
    }
    self::$amRefCheckStatusColumnExists = !empty($rows);
    return self::$amRefCheckStatusColumnExists;
}
public function setAmRefCheckStatus ( $amId , $status )
{
if ( $amId <= 0 ) {
if ( $amId <= 0 || ! $this -> hasAmRefCheckStatusColumn () ) {
return ;
}
Db :: name ( 'article_main' ) -> where ( 'am_id' , $amId ) -> update ([
@@ -472,7 +536,7 @@ class ReferenceCheckService
-> where ( 'p_article_id' , $pArticleId )
-> delete ();
if ( $articleId > 0 ) {
if ( $articleId > 0 && $this -> hasAmRefCheckStatusColumn () ) {
Db :: name ( 'article_main' )
-> where ( 'article_id' , $articleId )
-> whereIn ( 'state' , [ 0 , 2 ])
@@ -498,10 +562,12 @@ class ReferenceCheckService
}
$deleted = Db :: name ( 'article_reference_check_result' ) -> where ( 'article_id' , $articleId ) -> delete ();
Db :: name ( 'article_main' )
-> wher e( 'article_id' , $articleId )
-> whereIn ( 'state' , [ 0 , 2 ] )
-> update ([ 'ref_check_status' => self :: AM_STATUS_NONE ]);
if ( $this -> hasAmRefCheckStatusColumn ()) {
Db :: nam e( 'article_main' )
-> where ( 'article_id' , $articleId )
-> whereIn ( 'state' , [ 0 , 2 ])
-> update ([ 'ref_check_status' => self :: AM_STATUS_NONE ]);
}
return intval ( $deleted );
}
@@ -669,6 +735,68 @@ class ReferenceCheckService
];
}
/**
 * When several articles are being checked at once, report how many articles
 * are queued ahead of the given one.
 *
 * An article counts as "running" while it still has at least one detail row
 * with status = RECORD_PENDING. Queue order is the ascending id of each
 * article's earliest pending detail row.
 *
 * @param int $pArticleId
 * @return array{
 *   p_article_id:int,
 *   running_total:int,
 *   ahead:int,
 *   position:int,
 *   in_queue:bool,
 *   status:int
 * }
 * @throws \InvalidArgumentException when $pArticleId is not a positive integer
 */
public function getArticleCheckQueuePositionByPArticleId($pArticleId)
{
    $pArticleId = intval($pArticleId);
    if ($pArticleId <= 0) {
        throw new \InvalidArgumentException('p_article_id is required');
    }

    $anchors = Db::name('article_reference_check_result')
        ->field('p_article_id, MIN(id) AS queue_anchor')
        ->where('status', self::RECORD_PENDING)
        ->group('p_article_id')
        ->order('queue_anchor', 'asc')
        ->select();

    // Article ids in queue order; rows without a usable id are ignored.
    $queue = [];
    foreach ($anchors as $anchor) {
        $candidate = intval($this->arrGet($anchor, 'p_article_id', 0));
        if ($candidate <= 0) {
            continue;
        }
        $queue[] = $candidate;
    }

    // Zero-based slot of the requested article, or false when it is not queued.
    $slot = array_search($pArticleId, $queue, true);
    $inQueue = ($slot !== false);

    $progress = $this->getArticleProgressStatusByPArticleId($pArticleId);

    return [
        'p_article_id' => $pArticleId,
        'running_total' => count($queue),
        'ahead' => $inQueue ? $slot : 0,
        'position' => $inQueue ? $slot + 1 : 0,
        'in_queue' => $inQueue,
        'status' => intval($this->arrGet($progress, 'status', self::ARTICLE_PROGRESS_NONE)),
    ];
}
/**
* 按 p_article_id 查整篇引用校对进度,按 reference_no 分组聚合状态,并展开每条明细。
*
@@ -820,17 +948,16 @@ class ReferenceCheckService
}
/**
* 按 p_refer_id 查这条参考文献的所有 校对明细。
* 按 p_refer_id 查这条参考文献的校对明细与分组进度 。
*
* 每条 record 返回 :
* - am_id 命中的 article_main 主键
* - confidence 匹配置信度( 0~1)
* - reason LLM 给出的判定理由
* - is_match 是否匹配(来自 article_reference_check_result.is_match)
* - is_pass 是否通过校验( confidence >= PASS_CONFIDENCE_THRESHOLD)
* 分组进度(与 referenceCheckProgressAI 单条 list 项口径一致) :
* progress_status 0待校验 1校对中 2完成 3失败
* pending/done/failed/pass、is_pass、progress_percent
*
* list 每项: check_id、am_id、status、confidence、reason、is_match、is_pass
*
* @param int $pReferId production_article_refer.p_refer_id
* @return array{p_refer_id:int, p_article_id:int, reference_no:int, total:int, list:array}
* @return array
*/
public function getCheckDetailsByPReferId ( $pReferId )
{
@@ -840,7 +967,7 @@ class ReferenceCheckService
}
$rows = Db :: name ( 'article_reference_check_result' )
-> field ( 'id,p_article_id,reference_no,am_id,confidence,is_match,reason' )
-> field ( 'id,p_article_id,reference_no,am_id,status, confidence,is_match,reason,updated_at ' )
-> where ( 'p_refer_id' , $pReferId )
-> order ( 'id asc' )
-> select ();
@@ -848,8 +975,13 @@ class ReferenceCheckService
$list = [];
$pArticleId = 0 ;
$referenceNo = 0 ;
$pending = 0 ;
$done = 0 ;
$failed = 0 ;
$pass = 0 ;
$lastUpdatedAt = '' ;
foreach ( $rows as $row ) {
// 取首条出现的 p_article_id / reference_no 作为该 refer 的上下文
if ( $pArticleId <= 0 ) {
$pArticleId = intval ( $this -> arrGet ( $row , 'p_article_id' , 0 ));
}
@@ -857,22 +989,87 @@ class ReferenceCheckService
$referenceNo = intval ( $this -> arrGet ( $row , 'reference_no' , 0 ));
}
$st = intval ( $this -> arrGet ( $row , 'status' , 0 ));
if ( $st === self :: RECORD_PENDING ) {
$pending ++ ;
} elseif ( $st === self :: RECORD_COMPLETED ) {
$done ++ ;
} elseif ( $st === self :: RECORD_FAILED ) {
$failed ++ ;
}
$upd = ( string ) $this -> arrGet ( $row , 'updated_at' , '' );
if ( $upd > $lastUpdatedAt ) {
$lastUpdatedAt = $upd ;
}
$confidence = floatval ( $this -> arrGet ( $row , 'confidence' , 0 ));
$isPass = $confidence >= self :: PASS_CONFIDENCE_THRESHOLD ;
if ( $isPass ) {
$pass ++ ;
}
$list [] = [
'check_id' => intval ( $this -> arrGet ( $row , 'id' , 0 )),
'am_id' => intval ( $this -> arrGet ( $row , 'am_id' , 0 )),
'status' => $st ,
'confidence' => $confidence ,
'reason' => ( string ) $this -> arrGet ( $row , 'reason' , '' ),
'is_match' => intval ( $this -> arrGet ( $row , 'is_match' , 0 )),
'is_pass' => $confidence >= self :: PASS_CONFIDENCE_THRESHOLD ,
'is_pass' => $isPass ,
];
}
if ( $referenceNo <= 0 ) {
$refer = Db :: name ( 'production_article_refer' )
-> where ( 'p_refer_id' , $pReferId )
-> where ( 'state' , 0 )
-> find ();
if ( ! empty ( $refer )) {
if ( $pArticleId <= 0 ) {
$pArticleId = intval ( $this -> arrGet ( $refer , 'p_article_id' , 0 ));
}
$referenceNo = intval ( $this -> arrGet ( $refer , 'index' , 0 )) + 1 ;
}
}
$total = count ( $list );
if ( $total === 0 ) {
$progressStatus = self :: PROGRESS_PENDING ;
$progressPercent = 0 ;
$isPassGroup = false ;
} elseif ( $pending === $total ) {
$progressStatus = self :: PROGRESS_PENDING ;
$progressPercent = 0 ;
$isPassGroup = false ;
} elseif ( $pending === 0 ) {
$progressStatus = $failed > 0 ? self :: PROGRESS_FAILED : self :: PROGRESS_COMPLETED ;
$progressPercent = 100 ;
$isPassGroup = (
$progressStatus === self :: PROGRESS_COMPLETED
&& $pass === $total
);
} else {
$progressStatus = self :: PROGRESS_CHECKING ;
$finished = $done + $failed ;
$progressPercent = round ( $finished / $total * 100 , 1 );
$isPassGroup = false ;
}
return [
'p_refer_id' => $pReferId ,
'p_article_id' => $pArticleId ,
'reference_no' => $referenceNo ,
'total' => count ( $list ) ,
'list ' => $list ,
'p_refer_id' => $pReferId ,
'p_article_id' => $pArticleId ,
'reference_no' => $referenceNo ,
'total' => $total ,
'pending ' => $pending ,
'done' => $done ,
'failed' => $failed ,
'pass' => $pass ,
'progress_status' => $progressStatus ,
'progress_percent' => $progressPercent ,
'is_pass' => $isPassGroup ,
'last_updated_at' => $lastUpdatedAt ,
'list' => $list ,
];
}
@@ -1010,8 +1207,12 @@ class ReferenceCheckService
*/
public function buildArticlePreview ( $articleId , $amId = 0 )
{
$fields = 'am_id,content,sort,type,amt_id' ;
if ( $this -> hasAmRefCheckStatusColumn ()) {
$fields .= ',ref_check_status' ;
}
$q = Db :: name ( 'article_main' )
-> field ( 'am_id,content,sort,ref_check_status' )
-> field ( $fields )
-> where ( 'article_id' , $articleId )
-> whereIn ( 'state' , [ 0 , 2 ]);
if ( $amId > 0 ) {
@@ -1039,7 +1240,7 @@ class ReferenceCheckService
foreach ( $mains as $main ) {
$id = intval ( $main [ 'am_id' ]);
$content = ( string ) $main [ 'content' ] ;
$content = $this -> resolveArticleMainCheckContent ( $main ) ;
$badIndex = isset ( $badByAm [ $id ]) ? $badByAm [ $id ] : array ();
$marked = $this -> markContentForPreview ( $content , $id , $badIndex );
$amStatus = intval ( $this -> arrGet ( $main , 'ref_check_status' , 0 ));
@@ -1158,12 +1359,7 @@ class ReferenceCheckService
$html = $content ;
// 1) 先标记 blue 内各序号(在原文上操作,[70-73] 仅标不合理者如 70、71)
preg_match_all (
self :: BLUE_TAG_REGEX ,
$html ,
$matches ,
PREG_OFFSET_CAPTURE
);
$matches = $this -> collectBlueTagMatches ( $html );
$citeDeltas = [];
if ( ! empty ( $matches [ 0 ])) {
$replacements = [];
@@ -1318,14 +1514,6 @@ class ReferenceCheckService
return implode ( " \n " , $parts );
}
/**
* 前端修改参考文献后重新校对:仅处理已有校对记录,刷新 refer_text、重置结果并入队; 无记录直接返回
*
* @param int $articleId
* @param int $pReferId t_production_article_refer.p_refer_id( 优先)
* @param int $referenceNo 文献序号 index+1( 无 p_refer_id 时用)
* @return array
*/
/**
* 编辑某条文献内容后,按 p_refer_id 异步重新校对该文献对应的全部 check 明细
*
@@ -1387,7 +1575,7 @@ class ReferenceCheckService
'refer_text' => $referText ,
'refer_index' => $referenceNo ,
'reference_no' => $referenceNo ,
'status' => 0 ,
'status' => self :: RECORD_PENDING ,
'is_match' => 0 ,
'can_support' => 0 ,
'confidence' => 0 ,
@@ -1401,7 +1589,6 @@ class ReferenceCheckService
foreach ( $rows as $row ) {
$checkId = $this -> resolveCheckRowId ( $row );
Db :: name ( 'article_reference_check_result' ) -> where ( 'id' , $checkId ) -> update ( $resetFields );
// 旧的队列完成标记必须清掉,否则同 check_id 再次投递会被 acquireLock 静默丢弃
$this -> clearReferenceCheckQueueLock ( $checkId );
$pendingJobs [] = [
'check_id' => $checkId ,
@@ -1432,6 +1619,92 @@ class ReferenceCheckService
];
}
/**
 * Re-run the check for the failed detail rows of one reference
 * (only rows with status = RECORD_FAILED; jobs are queued asynchronously).
 *
 * refer_text / reference_no are not refreshed: the body text and reference
 * snapshot already stored on each row are reused, and only the result
 * fields are reset before the row is queued again.
 *
 * @param int $pReferId   t_production_article_refer.p_refer_id (required)
 * @param int $pArticleId Optional; further restricts the rows to one article
 * @return array{p_refer_id:int, p_article_id:int, reset:int, queued:int, check_ids:int[], queue:string}
 * @throws \InvalidArgumentException when $pReferId is not a positive integer
 */
public function enqueueRecheckFailedByPReferId($pReferId, $pArticleId = 0)
{
    $pReferId = intval($pReferId);
    if ($pReferId <= 0) {
        throw new \InvalidArgumentException('p_refer_id is required');
    }
    $pArticleId = intval($pArticleId);

    $query = Db::name('article_reference_check_result')
        ->where('p_refer_id', $pReferId)
        ->where('status', self::RECORD_FAILED);
    if ($pArticleId > 0) {
        $query->where('p_article_id', $pArticleId);
    }
    $failedRows = $query->select();

    $summary = [
        'p_refer_id' => $pReferId,
        'p_article_id' => $pArticleId,
        'reset' => 0,
        'queued' => 0,
        'check_ids' => [],
        'queue' => self::QUEUE_NAME,
    ];
    if (empty($failedRows)) {
        return $summary;
    }

    // No article filter given: report the article of the first failed row.
    if ($pArticleId <= 0) {
        $pArticleId = intval($this->arrGet($failedRows[0], 'p_article_id', 0));
    }

    $resetFields = [
        'status' => self::RECORD_PENDING,
        'is_match' => 0,
        'can_support' => 0,
        'confidence' => 0,
        'reason' => '',
        'error_msg' => '',
        'updated_at' => date('Y-m-d H:i:s'),
    ];

    $jobs = [];
    $touchedSections = [];
    foreach ($failedRows as $failedRow) {
        $checkId = $this->resolveCheckRowId($failedRow);
        $sectionId = intval($this->arrGet($failedRow, 'am_id', 0));

        Db::name('article_reference_check_result')->where('id', $checkId)->update($resetFields);
        $this->clearReferenceCheckQueueLock($checkId);

        $jobs[] = [
            'check_id' => $checkId,
            'reference_no' => intval($this->arrGet($failedRow, 'reference_no', 0)),
            'am_id' => $sectionId,
            'text_start' => intval($this->arrGet($failedRow, 'text_start', 0)),
        ];
        if ($sectionId > 0) {
            $touchedSections[$sectionId] = true;
        }
    }

    // Each affected section is flagged as running once, however many rows it owns.
    foreach (array_keys($touchedSections) as $sectionId) {
        $this->setAmRefCheckStatus($sectionId, self::AM_STATUS_RUNNING);
    }

    $checkIds = $this->pushJobsSortedByReferenceNo($jobs);

    $summary['p_article_id'] = $pArticleId;
    $summary['reset'] = count($failedRows);
    $summary['queued'] = count($checkIds);
    $summary['check_ids'] = $checkIds;

    return $summary;
}
public function recheckByRefer ( $articleId , $pReferId = 0 , $referenceNo = 0 )
{
$articleId = intval ( $articleId );
@@ -1600,9 +1873,9 @@ class ReferenceCheckService
if ( $contentA === '' || $contentB === '' ) {
$this -> updateCheckResult ( $checkId , [
'status' => self :: RECORD_FAILED ,
'error_msg' => 'Missing article_main.content or refer_text' ,
'error_msg' => 'Missing section content (text/table) or refer_text' ,
]);
throw new \RuntimeException ( 'Missing article_main.content or refer_text' );
throw new \RuntimeException ( 'Missing section content (text/table) or refer_text' );
}
$llmResult = ( new LLMService ()) -> checkReference ( $contentA , $contentB , false );
@@ -1748,7 +2021,7 @@ class ReferenceCheckService
}
/**
* 第一次校对:取 article_main.content(整节正文)
* 第一次校对:正文 取 article_main.content;表格(type=2)取 article_main_table.table_data 等
*/
public function resolveMainContentForJob ( array $row , $maxChars = 8000 )
{
@@ -1757,23 +2030,280 @@ class ReferenceCheckService
return '' ;
}
$main = Db :: name ( 'article_main' )
-> field ( 'content' )
-> field ( 'content,type,amt_id,article_id ' )
-> where ( 'am_id' , $amId )
-> find ();
if ( empty ( $main )) {
return '' ;
}
$text = trim (( string ) $this -> arrGet ( $main , 'c ontent' , '' ));
if ( $text === '' ) {
$raw = trim ( $this -> resolveArticleMainCheckC ontent( $main ));
if ( $raw === '' ) {
return '' ;
}
$text = preg_replace ( self :: BLUE_TAG_REGEX , '[$1]' , $text );
return $this -> normalizeCheckContentForLlm ( $raw , $maxChars );
}
/**
 * Whether the section is a table section: type = 2, a positive amt_id,
 * or content that is a <table tableId='…'/> placeholder.
 *
 * @param array $main article_main row
 * @return bool
 */
private function isArticleMainTableSection(array $main)
{
    $type = intval($this->arrGet($main, 'type', self::MAIN_TYPE_TEXT));
    $amtId = intval($this->arrGet($main, 'amt_id', 0));
    if ($type === self::MAIN_TYPE_TABLE || $amtId > 0) {
        return true;
    }

    $html = (string) $this->arrGet($main, 'content', '');
    if (stripos($html, '<table') === false) {
        return false;
    }

    return (bool) preg_match('/tableId\s*=\s*[\'"]?\d+/i', $html);
}
/**
 * Resolve the amt_id of a section, taken from the amt_id field when it is
 * positive and otherwise parsed from a tableId=… placeholder in content.
 *
 * @param array $main article_main row
 * @return int The amt_id, or 0 when neither source yields one
 */
private function resolveArticleMainTableAmtId(array $main)
{
    $fromField = intval($this->arrGet($main, 'amt_id', 0));
    if ($fromField > 0) {
        return $fromField;
    }

    $placeholder = (string) $this->arrGet($main, 'content', '');
    $matched = preg_match('/tableId\s*=\s*[\'"]?(\d+)/i', $placeholder, $capture);

    return $matched ? intval($capture[1]) : 0;
}
/**
 * Load table_data, title and note of the article_main_table row that belongs
 * to the given section.
 *
 * The lookup is by amt_id with state in (0, 2), and is additionally
 * restricted to the section's article_id when that is positive.
 *
 * @param array $main article_main row
 * @return array|null The table row, or null when no amt_id resolves or nothing is found
 */
private function loadArticleMainTableRow(array $main)
{
    $tableId = $this->resolveArticleMainTableAmtId($main);
    if ($tableId <= 0) {
        return null;
    }

    $query = Db::name('article_main_table')
        ->where('amt_id', $tableId)
        ->whereIn('state', [0, 2])
        ->field('table_data,title,note');

    $ownerArticleId = intval($this->arrGet($main, 'article_id', 0));
    if ($ownerArticleId > 0) {
        $query->where('article_id', $ownerArticleId);
    }

    $found = $query->find();
    if (empty($found)) {
        return null;
    }

    return $found;
}
/**
 * Extract citations for one section. Text sections are scanned via their
 * content; table sections are scanned row by row with the cells of a row
 * joined together, so a cell holding only [n] keeps the context of its row.
 *
 * @param array $main article_main row
 * @return array Citations; empty when a table section has no loadable table row
 */
public function extractReferencesForArticleMain(array $main)
{
    if (!$this->isArticleMainTableSection($main)) {
        $body = (string) $this->arrGet($main, 'content', '');
        return $this->extractReferences($body);
    }

    $tableRow = $this->loadArticleMainTableRow($main);
    if (empty($tableRow)) {
        return [];
    }

    // Non-empty title and note are scanned ahead of the table rows.
    $leading = [];
    foreach (['title', 'note'] as $column) {
        $value = trim((string) $this->arrGet($tableRow, $column, ''));
        if ($value === '') {
            continue;
        }
        $leading[] = $value;
    }

    $tableJson = (string) $this->arrGet($tableRow, 'table_data', '');

    return $this->extractReferencesFromTableDataJson($tableJson, $leading);
}
/**
 * Extract citations from table_data row by row. $prefixChunks (title, note, …)
 * are scanned before the table rows.
 *
 * Every scanned segment advances a running offset by its byte length plus one,
 * and that offset is added to text_start / text_end / reference_start /
 * reference_end of each citation found in the segment. When table_data
 * cannot be decoded, the trimmed raw string is scanned as a single segment.
 *
 * @param string $tableDataJson JSON from article_main_table.table_data
 * @param array  $prefixChunks  Texts scanned ahead of the rows
 * @return array Citations with shifted offsets
 */
public function extractReferencesFromTableDataJson($tableDataJson, array $prefixChunks = [])
{
    // 1) Collect the text segments in scan order.
    $segments = [];
    foreach ($prefixChunks as $chunk) {
        $chunk = trim((string) $chunk);
        if ($chunk !== '') {
            $segments[] = $chunk;
        }
    }

    $json = trim((string) $tableDataJson);
    if ($json !== '') {
        $tableRows = $this->decodeTableDataJsonToArray($json);
        if ($tableRows === null) {
            // Undecodable payload: scan the raw string as-is.
            $segments[] = $json;
        } else {
            foreach ($tableRows as $tableRow) {
                $line = $this->buildTableRowCheckLine($tableRow);
                if ($line !== '') {
                    $segments[] = $line;
                }
            }
        }
    }

    // 2) Scan each segment and shift its citation offsets by the running cursor.
    $shift = function ($cite, $by) {
        foreach (['text_start', 'text_end', 'reference_start', 'reference_end'] as $key) {
            $cite[$key] = intval($cite[$key]) + $by;
        }
        return $cite;
    };

    $citations = [];
    $cursor = 0;
    foreach ($segments as $segment) {
        foreach ($this->extractReferences($segment) as $cite) {
            $citations[] = $shift($cite, $cursor);
        }
        $cursor += strlen($segment) + 1;
    }

    return $citations;
}
/**
 * Raw HTML used for queueing / the LLM: a text section yields its content,
 * a table section yields its title, note and table_data flattened row by row.
 *
 * Chunks are joined with a single "\n" so that their positions agree with
 * extractReferencesFromTableDataJson(), which advances its offset by
 * strlen(chunk) + 1 per chunk.
 *
 * @param array $main article_main row
 * @return string Content to check; '' when a table section has no loadable table row
 */
public function resolveArticleMainCheckContent(array $main)
{
    if (!$this->isArticleMainTableSection($main)) {
        return (string) $this->arrGet($main, 'content', '');
    }

    $tbl = $this->loadArticleMainTableRow($main);
    if (empty($tbl)) {
        return '';
    }

    $chunks = [];
    foreach (['title', 'note'] as $field) {
        $part = trim((string) $this->arrGet($tbl, $field, ''));
        if ($part !== '') {
            $chunks[] = $part;
        }
    }

    $flat = $this->flattenTableDataJsonToCheckContent((string) $this->arrGet($tbl, 'table_data', ''));
    if ($flat !== '') {
        $chunks[] = $flat;
    }

    // Exactly one separator byte per chunk, matching the "+ 1" of the offset arithmetic.
    return implode("\n", $chunks);
}
/**
 * Render one table row as a single line: the trimmed, non-empty "text" of
 * each cell joined with " | ", which keeps the cells of a row together.
 *
 * @param mixed $row Decoded table row; anything that is not an array yields ''
 * @return string
 */
private function buildTableRowCheckLine($row)
{
    if (!is_array($row)) {
        return '';
    }

    $texts = [];
    foreach ($row as $cell) {
        if (is_array($cell)) {
            $value = trim((string) $this->arrGet($cell, 'text', ''));
            if ($value !== '') {
                $texts[] = $value;
            }
        }
    }

    return implode(' | ', $texts);
}
/**
 * Flatten table_data row by row (for the LLM / preview). When the JSON
 * cannot be decoded, the trimmed raw string is returned unchanged.
 *
 * Rows are joined with a single "\n" so that line positions agree with
 * extractReferencesFromTableDataJson(), which advances its offset by
 * strlen(line) + 1 per row.
 *
 * @param string $tableDataJson JSON from article_main_table.table_data
 * @return string
 */
private function flattenTableDataJsonToCheckContent($tableDataJson)
{
    $tableDataJson = trim((string) $tableDataJson);
    if ($tableDataJson === '') {
        return '';
    }

    $decoded = $this->decodeTableDataJsonToArray($tableDataJson);
    if ($decoded === null) {
        return $tableDataJson;
    }

    $lines = [];
    foreach ($decoded as $row) {
        $line = $this->buildTableRowCheckLine($row);
        // Empty rows are skipped here and in the extractor alike.
        if ($line !== '') {
            $lines[] = $line;
        }
    }

    // Exactly one separator byte per row, matching the "+ 1" of the offset arithmetic.
    return implode("\n", $lines);
}
/**
 * Decode table_data into an array.
 *
 * A leading UTF-8 BOM is dropped before decoding. A payload that decodes to
 * a string is decoded a second time (double-encoded JSON).
 *
 * @param mixed $raw JSON text
 * @return array|null The decoded array, or null for empty input, invalid
 *                    JSON, or JSON that does not describe an array
 */
private function decodeTableDataJsonToArray($raw)
{
    $json = trim((string) $raw);
    if ($json === '') {
        return null;
    }

    if (preg_match('/^\xEF\xBB\xBF/', $json)) {
        $json = substr($json, 3);
    }

    $value = json_decode($json, true);
    if (json_last_error() !== JSON_ERROR_NONE) {
        return null;
    }

    if (is_string($value)) {
        // The payload was a JSON string; its content may itself be JSON.
        $value = json_decode($value, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            return null;
        }
    }

    return is_array($value) ? $value : null;
}
private function normalizeCheckContentForLlm ( $raw , $maxChars = 8000 )
{
$text = $this -> pregReplaceBlueTags ( $raw , '[$1]' );
$text = strip_tags ( $text );
$text = html_entity_decode ( $text , ENT_QUOTES | ENT_HTML5 , 'UTF-8' );
$text = preg_replace ( '/\s+/u' , ' ' , $text );
$text = trim ( $text );
if ( $text === '' ) {
return '' ;
}
$maxChars = max ( 500 , intval ( $maxChars ));
if ( mb_strlen ( $text ) > $maxChars ) {
@@ -2134,12 +2664,12 @@ class ReferenceCheckService
}
/**
* 从 article_main.content 提取 blue 引用
* 从正文 HTML 或表格展平后的 HTML 提取 blue 引用
*/
public function extractReferences ( $content )
{
$result = [];
preg_match_all ( self :: BLUE_TAG_REGEX , $content , $matches , PREG_OFFSET_CAPTURE );
$matches = $this -> collectBlueTagMatches ( $content );
if ( empty ( $matches [ 0 ])) {
return [];
}
@@ -2319,7 +2849,7 @@ class ReferenceCheckService
private function buildCitationContextText ( $content , $start , $end )
{
$text = $this -> byteSubstr ( $content , $start , $end );
$text = preg_r eplace( self :: BLUE_TAG_REGEX , '' , $text );
$text = $this -> pregR eplaceBlueTags ( $text , '' );
$text = trim ( strip_tags ( $text ));
$text = preg_replace ( '/\s+/u' , ' ' , $text );
$text = ltrim ( $text , " \xEF \xBB \xBF " );
@@ -2505,7 +3035,7 @@ class ReferenceCheckService
}
$gap = substr ( $content , $tagEnd , $end - $tagEnd );
$gapText = trim ( strip_tags ( preg_r eplace( self :: BLUE_TAG_REGEX , '' , $gap )));
$gapText = trim ( strip_tags ( $this -> pregR eplaceBlueTags ( $gap , '' )));
if ( $gapText !== '' && ! $this -> isOnlyPunctuationOrSpace ( $gapText )) {
return $end ;
}