自动推广
This commit is contained in:
@@ -8,6 +8,8 @@ use think\Queue;
|
||||
use think\Validate;
|
||||
use PhpOffice\PhpWord\IOFactory;
|
||||
use app\common\OpenAi;
|
||||
use app\common\CrossrefService;
|
||||
use app\common\PubmedService;
|
||||
|
||||
/**
|
||||
* @title 文章接口
|
||||
@@ -17,9 +19,17 @@ class Article extends Base
|
||||
{
|
||||
|
||||
|
||||
/**
|
||||
* @var CrossrefService
|
||||
*/
|
||||
private $crossService;
|
||||
private $pubmedService;
|
||||
|
||||
/**
 * Build the controller and create the two lookup services it uses.
 *
 * @param \think\Request|null $request Current request, forwarded unchanged to the parent constructor.
 */
public function __construct(\think\Request $request = null)
{
    parent::__construct($request);

    // Instantiated once per controller; used by the citation / PubMed test endpoints.
    $this->crossService  = new CrossrefService();
    $this->pubmedService = new PubmedService();
}
|
||||
|
||||
|
||||
@@ -169,16 +179,6 @@ class Article extends Base
|
||||
return jsonSuccess($re);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Debug helper: calls addProductionEx() with the hard-coded id "3689",
 * dumps the result inside a <pre> element and terminates the request.
 */
public function myttt()
{
    $dump = $this->addProductionEx("3689");

    echo "<pre>";
    var_dump($dump);
    // exit() prints its string argument before terminating.
    exit("</pre>");
}
|
||||
|
||||
/**获取预接收内容状态
|
||||
* @return void
|
||||
*/
|
||||
@@ -1053,6 +1053,36 @@ class Article extends Base
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Test endpoint: run the Crossref citation QC for one article.
 *
 * POST parameters:
 *  - article_id (required)
 *
 * Loads the article's active main-text rows and the active references of its
 * production record, then passes both to CrossrefService::qcArticleCitations().
 *
 * @return mixed jsonSuccess(['res' => ...]) on success, jsonError(...) otherwise.
 */
public function testCheckArticleCitation()
{
    $data = $this->request->post();
    $rule = new Validate([
        "article_id" => "require",
    ]);
    if (!$rule->check($data)) {
        return jsonError($rule->getError());
    }
    $articleId = $data['article_id'];

    // Without a production record there is no p_article_id to look references up by.
    $production = $this->production_article_obj->where("article_id", $articleId)->find();
    if (empty($production)) {
        return jsonError('Production article not found');
    }

    $mains = $this->article_main_obj
        ->where("article_id", $articleId)
        ->where("state", 0)
        ->select();
    $refers = $this->production_article_refer_obj
        ->where("p_article_id", $production['p_article_id'])
        ->where("state", 0)
        ->order("index asc")
        ->select();

    $res = $this->crossService->qcArticleCitations($mains, $refers);
    return jsonSuccess(['res' => $res]);
}
|
||||
|
||||
/**
 * Test endpoint: look a DOI up through the PubMed service.
 *
 * POST parameters:
 *  - doi (required)
 *
 * @return mixed jsonSuccess(['res' => ...]) with the service result, or jsonError(...) on validation failure.
 */
public function testCheckArticlePubmed()
{
    $post = $this->request->post();

    $validator = new Validate(["doi" => "require"]);
    if ($validator->check($post)) {
        $lookup = $this->pubmedService->fetchByDoi($post['doi']);
        return jsonSuccess(['res' => $lookup]);
    }

    return jsonError($validator->getError());
}
|
||||
|
||||
/**
|
||||
* @title 发送留言板消息
|
||||
* @description 发送留言板消息
|
||||
@@ -3940,7 +3970,14 @@ class Article extends Base
|
||||
|
||||
/**
 * Debug endpoint: triggers ai_scor() for the posted article and returns the
 * Crossref work summary for the posted DOI.
 *
 * POST parameters:
 *  - doi (required)
 *  - article_id (read, but not covered by the validation rule)
 */
public function ffff()
{
    $post = $this->request->post();

    // NOTE(review): article_id is used before validation and is not part of the
    // rule below — confirm this call is still intended.
    $this->ai_scor($post['article_id']);

    $validator = new Validate(["doi" => "require"]);
    if ($validator->check($post)) {
        return jsonSuccess($this->crossService->fetchWorkSummary($post['doi']));
    }

    return jsonError($validator->getError());
}
|
||||
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ use app\api\controller\Base;
|
||||
use think\Db;
|
||||
use PhpOffice\PhpWord\IOFactory;
|
||||
use think\Exception;
|
||||
use think\Validate;
|
||||
use \app\common\ArticleParserService;
|
||||
/**
|
||||
* @title 自动投稿控制器
|
||||
@@ -102,6 +103,23 @@ class Contribute extends Base
|
||||
return $result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Test endpoint: extract the reference list from an article's latest manuscript file.
 *
 * POST parameters:
 *  - article_id (required)
 *
 * @return mixed jsonSuccess(...) with the parser result, or jsonError(...) when
 *               validation fails or no manuscript file is stored for the article.
 */
public function myTestArticle()
{
    $data = $this->request->post();
    $rule = new Validate([
        'article_id' => "require",
    ]);
    if (!$rule->check($data)) {
        return jsonError($rule->getError());
    }

    // Newest file first. The type_name value is kept exactly as the original query spells it.
    $files = $this->article_file_obj
        ->where("article_id", $data['article_id'])
        ->where("type_name", "manuscirpt")
        ->order("file_id desc")
        ->limit(1)
        ->select();
    $first = isset($files[0]) ? $files[0] : null;
    if (empty($first['file_url'])) {
        return jsonError('Manuscript file not found');
    }

    // Strip a leading "public/" directory only. ltrim($s, 'public') would treat the
    // argument as a character mask and eat leading letters of e.g. "upload/...".
    $relative = ltrim($first['file_url'], '/');
    if (strpos($relative, 'public/') === 0) {
        $relative = substr($relative, strlen('public/'));
    }
    $sFileUrl = rtrim(ROOT_PATH, '/') . '/public/' . $relative;

    $res = ArticleParserService::getReferencesFromWord($sFileUrl);
    return jsonSuccess($res);
}
|
||||
|
||||
/**
|
||||
* 组装数据插入相关数据表
|
||||
* @param array $aParam
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
namespace app\api\controller;
|
||||
use app\api\controller\Base;
|
||||
use think\Db;
|
||||
use think\Validate;
|
||||
class Crossrefdoi extends Base{
|
||||
|
||||
public function __construct(\think\Request $request = null) {
|
||||
@@ -99,6 +100,21 @@ class Crossrefdoi extends Base{
|
||||
return json_encode(['status' => 1,'msg' => 'Update successful']);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Look up a single DOI.
 *
 * POST parameters:
 *  - doi (required)
 *
 * The posted value is passed through filterValidDoi() and the result is handed
 * to fetchSingleDoiWithRetry().
 *
 * @return mixed jsonSuccess(...) with the lookup result, or jsonError(...) on validation failure.
 */
public function getOneDoi()
{
    $post = $this->request->post();

    $validator = new Validate(["doi" => "require"]);
    if (!$validator->check($post)) {
        return jsonError($validator->getError());
    }

    // Filter out illegal DOIs before querying.
    $sFiltered = $this->filterValidDoi($post['doi']);
    $sCheckDoi = $this->fetchSingleDoiWithRetry($sFiltered);

    return jsonSuccess($sCheckDoi);
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* 过滤非法DOI(仅保留10.xxxx/xxx格式)
|
||||
*/
|
||||
|
||||
@@ -2335,6 +2335,8 @@ class EmailClient extends Base
|
||||
->where('jpf.state', 0)
|
||||
->where('ef_fetch.state', 0)
|
||||
->column('ef_fetch.field');
|
||||
|
||||
|
||||
$fields = array_unique(array_filter(array_map('trim', $fields)));
|
||||
|
||||
if (empty($fields)) {
|
||||
|
||||
@@ -43,28 +43,36 @@ class ExpertManage extends Base
|
||||
$pageSize = max(1, intval(isset($data['pageSize']) ? $data['pageSize'] : 20));
|
||||
|
||||
$query = Db::name('expert')->alias('e');
|
||||
$countQuery = Db::name('expert')->alias('e');
|
||||
$needJoin = ($field !== '');
|
||||
|
||||
if ($needJoin) {
|
||||
$query->join('t_expert_field ef', 'ef.expert_id = e.expert_id AND ef.state = 0', 'inner');
|
||||
$countQuery->join('t_expert_field ef', 'ef.expert_id = e.expert_id AND ef.state = 0', 'inner');
|
||||
if ($field !== '') {
|
||||
$query->where('ef.field', 'like', '%' . $field . '%');
|
||||
$countQuery->where('ef.field', 'like', '%' . $field . '%');
|
||||
}
|
||||
$query->group('e.expert_id');
|
||||
$countQuery->group('e.expert_id');
|
||||
}
|
||||
|
||||
if ($state !== '-1' && $state !== '') {
|
||||
$query->where('e.state', intval($state));
|
||||
$countQuery->where('e.state', intval($state));
|
||||
}
|
||||
if ($keyword !== '') {
|
||||
$query->where('e.name|e.email|e.affiliation', 'like', '%' . $keyword . '%');
|
||||
$countQuery->where('e.name|e.email|e.affiliation', 'like', '%' . $keyword . '%');
|
||||
}
|
||||
if ($source !== '') {
|
||||
$query->where('e.source', $source);
|
||||
$countQuery->where('e.source', $source);
|
||||
}
|
||||
|
||||
$countQuery = clone $query;
|
||||
$total = $countQuery->distinct('e.expert_id')->count();
|
||||
// $countQuery = clone $query;
|
||||
// $total = $countQuery->distinct('e.expert_id')->count();
|
||||
$total = $needJoin ? count($countQuery->group('e.expert_id')->column('e.expert_id')) : $countQuery->count();
|
||||
|
||||
$list = $query
|
||||
->field('e.*')
|
||||
|
||||
695
application/api/controller/References.php
Normal file
695
application/api/controller/References.php
Normal file
@@ -0,0 +1,695 @@
|
||||
<?php
|
||||
|
||||
namespace app\api\controller;
|
||||
|
||||
use app\api\controller\Base;
|
||||
use app\common\CitationRelevanceService;
|
||||
use app\common\CrossrefService;
|
||||
use app\common\PubmedService;
|
||||
use think\Db;
|
||||
use think\Env;
|
||||
/**
|
||||
* @title 参考文献
|
||||
* @description 相关方法汇总
|
||||
*/
|
||||
class References extends Base
|
||||
{
|
||||
/**
 * Build the controller.
 *
 * SECURITY: this class declares a literal OpenAI API key as the default value of
 * $sApiKey. A key committed to source control should be treated as leaked: rotate
 * it and remove the literal. Until then, an `openai_api_key` environment value,
 * when set, takes precedence over the literal so deployments can already switch
 * to a private key without a code change.
 *
 * @param \think\Request|null $request Current request, forwarded to the parent constructor.
 */
public function __construct(\think\Request $request = null) {
    parent::__construct($request);

    $sEnvKey = trim((string)Env::get('openai_api_key', ''));
    if ($sEnvKey !== '') {
        $this->sApiKey = $sEnvKey;
    }
}
|
||||
//OPENAI token
|
||||
private $sApiKey = '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';
|
||||
//OPENAI URL
|
||||
private $sApiUrl = 'https://api.openai.com/v1/chat/completions';
|
||||
/**
 * Fetch one reference row and attach a display string under "deal_content".
 *
 * Parameters (from $aParam, or from the POST body when $aParam is empty):
 *  - p_refer_id: primary key of the reference (required)
 *
 * The display string defaults to the row's refer_frag. For a journal reference
 * with a doilink, and for any book reference, it is rebuilt as
 * "author.title.joura.dateno.Available at:\n<link>", where the author list is cut
 * to the first three names plus ", et al" when it holds more than three.
 *
 * @param array $aParam
 * @return string JSON: status 1 with the row in "data", otherwise an error status and msg.
 */
public function get($aParam = [])
{
    if (empty($aParam)) {
        $aParam = $this->request->post();
    }

    // Required value check.
    $iPReferId = empty($aParam['p_refer_id']) ? '' : $aParam['p_refer_id'];
    if (empty($iPReferId)) {
        return json_encode(['status' => 2, 'msg' => 'Please select the reference to be queried']);
    }

    $aRefer = Db::name('production_article_refer')
        ->where(['p_refer_id' => $iPReferId, 'state' => 0])
        ->find();
    if (empty($aRefer)) {
        return json_encode(['status' => 4, 'msg' => 'Reference is empty']);
    }

    // The owning article must pass getArticle()'s checks.
    $aParam['p_article_id'] = $aRefer['p_article_id'];
    $aLookup = $this->getArticle($aParam);
    $iLookupStatus = empty($aLookup['status']) ? 0 : $aLookup['status'];
    if ($iLookupStatus != 1) {
        return json_encode($aLookup);
    }
    if (empty($aLookup['data'])) {
        return json_encode(['status' => 3, 'msg' => 'The article does not exist']);
    }

    // Column value with surrounding whitespace and dots removed ('' when empty).
    $fnField = function ($sKey) use ($aRefer) {
        return empty($aRefer[$sKey]) ? '' : trim(trim($aRefer[$sKey]), '.');
    };
    // Keep at most three comma-separated authors, appending ", et al" when cut.
    $fnAuthors = function ($sAuthor) {
        if (empty($sAuthor)) {
            return $sAuthor;
        }
        $aNames = explode(',', $sAuthor);
        if (count($aNames) > 3) {
            return implode(',', array_slice($aNames, 0, 3)) . ', et al';
        }
        return implode(',', $aNames);
    };
    // Column value as an https://doi.org/ URL ('' when empty).
    $fnDoiUrl = function ($sKey) use ($aRefer) {
        $sUrl = empty($aRefer[$sKey]) ? '' : trim($aRefer[$sKey]);
        if (!empty($sUrl)) {
            if (strpos($sUrl, "http") === false) {
                $sUrl = "https://doi.org/" . $sUrl;
            }
            $sUrl = str_replace('http://doi.org/', 'https://doi.org/', $sUrl);
        }
        return $sUrl;
    };
    // author.title.journal.date.Available at:\n<link>
    $fnCitation = function ($sLink) use ($fnField, $fnAuthors) {
        return $fnAuthors($fnField('author')) . '.' . $fnField('title') . '.' . $fnField('joura')
            . '.' . $fnField('dateno') . ".Available at:\n" . $sLink;
    };

    $sData = $aRefer['refer_frag'];
    if ($aRefer['refer_type'] == 'journal' && !empty($aRefer['doilink'])) {
        $sLink = $fnDoiUrl('doilink');
        $sReferDoi = $fnDoiUrl('refer_doi');
        // refer_doi is used only when doilink trims down to nothing.
        if (empty($sLink)) {
            $sLink = $sReferDoi;
        }
        $sData = $fnCitation($sLink);
    }
    if ($aRefer['refer_type'] == 'book') {
        // Books use the isbn column as their locator, without URL normalisation.
        $sData = $fnCitation(empty($aRefer['isbn']) ? '' : trim($aRefer['isbn']));
    }

    $aRefer['deal_content'] = $sData;
    return json_encode(['status' => 1, 'msg' => 'success', 'data' => $aRefer]);
}
|
||||
|
||||
/**
 * Reference check: body citation context + PubMed/Crossref + model-based similarity.
 *
 * Parameters (from $aParam, or from the POST body when $aParam is empty):
 *  - p_refer_id (required)
 *
 * Environment values read: citation_chat_api_key (required), citation_chat_url,
 * citation_chat_model, citation_chat_timeout, citation_embedding_dim,
 * pubmed_email, pubmed_tool, crossref_mailto.
 *
 * The citation mark searched for in the article body is the reference's
 * index + 1, i.e. "[n]".
 *
 * @param array $aParam
 * @return mixed jsonSuccess(...) with the QC result, jsonError(...) on failure, or the
 *               JSON-encoded getArticle() error when the article lookup fails.
 */
public function checkCitationRelevance($aParam = [])
{
    if (empty($aParam)) {
        $aParam = $this->request->post();
    }
    $pReferId = intval(isset($aParam['p_refer_id']) ? $aParam['p_refer_id'] : 0);
    if (!$pReferId) {
        return jsonError('p_refer_id is required');
    }

    $refer = Db::name('production_article_refer')
        ->where('p_refer_id', $pReferId)
        ->where('state', 0)
        ->find();
    if (empty($refer)) {
        return jsonError('Reference not found');
    }

    $lookup = $this->getArticle(['p_article_id' => $refer['p_article_id']]);
    if ((empty($lookup['status']) ? 0 : $lookup['status']) != 1) {
        return json_encode($lookup);
    }
    $article = empty($lookup['data']) ? [] : $lookup['data'];
    if (empty($article['article_id'])) {
        return jsonError('Article not found');
    }

    $mains = Db::name('article_main')
        ->where('article_id', intval($article['article_id']))
        ->whereIn('state', [0, 2])
        ->order('sort asc')
        ->select();
    if (empty($mains)) {
        return jsonError('article_main is empty');
    }

    // Body citations are 1-based while the stored index is 0-based.
    $referIndex = intval($refer['index']);
    $citationMark = $referIndex + 1;
    $context = $this->extractCitationContextFromMains($mains, $citationMark);
    if ($context === '') {
        return jsonError('Citation context not found in article_main for mark [' . $citationMark . ']');
    }

    $apiKey = trim((string)Env::get('citation_chat_api_key', ''));
    if ($apiKey === '') {
        return jsonError('Please set env citation_chat_api_key for embedding via chat');
    }

    $config = [];
    $config['chat_url'] = trim((string)Env::get('citation_chat_url', 'http://chat.taimed.cn/v1/chat/completions'));
    $config['chat_model'] = trim((string)Env::get('citation_chat_model', 'DeepSeek-Coder-V2-Instruct'));
    // Lower bounds: at least 60 for the timeout, at least 32 for the embedding dimension.
    $config['timeout'] = max(60, intval(Env::get('citation_chat_timeout', 180)));
    $config['embedding_dim'] = max(32, intval(Env::get('citation_embedding_dim', 256)));
    $config['embedding_headers'] = ['Authorization: Bearer ' . $apiKey];

    $pubmed = new PubmedService([
        'email' => trim((string)Env::get('pubmed_email', '')),
        'tool'  => trim((string)Env::get('pubmed_tool', 'tmrjournals')),
    ]);
    $crossref = new CrossrefService([
        'mailto' => trim((string)Env::get('crossref_mailto', '')),
    ]);
    $checker = new CitationRelevanceService($pubmed, $crossref, $config);

    $qc = $checker->checkOne($context, $refer, []);

    $payload = [
        'p_refer_id'    => $pReferId,
        'citation_mark' => $citationMark,
        'refer_index'   => $referIndex,
        'context'       => $context,
    ];
    // Copy the QC fields, substituting a default for any the service left out.
    $defaults = [
        'problem_flag'    => '',
        'problem_reason'  => '',
        'relevance_flag'  => '',
        'relevance_score' => 0,
        'reason'          => '',
        'pubmed'          => [],
    ];
    foreach ($defaults as $field => $fallback) {
        $payload[$field] = $qc[$field] ?? $fallback;
    }

    return jsonSuccess($payload);
}
|
||||
|
||||
/**
 * Build plain text from the article_main rows and return the first sentence that
 * contains the marker "[n]" together with one sentence before and after it.
 *
 * Each row's "content" has <blue> tags and all other markup removed, HTML entities
 * decoded and whitespace collapsed; non-empty rows are joined with newlines before
 * sentence splitting.
 *
 * @param array $mains        Rows carrying a "content" value.
 * @param int   $citationMark The n in "[n]"; values <= 0 yield ''.
 * @return string The whitespace-normalised context, or '' when the marker is not found.
 */
private function extractCitationContextFromMains(array $mains, int $citationMark): string
{
    if ($citationMark <= 0) {
        return '';
    }

    $paragraphs = [];
    foreach ($mains as $main) {
        $plain = isset($main['content']) ? (string)$main['content'] : '';
        if ($plain === '') {
            continue;
        }
        $plain = preg_replace('/<\s*\/?\s*blue[^>]*>/i', '', $plain);
        $plain = html_entity_decode(strip_tags($plain), ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $plain = preg_replace('/\s+/u', ' ', trim($plain));
        if ($plain !== '') {
            $paragraphs[] = $plain;
        }
    }

    $fullText = implode("\n", $paragraphs);
    if ($fullText === '') {
        return '';
    }

    $sentences = $this->splitEnglishSentences($fullText);
    $lastIndex = count($sentences) - 1;
    $needle = '/\[' . preg_quote((string)$citationMark, '/') . '\]/';

    foreach ($sentences as $pos => $sentence) {
        if (preg_match($needle, $sentence)) {
            // Window of up to three sentences, clamped to the list bounds.
            $from = max(0, $pos - 1);
            $to = min($lastIndex, $pos + 1);
            $window = array_slice($sentences, $from, $to - $from + 1);
            return trim(preg_replace('/\s+/u', ' ', implode(' ', $window)));
        }
    }

    return '';
}
|
||||
|
||||
/**
 * Split text into sentences at whitespace that follows ".", "?" or "!".
 *
 * Whitespace runs are collapsed to single spaces first; empty pieces are dropped.
 *
 * @param string $text
 * @return array List of trimmed, non-empty sentence strings ([] for blank input).
 */
private function splitEnglishSentences(string $text): array
{
    $text = trim($text);
    if ($text === '') {
        return [];
    }

    $normalized = preg_replace('/\s+/u', ' ', $text);

    $sentences = [];
    foreach (preg_split('/(?<=[\.\?\!])\s+/', $normalized) as $piece) {
        $piece = trim((string)$piece);
        if ($piece === '') {
            continue;
        }
        $sentences[] = $piece;
    }

    return $sentences;
}
|
||||
|
||||
/**
 * Update one reference from an edited citation string.
 *
 * Parameters (from $aParam, or from the POST body when $aParam is empty):
 *  - p_refer_id: primary key of the reference (required)
 *  - content:    edited citation text (required, must be a string)
 *
 * The text is split into fields by dealContent(); those fields plus the raw text
 * (refer_content), is_change = 1 and update_time are written to the row.
 *
 * @param array $aParam
 * @return string JSON with status 1 on success, otherwise an error status and msg.
 */
public function modify($aParam = [])
{
    if (empty($aParam)) {
        $aParam = $this->request->post();
    }

    // Required value checks.
    $iPReferId = empty($aParam['p_refer_id']) ? '' : $aParam['p_refer_id'];
    if (empty($iPReferId)) {
        return json_encode(['status' => 2, 'msg' => 'Please select the reference to be queried']);
    }
    $sContent = empty($aParam['content']) ? '' : $aParam['content'];
    if (empty($sContent)) {
        return json_encode(['status' => 2, 'msg' => 'Please enter the modification content']);
    }
    if (!is_string($sContent)) {
        return json_encode(['status' => 2, 'msg' => 'The content format is incorrect']);
    }

    // The reference must exist and be active.
    $aRowFilter = ['p_refer_id' => $iPReferId, 'state' => 0];
    $aRefer = Db::name('production_article_refer')->where($aRowFilter)->find();
    if (empty($aRefer)) {
        return json_encode(['status' => 4, 'msg' => 'Reference is empty']);
    }

    // The owning article must pass getArticle()'s checks.
    $aParam['p_article_id'] = $aRefer['p_article_id'];
    $aLookup = $this->getArticle($aParam);
    $iLookupStatus = empty($aLookup['status']) ? 0 : $aLookup['status'];
    if ($iLookupStatus != 1) {
        return json_encode($aLookup);
    }
    if (empty($aLookup['data'])) {
        return json_encode(['status' => 3, 'msg' => 'The article does not exist']);
    }

    // Split the text into columns.
    $aParsed = json_decode($this->dealContent(['content' => $sContent]), true);
    $aUpdate = empty($aParsed['data']) ? [] : $aParsed['data'];
    if (empty($aUpdate)) {
        return json_encode(['status' => 5, 'msg' => 'The content format is incorrect']);
    }
    $aUpdate['refer_content'] = $sContent;
    $aUpdate['is_change'] = 1;
    $aUpdate['update_time'] = time();

    $mResult = Db::name('production_article_refer')->where($aRowFilter)->limit(1)->update($aUpdate);
    if ($mResult === false) {
        return json_encode(['status' => 6, 'msg' => 'Update failed']);
    }
    return json_encode(['status' => 1, 'msg' => 'success']);
}
|
||||
|
||||
|
||||
/**
 * Split a citation string into reference columns.
 *
 * Parameters (from $aParam, or from the POST body when $aParam is empty):
 *  - content: citation text (required, must be a string)
 *
 * The text is split on "."; the first four segments map to author, title, joura
 * and dateno (empty segments are skipped), and everything after them is treated
 * as the DOI / link part. When the text contains no "." at all, the call succeeds
 * with an empty data array.
 *
 * @param array $aParam
 * @return string JSON: status 1 with the parsed columns in "data", or an error status and msg.
 */
public function dealContent($aParam = [])
{
    if (empty($aParam)) {
        $aParam = $this->request->post();
    }

    // Required value checks.
    $sContent = empty($aParam['content']) ? '' : $aParam['content'];
    if (empty($sContent)) {
        return json_encode(['status' => 2, 'msg' => 'Please enter the modification content']);
    }
    if (!is_string($sContent)) {
        return json_encode(['status' => 2, 'msg' => 'The content format is incorrect']);
    }

    // NOTE(review): both search entries are the same character as seen here; one
    // was possibly a full-width question mark originally — confirm against the real file.
    $aSegments = explode('.', str_replace(['?','?'], '.', $sContent));

    $aUpdate = [];
    if (count($aSegments) <= 1) {
        return json_encode(['status' => 1, 'msg' => 'success', 'data' => $aUpdate]);
    }

    $aColumns = [0 => 'author', 1 => 'title', 2 => 'joura', 3 => 'dateno'];
    foreach (array_slice($aSegments, 0, 4) as $iPos => $sSegment) {
        if (!empty($sSegment)) {
            $aUpdate[$aColumns[$iPos]] = trim(trim($sSegment), '.');
        }
    }

    // Everything after the fourth segment is the link part (re-joined with the dots removed by explode).
    $aTail = array_slice($aSegments, 4);
    $sDoi = empty($aTail) ? '' : implode('.', $aTail);

    // Prefer explicit http/https URLs when the tail contains any.
    $urlPattern = '/https?:\/\/[^\s<>"]+|http?:\/\/[^\s<>"]+/i';
    preg_match_all($urlPattern, $sDoi, $aUrlHits);
    if (!empty($aUrlHits[0])) {
        $sDoi = implode(',', array_unique($aUrlHits[0]));
    }
    if (empty($sDoi)) {
        return json_encode(['status' => 4, 'msg' => 'Reference DOI is empty']);
    }

    $sDoi = trim(trim($sDoi), ':');
    if (strpos($sDoi, "http") === false) {
        $sDoi = "https://doi.org/" . $sDoi;
    }
    $sDoi = str_replace('http://doi.org/', 'https://doi.org/', $sDoi);
    $aUpdate['doilink'] = $sDoi;

    // When a bare DOI can be isolated, store it and rebuild the link from it.
    $doiPattern = '/\b10\.\d+(?:\.\d+)*\/[^\s?#&=]+/i';
    if (preg_match($doiPattern, $sDoi, $aDoiHit)) {
        $aUpdate['doi'] = $aDoiHit[0];
        $aUpdate['doilink'] = 'https://doi.org/' . '' . $aUpdate['doi'];
    } else {
        $aUpdate['doi'] = $sDoi;
    }

    // The author column is stored with a trailing dot.
    if (!empty($aUpdate['author'])) {
        $aUpdate['author'] = trim(trim($aUpdate['author'])) . '.';
    }

    return json_encode(['status' => 1, 'msg' => 'success', 'data' => $aUpdate]);
}
|
||||
|
||||
/**
 * Resolve a production article id to its article row.
 *
 * Parameters (from $aParam, or from the POST body when $aParam is empty):
 *  - p_article_id (required)
 *
 * The production_article row must have state 0 or 2, and the linked article
 * row must have state 5 or 6.
 *
 * @param array $aParam
 * @return array ['status' => 1, 'msg' => 'success', 'data' => ['article_id' => ..., 'p_article_id' => ...]]
 *               on success, otherwise ['status' => 2|3, 'msg' => ...].
 */
private function getArticle($aParam = [])
{
    if (empty($aParam)) {
        $aParam = $this->request->post();
    }

    $iPArticleId = empty($aParam['p_article_id']) ? 0 : $aParam['p_article_id'];
    if (empty($iPArticleId)) {
        return ['status' => 2, 'msg' => 'Please select the article to query'];
    }

    // Production record -> article id.
    $aProduction = Db::name('production_article')
        ->field('article_id')
        ->where(['p_article_id' => $iPArticleId, 'state' => ['in', [0, 2]]])
        ->find();
    if (empty($aProduction['article_id'])) {
        return ['status' => 2, 'msg' => 'No articles found'];
    }

    // The article itself must be in state 5 or 6.
    $aArticle = Db::name('article')
        ->field('article_id')
        ->where(['article_id' => $aProduction['article_id'], 'state' => ['in', [5, 6]]])
        ->find();
    if (empty($aArticle)) {
        return ['status' => 3, 'msg' => 'The article does not exist or has not entered the editorial reference status'];
    }

    $aArticle['p_article_id'] = $iPArticleId;
    return ['status' => 1, 'msg' => 'success', 'data' => $aArticle];
}
|
||||
/**
 * Queue AI DOI checks for an article's references that have no doilink yet.
 *
 * Parameters (from $aParam, or from the POST body when $aParam is empty):
 *  - p_article_id (required, validated by getArticle())
 *
 * Each active reference with an empty doilink and a usable refer_doi is pushed
 * onto the AiCheckReferByDoi queue.
 *
 * @param array $aParam
 * @return string JSON with status 1 once the loop has run, otherwise an error status and msg.
 */
public function checkByAi($aParam = [])
{
    if (empty($aParam)) {
        $aParam = $this->request->post();
    }

    $aLookup = $this->getArticle($aParam);
    $iLookupStatus = empty($aLookup['status']) ? 0 : $aLookup['status'];
    if ($iLookupStatus != 1) {
        return json_encode($aLookup);
    }
    $aArticle = empty($aLookup['data']) ? [] : $aLookup['data'];
    if (empty($aArticle)) {
        return json_encode(['status' => 3, 'msg' => 'The article does not exist']);
    }

    // Active references of this article that still lack a doilink.
    $aRefer = Db::name('production_article_refer')
        ->field('p_refer_id,p_article_id,refer_type,refer_content,doilink,refer_doi')
        ->where(['p_article_id' => $aArticle['p_article_id'], 'state' => 0, 'doilink' => ''])
        ->select();
    if (empty($aRefer)) {
        return json_encode(['status' => 4, 'msg' => 'No reference information found']);
    }

    foreach ($aRefer as $aRow) {
        // NOTE(review): isbn is not in the selected field list above, so the book
        // condition never matches here — confirm whether it should be selected.
        $bSkip = empty($aRow['refer_doi'])
            || $aRow['refer_doi'] == 'Not Available'
            || ($aRow['refer_type'] == 'journal' && !empty($aRow['doilink']))
            || ($aRow['refer_type'] == 'book' && !empty($aRow['isbn']));
        if ($bSkip) {
            continue;
        }
        // Hand the row to the reference-detail queue job.
        \think\Queue::push('app\api\job\AiCheckReferByDoi@fire', $aRow, 'AiCheckReferByDoi');
    }

    return json_encode(['status' => 1, 'msg' => 'Successfully joined the AI inspection DOI queue']);
}
|
||||
/**
 * Ask the AI for the citation data of one reference's DOI and store the result.
 *
 * Parameters (from $aParam, or from the POST body when $aParam is empty):
 *  - p_refer_id: primary key of the reference (required)
 *
 * Flow: validate the reference and its article, call curlOpenAIByDoi(), insert the
 * raw answer into production_article_refer_ai, then update the reference row and
 * the log row inside one transaction.
 *
 * Changes compared with the previous version:
 *  - the transaction is rolled back when the update fails or throws (it used to be
 *    left open);
 *  - for book references the returned link is stored in `isbn` (the column get()
 *    reads for books) instead of overwriting `refer_type`;
 *  - `author` is accepted as either an array or a plain string;
 *  - the log row now records exactly the fields returned by the AI.
 *
 * @param array $aParam
 * @return string JSON with status 1 on success, otherwise an error status and msg.
 * @throws \Exception Database errors are re-thrown after the transaction is rolled back.
 */
public function getCheckByAiResult($aParam = [])
{
    $aParam = empty($aParam) ? $this->request->post() : $aParam;

    // Required value check.
    $iPReferId = empty($aParam['p_refer_id']) ? '' : $aParam['p_refer_id'];
    if (empty($iPReferId)) {
        return json_encode(['status' => 2, 'msg' => 'Please select the reference to be queried']);
    }

    // Load the reference.
    // NOTE(review): isbn is not part of this field list, so the "book" short-circuit
    // below never fires — confirm whether it should be selected.
    $aWhere = ['p_refer_id' => $iPReferId, 'state' => 0];
    $aRefer = Db::name('production_article_refer')
        ->field('p_refer_id,p_article_id,refer_type,refer_content,doilink,refer_doi,state,dateno')
        ->where($aWhere)
        ->find();
    if (empty($aRefer)) {
        return json_encode(['status' => 4, 'msg' => 'Reference is empty' . json_encode($aParam)]);
    }
    if (empty($aRefer['refer_doi'])) {
        return json_encode(['status' => 4, 'msg' => 'Reference DOI is empty' . json_encode($aParam)]);
    }
    if ($aRefer['refer_type'] == 'journal' && !empty($aRefer['doilink'])) {
        // A dateno containing ":" is treated as already parsed.
        $aDateno = empty($aRefer['dateno']) ? [] : explode(':', $aRefer['dateno']);
        if (count($aDateno) > 1) {
            return json_encode(['status' => 4, 'msg' => 'No need to parse again-journal' . json_encode($aParam)]);
        }
    }
    if ($aRefer['refer_type'] == 'book' && !empty($aRefer['isbn'])) {
        return json_encode(['status' => 4, 'msg' => 'No need to parse again-book' . json_encode($aParam)]);
    }

    // The owning article must pass getArticle()'s checks.
    $aParam['p_article_id'] = $aRefer['p_article_id'];
    $aArticle = $this->getArticle($aParam);
    $iStatus = empty($aArticle['status']) ? 0 : $aArticle['status'];
    if ($iStatus != 1) {
        return json_encode($aArticle);
    }
    $aArticle = empty($aArticle['data']) ? [] : $aArticle['data'];
    if (empty($aArticle)) {
        return json_encode(['status' => 3, 'msg' => 'The article does not exist']);
    }

    // Query the AI.
    $aResult = $this->curlOpenAIByDoi(['doi' => $aRefer['refer_doi']]);
    $iStatus = empty($aResult['status']) ? 0 : $aResult['status'];
    $sMsg = empty($aResult['msg']) ? 'The DOI number AI did not find any relevant information' : $aResult['msg'];
    if ($iStatus != 1) {
        return json_encode(['status' => 4, 'msg' => $sMsg]);
    }
    $aData = empty($aResult['data']) ? [] : $aResult['data'];
    if (empty($aData)) {
        return json_encode(['status' => 5, 'msg' => 'AI obtains empty data']);
    }

    // Keep the raw answer, even when nothing usable comes out of it.
    $iLogId = Db::name('production_article_refer_ai')->insertGetId([
        'content'     => json_encode($aResult),
        'update_time' => time(),
        'p_refer_id'  => $iPReferId,
    ]);
    $iIsAiCheck = empty($aData['is_ai_check']) ? 2 : $aData['is_ai_check'];
    if ($iIsAiCheck != 1) { // the AI found nothing
        return json_encode(['status' => 6, 'msg' => 'AI did not find any information' . json_encode($aParam)]);
    }

    // Collect the non-empty fields returned by the AI.
    $aUpdate = [];
    foreach (['author', 'title', 'joura', 'dateno', 'doilink'] as $sField) {
        if (empty($aData[$sField])) {
            continue;
        }
        if ($sField == 'author') {
            // The prompt asks for an array, but tolerate a plain string answer.
            $aUpdate['author'] = is_array($aData['author'])
                ? implode(',', $aData['author'])
                : (string)$aData['author'];
        } else {
            $aUpdate[$sField] = $aData[$sField];
        }
    }
    if (empty($aUpdate)) {
        return json_encode(['status' => 6, 'msg' => 'Update data to empty' . json_encode($aData)]);
    }

    // The log row stores only the AI-provided fields, captured before the
    // reference-specific adjustments below.
    $aLog = $aUpdate;

    if ($aRefer['refer_type'] == 'other') {
        $aUpdate['refer_type'] = 'journal';
    }
    if ($aRefer['refer_type'] == 'book' && !empty($aUpdate['doilink'])) {
        // Books keep their locator in isbn; refer_type must stay 'book'.
        $aUpdate['isbn'] = $aUpdate['doilink'];
        unset($aUpdate['doilink']);
    }
    $aUpdate['is_change'] = 1;
    $aUpdate['is_ai_check'] = 1;
    $aUpdate['cs'] = 1;
    $aUpdate['update_time'] = time();

    Db::startTrans();
    try {
        $result = Db::name('production_article_refer')->where($aWhere)->limit(1)->update($aUpdate);
        if ($result === false) {
            Db::rollback();
            return json_encode(['status' => 6, 'msg' => 'Update failed']);
        }
        if (!empty($iLogId)) {
            Db::name('production_article_refer_ai')->where(['id' => $iLogId])->limit(1)->update($aLog);
        }
        Db::commit();
    } catch (\Exception $e) {
        // Never leave the transaction open; let the framework handle the error itself.
        Db::rollback();
        throw $e;
    }

    return json_encode(['status' => 1, 'msg' => 'success']);
}
|
||||
|
||||
/**
 * Ask the OpenAI chat endpoint for the AMA citation data of a DOI.
 *
 * Parameters in $aParam:
 *  - doi   (required)
 *  - model (optional, defaults to 'gpt-4.1')
 *
 * The request is sent through \app\common\OpenAi::curlOpenAI() with temperature 0,
 * using this controller's $sApiUrl and $sApiKey.
 *
 * @param array $aParam
 * @return mixed ['status' => 2, 'msg' => ...] when the DOI is missing, otherwise the
 *               JSON-decoded response of curlOpenAI().
 */
private function curlOpenAIByDoi($aParam = [])
{
    $sDoi = empty($aParam['doi']) ? '' : $aParam['doi'];
    if (empty($sDoi)) {
        return ['status' => 2, 'msg' => 'Reference doi is empty'];
    }

    // System prompt (runtime text, kept verbatim).
    $sSystemPrompt = '请完成以下任务:
1. 根据提供的DOI号,查询该文献的AMA引用格式;
2. 按照以下规则调整AMA引用格式:
- 第三个作者名字后添加 et al.;
- DOI前加上"Available at: ";
- DOI信息格式调整为"https://doi.org/+真实DOI"(替换真实DOI为文献实际DOI).
3. 严格按照以下JSON结构返回结果,仅返回JSON数据,不要额外文字,包含字段:doilink(url格式)、title(标题)、author(作者数组)、joura(出版社名称)、dateno(年;卷(期):起始页-终止页),is_ai_check(默认1)
4. 若未查询到信息,字段is_ai_check为2,相关字段为null。';

    $aRequest = [
        'model'       => empty($aParam['model']) ? 'gpt-4.1' : $aParam['model'],
        'url'         => $this->sApiUrl,
        'temperature' => 0,
        'messages'    => [
            ['role' => 'system', 'content' => $sSystemPrompt],
            // User prompt: the DOI to look up.
            ['role' => 'user', 'content' => '我提供的DOI是:' . $sDoi],
        ],
        'api_key'     => $this->sApiKey,
    ];

    $oClient = new \app\common\OpenAi;
    return json_decode($oClient->curlOpenAI($aRequest), true);
}
|
||||
/**
 * Notify the journal's responsible editor by e-mail that the authors have
 * finished revising the reference formats of an article.
 *
 * Reads `article_id` from the POST body, then checks in order: the article
 * exists with state 5 or 6, it has an active production record, that record
 * has at least one active reference row, the journal exists and names an
 * editor, and that editor has a non-empty e-mail address. Only when every
 * check passes is the notification handed to sendEmail().
 *
 * Every branch returns a JSON string so callers always receive one type.
 *
 * @return string JSON object with `status` (1 = sent, any other value = failure) and `msg`
 */
public function finishSendEmail(){

    // Request parameters come from the POST body.
    $aParam = $this->request->post();

    // Article ID
    $iArticleId = empty($aParam['article_id']) ? '' : $aParam['article_id'];
    if(empty($iArticleId)){
        return json_encode(array('status' => 2,'msg' => 'Please select an article'));
    }

    // The article must be in state 5 or 6.
    $aWhere = ['article_id' => $iArticleId,'state' => ['in',[5,6]]];
    $aArticle = Db::name('article')->field('article_id,journal_id,accept_sn')->where($aWhere)->find();
    if(empty($aArticle)){
        return json_encode(['status' => 3,'msg' => 'The article does not exist or has not entered the editorial reference status']);
    }

    // The article must have an active production record.
    $aWhere = ['article_id' => $iArticleId,'state' => 0];
    $aProductionArticle = Db::name('production_article')->field('p_article_id')->where($aWhere)->find();
    if(empty($aProductionArticle)) {
        return json_encode(['status' => 2,'msg' => 'The article has not entered the production stage']);
    }

    // At least one active reference row must exist for the production record.
    $aWhere = ['p_article_id' => $aProductionArticle['p_article_id'],'state' => 0];
    $aRefer = Db::name('production_article_refer')->field('article_id')->where($aWhere)->find();
    if(empty($aRefer)) {
        return json_encode(['status' => 2,'msg' => 'No reference information found, please be patient and wait for the editor to upload']);
    }

    // Journal lookup
    if(empty($aArticle['journal_id'])){
        return json_encode(array('status' => 4,'msg' => 'The article is not associated with a journal' ));
    }
    $aWhere = ['state' => 0,'journal_id' => $aArticle['journal_id']];
    $aJournal = Db::name('journal')->where($aWhere)->find();
    if(empty($aJournal)){
        return json_encode(array('status' => 5,'msg' => 'No journal information found' ));
    }

    // Responsible editor and their e-mail address
    $iUserId = empty($aJournal['editor_id']) ? '' : $aJournal['editor_id'];
    if(empty($iUserId)){
        return json_encode(array('status' => 6,'msg' => 'The journal to which the article belongs has not designated a responsible editor' ));
    }
    $aWhere = ['user_id' => $iUserId,'state' => 0,'email' => ['<>','']];
    $aUser = Db::name('user')->field('user_id,email,realname,account')->where($aWhere)->find();
    if(empty($aUser)){
        return json_encode(['status' => 7,'msg' => "Edit email as empty"]);
    }

    // E-mail template; {placeholders} are substituted below.
    $aEmailConfig = [
        'email_subject' => '{journal_title}-{accept_sn}',
        'email_content' => '
Dear Editor,<br><br>
The authors have revised the formats of all references, please check.<br>
Sn:{accept_sn}<br><br>
Sincerely,<br>Editorial Office<br>
<a href="https://www.tmrjournals.com/draw_up.html?issn={journal_issn}">Subscribe to this journal</a><br>{journal_title}<br>
Email: {journal_email}<br>
Website: {website}'
    ];

    // Placeholder values; missing journal/article fields become empty strings.
    $aSearch = [
        '{accept_sn}' => empty($aArticle['accept_sn']) ? '' : $aArticle['accept_sn'],
        '{journal_title}' => empty($aJournal['title']) ? '' : $aJournal['title'],
        '{journal_issn}' => empty($aJournal['issn']) ? '' : $aJournal['issn'],
        '{journal_email}' => empty($aJournal['email']) ? '' : $aJournal['email'],
        '{website}' => empty($aJournal['website']) ? '' : $aJournal['website'],
    ];

    // Subject and body with placeholders replaced
    $email = $aUser['email'];
    $title = str_replace(array_keys($aSearch), array_values($aSearch),$aEmailConfig['email_subject']);
    $content = str_replace(array_keys($aSearch), array_values($aSearch), $aEmailConfig['email_content']);

    // Wrap the body in the configured header/footer; the footer carries an {{email}} slot.
    $pre = \think\Env::get('emailtemplete.pre');
    $net = \think\Env::get('emailtemplete.net');
    $net1 = str_replace("{{email}}",trim($email),$net);
    $content=$pre.$content.$net1;

    // Sender account and display name are taken from the journal record.
    $memail = empty($aJournal['email']) ? '' : $aJournal['email'];
    $mpassword = empty($aJournal['epassword']) ? '' : $aJournal['epassword'];
    $from_name = empty($aJournal['title']) ? '' : $aJournal['title'];

    $aResult = sendEmail($email,$title,$from_name,$content,$memail,$mpassword);

    // NOTE(review): a missing/empty `status` in the sendEmail() result is treated
    // as success (1) — confirm that matches sendEmail()'s contract.
    $iStatus = empty($aResult['status']) ? 1 : $aResult['status'];
    if($iStatus == 1){
        return json_encode(['status' => 1,'msg' => 'success']);
    }
    return json_encode(['status' => 8,'msg' => 'fail']);
}
|
||||
}
|
||||
@@ -16,35 +16,13 @@ class FetchExperts
|
||||
/**
 * Queue job handler: fetch experts for one research field, then remove the job.
 *
 * Recognised keys in $data (all optional):
 * - field:    value passed as the first argument of doFetchForField(); defaults to ''
 * - source:   defaults to 'pubmed'
 * - per_page: cast with intval(); defaults to 100
 * - min_year: defaults to null
 *
 * Nothing is caught here, so an exception thrown by the fetch propagates to
 * the queue worker and the job is not deleted by this method.
 *
 * @param Job   $job  the queue job being processed
 * @param mixed $data job payload; read as an array
 */
public function fire(Job $job, $data)
{
    $field = isset($data['field']) ? $data['field'] : '';

    $service = new ExpertFinderService();
    $service->doFetchForField(
        $field,
        isset($data['source']) ? $data['source'] : 'pubmed',
        isset($data['per_page']) ? intval($data['per_page']) : 100,
        isset($data['min_year']) ? $data['min_year'] : null
    );

    $job->delete();
}
|
||||
|
||||
@@ -1151,6 +1151,123 @@ class ArticleParserService
|
||||
]
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Extract the reference list from a Word document (returns data only; nothing is stored).
 *
 * Steps:
 *  1. Flatten every non-empty element of every section into text lines.
 *  2. Find the first line that starts with a reference heading
 *     (references / reference / bibliography / 参考文献 / 文献); text following
 *     the heading on the same line is kept as the start of the first entry.
 *  3. Collect lines until a line that is exactly one of the closing-section
 *     keywords (optionally followed by an ASCII or full-width colon).
 *  4. Merge continuation lines: a line starting with "[n]", "n.", "n)" or "n]"
 *     opens a new entry, any other line is appended to the current one.
 *
 * @param string $filePath path handed to the parser constructor
 * @return array one whitespace-normalised plain-text string per reference; empty when no heading is found
 */
public static function getReferencesFromWord($filePath): array
{
    $othis = new self($filePath);
    if (empty($othis->sections)) {
        return [];
    }

    $lines = [];
    foreach ($othis->sections as $section) {
        foreach ($section->getElements() as $element) {
            $text = trim((string)$othis->getTextFromElement($element));
            if ($text !== '') {
                $lines[] = $text;
            }
        }
    }
    if (empty($lines)) {
        return [];
    }

    // Heading at the start of a line, optionally followed by ":" or the
    // full-width "：" (U+FF1A, written as \x{FF1A}); the rest of the line may
    // hold a field code or the first entry.
    $headingPattern = '/^\s*(references|reference|bibliography|参考文献|文献)\b\s*[:\x{FF1A}]?\s*/iu';
    // EndNote field code that Word may leave in the text.
    $fieldCodePattern = '/^\{\s*ADDIN\s+EN\.REFLIST\s*\}$/i';

    $startIdx = -1;
    $startRemainder = '';
    foreach ($lines as $i => $line) {
        if (!preg_match($headingPattern, $line, $m)) {
            continue;
        }
        $startIdx = $i;
        // The pattern is anchored at the start, so the remainder is whatever follows the match.
        $remainder = trim((string)substr($line, strlen($m[0])));
        if ($remainder !== '' && !preg_match($fieldCodePattern, $remainder)) {
            $startRemainder = $remainder;
        }
        break;
    }
    if ($startIdx < 0) {
        return [];
    }

    // Section titles that end the reference list. Each keyword is accepted
    // bare, with ":" and with the full-width "：".
    $stopKeywords = [
        'acknowledgements', 'acknowledgments', 'funding', 'appendix', 'supplementary',
        'conflict of interest', 'competing interests', 'author contributions',
        '致谢', '基金', '附录', '补充材料', '利益冲突', '作者贡献',
    ];
    $stopSet = [];
    foreach ($stopKeywords as $keyword) {
        $keyword = strtolower($keyword);
        $stopSet[$keyword] = true;
        $stopSet[$keyword . ':'] = true;
        $stopSet[$keyword . "\u{FF1A}"] = true;
    }

    $raw = [];
    if ($startRemainder !== '') {
        $raw[] = $startRemainder;
    }
    $total = count($lines);
    for ($i = $startIdx + 1; $i < $total; $i++) {
        $line = $lines[$i];
        if (preg_match($fieldCodePattern, $line)) {
            continue;
        }
        if (isset($stopSet[strtolower($line)])) {
            break;
        }
        $raw[] = $line;
    }
    if (empty($raw)) {
        return [];
    }

    // Merge wrapped lines into entries.
    $refs = [];
    $current = '';
    foreach ($raw as $line) {
        $startsEntry = (bool)preg_match('/^\s*(\[\d+\]|\d+\s*[\.\)]|\d+\s*\])\s*/u', $line);
        if ($startsEntry) {
            if (trim($current) !== '') {
                $refs[] = trim(preg_replace('/\s+/u', ' ', $current));
            }
            $current = $line;
        } else {
            $current = ($current === '') ? $line : $current . ' ' . $line;
        }
    }
    if (trim($current) !== '') {
        $refs[] = trim(preg_replace('/\s+/u', ' ', $current));
    }

    return $refs;
}
|
||||
/**
|
||||
* 核心解码方法
|
||||
* @param string $str 待解码字符串
|
||||
|
||||
331
application/common/CitationRelevanceService.php
Normal file
331
application/common/CitationRelevanceService.php
Normal file
@@ -0,0 +1,331 @@
|
||||
<?php
|
||||
|
||||
namespace app\common;
|
||||
|
||||
/**
 * Citation relevance checking service (PubMed metadata + embedding similarity).
 *
 * Collaborators:
 * - PubmedService: looks a reference up by DOI (title / abstract / MeSH terms / publication types).
 * - CrossrefService: retraction / correction detection, and the evidence-based
 *   relevance check used as a fallback when no PubMed text is available.
 *
 * Embeddings:
 * - Requested from the endpoint given as `embedding_url`; when that is not
 *   configured, a chat-completions endpoint (`chat_url` + `chat_model`) is
 *   asked to emit a fixed-size vector instead.
 * - `embedding_headers` and `timeout` apply to both endpoints.
 * - Vectors are cached as JSON files so identical text is not embedded twice.
 */
class CitationRelevanceService
{
    private $pubmed;
    private $crossref;

    private $embeddingUrl = '';
    private $embeddingHeaders = [];
    private $timeout = 120;
    private $chatUrl = '';
    private $chatModel = '';
    private $embeddingDim = 256;
    private $chatMaxTokens = 1200;

    /**
     * @param PubmedService|null   $pubmed   defaults to a new PubmedService
     * @param CrossrefService|null $crossref defaults to a new CrossrefService
     * @param array $config optional keys: embedding_url, embedding_headers (array),
     *                      timeout (min 10), chat_url, chat_model,
     *                      embedding_dim (min 32), chat_max_tokens (min 256)
     */
    public function __construct(?PubmedService $pubmed = null, ?CrossrefService $crossref = null, array $config = [])
    {
        $this->pubmed = $pubmed ?: new PubmedService();
        $this->crossref = $crossref ?: new CrossrefService();

        if (isset($config['embedding_url'])) $this->embeddingUrl = (string)$config['embedding_url'];
        if (isset($config['embedding_headers']) && is_array($config['embedding_headers'])) $this->embeddingHeaders = $config['embedding_headers'];
        if (isset($config['timeout'])) $this->timeout = max(10, intval($config['timeout']));
        if (isset($config['chat_url'])) $this->chatUrl = (string)$config['chat_url'];
        if (isset($config['chat_model'])) $this->chatModel = (string)$config['chat_model'];
        if (isset($config['embedding_dim'])) $this->embeddingDim = max(32, intval($config['embedding_dim']));
        if (isset($config['chat_max_tokens'])) $this->chatMaxTokens = max(256, intval($config['chat_max_tokens']));
    }

    /**
     * Check one citation: is the cited work retracted/corrected, and is it
     * relevant to the sentence that cites it?
     *
     * @param string $contextText text surrounding the citation
     * @param array  $referRow    reference row; refer_doi / doilink are read here, the row is also passed to CrossrefService::qcCitation()
     * @param array  $options
     *   - sentence_is_background (bool) low similarity yields 'unsure_background' instead of 'suspicious_unrelated'
     *   - sim_related (float)  similarity at or above this is 'related', default 0.75
     *   - sim_unsure (float)   similarity at or above this is 'unsure', default 0.60
     *   - check_retraction (bool) run the retraction/correction checks, default true
     * @return array with problem_flag, problem_reason, relevance_flag, relevance_score, reason;
     *               the PubMed path adds a `pubmed` sub-array, the fallback path returns
     *               CrossrefService::qcCitation()'s array with the problem fields overwritten
     */
    public function checkOne(string $contextText, array $referRow, array $options = []): array
    {
        $contextText = trim($contextText);
        $simRelated = isset($options['sim_related']) ? (float)$options['sim_related'] : 0.75;
        $simUnsure = isset($options['sim_unsure']) ? (float)$options['sim_unsure'] : 0.60;
        $checkRetraction = isset($options['check_retraction']) ? (bool)$options['check_retraction'] : true;
        $isBackground = !empty($options['sentence_is_background']);

        // 1) Retraction / correction status from Crossref.
        $problemFlag = 'unknown';
        $problemReason = '';
        if ($checkRetraction) {
            $qc = $this->crossref->qcCitation($contextText, $referRow, ['check_retraction' => true]);
            $problemFlag = $qc['problem_flag'] ?? 'unknown';
            $problemReason = $qc['problem_reason'] ?? '';
        }

        // 2) PubMed record for the DOI, flattened into one text for embedding.
        $doi = $this->extractDoiFromRefer($referRow);
        $pub = $doi ? $this->pubmed->fetchByDoi($doi) : null;

        $pubText = '';
        $pubTypes = [];
        if ($pub) {
            $pubTypes = $pub['publication_types'] ?? [];
            $mesh = $pub['mesh_terms'] ?? [];
            $pubText = trim(
                ($pub['title'] ?? '') . "\n" .
                ($pub['abstract'] ?? '') . "\n" .
                (!empty($mesh) ? ('MeSH: ' . implode('; ', $mesh)) : '')
            );
        }

        // 3) No PubMed text: fall back to the Crossref evidence check.
        if ($pubText === '') {
            $fallback = $this->crossref->qcCitation($contextText, $referRow, ['check_retraction' => false]);
            $fallback['problem_flag'] = $problemFlag;
            $fallback['problem_reason'] = $problemReason;
            $fallback['reason'] = 'fallback_crossref_evidence; ' . ($fallback['reason'] ?? '');
            return $fallback;
        }

        // 4) Embedding similarity between the citing context and the PubMed text.
        $v1 = $this->embedCached($contextText);
        $v2 = $this->embedCached($pubText);
        $embedded = ($v1 !== null && $v2 !== null);
        $sim = $embedded ? $this->cosine($v1, $v2) : 0.0;

        if (!$embedded) {
            // Without vectors there is no evidence either way; a score of 0
            // must not be read as "unrelated".
            $relevanceFlag = 'unsure';
        } elseif ($sim >= $simRelated) {
            $relevanceFlag = 'related';
        } elseif ($sim >= $simUnsure) {
            $relevanceFlag = 'unsure';
        } else {
            $relevanceFlag = $isBackground ? 'unsure_background' : 'suspicious_unrelated';
        }

        // PubMed publication types are a second source for retraction/correction.
        if ($checkRetraction && $problemFlag !== 'retracted_or_corrected' && !empty($pubTypes)) {
            $ptLower = strtolower(implode(' | ', $pubTypes));
            if (strpos($ptLower, 'retracted publication') !== false
                || strpos($ptLower, 'retraction of publication') !== false
                || strpos($ptLower, 'published erratum') !== false
            ) {
                $problemFlag = 'retracted_or_corrected';
                $problemReason = 'PubMed publication type indicates retraction/correction';
            }
        }

        return [
            'problem_flag' => $problemFlag,
            'problem_reason' => $problemReason,
            'relevance_flag' => $relevanceFlag,
            'relevance_score' => round($sim, 4),
            'reason' => $embedded ? 'embedding(context,pubmed_text)' : 'embedding_unavailable',
            'pubmed' => [
                'pmid' => $pub['pmid'] ?? '',
                'year' => $pub['year'] ?? '',
                'journal' => $pub['journal'] ?? '',
                'publication_types' => $pubTypes,
            ],
        ];
    }

    // ---------------- embedding ----------------

    /**
     * Return the embedding of $text, using the file cache (90-day TTL) when possible.
     *
     * @return array|null normalised vector, or null for empty text / failed embedding
     */
    private function embedCached(string $text): ?array
    {
        $text = trim($text);
        if ($text === '') return null;

        $key = 'emb_' . sha1($text);
        $cached = $this->cacheGet($key, 90 * 86400);
        if (is_array($cached) && !empty($cached)) return $cached;

        $vec = $this->embed($text);
        if (is_array($vec) && !empty($vec)) {
            $this->cacheSet($key, $vec);
            return $vec;
        }
        return null;
    }

    /**
     * Request an embedding for $text.
     *
     * With `embedding_url` configured, the response may be
     *   {data:[{embedding:[...]}]}, {embedding:[...]} or a bare array [...].
     * Otherwise the chat endpoint is prompted to emit {"embedding":[...]} with
     * exactly `embedding_dim` numbers.
     *
     * NOTE(review): a vector produced by prompting a chat model is not
     * guaranteed to carry semantic meaning — confirm this path is acceptable.
     *
     * @return array|null unit-length vector, or null when no usable vector was returned
     */
    private function embed(string $text): ?array
    {
        // 1) Dedicated embeddings endpoint takes priority.
        if ($this->embeddingUrl !== '') {
            $payload = json_encode(['text' => $text], JSON_UNESCAPED_UNICODE);
            if ($payload === false) return null;
            $res = $this->postJson($this->embeddingUrl, $payload);
            if ($res === null) return null;

            $decoded = json_decode($res, true);
            if (!is_array($decoded)) return null;

            if (isset($decoded['data'][0]['embedding']) && is_array($decoded['data'][0]['embedding'])) {
                return $this->normalizeVector($decoded['data'][0]['embedding']);
            }
            if (isset($decoded['embedding']) && is_array($decoded['embedding'])) {
                return $this->normalizeVector($decoded['embedding']);
            }
            if (isset($decoded[0]) && (is_float($decoded[0]) || is_int($decoded[0]))) {
                return $this->normalizeVector($decoded);
            }
            return null;
        }

        // 2) Chat-completions fallback.
        if ($this->chatUrl === '' || $this->chatModel === '') {
            return null;
        }

        $sys = "You are an embedding generator. Output ONLY valid JSON in this exact shape: {\"embedding\":[...]}.\n"
            . "Rules:\n"
            . "- embedding must be an array of exactly {$this->embeddingDim} floats\n"
            . "- each float must be between -1 and 1\n"
            . "- do not include any other keys or any extra text\n";

        $payload = json_encode([
            'model' => $this->chatModel,
            'temperature' => 0,
            'max_tokens' => $this->chatMaxTokens,
            'messages' => [
                ['role' => 'system', 'content' => $sys],
                ['role' => 'user', 'content' => $text],
            ],
        ], JSON_UNESCAPED_UNICODE);
        if ($payload === false) return null;

        $res = $this->postJson($this->chatUrl, $payload);
        if ($res === null) return null;

        $decoded = json_decode($res, true);
        $content = '';
        if (is_array($decoded) && isset($decoded['choices'][0]['message']['content'])) {
            $content = (string)$decoded['choices'][0]['message']['content'];
        }
        $content = trim($content);
        if ($content === '') return null;

        // The model may wrap its answer in a ```json ... ``` fence.
        if (preg_match('/```(?:json)?\\s*([\\s\\S]*?)\\s*```/i', $content, $m)) {
            $content = trim($m[1]);
        }
        $j = json_decode($content, true);
        if (!is_array($j) || !isset($j['embedding']) || !is_array($j['embedding'])) {
            return null;
        }

        $vec = $j['embedding'];
        if (count($vec) !== $this->embeddingDim) {
            return null;
        }
        return $this->normalizeVector($vec);
    }

    /**
     * POST a JSON payload with the configured headers and timeout.
     *
     * NOTE(review): TLS peer verification is disabled, as in the original
     * request code — confirm the endpoints are internal-only.
     *
     * @return string|null response body, or null on transport failure or a blank body
     */
    private function postJson(string $url, string $payload): ?string
    {
        $ch = curl_init();
        if ($ch === false) return null;

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
        curl_setopt($ch, CURLOPT_HTTPHEADER, array_merge(['Content-Type: application/json'], $this->embeddingHeaders));

        $res = curl_exec($ch);
        curl_close($ch);

        return (is_string($res) && trim($res) !== '') ? $res : null;
    }

    /**
     * Cosine similarity over the first min(count($a), count($b)) components.
     *
     * @return float 0.0 when either input is empty or has zero length
     */
    private function cosine(array $a, array $b): float
    {
        $n = min(count($a), count($b));
        if ($n <= 0) return 0.0;

        $dot = 0.0;
        $na = 0.0;
        $nb = 0.0;
        for ($i = 0; $i < $n; $i++) {
            $x = (float)$a[$i];
            $y = (float)$b[$i];
            $dot += $x * $y;
            $na += $x * $x;
            $nb += $y * $y;
        }
        if ($na <= 0.0 || $nb <= 0.0) return 0.0;
        return $dot / (sqrt($na) * sqrt($nb));
    }

    /**
     * Cast every component to float and scale the vector to unit length.
     * An all-zero vector is returned unscaled.
     */
    private function normalizeVector(array $v): array
    {
        $out = [];
        $sum = 0.0;
        foreach ($v as $x) {
            $fx = (float)$x;
            $out[] = $fx;
            $sum += $fx * $fx;
        }
        if ($sum <= 0.0) return $out;

        $norm = sqrt($sum);
        foreach ($out as $i => $fx) {
            $out[$i] = $fx / $norm;
        }
        return $out;
    }

    /**
     * Pick the DOI out of a reference row.
     *
     * `refer_doi` wins when non-empty. Otherwise the part of `doilink` after
     * "doi.org/" (up to any "?" or "#") is used; a `doilink` without
     * "doi.org/" is returned unchanged.
     */
    private function extractDoiFromRefer(array $referRow): string
    {
        $doi = trim((string)($referRow['refer_doi'] ?? ''));
        if ($doi !== '') return $doi;

        $doilink = trim((string)($referRow['doilink'] ?? ''));
        if ($doilink === '') return '';

        // "~" is the delimiter because the pattern contains a literal "#":
        // PHP ends the pattern at the first unescaped delimiter character,
        // even inside a character class.
        if (preg_match('~doi\.org/([^?#]+)~i', $doilink, $m)) {
            return trim((string)$m[1]);
        }
        return $doilink;
    }

    // ---------------- cache ----------------

    /** Directory that holds the cached embedding files. */
    private function cacheDir(): string
    {
        return rtrim(ROOT_PATH, '/') . '/runtime/embed_cache';
    }

    /**
     * Read a cached value.
     *
     * @return mixed decoded JSON, or null when the file is missing, older than $ttlSeconds, or not valid JSON
     */
    private function cacheGet(string $key, int $ttlSeconds)
    {
        $file = $this->cacheDir() . '/' . $key . '.json';
        if (!is_file($file)) return null;

        $mtime = filemtime($file);
        if (!$mtime || (time() - $mtime) > $ttlSeconds) return null;

        $raw = @file_get_contents($file);
        return json_decode((string)$raw, true);
    }

    /** Write a value to the cache; failures are ignored (the cache is best-effort). */
    private function cacheSet(string $key, $value): void
    {
        $dir = $this->cacheDir();
        if (!is_dir($dir)) @mkdir($dir, 0777, true);
        $file = $dir . '/' . $key . '.json';
        @file_put_contents($file, json_encode($value, JSON_UNESCAPED_UNICODE));
    }
}
|
||||
|
||||
765
application/common/CrossrefService.php
Normal file
765
application/common/CrossrefService.php
Normal file
@@ -0,0 +1,765 @@
|
||||
<?php
|
||||
|
||||
namespace app\common;
|
||||
|
||||
/**
|
||||
* Crossref API 工具类
|
||||
*
|
||||
* 说明:
|
||||
* - 仿照 application/api/controller/Crossrefdoi.php 的实现风格抽成 Service
|
||||
* - 仅做「请求 + 解析」;不包含任何数据库读写
|
||||
*/
|
||||
class CrossrefService
|
||||
{
|
||||
// 配置项
|
||||
private $mailto = ''; // 邮箱(提升优先级)
|
||||
private $timeout = 15; // 请求超时(秒)
|
||||
private $maxRetry = 2; // 单个DOI最大重试次数
|
||||
private $crossrefUrl = "https://api.crossref.org/works/"; // 接口地址
|
||||
|
||||
/**
 * @param mixed $config optional settings; ignored unless it is an array.
 *                      Recognised keys: mailto, crossrefUrl (cast to string),
 *                      timeout, maxRetry (cast with intval()).
 */
public function __construct($config = [])
{
    if (!is_array($config)) {
        return;
    }

    foreach (['mailto', 'crossrefUrl'] as $textKey) {
        if (isset($config[$textKey])) {
            $this->{$textKey} = (string)$config[$textKey];
        }
    }

    foreach (['timeout', 'maxRetry'] as $numberKey) {
        if (isset($config[$numberKey])) {
            $this->{$numberKey} = intval($config[$numberKey]);
        }
    }
}
|
||||
|
||||
/**
 * Set the contact address sent to Crossref with each request.
 *
 * @param mixed $mailto cast to string
 * @return $this for chaining
 */
public function setMailto($mailto)
{
    $address = (string)$mailto;
    $this->mailto = $address;

    return $this;
}
|
||||
|
||||
/**
 * Convert an in-text citation number to a reference-row index: `[n]` in the
 * body corresponds to production_article_refer.index = n - 1 (zero-based).
 *
 * @param int $citationMark citation number from the text, e.g. 13 for [13]
 * @return int zero-based index, e.g. 12; never negative
 */
public function referIndexFromCitationMark(int $citationMark): int
{
    return $citationMark > 1 ? $citationMark - 1 : 0;
}
|
||||
|
||||
/**
 * Inverse of referIndexFromCitationMark(): zero-based
 * production_article_refer.index to the in-text citation number `[n]`.
 *
 * @param int $referIndex zero-based index, e.g. 12
 * @return int citation number, e.g. 13; never negative
 */
public function citationMarkFromReferIndex(int $referIndex): int
{
    $mark = $referIndex + 1;

    return $mark > 0 ? $mark : 0;
}
|
||||
|
||||
/**
 * Batch citation QC without any database access.
 *
 * Takes the article body fragments and the reference rows, finds every `[n]`
 * marker in the text, maps it to the reference whose `index` is n - 1, and
 * runs qcCitation() on the sentences around the marker.
 *
 * @param array $articleMainContents body fragments in reading order; each item is a string
 *                                   or an array with a `content` key
 * @param array $referRows  reference rows; non-array rows and rows without an `index` key are ignored
 * @param array $options    passed through to qcCitation(); additionally:
 *                          - sentence_window (int) sentences of context on each side of the
 *                            citing sentence, default 1
 * @return array one entry per (sentence, distinct marker) pair, with keys
 *               citation_mark, refer_index, context, ref_meta, qc
 */
public function qcArticleCitations(array $articleMainContents, array $referRows, array $options = []): array
{
    $window = isset($options['sentence_window']) ? max(0, intval($options['sentence_window'])) : 1;

    // 1) Build the plain-text body; markup is dropped, [n] markers survive.
    $chunks = [];
    foreach ($articleMainContents as $row) {
        $text = is_array($row) ? (string)($row['content'] ?? '') : (string)$row;
        if ($text === '') continue;

        $text = (string)preg_replace('/<\s*\/?\s*blue[^>]*>/i', '', $text);
        $text = strip_tags($text);
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        // preg_replace() returns null when the /u pattern meets invalid UTF-8;
        // the cast turns that into an empty chunk, which is skipped.
        $text = (string)preg_replace('/\s+/u', ' ', trim($text));
        if ($text !== '') $chunks[] = $text;
    }
    $fullText = implode("\n", $chunks);
    if ($fullText === '') return [];

    // 2) Index the reference rows by their `index` column.
    $referMap = [];
    foreach ($referRows as $r) {
        if (!is_array($r) || !isset($r['index'])) continue;
        $referMap[intval($r['index'])] = $r;
    }

    // 3) Sentence split.
    $sentences = $this->splitEnglishSentences($fullText);
    if (empty($sentences)) return [];
    $lastSentence = count($sentences) - 1;

    // 4) Visit each sentence and check every distinct marker it contains.
    $results = [];
    foreach ($sentences as $si => $sent) {
        if (!preg_match_all('/\[(\d+)\]/', $sent, $m)) {
            continue;
        }

        // The context depends only on the sentence, so it is built once,
        // and only when a marker maps to a known reference.
        $ctx = null;

        foreach (array_unique(array_map('intval', $m[1])) as $citationMark) {
            if ($citationMark <= 0) continue;
            $referIndex = $this->referIndexFromCitationMark($citationMark);
            if (!isset($referMap[$referIndex])) {
                continue;
            }

            if ($ctx === null) {
                $start = max(0, $si - $window);
                $end = min($lastSentence, $si + $window);
                $ctx = implode(' ', array_slice($sentences, $start, $end - $start + 1));
                $ctx = trim((string)preg_replace('/\s+/u', ' ', $ctx));
            }

            $refMeta = $referMap[$referIndex];
            $qc = $this->qcCitation($ctx, $refMeta, $options);

            $results[] = [
                'citation_mark' => $citationMark, // number n taken from [n]
                'refer_index' => $referIndex,     // production_article_refer.index
                'context' => $ctx,
                'ref_meta' => [
                    'p_refer_id' => $refMeta['p_refer_id'] ?? 0,
                    'title' => $refMeta['title'] ?? '',
                    'author' => $refMeta['author'] ?? '',
                    'joura' => $refMeta['joura'] ?? '',
                    'dateno' => $refMeta['dateno'] ?? '',
                    'refer_doi' => $refMeta['refer_doi'] ?? '',
                    'doilink' => $refMeta['doilink'] ?? '',
                    'index' => $refMeta['index'] ?? $referIndex,
                ],
                'qc' => $qc,
            ];
        }
    }

    return $results;
}
|
||||
|
||||
/**
 * Keep a DOI only if it has the form `10.` + four or more digits + `/` + suffix.
 *
 * @param string $doi candidate DOI; surrounding whitespace is trimmed
 * @return string the trimmed DOI, or '' when it does not match
 */
public function filterValidDoi($doi = '')
{
    $candidate = trim((string)$doi);

    return preg_match('/^10\.\d{4,}\/.+/', $candidate) === 1 ? $candidate : '';
}
|
||||
|
||||
/**
 * Fetch the Crossref `message` object for a DOI, retrying on failure.
 *
 * @param string $doi
 * @return array|null null when the DOI is rejected by filterValidDoi() or the lookup fails
 */
public function fetchWork($doi)
{
    $validDoi = $this->filterValidDoi($doi);

    return $validDoi === '' ? null : $this->fetchSingleDoiWithRetry($validDoi);
}
|
||||
|
||||
/**
 * Fetch a DOI and return the commonly used fields in one array.
 *
 * @param string $doi
 * @return array|null null when the work cannot be fetched; otherwise keys
 *                    doi, title, joura, publisher, authors, author_str, dateno,
 *                    is_retracted (1/0), retract_reason, doilink, raw
 */
public function fetchWorkSummary($doi)
{
    $work = $this->fetchWork($doi);
    if (!$work) {
        return null;
    }

    $heading = $this->getTitle($work);
    $publisherInfo = $this->getPublisher($work);
    // Full container title first, abbreviated title as the fallback.
    $journalName = !empty($publisherInfo['title'])
        ? $publisherInfo['title']
        : ($publisherInfo['short_title'] ?? '');
    $authorList = $this->getAuthors($work);
    $issueInfo = $this->getVolumeIssuePages($work);
    $retraction = $this->checkRetracted($work);

    // Prefer the URL Crossref reports; otherwise build a doi.org link.
    $link = $this->getDolink($work);
    if (empty($link)) {
        $link = 'https://doi.org/' . $this->filterValidDoi($doi);
    }

    return [
        'doi' => $this->filterValidDoi($doi),
        'title' => $heading,
        'joura' => $journalName,
        'publisher' => $publisherInfo,
        'authors' => $authorList,
        'author_str' => empty($authorList) ? '' : implode(',', $authorList),
        'dateno' => $issueInfo,
        'is_retracted' => empty($retraction['is_retracted']) ? 0 : 1,
        'retract_reason' => $retraction['reason'] ?? '',
        'doilink' => $link,
        'raw' => $work,
    ];
}
|
||||
|
||||
/**
 * Query Crossref for one DOI, making up to `maxRetry` attempts.
 *
 * - HTTP 200: returns the `message` object when the body's `status` is "ok", else null.
 * - HTTP 404: returns null immediately; Crossref does not know the DOI, so retrying cannot help.
 * - HTTP 429: waits 5 seconds before the next attempt.
 * - Anything else (including transport errors): waits 1 second before the next attempt.
 * No wait happens after the last attempt.
 *
 * NOTE(review): TLS peer verification is disabled, as in the original request
 * code — confirm whether that is still required for this host.
 *
 * @param string $doi an already validated DOI
 * @return array|null
 */
private function fetchSingleDoiWithRetry($doi)
{
    // The URL and headers do not change between attempts.
    $url = $this->crossrefUrl . rawurlencode($doi);
    $userAgent = 'DOI-Fetcher/1.0';
    if (!empty($this->mailto)) {
        $url .= "?mailto=" . rawurlencode($this->mailto);
        $userAgent .= " (mailto:{$this->mailto})";
    }

    for ($attempt = 1; $attempt <= $this->maxRetry; $attempt++) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_HTTPHEADER, ["User-Agent: {$userAgent}"]);
        $response = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($httpCode == 200) {
            $data = json_decode($response, true);
            return (isset($data['status']) && $data['status'] == 'ok') ? ($data['message'] ?? null) : null;
        }

        if ($httpCode == 404) {
            return null;
        }

        if ($attempt < $this->maxRetry) {
            sleep($httpCode == 429 ? 5 : 1);
        }
    }

    return null;
}
|
||||
|
||||
/**
 * Return the first entry of the work's `title` list, or '' when absent.
 *
 * @param array $aDoiInfo Crossref message
 */
public function getTitle($aDoiInfo = [])
{
    if (isset($aDoiInfo['title'][0])) {
        return $aDoiInfo['title'][0];
    }

    return '';
}
|
||||
|
||||
/**
 * Collect journal / publisher fields from a Crossref message.
 *
 * @param array $aDoiInfo Crossref message
 * @return array keys: title (first container-title), short_title (first
 *               short-container-title), ISSN (list), publisher; missing
 *               values become '' ([] for ISSN)
 */
public function getPublisher($aDoiInfo = [])
{
    $containerTitle = $aDoiInfo['container-title'][0] ?? '';
    $abbreviatedTitle = $aDoiInfo['short-container-title'][0] ?? '';
    $issnList = $aDoiInfo['ISSN'] ?? [];
    $publisherName = $aDoiInfo['publisher'] ?? '';

    return [
        'title' => $containerTitle,
        'short_title' => $abbreviatedTitle,
        'ISSN' => $issnList,
        'publisher' => $publisherName,
    ];
}
|
||||
|
||||
/**
 * Build the list of author display names from a Crossref message.
 *
 * A person is rendered as "given family"; when only one of the two parts is
 * present, that part is used on its own with no stray whitespace. An entry
 * with neither part falls back to its `name` field, which Crossref uses for
 * organisational authors. Entries that yield no name are skipped.
 *
 * @param array $aDoiInfo Crossref message
 * @return array list of non-empty name strings
 */
public function getAuthors($aDoiInfo = [])
{
    $authors = [];
    if (empty($aDoiInfo['author']) || !is_array($aDoiInfo['author'])) {
        return $authors;
    }

    foreach ($aDoiInfo['author'] as $author) {
        $name = trim(($author['given'] ?? '') . ' ' . ($author['family'] ?? ''));
        if ($name === '') {
            $name = trim((string)($author['name'] ?? ''));
        }
        if ($name !== '') {
            $authors[] = $name;
        }
    }

    return $authors;
}
|
||||
|
||||
/**
 * Return the publication year from `issued.date-parts[0][0]` as a string,
 * or '' when that value is missing or empty.
 *
 * @param array $aDoiInfo Crossref message
 */
public function getPublishYear($aDoiInfo = [])
{
    $year = $aDoiInfo['issued']['date-parts'][0][0] ?? null;

    return empty($year) ? '' : (string)$year;
}
|
||||
|
||||
/**
 * Build the "year:volume(issue):pages" string, e.g. 2024:10(2):100-120.
 *
 * Parts that are missing are left out together with their separator. The
 * issue is only shown when a volume exists. Pages come from page.start /
 * page.end (or first-page / last-page); without a start page the raw `page`
 * value is used.
 *
 * @param array $aDoiInfo Crossref message
 * @return string
 */
public function getVolumeIssuePages($aDoiInfo = [])
{
    $segments = [];

    $publishedYear = $this->getPublishYear($aDoiInfo);
    if ($publishedYear) {
        $segments[] = $publishedYear;
    }

    $vol = $aDoiInfo['volume'] ?? '';
    $no = $aDoiInfo['issue'] ?? '';
    if ($vol) {
        $suffix = $no ? "({$no})" : '';
        $segments[] = $vol . $suffix;
    }

    $first = $aDoiInfo['page']['start'] ?? ($aDoiInfo['first-page'] ?? '');
    $last = $aDoiInfo['page']['end'] ?? ($aDoiInfo['last-page'] ?? '');
    if ($first) {
        $range = $first . ($last ? "-{$last}" : '');
    } else {
        $range = $aDoiInfo['page'] ?? '';
    }
    if ($range) {
        $segments[] = $range;
    }

    return implode(':', $segments);
}
|
||||
|
||||
/**
 * Decide whether a Crossref message looks like a retracted or corrected work.
 *
 * Checks run in a fixed order and each hit overwrites the reason, so the
 * reason reported is that of the last check that matched:
 *  1. `type`, then `subtype`, equal to retraction / correction
 *  2. `update-type` list containing "retraction"
 *  3. `relation` holding an is-retraction-of / corrects entry
 *  4. an `update-to` entry whose type or label contains "retract"
 *  5. a title containing retraction / retracted / withdrawal / withdrawn
 *
 * @param array $aDoiInfo Crossref message
 * @return array ['is_retracted' => bool, 'reason' => string]
 */
public function checkRetracted($aDoiInfo = [])
{
    $flagged = false;
    $why = "未撤稿";

    // 1) type / subtype
    $declaredType = strtolower($aDoiInfo['type'] ?? '');
    $declaredSubtype = strtolower($aDoiInfo['subtype'] ?? '');
    foreach ([$declaredType, $declaredSubtype] as $kind) {
        if ($kind && in_array($kind, ['retraction', 'correction'])) {
            $flagged = true;
            $why = "文章类型为{$kind}(撤稿/更正声明)";
        }
    }

    // 2) update-type
    $updateTypes = $aDoiInfo['update-type'] ?? null;
    if (is_array($updateTypes) && in_array('retraction', $updateTypes)) {
        $flagged = true;
        $why = "官方标记为撤稿(update-type: retraction)";
    }

    // 3) relation: only the first matching relation is reported
    if (!empty($aDoiInfo['relation'])) {
        foreach ($aDoiInfo['relation'] as $relType => $relItems) {
            if (!in_array($relType, ['is-retraction-of', 'corrects'])) {
                continue;
            }
            $flagged = true;
            $relatedDoi = $relItems[0]['id'] ?? '未知';
            $why = "关联撤稿文章{$relatedDoi}(关系:{$relType})";
            break;
        }
    }

    // 4) update-to: only the first matching entry is reported
    if (isset($aDoiInfo['update-to']) && is_array($aDoiInfo['update-to'])) {
        foreach ($aDoiInfo['update-to'] as $update) {
            $updateType = strtolower($update['type'] ?? '');
            $updateLabel = strtolower($update['label'] ?? '');
            $mentionsRetract = strpos($updateType, 'retract') !== false
                || strpos($updateLabel, 'retract') !== false;
            if ($mentionsRetract) {
                $flagged = true;
                $why = "update-to 标记撤稿({$updateType}/{$updateLabel})";
                break;
            }
        }
    }

    // 5) title keywords
    foreach ($aDoiInfo['title'] ?? [] as $title) {
        $lowered = strtolower($title);
        foreach (['retraction', 'retracted', 'withdrawal', 'withdrawn'] as $needle) {
            if (strpos($lowered, $needle) !== false) {
                $flagged = true;
                $why = "标题包含撤稿关键词";
                break 2;
            }
        }
    }

    return [
        'is_retracted' => $flagged,
        'reason' => $why,
    ];
}
|
||||
|
||||
/**
 * Return the work's `URL` field (the DOI link reported by Crossref), or ''.
 *
 * @param array $aDoiInfo Crossref message
 */
public function getDolink($aDoiInfo = [])
{
    if (isset($aDoiInfo['URL'])) {
        return $aDoiInfo['URL'];
    }

    return '';
}
|
||||
|
||||
/**
 * Format a Crossref-style "date-parts" structure as a date string.
 *
 * Reads $dateObj['date-parts'][0], expected to look like [year, month, day] with month and
 * day optional. Month and day are left-padded to two digits.
 *
 * @param mixed $dateObj Structure holding a 'date-parts' list, e.g. ['date-parts' => [[2021, 3, 7]]].
 * @return string 'Y', 'Y-MM' or 'Y-MM-DD' depending on the parts present; '' when there are none.
 */
public function parseDateParts($dateObj)
{
    $segments = $dateObj['date-parts'][0] ?? [];
    if (empty($segments)) {
        return '';
    }

    // The year is emitted as-is; month (index 1) and day (index 2) are appended when present.
    $formatted = (string)($segments[0] ?? '');
    foreach ([1, 2] as $position) {
        $piece = $segments[$position] ?? '';
        if ($piece !== '') {
            $formatted .= '-' . str_pad((string)$piece, 2, '0', STR_PAD_LEFT);
        }
    }

    return $formatted;
}
|
||||
|
||||
/**
 * Quality-check one citation. Produces two independent verdicts:
 *  (1) whether the cited work is flagged as retracted/corrected, taken from the summary that
 *      fetchWorkSummary() returns for the cited DOI;
 *  (2) whether the citing context looks related to the cited work, based on token hits
 *      between the context and the reference's title / author / journal / year.
 *
 * Notes:
 *  - Designed for references without abstract/keywords: only title/author/journal/year and
 *    the citing sentence(s) are used.
 *  - When no DOI can be derived from refer_doi/doilink, problem_flag stays 'unknown'.
 *
 * @param string $contextText Context around the citation (English; ideally the citing sentence
 *                            plus a few neighbouring sentences).
 * @param array  $refMeta     Metadata of the cited entry (suggested source: production_article_refer):
 *                            refer_doi / doilink / title / author / joura / dateno.
 * @param array  $options     Optional settings:
 *                            - check_retraction (bool): run the retraction/correction lookup; default true
 *                            - background_phrases (array): phrases marking a generic "background"
 *                              citation; a built-in list is used when omitted
 *
 * @return array
 *   [
 *     'problem_flag'    => 'ok'|'retracted_or_corrected'|'unknown',
 *     'problem_reason'  => string,
 *     'relevance_flag'  => 'related'|'unsure'|'unsure_background'|'suspicious_unrelated',
 *     'relevance_score' => float,
 *     'reason'          => string
 *   ]
 */
public function qcCitation(string $contextText, array $refMeta, array $options = []): array
{
    $contextText = trim($contextText);
    $checkRetraction = (bool)($options['check_retraction'] ?? true);

    // Reference fields that feed the evidence tokens later on.
    $evidenceSource = [
        'title'   => (string)($refMeta['title'] ?? ''),
        'author'  => (string)($refMeta['author'] ?? ''),
        'journal' => (string)($refMeta['joura'] ?? ''),
        'year'    => (string)($refMeta['dateno'] ?? ''),
    ];

    $doi = $this->extractDoiFromMeta(
        (string)($refMeta['refer_doi'] ?? ''),
        (string)($refMeta['doilink'] ?? '')
    );

    // 1) Retraction / correction verdict (hard rule, drives problem_flag).
    $problemFlag = 'unknown';
    if (!$checkRetraction) {
        $problemReason = 'Skip retraction check';
    } elseif (empty($doi)) {
        $problemReason = 'DOI is empty';
    } else {
        $summary = $this->fetchWorkSummary($doi);
        if (!$summary || !isset($summary['is_retracted'])) {
            $problemReason = 'Crossref fetch failed or returned unexpected data';
        } elseif ((int)$summary['is_retracted'] === 1) {
            $problemFlag = 'retracted_or_corrected';
            $problemReason = !empty($summary['retract_reason'])
                ? $summary['retract_reason']
                : 'Crossref indicates retraction/correction';
        } else {
            $problemFlag = 'ok';
            $problemReason = 'Crossref indicates not retracted/corrected';
        }
    }

    // 2) Relevance verdict (soft rules + evidence hits).
    $backgroundPhrases = (array)($options['background_phrases'] ?? [
        'several studies',
        'many studies',
        'the literature',
        'the existing literature',
        'has been reported',
        'have been reported',
        'it has been shown',
        'previous studies',
        'the study suggests',
        'the literature suggests',
        'in the literature',
    ]);

    $ctxLower = strtolower($contextText);
    $isBackground = false;
    foreach ($backgroundPhrases as $phrase) {
        $needle = strtolower(trim((string)$phrase));
        if ($needle === '' || $needle === '0') {
            continue;
        }
        if (strpos($ctxLower, $needle) !== false) {
            $isBackground = true;
            break;
        }
    }

    $evidence = $this->buildEvidenceTokens($evidenceSource);
    $ctxTokens = $this->tokenize($contextText);

    $titleTokens = $evidence['titleTokens'] ?? [];
    $authorTokens = $evidence['authorTokens'] ?? [];
    $journalTokens = $evidence['journalTokens'] ?? [];
    $yearToken = $evidence['yearToken'] ?? '';

    // Share of the given reference tokens that also occur in the context; 0.0 when there are none.
    $overlapRatio = static function ($needles) use ($ctxTokens) {
        if (empty($needles)) {
            return 0.0;
        }
        return count(array_intersect($needles, $ctxTokens)) / max(1, count($needles));
    };

    $titleOverlap = $overlapRatio($titleTokens);
    $journalOverlap = $overlapRatio($journalTokens);

    // A single author token appearing in the context counts as a full author hit.
    $authorHit = 0.0;
    if (!empty($authorTokens)) {
        foreach ($authorTokens as $authorToken) {
            if ($authorToken === '' || !in_array($authorToken, $ctxTokens, true)) {
                continue;
            }
            $authorHit = 1.0;
            break;
        }
    }

    // The year is matched as a substring of the lower-cased context.
    $yearHit = (!empty($yearToken) && strpos($ctxLower, (string)$yearToken) !== false) ? 1.0 : 0.0;

    // Weighted score (kept interpretable: higher means more related).
    $score = round((
        0.60 * $titleOverlap +
        0.20 * $authorHit +
        0.15 * $yearHit +
        0.05 * $journalOverlap
    ), 4);

    $reasonParts = [];
    if ($score >= 0.35 && ($authorHit > 0.0 || $yearHit > 0.0)) {
        $relevanceFlag = 'related';
        $reasonParts[] = 'title_keyword_overlap_high=' . $titleOverlap;
    } elseif ($score >= 0.25) {
        $relevanceFlag = 'unsure';
        $reasonParts[] = 'evidence_score_mid=' . $score;
    } elseif ($isBackground) {
        $relevanceFlag = 'unsure_background';
        $reasonParts[] = 'background_phrases_detected';
    } else {
        $relevanceFlag = 'suspicious_unrelated';
        $reasonParts[] = 'evidence_score_low=' . $score;
    }

    // Always append the individual signals so the verdict can be audited.
    $reasonParts[] = 'titleOverlap=' . $titleOverlap;
    $reasonParts[] = 'authorHit=' . $authorHit;
    $reasonParts[] = 'yearHit=' . $yearHit;
    $reasonParts[] = 'journalOverlap=' . $journalOverlap;

    return [
        'problem_flag' => $problemFlag,
        'problem_reason' => $problemReason,
        'relevance_flag' => $relevanceFlag,
        'relevance_score' => (float)$score,
        'reason' => implode('; ', $reasonParts),
    ];
}
|
||||
|
||||
/**
 * Derive a DOI string from the refer_doi / doilink fields of a reference.
 *
 * Precedence:
 *  1. a non-empty refer_doi is passed straight to filterValidDoi();
 *  2. otherwise the part after "doi.org/" in doilink (up to any "?" or "#") is used;
 *  3. otherwise doilink itself is treated as a DOI candidate.
 *
 * @param string $referDoi Raw DOI field.
 * @param string $doilink  DOI link, e.g. https://doi.org/10.xxxx/xxxx.
 * @return string Result of filterValidDoi() for the chosen candidate, or '' when both inputs are blank.
 */
private function extractDoiFromMeta(string $referDoi, string $doilink): string
{
    $doi = trim($referDoi);
    if (!empty($doi)) {
        return $this->filterValidDoi($doi);
    }

    $link = trim($doilink);
    if ($link === '') {
        return '';
    }

    // Typical form: https://doi.org/10.xxxx/xxxx or http://dx.doi.org/...
    // "~" is the delimiter on purpose: the character class contains "#", and PHP ends a pattern
    // at the first unescaped delimiter even inside a class. With "#" as delimiter the pattern
    // was cut short ("Unknown modifier ']'") and this branch never matched.
    if (preg_match('~doi\.org/([^?#]+)~i', $link, $m) === 1) {
        return $this->filterValidDoi(trim((string)$m[1]));
    }

    // Fallback: doilink may already be a bare DOI.
    return $this->filterValidDoi($link);
}
|
||||
|
||||
/**
 * Build the evidence tokens of a reference, used for rough hit/similarity checks against a
 * citing context.
 *
 * @param array $src Reference fields; the keys 'title', 'author', 'journal' and 'year' are read.
 * @return array [
 *   'titleTokens'   => string[] title tokens, stop words and tokens shorter than 4 chars removed,
 *   'authorTokens'  => string[] tokens from extractAuthorTokens(), stop words removed,
 *   'journalTokens' => string[] journal tokens, filtered like the title tokens,
 *   'yearToken'     => string   first 19xx/20xx found in 'year', or ''
 * ]
 */
private function buildEvidenceTokens(array $src): array
{
    $stopWords = [
        'the','a','an','and','or','of','in','on','for','with','to','from','by','at','as','is','are',
        'was','were','be','been','being','that','this','these','those','which','who','whom','it','its',
        'we','our','us','they','their','them','i','you','your','he','she','his','her',
        'study','studies','report','reported','reports','model','models','analysis','analyses','method','methods',
        'results','result','using','used','show','shown','demonstrated','demonstrate',
    ];

    // A token counts as a keyword when it has at least 4 characters and is not a stop word.
    $isKeyword = static function ($token) use ($stopWords) {
        return mb_strlen($token) >= 4 && !in_array($token, $stopWords, true);
    };

    $titleTokens = array_values(array_filter(
        array_unique($this->tokenize((string)($src['title'] ?? ''))),
        $isKeyword
    ));

    $authorTokens = [];
    foreach ($this->extractAuthorTokens((string)($src['author'] ?? '')) as $rawName) {
        $name = trim($rawName);
        // '' and '0' are the values a callback-less array_filter() would discard.
        if ($name === '' || $name === '0' || in_array($name, $stopWords, true)) {
            continue;
        }
        $authorTokens[] = $name;
    }
    $authorTokens = array_values(array_unique($authorTokens));

    $journalTokens = array_values(array_filter(
        array_unique($this->tokenize((string)($src['journal'] ?? ''))),
        $isKeyword
    ));

    // First four-digit year (19xx / 20xx) found in the raw year field.
    $yearToken = preg_match('/(19\d{2}|20\d{2})/', (string)($src['year'] ?? ''), $hit)
        ? (string)$hit[1]
        : '';

    return [
        'titleTokens' => $titleTokens,
        'authorTokens' => $authorTokens,
        'journalTokens' => $journalTokens,
        'yearToken' => $yearToken,
    ];
}
|
||||
|
||||
/**
 * Extract lower-cased surname tokens from an author string (simplified heuristic).
 *
 * The string is split on ",", ";" and " and ". For every part the surname is taken to be the
 * last word once trailing initials have been removed, so both "John Smith" and the
 * surname-first style "Smith J" / "Smith J.A." yield "smith". Previously the last word was
 * used unconditionally, which picked the initials for surname-first lists and left no
 * usable token.
 *
 * @param string $authorStr Raw author list, e.g. "Smith J, Johnson AB and Williams K".
 * @return array Unique tokens of at least 4 characters, in order of first appearance.
 */
private function extractAuthorTokens(string $authorStr): array
{
    $authorStr = trim($authorStr);
    if ($authorStr === '') {
        return [];
    }

    $tokens = [];
    foreach (preg_split('/[,;]| and /i', $authorStr) as $part) {
        $part = trim((string)$part);
        if ($part === '') {
            continue;
        }

        $words = preg_split('/\s+/', $part);

        // Strip trailing initials (1-3 capitals, optionally dotted/hyphenated: "J", "JA",
        // "J.A.", "J-P") so the surname ends up last. At least one word is always kept.
        while (count($words) > 1 && preg_match('/^(?:[A-Z][.\-]*){1,3}$/', (string)end($words)) === 1) {
            array_pop($words);
        }

        // Keep only ASCII letters, dots and hyphens, then lower-case.
        $cand = strtolower((string)preg_replace('/[^A-Za-z\.\-]/', '', (string)end($words)));
        if ($cand !== '') {
            $tokens[] = $cand;
        }
    }

    // Drop tokens that are too short to be a reliable signal.
    return array_values(array_filter(array_unique($tokens), function ($t) {
        return mb_strlen($t) >= 4;
    }));
}
|
||||
|
||||
/**
 * Lightweight tokenizer for English text.
 *
 * Lower-cases the input and returns every distinct run of ASCII letters/digits that is at
 * least 3 characters long, in order of first appearance.
 *
 * @param string $text
 * @return array List of unique tokens; empty for blank input.
 */
private function tokenize(string $text): array
{
    $normalized = strtolower(trim($text));
    if ($normalized === '') {
        return [];
    }

    // Maximal alphanumeric runs of length >= 3; shorter runs carry too little information.
    preg_match_all('/[a-z0-9]{3,}/', $normalized, $found);

    return array_values(array_unique($found[0]));
}
|
||||
|
||||
/**
 * Split English text into sentences (lightweight heuristic).
 *
 * Whitespace runs (including newlines) are collapsed to a single space, then the text is cut
 * after every ".", "?" or "!" that is followed by whitespace. Bracketed markers such as "[3]"
 * stay inside their sentence because only whitespace after end punctuation triggers a cut.
 *
 * @param string $text
 * @return array List of trimmed, non-empty sentences.
 */
private function splitEnglishSentences(string $text): array
{
    $text = trim($text);
    if ($text === '') {
        return [];
    }

    // Normalise line breaks/whitespace first so they cannot break a sentence apart.
    $flat = preg_replace('/\s+/u', ' ', $text);
    if ($flat === null) {
        // The /u pattern fails (returns null) on malformed UTF-8; retry byte-wise instead of
        // silently producing no sentences.
        $flat = preg_replace('/\s+/', ' ', $text);
    }

    // Cut after sentence-ending punctuation followed by whitespace.
    $parts = preg_split('/(?<=[\.\?\!])\s+/', (string)$flat);
    if ($parts === false) {
        return [];
    }

    $sentences = [];
    foreach ($parts as $part) {
        $part = trim((string)$part);
        if ($part !== '') {
            $sentences[] = $part;
        }
    }

    return $sentences;
}
|
||||
}
|
||||
|
||||
237
application/common/PubmedService.php
Normal file
237
application/common/PubmedService.php
Normal file
@@ -0,0 +1,237 @@
|
||||
<?php
|
||||
|
||||
namespace app\common;
|
||||
|
||||
/**
 * PubMed helper built on the NCBI E-utilities HTTP API.
 *
 * Features:
 *  - DOI  -> PMID (esearch)
 *  - PMID -> structured article data (title / abstract / mesh / publication_types / year / journal)
 *            via efetch
 *
 * Results are cached as JSON files under runtime/pubmed_cache so repeated lookups do not
 * hit NCBI again.
 */
class PubmedService
{
    /** @var string E-utilities base URL, always ending in "/". */
    private $base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';

    /** @var int Total request timeout in seconds. */
    private $timeout = 20;

    /** @var string Value of the "tool" request parameter. */
    private $tool = 'tmrjournals';

    /** @var string Value of the "email" request parameter; omitted from requests when empty. */
    private $email = '';

    /** @var bool Whether the TLS certificate of the remote host is verified. */
    private $verifySsl = true;

    /**
     * @param array $config Optional overrides:
     *                      - base (string)     API base URL
     *                      - timeout (int)     request timeout in seconds, minimum 5
     *                      - tool (string)     "tool" request parameter
     *                      - email (string)    "email" request parameter
     *                      - verify_ssl (bool) verify the TLS peer certificate; default true
     */
    public function __construct(array $config = [])
    {
        if (isset($config['base'])) {
            $this->base = rtrim((string)$config['base'], '/') . '/';
        }
        if (isset($config['timeout'])) {
            $this->timeout = max(5, intval($config['timeout']));
        }
        if (isset($config['tool'])) {
            $this->tool = (string)$config['tool'];
        }
        if (isset($config['email'])) {
            $this->email = (string)$config['email'];
        }
        if (isset($config['verify_ssl'])) {
            $this->verifySsl = (bool)$config['verify_ssl'];
        }
    }

    /**
     * DOI -> PMID. Searches with the [DOI] field tag first and falls back to [AID].
     *
     * @param string $doi
     * @return string|null The PMID, or null when the DOI is blank or nothing was found.
     */
    public function doiToPmid(string $doi): ?string
    {
        $doi = trim($doi);
        if ($doi === '') {
            return null;
        }

        // The DOI is hashed so the cache key is always a safe file name.
        $cacheKey = 'doi2pmid_' . sha1(strtolower($doi));
        $cached = $this->cacheGet($cacheKey, 30 * 86400);
        if (is_string($cached) && $cached !== '') {
            return $cached;
        }

        $pmid = $this->esearch($doi . '[DOI]') ?? $this->esearch($doi . '[AID]');
        if ($pmid === null) {
            return null;
        }

        $this->cacheSet($cacheKey, $pmid);
        return $pmid;
    }

    /**
     * PMID -> article data (title / abstract / mesh_terms / publication_types / journal / year).
     *
     * @param string $pmid Numeric PubMed identifier.
     * @return array|null Parsed article data, or null when the PMID is not numeric, the request
     *                    failed, or the response contained neither a title nor an abstract.
     */
    public function fetchByPmid(string $pmid): ?array
    {
        $pmid = trim($pmid);
        // The PMID becomes part of the cache file name and of the request, so anything that is
        // not purely digits (e.g. "../x") is rejected up front.
        if (preg_match('/^[0-9]+\z/', $pmid) !== 1) {
            return null;
        }

        $cacheKey = 'pmid_' . $pmid;
        $cached = $this->cacheGet($cacheKey, 30 * 86400);
        if (is_array($cached)) {
            return $cached;
        }

        $xml = $this->httpGet($this->buildUrl('efetch.fcgi', [
            'db' => 'pubmed',
            'id' => $pmid,
            'retmode' => 'xml',
        ]));
        if (trim($xml) === '') {
            return null;
        }

        $data = $this->parseEfetchXml($xml);
        if (!$data) {
            return null;
        }

        $this->cacheSet($cacheKey, $data);
        return $data;
    }

    /**
     * DOI -> PubMed article data (including abstract and MeSH terms).
     *
     * @param string $doi
     * @return array|null Result of fetchByPmid() extended with 'pmid' and 'doi', or null when
     *                    either lookup step fails.
     */
    public function fetchByDoi(string $doi): ?array
    {
        $pmid = $this->doiToPmid($doi);
        if (!$pmid) {
            return null;
        }

        $info = $this->fetchByPmid($pmid);
        if (!$info) {
            return null;
        }

        $info['pmid'] = $pmid;
        $info['doi'] = $doi;
        return $info;
    }

    // ----------------- Internals -----------------

    /**
     * Build an E-utilities URL, appending the tool/email parameters and dropping blank values.
     */
    private function buildUrl(string $endpoint, array $params): string
    {
        $params['tool'] = $this->tool;
        $params['email'] = $this->email;
        $params = array_filter($params, static function ($value) {
            return $value !== null && $value !== '';
        });

        return $this->base . $endpoint . '?' . http_build_query($params);
    }

    /**
     * Run an esearch query against the pubmed database and return the first id of the result.
     *
     * @return string|null First entry of esearchresult.idlist, or null when there is none.
     */
    private function esearch(string $term): ?string
    {
        $res = $this->httpGet($this->buildUrl('esearch.fcgi', [
            'db' => 'pubmed',
            'retmode' => 'json',
            'retmax' => 1,
            'term' => $term,
        ]));

        $json = json_decode($res, true);
        $ids = $json['esearchresult']['idlist'] ?? [];
        if (!empty($ids[0])) {
            return (string)$ids[0];
        }
        return null;
    }

    /**
     * Parse an efetch XML response into a flat array.
     *
     * @return array|null Null when the XML cannot be parsed or holds neither title nor abstract.
     */
    private function parseEfetchXml(string $xml): ?array
    {
        if (trim($xml) === '') {
            return null;
        }

        // Collect libxml errors internally while parsing, then clear the buffer and restore the
        // caller's setting so this global switch does not leak out of the method.
        $previous = libxml_use_internal_errors(true);
        try {
            $doc = new \DOMDocument();
            // LIBXML_NONET: never fetch external resources while parsing a remote response.
            $loaded = $doc->loadXML($xml, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
        if (!$loaded) {
            return null;
        }
        $xp = new \DOMXPath($doc);

        $title = $this->xpText($xp, '//PubmedArticle//ArticleTitle');

        // Structured abstracts carry a Label attribute per section ("BACKGROUND", ...).
        $abstractParts = [];
        $absNodes = $xp->query('//PubmedArticle//Abstract//AbstractText');
        if ($absNodes) {
            foreach ($absNodes as $node) {
                $txt = trim($node->textContent);
                if ($txt === '') {
                    continue;
                }
                $label = $node instanceof \DOMElement ? trim($node->getAttribute('Label')) : '';
                $abstractParts[] = $label ? ($label . ': ' . $txt) : $txt;
            }
        }
        $abstract = trim(implode("\n", $abstractParts));

        $mesh = $this->xpTexts($xp, '//PubmedArticle//MeshHeadingList//MeshHeading//DescriptorName');
        $pubTypes = $this->xpTexts($xp, '//PubmedArticle//PublicationTypeList//PublicationType');
        $journal = $this->xpText($xp, '//PubmedArticle//Journal//Title');

        // Prefer the explicit <Year>; otherwise pull a 19xx/20xx year out of <MedlineDate>.
        $year = $this->xpText($xp, '//PubmedArticle//JournalIssue//PubDate//Year');
        if ($year === '') {
            $medlineDate = $this->xpText($xp, '//PubmedArticle//JournalIssue//PubDate//MedlineDate');
            if (preg_match('/(19\\d{2}|20\\d{2})/', $medlineDate, $m)) {
                $year = $m[1];
            }
        }

        if ($title === '' && $abstract === '') {
            return null;
        }

        return [
            'title' => $title,
            'abstract' => $abstract,
            'mesh_terms' => $mesh,
            'publication_types' => $pubTypes,
            'journal' => $journal,
            'year' => $year,
        ];
    }

    /**
     * Trimmed text content of the first node matching the XPath query, or '' when none matches.
     */
    private function xpText(\DOMXPath $xp, string $query): string
    {
        $nodes = $xp->query($query);
        if ($nodes && $nodes->length > 0) {
            return trim($nodes->item(0)->textContent);
        }
        return '';
    }

    /**
     * Unique, non-empty trimmed text contents of all nodes matching the XPath query.
     */
    private function xpTexts(\DOMXPath $xp, string $query): array
    {
        $texts = [];
        $nodes = $xp->query($query);
        if ($nodes) {
            foreach ($nodes as $node) {
                $text = trim($node->textContent);
                if ($text !== '') {
                    $texts[] = $text;
                }
            }
        }
        return array_values(array_unique($texts));
    }

    /**
     * Perform a GET request.
     *
     * @return string Response body, or '' on transport failure or an HTTP status of 400 and above.
     */
    private function httpGet(string $url): string
    {
        $ch = curl_init();
        if ($ch === false) {
            return '';
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
        // Certificate verification stays on unless explicitly disabled through 'verify_ssl'.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $this->verifySsl);
        curl_setopt($ch, CURLOPT_HTTPHEADER, [
            'User-Agent: TMRjournals-PubMed/1.0'
        ]);

        $res = curl_exec($ch);
        $status = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        // An error page must not be handed to the parsers (or cached) as if it were data.
        if (!is_string($res) || $status >= 400) {
            return '';
        }
        return $res;
    }

    /**
     * Directory that holds the cache files.
     */
    private function cacheDir(): string
    {
        return rtrim(ROOT_PATH, '/') . '/runtime/pubmed_cache';
    }

    /**
     * Read a cached value.
     *
     * @return mixed Decoded JSON value, or null when the entry is missing, older than
     *               $ttlSeconds, or not valid JSON.
     */
    private function cacheGet(string $key, int $ttlSeconds)
    {
        $file = $this->cacheDir() . '/' . $key . '.json';
        if (!is_file($file)) {
            return null;
        }

        $mtime = filemtime($file);
        if (!$mtime || (time() - $mtime) > $ttlSeconds) {
            return null;
        }

        $raw = @file_get_contents($file);
        return json_decode((string)$raw, true);
    }

    /**
     * Store a value as JSON. Best effort: write failures are deliberately ignored because the
     * cache is only an optimisation.
     */
    private function cacheSet(string $key, $value): void
    {
        $dir = $this->cacheDir();
        if (!is_dir($dir)) {
            @mkdir($dir, 0777, true);
        }

        $file = $dir . '/' . $key . '.json';
        // LOCK_EX keeps concurrent writers from interleaving their output.
        @file_put_contents($file, json_encode($value, JSON_UNESCAPED_UNICODE), LOCK_EX);
    }
}
|
||||
|
||||
Reference in New Issue
Block a user