From a802b2e92377cde34770762f836ad2cfb6a92822 Mon Sep 17 00:00:00 2001 From: wangjinlei <751475802@qq.com> Date: Fri, 3 Apr 2026 11:45:45 +0800 Subject: [PATCH] =?UTF-8?q?=E8=87=AA=E5=8A=A8=E6=8E=A8=E5=B9=BF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- application/api/controller/Article.php | 59 +- application/api/controller/Contribute.php | 18 + application/api/controller/Crossrefdoi.php | 16 + application/api/controller/EmailClient.php | 2 + application/api/controller/ExpertManage.php | 12 +- application/api/controller/References.php | 695 ++++++++++++++++ application/api/job/FetchExperts.php | 24 +- application/common/ArticleParserService.php | 117 +++ .../common/CitationRelevanceService.php | 331 ++++++++ application/common/CrossrefService.php | 765 ++++++++++++++++++ application/common/PubmedService.php | 237 ++++++ 11 files changed, 2240 insertions(+), 36 deletions(-) create mode 100644 application/api/controller/References.php create mode 100644 application/common/CitationRelevanceService.php create mode 100644 application/common/CrossrefService.php create mode 100644 application/common/PubmedService.php diff --git a/application/api/controller/Article.php b/application/api/controller/Article.php index 8ab5d3b..e47a047 100644 --- a/application/api/controller/Article.php +++ b/application/api/controller/Article.php @@ -8,6 +8,8 @@ use think\Queue; use think\Validate; use PhpOffice\PhpWord\IOFactory; use app\common\OpenAi; +use app\common\CrossrefService; +use app\common\PubmedService; /** * @title 文章接口 @@ -17,9 +19,17 @@ class Article extends Base { + /** + * @var CrossrefService + */ + private $crossService; + private $pubmedService; + public function __construct(\think\Request $request = null) { parent::__construct($request); + $this->crossService = new CrossrefService(); + $this->pubmedService = new PubmedService(); } @@ -169,16 +179,6 @@ class Article extends Base return jsonSuccess($re); } - - public 
function myttt() - { - $res = $this->addProductionEx("3689"); - echo "
";
-        var_dump($res);
-        echo "
"; - die; - } - /**获取预接收内容状态 * @return void */ @@ -1053,6 +1053,36 @@ class Article extends Base } } + + public function testCheckArticleCitation() + { + $data = $this->request->post(); + $rule = new Validate([ + "article_id"=>"require" + ]); + if(!$rule->check($data)){ + return jsonError($rule->getError()); + } + $mains = $this->article_main_obj->where("article_id",$data['article_id'])->where("state",0)->select(); + $production = $this->production_article_obj->where("article_id",$data['article_id'])->find(); + $refers = $this->production_article_refer_obj->where("p_article_id",$production['p_article_id'])->where("state",0)->order("index asc")->select(); + $res = $this->crossService->qcArticleCitations($mains,$refers); + return jsonSuccess(['res'=>$res]); + } + + public function testCheckArticlePubmed() + { + $data = $this->request->post(); + $rule = new Validate([ + "doi"=>"require" + ]); + if(!$rule->check($data)){ + return jsonError($rule->getError()); + } + $res = $this->pubmedService->fetchByDoi($data['doi']); + return jsonSuccess(['res'=>$res]); + } + /** * @title 发送留言板消息 * @description 发送留言板消息 @@ -3940,7 +3970,14 @@ class Article extends Base public function ffff(){ $data = $this->request->post(); - $this->ai_scor($data['article_id']); + $rule = new Validate([ + "doi"=>"require" + ]); + if(!$rule->check($data)){ + return jsonError($rule->getError()); + } + $res = $this->crossService->fetchWorkSummary($data['doi']); + return jsonSuccess($res); } diff --git a/application/api/controller/Contribute.php b/application/api/controller/Contribute.php index fa51163..e5b4d77 100644 --- a/application/api/controller/Contribute.php +++ b/application/api/controller/Contribute.php @@ -6,6 +6,7 @@ use app\api\controller\Base; use think\Db; use PhpOffice\PhpWord\IOFactory; use think\Exception; +use think\Validate; use \app\common\ArticleParserService; /** * @title 自动投稿控制器 @@ -102,6 +103,23 @@ class Contribute extends Base return $result; } + + + public function 
myTestArticle(){ + $data = $this->request->post(); + $rule = new Validate([ + 'article_id'=>"require" + ]); + if(!$rule->check($data)){ + return jsonError($rule->getError()); + } + $files = $this->article_file_obj->where("article_id",$data['article_id'])->where("type_name","manuscirpt")->order("file_id desc")->limit(1)->select(); + $sFileUrl =$files[0]['file_url']; + $sFileUrl = rtrim(ROOT_PATH,'/').'/public/'.ltrim(ltrim($sFileUrl,'/'),'public'); + $res = ArticleParserService::getReferencesFromWord($sFileUrl); + return jsonSuccess($res); + } + /** * 组装数据插入相关数据表 * @param array $aParam diff --git a/application/api/controller/Crossrefdoi.php b/application/api/controller/Crossrefdoi.php index 8c17c6e..62e792c 100644 --- a/application/api/controller/Crossrefdoi.php +++ b/application/api/controller/Crossrefdoi.php @@ -2,6 +2,7 @@ namespace app\api\controller; use app\api\controller\Base; use think\Db; +use think\Validate; class Crossrefdoi extends Base{ public function __construct(\think\Request $request = null) { @@ -99,6 +100,21 @@ class Crossrefdoi extends Base{ return json_encode(['status' => 1,'msg' => 'Update successful']); } + + public function getOneDoi(){ + $data = $this->request->post(); + $rule = new Validate([ + "doi"=>"require" + ]); + if(!$rule->check($data)){ + return jsonError($rule->getError()); + } + $sCheckDoi = $this->fetchSingleDoiWithRetry($this->filterValidDoi($data['doi'])); // 过滤非法DOI + return jsonSuccess($sCheckDoi); + } + + + /** * 过滤非法DOI(仅保留10.xxxx/xxx格式) */ diff --git a/application/api/controller/EmailClient.php b/application/api/controller/EmailClient.php index cec596a..512e346 100644 --- a/application/api/controller/EmailClient.php +++ b/application/api/controller/EmailClient.php @@ -2335,6 +2335,8 @@ class EmailClient extends Base ->where('jpf.state', 0) ->where('ef_fetch.state', 0) ->column('ef_fetch.field'); + + $fields = array_unique(array_filter(array_map('trim', $fields))); if (empty($fields)) { diff --git 
a/application/api/controller/ExpertManage.php b/application/api/controller/ExpertManage.php index a92423c..55cd51f 100644 --- a/application/api/controller/ExpertManage.php +++ b/application/api/controller/ExpertManage.php @@ -43,28 +43,36 @@ class ExpertManage extends Base $pageSize = max(1, intval(isset($data['pageSize']) ? $data['pageSize'] : 20)); $query = Db::name('expert')->alias('e'); + $countQuery = Db::name('expert')->alias('e'); $needJoin = ($field !== ''); if ($needJoin) { $query->join('t_expert_field ef', 'ef.expert_id = e.expert_id AND ef.state = 0', 'inner'); + $countQuery->join('t_expert_field ef', 'ef.expert_id = e.expert_id AND ef.state = 0', 'inner'); if ($field !== '') { $query->where('ef.field', 'like', '%' . $field . '%'); + $countQuery->where('ef.field', 'like', '%' . $field . '%'); } $query->group('e.expert_id'); + $countQuery->group('e.expert_id'); } if ($state !== '-1' && $state !== '') { $query->where('e.state', intval($state)); + $countQuery->where('e.state', intval($state)); } if ($keyword !== '') { $query->where('e.name|e.email|e.affiliation', 'like', '%' . $keyword . '%'); + $countQuery->where('e.name|e.email|e.affiliation', 'like', '%' . $keyword . '%'); } if ($source !== '') { $query->where('e.source', $source); + $countQuery->where('e.source', $source); } - $countQuery = clone $query; - $total = $countQuery->distinct('e.expert_id')->count(); +// $countQuery = clone $query; +// $total = $countQuery->distinct('e.expert_id')->count(); + $total = $needJoin ? count($countQuery->group('e.expert_id')->column('e.expert_id')) : $countQuery->count(); $list = $query ->field('e.*') diff --git a/application/api/controller/References.php b/application/api/controller/References.php new file mode 100644 index 0000000..9e5eb26 --- /dev/null +++ b/application/api/controller/References.php @@ -0,0 +1,695 @@ +request->post() : $aParam; + + //必填值验证 + $iPReferId = empty($aParam['p_refer_id']) ? 
'' : $aParam['p_refer_id']; + if(empty($iPReferId)){ + return json_encode(['status' => 2,'msg' => 'Please select the reference to be queried']); + } + $aWhere = ['p_refer_id' => $iPReferId,'state' => 0]; + $aRefer = Db::name('production_article_refer')->where($aWhere)->find(); + if(empty($aRefer)){ + return json_encode(['status' => 4,'msg' => 'Reference is empty']); + } + //获取文章信息 + $aParam['p_article_id'] = $aRefer['p_article_id']; + $aArticle = $this->getArticle($aParam); + $iStatus = empty($aArticle['status']) ? 0 : $aArticle['status']; + if($iStatus != 1){ + return json_encode($aArticle); + } + $aArticle = empty($aArticle['data']) ? [] : $aArticle['data']; + if(empty($aArticle)){ + return json_encode(['status' => 3,'msg' => 'The article does not exist']); + } + + //获取参考文献信息作者名.文章题目.期刊名缩写.年卷页.Available at: //https://doi.org/xxxxx + //作者 + $sData = $aRefer['refer_frag']; + if($aRefer['refer_type'] == 'journal'){ + if(!empty($aRefer['doilink'])){ + $sAuthor = empty($aRefer['author']) ? '' : trim(trim($aRefer['author']),'.'); + if(!empty($sAuthor)){ + $aAuthor = explode(',', $sAuthor); + if(count($aAuthor) > 3){ + $sAuthor = implode(',', array_slice($aAuthor, 0,3)); + $sAuthor .= ', et al'; + } + if(count($aAuthor) <= 3 ){ + $sAuthor = implode(',', $aAuthor); + } + } + //文章标题 + $sTitle = empty($aRefer['title']) ? '' : trim(trim($aRefer['title']),'.'); + //期刊名缩写 + $sJoura = empty($aRefer['joura']) ? '' : trim(trim($aRefer['joura']),'.'); + //年卷页 + $sDateno = empty($aRefer['dateno']) ? '' : trim(trim($aRefer['dateno']),'.'); + //DOI + $sDoilink = empty($aRefer['doilink']) ? '' : trim($aRefer['doilink']); + if(!empty($sDoilink)){ + $sDoilink = strpos($sDoilink ,"http")===false ? "https://doi.org/".$sDoilink : $sDoilink; + $sDoilink = str_replace('http://doi.org/', 'https://doi.org/', $sDoilink); + } + $sReferDoi = empty($aRefer['refer_doi']) ? '' : trim($aRefer['refer_doi']); + if(!empty($sReferDoi)){ + $sReferDoi = strpos($sReferDoi ,"http")===false ? 
"https://doi.org/".$sReferDoi : $sReferDoi; + $sReferDoi = str_replace('http://doi.org/', 'https://doi.org/', $sReferDoi); + } + $sDoilink = empty($sDoilink) ? $sReferDoi : $sDoilink; + + $sData = $sAuthor.'.'.$sTitle.'.'.$sJoura.'.'.$sDateno.".Available at:\n".$sDoilink; + } + } + if($aRefer['refer_type'] == 'book'){ + $sAuthor = empty($aRefer['author']) ? '' : trim(trim($aRefer['author']),'.'); + if(!empty($sAuthor)){ + $aAuthor = explode(',', $sAuthor); + if(count($aAuthor) > 3){ + $sAuthor = implode(',', array_slice($aAuthor, 0,3)); + $sAuthor .= ', et al'; + } + if(count($aAuthor) <= 3 ){ + $sAuthor = implode(',', $aAuthor); + } + } + //文章标题 + $sTitle = empty($aRefer['title']) ? '' : trim(trim($aRefer['title']),'.'); + //期刊名缩写 + $sJoura = empty($aRefer['joura']) ? '' : trim(trim($aRefer['joura']),'.'); + //年卷页 + $sDateno = empty($aRefer['dateno']) ? '' : trim(trim($aRefer['dateno']),'.'); + //DOI + $sDoilink = empty($aRefer['isbn']) ? '' : trim($aRefer['isbn']); + + $sData = $sAuthor.'.'.$sTitle.'.'.$sJoura.'.'.$sDateno.".Available at:\n".$sDoilink; + } + $aRefer['deal_content'] = $sData; + return json_encode(['status' => 1,'msg' => 'success','data' => $aRefer]); + } + + /** + * 参考文献鉴别:正文引用上下文 + PubMed/Crossref + 大模型向量相似度 + * 参数:p_refer_id(必填) + * 环境变量(可选):citation_chat_url、citation_chat_model、citation_chat_api_key、citation_chat_timeout、crossref_mailto、pubmed_email + */ + public function checkCitationRelevance($aParam = []) + { + $aParam = empty($aParam) ? $this->request->post() : $aParam; + $pReferId = intval(isset($aParam['p_refer_id']) ? $aParam['p_refer_id'] : 0); + if (!$pReferId) { + return jsonError('p_refer_id is required'); + } + + $refer = Db::name('production_article_refer') + ->where('p_refer_id', $pReferId) + ->where('state', 0) + ->find(); + if (empty($refer)) { + return jsonError('Reference not found'); + } + + $aArticle = $this->getArticle(['p_article_id' => $refer['p_article_id']]); + $iStatus = empty($aArticle['status']) ? 
0 : $aArticle['status']; + if ($iStatus != 1) { + return json_encode($aArticle); + } + $aArticle = empty($aArticle['data']) ? [] : $aArticle['data']; + if (empty($aArticle['article_id'])) { + return jsonError('Article not found'); + } + + $articleId = intval($aArticle['article_id']); + $mains = Db::name('article_main') + ->where('article_id', $articleId) + ->whereIn('state', [0, 2]) + ->order('sort asc') + ->select(); + if (empty($mains)) { + return jsonError('article_main is empty'); + } + + $citationMark = intval($refer['index']) + 1; + $context = $this->extractCitationContextFromMains($mains, $citationMark); + if ($context === '') { + return jsonError('Citation context not found in article_main for mark [' . $citationMark . ']'); + } + + $apiKey = trim((string)Env::get('citation_chat_api_key', '')); + if ($apiKey === '') { + return jsonError('Please set env citation_chat_api_key for embedding via chat'); + } + + $config = [ + 'chat_url' => trim((string)Env::get('citation_chat_url', 'http://chat.taimed.cn/v1/chat/completions')), + 'chat_model' => trim((string)Env::get('citation_chat_model', 'DeepSeek-Coder-V2-Instruct')), + 'timeout' => max(60, intval(Env::get('citation_chat_timeout', 180))), + 'embedding_dim' => max(32, intval(Env::get('citation_embedding_dim', 256))), + 'embedding_headers' => [ + 'Authorization: Bearer ' . $apiKey, + ], + ]; + + $pubmed = new PubmedService([ + 'email' => trim((string)Env::get('pubmed_email', '')), + 'tool' => trim((string)Env::get('pubmed_tool', 'tmrjournals')), + ]); + $crossref = new CrossrefService([ + 'mailto' => trim((string)Env::get('crossref_mailto', '')), + ]); + $svc = new CitationRelevanceService($pubmed, $crossref, $config); + + $qc = $svc->checkOne($context, $refer, []); + + return jsonSuccess([ + 'p_refer_id' => $pReferId, + 'citation_mark' => $citationMark, + 'refer_index' => intval($refer['index']), + 'context' => $context, + 'problem_flag' => $qc['problem_flag'] ?? 
'', + 'problem_reason' => $qc['problem_reason'] ?? '', + 'relevance_flag' => $qc['relevance_flag'] ?? '', + 'relevance_score'=> $qc['relevance_score'] ?? 0, + 'reason' => $qc['reason'] ?? '', + 'pubmed' => $qc['pubmed'] ?? [], + ]); + } + + /** + * 从 t_article_main 拼接正文,按 [n] 定位句子并取前后各 1 句作为上下文 + */ + private function extractCitationContextFromMains(array $mains, int $citationMark): string + { + if ($citationMark <= 0) { + return ''; + } + $chunks = []; + foreach ($mains as $row) { + $text = isset($row['content']) ? (string)$row['content'] : ''; + if ($text === '') { + continue; + } + $text = preg_replace('/<\s*\/?\s*blue[^>]*>/i', '', $text); + $text = strip_tags($text); + $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8'); + $text = preg_replace('/\s+/u', ' ', trim($text)); + if ($text !== '') { + $chunks[] = $text; + } + } + $fullText = implode("\n", $chunks); + if ($fullText === '') { + return ''; + } + + $sentences = $this->splitEnglishSentences($fullText); + $pattern = '/\[' . preg_quote((string)$citationMark, '/') . '\]/'; + foreach ($sentences as $si => $sent) { + if (!preg_match($pattern, $sent)) { + continue; + } + $start = max(0, $si - 1); + $end = min(count($sentences) - 1, $si + 1); + $ctx = implode(' ', array_slice($sentences, $start, $end - $start + 1)); + return trim(preg_replace('/\s+/u', ' ', $ctx)); + } + return ''; + } + + private function splitEnglishSentences(string $text): array + { + $text = trim($text); + if ($text === '') { + return []; + } + $text = preg_replace('/\s+/u', ' ', $text); + $parts = preg_split('/(?<=[\.\?\!])\s+/', $text); + $out = []; + foreach ($parts as $p) { + $p = trim((string)$p); + if ($p !== '') { + $out[] = $p; + } + } + return $out; + } + + /** + * 修改参考文献的信息 + * @param p_refer_id 主键ID + */ + public function modify($aParam = []){ + + //获取参数 + $aParam = empty($aParam) ? $this->request->post() : $aParam; + + //必填值验证 + $iPReferId = empty($aParam['p_refer_id']) ? 
'' : $aParam['p_refer_id']; + if(empty($iPReferId)){ + return json_encode(['status' => 2,'msg' => 'Please select the reference to be queried']); + } + $sContent = empty($aParam['content']) ? '' : $aParam['content']; + if(empty($sContent)){ + return json_encode(['status' => 2,'msg' => 'Please enter the modification content']); + } + if(!is_string($sContent)){ + return json_encode(['status' => 2,'msg' => 'The content format is incorrect']); + } + + //获取参考文献信息 + $aWhere = ['p_refer_id' => $iPReferId,'state' => 0]; + $aRefer = Db::name('production_article_refer')->where($aWhere)->find(); + if(empty($aRefer)){ + return json_encode(['status' => 4,'msg' => 'Reference is empty']); + } + + //获取文章信息 + $aParam['p_article_id'] = $aRefer['p_article_id']; + $aArticle = $this->getArticle($aParam); + $iStatus = empty($aArticle['status']) ? 0 : $aArticle['status']; + if($iStatus != 1){ + return json_encode($aArticle); + } + $aArticle = empty($aArticle['data']) ? [] : $aArticle['data']; + if(empty($aArticle)){ + return json_encode(['status' => 3,'msg' => 'The article does not exist']); + } + + //数据处理 + $aContent = json_decode($this->dealContent(['content' => $sContent]),true); + $aUpdate = empty($aContent['data']) ? [] : $aContent['data']; + if(empty($aUpdate)){ + return json_encode(['status' => 5,'msg' => 'The content format is incorrect']); + } + $aUpdate['refer_content'] = $sContent; + $aUpdate['is_change'] = 1; + $aUpdate['update_time'] = time(); + //更新数据 + $aWhere = ['p_refer_id' => $iPReferId,'state' => 0]; + $result = Db::name('production_article_refer')->where($aWhere)->limit(1)->update($aUpdate); + if($result === false){ + return json_encode(['status' => 6,'msg' => 'Update failed']); + } + return json_encode(['status' => 1,'msg' => 'success']); + } + + + /** + * 处理参考文献的信息 + * @param p_refer_id 主键ID + */ + public function dealContent($aParam = []){ + //获取参数 + $aParam = empty($aParam) ? $this->request->post() : $aParam; + //必填验证 + $sContent = empty($aParam['content']) ? 
'' : $aParam['content']; + if(empty($sContent)){ + return json_encode(['status' => 2,'msg' => 'Please enter the modification content']); + } + if(!is_string($sContent)){ + return json_encode(['status' => 2,'msg' => 'The content format is incorrect']); + } + $sContent = str_replace(['?','?'], '.', $sContent); + $aContent = explode('.', $sContent); + $aUpdate = []; + if(count($aContent) > 1){ + $aField = [0 => 'author',1 => 'title', 2 => 'joura',3 => 'dateno']; + $aStart = array_slice($aContent, 0,4); + foreach ($aStart as $key => $value) { + if(empty($value)){ + continue; + } + $aUpdate[$aField[$key]] = trim(trim($value),'.'); + } + + $sDoi = empty(array_slice($aContent, 4)) ? '' : implode('.', array_slice($aContent, 4)); + // 匹配http/https开头的URL正则 + $urlPattern = '/https?:\/\/[^\s<>"]+|http?:\/\/[^\s<>"]+/i'; + // 执行匹配(preg_match_all返回所有结果) + preg_match_all($urlPattern, $sDoi, $matches); + if(!empty($matches[0])){ + $sDoi = implode(',', array_unique($matches[0])); + } + if(empty($sDoi)){ + return json_encode(['status' => 4,'msg' => 'Reference DOI is empty']); + } + $sDoi = trim(trim($sDoi),':'); + $sDoi = strpos($sDoi ,"http")===false ? "https://doi.org/".$sDoi : $sDoi; + $sDoi = str_replace('http://doi.org/', 'https://doi.org/', $sDoi); + $aUpdate['doilink'] = $sDoi; + //$doiPattern = '/10\.\d{4,9}\/[^\s\/?#&=]+/i'; + $doiPattern = '/\b10\.\d+(?:\.\d+)*\/[^\s?#&=]+/i'; + if (preg_match($doiPattern, $sDoi, $matches)) { + $aUpdate['doi'] = $matches[0]; + $aUpdate['doilink'] = 'https://doi.org/'.''.$aUpdate['doi']; + }else{ + $aUpdate['doi'] = $sDoi; + } + if(!empty($aUpdate['author'])){ + $aUpdate['author'] = trim(trim($aUpdate['author'])).'.'; + } + + } + return json_encode(['status' => 1,'msg' => 'success','data' => $aUpdate]); + } + + /** + * 获取文章信息 + */ + private function getArticle($aParam = []){ + + //获取参数 + $aParam = empty($aParam) ? $this->request->post() : $aParam; + + //获取生产文章信息 + $iPArticleId = empty($aParam['p_article_id']) ? 
0 : $aParam['p_article_id']; + if(empty($iPArticleId)){ + return ['status' => 2,'msg' => 'Please select the article to query']; + } + $aWhere = ['p_article_id' => $iPArticleId,'state' => ['in',[0,2]]]; + $aProductionArticle = Db::name('production_article')->field('article_id')->where($aWhere)->find(); + $iArticleId = empty($aProductionArticle['article_id']) ? 0 : $aProductionArticle['article_id']; + if(empty($iArticleId)) { + return ['status' => 2,'msg' => 'No articles found']; + } + + //查询条件 + $aWhere = ['article_id' => $iArticleId,'state' => ['in',[5,6]]]; + $aArticle = Db::name('article')->field('article_id')->where($aWhere)->find(); + if(empty($aArticle)){ + return ['status' => 3,'msg' => 'The article does not exist or has not entered the editorial reference status']; + } + $aArticle['p_article_id'] = $iPArticleId; + return ['status' => 1,'msg' => 'success','data' => $aArticle]; + } + /** + * AI检测 + */ + public function checkByAi($aParam = []){ + //获取参数 + $aParam = empty($aParam) ? $this->request->post() : $aParam; + + //获取文章信息 + $aArticle = $this->getArticle($aParam); + $iStatus = empty($aArticle['status']) ? 0 : $aArticle['status']; + if($iStatus != 1){ + return json_encode($aArticle); + } + $aArticle = empty($aArticle['data']) ? 
[] : $aArticle['data']; + if(empty($aArticle)){ + return json_encode(['status' => 3,'msg' => 'The article does not exist']); + } + //查询参考文献信息 + $aWhere = ['p_article_id' => $aArticle['p_article_id'],'state' => 0,'doilink' => '']; + $aRefer = Db::name('production_article_refer')->field('p_refer_id,p_article_id,refer_type,refer_content,doilink,refer_doi')->where($aWhere)->select(); + if(empty($aRefer)){ + return json_encode(['status' => 4,'msg' => 'No reference information found']); + } + //数据处理 + foreach ($aRefer as $key => $value) { + if(empty($value['refer_doi'])){ + continue; + } + if($value['refer_doi'] == 'Not Available'){ + continue; + } + if($value['refer_type'] == 'journal' && !empty($value['doilink'])){ + continue; + } + if($value['refer_type'] == 'book' && !empty($value['isbn'])){ + continue; + } + //写入获取参考文献详情队列 + \think\Queue::push('app\api\job\AiCheckReferByDoi@fire',$value,'AiCheckReferByDoi'); + } + return json_encode(['status' => 1,'msg' => 'Successfully joined the AI inspection DOI queue']); + } + /** + * 获取结果 + */ + public function getCheckByAiResult($aParam = []){ + //获取参数 + $aParam = empty($aParam) ? $this->request->post() : $aParam; + + //必填值验证 + $iPReferId = empty($aParam['p_refer_id']) ? '' : $aParam['p_refer_id']; + if(empty($iPReferId)){ + return json_encode(['status' => 2,'msg' => 'Please select the reference to be queried']); + } + //获取参考文献信息 + $aWhere = ['p_refer_id' => $iPReferId,'state' => 0]; + $aRefer = Db::name('production_article_refer')->field('p_refer_id,p_article_id,refer_type,refer_content,doilink,refer_doi,state,dateno')->where($aWhere)->find(); + if(empty($aRefer)){ + return json_encode(['status' => 4,'msg' => 'Reference is empty'.json_encode($aParam)]); + } + if(empty($aRefer['refer_doi'])){ + return json_encode(['status' => 4,'msg' => 'Reference DOI is empty'.json_encode($aParam)]); + } + if($aRefer['refer_type'] == 'journal' && !empty($aRefer['doilink'])){ + $aDateno = empty($aRefer['dateno']) ? 
[] : explode(':', $aRefer['dateno']); + if(count($aDateno) > 1){ + return json_encode(['status' => 4,'msg' => 'No need to parse again-journal'.json_encode($aParam)]); + } + } + if($aRefer['refer_type'] == 'book' && !empty($aRefer['isbn'])){ + return json_encode(['status' => 4,'msg' => 'No need to parse again-book'.json_encode($aParam)]); + } + //获取文章信息 + $aParam['p_article_id'] = $aRefer['p_article_id']; + $aArticle = $this->getArticle($aParam); + $iStatus = empty($aArticle['status']) ? 0 : $aArticle['status']; + if($iStatus != 1){ + return json_encode($aArticle); + } + $aArticle = empty($aArticle['data']) ? [] : $aArticle['data']; + if(empty($aArticle)){ + return json_encode(['status' => 3,'msg' => 'The article does not exist']); + } + + //请求AI获取结果 + $aResult = $this->curlOpenAIByDoi(['doi' => $aRefer['refer_doi']]); + $iStatus = empty($aResult['status']) ? 0 : $aResult['status']; + $sMsg = empty($aResult['msg']) ? 'The DOI number AI did not find any relevant information' : $aResult['msg']; + if($iStatus != 1){ + return json_encode(['status' => 4,'msg' => $sMsg]); + } + $aData = empty($aResult['data']) ? [] : $aResult['data']; + if(empty($aData)){ + return json_encode(['status' => 5,'msg' => 'AI obtains empty data']); + } + //写入日志 + $aLog = []; + $aLog['content'] = json_encode($aResult); + $aLog['update_time'] = time(); + $aLog['p_refer_id'] = $iPReferId; + $iLogId = Db::name('production_article_refer_ai')->insertGetId($aLog); + $iIsAiCheck = empty($aData['is_ai_check']) ? 
2 : $aData['is_ai_check']; + if($iIsAiCheck != 1){//AI未检测到信息 + return json_encode(['status' => 6,'msg' => 'AI did not find any information'.json_encode($aParam)]); + } + + //数据处理入库 + $aField = ['author','title','joura','dateno','doilink']; + foreach ($aField as $key => $value) { + if(empty($aData[$value])){ + continue; + } + if($value == 'author'){ + $aUpdate['author'] = implode(',', $aData['author']); + // $aUpdate['author'] = str_replace('et al.', '', $aUpdate['author']); + }else{ + $aUpdate[$value] = $aData[$value]; + } + } + if(empty($aUpdate)){ + return json_encode(['status' => 6,'msg' => 'Update data to empty'.json_encode($aData)]); + } + if($aRefer['refer_type'] == 'other'){ + $aUpdate['refer_type'] = 'journal'; + } + if($aRefer['refer_type'] == 'book' && !empty($aUpdate['doilink'])){ + $aUpdate['refer_type'] = $aUpdate['doilink']; + unset($aUpdate['doilink']); + } + $aLog = $aUpdate; + $aUpdate['is_change'] = 1; + $aUpdate['is_ai_check'] = 1; + $aUpdate['cs'] = 1; + $aUpdate['update_time'] = time(); + Db::startTrans(); + //更新数据 + $aWhere = ['p_refer_id' => $iPReferId,'state' => 0]; + $result = Db::name('production_article_refer')->where($aWhere)->limit(1)->update($aUpdate); + if($result === false){ + return json_encode(['status' => 6,'msg' => 'Update failed']); + } + //更新日志 + if(!empty($iLogId)){ + $aWhere = ['id' => $iLogId]; + if(isset($aLog['refer_type'])){ + unset($aLog['refer_type']); + } + $result = Db::name('production_article_refer_ai')->where($aWhere)->limit(1)->update($aLog); + } + Db::commit(); + return json_encode(['status' => 1,'msg' => 'success']); + } + + /** + * 对接OPENAI + */ + private function curlOpenAIByDoi($aParam = []){ + + //获取DOI + $sDoi = empty($aParam['doi']) ? '' : $aParam['doi']; + if(empty($sDoi)){ + return ['status' => 2,'msg' => 'Reference doi is empty']; + } + //系统角色 + $sSysMessagePrompt = '请完成以下任务: + 1. 根据提供的DOI号,查询该文献的AMA引用格式; + 2. 
按照以下规则调整AMA引用格式: + - 第三个作者名字后添加 et al.; + - DOI前加上"Available at: "; + - DOI信息格式调整为"https://doi.org/+真实DOI"(替换真实DOI为文献实际DOI). + 3. 严格按照以下JSON结构返回结果,仅返回JSON数据,不要额外文字,包含字段:doilink(url格式)、title(标题)、author(作者数组)、joura(出版社名称)、dateno(年;卷(期):起始页-终止页),is_ai_check(默认1) + 4. 若未查询到信息,字段is_ai_check为2,相关字段为null。'; + //用户角色 + $sUserPrompt = '我提供的DOI是:'.$sDoi; + $aMessage = [ + ['role' => 'system', 'content' => $sSysMessagePrompt], + ['role' => 'user', 'content' => $sUserPrompt], + ]; + //请求OPENAI接口 + $sModel = empty($aParam['model']) ? 'gpt-4.1' : $aParam['model'];//模型 + $sApiUrl = $this->sApiUrl;//'http://chat.taimed.cn/v1/chat/completions';// + $aParam = ['model' => $sModel,'url' => $sApiUrl,'temperature' => 0,'messages' => $aMessage,'api_key' => $this->sApiKey]; + $oOpenAi = new \app\common\OpenAi; + $aResult = json_decode($oOpenAi->curlOpenAI($aParam),true); + return $aResult; + } + /** + * 作者修改完成发邮件 + */ + public function finishSendEmail(){ + //获取参数 + $aParam = empty($aParam) ? $this->request->post() : $aParam; + //文章ID + $iArticleId = empty($aParam['article_id']) ? 
'' : $aParam['article_id']; + if(empty($iArticleId)){ + return json_encode(array('status' => 2,'msg' => 'Please select an article')); + } + //查询条件 + $aWhere = ['article_id' => $iArticleId,'state' => ['in',[5,6]]]; + $aArticle = Db::name('article')->field('article_id,journal_id,accept_sn')->where($aWhere)->find(); + if(empty($aArticle)){ + return json_encode(['status' => 3,'msg' => 'The article does not exist or has not entered the editorial reference status']); + } + $aWhere = ['article_id' => $iArticleId,'state' => 0]; + $aProductionArticle = Db::name('production_article')->field('p_article_id')->where($aWhere)->find(); + if(empty($aProductionArticle)) { + return ['status' => 2,'msg' => 'The article has not entered the production stage']; + } + //查询是否有参考文献 + $aWhere = ['p_article_id' => $aProductionArticle['p_article_id'],'state' => 0]; + $aRefer = Db::name('production_article_refer')->field('article_id')->where($aWhere)->find(); + if(empty($aRefer)) { + return ['status' => 2,'msg' => 'No reference information found, please be patient and wait for the editor to upload']; + } + //查询期刊信息 + if(empty($aArticle['journal_id'])){ + return json_encode(array('status' => 4,'msg' => 'The article is not associated with a journal' )); + } + $aWhere = ['state' => 0,'journal_id' => $aArticle['journal_id']]; + $aJournal = Db::name('journal')->where($aWhere)->find(); + if(empty($aJournal)){ + return json_encode(array('status' => 5,'msg' => 'No journal information found' )); + } + //查询编辑邮箱 + $iUserId = empty($aJournal['editor_id']) ? 
'' : $aJournal['editor_id']; + if(empty($iUserId)){ + return json_encode(array('status' => 6,'msg' => 'The journal to which the article belongs has not designated a responsible editor' )); + } + $aWhere = ['user_id' => $iUserId,'state' => 0,'email' => ['<>','']]; + $aUser = Db::name('user')->field('user_id,email,realname,account')->where($aWhere)->find(); + if(empty($aUser)){ + return json_encode(['status' => 7,'msg' => "Edit email as empty"]); + } + + //处理发邮件 + //邮件模版 + $aEmailConfig = [ + 'email_subject' => '{journal_title}-{accept_sn}', + 'email_content' => ' + Dear Editor,

+ The authors have revised the formats of all references, please check.
+ Sn:{accept_sn}

+ Sincerely,
Editorial Office
+ Subscribe to this journal
{journal_title}
+ Email: {journal_email}
+ Website: {website}' + ]; + //邮件内容 + $aSearch = [ + '{accept_sn}' => empty($aArticle['accept_sn']) ? '' : $aArticle['accept_sn'],//accept_sn + '{journal_title}' => empty($aJournal['title']) ? '' : $aJournal['title'],//期刊名 + '{journal_issn}' => empty($aJournal['issn']) ? '' : $aJournal['issn'], + '{journal_email}' => empty($aJournal['email']) ? '' : $aJournal['email'], + '{website}' => empty($aJournal['website']) ? '' : $aJournal['website'], + ]; + + //发邮件 + //邮件标题 + $email = $aUser['email']; + $title = str_replace(array_keys($aSearch), array_values($aSearch),$aEmailConfig['email_subject']); + //邮件内容变量替换 + $content = str_replace(array_keys($aSearch), array_values($aSearch), $aEmailConfig['email_content']); + $pre = \think\Env::get('emailtemplete.pre'); + $net = \think\Env::get('emailtemplete.net'); + $net1 = str_replace("{{email}}",trim($email),$net); + $content=$pre.$content.$net1; + //发送邮件 + $memail = empty($aJournal['email']) ? '' : $aJournal['email']; + $mpassword = empty($aJournal['epassword']) ? '' : $aJournal['epassword']; + //期刊标题 + $from_name = empty($aJournal['title']) ? '' : $aJournal['title']; + //邮件队列组装参数 + $aResult = sendEmail($email,$title,$from_name,$content,$memail,$mpassword); + $iStatus = empty($aResult['status']) ? 1 : $aResult['status']; + $iIsSuccess = 2; + $sMsg = empty($aResult['data']) ? '失败' : $aResult['data']; + if($iStatus == 1){ + return json_encode(['status' => 1,'msg' => 'success']); + } + return json_encode(['status' => 8,'msg' => 'fail']); + } +} diff --git a/application/api/job/FetchExperts.php b/application/api/job/FetchExperts.php index 5162031..d0118d7 100644 --- a/application/api/job/FetchExperts.php +++ b/application/api/job/FetchExperts.php @@ -16,35 +16,13 @@ class FetchExperts public function fire(Job $job, $data) { $field = isset($data['field']) ? $data['field'] : ''; -// $attempts = $job->attempts(); -// $service = new ExpertFinderService(); -// $service->log('[FetchExperts] start field=' . $field . ' attempts=' . 
$attempts); -// -// try { - $result = $service->doFetchForField( + $service->doFetchForField( $field, isset($data['source']) ? $data['source'] : 'pubmed', isset($data['per_page']) ? intval($data['per_page']) : 100, isset($data['min_year']) ? $data['min_year'] : null ); -// $service->log('[FetchExperts] completed field=' . $field . ' result=' . json_encode($result)); -// } catch (\Throwable $e) { -// $service->log( -// '[FetchExperts] failed field=' . $field . -// ' msg=' . $e->getMessage() . -// ' file=' . $e->getFile() . -// ' line=' . $e->getLine() -// ); -// -// if ($attempts >= 3) { -// $job->delete(); -// return; -// } -// -// $job->release(60); -// return; -// } $job->delete(); } diff --git a/application/common/ArticleParserService.php b/application/common/ArticleParserService.php index 5b0dd52..2d4619c 100644 --- a/application/common/ArticleParserService.php +++ b/application/common/ArticleParserService.php @@ -1151,6 +1151,123 @@ class ArticleParserService ] ]; } + + /** + * 提取 Word 文档中的参考文献列表(仅返回数组,不做入库) + * @return array 每条为一个参考文献的纯文本字符串 + */ + public static function getReferencesFromWord($filePath): array + { + $othis = new self($filePath) ; + if (empty($othis->sections)) { + return []; + } + + $lines = []; + foreach ($othis->sections as $section) { + foreach ($section->getElements() as $element) { + $text = $othis->getTextFromElement($element); + $text = trim((string)$text); + if ($text === '') continue; + $lines[] = $text; + } + } + + if (empty($lines)) { + return []; + } + + // 识别参考文献段落起点(允许同一行包含域代码或第一条内容) + $startIdx = -1; + $startRemainder = ''; // 标题行后可能跟着第一条参考文献内容 + foreach ($lines as $i => $line) { + $t = trim($line); + if ($t === '') continue; + + // 行首命中即可(避免 “References { ADDIN... }” / “References 1. 
...” 漏判) + if (preg_match('/^\s*(references|reference|bibliography|参考文献|文献)\b\s*[::]?\s*/iu', $t, $m)) { + $startIdx = $i; + $remainder = preg_replace('/^\s*(references|reference|bibliography|参考文献|文献)\b\s*[::]?\s*/iu', '', $t); + $remainder = trim($remainder); + // 过滤 EndNote 域代码(允许其出现在标题行后) + if ($remainder !== '' && !preg_match('/^\{\s*ADDIN\s+EN\.REFLIST\s*\}$/i', $remainder)) { + $startRemainder = $remainder; + } + break; + } + } + + if ($startIdx < 0) { + return []; + } + + // 收集参考文献区域内容,遇到常见结尾段落标题则停止 + $stopKeywords = [ + 'acknowledgements', 'acknowledgments', 'funding', 'appendix', 'supplementary', + 'conflict of interest', 'competing interests', 'author contributions', + '致谢', '基金', '附录', '补充材料', '利益冲突', '作者贡献', + ]; + + // startRemainder 已在起点识别时处理 + + $raw = []; + if ($startRemainder !== '') { + $raw[] = $startRemainder; + } + + for ($i = $startIdx + 1; $i < count($lines); $i++) { + $line = trim($lines[$i]); + if ($line === '') continue; + // 跳过 EndNote / Word 域代码 + if (preg_match('/^\{\s*ADDIN\s+EN\.REFLIST\s*\}$/i', $line)) { + continue; + } + + $lineLower = strtolower($line); + foreach ($stopKeywords as $sk) { + $skLower = strtolower($sk); + if ($lineLower === $skLower || $lineLower === $skLower . ':' || $lineLower === $skLower . ':') { + $i = count($lines); // break outer + continue 2; + } + } + + $raw[] = $line; + } + + if (empty($raw)) { + return []; + } + + // 合并多行:以 “数字.” / “[数字]” / “数字]” 等作为新条目起始 + $refs = []; + $current = ''; + foreach ($raw as $line) { + $isNew = false; + if (preg_match('/^\s*(\[\d+\]|\d+\s*[\.\)]|\d+\s*\])\s*/u', $line)) { + $isNew = true; + } + + if ($isNew) { + if (trim($current) !== '') { + $refs[] = trim(preg_replace('/\s+/u', ' ', $current)); + } + $current = $line; + } else { + // 续行拼接 + if ($current === '') { + $current = $line; + } else { + $current .= ' ' . 
/**
 * Check one citation for (a) retraction/correction problems and (b) relevance of the
 * cited work to the sentence that cites it.
 *
 * Steps:
 *  1. Optional Crossref retraction check via CrossrefService::qcCitation().
 *  2. PubMed lookup by DOI to obtain title / abstract / MeSH terms.
 *  3. Cosine similarity between the embedding of the context and the embedding of the
 *     PubMed text.
 * When there is no PubMed text, or an embedding cannot be produced, the relevance part
 * falls back to CrossrefService::qcCitation() evidence scoring instead of reporting a
 * similarity of 0.
 *
 * @param string $contextText Text surrounding the citation (English).
 * @param array  $referRow    Reference row; refer_doi / doilink / title / author / joura / dateno are read.
 * @param array  $options     Optional settings:
 *                            - sentence_is_background (bool) caller already judged the sentence to be background citing
 *                            - sim_related (float) threshold for "related", default 0.75
 *                            - sim_unsure (float) threshold for "unsure", default 0.60
 *                            - check_retraction (bool) run the retraction check, default true
 * @return array Keys: problem_flag, problem_reason, relevance_flag, relevance_score, reason;
 *               the embedding path additionally returns a 'pubmed' summary.
 */
public function checkOne(string $contextText, array $referRow, array $options = []): array
{
    $contextText = trim($contextText);
    $simRelated = isset($options['sim_related']) ? (float)$options['sim_related'] : 0.75;
    $simUnsure = isset($options['sim_unsure']) ? (float)$options['sim_unsure'] : 0.60;
    $checkRetraction = isset($options['check_retraction']) ? (bool)$options['check_retraction'] : true;
    $isBackground = !empty($options['sentence_is_background']);

    // 1) Retraction / correction status from Crossref (needs a DOI to say anything).
    $problemFlag = 'unknown';
    $problemReason = '';
    if ($checkRetraction) {
        $qc = $this->crossref->qcCitation($contextText, $referRow, ['check_retraction' => true]);
        $problemFlag = $qc['problem_flag'] ?? 'unknown';
        $problemReason = $qc['problem_reason'] ?? '';
    }

    // 2) PubMed record (abstract + MeSH give the embedding something to compare against).
    $doi = $this->extractDoiFromRefer($referRow);
    $pub = $doi ? $this->pubmed->fetchByDoi($doi) : null;

    $pubText = '';
    $pubTypes = [];
    if ($pub) {
        $pubTypes = $pub['publication_types'] ?? [];
        $mesh = $pub['mesh_terms'] ?? [];
        $pubText = trim(
            ($pub['title'] ?? '') . "\n" .
            ($pub['abstract'] ?? '') . "\n" .
            (!empty($mesh) ? ('MeSH: ' . implode('; ', $mesh)) : '')
        );
    }

    // PubMed publication types can also reveal a retraction/correction. This is evaluated
    // before the relevance branch so the hint is kept on the fallback path as well.
    if ($checkRetraction && $problemFlag !== 'retracted_or_corrected' && !empty($pubTypes)) {
        $ptLower = strtolower(implode(' | ', $pubTypes));
        if (strpos($ptLower, 'retracted publication') !== false
            || strpos($ptLower, 'retraction of publication') !== false
            || strpos($ptLower, 'published erratum') !== false
        ) {
            $problemFlag = 'retracted_or_corrected';
            $problemReason = 'PubMed publication type indicates retraction/correction';
        }
    }

    // 3) Embedding similarity, only when both vectors could actually be produced.
    $embeddingFailed = false;
    if ($pubText !== '') {
        $v1 = $this->embedCached($contextText);
        $v2 = $this->embedCached($pubText);

        if ($v1 && $v2) {
            $sim = $this->cosine($v1, $v2);

            if ($sim >= $simRelated) {
                $relevanceFlag = 'related';
            } elseif ($sim >= $simUnsure) {
                $relevanceFlag = 'unsure';
            } else {
                $relevanceFlag = $isBackground ? 'unsure_background' : 'suspicious_unrelated';
            }

            return [
                'problem_flag' => $problemFlag,
                'problem_reason' => $problemReason,
                'relevance_flag' => $relevanceFlag,
                'relevance_score' => round($sim, 4),
                'reason' => 'embedding(context,pubmed_text)',
                'pubmed' => [
                    'pmid' => $pub['pmid'] ?? '',
                    'year' => $pub['year'] ?? '',
                    'journal' => $pub['journal'] ?? '',
                    'publication_types' => $pubTypes,
                ],
            ];
        }

        // A missing vector says nothing about relevance; do not report it as similarity 0.
        $embeddingFailed = true;
    }

    // Fallback: no PubMed text or no embedding -> Crossref evidence scoring (conservative).
    $fallback = $this->crossref->qcCitation($contextText, $referRow, ['check_retraction' => false]);
    $fallback['problem_flag'] = $problemFlag;
    $fallback['problem_reason'] = $problemReason;
    $fallback['reason'] = 'fallback_crossref_evidence; '
        . ($embeddingFailed ? 'embedding_unavailable; ' : '')
        . ($fallback['reason'] ?? '');
    return $fallback;
}
+ */ + private function embed(string $text): ?array + { + // 1) 优先使用独立 embeddings 接口 + if ($this->embeddingUrl !== '') { + $payload = json_encode(['text' => $text], JSON_UNESCAPED_UNICODE); + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $this->embeddingUrl); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); + curl_setopt($ch, CURLOPT_POST, true); + curl_setopt($ch, CURLOPT_POSTFIELDS, $payload); + + $headers = array_merge(['Content-Type: application/json'], $this->embeddingHeaders); + curl_setopt($ch, CURLOPT_HTTPHEADER, $headers); + + $res = curl_exec($ch); + curl_close($ch); + if (!is_string($res) || trim($res) === '') return null; + + $decoded = json_decode($res, true); + if (is_array($decoded)) { + if (isset($decoded['data'][0]['embedding']) && is_array($decoded['data'][0]['embedding'])) { + return $this->normalizeVector($decoded['data'][0]['embedding']); + } + if (isset($decoded['embedding']) && is_array($decoded['embedding'])) { + return $this->normalizeVector($decoded['embedding']); + } + $isVec = isset($decoded[0]) && (is_float($decoded[0]) || is_int($decoded[0])); + if ($isVec) return $this->normalizeVector($decoded); + } + return null; + } + + // 2) 没有 embeddings 接口时,使用 chat/completions 生成固定维度向量 + if ($this->chatUrl === '' || $this->chatModel === '') { + return null; + } + + $sys = "You are an embedding generator. Output ONLY valid JSON in this exact shape: {\"embedding\":[...]}.\n" + . "Rules:\n" + . "- embedding must be an array of exactly {$this->embeddingDim} floats\n" + . "- each float must be between -1 and 1\n" + . 
"- do not include any other keys or any extra text\n"; + + $payload = json_encode([ + 'model' => $this->chatModel, + 'temperature' => 0, + 'max_tokens' => $this->chatMaxTokens, + 'messages' => [ + ['role' => 'system', 'content' => $sys], + ['role' => 'user', 'content' => $text], + ], + ], JSON_UNESCAPED_UNICODE); + + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $this->chatUrl); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); + curl_setopt($ch, CURLOPT_POST, true); + curl_setopt($ch, CURLOPT_POSTFIELDS, $payload); + + $headers = array_merge(['Content-Type: application/json'], $this->embeddingHeaders); + curl_setopt($ch, CURLOPT_HTTPHEADER, $headers); + + $res = curl_exec($ch); + curl_close($ch); + if (!is_string($res) || trim($res) === '') return null; + + $decoded = json_decode($res, true); + $content = ''; + if (is_array($decoded) && isset($decoded['choices'][0]['message']['content'])) { + $content = (string)$decoded['choices'][0]['message']['content']; + } + $content = trim($content); + if ($content === '') return null; + + // content 可能被包裹在 ```json ... 
/**
 * Pull a DOI string out of a reference row.
 *
 * Order of preference: the trimmed `refer_doi` value; otherwise the path that follows
 * "doi.org/" in `doilink` (query string and fragment excluded); otherwise the trimmed
 * `doilink` itself. No DOI format validation is performed here.
 *
 * @param array $referRow Reference row; only `refer_doi` and `doilink` are read.
 * @return string The DOI candidate, or '' when both fields are empty.
 */
private function extractDoiFromRefer(array $referRow): string
{
    $doi = trim((string)($referRow['refer_doi'] ?? ''));
    if ($doi !== '') {
        return $doi;
    }

    $doilink = trim((string)($referRow['doilink'] ?? ''));
    if ($doilink === '') {
        return '';
    }

    // '~' is used as the delimiter because the character class contains '#'. With '#'
    // as delimiter PHP ended the pattern at that inner '#', raised "Unknown modifier"
    // and preg_match() never matched.
    if (preg_match('~doi\.org/([^?#]+)~i', $doilink, $m) === 1) {
        return trim((string)$m[1]);
    }

    return $doilink;
}
/**
 * Map an in-text citation number to the zero-based reference index:
 * the mark [n] in the body corresponds to production_article_refer.index = n - 1.
 * Values that would fall below zero are clamped to 0.
 *
 * @param int $citationMark Citation number as written in the text, e.g. 13 for [13]
 * @return int Zero-based reference index, e.g. 12
 */
public function referIndexFromCitationMark(int $citationMark): int
{
    $zeroBased = $citationMark - 1;

    return $zeroBased < 0 ? 0 : $zeroBased;
}
controller/service 中自由组合数据来源。 + * + * @param array $articleMainContents 文章内容片段数组(按 sort 顺序),元素为 string 或含 content 的数组 + * @param array $referRows production_article_refer 行数组(至少含 index/title/author/joura/dateno/refer_doi/doilink) + * @param array $options 透传给 qcCitation 的 options,并支持: + * - sentence_window(int) 上下文句子窗口,默认 1(即前1句+本句+后1句) + * @return array 结果列表,每条包含 citation_mark/refer_index/context/ref_meta/qc + */ + public function qcArticleCitations(array $articleMainContents, array $referRows, array $options = []): array + { + $window = isset($options['sentence_window']) ? max(0, intval($options['sentence_window'])) : 1; + + // 1) 组装全文纯文本(保留 [n]) + $chunks = []; + foreach ($articleMainContents as $row) { + if (is_array($row)) { + $text = (string)($row['content'] ?? ''); + } else { + $text = (string)$row; + } + if ($text === '') continue; + // 去掉常见标签,保留 [n] + $text = preg_replace('/<\s*\/?\s*blue[^>]*>/i', '', $text); + $text = strip_tags($text); + $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8'); + $text = preg_replace('/\s+/u', ' ', trim($text)); + if ($text !== '') $chunks[] = $text; + } + $fullText = implode("\n", $chunks); + + if ($fullText === '') return []; + + // 2) 构建引用条目映射:refer_index => row + $referMap = []; + foreach ($referRows as $r) { + if (!is_array($r)) continue; + if (!isset($r['index'])) continue; + $idx = intval($r['index']); + $referMap[$idx] = $r; + } + + // 3) 英文切句(简单稳健版) + $sentences = $this->splitEnglishSentences($fullText); + if (empty($sentences)) return []; + + // 4) 遍历句子,抓取其中的 [n] + $results = []; + foreach ($sentences as $si => $sent) { + if (!preg_match_all('/\[(\d+)\]/', $sent, $m)) { + continue; + } + $marks = array_unique(array_map('intval', $m[1])); + foreach ($marks as $citationMark) { + if ($citationMark <= 0) continue; + $referIndex = $this->referIndexFromCitationMark($citationMark); + if (!isset($referMap[$referIndex])) { + continue; + } + + $start = max(0, $si - $window); + $end = min(count($sentences) - 1, 
/**
 * Normalise a DOI candidate and keep it only when it looks like a DOI (10.<registrant>/<suffix>).
 *
 * Besides a bare DOI, the common stored forms "doi:10.xxxx/yyy" and
 * "http(s)://(dx.)doi.org/10.xxxx/yyy" are accepted; the prefix is removed and the bare
 * DOI is returned. Anything else yields ''.
 *
 * @param string $doi Raw DOI candidate
 * @return string Bare DOI, or '' when the value is not a DOI
 */
public function filterValidDoi($doi = '')
{
    // Arrays/objects cannot be a DOI; casting them would only produce a notice.
    if ($doi !== null && !is_scalar($doi)) {
        return '';
    }

    $doi = trim((string)$doi);
    if ($doi === '') {
        return '';
    }

    // Drop a resolver URL or "doi:" label so reference rows that store the DOI in
    // link form are not rejected.
    $doi = (string)preg_replace('~^(?:https?://(?:dx\.)?doi\.org/|doi:\s*)~i', '', $doi);

    if (preg_match('/^10\.\d{4,}\/.+/', $doi)) {
        return $doi;
    }

    return '';
}
/**
 * Fetch the Crossref "message" object for one DOI, retrying transient failures.
 *
 * Up to $this->maxRetry attempts are made. HTTP 429 waits 5 seconds before the next
 * attempt, other transient failures wait 1 second. Client errors that cannot succeed
 * on retry (4xx other than 408/429, e.g. 404 for an unknown DOI) stop immediately, and
 * no wait is performed after the last attempt.
 *
 * @param string $doi DOI that has already been validated by the caller
 * @return array|null Crossref message on success, null otherwise
 */
private function fetchSingleDoiWithRetry($doi)
{
    $url = $this->crossrefUrl . rawurlencode($doi);
    $userAgent = 'DOI-Fetcher/1.0';
    if (!empty($this->mailto)) {
        $url .= "?mailto=" . rawurlencode($this->mailto);
        // Only advertise a contact address when one is configured.
        $userAgent .= " (mailto:{$this->mailto})";
    }

    $maxAttempts = (int)$this->maxRetry;
    for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
        // NOTE(review): certificate verification is switched off, which allows a
        // man-in-the-middle to forge retraction data. Left unchanged so deployments
        // without a CA bundle keep working; enable it once a CA bundle is in place.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_HTTPHEADER, [
            "User-Agent: {$userAgent}"
        ]);
        $response = curl_exec($ch);
        $httpCode = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if (is_string($response) && $httpCode === 200) {
            $data = json_decode($response, true);
            return (is_array($data) && isset($data['status']) && $data['status'] == 'ok')
                ? ($data['message'] ?? null)
                : null;
        }

        // Permanent client errors will not change on retry.
        if ($httpCode >= 400 && $httpCode < 500 && $httpCode !== 429 && $httpCode !== 408) {
            return null;
        }

        // Back off only when another attempt will follow.
        if ($attempt < $maxAttempts) {
            sleep($httpCode === 429 ? 5 : 1);
        }
    }

    return null;
}
/**
 * Build the list of author display names from a Crossref work message.
 *
 * Personal authors are rendered as "given family" (either part may be missing);
 * organisational authors, which Crossref delivers under the `name` key, are used as-is.
 * Entries without any usable name are skipped.
 *
 * @param array $aDoiInfo Crossref work message
 * @return array List of author names in the order given by Crossref
 */
public function getAuthors($aDoiInfo = [])
{
    $authors = [];
    if (empty($aDoiInfo['author']) || !is_array($aDoiInfo['author'])) {
        return $authors;
    }

    foreach ($aDoiInfo['author'] as $author) {
        if (!is_array($author)) {
            continue;
        }

        $given = trim((string)($author['given'] ?? ''));
        $family = trim((string)($author['family'] ?? ''));
        // The outer trim removes the separator when only one of the two parts exists
        // (a given-only author previously came out with a trailing space).
        $name = trim($given . ' ' . $family);

        if ($name === '') {
            // Group / consortium authors carry no given/family parts.
            $name = trim((string)($author['name'] ?? ''));
        }

        if ($name !== '') {
            $authors[] = $name;
        }
    }

    return $authors;
}
/**
 * Render a Crossref date object as "YYYY", "YYYY-MM" or "YYYY-MM-DD".
 *
 * Only the first entry of `date-parts` is read. Month and day are left-padded to two
 * digits; a part that is absent is simply left out.
 *
 * @param array $dateObj Crossref date object containing `date-parts`
 * @return string Formatted date, or '' when no date parts exist
 */
public function parseDateParts($dateObj)
{
    $segments = $dateObj['date-parts'][0] ?? [];
    if (empty($segments)) {
        return '';
    }

    $formatted = (string)($segments[0] ?? '');

    // Positions 1 and 2 hold month and day respectively.
    foreach ([1, 2] as $pos) {
        $piece = $segments[$pos] ?? '';
        if ($piece === '') {
            continue;
        }
        $formatted .= '-' . str_pad((string)$piece, 2, '0', STR_PAD_LEFT);
    }

    return $formatted;
}
''); + + $doi = $this->extractDoiFromMeta($referDoi, $doilink); + + // 1) 退稿/更正判断(强规则,影响 problem_flag) + $problemFlag = 'unknown'; + $problemReason = ''; + if ($checkRetraction) { + if (!empty($doi)) { + $summary = $this->fetchWorkSummary($doi); + if ($summary && isset($summary['is_retracted'])) { + if ((int)$summary['is_retracted'] === 1) { + $problemFlag = 'retracted_or_corrected'; + $problemReason = !empty($summary['retract_reason']) ? $summary['retract_reason'] : 'Crossref indicates retraction/correction'; + } else { + $problemFlag = 'ok'; + $problemReason = 'Crossref indicates not retracted/corrected'; + } + } else { + $problemFlag = 'unknown'; + $problemReason = 'Crossref fetch failed or returned unexpected data'; + } + } else { + $problemFlag = 'unknown'; + $problemReason = 'DOI is empty'; + } + } else { + $problemFlag = 'unknown'; + $problemReason = 'Skip retraction check'; + } + + // 2) 相关性判断(弱规则+证据命中) + $backgroundPhrases = isset($options['background_phrases']) ? (array)$options['background_phrases'] : [ + 'several studies', + 'many studies', + 'the literature', + 'the existing literature', + 'has been reported', + 'have been reported', + 'it has been shown', + 'previous studies', + 'the study suggests', + 'the literature suggests', + 'in the literature', + ]; + + $ctxLower = strtolower($contextText); + $isBackground = false; + foreach ($backgroundPhrases as $ph) { + $ph = strtolower(trim((string)$ph)); + if ($ph !== '' && $ph !== '0' && strpos($ctxLower, $ph) !== false) { + $isBackground = true; + break; + } + } + + $refTokens = $this->buildEvidenceTokens([ + 'title' => $refTitle, + 'author' => $refAuthor, + 'journal' => $refJoura, + 'year' => $refDateno, + ]); + + $ctxTokens = $this->tokenize($contextText); + + $titleOverlap = 0.0; + $authorHit = 0.0; + $journalOverlap = 0.0; + $yearHit = 0.0; + + $titleTokens = $refTokens['titleTokens'] ?? []; + $authorTokens = $refTokens['authorTokens'] ?? []; + $journalTokens = $refTokens['journalTokens'] ?? 
[]; + $yearToken = $refTokens['yearToken'] ?? ''; + + if (!empty($titleTokens)) { + $inter = array_intersect($titleTokens, $ctxTokens); + $titleOverlap = count($inter) / max(1, count($titleTokens)); + } + + if (!empty($authorTokens)) { + foreach ($authorTokens as $at) { + if ($at !== '' && in_array($at, $ctxTokens, true)) { + $authorHit = 1.0; + break; + } + } + } + + if (!empty($journalTokens)) { + $interJ = array_intersect($journalTokens, $ctxTokens); + $journalOverlap = count($interJ) / max(1, count($journalTokens)); + } + + if (!empty($yearToken) && strpos($ctxLower, (string)$yearToken) !== false) { + $yearHit = 1.0; + } + + // 综合得分(保持解释性:越高越相关) + $score = round(( + 0.60 * $titleOverlap + + 0.20 * $authorHit + + 0.15 * $yearHit + + 0.05 * $journalOverlap + ), 4); + + $relevanceFlag = 'unsure'; + $reasonParts = []; + + if ($score >= 0.35 && ($authorHit > 0.0 || $yearHit > 0.0)) { + $relevanceFlag = 'related'; + $reasonParts[] = 'title_keyword_overlap_high=' . $titleOverlap; + } elseif ($score >= 0.25) { + $relevanceFlag = 'unsure'; + $reasonParts[] = 'evidence_score_mid=' . $score; + } else { + if ($isBackground) { + $relevanceFlag = 'unsure_background'; + $reasonParts[] = 'background_phrases_detected'; + } else { + $relevanceFlag = 'suspicious_unrelated'; + $reasonParts[] = 'evidence_score_low=' . $score; + } + } + + $reasonParts[] = 'titleOverlap=' . $titleOverlap; + $reasonParts[] = 'authorHit=' . $authorHit; + $reasonParts[] = 'yearHit=' . $yearHit; + $reasonParts[] = 'journalOverlap=' . 
/**
 * Derive a validated DOI from the refer_doi / doilink pair of a reference.
 *
 * refer_doi is preferred. When it is empty or not a valid DOI, the DOI is taken from
 * the path after "doi.org/" in doilink (query string and fragment excluded); as a last
 * resort doilink itself is validated as a DOI.
 *
 * @param string $referDoi Value of the refer_doi field
 * @param string $doilink  Value of the doilink field
 * @return string Valid DOI, or '' when none can be derived
 */
private function extractDoiFromMeta(string $referDoi, string $doilink): string
{
    $doi = trim($referDoi);
    if (!empty($doi)) {
        $valid = $this->filterValidDoi($doi);
        if ($valid !== '') {
            return $valid;
        }
        // An unusable refer_doi should not hide a usable doilink; fall through.
    }

    $link = trim($doilink);
    if ($link === '') {
        return '';
    }

    // Typical forms: https://doi.org/10.xxxx/xxxx or http://dx.doi.org/...
    // '~' is the delimiter because the character class contains '#'; with '#' as
    // delimiter PHP cut the pattern at the inner '#' and the match always failed.
    if (preg_match('~doi\.org/([^?#]+)~i', $link, $m) === 1) {
        $candidate = trim((string)$m[1]);
        return $this->filterValidDoi($candidate);
    }

    // Last resort: doilink may itself already be a bare DOI.
    return $this->filterValidDoi($link);
}
'')); + $authorTokens = array_values(array_unique(array_filter(array_map(function ($t) use ($stop) { + $t = trim($t); + if ($t === '') return ''; + if (in_array($t, $stop, true)) return ''; + return $t; + }, $authorTokens)))); + + $journalTokens = $this->tokenize((string)($src['journal'] ?? '')); + $journalTokens = array_values(array_filter(array_unique($journalTokens), function ($t) use ($stop) { + return !in_array($t, $stop, true) && mb_strlen($t) >= 4; + })); + + $yearToken = ''; + $yearRaw = (string)($src['year'] ?? ''); + if (preg_match('/(19\d{2}|20\d{2})/', $yearRaw, $m)) { + $yearToken = (string)$m[1]; + } + + return [ + 'titleTokens' => $titleTokens, + 'authorTokens' => $authorTokens, + 'journalTokens' => $journalTokens, + 'yearToken' => $yearToken, + ]; + } + + /** + * 提取作者姓/缩写 token(简化版) + * @param string $authorStr + * @return array + */ + private function extractAuthorTokens(string $authorStr): array + { + $authorStr = trim($authorStr); + if ($authorStr === '') return []; + + // 把常见分隔符拆开 + $parts = preg_split('/[,;]| and /i', $authorStr); + $tokens = []; + foreach ($parts as $p) { + $p = trim((string)$p); + if ($p === '') continue; + + // 取最后一个词当作姓(例如 "Smith J" -> "Smith"),或取首段词 + $words = preg_split('/\s+/', $p); + if (empty($words)) continue; + + $cand = trim((string)end($words)); + if ($cand === '') $cand = trim((string)($words[0] ?? 
/**
 * Lightweight tokenizer for English text.
 *
 * The text is lower-cased and cut at every run of characters other than a-z / 0-9.
 * Tokens shorter than three characters are discarded and duplicates are removed,
 * keeping the order of first appearance.
 *
 * @param string $text Input text
 * @return array List of unique tokens
 */
private function tokenize(string $text): array
{
    $normalized = strtolower(trim($text));
    if ($normalized === '') {
        return [];
    }

    // PREG_SPLIT_NO_EMPTY removes the empty pieces produced by leading/trailing separators.
    $words = preg_split('/[^a-z0-9]+/i', $normalized, -1, PREG_SPLIT_NO_EMPTY);

    // Very short tokens carry too little information to be useful as evidence.
    $informative = array_filter($words, function ($word) {
        return mb_strlen($word) >= 3;
    });

    return array_values(array_unique($informative));
}
'/'; + if (isset($config['timeout'])) $this->timeout = max(5, intval($config['timeout'])); + if (isset($config['tool'])) $this->tool = (string)$config['tool']; + if (isset($config['email'])) $this->email = (string)$config['email']; + } + + /** + * DOI -> PMID(优先用 [DOI],命中不到再用 [AID]) + */ + public function doiToPmid(string $doi): ?string + { + $doi = trim($doi); + if ($doi === '') return null; + + $cacheKey = 'doi2pmid_' . sha1(strtolower($doi)); + $cached = $this->cacheGet($cacheKey, 30 * 86400); + if (is_string($cached) && $cached !== '') { + return $cached; + } + + $pmid = $this->esearch($doi . '[DOI]'); + if (!$pmid) { + $pmid = $this->esearch($doi . '[AID]'); + } + if ($pmid) { + $this->cacheSet($cacheKey, $pmid); + return $pmid; + } + return null; + } + + /** + * PMID -> 文章信息(title/abstract/mesh/publication_types/year/journal) + */ + public function fetchByPmid(string $pmid): ?array + { + $pmid = trim($pmid); + if ($pmid === '') return null; + + $cacheKey = 'pmid_' . $pmid; + $cached = $this->cacheGet($cacheKey, 30 * 86400); + if (is_array($cached)) return $cached; + + $url = $this->base . 'efetch.fcgi?' . http_build_query([ + 'db' => 'pubmed', + 'id' => $pmid, + 'retmode' => 'xml', + 'tool' => $this->tool, + 'email' => $this->email, + ]); + + $xml = $this->httpGet($url); + if (!is_string($xml) || trim($xml) === '') return null; + + $data = $this->parseEfetchXml($xml); + if (!$data) return null; + + $this->cacheSet($cacheKey, $data); + return $data; + } + + /** + * DOI -> PubMed 信息(含 abstract/mesh) + */ + public function fetchByDoi(string $doi): ?array + { + $pmid = $this->doiToPmid($doi); + if (!$pmid) return null; + $info = $this->fetchByPmid($pmid); + if (!$info) return null; + $info['pmid'] = $pmid; + $info['doi'] = $doi; + return $info; + } + + // ----------------- Internals ----------------- + + private function esearch(string $term): ?string + { + $url = $this->base . 'esearch.fcgi?' . 
http_build_query([ + 'db' => 'pubmed', + 'retmode' => 'json', + 'retmax' => 1, + 'term' => $term, + 'tool' => $this->tool, + 'email' => $this->email, + ]); + + $res = $this->httpGet($url); + $json = json_decode((string)$res, true); + $ids = $json['esearchresult']['idlist'] ?? []; + if (!empty($ids[0])) return (string)$ids[0]; + return null; + } + + private function parseEfetchXml(string $xml): ?array + { + libxml_use_internal_errors(true); + $doc = new \DOMDocument(); + if (!$doc->loadXML($xml)) { + return null; + } + $xp = new \DOMXPath($doc); + + $title = $this->xpText($xp, '//PubmedArticle//ArticleTitle'); + + $abstractParts = []; + $absNodes = $xp->query('//PubmedArticle//Abstract//AbstractText'); + if ($absNodes) { + foreach ($absNodes as $n) { + $label = $n->attributes && $n->attributes->getNamedItem('Label') + ? trim($n->attributes->getNamedItem('Label')->nodeValue) + : ''; + $txt = trim($n->textContent); + if ($txt === '') continue; + $abstractParts[] = $label ? ($label . ': ' . 
$txt) : $txt; + } + } + $abstract = trim(implode("\n", $abstractParts)); + + $mesh = []; + $meshNodes = $xp->query('//PubmedArticle//MeshHeadingList//MeshHeading//DescriptorName'); + if ($meshNodes) { + foreach ($meshNodes as $n) { + $t = trim($n->textContent); + if ($t !== '') $mesh[] = $t; + } + } + $mesh = array_values(array_unique($mesh)); + + $pubTypes = []; + $ptNodes = $xp->query('//PubmedArticle//PublicationTypeList//PublicationType'); + if ($ptNodes) { + foreach ($ptNodes as $n) { + $t = trim($n->textContent); + if ($t !== '') $pubTypes[] = $t; + } + } + $pubTypes = array_values(array_unique($pubTypes)); + + $journal = $this->xpText($xp, '//PubmedArticle//Journal//Title'); + + $year = ''; + $year = $this->xpText($xp, '//PubmedArticle//JournalIssue//PubDate//Year'); + if ($year === '') { + $medlineDate = $this->xpText($xp, '//PubmedArticle//JournalIssue//PubDate//MedlineDate'); + if (preg_match('/(19\\d{2}|20\\d{2})/', $medlineDate, $m)) { + $year = $m[1]; + } + } + + if ($title === '' && $abstract === '') { + return null; + } + + return [ + 'title' => $title, + 'abstract' => $abstract, + 'mesh_terms' => $mesh, + 'publication_types' => $pubTypes, + 'journal' => $journal, + 'year' => $year, + ]; + } + + private function xpText(\DOMXPath $xp, string $query): string + { + $n = $xp->query($query); + if ($n && $n->length > 0) { + return trim($n->item(0)->textContent); + } + return ''; + } + + private function httpGet(string $url): string + { + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); + curl_setopt($ch, CURLOPT_HTTPHEADER, [ + 'User-Agent: TMRjournals-PubMed/1.0' + ]); + $res = curl_exec($ch); + curl_close($ch); + return is_string($res) ? $res : ''; + } + + private function cacheDir(): string + { + return rtrim(ROOT_PATH, '/') . 
'/runtime/pubmed_cache'; + } + + private function cacheGet(string $key, int $ttlSeconds) + { + $file = $this->cacheDir() . '/' . $key . '.json'; + if (!is_file($file)) return null; + $mtime = filemtime($file); + if (!$mtime || (time() - $mtime) > $ttlSeconds) return null; + $raw = @file_get_contents($file); + $decoded = json_decode((string)$raw, true); + return $decoded; + } + + private function cacheSet(string $key, $value): void + { + $dir = $this->cacheDir(); + if (!is_dir($dir)) @mkdir($dir, 0777, true); + $file = $dir . '/' . $key . '.json'; + @file_put_contents($file, json_encode($value, JSON_UNESCAPED_UNICODE)); + } +} +