Files
tougao/application/common/PubmedService.php
wangjinlei 978c81ea10 升级
2026-06-23 09:55:38 +08:00

260 lines
8.2 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
/**
 * PubMed helper built on the NCBI E-utilities HTTP API.
 *
 * Features:
 * - DOI -> PMID (esearch)
 * - PMID -> structured article info (efetch): title / abstract / mesh /
 *   publication_types / year / journal
 *
 * Notes:
 * - Results are cached as JSON files under runtime/pubmed_cache so the same
 *   lookup is not sent to NCBI repeatedly.
 *
 * Supported constructor config keys: base, timeout, tool, email, api_key,
 * verify_ssl.
 */
class PubmedService
{
    /** Lifetime of a cache entry in seconds (30 days). */
    private const CACHE_TTL = 2592000;

    /** Base URL of the E-utilities endpoints; always ends with "/". */
    private $base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
    /** Total request timeout in seconds (never below 5). */
    private $timeout = 20;
    /** Value of the "tool" request parameter. */
    private $tool = 'tmrjournals';
    /** Value of the "email" request parameter; omitted when empty. */
    private $email = '';
    /** Optional NCBI API key ("api_key" parameter); omitted when empty. */
    private $apiKey = '';
    /** Whether the TLS peer certificate is verified. */
    private $verifySsl = true;

    /**
     * @param array $config Optional overrides:
     *                      - base (string): endpoint base URL
     *                      - timeout (int): seconds, clamped to >= 5
     *                      - tool (string), email (string), api_key (string)
     *                      - verify_ssl (bool): set false only for hosts
     *                        without a usable CA bundle
     */
    public function __construct(array $config = [])
    {
        if (isset($config['base'])) {
            $this->base = rtrim((string)$config['base'], '/') . '/';
        }
        if (isset($config['timeout'])) {
            $this->timeout = max(5, intval($config['timeout']));
        }
        if (isset($config['tool'])) {
            $this->tool = (string)$config['tool'];
        }
        if (isset($config['email'])) {
            $this->email = (string)$config['email'];
        }
        if (isset($config['api_key'])) {
            $this->apiKey = (string)$config['api_key'];
        }
        if (isset($config['verify_ssl'])) {
            $this->verifySsl = (bool)$config['verify_ssl'];
        }
    }

    /**
     * DOI -> PMID. Searches with the [DOI] field first and falls back to
     * [AID] when nothing is found.
     *
     * @param string $doi DOI to look up (surrounding whitespace is ignored)
     * @return string|null The PMID, or null when the DOI is empty or unknown
     */
    public function doiToPmid(string $doi): ?string
    {
        $doi = trim($doi);
        if ($doi === '') {
            return null;
        }

        // DOIs are case-insensitive, so the cache key is built from the
        // lower-cased form.
        $cacheKey = 'doi2pmid_' . sha1(strtolower($doi));
        $cached = $this->cacheGet($cacheKey, self::CACHE_TTL);
        if (is_string($cached) && $cached !== '') {
            return $cached;
        }

        $pmid = $this->esearch($doi . '[DOI]');
        if (!$pmid) {
            $pmid = $this->esearch($doi . '[AID]');
        }
        if (!$pmid) {
            return null;
        }

        $this->cacheSet($cacheKey, $pmid);
        return $pmid;
    }

    /**
     * PMID -> article info (title / abstract / mesh / publication_types /
     * year / journal).
     *
     * @param string $pmid PubMed identifier
     * @return array|null Keys: title, abstract, mesh_terms, publication_types,
     *                    journal, journal_iso_abbr, journal_medline_ta, year;
     *                    null when the PMID is empty or nothing could be parsed
     */
    public function fetchByPmid(string $pmid): ?array
    {
        $pmid = trim($pmid);
        if ($pmid === '') {
            return null;
        }

        // v2: the parsed result gained journal_iso_abbr / journal_medline_ta,
        // so the key prefix changed to avoid hitting entries in the old shape.
        $cacheKey = 'pmid_v2_' . $pmid;
        $cached = $this->cacheGet($cacheKey, self::CACHE_TTL);
        if (is_array($cached)) {
            return $cached;
        }

        $xml = $this->httpGet($this->buildUrl('efetch.fcgi', [
            'db' => 'pubmed',
            'id' => $pmid,
            'retmode' => 'xml',
        ]));
        if (trim($xml) === '') {
            return null;
        }

        $data = $this->parseEfetchXml($xml);
        if (!$data) {
            return null;
        }

        $this->cacheSet($cacheKey, $data);
        return $data;
    }

    /**
     * DOI -> PubMed info (including abstract / mesh).
     *
     * @param string $doi DOI to look up
     * @return array|null Result of fetchByPmid() plus 'pmid' and 'doi' keys,
     *                    or null when the DOI cannot be resolved
     */
    public function fetchByDoi(string $doi): ?array
    {
        $pmid = $this->doiToPmid($doi);
        if (!$pmid) {
            return null;
        }

        $info = $this->fetchByPmid($pmid);
        if (!$info) {
            return null;
        }

        $info['pmid'] = $pmid;
        $info['doi'] = $doi;
        return $info;
    }

    /**
     * DOI -> normalized journal abbreviation (NLM/ISO form, e.g.
     * "J Clin Oncol"). Prefers ISOAbbreviation and falls back to MedlineTA.
     *
     * @param string $doi DOI to look up
     * @return string|null The abbreviation, or null when none is available
     */
    public function journalAbbrByDoi(string $doi): ?string
    {
        $info = $this->fetchByDoi($doi);
        if (!is_array($info)) {
            return null;
        }

        $abbr = trim((string)($info['journal_iso_abbr'] ?? ''));
        if ($abbr === '') {
            $abbr = trim((string)($info['journal_medline_ta'] ?? ''));
        }
        return $abbr !== '' ? $abbr : null;
    }

    // ----------------- Internals -----------------

    /**
     * Builds an E-utilities URL, appending the identification parameters
     * (tool, and email / api_key when configured).
     */
    private function buildUrl(string $endpoint, array $params): string
    {
        $params['tool'] = $this->tool;
        if ($this->email !== '') {
            $params['email'] = $this->email;
        }
        if ($this->apiKey !== '') {
            $params['api_key'] = $this->apiKey;
        }
        return $this->base . $endpoint . '?' . http_build_query($params);
    }

    /**
     * Runs an esearch query against the pubmed database.
     *
     * @return string|null The first id of the result list, or null when the
     *                     response is missing, malformed or empty
     */
    private function esearch(string $term): ?string
    {
        $res = $this->httpGet($this->buildUrl('esearch.fcgi', [
            'db' => 'pubmed',
            'retmode' => 'json',
            'retmax' => 1,
            'term' => $term,
        ]));

        $json = json_decode($res, true);
        if (!is_array($json)) {
            return null;
        }
        $ids = $json['esearchresult']['idlist'] ?? null;
        if (!is_array($ids) || !isset($ids[0])) {
            return null;
        }

        // Only an all-digit id is accepted; anything else is treated as
        // "not found" rather than being cached as a PMID.
        $id = (string)$ids[0];
        return preg_match('/^\d+$/', $id) === 1 ? $id : null;
    }

    /**
     * Extracts the fields of the first <PubmedArticle> in an efetch response.
     *
     * @return array|null Parsed fields, or null when the XML is invalid, has
     *                    no PubmedArticle, or has neither title nor abstract
     */
    private function parseEfetchXml(string $xml): ?array
    {
        if (trim($xml) === '') {
            return null;
        }

        // Collect parse errors internally instead of emitting warnings, then
        // put the process-wide libxml setting back the way it was found.
        $previous = libxml_use_internal_errors(true);
        try {
            $doc = new \DOMDocument();
            // LIBXML_NONET: never fetch a DTD or entity over the network.
            $loaded = $doc->loadXML($xml, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
        if (!$loaded) {
            return null;
        }

        $xp = new \DOMXPath($doc);
        $articles = $xp->query('//PubmedArticle');
        if (!$articles || $articles->length === 0) {
            return null;
        }
        // Every field is read relative to the first article so values from
        // different articles are never mixed together.
        $article = $articles->item(0);

        $title = $this->xpText($xp, './/ArticleTitle', $article);

        $abstractParts = [];
        $absNodes = $xp->query('.//Abstract//AbstractText', $article);
        if ($absNodes) {
            foreach ($absNodes as $node) {
                $txt = trim($node->textContent);
                if ($txt === '') {
                    continue;
                }
                $label = $node instanceof \DOMElement
                    ? trim($node->getAttribute('Label'))
                    : '';
                $abstractParts[] = $label !== '' ? ($label . ': ' . $txt) : $txt;
            }
        }
        $abstract = trim(implode("\n", $abstractParts));

        $mesh = $this->xpTexts($xp, './/MeshHeadingList//MeshHeading//DescriptorName', $article);
        $pubTypes = $this->xpTexts($xp, './/PublicationTypeList//PublicationType', $article);

        $journal = $this->xpText($xp, './/Journal//Title', $article);
        // Normalized abbreviations: ISOAbbreviation (under Journal) and
        // MedlineTA (under MedlineJournalInfo).
        $journalIsoAbbr = $this->xpText($xp, './/Journal//ISOAbbreviation', $article);
        $journalMedlineTa = $this->xpText($xp, './/MedlineJournalInfo//MedlineTA', $article);

        $year = $this->xpText($xp, './/JournalIssue//PubDate//Year', $article);
        if ($year === '') {
            // No <Year>: take the first 19xx/20xx found in <MedlineDate>.
            $medlineDate = $this->xpText($xp, './/JournalIssue//PubDate//MedlineDate', $article);
            if (preg_match('/(?:19|20)\d{2}/', $medlineDate, $m)) {
                $year = $m[0];
            }
        }

        if ($title === '' && $abstract === '') {
            return null;
        }

        return [
            'title' => $title,
            'abstract' => $abstract,
            'mesh_terms' => $mesh,
            'publication_types' => $pubTypes,
            'journal' => $journal,
            'journal_iso_abbr' => $journalIsoAbbr,
            'journal_medline_ta' => $journalMedlineTa,
            'year' => $year,
        ];
    }

    /**
     * Returns the trimmed text of the first node matching $query, or '' when
     * nothing matches. $context, when given, is the node that relative
     * queries are evaluated against.
     */
    private function xpText(\DOMXPath $xp, string $query, ?\DOMNode $context = null): string
    {
        $nodes = $xp->query($query, $context);
        if ($nodes && $nodes->length > 0) {
            return trim($nodes->item(0)->textContent);
        }
        return '';
    }

    /**
     * Returns the distinct, non-empty trimmed texts of all nodes matching
     * $query, in document order, as a re-indexed list.
     */
    private function xpTexts(\DOMXPath $xp, string $query, ?\DOMNode $context = null): array
    {
        $texts = [];
        $nodes = $xp->query($query, $context);
        if ($nodes) {
            foreach ($nodes as $node) {
                $text = trim($node->textContent);
                if ($text !== '') {
                    $texts[] = $text;
                }
            }
        }
        return array_values(array_unique($texts));
    }

    /**
     * Performs a GET request.
     *
     * @return string The response body, or '' on a transport failure or an
     *                HTTP status of 400 and above
     */
    private function httpGet(string $url): string
    {
        $ch = curl_init();
        if ($ch === false) {
            return '';
        }

        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_CONNECTTIMEOUT => min(10, $this->timeout),
            CURLOPT_TIMEOUT => $this->timeout,
            // Verified by default; the result is cached for 30 days, so a
            // forged response would otherwise persist.
            CURLOPT_SSL_VERIFYPEER => $this->verifySsl,
            CURLOPT_USERAGENT => 'TMRjournals-PubMed/1.0',
        ]);
        $res = curl_exec($ch);
        $status = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        // An error page (rate limit, server error) is not a usable payload.
        if (!is_string($res) || $status >= 400) {
            return '';
        }
        return $res;
    }

    /**
     * Directory holding the cache files. ROOT_PATH is a constant defined
     * outside this class.
     */
    private function cacheDir(): string
    {
        return rtrim(ROOT_PATH, '/\\') . '/runtime/pubmed_cache';
    }

    /**
     * Maps a cache key to its file path. Keys made only of letters, digits,
     * "_" and "-" are used verbatim (so existing cache files stay valid); any
     * other key is hashed so it cannot point outside the cache directory.
     */
    private function cacheFile(string $key): string
    {
        $name = preg_match('/^[A-Za-z0-9_\-]+$/', $key) === 1
            ? $key
            : 'h_' . sha1($key);
        return $this->cacheDir() . '/' . $name . '.json';
    }

    /**
     * Reads a cache entry.
     *
     * @return mixed The decoded value, or null when the entry is missing,
     *               older than $ttlSeconds, or not valid JSON
     */
    private function cacheGet(string $key, int $ttlSeconds)
    {
        $file = $this->cacheFile($key);
        if (!is_file($file)) {
            return null;
        }
        $mtime = filemtime($file);
        if (!$mtime || (time() - $mtime) > $ttlSeconds) {
            return null;
        }
        $raw = @file_get_contents($file);
        if (!is_string($raw) || $raw === '') {
            return null;
        }
        return json_decode($raw, true);
    }

    /**
     * Writes a cache entry. Caching is best-effort: a failure here is
     * swallowed and only means the next lookup goes to the network again.
     */
    private function cacheSet(string $key, $value): void
    {
        $dir = $this->cacheDir();
        // The second is_dir() covers another process creating the directory
        // between the check and the mkdir().
        if (!is_dir($dir) && !@mkdir($dir, 0777, true) && !is_dir($dir)) {
            return;
        }

        $json = json_encode($value, JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            // Not encodable (e.g. invalid UTF-8): skip rather than write an
            // empty file.
            return;
        }
        @file_put_contents($this->cacheFile($key), $json, LOCK_EX);
    }
}