238 lines
7.1 KiB
PHP
238 lines
7.1 KiB
PHP
<?php
|
||
|
||
namespace app\common;
|
||
|
||
/**
 * PubMed helper built on the NCBI E-utilities HTTP API.
 *
 * Features:
 *  - DOI  -> PMID lookup (esearch.fcgi)
 *  - PMID -> structured article data via efetch.fcgi
 *    (title / abstract / mesh_terms / publication_types / journal / year)
 *
 * Notes:
 *  - Successful lookups are cached as JSON files for 30 days so repeated
 *    requests do not hit NCBI again. Failed lookups are never cached.
 */
class PubmedService
{
    /** Lifetime of a cache entry, in seconds (30 days). */
    private const CACHE_TTL = 30 * 86400;

    /** Base URL of the E-utilities endpoints; always ends with "/". */
    private $base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';

    /** Total request timeout in seconds. */
    private $timeout = 20;

    /** Value sent as the "tool" query parameter. */
    private $tool = 'tmrjournals';

    /** Value sent as the "email" query parameter; omitted when empty. */
    private $email = '';

    /** Value sent as the "api_key" query parameter; omitted when empty. */
    private $apiKey = '';

    /** Whether the TLS peer certificate is verified. */
    private $verifySsl = true;

    /** Explicit cache directory; empty means "use the default location". */
    private $cacheDir = '';

    /**
     * @param array $config Optional overrides. Recognised keys:
     *                      - base       (string) endpoint base URL
     *                      - timeout    (int)    seconds, clamped to a minimum of 5
     *                      - tool       (string) "tool" query parameter
     *                      - email      (string) "email" query parameter
     *                      - api_key    (string) "api_key" query parameter
     *                      - verify_ssl (bool)   set to false to skip TLS peer verification
     *                      - cache_dir  (string) directory for the JSON cache files
     */
    public function __construct(array $config = [])
    {
        if (isset($config['base'])) {
            $this->base = rtrim((string)$config['base'], '/') . '/';
        }
        if (isset($config['timeout'])) {
            $this->timeout = max(5, intval($config['timeout']));
        }
        if (isset($config['tool'])) {
            $this->tool = (string)$config['tool'];
        }
        if (isset($config['email'])) {
            $this->email = (string)$config['email'];
        }
        if (isset($config['api_key'])) {
            $this->apiKey = (string)$config['api_key'];
        }
        if (isset($config['verify_ssl'])) {
            $this->verifySsl = (bool)$config['verify_ssl'];
        }
        if (isset($config['cache_dir'])) {
            $this->cacheDir = rtrim((string)$config['cache_dir'], '/\\');
        }
    }

    /**
     * Resolve a DOI to a PMID.
     *
     * Searches with the [DOI] field tag first and falls back to [AID] when
     * that yields nothing. Only successful lookups are cached.
     *
     * @param string $doi DOI to look up; surrounding whitespace is ignored.
     * @return string|null The PMID, or null when the DOI is empty or not found.
     */
    public function doiToPmid(string $doi): ?string
    {
        $doi = trim($doi);
        if ($doi === '') {
            return null;
        }

        // The key is case-insensitive with respect to the DOI.
        $cacheKey = 'doi2pmid_' . sha1(strtolower($doi));
        $cached = $this->cacheGet($cacheKey, self::CACHE_TTL);
        if (is_string($cached) && $cached !== '') {
            return $cached;
        }

        $pmid = $this->esearch($doi . '[DOI]');
        if ($pmid === null) {
            $pmid = $this->esearch($doi . '[AID]');
        }
        if ($pmid === null) {
            return null;
        }

        $this->cacheSet($cacheKey, $pmid);
        return $pmid;
    }

    /**
     * Fetch structured article data for a PMID.
     *
     * @param string $pmid PMID to fetch; surrounding whitespace is ignored.
     * @return array|null Array with the keys title, abstract, mesh_terms,
     *                    publication_types, journal and year, or null when the
     *                    PMID is empty, the request fails, or the response
     *                    contains neither a title nor an abstract.
     */
    public function fetchByPmid(string $pmid): ?array
    {
        $pmid = trim($pmid);
        if ($pmid === '') {
            return null;
        }

        $cacheKey = 'pmid_' . $pmid;
        $cached = $this->cacheGet($cacheKey, self::CACHE_TTL);
        if (is_array($cached)) {
            return $cached;
        }

        $url = $this->buildUrl('efetch.fcgi', [
            'db' => 'pubmed',
            'id' => $pmid,
            'retmode' => 'xml',
        ]);

        $xml = $this->httpGet($url);
        if (trim($xml) === '') {
            return null;
        }

        $data = $this->parseEfetchXml($xml);
        if ($data === null) {
            return null;
        }

        $this->cacheSet($cacheKey, $data);
        return $data;
    }

    /**
     * Fetch structured article data for a DOI.
     *
     * @param string $doi DOI to look up.
     * @return array|null The fetchByPmid() result extended with the keys
     *                    "pmid" and "doi" (the DOI exactly as passed in), or
     *                    null when either step fails.
     */
    public function fetchByDoi(string $doi): ?array
    {
        $pmid = $this->doiToPmid($doi);
        if ($pmid === null) {
            return null;
        }

        $info = $this->fetchByPmid($pmid);
        if ($info === null) {
            return null;
        }

        $info['pmid'] = $pmid;
        $info['doi'] = $doi;
        return $info;
    }

    // ----------------- Internals -----------------

    /**
     * Build a request URL for one endpoint.
     *
     * Appends the "tool" parameter, plus "email" and "api_key" when they are
     * configured.
     *
     * @param string $endpoint Script name such as "esearch.fcgi".
     * @param array  $params   Endpoint-specific query parameters.
     */
    private function buildUrl(string $endpoint, array $params): string
    {
        $params['tool'] = $this->tool;
        if ($this->email !== '') {
            $params['email'] = $this->email;
        }
        if ($this->apiKey !== '') {
            $params['api_key'] = $this->apiKey;
        }

        // Pass the separator explicitly: the default comes from the
        // arg_separator.output ini setting, which may be "&amp;".
        return $this->base . $endpoint . '?' . http_build_query($params, '', '&');
    }

    /**
     * Run an esearch query and return the first matching PMID.
     *
     * @param string $term Search term, including any field tag.
     * @return string|null First id in the result list, or null when the
     *                     response is not usable or contains no valid id.
     */
    private function esearch(string $term): ?string
    {
        $url = $this->buildUrl('esearch.fcgi', [
            'db' => 'pubmed',
            'retmode' => 'json',
            'retmax' => 1,
            'term' => $term,
        ]);

        $json = json_decode($this->httpGet($url), true);
        if (!is_array($json)) {
            return null;
        }

        $id = $json['esearchresult']['idlist'][0] ?? null;
        if (!is_string($id) && !is_int($id)) {
            return null;
        }

        // Accept only a positive integer id.
        $id = (string)$id;
        return preg_match('/^[1-9][0-9]*$/D', $id) === 1 ? $id : null;
    }

    /**
     * Extract article fields from an efetch XML response.
     *
     * Only the first PubmedArticle element is read, so fields of different
     * records are never mixed together.
     *
     * @param string $xml Raw efetch response body.
     * @return array|null Parsed fields, or null when the XML is empty or
     *                    malformed, has no PubmedArticle, or has neither a
     *                    title nor an abstract.
     */
    private function parseEfetchXml(string $xml): ?array
    {
        // DOMDocument::loadXML() rejects an empty string outright.
        if (trim($xml) === '') {
            return null;
        }

        // Collect libxml errors internally instead of emitting warnings, and
        // restore the caller's setting afterwards.
        $previous = libxml_use_internal_errors(true);
        try {
            $doc = new \DOMDocument();
            if (!$doc->loadXML($xml, LIBXML_NONET)) {
                return null;
            }

            $xp = new \DOMXPath($doc);
            $articles = $xp->query('//PubmedArticle');
            if (!$articles || $articles->length === 0) {
                return null;
            }
            $article = $articles->item(0);

            $title = $this->xpText($xp, './/ArticleTitle', $article);

            // Structured abstracts carry a Label attribute per section; keep
            // it as a "LABEL: text" prefix, one section per line.
            $abstractParts = [];
            $absNodes = $xp->query('.//Abstract//AbstractText', $article);
            if ($absNodes) {
                foreach ($absNodes as $node) {
                    $text = trim($node->textContent);
                    if ($text === '') {
                        continue;
                    }
                    $label = $node instanceof \DOMElement
                        ? trim($node->getAttribute('Label'))
                        : '';
                    $abstractParts[] = $label !== '' ? ($label . ': ' . $text) : $text;
                }
            }
            $abstract = trim(implode("\n", $abstractParts));

            $mesh = $this->xpTexts($xp, './/MeshHeadingList//MeshHeading//DescriptorName', $article);
            $pubTypes = $this->xpTexts($xp, './/PublicationTypeList//PublicationType', $article);
            $journal = $this->xpText($xp, './/Journal//Title', $article);

            // Use the Year element when present; otherwise take the first
            // 19xx/20xx year found in the free-form MedlineDate.
            $year = $this->xpText($xp, './/JournalIssue//PubDate//Year', $article);
            if ($year === '') {
                $medlineDate = $this->xpText($xp, './/JournalIssue//PubDate//MedlineDate', $article);
                if (preg_match('/((?:19|20)\d{2})/', $medlineDate, $m)) {
                    $year = $m[1];
                }
            }

            if ($title === '' && $abstract === '') {
                return null;
            }

            return [
                'title' => $title,
                'abstract' => $abstract,
                'mesh_terms' => $mesh,
                'publication_types' => $pubTypes,
                'journal' => $journal,
                'year' => $year,
            ];
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
    }

    /**
     * Return the trimmed text of the first node matching an XPath query.
     *
     * @param \DOMXPath     $xp      XPath evaluator.
     * @param string        $query   XPath expression.
     * @param \DOMNode|null $context Optional node for relative expressions.
     * @return string Node text, or '' when nothing matches.
     */
    private function xpText(\DOMXPath $xp, string $query, ?\DOMNode $context = null): string
    {
        $nodes = $xp->query($query, $context);
        if ($nodes && $nodes->length > 0) {
            return trim($nodes->item(0)->textContent);
        }
        return '';
    }

    /**
     * Return the distinct, non-empty trimmed texts of all matching nodes.
     *
     * @param \DOMXPath     $xp      XPath evaluator.
     * @param string        $query   XPath expression.
     * @param \DOMNode|null $context Optional node for relative expressions.
     * @return array List of strings in document order.
     */
    private function xpTexts(\DOMXPath $xp, string $query, ?\DOMNode $context = null): array
    {
        $texts = [];
        $nodes = $xp->query($query, $context);
        if ($nodes) {
            foreach ($nodes as $node) {
                $text = trim($node->textContent);
                if ($text !== '') {
                    $texts[] = $text;
                }
            }
        }
        return array_values(array_unique($texts));
    }

    /**
     * Perform an HTTP GET request.
     *
     * @param string $url Fully built request URL.
     * @return string Response body, or '' on a transport error or a
     *                non-2xx HTTP status.
     */
    private function httpGet(string $url): string
    {
        $ch = curl_init();
        if ($ch === false) {
            return '';
        }

        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => $this->timeout,
            // Peer verification stays on unless explicitly disabled through
            // the "verify_ssl" config key.
            CURLOPT_SSL_VERIFYPEER => $this->verifySsl,
            CURLOPT_USERAGENT => 'TMRjournals-PubMed/1.0',
        ]);

        $body = curl_exec($ch);
        $status = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        // An error page must not be handed to the JSON/XML parsers.
        if (!is_string($body) || $status < 200 || $status >= 300) {
            return '';
        }
        return $body;
    }

    /**
     * Directory that holds the cache files.
     *
     * Uses the configured "cache_dir" when set, otherwise
     * ROOT_PATH/runtime/pubmed_cache, falling back to the system temp
     * directory when ROOT_PATH is not defined.
     */
    private function cacheDir(): string
    {
        if ($this->cacheDir !== '') {
            return $this->cacheDir;
        }

        $root = defined('ROOT_PATH') ? (string)ROOT_PATH : sys_get_temp_dir();
        return rtrim($root, '/\\') . '/runtime/pubmed_cache';
    }

    /**
     * Map a cache key to its file path.
     *
     * Keys made only of letters, digits, "_" and "-" are used verbatim; any
     * other key is replaced by a hash so that it can never contain path
     * separators or "..".
     */
    private function cacheFile(string $key): string
    {
        if (preg_match('/^[A-Za-z0-9_-]+$/D', $key) !== 1) {
            $key = 'h_' . sha1($key);
        }
        return $this->cacheDir() . '/' . $key . '.json';
    }

    /**
     * Read a cache entry.
     *
     * @param string $key        Cache key.
     * @param int    $ttlSeconds Maximum accepted age of the file in seconds.
     * @return mixed Decoded value, or null when the entry is missing,
     *               expired, unreadable or not valid JSON.
     */
    private function cacheGet(string $key, int $ttlSeconds)
    {
        $file = $this->cacheFile($key);
        if (!is_file($file)) {
            return null;
        }

        $mtime = filemtime($file);
        if (!$mtime || (time() - $mtime) > $ttlSeconds) {
            return null;
        }

        // Best effort: the file may disappear between the checks and the read.
        $raw = @file_get_contents($file);
        if (!is_string($raw) || $raw === '') {
            return null;
        }
        return json_decode($raw, true);
    }

    /**
     * Write a cache entry. Failures are ignored because the cache is only an
     * optimisation.
     *
     * @param string $key   Cache key.
     * @param mixed  $value JSON-encodable value.
     */
    private function cacheSet(string $key, $value): void
    {
        $json = json_encode($value, JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            // Do not leave an empty, useless file behind.
            return;
        }

        $dir = $this->cacheDir();
        // The second is_dir() covers another process creating the directory
        // between the check and the mkdir() call.
        if (!is_dir($dir) && !@mkdir($dir, 0777, true) && !is_dir($dir)) {
            return;
        }

        // LOCK_EX keeps concurrent writers from interleaving their output.
        @file_put_contents($this->cacheFile($key), $json, LOCK_EX);
    }
}
|
||
|