Files
tougao/application/common/PubmedService.php
2026-04-03 11:45:45 +08:00

238 lines
7.1 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
/**
 * PubMed helper built on the NCBI E-utilities HTTP API.
 *
 * Features:
 * - DOI -> PMID lookup (esearch)
 * - PMID -> structured article info: title / abstract / mesh_terms /
 *   publication_types / journal / year (efetch)
 *
 * Notes:
 * - Results are cached as JSON files (by default under
 *   ROOT_PATH/runtime/pubmed_cache) to avoid repeated requests to NCBI.
 * - All lookups are best-effort: network, HTTP, or parse failures yield null
 *   rather than throwing.
 */
class PubmedService
{
    /** Lifetime of a cache entry, in seconds (30 days). */
    private const CACHE_TTL = 30 * 86400;

    private $base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
    private $timeout = 20;
    private $tool = 'tmrjournals';
    private $email = '';
    /** Whether the TLS certificate of the remote endpoint is verified. */
    private $verifySsl = true;
    /** Explicit cache directory; empty string means "derive from ROOT_PATH". */
    private $cacheDir = '';

    /**
     * @param array $config Optional overrides:
     *                      - base       (string) E-utilities base URL
     *                      - timeout    (int)    total request timeout in seconds, minimum 5
     *                      - tool       (string) value of the "tool" query parameter
     *                      - email      (string) value of the "email" query parameter
     *                      - verify_ssl (bool)   set to false to skip TLS peer verification
     *                      - cache_dir  (string) directory for cache files
     */
    public function __construct(array $config = [])
    {
        if (isset($config['base'])) {
            $this->base = rtrim((string)$config['base'], '/') . '/';
        }
        if (isset($config['timeout'])) {
            $this->timeout = max(5, intval($config['timeout']));
        }
        if (isset($config['tool'])) {
            $this->tool = (string)$config['tool'];
        }
        if (isset($config['email'])) {
            $this->email = (string)$config['email'];
        }
        if (isset($config['verify_ssl'])) {
            $this->verifySsl = (bool)$config['verify_ssl'];
        }
        if (isset($config['cache_dir'])) {
            $this->cacheDir = rtrim((string)$config['cache_dir'], '/\\');
        }
    }

    /**
     * Resolve a DOI to a PMID. Searches the [DOI] field first and falls back
     * to [AID] when that yields nothing.
     *
     * @param string $doi Bare DOI; a "doi:" or "https://doi.org/" prefix is accepted.
     * @return string|null The PMID, or null when the DOI is empty or not found.
     */
    public function doiToPmid(string $doi): ?string
    {
        $doi = $this->normalizeDoi($doi);
        if ($doi === '') {
            return null;
        }

        // The DOI is lower-cased for the key only, so case variants share one entry.
        $cacheKey = 'doi2pmid_' . sha1(strtolower($doi));
        $cached = $this->cacheGet($cacheKey, self::CACHE_TTL);
        if (is_string($cached) && $cached !== '') {
            return $cached;
        }

        $pmid = $this->esearch($doi . '[DOI]');
        if ($pmid === null) {
            $pmid = $this->esearch($doi . '[AID]');
        }
        if ($pmid === null) {
            // Misses are not cached, so a later call will query again.
            return null;
        }

        $this->cacheSet($cacheKey, $pmid);
        return $pmid;
    }

    /**
     * Fetch structured article info for a PMID.
     *
     * @param string $pmid Numeric PubMed identifier.
     * @return array|null Keys: title, abstract, mesh_terms, publication_types,
     *                    journal, year; null when the PMID is not purely numeric
     *                    or nothing usable could be fetched.
     */
    public function fetchByPmid(string $pmid): ?array
    {
        $pmid = trim($pmid);
        // The PMID ends up in a cache file name and in the request, so only
        // plain digits are accepted (rejects "../" and similar input).
        if (preg_match('/\A\d+\z/', $pmid) !== 1) {
            return null;
        }

        $cacheKey = 'pmid_' . $pmid;
        $cached = $this->cacheGet($cacheKey, self::CACHE_TTL);
        if (is_array($cached)) {
            return $cached;
        }

        $xml = $this->httpGet($this->buildUrl('efetch.fcgi', [
            'db' => 'pubmed',
            'id' => $pmid,
            'retmode' => 'xml',
        ]));
        if (trim($xml) === '') {
            return null;
        }

        $data = $this->parseEfetchXml($xml);
        if (!$data) {
            return null;
        }

        $this->cacheSet($cacheKey, $data);
        return $data;
    }

    /**
     * Fetch PubMed info (including abstract and MeSH terms) for a DOI.
     *
     * @param string $doi DOI to look up.
     * @return array|null The fetchByPmid() result plus "pmid" and "doi" (the
     *                    DOI exactly as passed in); null when either step fails.
     */
    public function fetchByDoi(string $doi): ?array
    {
        $pmid = $this->doiToPmid($doi);
        if (!$pmid) {
            return null;
        }
        $info = $this->fetchByPmid($pmid);
        if (!$info) {
            return null;
        }
        $info['pmid'] = $pmid;
        $info['doi'] = $doi;
        return $info;
    }

    // ----------------- Internals -----------------

    /**
     * Trim a DOI and strip a leading "doi:" or doi.org URL prefix.
     */
    private function normalizeDoi(string $doi): string
    {
        $doi = trim($doi);
        $stripped = preg_replace('~^(?:https?://(?:dx\.)?doi\.org/|doi:\s*)~i', '', $doi);
        return is_string($stripped) ? trim($stripped) : $doi;
    }

    /**
     * Build an E-utilities URL, appending tool/email when they are configured.
     */
    private function buildUrl(string $endpoint, array $params): string
    {
        if ($this->tool !== '') {
            $params['tool'] = $this->tool;
        }
        if ($this->email !== '') {
            $params['email'] = $this->email;
        }
        // Force "&" so an arg_separator.output ini override cannot corrupt the URL.
        return $this->base . $endpoint . '?' . http_build_query($params, '', '&');
    }

    /**
     * Run an esearch query against the pubmed database.
     *
     * @return string|null First id of the result list, or null when there is none.
     */
    private function esearch(string $term): ?string
    {
        $res = $this->httpGet($this->buildUrl('esearch.fcgi', [
            'db' => 'pubmed',
            'retmode' => 'json',
            'retmax' => 1,
            'term' => $term,
        ]));
        if ($res === '') {
            return null;
        }

        $json = json_decode($res, true);
        if (!is_array($json)) {
            return null;
        }

        $ids = $json['esearchresult']['idlist'] ?? [];
        if (!is_array($ids) || !isset($ids[0]) || !is_scalar($ids[0])) {
            return null;
        }
        $first = trim((string)$ids[0]);
        return ($first !== '' && $first !== '0') ? $first : null;
    }

    /**
     * Extract article fields from an efetch XML document.
     *
     * @return array|null Parsed fields, or null when the XML cannot be loaded
     *                    or contains neither a title nor an abstract.
     */
    private function parseEfetchXml(string $xml): ?array
    {
        if (trim($xml) === '') {
            return null;
        }

        // Collect libxml errors quietly while loading, then restore the
        // caller's setting and drop the collected errors so nothing leaks
        // into other code running in the same process.
        $previous = libxml_use_internal_errors(true);
        try {
            $doc = new \DOMDocument();
            $loaded = $doc->loadXML($xml, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
        if (!$loaded) {
            return null;
        }

        $xp = new \DOMXPath($doc);
        $title = $this->xpText($xp, '//PubmedArticle//ArticleTitle');

        // Structured abstracts carry one AbstractText per section; a section
        // with a Label attribute is rendered as "LABEL: text".
        $abstractParts = [];
        $absNodes = $xp->query('//PubmedArticle//Abstract//AbstractText');
        if ($absNodes) {
            foreach ($absNodes as $node) {
                $text = trim($node->textContent);
                if ($text === '') {
                    continue;
                }
                $label = $node instanceof \DOMElement ? trim($node->getAttribute('Label')) : '';
                $abstractParts[] = $label !== '' ? ($label . ': ' . $text) : $text;
            }
        }
        $abstract = trim(implode("\n", $abstractParts));

        $mesh = $this->xpTexts($xp, '//PubmedArticle//MeshHeadingList//MeshHeading//DescriptorName');
        $pubTypes = $this->xpTexts($xp, '//PubmedArticle//PublicationTypeList//PublicationType');
        $journal = $this->xpText($xp, '//PubmedArticle//Journal//Title');

        // Use <Year> when present; otherwise take the first 19xx/20xx year
        // found in the free-text <MedlineDate>.
        $year = $this->xpText($xp, '//PubmedArticle//JournalIssue//PubDate//Year');
        if ($year === '') {
            $medlineDate = $this->xpText($xp, '//PubmedArticle//JournalIssue//PubDate//MedlineDate');
            if (preg_match('/(19\d{2}|20\d{2})/', $medlineDate, $m)) {
                $year = $m[1];
            }
        }

        if ($title === '' && $abstract === '') {
            return null;
        }

        return [
            'title' => $title,
            'abstract' => $abstract,
            'mesh_terms' => $mesh,
            'publication_types' => $pubTypes,
            'journal' => $journal,
            'year' => $year,
        ];
    }

    /**
     * Trimmed text of the first node matching the query, or '' when none match.
     */
    private function xpText(\DOMXPath $xp, string $query): string
    {
        $nodes = $xp->query($query);
        if ($nodes && $nodes->length > 0) {
            return trim($nodes->item(0)->textContent);
        }
        return '';
    }

    /**
     * Unique, non-empty, trimmed texts of all nodes matching the query, in document order.
     */
    private function xpTexts(\DOMXPath $xp, string $query): array
    {
        $texts = [];
        $nodes = $xp->query($query);
        if ($nodes) {
            foreach ($nodes as $node) {
                $text = trim($node->textContent);
                if ($text !== '') {
                    $texts[] = $text;
                }
            }
        }
        return array_values(array_unique($texts));
    }

    /**
     * Perform a GET request.
     *
     * @return string Response body, or '' on transport failure or an HTTP status >= 400.
     */
    private function httpGet(string $url): string
    {
        $ch = curl_init();
        if ($ch === false) {
            return '';
        }

        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => $this->timeout,
            CURLOPT_CONNECTTIMEOUT => min(10, $this->timeout),
            // Certificate verification stays on unless explicitly disabled
            // through the "verify_ssl" config option.
            CURLOPT_SSL_VERIFYPEER => $this->verifySsl,
            CURLOPT_HTTPHEADER => [
                'User-Agent: TMRjournals-PubMed/1.0',
            ],
        ]);

        $res = curl_exec($ch);
        $status = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        // Error pages (rate limiting, server errors) must not reach the parsers.
        if (!is_string($res) || $status >= 400) {
            return '';
        }
        return $res;
    }

    /**
     * Directory that holds the cache files.
     */
    private function cacheDir(): string
    {
        if ($this->cacheDir !== '') {
            return $this->cacheDir;
        }
        // Trim both separator styles so a ROOT_PATH ending in "\" also works.
        return rtrim(ROOT_PATH, '/\\') . '/runtime/pubmed_cache';
    }

    /**
     * Path of the cache file for a key. Characters outside [A-Za-z0-9_-] are
     * replaced so a key can never point outside the cache directory.
     */
    private function cacheFile(string $key): string
    {
        $safe = (string)preg_replace('/[^A-Za-z0-9_\-]/', '_', $key);
        return $this->cacheDir() . '/' . $safe . '.json';
    }

    /**
     * Read a cache entry.
     *
     * @return mixed Decoded value, or null when missing, expired, or unreadable.
     */
    private function cacheGet(string $key, int $ttlSeconds)
    {
        $file = $this->cacheFile($key);
        if (!is_file($file)) {
            return null;
        }
        // The file may disappear between the checks; treat that as a miss.
        $mtime = @filemtime($file);
        if (!$mtime || (time() - $mtime) > $ttlSeconds) {
            return null;
        }
        $raw = @file_get_contents($file);
        if (!is_string($raw) || $raw === '') {
            return null;
        }
        return json_decode($raw, true);
    }

    /**
     * Write a cache entry. Best-effort: failures are silently ignored.
     */
    private function cacheSet(string $key, $value): void
    {
        $dir = $this->cacheDir();
        // Re-check is_dir() after a failed mkdir(): another process may have
        // created the directory in the meantime.
        if (!is_dir($dir) && !@mkdir($dir, 0777, true) && !is_dir($dir)) {
            return;
        }
        $json = json_encode($value, JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            return;
        }
        @file_put_contents($this->cacheFile($key), $json, LOCK_EX);
    }
}