agent功能
This commit is contained in:
612
application/api/controller/ExpertFinder.php
Normal file
612
application/api/controller/ExpertFinder.php
Normal file
@@ -0,0 +1,612 @@
|
||||
<?php
|
||||
|
||||
namespace app\api\controller;
|
||||
|
||||
use think\Cache;
|
||||
use GuzzleHttp\Client;
|
||||
|
||||
class ExpertFinder extends Base
|
||||
{
|
||||
private $httpClient;
|
||||
private $ncbiBaseUrl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
|
||||
|
||||
/**
 * Build the controller and the HTTP client shared by all NCBI requests.
 *
 * @param \think\Request|null $request Request instance forwarded to the parent constructor.
 */
public function __construct(\think\Request $request = null)
{
    parent::__construct($request);

    $this->httpClient = new Client([
        // Upper bound, in seconds, for a single HTTP request.
        'timeout' => 60,
        // Keep TLS certificate verification enabled: the responses fetched
        // through this client are parsed as XML/JSON and turned into contact
        // data, so an unverified connection would let a man-in-the-middle
        // inject arbitrary content.
        'verify' => true,
    ]);
}
|
||||
|
||||
/**
 * Main search endpoint.
 *
 * Request params:
 *   keyword     - search term (e.g. "biomedical engineering"), required
 *   max_results - max articles to scan, default 200, clamped to 10..1000
 *   min_year    - earliest publication year, default current year - 3
 *   source      - "pmc" selects the PMC pipeline; any other value uses PubMed
 *
 * Results are cached for 3600 seconds under a key derived from the four
 * parameters; export() and clearCache() derive the same key.
 *
 * @return mixed Whatever jsonSuccess()/jsonError() produce.
 */
public function search()
{
    $keywordParam = $this->request->param('keyword', '');
    $sourceParam = $this->request->param('source', 'pubmed');

    // Array-style input (keyword[]=x) must not reach trim() or string
    // concatenation; treat it as missing / default instead.
    $keyword = is_scalar($keywordParam) ? trim((string) $keywordParam) : '';
    $source = is_scalar($sourceParam) ? (string) $sourceParam : 'pubmed';
    $maxResults = intval($this->request->param('max_results', 200));
    $minYear = intval($this->request->param('min_year', date('Y') - 3));

    // Strict comparison: empty() would also reject the literal keyword "0".
    if ($keyword === '') {
        return jsonError('keyword is required');
    }

    $maxResults = max(10, min($maxResults, 1000));

    // NOTE(review): the fields are concatenated without separators, so e.g.
    // keyword "x1" + max_results 10 and keyword "x" + max_results 110 share a
    // key. The format is kept because export() and clearCache() must derive
    // the identical key.
    $cacheKey = 'expert_finder_' . md5($keyword . $maxResults . $minYear . $source);
    $cached = Cache::get($cacheKey);
    if ($cached) {
        return jsonSuccess($cached);
    }

    try {
        $result = $source === 'pmc'
            ? $this->searchViaPMC($keyword, $maxResults, $minYear)
            : $this->searchViaPubMed($keyword, $maxResults, $minYear);
    } catch (\Exception $e) {
        return jsonError('Search failed: ' . $e->getMessage());
    }

    Cache::set($cacheKey, $result, 3600);

    return jsonSuccess($result);
}
|
||||
|
||||
/**
 * Export search results to an Excel file.
 *
 * Accepts the same request params as search() (keyword, max_results,
 * min_year, source) and derives the same cache key, so a previous search()
 * result is reused when present; otherwise the search is executed and cached
 * for 3600 seconds before the spreadsheet is generated.
 *
 * @return mixed Whatever jsonSuccess()/jsonError() produce.
 */
public function export()
{
    $keywordParam = $this->request->param('keyword', '');
    $sourceParam = $this->request->param('source', 'pubmed');

    // Array-style input (keyword[]=x) must not reach trim() or string
    // concatenation; treat it as missing / default instead.
    $keyword = is_scalar($keywordParam) ? trim((string) $keywordParam) : '';
    $source = is_scalar($sourceParam) ? (string) $sourceParam : 'pubmed';
    $maxResults = intval($this->request->param('max_results', 200));
    $minYear = intval($this->request->param('min_year', date('Y') - 3));

    // Strict comparison: empty() would also reject the literal keyword "0".
    if ($keyword === '') {
        return jsonError('keyword is required');
    }

    $maxResults = max(10, min($maxResults, 1000));

    // Same key derivation as search() so both endpoints share cached results.
    $cacheKey = 'expert_finder_' . md5($keyword . $maxResults . $minYear . $source);
    $data = Cache::get($cacheKey);

    if (!$data) {
        try {
            $data = $source === 'pmc'
                ? $this->searchViaPMC($keyword, $maxResults, $minYear)
                : $this->searchViaPubMed($keyword, $maxResults, $minYear);
            Cache::set($cacheKey, $data, 3600);
        } catch (\Exception $e) {
            return jsonError('Search failed: ' . $e->getMessage());
        }
    }

    if (empty($data['experts'])) {
        return jsonError('No experts found to export');
    }

    return $this->generateExcel($data['experts'], $keyword);
}
|
||||
|
||||
/**
 * Clear the cached result of one search.
 *
 * Takes the same request params as search() (keyword, max_results, min_year,
 * source) and removes the cache entry stored under the key they map to.
 *
 * @return mixed Whatever jsonSuccess() produces.
 */
public function clearCache()
{
    $keywordParam = $this->request->param('keyword', '');
    $sourceParam = $this->request->param('source', 'pubmed');

    // Array-style input must not reach trim() or string concatenation.
    $keyword = is_scalar($keywordParam) ? trim((string) $keywordParam) : '';
    $source = is_scalar($sourceParam) ? (string) $sourceParam : 'pubmed';
    $maxResults = intval($this->request->param('max_results', 200));
    $minYear = intval($this->request->param('min_year', date('Y') - 3));

    // search()/export() clamp max_results before hashing it into the key.
    // Apply the same clamp here, otherwise an out-of-range value (e.g. 5000)
    // hashes to a key that was never written and nothing gets cleared.
    $maxResults = max(10, min($maxResults, 1000));

    $cacheKey = 'expert_finder_' . md5($keyword . $maxResults . $minYear . $source);
    Cache::rm($cacheKey);

    return jsonSuccess(['msg' => 'Cache cleared']);
}
|
||||
|
||||
// ==================== PubMed Search ====================
|
||||
|
||||
/**
 * Run the PubMed pipeline: esearch for article ids, efetch them in batches
 * of 200, parse author records and aggregate them per email address.
 *
 * @param string $keyword    Search term.
 * @param int    $maxResults Maximum number of article ids to request.
 * @param int    $minYear    Earliest publication year.
 *
 * @return array Keys: experts (list), total, articles_scanned, source.
 */
private function searchViaPubMed($keyword, $maxResults, $minYear)
{
    $ids = $this->esearch('pubmed', $keyword, $maxResults, $minYear);
    if (empty($ids)) {
        return ['experts' => [], 'total' => 0, 'articles_scanned' => 0, 'source' => 'pubmed'];
    }

    $authorRecords = [];
    $batches = array_chunk($ids, 200);
    $lastIndex = count($batches) - 1;

    foreach ($batches as $index => $batch) {
        $xml = $this->efetch('pubmed', $batch);

        // Append in place instead of array_merge(), which would re-copy the
        // whole accumulated list on every batch.
        foreach ($this->parsePubMedXml($xml) as $record) {
            $authorRecords[] = $record;
        }

        // Pause only between requests (presumably to respect NCBI's request
        // rate limit - confirm); sleeping after the final batch would just
        // delay the response.
        if ($index < $lastIndex) {
            usleep(340000);
        }
    }

    $experts = $this->aggregateExperts($authorRecords);

    return [
        'experts' => $experts,
        'total' => count($experts),
        'articles_scanned' => count($ids),
        'source' => 'pubmed',
    ];
}
|
||||
|
||||
// ==================== PMC Search ====================
|
||||
|
||||
/**
 * Run the PMC pipeline: esearch for article ids, efetch them in batches of
 * 20, parse author records and aggregate them per email address.
 *
 * @param string $keyword    Search term.
 * @param int    $maxResults Maximum number of article ids to request.
 * @param int    $minYear    Earliest publication year.
 *
 * @return array Keys: experts (list), total, articles_scanned, source.
 */
private function searchViaPMC($keyword, $maxResults, $minYear)
{
    $ids = $this->esearch('pmc', $keyword, $maxResults, $minYear);
    if (empty($ids)) {
        return ['experts' => [], 'total' => 0, 'articles_scanned' => 0, 'source' => 'pmc'];
    }

    $authorRecords = [];
    $batches = array_chunk($ids, 20);
    $lastIndex = count($batches) - 1;

    foreach ($batches as $index => $batch) {
        $xml = $this->efetch('pmc', $batch);

        // Append in place instead of array_merge(), which would re-copy the
        // whole accumulated list on every batch.
        foreach ($this->parsePMCXml($xml) as $record) {
            $authorRecords[] = $record;
        }

        // Pause only between requests (presumably to respect NCBI's request
        // rate limit - confirm); sleeping after the final batch would just
        // delay the response.
        if ($index < $lastIndex) {
            usleep(400000);
        }
    }

    $experts = $this->aggregateExperts($authorRecords);

    return [
        'experts' => $experts,
        'total' => count($experts),
        'articles_scanned' => count($ids),
        'source' => 'pmc',
    ];
}
|
||||
|
||||
// ==================== NCBI API Calls ====================
|
||||
|
||||
/**
 * Query the esearch endpoint and return the matching article ids.
 *
 * The search term is the keyword AND-ed with a publication-date range from
 * $minYear to the current year; results are requested sorted by relevance.
 *
 * @param string $db         Database name passed to the endpoint ("pubmed" or "pmc").
 * @param string $keyword    Search term.
 * @param int    $maxResults Value sent as retmax.
 * @param int    $minYear    First year of the publication-date range.
 *
 * @return array List of article ids (empty when nothing matched).
 *
 * @throws \RuntimeException When the response body is not valid JSON.
 */
private function esearch($db, $keyword, $maxResults, $minYear)
{
    $term = $keyword . ' AND ' . $minYear . ':' . date('Y') . '[pdat]';

    $response = $this->httpClient->get($this->ncbiBaseUrl . 'esearch.fcgi', [
        'query' => [
            'db' => $db,
            'term' => $term,
            'retmax' => $maxResults,
            'retmode' => 'json',
            'sort' => 'relevance',
        ],
    ]);

    $data = json_decode($response->getBody()->getContents(), true);

    // An undecodable body is a failure, not "no matches": returning [] here
    // would make search() cache an empty result for an hour.
    if (!is_array($data)) {
        throw new \RuntimeException('Invalid JSON response from esearch: ' . json_last_error_msg());
    }

    $ids = $data['esearchresult']['idlist'] ?? [];

    return is_array($ids) ? $ids : [];
}
|
||||
|
||||
/**
 * POST a batch of ids to the efetch endpoint and return the raw XML body.
 *
 * @param string $db  Database name passed to the endpoint.
 * @param array  $ids Article ids; sent as one comma-separated "id" field.
 *
 * @return string Response body as returned by the HTTP client.
 */
private function efetch($db, $ids)
{
    $endpoint = $this->ncbiBaseUrl . 'efetch.fcgi';

    $formFields = [
        'db' => $db,
        'id' => implode(',', $ids),
        'retmode' => 'xml',
    ];

    $reply = $this->httpClient->post($endpoint, ['form_params' => $formFields]);
    $stream = $reply->getBody();

    return $stream->getContents();
}
|
||||
|
||||
// ==================== PubMed XML Parsing ====================
|
||||
|
||||
/**
 * Extract author records from a PubMed efetch XML document.
 *
 * One record is produced per author that has a name and for whom an email
 * address could be extracted from one of the author's affiliation strings.
 * Authors without a name or without an email are skipped.
 *
 * @param string $xmlString Raw XML.
 *
 * @return array List of records with keys: name, email, affiliation,
 *               article_title, article_id, journal. Empty when the XML
 *               cannot be parsed.
 */
private function parsePubMedXml($xmlString)
{
    $results = [];

    // Collect parser errors silently, then drop them and restore the previous
    // setting so the buffer does not grow across batches and the process-wide
    // libxml error mode is left as the caller had it.
    $previousErrorMode = libxml_use_internal_errors(true);
    // LIBXML_NONET: never fetch network resources while parsing remote XML.
    $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NONET);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    if ($xml === false) {
        return $results;
    }

    foreach ($xml->PubmedArticle as $article) {
        $citation = $article->MedlineCitation;
        $articleData = $citation->Article;

        if (!isset($articleData->AuthorList->Author)) {
            continue;
        }

        $title = $this->xmlNodeToString($articleData->ArticleTitle);
        $pmid = (string) $citation->PMID;
        $journal = isset($articleData->Journal->Title)
            ? (string) $articleData->Journal->Title
            : '';

        foreach ($articleData->AuthorList->Author as $author) {
            $lastName = (string) ($author->LastName ?? '');
            $foreName = (string) ($author->ForeName ?? '');
            $fullName = trim($foreName . ' ' . $lastName);

            if (empty($fullName)) {
                continue;
            }

            $email = '';
            $affiliation = '';

            if (isset($author->AffiliationInfo)) {
                foreach ($author->AffiliationInfo as $affInfo) {
                    $affText = (string) $affInfo->Affiliation;

                    // First non-empty affiliation wins.
                    if (empty($affiliation)) {
                        $affiliation = $affText;
                    }
                    // First affiliation that yields an email wins.
                    if (empty($email)) {
                        $email = $this->extractEmailFromText($affText);
                    }
                }
            }

            if (empty($email)) {
                continue;
            }

            $results[] = [
                'name' => $fullName,
                'email' => strtolower($email),
                'affiliation' => $this->cleanAffiliation($affiliation),
                'article_title' => $title,
                'article_id' => $pmid,
                'journal' => $journal,
            ];
        }
    }

    return $results;
}
|
||||
|
||||
// ==================== PMC XML Parsing ====================
|
||||
|
||||
/**
 * Extract author records from a PMC efetch XML document.
 *
 * For every <article>, contributors of type "author" in the contrib-group
 * are turned into records. An author's email is resolved in this order:
 *   1. the contributor's own <email> element,
 *   2. for corresponding authors, the first email found in <author-notes>,
 *   3. an email embedded in the resolved affiliation text.
 * Authors without a name or without any email are skipped.
 *
 * @param string $xmlString Raw XML.
 *
 * @return array List of records with keys: name, email, affiliation,
 *               article_title, article_id, journal. Empty when the XML
 *               cannot be parsed.
 */
private function parsePMCXml($xmlString)
{
    $results = [];

    // Collect parser errors silently, then drop them and restore the previous
    // setting so the buffer does not grow across batches and the process-wide
    // libxml error mode is left as the caller had it.
    $previousErrorMode = libxml_use_internal_errors(true);
    // LIBXML_NONET: never fetch network resources while parsing remote XML.
    $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NONET);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    if ($xml === false) {
        return $results;
    }

    $articles = $xml->article ?? $xml->children();

    foreach ($articles as $article) {
        if ($article->getName() !== 'article') {
            continue;
        }

        $front = $article->front;
        if (!$front) {
            continue;
        }

        $articleMeta = $front->{'article-meta'};
        if (!$articleMeta) {
            continue;
        }

        $title = $this->xmlNodeToString($articleMeta->{'title-group'}->{'article-title'} ?? null);

        // The last <article-id pub-id-type="pmc"> wins.
        $pmcId = '';
        if (isset($articleMeta->{'article-id'})) {
            foreach ($articleMeta->{'article-id'} as $idNode) {
                if ((string) $idNode['pub-id-type'] === 'pmc') {
                    $pmcId = (string) $idNode;
                }
            }
        }

        // The journal title sits either directly under journal-meta or inside
        // journal-title-group.
        $journalMeta = $front->{'journal-meta'};
        $journal = '';
        if (isset($journalMeta->{'journal-title'})) {
            $journal = (string) $journalMeta->{'journal-title'};
        } elseif (isset($journalMeta->{'journal-title-group'}->{'journal-title'})) {
            $journal = (string) $journalMeta->{'journal-title-group'}->{'journal-title'};
        }

        $correspEmails = [];
        if (isset($articleMeta->{'author-notes'})) {
            $this->extractEmailsFromNode($articleMeta->{'author-notes'}, $correspEmails);
        }

        // id => affiliation text, gathered from <aff> inside the contrib-group
        // first and then from <aff> directly under article-meta (the latter
        // overwrite on id clashes).
        $affiliationMap = [];
        if (isset($articleMeta->{'contrib-group'})) {
            foreach ($articleMeta->{'contrib-group'}->children() as $child) {
                if ($child->getName() !== 'aff') {
                    continue;
                }
                $affId = (string) ($child['id'] ?? '');
                if ($affId) {
                    $affiliationMap[$affId] = $this->xmlNodeToString($child);
                }
            }
        }
        if (isset($articleMeta->{'aff'})) {
            foreach ($articleMeta->{'aff'} as $aff) {
                $affId = (string) ($aff['id'] ?? '');
                if ($affId) {
                    $affiliationMap[$affId] = $this->xmlNodeToString($aff);
                }
            }
        }

        if (!isset($articleMeta->{'contrib-group'})) {
            continue;
        }

        foreach ($articleMeta->{'contrib-group'}->contrib as $contrib) {
            if ((string) ($contrib['contrib-type'] ?? '') !== 'author') {
                continue;
            }

            $nameNode = $contrib->name;
            if (!$nameNode) {
                continue;
            }

            $surname = (string) ($nameNode->surname ?? '');
            $givenNames = (string) ($nameNode->{'given-names'} ?? '');
            $fullName = trim($givenNames . ' ' . $surname);
            if (empty($fullName)) {
                continue;
            }

            $email = '';
            if (isset($contrib->email)) {
                $email = strtolower(trim((string) $contrib->email));
            }

            // One pass over the xrefs: the first "aff" reference that resolves
            // through the map supplies the affiliation, and any "corresp"
            // reference marks the author as corresponding.
            $affiliation = '';
            $affiliationResolved = false;
            $isCorresponding = false;
            if (isset($contrib->xref)) {
                foreach ($contrib->xref as $xref) {
                    $refType = (string) $xref['ref-type'];
                    if ($refType === 'corresp') {
                        $isCorresponding = true;
                    } elseif ($refType === 'aff' && !$affiliationResolved) {
                        $rid = (string) $xref['rid'];
                        if (isset($affiliationMap[$rid])) {
                            $affiliation = $affiliationMap[$rid];
                            $affiliationResolved = true;
                        }
                    }
                }
            }
            // Fall back to an <aff> nested inside the contributor.
            if (empty($affiliation) && isset($contrib->aff)) {
                $affiliation = $this->xmlNodeToString($contrib->aff);
            }

            if ((string) ($contrib['corresp'] ?? '') === 'yes') {
                $isCorresponding = true;
            }

            if (empty($email) && $isCorresponding && !empty($correspEmails)) {
                $email = $correspEmails[0];
            }

            if (empty($email)) {
                $extracted = $this->extractEmailFromText($affiliation);
                if ($extracted) {
                    $email = $extracted;
                }
            }

            if (empty($email)) {
                continue;
            }

            $results[] = [
                'name' => $fullName,
                'email' => strtolower($email),
                'affiliation' => $this->cleanAffiliation($affiliation),
                'article_title' => $title,
                'article_id' => $pmcId,
                'journal' => $journal,
            ];
        }
    }

    return $results;
}
|
||||
|
||||
// ==================== Aggregation ====================
|
||||
|
||||
/**
 * Group author records by email address and rank the groups by paper count.
 *
 * The first record seen for an email supplies name and email; the
 * affiliation is taken from the first record that has a non-empty one. Each
 * distinct article is counted once per email, even when several author
 * records of that article carry the same address (the parsers assign an
 * address found in an affiliation string to every author who has that
 * string). At most 10 papers are listed per expert.
 *
 * @param array $authorRecords Records with keys: name, email, affiliation,
 *                             article_title, article_id, journal.
 *
 * @return array List of experts (name, email, affiliation, paper_count,
 *               papers) sorted by paper_count, highest first.
 */
private function aggregateExperts($authorRecords)
{
    $byEmail = [];
    // email => set of article keys already counted for that email.
    $countedPapers = [];

    foreach ($authorRecords as $record) {
        $key = strtolower(trim($record['email']));
        if (empty($key)) {
            continue;
        }

        if (!isset($byEmail[$key])) {
            $byEmail[$key] = [
                'name' => $record['name'],
                'email' => $record['email'],
                'affiliation' => $record['affiliation'],
                'paper_count' => 0,
                'papers' => [],
            ];
            $countedPapers[$key] = [];
        }

        if (empty($byEmail[$key]['affiliation']) && !empty($record['affiliation'])) {
            $byEmail[$key]['affiliation'] = $record['affiliation'];
        }

        // Identify the article by id, falling back to its title; records that
        // have neither cannot be told apart and are always counted.
        $articleId = (string) $record['article_id'];
        $articleTitle = (string) $record['article_title'];
        $paperKey = null;
        if ($articleId !== '') {
            $paperKey = 'id:' . $articleId;
        } elseif ($articleTitle !== '') {
            $paperKey = 'title:' . $articleTitle;
        }

        if ($paperKey !== null) {
            if (isset($countedPapers[$key][$paperKey])) {
                continue;
            }
            $countedPapers[$key][$paperKey] = true;
        }

        $byEmail[$key]['paper_count']++;

        if (count($byEmail[$key]['papers']) < 10) {
            $byEmail[$key]['papers'][] = [
                'title' => $record['article_title'],
                'article_id' => $record['article_id'],
                'journal' => $record['journal'],
            ];
        }
    }

    $experts = array_values($byEmail);

    usort($experts, function ($a, $b) {
        return $b['paper_count'] <=> $a['paper_count'];
    });

    return $experts;
}
|
||||
|
||||
// ==================== Excel Export ====================
|
||||
|
||||
/**
 * Write the expert list to an .xlsx file under public/exports and return its
 * location.
 *
 * @param array  $experts List of experts as produced by aggregateExperts().
 * @param string $keyword Search keyword; used (sanitised) in the file name.
 *
 * @return mixed jsonSuccess() payload with file_url, file_name and count.
 *
 * @throws \RuntimeException When the export directory cannot be created.
 */
private function generateExcel($experts, $keyword)
{
    $spreadsheet = new \PhpOffice\PhpSpreadsheet\Spreadsheet();
    $sheet = $spreadsheet->getActiveSheet();
    $sheet->setTitle('Experts');

    // Column letter => [header text, column width].
    $layout = [
        'A' => ['#', 6],
        'B' => ['Name', 25],
        'C' => ['Email', 35],
        'D' => ['Affiliation', 50],
        'E' => ['Paper Count', 12],
        'F' => ['Representative Papers', 60],
    ];
    foreach ($layout as $column => $spec) {
        $sheet->setCellValue($column . '1', $spec[0]);
        $sheet->getColumnDimension($column)->setWidth($spec[1]);
    }

    $sheet->getStyle('A1:F1')->applyFromArray([
        'font' => ['bold' => true, 'color' => ['rgb' => 'FFFFFF']],
        'fill' => ['fillType' => \PhpOffice\PhpSpreadsheet\Style\Fill::FILL_SOLID, 'startColor' => ['rgb' => '4472C4']],
        'alignment' => ['horizontal' => \PhpOffice\PhpSpreadsheet\Style\Alignment::HORIZONTAL_CENTER],
    ]);

    // Names, affiliations and titles come from external article metadata.
    // Writing them as explicit strings prevents a value starting with "="
    // from being stored as a formula.
    $asText = \PhpOffice\PhpSpreadsheet\Cell\DataType::TYPE_STRING;

    foreach ($experts as $i => $expert) {
        $row = $i + 2;
        $paperTitles = array_column($expert['papers'], 'title');

        $sheet->setCellValue('A' . $row, $i + 1);
        $sheet->setCellValueExplicit('B' . $row, (string) $expert['name'], $asText);
        $sheet->setCellValueExplicit('C' . $row, (string) $expert['email'], $asText);
        $sheet->setCellValueExplicit('D' . $row, (string) $expert['affiliation'], $asText);
        $sheet->setCellValue('E' . $row, $expert['paper_count']);
        $sheet->setCellValueExplicit('F' . $row, implode("\n", $paperTitles), $asText);
    }

    // Cap the keyword part so a very long keyword cannot exceed file-name
    // length limits.
    $slug = substr(preg_replace('/[^a-zA-Z0-9]/', '_', $keyword), 0, 80);
    $filename = 'experts_' . $slug . '_' . date('Ymd_His') . '.xlsx';

    // NOTE(review): files land in the public web root and contain email
    // addresses; nothing in this method removes them again - confirm that
    // cleanup / access control exists elsewhere.
    $dir = ROOT_PATH . 'public' . DS . 'exports';
    // Second is_dir() covers a concurrent request creating the directory.
    if (!is_dir($dir) && !mkdir($dir, 0755, true) && !is_dir($dir)) {
        throw new \RuntimeException('Unable to create export directory');
    }
    $filepath = $dir . DS . $filename;

    $writer = new \PhpOffice\PhpSpreadsheet\Writer\Xlsx($spreadsheet);
    $writer->save($filepath);

    return jsonSuccess([
        'file_url' => '/exports/' . $filename,
        'file_name' => $filename,
        'count' => count($experts),
    ]);
}
|
||||
|
||||
// ==================== Helper Methods ====================
|
||||
|
||||
/**
 * Pull a single email address out of free text such as an affiliation line.
 *
 * Addresses introduced by "Electronic address:" or "E-mail:" / "Email:" are
 * preferred; otherwise the first address-looking token in the text is used.
 *
 * @param string $text Text to scan.
 *
 * @return string Lower-cased address, or '' when none is found.
 */
private function extractEmailFromText($text)
{
    if (empty($text)) {
        return '';
    }

    $address = '/\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b/';

    $labelled = [
        '/[Ee]lectronic address:\s*([^\s;,]+@[^\s;,]+)/',
        '/[Ee]-?mail:\s*([^\s;,]+@[^\s;,]+)/',
    ];

    foreach ($labelled as $pattern) {
        if (!preg_match($pattern, $text, $m)) {
            continue;
        }

        // The labelled capture is deliberately loose and may carry wrapping
        // punctuation ("a@b.org)" or "<a@b.org>"). Narrow it to the address
        // itself when possible; keep the raw token otherwise.
        $token = preg_match($address, $m[1], $narrowed) ? $narrowed[1] : $m[1];

        return strtolower(trim($token, '.'));
    }

    if (preg_match($address, $text, $m)) {
        return strtolower(trim($m[1], '.'));
    }

    return '';
}
|
||||
|
||||
/**
 * Recursively collect email addresses from an XML node into $emails.
 *
 * For each child, the content of an <email> element is added first and the
 * child is then searched recursively; finally the node's own text is scanned
 * for address-looking tokens. Addresses are lower-cased and added only once.
 *
 * @param \SimpleXMLElement|null $node   Node to search; null is a no-op.
 * @param array                  $emails Accumulator, modified in place.
 *
 * @return void
 */
private function extractEmailsFromNode($node, &$emails)
{
    if ($node === null) {
        return;
    }

    foreach ($node->children() as $element) {
        if ($element->getName() === 'email') {
            $candidate = strtolower(trim((string) $element));
            $isNew = !in_array($candidate, $emails);
            if (!empty($candidate) && $isNew) {
                $emails[] = $candidate;
            }
        }

        $this->extractEmailsFromNode($element, $emails);
    }

    $ownText = (string) $node;
    $found = preg_match_all('/\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b/', $ownText, $hits);
    if (!$found) {
        return;
    }

    foreach ($hits[1] as $raw) {
        $candidate = strtolower(trim($raw, '.'));
        if (in_array($candidate, $emails)) {
            continue;
        }
        $emails[] = $candidate;
    }
}
|
||||
|
||||
/**
 * Remove email addresses (and their "Electronic address:" / "E-mail:"
 * labels) from an affiliation string and trim leftover punctuation.
 *
 * @param string $text Affiliation text.
 *
 * @return string Cleaned text; '' for empty input.
 */
private function cleanAffiliation($text)
{
    $text = (string) $text;
    if ($text === '') {
        return '';
    }

    $patterns = [
        // Labelled addresses. Labels are matched case-insensitively (covers
        // "E-Mail:", "EMAIL:") and the address token stops at a closing
        // bracket so "(E-mail: a@b.org)" does not leave a dangling "(".
        '/\s*electronic address:\s*[^\s;,)\]>]+@[^\s;,)\]>]+/i',
        '/\s*e-?mail:\s*[^\s;,)\]>]+@[^\s;,)\]>]+/i',
        // Any remaining bare address.
        '/\s*\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b/',
        // Brackets left empty by the removals above, e.g. "Dept X ()".
        '/\s*(?:\(\s*\)|\[\s*\]|<\s*>)/',
    ];

    $cleaned = preg_replace($patterns, '', $text);
    if ($cleaned === null) {
        // preg_replace() signals an engine error with null; keep the input.
        $cleaned = $text;
    }

    return trim($cleaned, " \t\n\r\0\x0B.,;");
}
|
||||
|
||||
/**
 * Flatten an XML node to plain text: serialise it, strip all tags, decode
 * entities and collapse runs of whitespace to single spaces.
 *
 * @param \SimpleXMLElement|null $node Node to flatten.
 *
 * @return string Trimmed text; '' when $node is null.
 */
private function xmlNodeToString($node)
{
    if ($node === null) {
        return '';
    }

    $withoutTags = strip_tags($node->asXML());
    $decoded = html_entity_decode($withoutTags, ENT_QUOTES | ENT_XML1, 'UTF-8');
    $singleSpaced = preg_replace('/\s+/', ' ', $decoded);

    return trim($singleSpaced);
}
|
||||
}
|
||||
Reference in New Issue
Block a user