httpClient = new Client([ 'timeout' => 60, 'verify' => false, ]); }

    /**
     * Main search endpoint.
     *
     * Request params:
     *   keyword     - search term (e.g. "biomedical engineering"), required
     *   max_results - max articles to scan, default 200, clamped to 10..1000
     *   min_year    - earliest publication year, default current year - 3
     *   source      - "pmc" (slower, structured email); any other value is
     *                 handled as "pubmed" (fast, email taken from affiliation)
     *
     * A successful result is cached for one hour per parameter combination.
     *
     * @return mixed Whatever jsonSuccess() / jsonError() return.
     */
    public function search()
    {
        $params = $this->readSearchParams();

        if (empty($params['keyword'])) {
            return jsonError('keyword is required');
        }

        $cached = Cache::get($params['cache_key']);
        if ($cached) {
            return jsonSuccess($cached);
        }

        try {
            $result = $this->runSearch($params);
        } catch (\Exception $e) {
            return jsonError('Search failed: ' . $e->getMessage());
        }

        Cache::set($params['cache_key'], $result, 3600);

        return jsonSuccess($result);
    }

    /**
     * Export search results to an Excel file.
     *
     * Accepts the same request params as search(). A cached search result is
     * reused when present; otherwise the search is executed and cached first.
     *
     * @return mixed Whatever jsonSuccess() / jsonError() return.
     */
    public function export()
    {
        $params = $this->readSearchParams();

        if (empty($params['keyword'])) {
            return jsonError('keyword is required');
        }

        $result = Cache::get($params['cache_key']);
        if (!$result) {
            try {
                $result = $this->runSearch($params);
                Cache::set($params['cache_key'], $result, 3600);
            } catch (\Exception $e) {
                return jsonError('Search failed: ' . $e->getMessage());
            }
        }

        if (empty($result['experts'])) {
            return jsonError('No experts found to export');
        }

        return $this->generateExcel($result['experts'], $params['keyword']);
    }

    /**
     * Clear the cached result for one parameter combination.
     *
     * Accepts the same request params as search(). The parameters go through
     * the same normalisation as search()/export(), so the entry removed here
     * is exactly the one those endpoints would have written.
     *
     * @return mixed Whatever jsonSuccess() returns.
     */
    public function clearCache()
    {
        $params = $this->readSearchParams();

        Cache::rm($params['cache_key']);

        return jsonSuccess(['msg' => 'Cache cleared']);
    }

    /**
     * Read and normalise the request parameters shared by search(), export()
     * and clearCache(), and derive the cache key from the normalised values.
     *
     * @return array Keys: keyword (string), max_results (int), min_year (int),
     *               source ("pmc"|"pubmed"), cache_key (string).
     */
    private function readSearchParams()
    {
        $currentYear = (int) date('Y');

        // A non-scalar value (e.g. keyword[]=x) cannot be a search term.
        $keyword = $this->request->param('keyword', '');
        $keyword = is_scalar($keyword) ? trim((string) $keyword) : '';

        // Clamp BEFORE building the cache key so every endpoint agrees on it.
        $maxResults = intval($this->request->param('max_results', 200));
        $maxResults = max(10, min($maxResults, 1000));

        $minYear = intval($this->request->param('min_year', $currentYear - 3));

        // Only the exact value "pmc" selects PMC; everything else runs the
        // PubMed search, so it should also share the PubMed cache entry.
        $source = $this->request->param('source', 'pubmed') === 'pmc' ? 'pmc' : 'pubmed';

        // serialize() keeps field boundaries, so ("abc1", 10) and ("abc", 110)
        // can no longer hash to the same key as plain concatenation allowed.
        $cacheKey = 'expert_finder_' . md5(serialize([$keyword, $maxResults, $minYear, $source]));

        return [
            'keyword' => $keyword,
            'max_results' => $maxResults,
            'min_year' => $minYear,
            'source' => $source,
            'cache_key' => $cacheKey,
        ];
    }

    /**
     * Run the search against the database selected by the normalised params.
     *
     * @param array $params Result of readSearchParams().
     * @return array See collectExperts().
     */
    private function runSearch(array $params)
    {
        if ($params['source'] === 'pmc') {
            return $this->searchViaPMC($params['keyword'], $params['max_results'], $params['min_year']);
        }

        return $this->searchViaPubMed($params['keyword'], $params['max_results'], $params['min_year']);
    }

    // ==================== PubMed Search ====================

    /**
     * Search PubMed, fetching article records in batches of 200 with a
     * 340 ms pause after each batch.
     *
     * @param string $keyword
     * @param int    $maxResults
     * @param int    $minYear
     * @return array See collectExperts().
     */
    private function searchViaPubMed($keyword, $maxResults, $minYear)
    {
        return $this->collectExperts('pubmed', $keyword, $maxResults, $minYear, 200, 340000);
    }

    // ==================== PMC Search ====================

    /**
     * Search PMC, fetching article records in batches of 20 with a
     * 400 ms pause after each batch.
     *
     * @param string $keyword
     * @param int    $maxResults
     * @param int    $minYear
     * @return array See collectExperts().
     */
    private function searchViaPMC($keyword, $maxResults, $minYear)
    {
        return $this->collectExperts('pmc', $keyword, $maxResults, $minYear, 20, 400000);
    }

    /**
     * Shared search pipeline: find article ids, fetch them batch by batch,
     * parse author records out of each batch and aggregate them by email.
     *
     * @param string $db          "pubmed" or "pmc"
     * @param string $keyword
     * @param int    $maxResults
     * @param int    $minYear
     * @param int    $batchSize   Number of ids sent per efetch call.
     * @param int    $pauseMicros Microseconds to sleep after each efetch call.
     * @return array Keys: experts (list), total (int), articles_scanned (int),
     *               source (string).
     */
    private function collectExperts($db, $keyword, $maxResults, $minYear, $batchSize, $pauseMicros)
    {
        $ids = $this->esearch($db, $keyword, $maxResults, $minYear);
        if (empty($ids)) {
            return ['experts' => [], 'total' => 0, 'articles_scanned' => 0, 'source' => $db];
        }

        $authorRecords = [];
        foreach (array_chunk($ids, $batchSize) as $batch) {
            $xml = $this->efetch($db, $batch);
            $parsed = $db === 'pmc' ? $this->parsePMCXml($xml) : $this->parsePubMedXml($xml);

            // Append in place instead of re-copying the whole list per batch.
            foreach ($parsed as $record) {
                $authorRecords[] = $record;
            }

            usleep($pauseMicros);
        }

        $experts = $this->aggregateExperts($authorRecords);

        return [
            'experts' => $experts,
            'total' => count($experts),
            'articles_scanned' => count($ids),
            'source' => $db,
        ];
    }

    // ==================== NCBI API Calls ====================

    /**
     * Call esearch.fcgi and return the matching article ids.
     *
     * The query is "<keyword> AND <minYear>:<currentYear>[pdat]", sorted by
     * relevance and limited to $maxResults ids.
     *
     * @param string $db
     * @param string $keyword
     * @param int    $maxResults
     * @param int    $minYear
     * @return array List of id strings; empty when the response has no id list.
     */
    private function esearch($db, $keyword, $maxResults, $minYear)
    {
        $term = $keyword . ' AND ' . $minYear . ':' . date('Y') . '[pdat]';

        $response = $this->httpClient->get($this->ncbiBaseUrl . 'esearch.fcgi', [
            'query' => [
                'db' => $db,
                'term' => $term,
                'retmax' => $maxResults,
                'retmode' => 'json',
                'sort' => 'relevance',
            ],
        ]);

        $data = json_decode($response->getBody()->getContents(), true);
        $idList = $data['esearchresult']['idlist'] ?? [];

        // Callers chunk the result with array_chunk(), which needs an array.
        return is_array($idList) ? $idList : [];
    }

    /**
     * Call efetch.fcgi for a batch of ids and return the raw XML body.
     *
     * The ids are sent as a POST form field rather than in the query string.
     *
     * @param string $db
     * @param array  $ids
     * @return string
     */
    private function efetch($db, $ids)
    {
        $response = $this->httpClient->post($this->ncbiBaseUrl . 'efetch.fcgi', [
            'form_params' => [
                'db' => $db,
                'id' => implode(',', $ids),
                'retmode' => 'xml',
            ],
        ]);

        return $response->getBody()->getContents();
    }

    /**
     * Parse an XML string without emitting libxml warnings.
     *
     * libxml's internal error buffer is cleared and the previous error-handling
     * mode restored afterwards, so repeated batches do not accumulate error
     * objects and the global libxml setting is left as it was found.
     *
     * @param mixed $xmlString
     * @return \SimpleXMLElement|false False for empty or unparsable input.
     */
    private function loadXmlQuietly($xmlString)
    {
        if (!is_string($xmlString) || trim($xmlString) === '') {
            return false;
        }

        $previousMode = libxml_use_internal_errors(true);
        // LIBXML_NONET: the payload is remote data; never let it trigger fetches.
        $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NONET);
        libxml_clear_errors();
        libxml_use_internal_errors($previousMode);

        return $xml;
    }

    // ==================== PubMed XML Parsing ====================

    /**
     * Extract one record per (author, article) from a PubMed efetch response.
     *
     * Only authors for whom an email can be found in one of their affiliation
     * strings are returned; authors without a name or email are skipped.
     *
     * @param string $xmlString
     * @return array List of records with keys name, email, affiliation,
     *               article_title, article_id, journal.
     */
    private function parsePubMedXml($xmlString)
    {
        $results = [];

        $xml = $this->loadXmlQuietly($xmlString);
        if ($xml === false) {
            return $results;
        }

        foreach ($xml->PubmedArticle as $article) {
            $citation = $article->MedlineCitation;
            $articleData = $citation->Article;

            if (!isset($articleData->AuthorList->Author)) {
                continue;
            }

            $title = $this->xmlNodeToString($articleData->ArticleTitle);
            $pmid = (string) $citation->PMID;
            $journal = isset($articleData->Journal->Title) ? (string) $articleData->Journal->Title : '';

            foreach ($articleData->AuthorList->Author as $author) {
                $lastName = (string) ($author->LastName ?? '');
                $foreName = (string) ($author->ForeName ?? '');
                $fullName = trim($foreName . ' ' . $lastName);
                if (empty($fullName)) {
                    continue;
                }

                // First affiliation wins as display text; the first
                // affiliation that contains an email supplies the email.
                $email = '';
                $affiliation = '';
                foreach ($author->AffiliationInfo as $affInfo) {
                    $affText = (string) $affInfo->Affiliation;
                    if (empty($affiliation)) {
                        $affiliation = $affText;
                    }
                    if (empty($email)) {
                        $email = $this->extractEmailFromText($affText);
                    }
                }

                if (empty($email)) {
                    continue;
                }

                $results[] = [
                    'name' => $fullName,
                    'email' => strtolower($email),
                    'affiliation' => $this->cleanAffiliation($affiliation),
                    'article_title' => $title,
                    'article_id' => $pmid,
                    'journal' => $journal,
                ];
            }
        }

        return $results;
    }

    // ==================== PMC XML Parsing ====================

    /**
     * Extract one record per (author, article) from a PMC efetch response.
     *
     * The email of an author is resolved in this order: the contributor's own
     * <email> element, the first email found in <author-notes> when the author
     * is flagged as corresponding, then any email embedded in the affiliation
     * text. Authors without a name or a resolvable email are skipped.
     *
     * @param string $xmlString
     * @return array List of records with keys name, email, affiliation,
     *               article_title, article_id, journal.
     */
    private function parsePMCXml($xmlString)
    {
        $results = [];

        $xml = $this->loadXmlQuietly($xmlString);
        if ($xml === false) {
            return $results;
        }

        $articles = $xml->article ?? $xml->children();

        foreach ($articles as $article) {
            if ($article->getName() !== 'article') {
                continue;
            }

            $front = $article->front;
            if (!$front) {
                continue;
            }

            $articleMeta = $front->{'article-meta'};
            if (!$articleMeta) {
                continue;
            }

            if (!isset($articleMeta->{'contrib-group'})) {
                continue;
            }

            $title = $this->xmlNodeToString($articleMeta->{'title-group'}->{'article-title'} ?? null);

            $pmcId = '';
            foreach ($articleMeta->{'article-id'} as $idNode) {
                if ((string) $idNode['pub-id-type'] === 'pmc') {
                    $pmcId = (string) $idNode;
                }
            }

            $journal = '';
            $journalMeta = $front->{'journal-meta'};
            if (isset($journalMeta->{'journal-title'})) {
                $journal = (string) $journalMeta->{'journal-title'};
            } elseif (isset($journalMeta->{'journal-title-group'}->{'journal-title'})) {
                $journal = (string) $journalMeta->{'journal-title-group'}->{'journal-title'};
            }

            $correspEmails = [];
            if (isset($articleMeta->{'author-notes'})) {
                $this->extractEmailsFromNode($articleMeta->{'author-notes'}, $correspEmails);
            }

            // Affiliation id => text. <aff> may sit inside any <contrib-group>
            // or directly under <article-meta>; the latter overrides on clash.
            $affiliationMap = [];
            foreach ($articleMeta->{'contrib-group'} as $group) {
                foreach ($group->aff as $aff) {
                    $affId = (string) ($aff['id'] ?? '');
                    if ($affId) {
                        $affiliationMap[$affId] = $this->xmlNodeToString($aff);
                    }
                }
            }
            foreach ($articleMeta->aff as $aff) {
                $affId = (string) ($aff['id'] ?? '');
                if ($affId) {
                    $affiliationMap[$affId] = $this->xmlNodeToString($aff);
                }
            }

            // Walk every <contrib-group>, not just the first one: property
            // access on a SimpleXML sibling list only reaches its first item.
            foreach ($articleMeta->{'contrib-group'} as $group) {
                foreach ($group->contrib as $contrib) {
                    if ((string) ($contrib['contrib-type'] ?? '') !== 'author') {
                        continue;
                    }

                    $nameNode = $contrib->name;
                    if (!$nameNode) {
                        continue;
                    }

                    $surname = (string) ($nameNode->surname ?? '');
                    $givenNames = (string) ($nameNode->{'given-names'} ?? '');
                    $fullName = trim($givenNames . ' ' . $surname);
                    if (empty($fullName)) {
                        continue;
                    }

                    $email = '';
                    if (isset($contrib->email)) {
                        $email = strtolower(trim((string) $contrib->email));
                    }

                    $affiliation = '';
                    $isCorresponding = (string) ($contrib['corresp'] ?? '') === 'yes';
                    foreach ($contrib->xref as $xref) {
                        $refType = (string) $xref['ref-type'];
                        if ($refType === 'corresp') {
                            $isCorresponding = true;
                        } elseif ($refType === 'aff' && $affiliation === '') {
                            // rid may list several ids separated by whitespace.
                            $rids = preg_split('/\s+/', (string) $xref['rid'], -1, PREG_SPLIT_NO_EMPTY);
                            foreach ($rids as $rid) {
                                if (isset($affiliationMap[$rid])) {
                                    $affiliation = $affiliationMap[$rid];
                                    break;
                                }
                            }
                        }
                    }

                    if (empty($affiliation) && isset($contrib->aff)) {
                        $affiliation = $this->xmlNodeToString($contrib->aff);
                    }

                    if (empty($email) && $isCorresponding && !empty($correspEmails)) {
                        $email = $correspEmails[0];
                    }

                    if (empty($email)) {
                        $extracted = $this->extractEmailFromText($affiliation);
                        if ($extracted) {
                            $email = $extracted;
                        }
                    }

                    if (empty($email)) {
                        continue;
                    }

                    $results[] = [
                        'name' => $fullName,
                        'email' => strtolower($email),
                        'affiliation' => $this->cleanAffiliation($affiliation),
                        'article_title' => $title,
                        'article_id' => $pmcId,
                        'journal' => $journal,
                    ];
                }
            }
        }

        return $results;
    }

    // ==================== Aggregation ====================

    /**
     * Group author records by lower-cased email and rank by paper count.
     *
     * The first record seen for an email supplies name/email/affiliation; an
     * empty affiliation is back-filled from later records. At most 10 papers
     * are kept per expert. An article is counted once per email even when it
     * produced several records for that email (e.g. several corresponding
     * authors sharing the one address listed in the author notes).
     *
     * @param array $authorRecords Records as produced by the parse*Xml methods.
     * @return array List of experts (name, email, affiliation, paper_count,
     *               papers) sorted by paper_count descending.
     */
    private function aggregateExperts($authorRecords)
    {
        $expertsByEmail = [];
        $seenArticles = []; // email => [article id => true]

        foreach ($authorRecords as $record) {
            $key = strtolower(trim($record['email']));
            if (empty($key)) {
                continue;
            }

            if (!isset($expertsByEmail[$key])) {
                $expertsByEmail[$key] = [
                    'name' => $record['name'],
                    'email' => $record['email'],
                    'affiliation' => $record['affiliation'],
                    'paper_count' => 0,
                    'papers' => [],
                ];
                $seenArticles[$key] = [];
            }

            if (empty($expertsByEmail[$key]['affiliation']) && !empty($record['affiliation'])) {
                $expertsByEmail[$key]['affiliation'] = $record['affiliation'];
            }

            // Records without an article id cannot be de-duplicated safely.
            $articleId = (string) $record['article_id'];
            if ($articleId !== '') {
                if (isset($seenArticles[$key][$articleId])) {
                    continue;
                }
                $seenArticles[$key][$articleId] = true;
            }

            $expertsByEmail[$key]['paper_count']++;

            if (count($expertsByEmail[$key]['papers']) < 10) {
                $expertsByEmail[$key]['papers'][] = [
                    'title' => $record['article_title'],
                    'article_id' => $record['article_id'],
                    'journal' => $record['journal'],
                ];
            }
        }

        $experts = array_values($expertsByEmail);

        usort($experts, static function ($a, $b) {
            return $b['paper_count'] <=> $a['paper_count'];
        });

        return $experts;
    }

    // ==================== Excel Export ====================

    /**
     * Write the expert list to an .xlsx file under public/exports and return
     * its URL.
     *
     * All text columns are written as explicit strings: names, affiliations
     * and titles come from external XML, and a value starting with "=" would
     * otherwise be stored as a spreadsheet formula.
     *
     * @param array  $experts List as returned by aggregateExperts().
     * @param string $keyword Search term, used (sanitised) in the file name.
     * @return mixed jsonSuccess() payload with file_url, file_name and count,
     *               or jsonError() when the file cannot be written.
     */
    private function generateExcel($experts, $keyword)
    {
        $textType = \PhpOffice\PhpSpreadsheet\Cell\DataType::TYPE_STRING;

        try {
            $spreadsheet = new \PhpOffice\PhpSpreadsheet\Spreadsheet();
            $sheet = $spreadsheet->getActiveSheet();
            $sheet->setTitle('Experts');

            $headers = ['#', 'Name', 'Email', 'Affiliation', 'Paper Count', 'Representative Papers'];
            foreach ($headers as $col => $header) {
                $sheet->setCellValueByColumnAndRow($col + 1, 1, $header);
            }

            $headerStyle = [
                'font' => ['bold' => true, 'color' => ['rgb' => 'FFFFFF']],
                'fill' => [
                    'fillType' => \PhpOffice\PhpSpreadsheet\Style\Fill::FILL_SOLID,
                    'startColor' => ['rgb' => '4472C4'],
                ],
                'alignment' => ['horizontal' => \PhpOffice\PhpSpreadsheet\Style\Alignment::HORIZONTAL_CENTER],
            ];
            $sheet->getStyle('A1:F1')->applyFromArray($headerStyle);

            // array_values(): row numbers must not depend on the input keys.
            foreach (array_values($experts) as $i => $expert) {
                $row = $i + 2;
                $paperTitles = array_map(static function ($paper) {
                    return $paper['title'];
                }, $expert['papers']);

                $sheet->setCellValueByColumnAndRow(1, $row, $i + 1);
                $sheet->setCellValueExplicitByColumnAndRow(2, $row, (string) $expert['name'], $textType);
                $sheet->setCellValueExplicitByColumnAndRow(3, $row, (string) $expert['email'], $textType);
                $sheet->setCellValueExplicitByColumnAndRow(4, $row, (string) $expert['affiliation'], $textType);
                $sheet->setCellValueByColumnAndRow(5, $row, $expert['paper_count']);
                $sheet->setCellValueExplicitByColumnAndRow(6, $row, implode("\n", $paperTitles), $textType);
            }

            $columnWidths = ['A' => 6, 'B' => 25, 'C' => 35, 'D' => 50, 'E' => 12, 'F' => 60];
            foreach ($columnWidths as $column => $width) {
                $sheet->getColumnDimension($column)->setWidth($width);
            }

            // Cap the keyword part so a long query cannot exceed the
            // file-name length limit of the filesystem.
            $slug = substr(preg_replace('/[^a-zA-Z0-9]/', '_', $keyword), 0, 100);
            $filename = 'experts_' . $slug . '_' . date('Ymd_His') . '.xlsx';

            $dir = ROOT_PATH . 'public' . DS . 'exports';
            $filepath = $dir . DS . $filename;

            // The second is_dir() covers a concurrent request creating the
            // directory between the first check and mkdir().
            if (!is_dir($dir) && !mkdir($dir, 0755, true) && !is_dir($dir)) {
                throw new \RuntimeException('Unable to create export directory');
            }

            $writer = new \PhpOffice\PhpSpreadsheet\Writer\Xlsx($spreadsheet);
            $writer->save($filepath);

            // Break the workbook's internal cyclic references to free memory.
            $spreadsheet->disconnectWorksheets();
            unset($writer, $spreadsheet);
        } catch (\Exception $e) {
            return jsonError('Export failed: ' . $e->getMessage());
        }

        return jsonSuccess([
            'file_url' => '/exports/' . $filename,
            'file_name' => $filename,
            'count' => count($experts),
        ]);
    }

    // ==================== Helper Methods ====================

    /**
     * Find the first email address in free text.
     *
     * An address introduced by "Electronic address:" or "E-mail:"/"Email:" is
     * preferred over a bare address elsewhere in the text.
     *
     * @param string $text
     * @return string Lower-cased email, or '' when none is found.
     */
    private function extractEmailFromText($text)
    {
        if (empty($text)) {
            return '';
        }

        $labelledPatterns = [
            '/[Ee]lectronic address:\s*([^\s;,]+@[^\s;,]+)/',
            '/[Ee]-?mail:\s*([^\s;,]+@[^\s;,]+)/',
        ];

        foreach ($labelledPatterns as $pattern) {
            if (preg_match($pattern, $text, $m)) {
                // The token runs up to the next whitespace, so it can carry
                // wrapping punctuation such as "(E-mail: a@b.edu)." or
                // "<a@b.edu>"; strip that along with the trailing period.
                $candidate = trim($m[1], " \t.<>()[]{}\"'");
                if ($candidate !== '') {
                    return strtolower($candidate);
                }
            }
        }

        if (preg_match('/\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b/', $text, $m)) {
            return strtolower(trim($m[1], '.'));
        }

        return '';
    }

    /**
     * Recursively collect unique, lower-cased emails from an XML node.
     *
     * Emails are taken from <email> elements at any depth and from addresses
     * embedded in each visited node's own text.
     *
     * @param \SimpleXMLElement|null $node
     * @param array                  $emails Collected emails, appended in place.
     * @return void
     */
    private function extractEmailsFromNode($node, &$emails)
    {
        if ($node === null) {
            return;
        }

        foreach ($node->children() as $child) {
            if ($child->getName() === 'email') {
                $email = strtolower(trim((string) $child));
                if (!empty($email) && !in_array($email, $emails, true)) {
                    $emails[] = $email;
                }
            }
            $this->extractEmailsFromNode($child, $emails);
        }

        $text = (string) $node;
        if (preg_match_all('/\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b/', $text, $matches)) {
            foreach ($matches[1] as $email) {
                $email = strtolower(trim($email, '.'));
                if (!in_array($email, $emails, true)) {
                    $emails[] = $email;
                }
            }
        }
    }

    /**
     * Remove email addresses (with their "Electronic address:" / "E-mail:"
     * labels) from an affiliation string and trim leftover punctuation.
     *
     * @param string $text
     * @return string
     */
    private function cleanAffiliation($text)
    {
        $text = (string) $text;
        $text = preg_replace('/\s*[Ee]lectronic address:\s*[^\s;,]+@[^\s;,]+/', '', $text);
        $text = preg_replace('/\s*[Ee]-?mail:\s*[^\s;,]+@[^\s;,]+/', '', $text);
        $text = preg_replace('/\s*\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b/', '', $text);
        $text = trim((string) $text, " \t\n\r\0\x0B.,;");

        // Stripping "(e-mail: a@b.edu)" leaves an unmatched "(" at the end.
        return rtrim($text, " \t\n\r\0\x0B.,;(");
    }

    /**
     * Flatten an XML node to plain text: markup removed, XML entities decoded
     * and whitespace runs collapsed to single spaces.
     *
     * @param \SimpleXMLElement|null $node
     * @return string '' for null or for a node that cannot be serialised.
     */
    private function xmlNodeToString($node)
    {
        if ($node === null) {
            return '';
        }

        $xml = $node->asXML();
        if ($xml === false) {
            return '';
        }

        $text = strip_tags($xml);
        $text = html_entity_decode($text, ENT_QUOTES | ENT_XML1, 'UTF-8');

        return trim(preg_replace('/\s+/', ' ', $text));
    }
}