tougao/application/common/ExpertFinderService.php

<?php

namespace app\common;

use think\Db;
use GuzzleHttp\Client;

class ExpertFinderService
{
    private $httpClient;
    private $ncbiBaseUrl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
    private $logFile;

    public function __construct()
    {
        $this->httpClient = new Client([
            'timeout'         => 180,
            'connect_timeout' => 15,
            'verify'          => false,
        ]);
        $this->logFile = ROOT_PATH . 'runtime' . DS . 'expert_finder.log';
    }

    public function doFetchForField($field, $source = 'pubmed', $perPage = 100, $minYear = null)
    {
        if ($minYear === null) {
            $minYear = date('Y') - 3;
        }

        $fetchLog = $this->getFetchLog($field, $source);
        $page     = $fetchLog['last_page'] + 1;

        if ($source === 'pmc') {
            $result = $this->searchViaPMC($field, $perPage, $minYear, $page);
        } else {
            $result = $this->searchViaPubMed($field, $perPage, $minYear, $page);
        }

        $saveResult = $this->saveExperts($result['experts'], $field, $source);

        $nextPage   = $result['has_more'] ? $page : 0;
        $totalPages = isset($result['total_pages']) ? $result['total_pages'] : 0;
        $this->updateFetchLog($field, $source, $nextPage, $totalPages);

        return [
            'keyword'        => $field,
            'page'           => $page,
            'experts_found'  => $result['total'],
            'saved_new'      => $saveResult['inserted'],
            'saved_exist'    => $saveResult['existing'],
            'field_enriched' => $saveResult['field_enriched'],
            'has_more'       => $result['has_more'],
        ];
    }

    public function searchExperts($keyword, $perPage, $minYear, $page, $source)
    {
        if ($source === 'pmc') {
            return $this->searchViaPMC($keyword, $perPage, $minYear, $page);
        }
        return $this->searchViaPubMed($keyword, $perPage, $minYear, $page);
    }

    public function saveExperts($experts, $field, $source)
    {
        $inserted    = 0;
        $existing    = 0;
        $fieldEnrich = 0;

        foreach ($experts as $expert) {
            $email = strtolower(trim($expert['email']));
            if (empty($email)) {
                continue;
            }

            $exists = Db::name('expert')->where('email', $email)->find();

            if ($exists) {
                $existing++;
                $fieldEnrich += $this->enrichExpertField($exists['expert_id'], $field);
                continue;
            }

            $insert = [
                'name'        => mb_substr($expert['name'], 0, 255),
                'email'       => mb_substr($email, 0, 128),
                'affiliation' => mb_substr($expert['affiliation'], 0, 128),
                'source'      => mb_substr($source, 0, 128),
                'ctime'       => time(),
                'ltime'       => 0,
                'state'       => 0,
            ];

            try {
                $expertId = Db::name('expert')->insertGetId($insert);
                $this->enrichExpertField($expertId, $field);
                $inserted++;
            } catch (\Exception $e) {
                $existing++;
            }
        }

        return ['inserted' => $inserted, 'existing' => $existing, 'field_enriched' => $fieldEnrich];
    }

    public function getFetchLog($field, $source)
    {
        $log = Db::name('expert_fetch')
            ->where('field', $field)
            ->where('source', $source)
            ->find();

        if (!$log) {
            return ['last_page' => 0, 'total_pages' => 0, 'last_time' => 0];
        }

        return $log;
    }

    public function updateFetchLog($field, $source, $lastPage, $totalPages)
    {
        $exists = Db::name('expert_fetch')
            ->where('field', $field)
            ->where('source', $source)
            ->find();

        if ($exists) {
            Db::name('expert_fetch')
                ->where('expert_fetch_id', $exists['expert_fetch_id'])
                ->update([
                    'last_page'   => $lastPage,
                    'total_pages' => $totalPages,
                    'last_time'   => time(),
                ]);
        } else {
            Db::name('expert_fetch')->insert([
                'field'       => mb_substr($field, 0, 128),
                'source'      => mb_substr($source, 0, 128),
                'last_page'   => $lastPage,
                'total_pages' => $totalPages,
                'last_time'   => time(),
            ]);
        }
    }

    // ==================== PubMed Search ====================

    private function searchViaPubMed($keyword, $perPage, $minYear, $page = 1)
    {
        set_time_limit(600);

        $searchResult  = $this->esearch('pubmed', $keyword, $perPage, $minYear, $page);
        $ids           = $searchResult['ids'];
        $totalArticles = $searchResult['total'];

        if (empty($ids)) {
            return $this->buildPagedResult([], 0, 0, $totalArticles, $page, $perPage, 'pubmed');
        }

        $allAuthors = [];
        $batches = array_chunk($ids, 50);
        foreach ($batches as $batch) {
            $xml = $this->efetchWithRetry('pubmed', $batch);
            if ($xml) {
                $authors = $this->parsePubMedXml($xml);
                $allAuthors = array_merge($allAuthors, $authors);
            }
            usleep(400000);
        }

        $experts = $this->aggregateExperts($allAuthors);

        return $this->buildPagedResult($experts, count($experts), count($ids), $totalArticles, $page, $perPage, 'pubmed');
    }

    // ==================== PMC Search ====================

    private function searchViaPMC($keyword, $perPage, $minYear, $page = 1)
    {
        set_time_limit(600);

        $searchResult  = $this->esearch('pmc', $keyword, $perPage, $minYear, $page);
        $ids           = $searchResult['ids'];
        $totalArticles = $searchResult['total'];

        if (empty($ids)) {
            return $this->buildPagedResult([], 0, 0, $totalArticles, $page, $perPage, 'pmc');
        }

        $allAuthors = [];
        $batches = array_chunk($ids, 5);
        foreach ($batches as $batch) {
            $xml = $this->efetchWithRetry('pmc', $batch);
            if ($xml) {
                $authors = $this->parsePMCXml($xml);
                $allAuthors = array_merge($allAuthors, $authors);
            }
            usleep(500000);
        }

        $experts = $this->aggregateExperts($allAuthors);

        return $this->buildPagedResult($experts, count($experts), count($ids), $totalArticles, $page, $perPage, 'pmc');
    }

    // ==================== NCBI API ====================

    private function esearch($db, $keyword, $perPage, $minYear, $page = 1)
    {
        $term     = $keyword . ' AND ' . $minYear . ':' . date('Y') . '[pdat]';
        $retstart = ($page - 1) * $perPage;

        $response = $this->httpClient->get($this->ncbiBaseUrl . 'esearch.fcgi', [
            'query' => [
                'db'       => $db,
                'term'     => $term,
                'retstart' => $retstart,
                'retmax'   => $perPage,
                'retmode'  => 'json',
                'sort'     => 'relevance',
            ],
        ]);

        $data  = json_decode($response->getBody()->getContents(), true);
        $ids   = $data['esearchresult']['idlist'] ?? [];
        $total = intval($data['esearchresult']['count'] ?? 0);

        return ['ids' => $ids, 'total' => $total];
    }

    private function efetch($db, $ids)
    {
        $response = $this->httpClient->post($this->ncbiBaseUrl . 'efetch.fcgi', [
            'form_params' => [
                'db'      => $db,
                'id'      => implode(',', $ids),
                'retmode' => 'xml',
            ],
        ]);

        return $response->getBody()->getContents();
    }

    private function efetchWithRetry($db, $ids, $maxRetries = 3)
    {
        for ($attempt = 1; $attempt <= $maxRetries; $attempt++) {
            try {
                return $this->efetch($db, $ids);
            } catch (\Exception $e) {
                if ($attempt === $maxRetries) {
                    if (count($ids) > 1) {
                        $half = ceil(count($ids) / 2);
                        $firstHalf  = array_slice($ids, 0, $half);
                        $secondHalf = array_slice($ids, $half);
                        $xml1 = $this->efetchWithRetry($db, $firstHalf, 2);
                        $xml2 = $this->efetchWithRetry($db, $secondHalf, 2);
                        return $this->mergeXml($xml1, $xml2);
                    }
                    return null;
                }
                sleep($attempt * 2);
            }
        }
        return null;
    }

    private function mergeXml($xml1, $xml2)
    {
        if (empty($xml1)) return $xml2;
        if (empty($xml2)) return $xml1;
        return $xml1 . "\n" . $xml2;
    }

    // ==================== PubMed XML Parsing ====================

    private function parsePubMedXml($xmlString)
    {
        $results = [];
        libxml_use_internal_errors(true);
        $xml = simplexml_load_string($xmlString);
        if ($xml === false) {
            return $results;
        }

        foreach ($xml->PubmedArticle as $article) {
            $citation   = $article->MedlineCitation;
            $articleData = $citation->Article;
            $title = $this->xmlNodeToString($articleData->ArticleTitle);
            $pmid  = (string) $citation->PMID;

            $journal = '';
            if (isset($articleData->Journal->Title)) {
                $journal = (string) $articleData->Journal->Title;
            }
            if (!isset($articleData->AuthorList->Author)) {
                continue;
            }

            foreach ($articleData->AuthorList->Author as $author) {
                $lastName  = (string) ($author->LastName ?? '');
                $foreName  = (string) ($author->ForeName ?? '');
                $fullName  = trim($foreName . ' ' . $lastName);
                if (empty($fullName)) continue;

                $email       = '';
                $affiliation = '';
                if (isset($author->AffiliationInfo)) {
                    foreach ($author->AffiliationInfo as $affInfo) {
                        $affText = (string) $affInfo->Affiliation;
                        if (empty($affiliation)) $affiliation = $affText;
                        if (empty($email)) $email = $this->extractEmailFromText($affText);
                    }
                }
                if (empty($email)) continue;

                $results[] = [
                    'name'          => $fullName,
                    'email'         => strtolower($email),
                    'affiliation'   => $this->cleanAffiliation($affiliation),
                    'article_title' => $title,
                    'article_id'    => $pmid,
                    'journal'       => $journal,
                ];
            }
        }

        return $results;
    }

    // ==================== PMC XML Parsing ====================

    private function parsePMCXml($xmlString)
    {
        $results = [];
        libxml_use_internal_errors(true);
        $xml = simplexml_load_string($xmlString);
        if ($xml === false) {
            return $results;
        }

        $articles = $xml->article ?? $xml->children();

        foreach ($articles as $article) {
            if ($article->getName() !== 'article') continue;

            $front = $article->front;
            if (!$front) continue;
            $articleMeta = $front->{'article-meta'};
            if (!$articleMeta) continue;

            $title = $this->xmlNodeToString($articleMeta->{'title-group'}->{'article-title'} ?? null);
            $pmcId = '';
            if (isset($articleMeta->{'article-id'})) {
                foreach ($articleMeta->{'article-id'} as $idNode) {
                    if ((string) $idNode['pub-id-type'] === 'pmc') {
                        $pmcId = (string) $idNode;
                    }
                }
            }

            $journal = '';
            if (isset($front->{'journal-meta'}->{'journal-title'})) {
                $journal = (string) $front->{'journal-meta'}->{'journal-title'};
            } elseif (isset($front->{'journal-meta'}->{'journal-title-group'}->{'journal-title'})) {
                $journal = (string) $front->{'journal-meta'}->{'journal-title-group'}->{'journal-title'};
            }

            $correspEmails = [];
            if (isset($articleMeta->{'author-notes'})) {
                $this->extractEmailsFromNode($articleMeta->{'author-notes'}, $correspEmails);
            }

            $affiliationMap = [];
            if (isset($articleMeta->{'contrib-group'})) {
                foreach ($articleMeta->{'contrib-group'}->children() as $child) {
                    if ($child->getName() === 'aff') {
                        $affId   = (string) ($child['id'] ?? '');
                        $affText = $this->xmlNodeToString($child);
                        if ($affId) $affiliationMap[$affId] = $affText;
                    }
                }
            }
            if (isset($front->{'article-meta'}->{'aff'})) {
                foreach ($front->{'article-meta'}->{'aff'} as $aff) {
                    $affId   = (string) ($aff['id'] ?? '');
                    $affText = $this->xmlNodeToString($aff);
                    if ($affId) $affiliationMap[$affId] = $affText;
                }
            }

            if (!isset($articleMeta->{'contrib-group'})) continue;

            foreach ($articleMeta->{'contrib-group'}->contrib as $contrib) {
                if ((string) ($contrib['contrib-type'] ?? '') !== 'author') continue;
                $nameNode = $contrib->name;
                if (!$nameNode) continue;

                $surname    = (string) ($nameNode->surname ?? '');
                $givenNames = (string) ($nameNode->{'given-names'} ?? '');
                $fullName   = trim($givenNames . ' ' . $surname);
                if (empty($fullName)) continue;

                $email = '';
                if (isset($contrib->email)) {
                    $email = strtolower(trim((string) $contrib->email));
                }

                $affiliation = '';
                if (isset($contrib->xref)) {
                    foreach ($contrib->xref as $xref) {
                        if ((string) $xref['ref-type'] === 'aff') {
                            $rid = (string) $xref['rid'];
                            if (isset($affiliationMap[$rid])) {
                                $affiliation = $affiliationMap[$rid];
                                break;
                            }
                        }
                    }
                }
                if (empty($affiliation) && isset($contrib->aff)) {
                    $affiliation = $this->xmlNodeToString($contrib->aff);
                }

                $isCorresponding = false;
                if (isset($contrib->xref)) {
                    foreach ($contrib->xref as $xref) {
                        if ((string) $xref['ref-type'] === 'corresp') $isCorresponding = true;
                    }
                }
                if ((string) ($contrib['corresp'] ?? '') === 'yes') $isCorresponding = true;

                if (empty($email) && $isCorresponding && !empty($correspEmails)) {
                    $email = $correspEmails[0];
                }
                if (empty($email)) {
                    $extracted = $this->extractEmailFromText($affiliation);
                    if ($extracted) $email = $extracted;
                }
                if (empty($email)) continue;

                $results[] = [
                    'name'          => $fullName,
                    'email'         => strtolower($email),
                    'affiliation'   => $this->cleanAffiliation($affiliation),
                    'article_title' => $title,
                    'article_id'    => $pmcId,
                    'journal'       => $journal,
                ];
            }
        }

        return $results;
    }

    // ==================== Aggregation / Pagination ====================

    private function aggregateExperts($authorRecords)
    {
        $map = [];
        foreach ($authorRecords as $record) {
            $key = strtolower(trim($record['email']));
            if (empty($key)) continue;

            if (!isset($map[$key])) {
                $map[$key] = [
                    'name'        => $record['name'],
                    'email'       => $record['email'],
                    'affiliation' => $record['affiliation'],
                    'paper_count' => 0,
                    'papers'      => [],
                ];
            }
            $map[$key]['paper_count']++;
            if (count($map[$key]['papers']) < 10) {
                $map[$key]['papers'][] = [
                    'title'      => $record['article_title'],
                    'article_id' => $record['article_id'],
                    'journal'    => $record['journal'],
                ];
            }
            if (empty($map[$key]['affiliation']) && !empty($record['affiliation'])) {
                $map[$key]['affiliation'] = $record['affiliation'];
            }
        }

        $experts = array_values($map);
        usort($experts, function ($a, $b) {
            return $b['paper_count'] - $a['paper_count'];
        });
        return $experts;
    }

    private function buildPagedResult($experts, $expertCount, $articlesScanned, $totalArticles, $page, $perPage, $source)
    {
        $totalPages = $totalArticles > 0 ? ceil($totalArticles / $perPage) : 0;
        return [
            'experts'          => $experts,
            'total'            => $expertCount,
            'articles_scanned' => $articlesScanned,
            'total_articles'   => $totalArticles,
            'page'             => $page,
            'per_page'         => $perPage,
            'total_pages'      => $totalPages,
            'has_more'         => $page < $totalPages,
            'source'           => $source,
        ];
    }

    // ==================== DB Helpers ====================

    private function enrichExpertField($expertId, $field)
    {
        $field = trim($field);
        if (empty($field)) return 0;

        $exists = Db::name('expert_field')
            ->where('expert_id', $expertId)
            ->where('field', $field)
            ->where('state', 0)
            ->find();
        if ($exists) return 0;
        $major = Db::name("major")->where("major_title",$field)->where("state",0)->find();
        $major_id = $major ? $major['major_id'] : 0;
        Db::name('expert_field')->insert([
            'expert_id' => $expertId,
            'major_id'  => $major_id,
            'field'     => mb_substr($field, 0, 128),
            'state'     => 0,
        ]);
        return 1;
    }

    // ==================== Text Helpers ====================

    private function extractEmailFromText($text)
    {
        if (empty($text)) return '';
        if (preg_match('/[Ee]lectronic address:\s*([^\s;,]+@[^\s;,]+)/', $text, $m)) {
            return strtolower(trim($m[1], '.'));
        }
        if (preg_match('/[Ee]-?mail:\s*([^\s;,]+@[^\s;,]+)/', $text, $m)) {
            return strtolower(trim($m[1], '.'));
        }
        if (preg_match('/\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b/', $text, $m)) {
            return strtolower(trim($m[1], '.'));
        }
        return '';
    }

    private function extractEmailsFromNode($node, &$emails)
    {
        if ($node === null) return;
        foreach ($node->children() as $child) {
            if ($child->getName() === 'email') {
                $email = strtolower(trim((string) $child));
                if (!empty($email) && !in_array($email, $emails)) $emails[] = $email;
            }
            $this->extractEmailsFromNode($child, $emails);
        }
        $text = (string) $node;
        if (preg_match_all('/\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b/', $text, $matches)) {
            foreach ($matches[1] as $email) {
                $email = strtolower(trim($email, '.'));
                if (!in_array($email, $emails)) $emails[] = $email;
            }
        }
    }

    private function cleanAffiliation($text)
    {
        $text = preg_replace('/\s*[Ee]lectronic address:\s*[^\s;,]+@[^\s;,]+/', '', $text);
        $text = preg_replace('/\s*[Ee]-?mail:\s*[^\s;,]+@[^\s;,]+/', '', $text);
        $text = preg_replace('/\s*\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b/', '', $text);
        return trim($text, " \t\n\r\0\x0B.,;");
    }

    private function xmlNodeToString($node)
    {
        if ($node === null) return '';
        $xml  = $node->asXML();
        $text = strip_tags($xml);
        $text = html_entity_decode($text, ENT_QUOTES | ENT_XML1, 'UTF-8');
        return trim(preg_replace('/\s+/', ' ', $text));
    }

    // ==================== Logging ====================

    public function log($msg)
    {
        $line = date('Y-m-d H:i:s') . ' ' . $msg . PHP_EOL;
        @file_put_contents($this->logFile, $line, FILE_APPEND);
    }
}