httpClient = new Client([ 'timeout' => 180, 'connect_timeout' => 15, 'verify' => false, ]); }

    /**
     * Main search endpoint.
     *
     * Request params:
     *   keyword  - search term (e.g. "biomedical engineering"), required
     *   page     - page number, default 1
     *   per_page - articles per page, default 100, clamped to 10..100
     *   min_year - earliest publication year, default current-3
     *   source   - "pubmed" (fast, email from affiliation) or "pmc" (slower, structured email)
     *
     * Results are cached for one hour under a key derived from all five params.
     * On a cache miss the found experts are also persisted via saveExperts().
     */
    public function search()
    {
        $keyword = trim($this->request->param('keyword', ''));
        $page    = max(1, intval($this->request->param('page', 1)));
        $perPage = max(10, min(intval($this->request->param('per_page', 100)), 100));
        $minYear = intval($this->request->param('min_year', date('Y') - 3));
        $source  = $this->normalizeSource($this->request->param('source', 'pubmed'));

        if ($keyword === '') {
            return jsonError('keyword is required');
        }

        $cacheKey = $this->buildSearchCacheKey($keyword, $page, $perPage, $minYear, $source);
        $cached   = Cache::get($cacheKey);
        if ($cached) {
            return jsonSuccess($cached);
        }

        try {
            $result = $this->searchBySource($keyword, $perPage, $minYear, $page, $source);
        } catch (\Exception $e) {
            return jsonError('Search failed: ' . $e->getMessage());
        }

        $saveResult            = $this->saveExperts($result['experts'], $keyword, $source);
        $result['saved_new']   = $saveResult['inserted'];
        $result['saved_exist'] = $saveResult['existing'];

        Cache::set($cacheKey, $result, 3600);

        return jsonSuccess($result);
    }

    /**
     * Get experts from the local database.
     *
     * Request params:
     *   field       - filter by field keyword (searches t_expert_field)
     *   major_id    - filter by major_id (searches t_expert_field)
     *   state       - filter by state (0-5), -1 for all
     *   keyword     - search name/email/affiliation
     *   no_recent   - if 1, exclude experts whose ltime is within the last N days
     *   recent_days - days threshold for the no_recent filter, default 30
     *   page        - page number, default 1
     *   per_page    - items per page, default 20, clamped to 1..100
     *   min_experts - auto-fetch threshold, default 50 (0 disables auto-fetch)
     *
     * When a field filter is given and fewer than min_experts rows match, a
     * background fetch for that field is requested and "fetching" is true.
     */
    public function getList()
    {
        $field      = trim($this->request->param('field', ''));
        $majorId    = intval($this->request->param('major_id', 0));
        // Cast so that a numeric -1 (e.g. from a JSON body) is treated like the string '-1'.
        $state      = (string) $this->request->param('state', '-1');
        $keyword    = trim($this->request->param('keyword', ''));
        $noRecent   = intval($this->request->param('no_recent', 0));
        $recentDays = max(1, intval($this->request->param('recent_days', 30)));
        $page       = max(1, intval($this->request->param('page', 1)));
        $perPage    = max(1, min(intval($this->request->param('per_page', 20)), 100));
        $minExperts = max(0, intval($this->request->param('min_experts', 50)));

        $query = Db::name('expert')->alias('e');

        if ($field !== '' || $majorId > 0) {
            $query->join('t_expert_field ef', 'ef.expert_id = e.expert_id AND ef.state = 0', 'inner');
            if ($field !== '') {
                $query->where('ef.field', 'like', '%' . $field . '%');
            }
            if ($majorId > 0) {
                $query->where('ef.major_id', $majorId);
            }
            // One expert can match several field rows; collapse them.
            $query->group('e.expert_id');
        }

        if ($state !== '-1' && $state !== '') {
            $query->where('e.state', intval($state));
        }

        if ($keyword !== '') {
            $query->where('e.name|e.email|e.affiliation', 'like', '%' . $keyword . '%');
        }

        if ($noRecent) {
            $cutoff = time() - ($recentDays * 86400);
            $query->where(function ($q) use ($cutoff) {
                $q->where('e.ltime', 0)->whereOr('e.ltime', '<', $cutoff);
            });
        }

        // Count on a clone so the count call does not disturb the list query.
        $countQuery = clone $query;
        $total      = $countQuery->count('distinct e.expert_id');

        $list = $query
            ->field('e.*')
            ->order('e.ctime desc')
            ->page($page, $perPage)
            ->select();

        // Load the fields of every listed expert in a single query instead of
        // one query per row, and avoid iterating by reference.
        $expertIds = [];
        foreach ($list as $row) {
            $expertIds[] = $row['expert_id'];
        }

        $fieldsByExpert = [];
        if (!empty($expertIds)) {
            $fieldRows = Db::name('expert_field')
                ->where('expert_id', 'in', $expertIds)
                ->where('state', 0)
                ->field('expert_id,field')
                ->select();
            foreach ($fieldRows as $fieldRow) {
                $fieldsByExpert[$fieldRow['expert_id']][] = $fieldRow['field'];
            }
        }

        foreach ($list as $index => $row) {
            $row['fields'] = $fieldsByExpert[$row['expert_id']] ?? [];
            $list[$index]  = $row;
        }

        $fetching = false;
        if ($field !== '' && $minExperts > 0 && $total < $minExperts) {
            $this->triggerBackgroundFetch($field);
            $fetching = true;
        }

        return jsonSuccess([
            'list'        => $list,
            'total'       => $total,
            'page'        => $page,
            'per_page'    => $perPage,
            'total_pages' => $total > 0 ? (int) ceil($total / $perPage) : 0,
            'fetching'    => $fetching,
        ]);
    }

    /**
     * Get all active (state = 0) field rows associated with an expert.
     *
     * Request params:
     *   expert_id - expert id, required
     */
    public function getExpertFields()
    {
        $expertId = intval($this->request->param('expert_id', 0));
        if (!$expertId) {
            return jsonError('expert_id is required');
        }

        $fields = Db::name('expert_field')
            ->where('expert_id', $expertId)
            ->where('state', 0)
            ->select();

        return jsonSuccess($fields);
    }

    /**
     * Update expert state.
     *
     * Request params:
     *   expert_id - single id or comma-separated ids
     *   state     - new state (0-5)
     */
    public function updateState()
    {
        $expertId = $this->request->param('expert_id', '');
        $state    = intval($this->request->param('state', 0));

        if (empty($expertId)) {
            return jsonError('expert_id is required');
        }
        if ($state < 0 || $state > 5) {
            return jsonError('state must be 0-5');
        }

        $ids = $this->parseIdList($expertId);
        if (empty($ids)) {
            return jsonError('expert_id is required');
        }

        $count = Db::name('expert')->where('expert_id', 'in', $ids)->update(['state' => $state]);

        return jsonSuccess(['updated' => $count]);
    }

    /**
     * Delete expert (soft: set state=5 blacklist, or hard delete).
     *
     * Request params:
     *   expert_id - single id or comma-separated ids
     *   hard      - 1 for hard delete, default 0 (blacklist)
     */
    public function deleteExpert()
    {
        $expertId = $this->request->param('expert_id', '');
        $hard     = intval($this->request->param('hard', 0));

        if (empty($expertId)) {
            return jsonError('expert_id is required');
        }

        $ids = $this->parseIdList($expertId);
        if (empty($ids)) {
            return jsonError('expert_id is required');
        }

        if ($hard) {
            $count = Db::name('expert')->where('expert_id', 'in', $ids)->delete();
        } else {
            $count = Db::name('expert')->where('expert_id', 'in', $ids)->update(['state' => 5]);
        }

        return jsonSuccess(['affected' => $count]);
    }

    /**
     * Export search results to Excel.
     *
     * Takes the same params as search() and exports the results of that page.
     * Reuses the search cache when present; otherwise runs the search and
     * caches it (without persisting experts to the database).
     */
    public function export()
    {
        $keyword = trim($this->request->param('keyword', ''));
        $page    = max(1, intval($this->request->param('page', 1)));
        $perPage = max(10, min(intval($this->request->param('per_page', 100)), 100));
        $minYear = intval($this->request->param('min_year', date('Y') - 3));
        $source  = $this->normalizeSource($this->request->param('source', 'pubmed'));

        if ($keyword === '') {
            return jsonError('keyword is required');
        }

        $cacheKey = $this->buildSearchCacheKey($keyword, $page, $perPage, $minYear, $source);
        $cached   = Cache::get($cacheKey);

        if (!$cached) {
            try {
                $cached = $this->searchBySource($keyword, $perPage, $minYear, $page, $source);
                Cache::set($cacheKey, $cached, 3600);
            } catch (\Exception $e) {
                return jsonError('Search failed: ' . $e->getMessage());
            }
        }

        if (empty($cached['experts'])) {
            return jsonError('No experts found to export');
        }

        return $this->generateExcel($cached['experts'], $keyword, $page);
    }

    /**
     * Clear the cached result of one search page.
     *
     * Request params (must match the search() call whose cache is being cleared):
     *   keyword, page, per_page, min_year, source
     *   max_results - legacy alias, used as per_page when per_page is absent
     */
    public function clearCache()
    {
        $keyword = trim($this->request->param('keyword', ''));
        $page    = max(1, intval($this->request->param('page', 1)));
        $perPage = max(10, min(intval(
            $this->request->param('per_page', $this->request->param('max_results', 100))
        ), 100));
        $minYear = intval($this->request->param('min_year', date('Y') - 3));
        $source  = $this->normalizeSource($this->request->param('source', 'pubmed'));

        // Use the same key builder as search()/export(); a differently built
        // key would never match what those methods stored.
        Cache::rm($this->buildSearchCacheKey($keyword, $page, $perPage, $minYear, $source));

        return jsonSuccess(['msg' => 'Cache cleared']);
    }

    /**
     * Build the cache key shared by search(), export() and clearCache().
     * Parts are joined with a delimiter so that different parameter
     * combinations cannot concatenate to the same string.
     */
    private function buildSearchCacheKey($keyword, $page, $perPage, $minYear, $source)
    {
        return 'expert_finder_' . md5(implode('|', [$keyword, $page, $perPage, $minYear, $source]));
    }

    /**
     * Map the raw "source" parameter to one of the two supported values.
     * Only the exact string "pmc" selects PMC; everything else is PubMed,
     * which is how the search dispatch treats it.
     */
    private function normalizeSource($source)
    {
        return $source === 'pmc' ? 'pmc' : 'pubmed';
    }

    /**
     * Run one page of search against the selected source.
     */
    private function searchBySource($keyword, $perPage, $minYear, $page, $source)
    {
        if ($source === 'pmc') {
            return $this->searchViaPMC($keyword, $perPage, $minYear, $page);
        }

        return $this->searchViaPubMed($keyword, $perPage, $minYear, $page);
    }

    /**
     * Turn a comma-separated id string into a list of unique positive ints.
     */
    private function parseIdList($raw)
    {
        $ids = array_map('intval', explode(',', (string) $raw));
        $ids = array_filter($ids, static function ($id) {
            return $id > 0;
        });

        return array_values(array_unique($ids));
    }

    // ==================== Cron / Auto Fetch ====================

    /**
     * Cron job: fetch experts for the given keywords.
     *
     * Request params:
     *   keywords - comma-separated keywords (e.g. "biomedical engineering,tissue engineering")
     *   source   - pubmed or pmc, default pubmed
     *   per_page - articles per page, default 100
     *   min_year - default current-3
     *
     * The fetch log (t_expert_fetch) remembers which page was last fetched per
     * keyword/source, so each run fetches the next page automatically. A
     * failure for one keyword is reported and does not stop the others.
     */
    public function cronFetch()
    {
        $keywordsStr = trim($this->request->param('keywords', ''));
        $source      = $this->normalizeSource($this->request->param('source', 'pubmed'));
        $perPage     = max(10, min(intval($this->request->param('per_page', 100)), 100));
        $minYear     = intval($this->request->param('min_year', date('Y') - 3));

        if ($keywordsStr === '') {
            return jsonError('keywords is required');
        }

        set_time_limit(0);

        $report = [];
        foreach (array_map('trim', explode(',', $keywordsStr)) as $kw) {
            if ($kw === '') {
                continue;
            }

            $fetchLog = $this->getFetchLog($kw, $source);
            $page     = intval($fetchLog['last_page']) + 1;

            try {
                $report[] = $this->fetchAndStorePage($kw, $source, $perPage, $minYear, $page);
            } catch (\Exception $e) {
                $report[] = [
                    'keyword' => $kw,
                    'page'    => $page,
                    'error'   => $e->getMessage(),
                ];
            }

            // Pause between keywords.
            sleep(2);
        }

        return jsonSuccess(['report' => $report]);
    }

    /**
     * Trigger a background fetch for a specific field via the queue.
     * A 300-second cache lock per field prevents pushing duplicate jobs.
     */
    private function triggerBackgroundFetch($field)
    {
        $lockKey = 'fetch_lock_' . md5($field);
        if (Cache::get($lockKey)) {
            return;
        }
        Cache::set($lockKey, 1, 300);

        \think\Queue::push('app\api\job\FetchExperts', [
            'field'    => $field,
            'source'   => 'pubmed',
            'per_page' => 100,
            'min_year' => date('Y') - 3,
        ], 'FetchExperts');
    }

    /**
     * Run a fetch of the next page for a single keyword.
     *
     * Unlike cronFetch(), exceptions from the search propagate to the caller.
     *
     * @return array Report with keys keyword, page, experts_found, saved_new,
     *               saved_exist, field_enriched, has_more.
     */
    public function doFetchForField($field, $source = 'pubmed', $perPage = 100, $minYear = null)
    {
        if ($minYear === null) {
            $minYear = date('Y') - 3;
        }
        $source = $this->normalizeSource($source);

        $fetchLog = $this->getFetchLog($field, $source);
        $page     = intval($fetchLog['last_page']) + 1;

        return $this->fetchAndStorePage($field, $source, $perPage, $minYear, $page);
    }

    /**
     * Fetch one page for a keyword, persist the experts, advance the fetch log
     * and return the per-keyword report. Shared by cronFetch() and
     * doFetchForField().
     */
    private function fetchAndStorePage($keyword, $source, $perPage, $minYear, $page)
    {
        $result     = $this->searchBySource($keyword, $perPage, $minYear, $page, $source);
        $saveResult = $this->saveExperts($result['experts'], $keyword, $source);

        // While more pages exist, remember the page just fetched; once the last
        // page is reached store 0 so the next run starts again from page 1.
        $lastPage   = $result['has_more'] ? $page : 0;
        $totalPages = isset($result['total_pages']) ? $result['total_pages'] : 0;
        $this->updateFetchLog($keyword, $source, $lastPage, $totalPages);

        return [
            'keyword'        => $keyword,
            'page'           => $page,
            'experts_found'  => $result['total'],
            'saved_new'      => $saveResult['inserted'],
            'saved_exist'    => $saveResult['existing'],
            'field_enriched' => $saveResult['field_enriched'],
            'has_more'       => $result['has_more'],
        ];
    }

    // ==================== PubMed Search ====================

    /**
     * Search PubMed and aggregate authors with an email into experts.
     * Articles are fetched in batches of 50 with a 0.4 s pause between batches.
     */
    private function searchViaPubMed($keyword, $perPage, $minYear, $page = 1)
    {
        return $this->searchAndAggregate('pubmed', $keyword, $perPage, $minYear, $page, 50, 400000);
    }

    // ==================== PMC Search ====================

    /**
     * Search PMC and aggregate authors with an email into experts.
     * Articles are fetched in batches of 5 with a 0.5 s pause between batches.
     */
    private function searchViaPMC($keyword, $perPage, $minYear, $page = 1)
    {
        return $this->searchAndAggregate('pmc', $keyword, $perPage, $minYear, $page, 5, 500000);
    }

    /**
     * Common search pipeline: esearch for ids, efetch them in batches, parse
     * the authors, aggregate by email and wrap everything in a paged result.
     *
     * @param string $db          "pubmed" or "pmc"
     * @param int    $batchSize   number of ids per efetch request
     * @param int    $pauseMicros pause after each batch, in microseconds
     */
    private function searchAndAggregate($db, $keyword, $perPage, $minYear, $page, $batchSize, $pauseMicros)
    {
        set_time_limit(600);

        $searchResult  = $this->esearch($db, $keyword, $perPage, $minYear, $page);
        $ids           = $searchResult['ids'];
        $totalArticles = $searchResult['total'];

        if (empty($ids)) {
            return $this->buildPagedResult([], 0, 0, $totalArticles, $page, $perPage, $db);
        }

        $allAuthors = [];
        foreach (array_chunk($ids, $batchSize) as $batch) {
            $xml = $this->efetchWithRetry($db, $batch);
            if ($xml) {
                $authors    = $db === 'pmc' ? $this->parsePMCXml($xml) : $this->parsePubMedXml($xml);
                $allAuthors = array_merge($allAuthors, $authors);
            }
            usleep($pauseMicros);
        }

        $experts = $this->aggregateExperts($allAuthors);

        return $this->buildPagedResult($experts, count($experts), count($ids), $totalArticles, $page, $perPage, $db);
    }

    // ==================== NCBI API Calls ====================

    /**
     * Call NCBI esearch for one page of article ids.
     *
     * @return array ['ids' => string[], 'total' => int]
     * @throws \RuntimeException when the response body is not valid JSON
     */
    private function esearch($db, $keyword, $perPage, $minYear, $page = 1)
    {
        $term     = $keyword . ' AND ' . $minYear . ':' . date('Y') . '[pdat]';
        $retstart = ($page - 1) * $perPage;

        $response = $this->httpClient->get($this->ncbiBaseUrl . 'esearch.fcgi', [
            'query' => [
                'db'       => $db,
                'term'     => $term,
                'retstart' => $retstart,
                'retmax'   => $perPage,
                'retmode'  => 'json',
                'sort'     => 'relevance',
            ],
        ]);

        $data = json_decode($response->getBody()->getContents(), true);
        if (!is_array($data)) {
            // An undecodable body is a failure, not "zero results".
            throw new \RuntimeException('Invalid esearch response: ' . json_last_error_msg());
        }

        return [
            'ids'   => $data['esearchresult']['idlist'] ?? [],
            'total' => intval($data['esearchresult']['count'] ?? 0),
        ];
    }

    /**
     * Call NCBI efetch for a list of ids and return the raw XML body.
     */
    private function efetch($db, $ids)
    {
        $response = $this->httpClient->post($this->ncbiBaseUrl . 'efetch.fcgi', [
            'form_params' => [
                'db'      => $db,
                'id'      => implode(',', $ids),
                'retmode' => 'xml',
            ],
        ]);

        return $response->getBody()->getContents();
    }

    /**
     * efetch with retries and back-off (2 s, 4 s, ...).
     *
     * When every attempt fails and the batch holds more than one id, the batch
     * is split in half and each half is retried (2 attempts) so that a single
     * bad id does not lose the whole batch.
     *
     * @return string|null XML, or null when nothing could be fetched
     */
    private function efetchWithRetry($db, $ids, $maxRetries = 3)
    {
        if ($maxRetries < 1) {
            return null;
        }

        for ($attempt = 1; $attempt <= $maxRetries; $attempt++) {
            try {
                return $this->efetch($db, $ids);
            } catch (\Exception $e) {
                if ($attempt < $maxRetries) {
                    sleep($attempt * 2);
                }
            }
        }

        // All attempts failed.
        if (count($ids) <= 1) {
            return null;
        }

        $half       = (int) ceil(count($ids) / 2);
        $firstHalf  = $this->efetchWithRetry($db, array_slice($ids, 0, $half), 2);
        $secondHalf = $this->efetchWithRetry($db, array_slice($ids, $half), 2);

        return $this->mergeXml($firstHalf, $secondHalf);
    }

    /**
     * Merge two efetch XML documents into one well-formed document.
     *
     * Plain concatenation yields two XML prologs and two root elements, which
     * simplexml_load_string() rejects, so the children of the second root are
     * imported into the first root instead. Assumes both documents share the
     * same root element type (both come from efetch on the same database).
     *
     * @return string|null
     */
    private function mergeXml($xml1, $xml2)
    {
        if (empty($xml1)) {
            return $xml2;
        }
        if (empty($xml2)) {
            return $xml1;
        }

        $previous = libxml_use_internal_errors(true);
        $first    = new \DOMDocument();
        $second   = new \DOMDocument();
        $firstOk  = $first->loadXML($xml1) && $first->documentElement !== null;
        $secondOk = $second->loadXML($xml2) && $second->documentElement !== null;
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        // If one side is unparseable keep the other rather than poisoning both.
        if (!$firstOk) {
            return $secondOk ? $xml2 : $xml1;
        }
        if (!$secondOk) {
            return $xml1;
        }

        foreach ($second->documentElement->childNodes as $child) {
            $first->documentElement->appendChild($first->importNode($child, true));
        }

        $merged = $first->saveXML();

        return $merged === false ? $xml1 : $merged;
    }

    // ==================== PubMed XML Parsing ====================

    /**
     * Extract author records from PubMed efetch XML.
     * Authors without a name or without an email in their affiliation text
     * are skipped.
     *
     * @return array[] records with keys name, email, affiliation,
     *                 article_title, article_id, journal
     */
    private function parsePubMedXml($xmlString)
    {
        $results = [];

        libxml_use_internal_errors(true);
        $xml = simplexml_load_string($xmlString);
        // Drop buffered parse errors so they do not accumulate across batches.
        libxml_clear_errors();
        if ($xml === false) {
            return $results;
        }

        foreach ($xml->PubmedArticle as $article) {
            $citation    = $article->MedlineCitation;
            $articleData = $citation->Article;

            if (!isset($articleData->AuthorList->Author)) {
                continue;
            }

            $title   = $this->xmlNodeToString($articleData->ArticleTitle);
            $pmid    = (string) $citation->PMID;
            $journal = isset($articleData->Journal->Title) ? (string) $articleData->Journal->Title : '';

            foreach ($articleData->AuthorList->Author as $author) {
                $lastName = (string) ($author->LastName ?? '');
                $foreName = (string) ($author->ForeName ?? '');
                $fullName = trim($foreName . ' ' . $lastName);
                if ($fullName === '') {
                    continue;
                }

                // First affiliation text wins; first email found in any of them wins.
                $email       = '';
                $affiliation = '';
                if (isset($author->AffiliationInfo)) {
                    foreach ($author->AffiliationInfo as $affInfo) {
                        $affText = (string) $affInfo->Affiliation;
                        if (empty($affiliation)) {
                            $affiliation = $affText;
                        }
                        if (empty($email)) {
                            $email = $this->extractEmailFromText($affText);
                        }
                    }
                }

                if (empty($email)) {
                    continue;
                }

                $results[] = [
                    'name'          => $fullName,
                    'email'         => strtolower($email),
                    'affiliation'   => $this->cleanAffiliation($affiliation),
                    'article_title' => $title,
                    'article_id'    => $pmid,
                    'journal'       => $journal,
                ];
            }
        }

        return $results;
    }

    // ==================== PMC XML Parsing ====================

    /**
     * Extract author records from PMC efetch XML (JATS-style markup).
     *
     * Email resolution order per author: the contrib's own <email>, then the
     * first email from <author-notes> when the author is marked corresponding,
     * then an email embedded in the affiliation text. Authors with no email
     * are skipped.
     *
     * @return array[] records with keys name, email, affiliation,
     *                 article_title, article_id, journal
     */
    private function parsePMCXml($xmlString)
    {
        $results = [];

        libxml_use_internal_errors(true);
        $xml = simplexml_load_string($xmlString);
        // Drop buffered parse errors so they do not accumulate across batches.
        libxml_clear_errors();
        if ($xml === false) {
            return $results;
        }

        $articles = $xml->article ?? $xml->children();

        foreach ($articles as $article) {
            if ($article->getName() !== 'article') {
                continue;
            }

            $front = $article->front;
            if (!$front) {
                continue;
            }

            $articleMeta = $front->{'article-meta'};
            if (!$articleMeta || !isset($articleMeta->{'contrib-group'})) {
                continue;
            }

            $title   = $this->xmlNodeToString($articleMeta->{'title-group'}->{'article-title'} ?? null);
            $pmcId   = $this->findPmcId($articleMeta);
            $journal = $this->findPmcJournalTitle($front);

            $correspEmails = [];
            if (isset($articleMeta->{'author-notes'})) {
                $this->extractEmailsFromNode($articleMeta->{'author-notes'}, $correspEmails);
            }

            $affiliationMap = $this->buildPmcAffiliationMap($articleMeta);

            // Walk every <contrib-group>, not only the first one.
            foreach ($articleMeta->{'contrib-group'} as $group) {
                foreach ($group->contrib as $contrib) {
                    $author = $this->parsePmcContrib($contrib, $affiliationMap, $correspEmails);
                    if ($author === null) {
                        continue;
                    }

                    $results[] = [
                        'name'          => $author['name'],
                        'email'         => strtolower($author['email']),
                        'affiliation'   => $this->cleanAffiliation($author['affiliation']),
                        'article_title' => $title,
                        'article_id'    => $pmcId,
                        'journal'       => $journal,
                    ];
                }
            }
        }

        return $results;
    }

    /**
     * Return the <article-id pub-id-type="pmc"> value (last one wins), or ''.
     */
    private function findPmcId($articleMeta)
    {
        $pmcId = '';
        if (isset($articleMeta->{'article-id'})) {
            foreach ($articleMeta->{'article-id'} as $idNode) {
                if ((string) $idNode['pub-id-type'] === 'pmc') {
                    $pmcId = (string) $idNode;
                }
            }
        }

        return $pmcId;
    }

    /**
     * Return the journal title from <journal-meta>, checking both the direct
     * <journal-title> and the <journal-title-group> layout.
     */
    private function findPmcJournalTitle($front)
    {
        $journalMeta = $front->{'journal-meta'};
        if (isset($journalMeta->{'journal-title'})) {
            return (string) $journalMeta->{'journal-title'};
        }
        if (isset($journalMeta->{'journal-title-group'}->{'journal-title'})) {
            return (string) $journalMeta->{'journal-title-group'}->{'journal-title'};
        }

        return '';
    }

    /**
     * Build an id => text map of <aff> elements found inside the contrib
     * groups and directly under <article-meta>. Later entries overwrite
     * earlier ones with the same id.
     */
    private function buildPmcAffiliationMap($articleMeta)
    {
        $map = [];

        foreach ($articleMeta->{'contrib-group'} as $group) {
            foreach ($group->aff as $aff) {
                $affId = (string) ($aff['id'] ?? '');
                if ($affId) {
                    $map[$affId] = $this->xmlNodeToString($aff);
                }
            }
        }

        if (isset($articleMeta->{'aff'})) {
            foreach ($articleMeta->{'aff'} as $aff) {
                $affId = (string) ($aff['id'] ?? '');
                if ($affId) {
                    $map[$affId] = $this->xmlNodeToString($aff);
                }
            }
        }

        return $map;
    }

    /**
     * Resolve name, email and affiliation for one <contrib> element.
     *
     * @return array|null ['name', 'email', 'affiliation'], or null when the
     *                    contrib is not an author, has no name, or no email
     */
    private function parsePmcContrib($contrib, $affiliationMap, $correspEmails)
    {
        if ((string) ($contrib['contrib-type'] ?? '') !== 'author') {
            return null;
        }

        $nameNode = $contrib->name;
        if (!$nameNode) {
            return null;
        }

        $surname    = (string) ($nameNode->surname ?? '');
        $givenNames = (string) ($nameNode->{'given-names'} ?? '');
        $fullName   = trim($givenNames . ' ' . $surname);
        if ($fullName === '') {
            return null;
        }

        $email = isset($contrib->email) ? strtolower(trim((string) $contrib->email)) : '';

        $affiliation     = '';
        $isCorresponding = (string) ($contrib['corresp'] ?? '') === 'yes';
        if (isset($contrib->xref)) {
            foreach ($contrib->xref as $xref) {
                $refType = (string) $xref['ref-type'];
                if ($refType === 'corresp') {
                    $isCorresponding = true;
                } elseif ($refType === 'aff' && $affiliation === '') {
                    // Use the first referenced affiliation that is known.
                    $rid = (string) $xref['rid'];
                    if (isset($affiliationMap[$rid])) {
                        $affiliation = $affiliationMap[$rid];
                    }
                }
            }
        }
        if (empty($affiliation) && isset($contrib->aff)) {
            $affiliation = $this->xmlNodeToString($contrib->aff);
        }

        if (empty($email) && $isCorresponding && !empty($correspEmails)) {
            $email = $correspEmails[0];
        }
        if (empty($email)) {
            $email = $this->extractEmailFromText($affiliation);
        }
        if (empty($email)) {
            return null;
        }

        return ['name' => $fullName, 'email' => $email, 'affiliation' => $affiliation];
    }

    // ==================== Pagination ====================

    /**
     * Wrap experts and counters into the paged result structure returned by
     * the search methods. total_pages is derived from the article count.
     */
    private function buildPagedResult($experts, $expertCount, $articlesScanned, $totalArticles, $page, $perPage, $source)
    {
        $totalPages = $totalArticles > 0 ? (int) ceil($totalArticles / $perPage) : 0;

        return [
            'experts'          => $experts,
            'total'            => $expertCount,
            'articles_scanned' => $articlesScanned,
            'total_articles'   => $totalArticles,
            'page'             => $page,
            'per_page'         => $perPage,
            'total_pages'      => $totalPages,
            'has_more'         => $page < $totalPages,
            'source'           => $source,
        ];
    }

    // ==================== Aggregation ====================

    /**
     * Group author records by lower-cased email.
     *
     * Each expert keeps the name/email of the first record seen, the first
     * non-empty affiliation, a paper_count, and at most 10 papers. The result
     * is sorted by paper_count descending.
     */
    private function aggregateExperts($authorRecords)
    {
        $byEmail = [];

        foreach ($authorRecords as $record) {
            $key = strtolower(trim($record['email']));
            if ($key === '') {
                continue;
            }

            if (!isset($byEmail[$key])) {
                $byEmail[$key] = [
                    'name'        => $record['name'],
                    'email'       => $record['email'],
                    'affiliation' => $record['affiliation'],
                    'paper_count' => 0,
                    'papers'      => [],
                ];
            }

            $byEmail[$key]['paper_count']++;

            if (count($byEmail[$key]['papers']) < 10) {
                $byEmail[$key]['papers'][] = [
                    'title'      => $record['article_title'],
                    'article_id' => $record['article_id'],
                    'journal'    => $record['journal'],
                ];
            }

            if (empty($byEmail[$key]['affiliation']) && !empty($record['affiliation'])) {
                $byEmail[$key]['affiliation'] = $record['affiliation'];
            }
        }

        $experts = array_values($byEmail);
        usort($experts, static function ($a, $b) {
            return $b['paper_count'] <=> $a['paper_count'];
        });

        return $experts;
    }

    // ==================== Excel Export ====================

    /**
     * Write the experts to an .xlsx file under public/exports and return its
     * URL, file name and row count.
     */
    private function generateExcel($experts, $keyword, $page = 1)
    {
        vendor("PHPExcel.PHPExcel");

        $objPHPExcel = new \PHPExcel();
        $sheet       = $objPHPExcel->getActiveSheet();
        $sheet->setTitle('Experts');

        $headers = [
            'A' => '#',
            'B' => 'Name',
            'C' => 'Email',
            'D' => 'Affiliation',
            'E' => 'Paper Count',
            'F' => 'Representative Papers',
        ];
        foreach ($headers as $col => $header) {
            $sheet->setCellValue($col . '1', $header);
        }

        $headerStyle = [
            'font'      => ['bold' => true, 'color' => ['rgb' => 'FFFFFF']],
            'fill'      => ['type' => \PHPExcel_Style_Fill::FILL_SOLID, 'startcolor' => ['rgb' => '4472C4']],
            'alignment' => ['horizontal' => \PHPExcel_Style_Alignment::HORIZONTAL_CENTER],
        ];
        $sheet->getStyle('A1:F1')->applyFromArray($headerStyle);

        // Text columns hold externally sourced data; write them as explicit
        // strings so a value starting with "=" is not evaluated as a formula.
        $asText = \PHPExcel_Cell_DataType::TYPE_STRING;
        $row    = 2;
        foreach ($experts as $expert) {
            $paperTitles = array_map(static function ($paper) {
                return $paper['title'];
            }, $expert['papers']);

            $sheet->setCellValue('A' . $row, $row - 1);
            $sheet->setCellValueExplicit('B' . $row, $expert['name'], $asText);
            $sheet->setCellValueExplicit('C' . $row, $expert['email'], $asText);
            $sheet->setCellValueExplicit('D' . $row, $expert['affiliation'], $asText);
            $sheet->setCellValue('E' . $row, $expert['paper_count']);
            $sheet->setCellValueExplicit('F' . $row, implode("\n", $paperTitles), $asText);
            $row++;
        }

        $widths = ['A' => 6, 'B' => 25, 'C' => 35, 'D' => 50, 'E' => 12, 'F' => 60];
        foreach ($widths as $col => $width) {
            $sheet->getColumnDimension($col)->setWidth($width);
        }

        $filename = 'experts_' . preg_replace('/[^a-zA-Z0-9]/', '_', $keyword) . '_p' . $page . '_' . date('Ymd_His') . '.xlsx';
        $dir      = ROOT_PATH . 'public' . DS . 'exports';
        $filepath = $dir . DS . $filename;

        // Re-check is_dir() after a failed mkdir() in case another request created it.
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            return jsonError('Unable to create export directory');
        }

        $writer = \PHPExcel_IOFactory::createWriter($objPHPExcel, 'Excel2007');
        $writer->save($filepath);

        return jsonSuccess([
            'file_url'  => '/exports/' . $filename,
            'file_name' => $filename,
            'count'     => count($experts),
        ]);
    }

    // ==================== Database Storage ====================

    /**
     * Persist experts keyed by email and link each to the searched field.
     *
     * Existing experts only get the field link added. A failed insert is
     * counted as "existing" (best effort) so one bad row does not abort the
     * rest of the batch.
     *
     * @return array ['inserted' => int, 'existing' => int, 'field_enriched' => int]
     */
    private function saveExperts($experts, $field, $source)
    {
        $inserted    = 0;
        $existing    = 0;
        $fieldEnrich = 0;

        foreach ($experts as $expert) {
            // Truncate before the lookup so it compares against the value that
            // is actually stored.
            $email = mb_substr(strtolower(trim($expert['email'])), 0, 128);
            if ($email === '') {
                continue;
            }

            $exists = Db::name('expert')->where('email', $email)->find();

            if ($exists) {
                $existing++;
                $fieldEnrich += $this->enrichExpertField($exists['expert_id'], $field);
                continue;
            }

            $insert = [
                'name'        => mb_substr($expert['name'], 0, 255),
                'email'       => $email,
                'affiliation' => mb_substr($expert['affiliation'], 0, 128),
                'source'      => mb_substr($source, 0, 128),
                'ctime'       => time(),
                'ltime'       => 0,
                'state'       => 0,
            ];

            try {
                $expertId = Db::name('expert')->insertGetId($insert);
                $this->enrichExpertField($expertId, $field);
                $inserted++;
            } catch (\Exception $e) {
                // Deliberate best effort: treat the failure as "already there".
                $existing++;
            }
        }

        return ['inserted' => $inserted, 'existing' => $existing, 'field_enriched' => $fieldEnrich];
    }

    // ==================== Fetch Log (t_expert_fetch) ====================

    /**
     * Read the fetch log row for a field/source pair, or zeroed defaults when
     * none exists yet.
     */
    private function getFetchLog($field, $source)
    {
        $log = Db::name('expert_fetch')
            ->where('field', mb_substr($field, 0, 128))
            ->where('source', mb_substr($source, 0, 128))
            ->find();

        if (!$log) {
            return ['last_page' => 0, 'total_pages' => 0, 'last_time' => 0];
        }

        return $log;
    }

    /**
     * Insert or update the fetch log row for a field/source pair.
     */
    private function updateFetchLog($field, $source, $lastPage, $totalPages)
    {
        // Look up with the same truncated values that get inserted; otherwise
        // an over-long field never matches and a new row is added every run.
        $field  = mb_substr($field, 0, 128);
        $source = mb_substr($source, 0, 128);

        $progress = [
            'last_page'   => $lastPage,
            'total_pages' => $totalPages,
            'last_time'   => time(),
        ];

        $exists = Db::name('expert_fetch')
            ->where('field', $field)
            ->where('source', $source)
            ->find();

        if ($exists) {
            Db::name('expert_fetch')
                ->where('expert_fetch_id', $exists['expert_fetch_id'])
                ->update($progress);
            return;
        }

        Db::name('expert_fetch')->insert(['field' => $field, 'source' => $source] + $progress);
    }

    /**
     * Link an expert to a field unless an active link already exists.
     *
     * @return int 1 when a row was inserted, 0 otherwise
     */
    private function enrichExpertField($expertId, $field)
    {
        // Truncate before the lookup so it matches what is stored.
        $field = mb_substr(trim($field), 0, 128);
        if ($field === '') {
            return 0;
        }

        $exists = Db::name('expert_field')
            ->where('expert_id', $expertId)
            ->where('field', $field)
            ->where('state', 0)
            ->find();

        if ($exists) {
            return 0;
        }

        Db::name('expert_field')->insert([
            'expert_id' => $expertId,
            'major_id'  => 0,
            'field'     => $field,
            'state'     => 0,
        ]);

        return 1;
    }

    // ==================== Helper Methods ====================

    /**
     * Return the first email found in free text, lower-cased, or ''.
     * Explicit "Electronic address:" / "E-mail:" labels are tried before a
     * generic email pattern.
     */
    private function extractEmailFromText($text)
    {
        if (empty($text)) {
            return '';
        }

        $patterns = [
            '/[Ee]lectronic address:\s*([^\s;,]+@[^\s;,]+)/',
            '/[Ee]-?mail:\s*([^\s;,]+@[^\s;,]+)/',
            '/\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b/',
        ];

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $text, $m)) {
                // Strip a sentence-ending dot captured with the address.
                return strtolower(trim($m[1], '.'));
            }
        }

        return '';
    }

    /**
     * Recursively collect unique lower-cased emails from <email> elements and
     * from the text of a node into $emails (passed by reference).
     */
    private function extractEmailsFromNode($node, &$emails)
    {
        if ($node === null) {
            return;
        }

        foreach ($node->children() as $child) {
            if ($child->getName() === 'email') {
                $email = strtolower(trim((string) $child));
                if ($email !== '' && !in_array($email, $emails, true)) {
                    $emails[] = $email;
                }
            }
            $this->extractEmailsFromNode($child, $emails);
        }

        $text = (string) $node;
        if (preg_match_all('/\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b/', $text, $matches)) {
            foreach ($matches[1] as $email) {
                $email = strtolower(trim($email, '.'));
                if (!in_array($email, $emails, true)) {
                    $emails[] = $email;
                }
            }
        }
    }

    /**
     * Remove embedded email addresses (labelled or bare) from affiliation
     * text and trim surrounding whitespace and punctuation.
     */
    private function cleanAffiliation($text)
    {
        // The patterns are applied in order, each to the result of the previous one.
        $text = preg_replace([
            '/\s*[Ee]lectronic address:\s*[^\s;,]+@[^\s;,]+/',
            '/\s*[Ee]-?mail:\s*[^\s;,]+@[^\s;,]+/',
            '/\s*\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b/',
        ], '', $text);

        return trim($text, " \t\n\r\0\x0B.,;");
    }

    /**
     * Flatten an XML node to plain text: markup stripped, entities decoded,
     * whitespace collapsed. Returns '' for null or unserialisable nodes.
     */
    private function xmlNodeToString($node)
    {
        if ($node === null) {
            return '';
        }

        $xml = $node->asXML();
        if (!is_string($xml)) {
            return '';
        }

        $text = html_entity_decode(strip_tags($xml), ENT_QUOTES | ENT_XML1, 'UTF-8');

        return trim(preg_replace('/\s+/', ' ', $text));
    }
}