diff --git a/application/api/controller/Agent.php b/application/api/controller/Agent.php index 768ed56..cf499e2 100644 --- a/application/api/controller/Agent.php +++ b/application/api/controller/Agent.php @@ -182,6 +182,9 @@ class Agent extends Base if (!isset($data['user_id']) || $data['user_id'] == '') { return jsonError('user_id不能为空'); } + if (!isset($data['field']) || $data['field'] == '') { + return jsonError('field不能为空'); + } $userId = intval($data['user_id']); $reviewerInfo = $this->user_reviewer_info_obj @@ -193,10 +196,10 @@ class Agent extends Base return jsonError('未找到该用户的reviewer信息'); } - $field = trim($reviewerInfo['field']); - if ($field == '') { - return jsonError('该用户的field字段为空'); - } + $field = trim($data['field']); +// if ($field == '') { +// return jsonError('该用户的field字段为空'); +// } $majorTree = $this->getMajorTree(); if (empty($majorTree)) { @@ -223,22 +226,24 @@ class Agent extends Base $validMajors = $this->major_obj ->where('major_id', 'in', $matchedIds) ->where('major_state', 0) - ->select(); -// ->column('major_id'); -// $matchedIds = array_intersect($matchedIds, $validMajors); - + ->column('major_id'); + $existing = $this->major_to_user_obj + ->where('user_id', $userId) + ->where('state', 0) + ->column('major_id'); + $unionArray = array_unique(array_merge($validMajors, $existing)); + $ms = $this->major_obj->where('major_id', 'in', $unionArray)->where('major_state', 0)->select(); // $inserted = $this->saveMajorToUser($userId, $matchedIds); - foreach ($validMajors as $k => $major){ - $validMajors[$k]['shu'] = getMajorShu($major['major_id']); + foreach ($ms as $k => $major){ + $ms[$k]['shu'] = getMajorShu($major['major_id']); + $ms[$k]['str'] = getMajorStr($major['major_id']); } - - - + return jsonSuccess([ 'user_id' => $userId, 'field' => $field, - 'majors' => $validMajors, + 'majors' => $ms, // 'inserted' => $inserted, ]); } @@ -377,6 +382,1062 @@ class Agent extends Base ]); } + // ========== CrossRef DOI 查询 & 撤稿检测 ========== + + /** + * 清洗 
DOI,去掉前缀 + */ + private function cleanDoi($doi) + { + $doi = trim($doi); + $doi = preg_replace('/^https?:\/\/doi\.org\//', '', $doi); + $doi = preg_replace('/^doi:\s*/i', '', $doi); + return trim($doi); + } + + /** + * 请求 CrossRef API 获取 DOI 的原始 message 数据 + */ + private function fetchCrossRefData($doi) + { + $url = 'https://api.crossref.org/works/' . urlencode($doi); + + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); + curl_setopt($ch, CURLOPT_TIMEOUT, 30); + curl_setopt($ch, CURLOPT_HTTPHEADER, [ + 'User-Agent: TMRJournals/1.0 (mailto:publisher@tmrjournals.com)', + 'Accept: application/json', + ]); + + $result = curl_exec($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + + if (curl_errno($ch)) { + $error = curl_error($ch); + curl_close($ch); + return ['success' => false, 'error' => 'CURL错误: ' . $error]; + } + curl_close($ch); + + if ($httpCode == 404) { + return ['success' => false, 'error' => 'DOI在CrossRef中未找到']; + } + if ($httpCode != 200) { + return ['success' => false, 'error' => 'CrossRef返回HTTP ' . $httpCode]; + } + + $data = json_decode($result, true); + if (!isset($data['message'])) { + return ['success' => false, 'error' => 'CrossRef返回数据格式异常']; + } + + return ['success' => true, 'message' => $data['message']]; + } + + /** + * 从 CrossRef date-parts 中提取日期字符串 + */ + private function parseDateParts($dateObj) + { + if (!isset($dateObj['date-parts'][0])) { + return ''; + } + $parts = $dateObj['date-parts'][0]; + $y = isset($parts[0]) ? $parts[0] : ''; + $m = isset($parts[1]) ? sprintf('%02d', $parts[1]) : ''; + $d = isset($parts[2]) ? 
sprintf('%02d', $parts[2]) : ''; + if ($y && $m && $d) { + return "{$y}-{$m}-{$d}"; + } + if ($y && $m) { + return "{$y}-{$m}"; + } + return (string)$y; + } + + /** + * 解析作者列表 + */ + private function parseAuthors($authorList) + { + if (empty($authorList) || !is_array($authorList)) { + return []; + } + $result = []; + foreach ($authorList as $a) { + $author = [ + 'given' => $a['given'] ?? '', + 'family' => $a['family'] ?? '', + 'name' => isset($a['name']) ? $a['name'] : ((isset($a['given']) ? $a['given'] . ' ' : '') . ($a['family'] ?? '')), + 'ORCID' => $a['ORCID'] ?? '', + 'sequence' => $a['sequence'] ?? '', + 'affiliation' => [], + ]; + if (isset($a['affiliation']) && is_array($a['affiliation'])) { + foreach ($a['affiliation'] as $aff) { + $author['affiliation'][] = $aff['name'] ?? ''; + } + } + $result[] = $author; + } + return $result; + } + + /** + * 检测撤稿状态 + */ + private function detectRetraction($message) + { + $isRetracted = false; + $retractionDetail = []; + + // 1. update-to 字段 + if (isset($message['update-to']) && is_array($message['update-to'])) { + foreach ($message['update-to'] as $update) { + $updateType = strtolower($update['type'] ?? ''); + $updateLabel = strtolower($update['label'] ?? ''); + if (strpos($updateType, 'retract') !== false || strpos($updateLabel, 'retract') !== false) { + $isRetracted = true; + $retractionDetail['retraction_notice'] = [ + 'type' => $update['type'] ?? '', + 'label' => $update['label'] ?? '', + 'DOI' => $update['DOI'] ?? '', + 'date' => isset($update['updated']) ? $this->parseDateParts($update['updated']) : '', + ]; + break; + } + } + } + + // 2. type/subtype + $type = strtolower($message['type'] ?? ''); + $subtype = strtolower($message['subtype'] ?? ''); + if (strpos($type, 'retract') !== false || strpos($subtype, 'retract') !== false) { + $isRetracted = true; + $retractionDetail['is_retraction_notice'] = true; + } + + // 3. 
relation + if (isset($message['relation']) && is_array($message['relation'])) { + foreach ($message['relation'] as $relType => $relations) { + if (strpos(strtolower($relType), 'retract') !== false) { + $isRetracted = true; + $retractionDetail['relation'] = [$relType => $relations]; + break; + } + } + } + + // 4. title 关键词 + $titles = $message['title'] ?? []; + foreach ($titles as $title) { + $lower = strtolower($title); + if (strpos($lower, 'retraction') !== false || strpos($lower, 'retracted') !== false + || strpos($lower, 'withdrawal') !== false || strpos($lower, 'withdrawn') !== false) { + $isRetracted = true; + $retractionDetail['title_keyword'] = $title; + break; + } + } + + return ['is_retracted' => $isRetracted, 'retraction_detail' => $retractionDetail]; + } + + /** + * 解析 CrossRef message 为结构化数据 + */ + private function parseCrossRefMessage($doi, $message) + { + // 基础信息 + $info = [ + 'doi' => $doi, + 'url' => $message['URL'] ?? ('https://doi.org/' . $doi), + 'type' => $message['type'] ?? '', + 'title' => isset($message['title'][0]) ? $message['title'][0] : '', + ]; + + // 作者 + $info['authors'] = $this->parseAuthors($message['author'] ?? []); + $info['author_string'] = implode(', ', array_column($info['authors'], 'name')); + + // 期刊/来源 + $info['journal'] = [ + 'title' => isset($message['container-title'][0]) ? $message['container-title'][0] : '', + 'short_title'=> isset($message['short-container-title'][0]) ? $message['short-container-title'][0] : '', + 'ISSN' => $message['ISSN'] ?? [], + 'publisher' => $message['publisher'] ?? '', + ]; + + // 卷/期/页码 + $info['volume'] = $message['volume'] ?? ''; + $info['issue'] = $message['issue'] ?? ''; + $info['page'] = $message['page'] ?? ''; + $info['article_number'] = $message['article-number'] ?? ''; + + // 日期 + $info['dates'] = [ + 'published_print' => isset($message['published-print']) ? $this->parseDateParts($message['published-print']) : '', + 'published_online' => isset($message['published-online']) ? 
$this->parseDateParts($message['published-online']) : '', + 'published' => isset($message['published']) ? $this->parseDateParts($message['published']) : '', + 'created' => isset($message['created']) ? $this->parseDateParts($message['created']) : '', + 'deposited' => isset($message['deposited']) ? $this->parseDateParts($message['deposited']) : '', + 'indexed' => isset($message['indexed']) ? $this->parseDateParts($message['indexed']) : '', + ]; + $info['year'] = ''; + foreach (['published-print', 'published-online', 'published', 'created'] as $dk) { + if (isset($message[$dk]['date-parts'][0][0]) && $message[$dk]['date-parts'][0][0]) { + $info['year'] = (string)$message[$dk]['date-parts'][0][0]; + break; + } + } + + // 摘要 + $info['abstract'] = $message['abstract'] ?? ''; + + // 学科/主题 + $info['subject'] = $message['subject'] ?? []; + + // 引用统计 + $info['references_count'] = $message['references-count'] ?? 0; + $info['is_referenced_by_count'] = $message['is-referenced-by-count'] ?? 0; + + // 资助信息 + $funders = []; + if (isset($message['funder']) && is_array($message['funder'])) { + foreach ($message['funder'] as $f) { + $funders[] = [ + 'name' => $f['name'] ?? '', + 'DOI' => $f['DOI'] ?? '', + 'award' => $f['award'] ?? [], + ]; + } + } + $info['funders'] = $funders; + + // 许可证 + $licenses = []; + if (isset($message['license']) && is_array($message['license'])) { + foreach ($message['license'] as $lic) { + $licenses[] = [ + 'URL' => $lic['URL'] ?? '', + 'start_date' => isset($lic['start']) ? $this->parseDateParts($lic['start']) : '', + ]; + } + } + $info['licenses'] = $licenses; + + // 撤稿检测 + $retraction = $this->detectRetraction($message); + $info['is_retracted'] = $retraction['is_retracted']; + $info['retraction_detail'] = $retraction['retraction_detail']; + + // update-to(勘误/更正/撤稿通知 等所有更新关系) + $updates = []; + if (isset($message['update-to']) && is_array($message['update-to'])) { + foreach ($message['update-to'] as $up) { + $updates[] = [ + 'type' => $up['type'] ?? 
'', + 'label' => $up['label'] ?? '', + 'DOI' => $up['DOI'] ?? '', + 'date' => isset($up['updated']) ? $this->parseDateParts($up['updated']) : '', + ]; + } + } + $info['updates'] = $updates; + + // 关联关系 relation + $info['relation'] = $message['relation'] ?? []; + + // 分数/评分 + $info['score'] = $message['score'] ?? 0; + + return $info; + } + + /** + * 查询单个 DOI,返回完整的结构化元数据 + * + * @param string doi 文章DOI + */ + public function queryDoi() + { + $data = $this->request->param(); + if (!isset($data['doi']) || trim($data['doi']) == '') { + return jsonError('doi不能为空'); + } + + $doi = $this->cleanDoi($data['doi']); + $res = $this->fetchCrossRefData($doi); + if (!$res['success']) { + return jsonError($res['error']); + } + + $parsed = $this->parseCrossRefMessage($doi, $res['message']); + return jsonSuccess($parsed); + } + + /** + * 批量查询多个 DOI 的完整元数据 + * + * @param string dois 逗号分隔的DOI列表 + */ + public function batchQueryDois() + { + $data = $this->request->param(); + if (!isset($data['dois']) || trim($data['dois']) == '') { + return jsonError('dois不能为空'); + } + + $doiList = array_filter(array_map('trim', explode(',', $data['dois']))); + if (empty($doiList)) { + return jsonError('未提供有效的DOI'); + } + if (count($doiList) > 50) { + return jsonError('单次最多查询50个DOI'); + } + + $results = []; + $retractedCount = 0; + + foreach ($doiList as $rawDoi) { + $doi = $this->cleanDoi($rawDoi); + $res = $this->fetchCrossRefData($doi); + if (!$res['success']) { + $results[] = ['doi' => $doi, 'success' => false, 'error' => $res['error']]; + } else { + $parsed = $this->parseCrossRefMessage($doi, $res['message']); + $parsed['success'] = true; + if ($parsed['is_retracted']) { + $retractedCount++; + } + $results[] = $parsed; + } + usleep(200000); + } + + return jsonSuccess([ + 'total' => count($results), + 'retracted_count' => $retractedCount, + 'list' => $results, + ]); + } + + /** + * 检查一篇文章的所有参考文献(返回每条引用的完整 CrossRef 元数据 + 撤稿标记) + * + * @param int p_article_id 生产文章ID + */ + public function 
checkArticleReferences() + { + $data = $this->request->param(); + if (!isset($data['p_article_id']) || $data['p_article_id'] == '') { + return jsonError('p_article_id不能为空'); + } + + $pArticleId = intval($data['p_article_id']); + $refers = $this->production_article_refer_obj + ->where('p_article_id', $pArticleId) + ->where('state', 0) + ->where('refer_doi', '<>', '') + ->select(); + + if (empty($refers)) { + return jsonSuccess([ + 'p_article_id' => $pArticleId, + 'total_checked' => 0, + 'retracted_count' => 0, + 'list' => [], + ]); + } + + $list = []; + $retractedCount = 0; + $errorCount = 0; + + foreach ($refers as $refer) { + $doi = $this->cleanDoi($refer['refer_doi']); + if ($doi == '') { + continue; + } + + $item = [ + 'p_refer_id' => $refer['p_refer_id'], + 'index' => $refer['index'], + 'refer_doi' => $doi, + 'refer_content' => $refer['refer_content'] ?? '', + ]; + + $res = $this->fetchCrossRefData($doi); + if (!$res['success']) { + $item['crossref_success'] = false; + $item['crossref_error'] = $res['error']; + $errorCount++; + } else { + $parsed = $this->parseCrossRefMessage($doi, $res['message']); + $item['crossref_success'] = true; + $item['crossref'] = $parsed; + if ($parsed['is_retracted']) { + $retractedCount++; + } + } + + $list[] = $item; + usleep(200000); + } + + return jsonSuccess([ + 'p_article_id' => $pArticleId, + 'total_checked' => count($list), + 'retracted_count' => $retractedCount, + 'error_count' => $errorCount, + 'list' => $list, + ]); + } + + /** + * 通过 article_id 检查参考文献(完整元数据 + 撤稿检测) + * + * @param int article_id 文章ID + */ + public function checkReferencesByArticleId() + { + $data = $this->request->param(); + if (!isset($data['article_id']) || $data['article_id'] == '') { + return jsonError('article_id不能为空'); + } + + $articleId = intval($data['article_id']); + $pInfo = $this->production_article_obj + ->where('article_id', $articleId) + ->where('state', 0) + ->find(); + + if (!$pInfo) { + return jsonError('未找到该文章的生产信息'); + } + + $refers = 
$this->production_article_refer_obj + ->where('p_article_id', $pInfo['p_article_id']) + ->where('state', 0) + ->where('refer_doi', '<>', '') + ->select(); + + if (empty($refers)) { + return jsonSuccess([ + 'article_id' => $articleId, + 'p_article_id' => $pInfo['p_article_id'], + 'total_checked' => 0, + 'retracted_count' => 0, + 'list' => [], + ]); + } + + $list = []; + $retractedCount = 0; + + foreach ($refers as $refer) { + $doi = $this->cleanDoi($refer['refer_doi']); + if ($doi == '') { + continue; + } + + $item = [ + 'p_refer_id' => $refer['p_refer_id'], + 'index' => $refer['index'], + 'refer_doi' => $doi, + 'refer_content' => $refer['refer_content'] ?? '', + ]; + + $res = $this->fetchCrossRefData($doi); + if (!$res['success']) { + $item['crossref_success'] = false; + $item['crossref_error'] = $res['error']; + } else { + $parsed = $this->parseCrossRefMessage($doi, $res['message']); + $item['crossref_success'] = true; + $item['crossref'] = $parsed; + if ($parsed['is_retracted']) { + $retractedCount++; + } + } + + $list[] = $item; + usleep(200000); + } + + return jsonSuccess([ + 'article_id' => $articleId, + 'p_article_id' => $pInfo['p_article_id'], + 'total_checked' => count($list), + 'retracted_count' => $retractedCount, + 'list' => $list, + ]); + } + + // ========== DOI 网页抓取通讯作者邮箱 ========== + + /** + * 通过 DOI 跳转获取出版商页面 HTML + */ + private function fetchPageByDoi($doi) + { + $url = 'https://doi.org/' . 
$doi; + + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($ch, CURLOPT_MAXREDIRS, 10); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); + curl_setopt($ch, CURLOPT_TIMEOUT, 30); + curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); + curl_setopt($ch, CURLOPT_HTTPHEADER, [ + 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language: en-US,en;q=0.5', + ]); + + $html = curl_exec($ch); + $finalUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + + if (curl_errno($ch)) { + $error = curl_error($ch); + curl_close($ch); + return ['success' => false, 'error' => 'CURL错误: ' . $error]; + } + curl_close($ch); + + if ($httpCode != 200) { + return ['success' => false, 'error' => 'HTTP ' . $httpCode]; + } + + return ['success' => true, 'html' => $html, 'final_url' => $finalUrl]; + } + + /** + * 根据最终 URL 判断出版商 + */ + private function detectPublisher($url) + { + $host = strtolower(parse_url($url, PHP_URL_HOST) ?: ''); + + $map = [ + 'mdpi.com' => 'mdpi', + 'springer.com' => 'springer', + 'springerlink.com' => 'springer', + 'nature.com' => 'springer', + 'biomedcentral.com' => 'springer', + 'sciencedirect.com' => 'elsevier', + 'elsevier.com' => 'elsevier', + 'wiley.com' => 'wiley', + 'onlinelibrary.wiley'=> 'wiley', + 'frontiersin.org' => 'frontiers', + 'tandfonline.com' => 'taylor_francis', + 'sagepub.com' => 'sage', + 'oup.com' => 'oxford', + 'plos.org' => 'plos', + 'hindawi.com' => 'hindawi', + 'cell.com' => 'cell', + 'jci.org' => 'jci', + 'asm.org' => 'asm', + 'iucr.org' => 'iucr', + 'rsc.org' => 'rsc', + 'acs.org' => 'acs', + ]; + + foreach ($map as $domain => $publisher) { + if (strpos($host, $domain) !== false) { + return $publisher; + } + } + return 'unknown'; + } + + 
/** + * 从 HTML 中提取所有有效邮箱(过滤系统邮箱) + */ + private function extractEmails($html) + { + $all = []; + if (preg_match_all('/[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/', $html, $m)) { + $all = array_unique($m[0]); + } + + $systemKeywords = [ + 'noreply', 'no-reply', 'support', 'info@', 'admin@', 'webmaster', + 'editor@', 'editorial@', 'help@', 'contact@', 'privacy@', 'service@', + 'marketing@', 'feedback@', 'copyright@', 'permissions@', + 'mdpi.com', 'springer.com', 'elsevier.com', 'wiley.com', + 'frontiersin.org', 'nature.com', 'oup.com', 'sagepub.com', + 'tandfonline.com', 'plos.org', 'hindawi.com', 'biomedcentral.com', + 'crossref.org', 'doi.org', 'example.com', 'sentry.io', + 'acs.org', 'rsc.org', + ]; + + $filtered = []; + foreach ($all as $email) { + $lower = strtolower($email); + $skip = false; + foreach ($systemKeywords as $kw) { + if (strpos($lower, $kw) !== false) { + $skip = true; + break; + } + } + if (strpos($lower, '.png') !== false || strpos($lower, '.jpg') !== false + || strpos($lower, '.gif') !== false || strpos($lower, '.css') !== false + || strpos($lower, '.js') !== false) { + $skip = true; + } + if (!$skip) { + $filtered[] = $email; + } + } + + return $filtered; + } + + /** + * MDPI 页面解析 + */ + private function parseMdpiEmail($html) + { + $result = ['corresponding_authors' => [], 'all_emails' => []]; + $result['all_emails'] = $this->extractEmails($html); + + // MDPI: 通讯作者标 *,邮箱用 "Author to whom correspondence should be addressed" + // 找对应作者名:带 * 的 或文本 + $corrNames = []; + if (preg_match_all('/]*>\s*([^<]+?)\s*<\/span>\s*\*/', $html, $m)) { + $corrNames = array_map('trim', $m[1]); + } + if (empty($corrNames) && preg_match_all('/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\s*\*/', $html, $m)) { + $corrNames = array_map('trim', $m[1]); + } + + // 找 mailto 链接(通常就是通讯作者邮箱) + $mailtoEmails = []; + if (preg_match_all('/href=["\']mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})["\']/', $html, $m)) { + $mailtoEmails = array_values(array_unique($m[1])); + } + 
$mailtoEmails = array_values(array_filter($mailtoEmails, function ($e) { + return stripos($e, 'mdpi') === false; + })); + + foreach ($corrNames as $i => $name) { + $entry = ['name' => $name, 'email' => '']; + if (isset($mailtoEmails[$i])) { + $entry['email'] = $mailtoEmails[$i]; + } + $result['corresponding_authors'][] = $entry; + } + + if (empty($result['corresponding_authors']) && !empty($mailtoEmails)) { + foreach ($mailtoEmails as $email) { + $result['corresponding_authors'][] = ['name' => '', 'email' => $email]; + } + } + + return $result; + } + + /** + * Springer / Nature / BMC 页面解析 + */ + private function parseSpringerEmail($html) + { + $result = ['corresponding_authors' => [], 'all_emails' => []]; + $result['all_emails'] = $this->extractEmails($html); + + // Springer: + if (preg_match_all('/data-test=["\']author-letter["\'][^>]*href=["\']mailto:([^"\']+)["\']/', $html, $m)) { + foreach ($m[1] as $email) { + $result['corresponding_authors'][] = ['name' => '', 'email' => $email]; + } + } + + // 或者 "Correspondence to" / "Corresponding author" 区域 + if (empty($result['corresponding_authors'])) { + if (preg_match('/[Cc]orrespond(?:ence|ing\s+author)[^<]{0,50}<[^>]*>([^<]*<[^>]*>)*?[^<]*?href=["\']mailto:([^"\']+)["\']/', $html, $m)) { + $result['corresponding_authors'][] = ['name' => '', 'email' => $m[2]]; + } + } + + if (empty($result['corresponding_authors'])) { + $patterns = [ + '/[Cc]orrespond[a-z]*[^<]{0,300}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', + ]; + foreach ($patterns as $p) { + if (preg_match_all($p, $html, $m)) { + foreach ($m[1] as $email) { + $clean = $this->extractEmails($email); + if (!empty($clean)) { + $result['corresponding_authors'][] = ['name' => '', 'email' => $clean[0]]; + } elseif (filter_var($email, FILTER_VALIDATE_EMAIL)) { + $result['corresponding_authors'][] = ['name' => '', 'email' => $email]; + } + } + break; + } + } + } + + // 尝试找名字 + if (!empty($result['corresponding_authors'])) { + if 
(preg_match('/[Cc]orrespond[a-z]*\s+(?:to|author)[:\s]*([A-Z][a-zA-Z\s.\-]+?)(?:\.|<|,)/', $html, $m)) { + $name = trim($m[1]); + if (strlen($name) > 2 && strlen($name) < 80) { + $result['corresponding_authors'][0]['name'] = $name; + } + } + } + + return $result; + } + + /** + * Frontiers 页面解析 + */ + private function parseFrontiersEmail($html) + { + $result = ['corresponding_authors' => [], 'all_emails' => []]; + $result['all_emails'] = $this->extractEmails($html); + + // Frontiers: "*Correspondence: Name, email@xxx.com" 或 "*Correspondence:" 后跟 mailto: + if (preg_match_all('/\*\s*[Cc]orrespondence:\s*(.*?)(?:'); + $emails = []; + if (preg_match_all('/([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $block, $em)) { + $emails = $em[1]; + } + $plainText = strip_tags($block); + $parts = preg_split('/[,;]/', $plainText); + $name = ''; + foreach ($parts as $part) { + $part = trim($part); + if ($part && strpos($part, '@') === false && preg_match('/[A-Z]/', $part)) { + $name = $part; + break; + } + } + foreach ($emails as $email) { + $result['corresponding_authors'][] = ['name' => $name, 'email' => $email]; + } + } + } + + if (empty($result['corresponding_authors'])) { + if (preg_match_all('/href=["\']mailto:([^"\']+)["\'][^>]*>([^<]*) $email) { + $label = trim(strip_tags($m[2][$i])); + if (stripos($email, 'frontiersin') === false) { + $result['corresponding_authors'][] = ['name' => $label ?: '', 'email' => $email]; + } + } + } + } + + return $result; + } + + /** + * Wiley 页面解析 + */ + private function parseWileyEmail($html) + { + $result = ['corresponding_authors' => [], 'all_emails' => []]; + $result['all_emails'] = $this->extractEmails($html); + + // Wiley: "Correspondence" 段落或 data-widget-def 中含有 mailto: + if (preg_match('/[Cc]orrespond[a-z]*[:\s].*?href=["\']mailto:([^"\']+)["\'].*?<\/p/s', $html, $m)) { + $result['corresponding_authors'][] = ['name' => '', 'email' => $m[1]]; + } + + if (empty($result['corresponding_authors'])) { + if 
(preg_match('/[Cc]orrespond[a-z]*[^<]{0,500}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $html, $m)) { + $result['corresponding_authors'][] = ['name' => '', 'email' => $m[1]]; + } + } + + if (!empty($result['corresponding_authors'])) { + if (preg_match('/[Cc]orrespond[a-z]*[:\s]*([A-Z][a-zA-Z.\-\s]{2,40}?),/', $html, $m)) { + $result['corresponding_authors'][0]['name'] = trim($m[1]); + } + } + + return $result; + } + + /** + * Elsevier / ScienceDirect 页面解析 + */ + private function parseElsevierEmail($html) + { + $result = ['corresponding_authors' => [], 'all_emails' => []]; + $result['all_emails'] = $this->extractEmails($html); + + // ScienceDirect: "Corresponding author" 按钮/区域 + mailto: + if (preg_match_all('/class="[^"]*corresponding[^"]*"[^>]*>.*?href=["\']mailto:([^"\']+)["\']/si', $html, $m)) { + foreach ($m[1] as $email) { + $result['corresponding_authors'][] = ['name' => '', 'email' => $email]; + } + } + + if (empty($result['corresponding_authors'])) { + if (preg_match_all('/data-[a-z\-]*=["\']corresponding[^"\']*["\'][^>]*>([^<]+)/i', $html, $m)) { + foreach ($m[1] as $name) { + $result['corresponding_authors'][] = ['name' => trim(strip_tags($name)), 'email' => '']; + } + } + } + + if (empty($result['corresponding_authors'])) { + if (preg_match('/[Cc]orrespond[a-z]*[^<]{0,300}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $html, $m)) { + $result['corresponding_authors'][] = ['name' => '', 'email' => $m[1]]; + } + } + + return $result; + } + + /** + * Taylor & Francis 页面解析 + */ + private function parseTaylorFrancisEmail($html) + { + $result = ['corresponding_authors' => [], 'all_emails' => []]; + $result['all_emails'] = $this->extractEmails($html); + + // T&F: or "CONTACT" section + if (preg_match('/class="[^"]*corresp[^"]*"[^>]*>(.*?)<\/span>/si', $html, $m)) { + $block = $m[1]; + if (preg_match('/([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $block, $em)) { + $name = trim(strip_tags(preg_replace('/[a-zA-Z0-9._%+\-]+@.*/', '', 
$block))); + $result['corresponding_authors'][] = ['name' => $name, 'email' => $em[1]]; + } + } + + if (empty($result['corresponding_authors'])) { + if (preg_match('/CONTACT\s+(.*?)([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/s', $html, $m)) { + $name = trim(strip_tags($m[1])); + $result['corresponding_authors'][] = ['name' => $name, 'email' => $m[2]]; + } + } + + return $result; + } + + /** + * PLOS 页面解析 + */ + private function parsePlosEmail($html) + { + $result = ['corresponding_authors' => [], 'all_emails' => []]; + $result['all_emails'] = $this->extractEmails($html); + + // PLOS: "* E-mail: xxx@yyy.com" + if (preg_match_all('/\*\s*E-?mail:\s*([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $html, $m)) { + foreach ($m[1] as $email) { + $result['corresponding_authors'][] = ['name' => '', 'email' => $email]; + } + } + + return $result; + } + + /** + * 通用邮箱提取(兜底方案) + */ + private function parseGenericEmail($html) + { + $result = ['corresponding_authors' => [], 'all_emails' => []]; + $result['all_emails'] = $this->extractEmails($html); + + // 策略1: 找 "Correspondence" / "Corresponding author" 附近的邮箱 + $corrPatterns = [ + '/[Cc]orrespond(?:ing\s+author|ence)[:\s]*(?:<[^>]*>)*\s*(?:<[^>]*>)*\s*([^<]*?)\s*(?:<[^>]*>)*\s*(?:href=["\'])?mailto:([^"\'>\s]+)/s', + '/[Cc]orrespond[a-z]*[^<]{0,500}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', + '/\*\s*(?:E-?mail|Correspondence)[:\s]*([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', + ]; + + foreach ($corrPatterns as $pattern) { + if (preg_match_all($pattern, $html, $m)) { + $lastGroup = end($m); + foreach ($lastGroup as $val) { + if (filter_var($val, FILTER_VALIDATE_EMAIL)) { + $alreadyExists = false; + foreach ($result['corresponding_authors'] as $existing) { + if ($existing['email'] === $val) { + $alreadyExists = true; + break; + } + } + if (!$alreadyExists) { + $result['corresponding_authors'][] = ['name' => '', 'email' => $val]; + } + } + } + } + if (!empty($result['corresponding_authors'])) { + 
break; + } + } + + // 策略2: 找所有 mailto 链接 + if (empty($result['corresponding_authors'])) { + $mailtoEmails = []; + if (preg_match_all('/href=["\']mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})["\']/', $html, $m)) { + $mailtoEmails = array_unique($m[1]); + } + + $filtered = array_values(array_filter($mailtoEmails, function ($email) { + $lower = strtolower($email); + $skip = ['noreply', 'support', 'info@', 'admin', 'editor', 'help@', 'contact@', + 'privacy', 'service', 'marketing', 'copyright', 'permission']; + foreach ($skip as $kw) { + if (strpos($lower, $kw) !== false) return false; + } + return true; + })); + + foreach ($filtered as $email) { + $result['corresponding_authors'][] = ['name' => '', 'email' => $email]; + } + } + + return $result; + } + + /** + * 主方法:根据 DOI 抓取出版商网页,解析通讯作者和邮箱 + */ + private function scrapeCorrespondingAuthor($doi) + { + $doi = $this->cleanDoi($doi); + if ($doi == '') { + return ['success' => false, 'error' => 'DOI为空']; + } + + $page = $this->fetchPageByDoi($doi); + if (!$page['success']) { + return ['success' => false, 'doi' => $doi, 'error' => $page['error']]; + } + + $finalUrl = $page['final_url']; + $html = $page['html']; + $publisher = $this->detectPublisher($finalUrl); + + switch ($publisher) { + case 'mdpi': + $parsed = $this->parseMdpiEmail($html); + break; + case 'springer': + $parsed = $this->parseSpringerEmail($html); + break; + case 'frontiers': + $parsed = $this->parseFrontiersEmail($html); + break; + case 'wiley': + $parsed = $this->parseWileyEmail($html); + break; + case 'elsevier': + $parsed = $this->parseElsevierEmail($html); + break; + case 'taylor_francis': + $parsed = $this->parseTaylorFrancisEmail($html); + break; + case 'plos': + $parsed = $this->parsePlosEmail($html); + break; + default: + $parsed = $this->parseGenericEmail($html); + break; + } + + // 如果专用解析器没找到,用通用方案兜底 + if (empty($parsed['corresponding_authors'])) { + $generic = $this->parseGenericEmail($html); + $parsed['corresponding_authors'] = 
$generic['corresponding_authors']; + } + + return [ + 'success' => true, + 'doi' => $doi, + 'url' => $finalUrl, + 'publisher' => $publisher, + 'corresponding_authors' => $parsed['corresponding_authors'], + 'all_emails' => $parsed['all_emails'], + ]; + } + + /** + * 通过 DOI 获取通讯作者邮箱(单个) + * + * @param string doi 文章DOI + */ + public function getAuthorEmail() + { + $data = $this->request->param(); + if (!isset($data['doi']) || trim($data['doi']) == '') { + return jsonError('doi不能为空'); + } + + $result = $this->scrapeCorrespondingAuthor($data['doi']); + if (!$result['success']) { + return jsonError($result['error']); + } + + return jsonSuccess($result); + } + + /** + * 批量通过 DOI 获取通讯作者邮箱 + * + * @param string dois 逗号分隔的DOI列表 + */ + public function batchGetAuthorEmails() + { + $data = $this->request->param(); + if (!isset($data['dois']) || trim($data['dois']) == '') { + return jsonError('dois不能为空'); + } + + $doiList = array_filter(array_map('trim', explode(',', $data['dois']))); + if (empty($doiList)) { + return jsonError('未提供有效的DOI'); + } + if (count($doiList) > 20) { + return jsonError('单次最多查询20个DOI'); + } + + $results = []; + $successCount = 0; + $emailFoundCount = 0; + + foreach ($doiList as $rawDoi) { + $result = $this->scrapeCorrespondingAuthor($rawDoi); + if ($result['success']) { + $successCount++; + if (!empty($result['corresponding_authors'])) { + $emailFoundCount++; + } + } + $results[] = $result; + usleep(500000); + } + + return jsonSuccess([ + 'total' => count($results), + 'success_count' => $successCount, + 'email_found_count' => $emailFoundCount, + 'list' => $results, + ]); + } + /** * 统计当前 field 转 major 的覆盖情况 */ diff --git a/application/api/controller/ExpertFinder.php b/application/api/controller/ExpertFinder.php new file mode 100644 index 0000000..498a87d --- /dev/null +++ b/application/api/controller/ExpertFinder.php @@ -0,0 +1,612 @@ +httpClient = new Client([ + 'timeout' => 60, + 'verify' => false, + ]); + } + + /** + * Main search endpoint + * Params: 
* keyword      - search term (e.g. "biomedical engineering")
     *   max_results  - max articles to scan, default 200, clamped to 10..1000
     *   min_year     - earliest publication year, default current-3
     *   source       - "pubmed" (fast, email from affiliation) or "pmc" (slower, structured email);
     *                  any other value is treated as "pubmed"
     *
     * Results are cached for one hour under a key derived from the four parameters.
     */
    public function search()
    {
        $params = $this->resolveSearchParams();
        if ($params['keyword'] === '') {
            return jsonError('keyword is required');
        }

        $cached = Cache::get($params['cache_key']);
        if ($cached) {
            return jsonSuccess($cached);
        }

        try {
            $result = $this->dispatchSearch($params);
        } catch (\Exception $e) {
            return jsonError('Search failed: ' . $e->getMessage());
        }

        Cache::set($params['cache_key'], $result, 3600);

        return jsonSuccess($result);
    }

    /**
     * Export search results to Excel.
     *
     * Accepts the same request parameters as search() and reuses its cache
     * entry when one exists; otherwise the search is run (and cached) first.
     */
    public function export()
    {
        $params = $this->resolveSearchParams();
        if ($params['keyword'] === '') {
            return jsonError('keyword is required');
        }

        $result = Cache::get($params['cache_key']);
        if (!$result) {
            try {
                $result = $this->dispatchSearch($params);
                Cache::set($params['cache_key'], $result, 3600);
            } catch (\Exception $e) {
                return jsonError('Search failed: ' . $e->getMessage());
            }
        }

        if (empty($result['experts'])) {
            return jsonError('No experts found to export');
        }

        return $this->generateExcel($result['experts'], $params['keyword']);
    }

    /**
     * Clear the cached result for one parameter combination.
     *
     * The key is computed by the same helper as search()/export(), so the
     * max_results clamp and source normalisation are applied here too and the
     * entry that search() wrote is the one that gets removed.
     */
    public function clearCache()
    {
        $params = $this->resolveSearchParams();
        Cache::rm($params['cache_key']);

        return jsonSuccess(['msg' => 'Cache cleared']);
    }

    /**
     * Read and normalise the request parameters shared by search(), export()
     * and clearCache().
     *
     * @return array keys: keyword, max_results, min_year, source, cache_key
     */
    private function resolveSearchParams()
    {
        $rawKeyword = $this->request->param('keyword', '');
        // A non-string value (e.g. keyword[]=x) is treated as missing rather than passed to trim().
        $keyword = is_string($rawKeyword) ? trim($rawKeyword) : '';

        $maxResults = max(10, min(intval($this->request->param('max_results', 200)), 1000));
        $minYear = intval($this->request->param('min_year', date('Y') - 3));
        $source = $this->request->param('source', 'pubmed') === 'pmc' ? 'pmc' : 'pubmed';

        // Fields are delimited so that different combinations cannot concatenate to the same string.
        $fingerprint = implode('|', [$keyword, $maxResults, $minYear, $source]);

        return [
            'keyword' => $keyword,
            'max_results' => $maxResults,
            'min_year' => $minYear,
            'source' => $source,
            'cache_key' => 'expert_finder_' . md5($fingerprint),
        ];
    }

    /**
     * Run the search against the backend selected by $params['source'].
     *
     * @param array $params output of resolveSearchParams()
     * @return array result structure produced by searchViaPubMed()/searchViaPMC()
     */
    private function dispatchSearch(array $params)
    {
        if ($params['source'] === 'pmc') {
            return $this->searchViaPMC($params['keyword'], $params['max_results'], $params['min_year']);
        }

        return $this->searchViaPubMed($params['keyword'], $params['max_results'], $params['min_year']);
    }

    // ==================== PubMed Search ====================

    /**
     * Search the "pubmed" database: IDs are fetched in batches of 200 with a
     * 340 ms pause between batches.
     *
     * @return array keys: experts, total, articles_scanned, source
     */
    private function searchViaPubMed($keyword, $maxResults, $minYear)
    {
        return $this->harvestExperts('pubmed', 200, 340000, $keyword, $maxResults, $minYear);
    }

    // ==================== PMC Search ====================

    /**
     * Search the "pmc" database: IDs are fetched in batches of 20 with a
     * 400 ms pause between batches.
     *
     * @return array keys: experts, total, articles_scanned, source
     */
    private function searchViaPMC($keyword, $maxResults, $minYear)
    {
        return $this->harvestExperts('pmc', 20, 400000, $keyword, $maxResults, $minYear);
    }

    /**
     * Shared esearch -> efetch -> parse -> aggregate pipeline.
     *
     * @param string $db          'pubmed' or 'pmc'; also selects the XML parser
     * @param int    $batchSize   number of IDs per efetch request
     * @param int    $pauseMicros pause between consecutive efetch requests, in microseconds
     * @return array keys: experts, total, articles_scanned, source
     */
    private function harvestExperts($db, $batchSize, $pauseMicros, $keyword, $maxResults, $minYear)
    {
        $ids = $this->esearch($db, $keyword, $maxResults, $minYear);
        if (empty($ids)) {
            return ['experts' => [], 'total' => 0, 'articles_scanned' => 0, 'source' => $db];
        }

        $batches = array_chunk($ids, $batchSize);
        $lastIndex = count($batches) - 1;
        $authorsPerBatch = [];

        foreach ($batches as $index => $batch) {
            $xml = $this->efetch($db, $batch);
            $authorsPerBatch[] = $db === 'pmc'
                ? $this->parsePMCXml($xml)
                : $this->parsePubMedXml($xml);

            // The pause only spaces out consecutive requests; nothing follows the last batch.
            if ($index < $lastIndex) {
                usleep($pauseMicros);
            }
        }

        // One merge at the end instead of re-copying the accumulated list for every batch.
        $experts = $this->aggregateExperts(array_merge(...$authorsPerBatch));

        return [
            'experts' => $experts,
            'total' => count($experts),
            'articles_scanned' => count($ids),
            'source' => $db,
        ];
    }

    // ==================== NCBI API Calls ====================

    /**
     * Call esearch.fcgi and return the matching article IDs.
     *
     * The keyword is combined with a publication-date range running from
     * $minYear to the current year.
     *
     * @return array list of IDs; empty when the response carries no idlist
     */
    private function esearch($db, $keyword, $maxResults, $minYear)
    {
        $term = $keyword . ' AND ' . $minYear . ':' . date('Y') . '[pdat]';

        $response = $this->httpClient->get($this->ncbiBaseUrl . 'esearch.fcgi', [
            'query' => [
                'db' => $db,
                'term' => $term,
                'retmax' => $maxResults,
                'retmode' => 'json',
                'sort' => 'relevance',
            ],
        ]);

        $data = json_decode($response->getBody()->getContents(), true);
        $ids = $data['esearchresult']['idlist'] ?? [];

        // Callers chunk the result, so anything that is not a list is reported as "no hits".
        return is_array($ids) ? $ids : [];
    }

    /**
     * Call efetch.fcgi for a batch of IDs and return the raw XML body.
     *
     * @param string $db
     * @param array  $ids
     * @return string
     */
    private function efetch($db, $ids)
    {
        $response = $this->httpClient->post($this->ncbiBaseUrl . 'efetch.fcgi', [
            'form_params' => [
                'db' => $db,
                'id' => implode(',', $ids),
                'retmode' => 'xml',
            ],
        ]);

        return $response->getBody()->getContents();
    }

    /**
     * Parse an XML string without leaking libxml state: the previous
     * internal-error setting is restored and the error buffer is cleared.
     *
     * @param string $xmlString
     * @return \SimpleXMLElement|null null when the string is not well-formed XML
     */
    private function loadXmlQuietly($xmlString)
    {
        $previous = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($xmlString);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $xml === false ? null : $xml;
    }

    // ==================== PubMed XML Parsing ====================

    /**
     * Extract one record per author that has an e-mail address in one of
     * their affiliation strings.
     *
     * @param string $xmlString efetch response for db=pubmed
     * @return array list of records with keys name, email, affiliation,
     *               article_title, article_id, journal
     */
    private function parsePubMedXml($xmlString)
    {
        $xml = $this->loadXmlQuietly($xmlString);
        if ($xml === null) {
            return [];
        }

        $results = [];

        foreach ($xml->PubmedArticle as $article) {
            $citation = $article->MedlineCitation;
            $articleData = $citation->Article;

            if (!isset($articleData->AuthorList->Author)) {
                continue;
            }

            $title = $this->xmlNodeToString($articleData->ArticleTitle);
            $pmid = (string) $citation->PMID;
            $journal = isset($articleData->Journal->Title)
                ? (string) $articleData->Journal->Title
                : '';

            foreach ($articleData->AuthorList->Author as $author) {
                $lastName = (string) ($author->LastName ?? '');
                $foreName = (string) ($author->ForeName ?? '');
                $fullName = trim($foreName . ' ' . $lastName);

                if (empty($fullName)) {
                    continue;
                }

                // Keep the first affiliation text and the first e-mail found in any affiliation.
                $email = '';
                $affiliation = '';
                foreach ($author->AffiliationInfo as $affInfo) {
                    $affText = (string) $affInfo->Affiliation;
                    if (empty($affiliation)) {
                        $affiliation = $affText;
                    }
                    if (empty($email)) {
                        $email = $this->extractEmailFromText($affText);
                    }
                }

                if (empty($email)) {
                    continue;
                }

                $results[] = [
                    'name' => $fullName,
                    'email' => strtolower($email),
                    'affiliation' => $this->cleanAffiliation($affiliation),
                    'article_title' => $title,
                    'article_id' => $pmid,
                    'journal' => $journal,
                ];
            }
        }

        return $results;
    }

    // ==================== PMC XML Parsing ====================

    /**
     * Extract one record per author (contrib-type="author") for whom an
     * e-mail can be determined, scanning every <contrib-group> of each article.
     *
     * @param string $xmlString efetch response for db=pmc
     * @return array list of records with keys name, email, affiliation,
     *               article_title, article_id, journal
     */
    private function parsePMCXml($xmlString)
    {
        $xml = $this->loadXmlQuietly($xmlString);
        if ($xml === null) {
            return [];
        }

        // A document whose root is itself an <article> is handled as a one-element set.
        if ($xml->getName() === 'article') {
            $articles = [$xml];
        } else {
            $articles = $xml->article ?? $xml->children();
        }

        $results = [];

        foreach ($articles as $article) {
            if ($article->getName() !== 'article') {
                continue;
            }

            $front = $article->front;
            if (!$front) {
                continue;
            }

            $articleMeta = $front->{'article-meta'};
            if (!$articleMeta || !isset($articleMeta->{'contrib-group'})) {
                continue;
            }

            $title = $this->xmlNodeToString($articleMeta->{'title-group'}->{'article-title'} ?? null);

            // NOTE(review): "pmcid" is accepted as a fallback on the assumption that newer PMC
            // records use it instead of "pmc" - confirm against the current PMC tagging guidelines.
            $pmcId = '';
            $pmcIdFallback = '';
            foreach ($articleMeta->{'article-id'} as $idNode) {
                $idType = (string) $idNode['pub-id-type'];
                if ($idType === 'pmc') {
                    $pmcId = (string) $idNode;
                } elseif ($idType === 'pmcid') {
                    $pmcIdFallback = (string) $idNode;
                }
            }
            if ($pmcId === '') {
                $pmcId = $pmcIdFallback;
            }

            $journal = '';
            $journalMeta = $front->{'journal-meta'};
            if (isset($journalMeta->{'journal-title'})) {
                $journal = (string) $journalMeta->{'journal-title'};
            } elseif (isset($journalMeta->{'journal-title-group'}->{'journal-title'})) {
                $journal = (string) $journalMeta->{'journal-title-group'}->{'journal-title'};
            }

            $correspEmails = [];
            if (isset($articleMeta->{'author-notes'})) {
                $this->extractEmailsFromNode($articleMeta->{'author-notes'}, $correspEmails);
            }

            // id => text for <aff> elements inside any contrib-group and directly under article-meta.
            $affiliationMap = [];
            foreach ($articleMeta->{'contrib-group'} as $group) {
                foreach ($group->children() as $child) {
                    if ($child->getName() !== 'aff') {
                        continue;
                    }
                    $affId = (string) ($child['id'] ?? '');
                    if ($affId) {
                        $affiliationMap[$affId] = $this->xmlNodeToString($child);
                    }
                }
            }
            foreach ($articleMeta->aff as $aff) {
                $affId = (string) ($aff['id'] ?? '');
                if ($affId) {
                    $affiliationMap[$affId] = $this->xmlNodeToString($aff);
                }
            }

            foreach ($articleMeta->{'contrib-group'} as $group) {
                foreach ($group->contrib as $contrib) {
                    $person = $this->resolvePMCContributor($contrib, $affiliationMap, $correspEmails);
                    if ($person === null) {
                        continue;
                    }

                    $results[] = [
                        'name' => $person['name'],
                        'email' => strtolower($person['email']),
                        'affiliation' => $this->cleanAffiliation($person['affiliation']),
                        'article_title' => $title,
                        'article_id' => $pmcId,
                        'journal' => $journal,
                    ];
                }
            }
        }

        return $results;
    }

    /**
     * Resolve name, e-mail and affiliation for one <contrib> element.
     *
     * E-mail precedence: the contributor's own <email>; else, for a
     * corresponding author, the first address found in <author-notes>;
     * else an address embedded in the affiliation text.
     *
     * @param \SimpleXMLElement $contrib
     * @param array             $affiliationMap aff id => affiliation text
     * @param array             $correspEmails  addresses collected from <author-notes>
     * @return array|null keys name, email, affiliation; null when the element is
     *                    not an author, has no name, or no e-mail can be found
     */
    private function resolvePMCContributor($contrib, $affiliationMap, $correspEmails)
    {
        if ((string) ($contrib['contrib-type'] ?? '') !== 'author') {
            return null;
        }

        $nameNode = $contrib->name;
        if (!$nameNode) {
            return null;
        }

        $surname = (string) ($nameNode->surname ?? '');
        $givenNames = (string) ($nameNode->{'given-names'} ?? '');
        $fullName = trim($givenNames . ' ' . $surname);
        if (empty($fullName)) {
            return null;
        }

        $email = '';
        if (isset($contrib->email)) {
            $email = strtolower(trim((string) $contrib->email));
        }

        $affiliation = '';
        $isCorresponding = (string) ($contrib['corresp'] ?? '') === 'yes';

        foreach ($contrib->xref as $xref) {
            $refType = (string) $xref['ref-type'];

            if ($refType === 'corresp') {
                $isCorresponding = true;
                continue;
            }

            if ($refType !== 'aff' || $affiliation !== '') {
                continue;
            }

            // rid may list several space-separated ids; take the first one that is known.
            foreach (preg_split('/\s+/', trim((string) $xref['rid'])) as $rid) {
                if (isset($affiliationMap[$rid])) {
                    $affiliation = $affiliationMap[$rid];
                    break;
                }
            }
        }

        if (empty($affiliation) && isset($contrib->aff)) {
            $affiliation = $this->xmlNodeToString($contrib->aff);
        }

        if (empty($email) && $isCorresponding && !empty($correspEmails)) {
            $email = $correspEmails[0];
        }

        if (empty($email)) {
            $email = $this->extractEmailFromText($affiliation);
        }

        if (empty($email)) {
            return null;
        }

        return ['name' => $fullName, 'email' => $email, 'affiliation' => $affiliation];
    }

    // ==================== Aggregation ====================

    /**
     * Group author records by lower-cased e-mail address.
     *
     * Each expert keeps the name and e-mail of the first record seen, a count
     * of all records, at most 10 papers, and the first non-empty affiliation.
     * The list is ordered by paper_count, highest first.
     *
     * @param array $authorRecords records as produced by the XML parsers
     * @return array list of experts: name, email, affiliation, paper_count, papers
     */
    private function aggregateExperts($authorRecords)
    {
        $byEmail = [];

        foreach ($authorRecords as $record) {
            $key = strtolower(trim($record['email']));
            if (empty($key)) {
                continue;
            }

            if (!isset($byEmail[$key])) {
                $byEmail[$key] = [
                    'name' => $record['name'],
                    'email' => $record['email'],
                    'affiliation' => $record['affiliation'],
                    'paper_count' => 0,
                    'papers' => [],
                ];
            }

            $byEmail[$key]['paper_count']++;

            if (count($byEmail[$key]['papers']) < 10) {
                $byEmail[$key]['papers'][] = [
                    'title' => $record['article_title'],
                    'article_id' => $record['article_id'],
                    'journal' => $record['journal'],
                ];
            }

            if (empty($byEmail[$key]['affiliation']) && !empty($record['affiliation'])) {
                $byEmail[$key]['affiliation'] = $record['affiliation'];
            }
        }

        $experts = array_values($byEmail);

        usort($experts, function ($a, $b) {
            return $b['paper_count'] <=> $a['paper_count'];
        });

        return $experts;
    }

    // ==================== Excel Export ====================

    /**
     * Write the expert list to an .xlsx file under public/exports and return
     * its URL, file name and row count.
     *
     * Text cells are stored as explicit strings so that a name, affiliation or
     * title beginning with "=" is kept as text instead of becoming a formula.
     *
     * @param array  $experts list produced by aggregateExperts()
     * @param string $keyword used (sanitised) in the file name
     */
    private function generateExcel($experts, $keyword)
    {
        $spreadsheet = new \PhpOffice\PhpSpreadsheet\Spreadsheet();
        $sheet = $spreadsheet->getActiveSheet();
        $sheet->setTitle('Experts');

        // Column letter => [header text, column width].
        $columns = [
            'A' => ['#', 6],
            'B' => ['Name', 25],
            'C' => ['Email', 35],
            'D' => ['Affiliation', 50],
            'E' => ['Paper Count', 12],
            'F' => ['Representative Papers', 60],
        ];
        foreach ($columns as $letter => list($header, $width)) {
            $sheet->setCellValue($letter . '1', $header);
            $sheet->getColumnDimension($letter)->setWidth($width);
        }

        $sheet->getStyle('A1:F1')->applyFromArray([
            'font' => ['bold' => true, 'color' => ['rgb' => 'FFFFFF']],
            'fill' => [
                'fillType' => \PhpOffice\PhpSpreadsheet\Style\Fill::FILL_SOLID,
                'startColor' => ['rgb' => '4472C4'],
            ],
            'alignment' => ['horizontal' => \PhpOffice\PhpSpreadsheet\Style\Alignment::HORIZONTAL_CENTER],
        ]);

        $asText = \PhpOffice\PhpSpreadsheet\Cell\DataType::TYPE_STRING;
        $row = 2;
        foreach ($experts as $expert) {
            $paperTitles = array_column($expert['papers'], 'title');

            $sheet->setCellValue('A' . $row, $row - 1);
            $sheet->getCell('B' . $row)->setValueExplicit((string) $expert['name'], $asText);
            $sheet->getCell('C' . $row)->setValueExplicit((string) $expert['email'], $asText);
            $sheet->getCell('D' . $row)->setValueExplicit((string) $expert['affiliation'], $asText);
            $sheet->setCellValue('E' . $row, $expert['paper_count']);
            $sheet->getCell('F' . $row)->setValueExplicit(implode("\n", $paperTitles), $asText);

            $row++;
        }

        // The keyword length is unbounded, so the slug is capped to keep the file name valid.
        $slug = substr(preg_replace('/[^a-zA-Z0-9]/', '_', $keyword), 0, 80);
        $filename = 'experts_' . $slug . '_' . date('Ymd_His') . '.xlsx';

        $dir = ROOT_PATH . 'public' . DS . 'exports';
        // NOTE(review): 0777 is kept from the original; consider tightening it for a web-served directory.
        // The second is_dir() covers another request creating the directory concurrently.
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            return jsonError('Export failed: cannot create export directory');
        }

        try {
            $writer = new \PhpOffice\PhpSpreadsheet\Writer\Xlsx($spreadsheet);
            $writer->save($dir . DS . $filename);
        } catch (\Exception $e) {
            return jsonError('Export failed: ' . $e->getMessage());
        }

        return jsonSuccess([
            'file_url' => '/exports/' . $filename,
            'file_name' => $filename,
            'count' => count($experts),
        ]);
    }

    // ==================== Helper Methods ====================

    /**
     * Return the first e-mail address found in $text, lower-cased, or ''.
     *
     * Labelled forms ("Electronic address:", "E-mail:"/"Email:") are tried
     * before a generic address pattern.
     *
     * @param string $text
     * @return string
     */
    private function extractEmailFromText($text)
    {
        if (empty($text)) {
            return '';
        }

        $patterns = [
            '/[Ee]lectronic address:\s*([^\s;,]+@[^\s;,]+)/',
            '/[Ee]-?mail:\s*([^\s;,]+@[^\s;,]+)/',
            '/\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b/',
        ];

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $text, $m)) {
                // Strip a sentence-ending dot captured with the address.
                return strtolower(trim($m[1], '.'));
            }
        }

        return '';
    }

    /**
     * Recursively collect e-mail addresses from $node into $emails: the
     * content of <email> elements plus addresses matched in each node's text.
     * Addresses are lower-cased and added only once.
     *
     * @param \SimpleXMLElement|null $node
     * @param array                  $emails accumulator, modified in place
     */
    private function extractEmailsFromNode($node, &$emails)
    {
        if ($node === null) {
            return;
        }

        foreach ($node->children() as $child) {
            if ($child->getName() === 'email') {
                $email = strtolower(trim((string) $child));
                if (!empty($email) && !in_array($email, $emails, true)) {
                    $emails[] = $email;
                }
            }
            $this->extractEmailsFromNode($child, $emails);
        }

        $text = (string) $node;
        if (preg_match_all('/\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b/', $text, $matches)) {
            foreach ($matches[1] as $email) {
                $email = strtolower(trim($email, '.'));
                if (!in_array($email, $emails, true)) {
                    $emails[] = $email;
                }
            }
        }
    }

    /**
     * Remove e-mail addresses (labelled or bare) from an affiliation string
     * and trim surrounding whitespace and ".,;" punctuation.
     *
     * @param string $text
     * @return string
     */
    private function cleanAffiliation($text)
    {
        $text = (string) $text;

        $patterns = [
            '/\s*[Ee]lectronic address:\s*[^\s;,]+@[^\s;,]+/',
            '/\s*[Ee]-?mail:\s*[^\s;,]+@[^\s;,]+/',
            '/\s*\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b/',
        ];

        foreach ($patterns as $pattern) {
            $stripped = preg_replace($pattern, '', $text);
            // preg_replace() returns null on a PCRE error; keep the text as it was in that case.
            if ($stripped !== null) {
                $text = $stripped;
            }
        }

        return trim($text, " \t\n\r\0\x0B.,;");
    }

    /**
     * Flatten an XML node to plain text: markup removed, entities decoded,
     * runs of whitespace collapsed to a single space.
     *
     * @param \SimpleXMLElement|null $node
     * @return string '' when the node is null or cannot be serialised
     */
    private function xmlNodeToString($node)
    {
        if ($node === null) {
            return '';
        }

        $xml = $node->asXML();
        if (!is_string($xml)) {
            return '';
        }

        $text = html_entity_decode(strip_tags($xml), ENT_QUOTES | ENT_XML1, 'UTF-8');

        return trim(preg_replace('/\s+/', ' ', $text));
    }
}