major_obj
            ->where('major_state', 0)
            ->where('major_type', 0)
            ->select();

        // Index every row by major_id so parents can be looked up while building paths.
        $majorMap = [];
        foreach ($allMajors as $m) {
            $majorMap[$m['major_id']] = $m;
        }

        // Keep only leaf majors: rows that no other row references through `pid`.
        $result = [];
        foreach ($allMajors as $m) {
            $hasChild = false;
            foreach ($allMajors as $check) {
                if ($check['pid'] == $m['major_id']) {
                    $hasChild = true;
                    break;
                }
            }
            if (!$hasChild) {
                $path = $this->buildMajorPath($m['major_id'], $majorMap);
                $result[] = [
                    'major_id' => $m['major_id'],
                    'major_title' => $m['major_title'],
                    'full_path' => $path,
                ];
            }
        }

        Cache::set($cacheKey, $result, 3600);
        return $result;
    }

    /**
     * Recursively build the full "ancestor > ... > title" path of a major.
     *
     * Recursion stops at a row whose pid is 0 or 1, or whose parent is not
     * present in $majorMap; an unknown $majorId yields an empty string.
     *
     * @param mixed $majorId  major_id to resolve
     * @param array $majorMap rows indexed by major_id (passed by reference, never modified here)
     * @return string titles joined with ' > '
     */
    private function buildMajorPath($majorId, &$majorMap)
    {
        if (!isset($majorMap[$majorId])) {
            return '';
        }

        $m = $majorMap[$majorId];
        if ($m['pid'] == 0 || $m['pid'] == 1 || !isset($majorMap[$m['pid']])) {
            return $m['major_title'];
        }

        // NOTE(review): a pid cycle in the data would recurse without end — confirm the table cannot contain one.
        return $this->buildMajorPath($m['pid'], $majorMap) . ' > ' . $m['major_title'];
    }

    /**
     * Render the major list as prompt text for the AI, one "ID:<id> - <full path>" line per entry.
     *
     * @param array $majorTree entries carrying 'major_id' and 'full_path'
     * @return string lines joined with "\n"
     */
    private function buildMajorListPrompt($majorTree)
    {
        $lines = [];
        foreach ($majorTree as $item) {
            $lines[] = "ID:{$item['major_id']} - {$item['full_path']}";
        }

        return implode("\n", $lines);
    }

    /**
     * Ask the chat-completions endpoint to map a free-text field description to standard major ids.
     *
     * Any failure (cURL error, unexpected response shape, no JSON array in the
     * answer) results in an empty array.
     *
     * @param string $field           user-supplied research field description
     * @param string $majorListPrompt text produced by buildMajorListPrompt()
     * @return int[] ids extracted from the model answer (not yet validated against the database)
     */
    private function matchFieldToMajor($field, $majorListPrompt)
    {
        $systemPrompt = "你是一位医学领域分类专家。用户会提供一段研究领域的描述文本,你需要从给定的标准领域列表中找出最匹配的1-3个领域。\n"
            . "请严格按照JSON数组格式返回匹配结果,只返回major_id数组,如 [12,34,56]。\n"
            . "如果没有合适的匹配,返回空数组 []。\n"
            . "不要返回任何其他内容,只返回JSON数组。\n\n"
            . "标准领域列表:\n" . $majorListPrompt;

        $userPrompt = "请为以下研究领域描述匹配最合适的标准领域ID:\n" . $field;

        $messages = [
            ['role' => 'system', 'content' => $systemPrompt],
            ['role' => 'user', 'content' => $userPrompt],
        ];

        $apiKey = Env::get("gpt.api_key1", Env::get("gpt.api_key", ""));
        // NOTE(review): the bearer key is sent over plain HTTP — confirm whether an HTTPS endpoint is available.
        $url = 'http://chat.taimed.cn/v1/chat/completions';
        $data = [
            'model' => 'gpt-4.1',
            'messages' => $messages,
            'temperature' => 0.1,
            'max_tokens' => 200,
        ];

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Authorization: Bearer ' . $apiKey,
        ]);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($data));
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_TIMEOUT, 60);

        $result = curl_exec($ch);
        if (curl_errno($ch)) {
            curl_close($ch);
            return [];
        }
        curl_close($ch);

        $res = json_decode($result, true);
        if (!isset($res['choices'][0]['message']['content'])) {
            return [];
        }

        $content = trim($res['choices'][0]['message']['content']);

        // Pull the first bracketed list of digits out of the answer, tolerating surrounding text.
        if (preg_match('/\[[\d,\s]*\]/', $content, $matches)) {
            $ids = json_decode($matches[0], true);
            if (is_array($ids)) {
                return array_map('intval', $ids);
            }
        }

        return [];
    }

    /**
     * Insert the matched majors for a user into the major_to_user model, skipping ids
     * the user already has (state = 0).
     *
     * @param mixed $userId   user id
     * @param array $majorIds major ids to attach
     * @return int number of rows inserted
     */
    private function saveMajorToUser($userId, $majorIds)
    {
        $existing = $this->major_to_user_obj
            ->where('user_id', $userId)
            ->where('state', 0)
            ->column('major_id');

        // De-duplicate first: the model may repeat an id, which would otherwise be inserted twice.
        $toInsert = array_diff(array_unique($majorIds), $existing);
        foreach ($toInsert as $majorId) {
            $this->major_to_user_obj->insert([
                'user_id' => $userId,
                'major_id' => $majorId,
                'ctime' => time(),
            ]);
        }

        return count($toInsert);
    }

    /**
     * Match one user's field text to standard majors and return the resulting major rows.
     *
     * Request parameters: user_id (required), field (required).
     * This endpoint only reads: it returns the union of the AI-matched majors and the
     * majors the user already has, and does not call saveMajorToUser().
     */
    public function processOneUser()
    {
        $data = $this->request->param();
        if (!isset($data['user_id']) || $data['user_id'] == '') {
            return jsonError('user_id不能为空');
        }
        if (!isset($data['field']) || $data['field'] == '') {
            return jsonError('field不能为空');
        }

        $userId = intval($data['user_id']);
        $reviewerInfo = $this->user_reviewer_info_obj
            ->where('reviewer_id', $userId)
            ->where('state', 0)
            ->find();
        if (!$reviewerInfo) {
            return jsonError('未找到该用户的reviewer信息');
        }

        // The field text comes from the request, not from the reviewer record.
        $field = trim($data['field']);

        $majorTree = $this->getMajorTree();
        if (empty($majorTree)) {
            return jsonError('未获取到标准领域数据');
        }

        $majorListPrompt = $this->buildMajorListPrompt($majorTree);
        $matchedIds = $this->matchFieldToMajor($field, $majorListPrompt);
        if (empty($matchedIds)) {
            return jsonSuccess([
                'user_id' => $userId,
                'field' => $field,
                'matched_ids' => [],
                'inserted' => 0,
                'msg' => 'AI未匹配到合适的领域',
            ]);
        }

        // Keep only ids that really exist and are active.
        $validMajors = $this->major_obj
            ->where('major_id', 'in', $matchedIds)
            ->where('major_state', 0)
            ->column('major_id');

        $existing = $this->major_to_user_obj
            ->where('user_id', $userId)
            ->where('state', 0)
            ->column('major_id');

        $unionArray = array_unique(array_merge($validMajors, $existing));
        $ms = $this->major_obj
            ->where('major_id', 'in', $unionArray)
            ->where('major_state', 0)
            ->select();

        // Decorate each row with the values of the project helpers getMajorShu()/getMajorStr().
        foreach ($ms as $k => $major) {
            $ms[$k]['shu'] = getMajorShu($major['major_id']);
            $ms[$k]['str'] = getMajorStr($major['major_id']);
        }

        return jsonSuccess([
            'user_id' => $userId,
            'field' => $field,
            'majors' => $ms,
        ]);
    }

    /**
     * Batch job: pick reviewers that have a non-empty field, match each with the AI and
     * store the matches through saveMajorToUser().
     *
     * Request parameters:
     *   limit          users per call, default 10, clamped to 1..50
     *   skip_has_major when truthy (default 1) skip users that already have major_to_user rows
     */
    public function batchProcess()
    {
        $data = $this->request->param();
        $limit = isset($data['limit']) ? intval($data['limit']) : 10;
        $skipHasMajor = isset($data['skip_has_major']) ? intval($data['skip_has_major']) : 1;

        // Clamp on both sides: a zero or negative value must never reach ->limit().
        $limit = max(1, min(50, $limit));

        $query = $this->user_reviewer_info_obj
            ->alias('ri')
            ->field('ri.reviewer_id, ri.field')
            ->where('ri.state', 0)
            ->where('ri.field', '<>', '');

        if ($skipHasMajor) {
            $subQuery = Db::name('major_to_user')->where('state', 0)->field('user_id')->buildSql();
            $query = $query->where('ri.reviewer_id', 'not in', $subQuery);
        }

        $users = $query->limit($limit)->select();
        if (empty($users)) {
            return jsonSuccess([
                'processed' => 0,
                'msg' => '没有需要处理的用户',
            ]);
        }

        $majorTree = $this->getMajorTree();
        if (empty($majorTree)) {
            return jsonError('未获取到标准领域数据');
        }

        $majorListPrompt = $this->buildMajorListPrompt($majorTree);
        $validMajorIds = $this->major_obj->where('major_state', 0)->column('major_id');

        $results = [];
        $successCount = 0;
        $failCount = 0;

        // NOTE(review): users without any match get no row, so the next call selects them again — confirm this is intended.
        foreach ($users as $user) {
            $field = trim($user['field']);
            if ($field == '') {
                continue;
            }

            $matchedIds = $this->matchFieldToMajor($field, $majorListPrompt);
            $matchedIds = array_intersect($matchedIds, $validMajorIds);

            if (!empty($matchedIds)) {
                $inserted = $this->saveMajorToUser($user['reviewer_id'], $matchedIds);
                $results[] = [
                    'user_id' => $user['reviewer_id'],
                    'field' => mb_substr($field, 0, 100),
                    'matched_ids' => array_values($matchedIds),
                    'inserted' => $inserted,
                ];
                $successCount++;
            } else {
                $results[] = [
                    'user_id' => $user['reviewer_id'],
                    'field' => mb_substr($field, 0, 100),
                    'matched_ids' => [],
                    'inserted' => 0,
                ];
                $failCount++;
            }
        }

        return jsonSuccess([
            'processed' => count($results),
            'success_count' => $successCount,
            'fail_count' => $failCount,
            'details' => $results,
        ]);
    }

    /**
     * Debug endpoint: return the current leaf-major list and its size.
     */
    public function getMajorList()
    {
        $majorTree = $this->getMajorTree();

        return jsonSuccess([
            'total' => count($majorTree),
            'list' => $majorTree,
        ]);
    }

    /**
     * Read public/system/t_major.xlsx and return its headers, row count and the first 20 rows.
     *
     * Despite the name, nothing is written to the database here; the method only
     * previews the spreadsheet content.
     */
    public function importMajorFromExcel()
    {
        $file = ROOT_PATH . 'public' . DS . 'system' . DS . 't_major.xlsx';
        if (!file_exists($file)) {
            return jsonError('Excel文件不存在: public/system/t_major.xlsx');
        }

        $spreadsheet = \PhpOffice\PhpSpreadsheet\IOFactory::load($file);
        $sheet = $spreadsheet->getActiveSheet();
        $highestRow = $sheet->getHighestRow();
        $highestColumn = $sheet->getHighestColumn();

        // Row 1 holds the column headers.
        $headers = [];
        $colCount = \PhpOffice\PhpSpreadsheet\Cell\Coordinate::columnIndexFromString($highestColumn);
        for ($col = 1; $col <= $colCount; $col++) {
            $headers[$col] = $sheet->getCellByColumnAndRow($col, 1)->getValue();
        }

        // Remaining rows become header => value maps.
        $rows = [];
        for ($row = 2; $row <= $highestRow; $row++) {
            $rowData = [];
            for ($col = 1; $col <= $colCount; $col++) {
                $rowData[$headers[$col]] = $sheet->getCellByColumnAndRow($col, $row)->getValue();
            }
            $rows[] = $rowData;
        }

        return jsonSuccess([
            'headers' => array_values($headers),
            'total' => count($rows),
            'preview' => array_slice($rows, 0, 20),
        ]);
    }

    // ========== CrossRef DOI lookup & retraction detection ==========

    /**
     * Normalise a DOI by removing a leading resolver URL or "doi:" prefix.
     *
     * Accepts http/https, doi.org and dx.doi.org, in any letter case.
     *
     * @param string $doi raw DOI text
     * @return string bare DOI, trimmed
     */
    private function cleanDoi($doi)
    {
        $doi = trim($doi);
        $doi = preg_replace('/^https?:\/\/(?:dx\.)?doi\.org\//i', '', $doi);
        $doi = preg_replace('/^doi:\s*/i', '', $doi);

        return trim($doi);
    }

    /**
     * Fetch the raw CrossRef `message` object for a DOI.
     *
     * @param string $doi bare DOI
     * @return array ['success' => true, 'message' => array] on success,
     *               ['success' => false, 'error' => string] otherwise
     */
    private function fetchCrossRefData($doi)
    {
        $url = 'https://api.crossref.org/works/' . urlencode($doi);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        // NOTE(review): certificate verification is switched off for this HTTPS call — confirm this is still required.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_HTTPHEADER, [
            'User-Agent: TMRJournals/1.0 (mailto:publisher@tmrjournals.com)',
            'Accept: application/json',
        ]);

        $result = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if (curl_errno($ch)) {
            $error = curl_error($ch);
            curl_close($ch);
            return ['success' => false, 'error' => 'CURL错误: ' . $error];
        }
        curl_close($ch);

        if ($httpCode == 404) {
            return ['success' => false, 'error' => 'DOI在CrossRef中未找到'];
        }
        if ($httpCode != 200) {
            return ['success' => false, 'error' => 'CrossRef返回HTTP ' . $httpCode];
        }

        $data = json_decode($result, true);
        if (!isset($data['message'])) {
            return ['success' => false, 'error' => 'CrossRef返回数据格式异常'];
        }

        return ['success' => true, 'message' => $data['message']];
    }

    /**
     * Turn a CrossRef date object into "Y-m-d", "Y-m" or "Y", depending on the parts present.
     *
     * @param array $dateObj structure carrying 'date-parts' => [[year, month, day]]
     * @return string formatted date, or '' when there are no date parts
     */
    private function parseDateParts($dateObj)
    {
        if (!isset($dateObj['date-parts'][0])) {
            return '';
        }

        $parts = $dateObj['date-parts'][0];
        $y = isset($parts[0]) ? $parts[0] : '';
        $m = isset($parts[1]) ? sprintf('%02d', $parts[1]) : '';
        $d = isset($parts[2]) ? sprintf('%02d', $parts[2]) : '';

        if ($y && $m && $d) {
            return "{$y}-{$m}-{$d}";
        }
        if ($y && $m) {
            return "{$y}-{$m}";
        }

        return (string)$y;
    }

    /**
     * Normalise the CrossRef author list.
     *
     * @param mixed $authorList value of the `author` field
     * @return array list of ['given', 'family', 'name', 'ORCID', 'sequence', 'affiliation' => string[]]
     */
    private function parseAuthors($authorList)
    {
        if (empty($authorList) || !is_array($authorList)) {
            return [];
        }

        $result = [];
        foreach ($authorList as $a) {
            $author = [
                'given' => $a['given'] ?? '',
                'family' => $a['family'] ?? '',
                // Prefer an explicit `name`; otherwise compose "given family".
                'name' => isset($a['name'])
                    ? $a['name']
                    : ((isset($a['given']) ? $a['given'] . ' ' : '') . ($a['family'] ?? '')),
                'ORCID' => $a['ORCID'] ?? '',
                'sequence' => $a['sequence'] ?? '',
                'affiliation' => [],
            ];

            if (isset($a['affiliation']) && is_array($a['affiliation'])) {
                foreach ($a['affiliation'] as $aff) {
                    $author['affiliation'][] = $aff['name'] ?? '';
                }
            }

            $result[] = $author;
        }

        return $result;
    }

    /**
     * Decide whether a CrossRef record is retraction-related.
     *
     * Four signals are checked; any one of them sets the flag:
     *   1. an `update-to` or `updated-by` entry whose type or label contains "retract"
     *   2. the record's own type/subtype contains "retract"
     *   3. a `relation` key containing "retract"
     *   4. a title containing retraction/retracted/withdrawal/withdrawn
     *
     * @param array $message CrossRef `message` object
     * @return array ['is_retracted' => bool, 'retraction_detail' => array]
     */
    private function detectRetraction($message)
    {
        $isRetracted = false;
        $retractionDetail = [];

        // 1. Update relations. `update-to` sits on the notice and points at the work it
        //    updates; `updated-by` sits on the updated work and points at the notice, so
        //    a retracted article is only recognised through the latter.
        foreach (['update-to', 'updated-by'] as $updateKey) {
            if (!isset($message[$updateKey]) || !is_array($message[$updateKey])) {
                continue;
            }
            foreach ($message[$updateKey] as $update) {
                $updateType = strtolower($update['type'] ?? '');
                $updateLabel = strtolower($update['label'] ?? '');
                if (strpos($updateType, 'retract') !== false || strpos($updateLabel, 'retract') !== false) {
                    $isRetracted = true;
                    $retractionDetail['retraction_notice'] = [
                        'type' => $update['type'] ?? '',
                        'label' => $update['label'] ?? '',
                        'DOI' => $update['DOI'] ?? '',
                        'date' => isset($update['updated']) ? $this->parseDateParts($update['updated']) : '',
                    ];
                    break 2;
                }
            }
        }

        // 2. type / subtype of the record itself.
        $type = strtolower($message['type'] ?? '');
        $subtype = strtolower($message['subtype'] ?? '');
        if (strpos($type, 'retract') !== false || strpos($subtype, 'retract') !== false) {
            $isRetracted = true;
            $retractionDetail['is_retraction_notice'] = true;
        }

        // 3. relation keys.
        if (isset($message['relation']) && is_array($message['relation'])) {
            foreach ($message['relation'] as $relType => $relations) {
                if (strpos(strtolower($relType), 'retract') !== false) {
                    $isRetracted = true;
                    $retractionDetail['relation'] = [$relType => $relations];
                    break;
                }
            }
        }

        // 4. Title keywords.
        // NOTE(review): plain substring matching also hits ordinary titles such as
        // "clot retraction" or "alcohol withdrawal" — consider tightening.
        $titles = $message['title'] ?? [];
        foreach ($titles as $title) {
            $lower = strtolower($title);
            if (strpos($lower, 'retraction') !== false
                || strpos($lower, 'retracted') !== false
                || strpos($lower, 'withdrawal') !== false
                || strpos($lower, 'withdrawn') !== false
            ) {
                $isRetracted = true;
                $retractionDetail['title_keyword'] = $title;
                break;
            }
        }

        return ['is_retracted' => $isRetracted, 'retraction_detail' => $retractionDetail];
    }

    /**
     * Convert a CrossRef `message` into the structured record returned by the DOI endpoints.
     *
     * @param string $doi     bare DOI that was queried
     * @param array  $message CrossRef `message` object
     * @return array structured metadata including `is_retracted` and `retraction_detail`
     */
    private function parseCrossRefMessage($doi, $message)
    {
        // Basic information.
        $info = [
            'doi' => $doi,
            'url' => $message['URL'] ?? ('https://doi.org/' . $doi),
            'type' => $message['type'] ?? '',
            'title' => isset($message['title'][0]) ? $message['title'][0] : '',
        ];

        // Authors.
        $info['authors'] = $this->parseAuthors($message['author'] ?? []);
        $info['author_string'] = implode(', ', array_column($info['authors'], 'name'));

        // Journal / source.
        $info['journal'] = [
            'title' => isset($message['container-title'][0]) ? $message['container-title'][0] : '',
            'short_title' => isset($message['short-container-title'][0]) ? $message['short-container-title'][0] : '',
            'ISSN' => $message['ISSN'] ?? [],
            'publisher' => $message['publisher'] ?? '',
        ];

        // Volume / issue / pages.
        $info['volume'] = $message['volume'] ?? '';
        $info['issue'] = $message['issue'] ?? '';
        $info['page'] = $message['page'] ?? '';
        $info['article_number'] = $message['article-number'] ?? '';

        // Dates.
        $info['dates'] = [
            'published_print' => isset($message['published-print']) ? $this->parseDateParts($message['published-print']) : '',
            'published_online' => isset($message['published-online']) ? $this->parseDateParts($message['published-online']) : '',
            'published' => isset($message['published']) ? $this->parseDateParts($message['published']) : '',
            'created' => isset($message['created']) ? $this->parseDateParts($message['created']) : '',
            'deposited' => isset($message['deposited']) ? $this->parseDateParts($message['deposited']) : '',
            'indexed' => isset($message['indexed']) ? $this->parseDateParts($message['indexed']) : '',
        ];

        // Year: first non-empty year in print > online > published > created order.
        $info['year'] = '';
        foreach (['published-print', 'published-online', 'published', 'created'] as $dk) {
            if (isset($message[$dk]['date-parts'][0][0]) && $message[$dk]['date-parts'][0][0]) {
                $info['year'] = (string)$message[$dk]['date-parts'][0][0];
                break;
            }
        }

        // Abstract.
        $info['abstract'] = $message['abstract'] ?? '';

        // Subjects.
        $info['subject'] = $message['subject'] ?? [];

        // Citation counters.
        $info['references_count'] = $message['references-count'] ?? 0;
        $info['is_referenced_by_count'] = $message['is-referenced-by-count'] ?? 0;

        // Funding.
        $funders = [];
        if (isset($message['funder']) && is_array($message['funder'])) {
            foreach ($message['funder'] as $f) {
                $funders[] = [
                    'name' => $f['name'] ?? '',
                    'DOI' => $f['DOI'] ?? '',
                    'award' => $f['award'] ?? [],
                ];
            }
        }
        $info['funders'] = $funders;

        // Licenses.
        $licenses = [];
        if (isset($message['license']) && is_array($message['license'])) {
            foreach ($message['license'] as $lic) {
                $licenses[] = [
                    'URL' => $lic['URL'] ?? '',
                    'start_date' => isset($lic['start']) ? $this->parseDateParts($lic['start']) : '',
                ];
            }
        }
        $info['licenses'] = $licenses;

        // Retraction detection.
        $retraction = $this->detectRetraction($message);
        $info['is_retracted'] = $retraction['is_retracted'];
        $info['retraction_detail'] = $retraction['retraction_detail'];

        // All `update-to` relations (errata, corrections, retraction notices, ...).
        $updates = [];
        if (isset($message['update-to']) && is_array($message['update-to'])) {
            foreach ($message['update-to'] as $up) {
                $updates[] = [
                    'type' => $up['type'] ?? '',
                    'label' => $up['label'] ?? '',
                    'DOI' => $up['DOI'] ?? '',
                    'date' => isset($up['updated']) ? $this->parseDateParts($up['updated']) : '',
                ];
            }
        }
        $info['updates'] = $updates;

        // Raw relation object.
        $info['relation'] = $message['relation'] ?? [];

        // Score field as delivered by CrossRef.
        $info['score'] = $message['score'] ?? 0;

        return $info;
    }

    /**
     * Look up a single DOI and return its structured metadata.
     *
     * Request parameters: doi (required).
     */
    public function queryDoi()
    {
        $data = $this->request->param();
        if (!isset($data['doi']) || trim($data['doi']) == '') {
            return jsonError('doi不能为空');
        }

        $doi = $this->cleanDoi($data['doi']);
        $res = $this->fetchCrossRefData($doi);
        if (!$res['success']) {
            return jsonError($res['error']);
        }

        $parsed = $this->parseCrossRefMessage($doi, $res['message']);

        return jsonSuccess($parsed);
    }

    /**
     * Look up several DOIs and return the structured metadata of each.
     *
     * Request parameters: dois (required) — comma-separated list, at most 50 entries.
     */
    public function batchQueryDois()
    {
        $data = $this->request->param();
        if (!isset($data['dois']) || trim($data['dois']) == '') {
            return jsonError('dois不能为空');
        }

        $doiList = array_filter(array_map('trim', explode(',', $data['dois'])));
        if (empty($doiList)) {
            return jsonError('未提供有效的DOI');
        }
        if (count($doiList) > 50) {
            return jsonError('单次最多查询50个DOI');
        }

        $results = [];
        $retractedCount = 0;
        foreach ($doiList as $rawDoi) {
            $doi = $this->cleanDoi($rawDoi);
            $res = $this->fetchCrossRefData($doi);
            if (!$res['success']) {
                $results[] = ['doi' => $doi, 'success' => false, 'error' => $res['error']];
            } else {
                $parsed = $this->parseCrossRefMessage($doi, $res['message']);
                $parsed['success'] = true;
                if ($parsed['is_retracted']) {
                    $retractedCount++;
                }
                $results[] = $parsed;
            }
            // 0.2 s pause between CrossRef requests.
            usleep(200000);
        }

        return jsonSuccess([
            'total' => count($results),
            'retracted_count' => $retractedCount,
            'list' => $results,
        ]);
    }

    /**
     * Query CrossRef for every reference row and collect the per-reference results.
     *
     * Shared by checkArticleReferences() and checkReferencesByArticleId(). Rows whose
     * DOI is empty after cleaning are skipped; a 0.2 s pause follows each request.
     *
     * @param mixed $refers iterable of reference rows (p_refer_id, index, refer_doi, refer_content)
     * @return array ['list' => array, 'retracted_count' => int, 'error_count' => int]
     */
    private function checkReferList($refers)
    {
        $list = [];
        $retractedCount = 0;
        $errorCount = 0;

        foreach ($refers as $refer) {
            $doi = $this->cleanDoi($refer['refer_doi']);
            if ($doi == '') {
                continue;
            }

            $item = [
                'p_refer_id' => $refer['p_refer_id'],
                'index' => $refer['index'],
                'refer_doi' => $doi,
                'refer_content' => $refer['refer_content'] ?? '',
            ];

            $res = $this->fetchCrossRefData($doi);
            if (!$res['success']) {
                $item['crossref_success'] = false;
                $item['crossref_error'] = $res['error'];
                $errorCount++;
            } else {
                $parsed = $this->parseCrossRefMessage($doi, $res['message']);
                $item['crossref_success'] = true;
                $item['crossref'] = $parsed;
                if ($parsed['is_retracted']) {
                    $retractedCount++;
                }
            }

            $list[] = $item;
            usleep(200000);
        }

        return [
            'list' => $list,
            'retracted_count' => $retractedCount,
            'error_count' => $errorCount,
        ];
    }

    /**
     * Check all references of a production article: full CrossRef metadata plus retraction flag.
     *
     * Request parameters: p_article_id (required).
     */
    public function checkArticleReferences()
    {
        $data = $this->request->param();
        if (!isset($data['p_article_id']) || $data['p_article_id'] == '') {
            return jsonError('p_article_id不能为空');
        }

        $pArticleId = intval($data['p_article_id']);
        $refers = $this->production_article_refer_obj
            ->where('p_article_id', $pArticleId)
            ->where('state', 0)
            ->where('refer_doi', '<>', '')
            ->select();

        if (empty($refers)) {
            return jsonSuccess([
                'p_article_id' => $pArticleId,
                'total_checked' => 0,
                'retracted_count' => 0,
                'list' => [],
            ]);
        }

        $checked = $this->checkReferList($refers);

        return jsonSuccess([
            'p_article_id' => $pArticleId,
            'total_checked' => count($checked['list']),
            'retracted_count' => $checked['retracted_count'],
            'error_count' => $checked['error_count'],
            'list' => $checked['list'],
        ]);
    }

    /**
     * Same check as checkArticleReferences(), addressed by article_id: the production
     * record is resolved first, then its references are checked.
     *
     * Request parameters: article_id (required).
     */
    public function checkReferencesByArticleId()
    {
        $data = $this->request->param();
        if (!isset($data['article_id']) || $data['article_id'] == '') {
            return jsonError('article_id不能为空');
        }

        $articleId = intval($data['article_id']);
        $pInfo = $this->production_article_obj
            ->where('article_id', $articleId)
            ->where('state', 0)
            ->find();
        if (!$pInfo) {
            return jsonError('未找到该文章的生产信息');
        }

        $refers = $this->production_article_refer_obj
            ->where('p_article_id', $pInfo['p_article_id'])
            ->where('state', 0)
            ->where('refer_doi', '<>', '')
            ->select();

        if (empty($refers)) {
            return jsonSuccess([
                'article_id' => $articleId,
                'p_article_id' => $pInfo['p_article_id'],
                'total_checked' => 0,
                'retracted_count' => 0,
                'list' => [],
            ]);
        }

        $checked = $this->checkReferList($refers);

        return jsonSuccess([
            'article_id' => $articleId,
            'p_article_id' => $pInfo['p_article_id'],
            'total_checked' => count($checked['list']),
            'retracted_count' => $checked['retracted_count'],
            'error_count' => $checked['error_count'],
            'list' => $checked['list'],
        ]);
    }

    // ========== Scraping corresponding-author e-mails from the DOI landing page ==========

    /**
     * Follow the doi.org redirect chain and download the publisher landing page.
     *
     * @param string $doi bare DOI
     * @return array ['success' => true, 'html' => string, 'final_url' => string] or
     *               ['success' => false, 'error' => string]
     */
    private function fetchPageByDoi($doi)
    {
        $url = 'https://doi.org/' . $doi;

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
        // NOTE(review): certificate verification is switched off here as well — confirm this is still required.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        curl_setopt($ch, CURLOPT_HTTPHEADER, [
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language: en-US,en;q=0.5',
        ]);

        $html = curl_exec($ch);
        $finalUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if (curl_errno($ch)) {
            $error = curl_error($ch);
            curl_close($ch);
            return ['success' => false, 'error' => 'CURL错误: ' . $error];
        }
        curl_close($ch);

        if ($httpCode != 200) {
            return ['success' => false, 'error' => 'HTTP ' . $httpCode];
        }

        return ['success' => true, 'html' => $html, 'final_url' => $finalUrl];
    }

    /**
     * Map the final landing URL to a publisher key.
     *
     * The first map entry whose domain fragment occurs anywhere in the host wins.
     *
     * @param string $url final URL after redirects
     * @return string publisher key, or 'unknown'
     */
    private function detectPublisher($url)
    {
        $host = strtolower(parse_url($url, PHP_URL_HOST) ?: '');

        $map = [
            'mdpi.com' => 'mdpi',
            'springer.com' => 'springer',
            'springerlink.com' => 'springer',
            'nature.com' => 'springer',
            'biomedcentral.com' => 'springer',
            'sciencedirect.com' => 'elsevier',
            'elsevier.com' => 'elsevier',
            'wiley.com' => 'wiley',
            'onlinelibrary.wiley' => 'wiley',
            'frontiersin.org' => 'frontiers',
            'tandfonline.com' => 'taylor_francis',
            'sagepub.com' => 'sage',
            'oup.com' => 'oxford',
            'plos.org' => 'plos',
            'hindawi.com' => 'hindawi',
            'cell.com' => 'cell',
            'jci.org' => 'jci',
            'asm.org' => 'asm',
            'iucr.org' => 'iucr',
            'rsc.org' => 'rsc',
            'acs.org' => 'acs',
        ];

        // NOTE(review): substring matching also accepts unrelated hosts that merely contain a fragment (e.g. "...group.com" contains "oup.com").
        foreach ($map as $domain => $publisher) {
            if (strpos($host, $domain) !== false) {
                return $publisher;
            }
        }

        return 'unknown';
    }

    /**
     * Extract every e-mail-looking string from the HTML, dropping system/publisher
     * addresses and matches that contain an asset extension.
     *
     * @param string $html page source
     * @return string[] remaining addresses
     */
    private function extractEmails($html)
    {
        $all = [];
        if (preg_match_all('/[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/', $html, $m)) {
            $all = array_unique($m[0]);
        }

        $systemKeywords = [
            'noreply', 'no-reply', 'support', 'info@', 'admin@', 'webmaster',
            'editor@', 'editorial@', 'help@', 'contact@', 'privacy@',
            'service@', 'marketing@', 'feedback@', 'copyright@', 'permissions@',
            'mdpi.com', 'springer.com', 'elsevier.com', 'wiley.com',
            'frontiersin.org', 'nature.com', 'oup.com', 'sagepub.com',
            'tandfonline.com', 'plos.org', 'hindawi.com', 'biomedcentral.com',
            'crossref.org', 'doi.org', 'example.com', 'sentry.io',
            'acs.org', 'rsc.org',
        ];

        $filtered = [];
        foreach ($all as $email) {
            $lower = strtolower($email);
            $skip = false;

            foreach ($systemKeywords as $kw) {
                if (strpos($lower, $kw) !== false) {
                    $skip = true;
                    break;
                }
            }

            // Matches such as "logo@2x.png" come from asset file names, not from addresses.
            // NOTE(review): the test is a substring test, so "a.jsmith@uni.edu" is dropped too.
            if (strpos($lower, '.png') !== false
                || strpos($lower, '.jpg') !== false
                || strpos($lower, '.gif') !== false
                || strpos($lower, '.css') !== false
                || strpos($lower, '.js') !== false
            ) {
                $skip = true;
            }

            if (!$skip) {
                $filtered[] = $email;
            }
        }

        return $filtered;
    }

    /**
     * MDPI page parser.
     *
     * @param string $html page source
     * @return array ['corresponding_authors' => [['name', 'email'], ...], 'all_emails' => string[]]
     */
    private function parseMdpiEmail($html)
    {
        $result = ['corresponding_authors' => [], 'all_emails' => []];
        $result['all_emails'] = $this->extractEmails($html);

        // MDPI marks corresponding authors with "*" and labels the address with
        // "Author to whom correspondence should be addressed".
        // Names: a <span> immediately followed by "*", else capitalised words followed by "*".
        // NOTE(review): the opening "<span[^>" of this pattern was missing from the stored
        // source and has been restored — confirm against the original file.
        $corrNames = [];
        if (preg_match_all('/<span[^>]*>\s*([^<]+?)\s*<\/span>\s*\*/', $html, $m)) {
            $corrNames = array_map('trim', $m[1]);
        }
        if (empty($corrNames) && preg_match_all('/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\s*\*/', $html, $m)) {
            $corrNames = array_map('trim', $m[1]);
        }

        // mailto links, with MDPI's own addresses removed.
        $mailtoEmails = [];
        if (preg_match_all('/href=["\']mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})["\']/', $html, $m)) {
            $mailtoEmails = array_values(array_unique($m[1]));
        }
        $mailtoEmails = array_values(array_filter($mailtoEmails, function ($e) {
            return stripos($e, 'mdpi') === false;
        }));

        // Names and addresses are paired purely by position.
        foreach ($corrNames as $i => $name) {
            $entry = ['name' => $name, 'email' => ''];
            if (isset($mailtoEmails[$i])) {
                $entry['email'] = $mailtoEmails[$i];
            }
            $result['corresponding_authors'][] = $entry;
        }

        if (empty($result['corresponding_authors']) && !empty($mailtoEmails)) {
            foreach ($mailtoEmails as $email) {
                $result['corresponding_authors'][] = ['name' => '', 'email' => $email];
            }
        }

        return $result;
    }

    /**
     * Springer / Nature / BMC page parser.
     *
     * @param string $html page source
     * @return array ['corresponding_authors' => [['name', 'email'], ...], 'all_emails' => string[]]
     */
    private function parseSpringerEmail($html)
    {
        $result = ['corresponding_authors' => [], 'all_emails' => []];
        $result['all_emails'] = $this->extractEmails($html);

        // Springer: element carrying data-test="author-letter" with a mailto href.
        if (preg_match_all('/data-test=["\']author-letter["\'][^>]*href=["\']mailto:([^"\']+)["\']/', $html, $m)) {
            foreach ($m[1] as $email) {
                $result['corresponding_authors'][] = ['name' => '', 'email' => $email];
            }
        }

        // Otherwise: a mailto link inside a "Correspondence to" / "Corresponding author" area.
        if (empty($result['corresponding_authors'])) {
            if (preg_match('/[Cc]orrespond(?:ence|ing\s+author)[^<]{0,50}<[^>]*>([^<]*<[^>]*>)*?[^<]*?href=["\']mailto:([^"\']+)["\']/', $html, $m)) {
                $result['corresponding_authors'][] = ['name' => '', 'email' => $m[2]];
            }
        }

        // Otherwise: a plain-text address shortly after "Correspond...".
        if (empty($result['corresponding_authors'])) {
            $patterns = [
                '/[Cc]orrespond[a-z]*[^<]{0,300}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/',
            ];
            foreach ($patterns as $p) {
                if (preg_match_all($p, $html, $m)) {
                    foreach ($m[1] as $email) {
                        $clean = $this->extractEmails($email);
                        if (!empty($clean)) {
                            $result['corresponding_authors'][] = ['name' => '', 'email' => $clean[0]];
                        } elseif (filter_var($email, FILTER_VALIDATE_EMAIL)) {
                            $result['corresponding_authors'][] = ['name' => '', 'email' => $email];
                        }
                    }
                    break;
                }
            }
        }

        // Try to attach a name to the first entry.
        if (!empty($result['corresponding_authors'])) {
            if (preg_match('/[Cc]orrespond[a-z]*\s+(?:to|author)[:\s]*([A-Z][a-zA-Z\s.\-]+?)(?:\.|<|,)/', $html, $m)) {
                $name = trim($m[1]);
                if (strlen($name) > 2 && strlen($name) < 80) {
                    $result['corresponding_authors'][0]['name'] = $name;
                }
            }
        }

        return $result;
    }

    /**
     * Frontiers page parser.
     *
     * @param string $html page source
     * @return array ['corresponding_authors' => [['name', 'email'], ...], 'all_emails' => string[]]
     */
    private function parseFrontiersEmail($html)
    {
        $result = ['corresponding_authors' => [], 'all_emails' => []];
        $result['all_emails'] = $this->extractEmails($html);

        // Frontiers: "*Correspondence: Name, email@xxx.com", or "*Correspondence:" followed by a mailto link.
        // NOTE(review): the stored source was truncated between "(?:" and "');". The block
        // terminator alternatives, the /s flag, the foreach header and the strip_tags()
        // call below are a best-effort reconstruction — confirm against the original file.
        if (preg_match_all('/\*\s*[Cc]orrespondence:\s*(.*?)(?:<br|<\/p|<\/div|$)/s', $html, $m)) {
            foreach ($m[1] as $block) {
                $block = strip_tags($block, '<a>');

                $emails = [];
                if (preg_match_all('/([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $block, $em)) {
                    $emails = $em[1];
                }

                // Name: first comma/semicolon-separated piece that has no "@" and contains a capital letter.
                $plainText = strip_tags($block);
                $parts = preg_split('/[,;]/', $plainText);
                $name = '';
                foreach ($parts as $part) {
                    $part = trim($part);
                    if ($part && strpos($part, '@') === false && preg_match('/[A-Z]/', $part)) {
                        $name = $part;
                        break;
                    }
                }

                foreach ($emails as $email) {
                    $result['corresponding_authors'][] = ['name' => $name, 'email' => $email];
                }
            }
        }

        // Fallback: every mailto link, labelled with its link text, excluding Frontiers' own addresses.
        // NOTE(review): the tail of this pattern and the foreach header were restored from
        // the surviving fragments ("...([^<]*)" and "$email) {").
        if (empty($result['corresponding_authors'])) {
            if (preg_match_all('/href=["\']mailto:([^"\']+)["\'][^>]*>([^<]*)</i', $html, $m)) {
                foreach ($m[1] as $i => $email) {
                    $label = trim(strip_tags($m[2][$i]));
                    if (stripos($email, 'frontiersin') === false) {
                        $result['corresponding_authors'][] = ['name' => $label ?: '', 'email' => $email];
                    }
                }
            }
        }

        return $result;
    }

    /**
     * Wiley page parser.
     *
     * @param string $html page source
     * @return array ['corresponding_authors' => [['name', 'email'], ...], 'all_emails' => string[]]
     */
    private function parseWileyEmail($html)
    {
        $result = ['corresponding_authors' => [], 'all_emails' => []];
        $result['all_emails'] = $this->extractEmails($html);

        // Wiley: a "Correspondence" paragraph (or widget markup) containing a mailto link.
        if (preg_match('/[Cc]orrespond[a-z]*[:\s].*?href=["\']mailto:([^"\']+)["\'].*?<\/p/s', $html, $m)) {
            $result['corresponding_authors'][] = ['name' => '', 'email' => $m[1]];
        }

        if (empty($result['corresponding_authors'])) {
            if (preg_match('/[Cc]orrespond[a-z]*[^<]{0,500}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $html, $m)) {
                $result['corresponding_authors'][] = ['name' => '', 'email' => $m[1]];
            }
        }

        if (!empty($result['corresponding_authors'])) {
            if (preg_match('/[Cc]orrespond[a-z]*[:\s]*([A-Z][a-zA-Z.\-\s]{2,40}?),/', $html, $m)) {
                $result['corresponding_authors'][0]['name'] = trim($m[1]);
            }
        }

        return $result;
    }

    /**
     * Elsevier / ScienceDirect page parser.
     *
     * @param string $html page source
     * @return array ['corresponding_authors' => [['name', 'email'], ...], 'all_emails' => string[]]
     */
    private function parseElsevierEmail($html)
    {
        $result = ['corresponding_authors' => [], 'all_emails' => []];
        $result['all_emails'] = $this->extractEmails($html);

        // ScienceDirect: element whose class contains "corresponding", followed by a mailto link.
        if (preg_match_all('/class="[^"]*corresponding[^"]*"[^>]*>.*?href=["\']mailto:([^"\']+)["\']/si', $html, $m)) {
            foreach ($m[1] as $email) {
                $result['corresponding_authors'][] = ['name' => '', 'email' => $email];
            }
        }

        // Otherwise: data-* attribute starting with "corresponding"; yields names without addresses.
        if (empty($result['corresponding_authors'])) {
            if (preg_match_all('/data-[a-z\-]*=["\']corresponding[^"\']*["\'][^>]*>([^<]+)/i', $html, $m)) {
                foreach ($m[1] as $name) {
                    $result['corresponding_authors'][] = ['name' => trim(strip_tags($name)), 'email' => ''];
                }
            }
        }

        if (empty($result['corresponding_authors'])) {
            if (preg_match('/[Cc]orrespond[a-z]*[^<]{0,300}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $html, $m)) {
                $result['corresponding_authors'][] = ['name' => '', 'email' => $m[1]];
            }
        }

        return $result;
    }

    /**
     * Taylor & Francis page parser.
     *
     * @param string $html page source
     * @return array ['corresponding_authors' => [['name', 'email'], ...], 'all_emails' => string[]]
     */
    private function parseTaylorFrancisEmail($html)
    {
        $result = ['corresponding_authors' => [], 'all_emails' => []];
        $result['all_emails'] = $this->extractEmails($html);

        // T&F: an element whose class contains "corresp", or a "CONTACT" section.
        if (preg_match('/class="[^"]*corresp[^"]*"[^>]*>(.*?)<\/span>/si', $html, $m)) {
            $block = $m[1];
            if (preg_match('/([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $block, $em)) {
                // Whatever precedes the address inside the block is taken as the name.
                $name = trim(strip_tags(preg_replace('/[a-zA-Z0-9._%+\-]+@.*/', '', $block)));
                $result['corresponding_authors'][] = ['name' => $name, 'email' => $em[1]];
            }
        }

        if (empty($result['corresponding_authors'])) {
            if (preg_match('/CONTACT\s+(.*?)([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/s', $html, $m)) {
                $name = trim(strip_tags($m[1]));
                $result['corresponding_authors'][] = ['name' => $name, 'email' => $m[2]];
            }
        }

        return $result;
    }

    /**
     * PLOS page parser.
     *
     * @param string $html page source
     * @return array ['corresponding_authors' => [['name', 'email'], ...], 'all_emails' => string[]]
     */
    private function parsePlosEmail($html)
    {
        $result = ['corresponding_authors' => [], 'all_emails' => []];
        $result['all_emails'] = $this->extractEmails($html);

        // PLOS: "* E-mail: xxx@yyy.com"
        if (preg_match_all('/\*\s*E-?mail:\s*([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $html, $m)) {
            foreach ($m[1] as $email) {
                $result['corresponding_authors'][] = ['name' => '', 'email' => $email];
            }
        }

        return $result;
    }

    /**
     * Generic fallback parser used for unknown publishers and when a dedicated parser finds nothing.
     *
     * @param string $html page source
     * @return array ['corresponding_authors' => [['name', 'email'], ...], 'all_emails' => string[]]
     */
    private function parseGenericEmail($html)
    {
        $result = ['corresponding_authors' => [], 'all_emails' => []];
        $result['all_emails'] = $this->extractEmails($html);

        // Strategy 1: an address near "Correspondence" / "Corresponding author".
        // Patterns are tried in order; the first one that yields an address wins.
        $corrPatterns = [
            '/[Cc]orrespond(?:ing\s+author|ence)[:\s]*(?:<[^>]*>)*\s*(?:<[^>]*>)*\s*([^<]*?)\s*(?:<[^>]*>)*\s*(?:href=["\'])?mailto:([^"\'>\s]+)/s',
            '/[Cc]orrespond[a-z]*[^<]{0,500}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/',
            '/\*\s*(?:E-?mail|Correspondence)[:\s]*([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/',
        ];

        foreach ($corrPatterns as $pattern) {
            if (preg_match_all($pattern, $html, $m)) {
                // The address is always the last capture group of a pattern.
                $lastGroup = end($m);
                foreach ($lastGroup as $val) {
                    if (filter_var($val, FILTER_VALIDATE_EMAIL)) {
                        $alreadyExists = false;
                        foreach ($result['corresponding_authors'] as $existing) {
                            if ($existing['email'] === $val) {
                                $alreadyExists = true;
                                break;
                            }
                        }
                        if (!$alreadyExists) {
                            $result['corresponding_authors'][] = ['name' => '', 'email' => $val];
                        }
                    }
                }
            }
            if (!empty($result['corresponding_authors'])) {
                break;
            }
        }

        // Strategy 2: every mailto link that does not look like a system address.
        if (empty($result['corresponding_authors'])) {
            $mailtoEmails = [];
            if (preg_match_all('/href=["\']mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})["\']/', $html, $m)) {
                $mailtoEmails = array_unique($m[1]);
            }

            $filtered = array_values(array_filter($mailtoEmails, function ($email) {
                $lower = strtolower($email);
                $skip = ['noreply', 'support', 'info@', 'admin', 'editor', 'help@', 'contact@', 'privacy', 'service', 'marketing', 'copyright', 'permission'];
                foreach ($skip as $kw) {
                    if (strpos($lower, $kw) !== false) {
                        return false;
                    }
                }
                return true;
            }));

            foreach ($filtered as $email) {
                $result['corresponding_authors'][] = ['name' => '', 'email' => $email];
            }
        }

        return $result;
    }

    /**
     * Main scraping routine: resolve the DOI to its landing page, choose a parser by
     * publisher, and fall back to the generic parser when nothing was found.
     *
     * @param string $doi raw DOI (cleaned internally)
     * @return array on success: success, doi, url, publisher, corresponding_authors, all_emails;
     *               on failure: success = false plus error (and doi when it was non-empty)
     */
    private function scrapeCorrespondingAuthor($doi)
    {
        $doi = $this->cleanDoi($doi);
        if ($doi == '') {
            return ['success' => false, 'error' => 'DOI为空'];
        }

        $page = $this->fetchPageByDoi($doi);
        if (!$page['success']) {
            return ['success' => false, 'doi' => $doi, 'error' => $page['error']];
        }

        $finalUrl = $page['final_url'];
        $html = $page['html'];
        $publisher = $this->detectPublisher($finalUrl);

        switch ($publisher) {
            case 'mdpi':
                $parsed = $this->parseMdpiEmail($html);
                break;
            case 'springer':
                $parsed = $this->parseSpringerEmail($html);
                break;
            case 'frontiers':
                $parsed = $this->parseFrontiersEmail($html);
                break;
            case 'wiley':
                $parsed = $this->parseWileyEmail($html);
                break;
            case 'elsevier':
                $parsed = $this->parseElsevierEmail($html);
                break;
            case 'taylor_francis':
                $parsed = $this->parseTaylorFrancisEmail($html);
                break;
            case 'plos':
                $parsed = $this->parsePlosEmail($html);
                break;
            default:
                $parsed = $this->parseGenericEmail($html);
                break;
        }

        // If the dedicated parser found nothing, fall back to the generic strategy.
        if (empty($parsed['corresponding_authors'])) {
            $generic = $this->parseGenericEmail($html);
            $parsed['corresponding_authors'] = $generic['corresponding_authors'];
        }

        return [
            'success' => true,
            'doi' => $doi,
            'url' => $finalUrl,
            'publisher' => $publisher,
            'corresponding_authors' => $parsed['corresponding_authors'],
            'all_emails' => $parsed['all_emails'],
        ];
    }

    /**
     * Get the corresponding-author e-mail(s) for a single DOI.
     *
     * Request parameters: doi (required).
     */
    public function getAuthorEmail()
    {
        $data = $this->request->param();
        if (!isset($data['doi']) || trim($data['doi']) == '') {
            return jsonError('doi不能为空');
        }

        $result = $this->scrapeCorrespondingAuthor($data['doi']);
        if (!$result['success']) {
            return jsonError($result['error']);
        }

        return jsonSuccess($result);
    }

    /**
     * Get the corresponding-author e-mail(s) for several DOIs.
     *
     * Request parameters: dois (required) — comma-separated list, at most 20 entries.
     */
    public function batchGetAuthorEmails()
    {
        $data = $this->request->param();
        if (!isset($data['dois']) || trim($data['dois']) == '') {
            return jsonError('dois不能为空');
        }

        $doiList = array_filter(array_map('trim', explode(',', $data['dois'])));
        if (empty($doiList)) {
            return jsonError('未提供有效的DOI');
        }
        if (count($doiList) > 20) {
            return jsonError('单次最多查询20个DOI');
        }

        $results = [];
        $successCount = 0;
        $emailFoundCount = 0;
        foreach ($doiList as $rawDoi) {
            $result = $this->scrapeCorrespondingAuthor($rawDoi);
            if ($result['success']) {
                $successCount++;
                if (!empty($result['corresponding_authors'])) {
                    $emailFoundCount++;
                }
            }
            $results[] = $result;
            // 0.5 s pause between page fetches.
            usleep(500000);
        }

        return jsonSuccess([
            'total' => count($results),
            'success_count' => $successCount,
            'email_found_count' => $emailFoundCount,
            'list' => $results,
        ]);
    }

    /**
     * Report how far the field -> major conversion has progressed.
     *
     * Returns the number of active reviewers, those with a non-empty field, the
     * grouped count over major_to_user, and reviewers that have a field but no
     * major_to_user row.
     */
    public function statistics()
    {
        $totalReviewers = $this->user_reviewer_info_obj
            ->where('state', 0)
            ->count();

        $hasField = $this->user_reviewer_info_obj
            ->where('state', 0)
            ->where('field', '<>', '')
            ->count();

        // NOTE(review): intended as the number of distinct users — confirm how count() behaves together with group() in this framework version.
        $hasMajorToUser = Db::name('major_to_user')
            ->where('state', 0)
            ->group('user_id')
            ->count();

        $hasFieldNoMajor = $this->user_reviewer_info_obj
            ->alias('ri')
            ->where('ri.state', 0)
            ->where('ri.field', '<>', '')
            ->where('ri.reviewer_id', 'not in', Db::name('major_to_user')->where('state', 0)->field('user_id')->buildSql())
            ->count();

        return jsonSuccess([
            'total_reviewers' => $totalReviewers,
            'has_field' => $hasField,
            'has_major_to_user' => $hasMajorToUser,
            'has_field_no_major' => $hasFieldNoMajor,
        ]);
    }
}