Files
tougao/application/api/controller/Agent.php
wangjinlei d31279e684 agent功能
2026-03-04 15:22:50 +08:00

1475 lines
50 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\api\controller;
use think\Db;
use think\Cache;
use think\Env;
class Agent extends Base
{
/**
 * Build the controller by forwarding the request to the parent Base constructor.
 *
 * @param \think\Request|null $request Request instance handed unchanged to the parent.
 */
public function __construct(\think\Request $request = null)
{
parent::__construct($request);
}
/**
 * Return every leaf major together with its full ancestor path, for AI matching.
 *
 * Only rows with major_state = 0 and major_type = 0 are read. A major counts as a
 * leaf when no other selected row names it as its pid. The result is cached under
 * the key "agent_major_tree" for 3600 seconds; a falsy cache value (including an
 * empty list) triggers a rebuild.
 *
 * @return array List of ['major_id' => ..., 'major_title' => ..., 'full_path' => string].
 */
private function getMajorTree()
{
    $cacheKey = 'agent_major_tree';
    $cached = Cache::get($cacheKey);
    if ($cached) {
        return $cached;
    }

    $allMajors = $this->major_obj
        ->where('major_state', 0)
        ->where('major_type', 0)
        ->select();

    // One pass: index rows by id and remember every id that is used as a parent.
    // This replaces the previous nested scan (rows x rows) for child detection.
    $majorMap = [];
    $parentIds = [];
    foreach ($allMajors as $m) {
        $majorMap[$m['major_id']] = $m;
        $parentIds[$m['pid']] = true;
    }

    $result = [];
    foreach ($allMajors as $m) {
        if (isset($parentIds[$m['major_id']])) {
            // Some row points at this one as its parent, so it is not a leaf.
            continue;
        }
        $result[] = [
            'major_id'    => $m['major_id'],
            'major_title' => $m['major_title'],
            'full_path'   => $this->buildMajorPath($m['major_id'], $majorMap),
        ];
    }

    Cache::set($cacheKey, $result, 3600);
    return $result;
}
/**
 * Build the " > "-separated title path from the topmost known ancestor down to a major.
 *
 * The walk stops at a row whose pid is 0 or 1, at a parent that is missing from
 * the map, or when a row is met twice (a pid cycle), so malformed data cannot
 * cause unbounded recursion.
 *
 * @param mixed $majorId  Id of the major to describe.
 * @param array $majorMap Rows keyed by major_id; each row needs 'pid' and 'major_title'.
 *                        Taken by reference only to avoid copying; it is not modified.
 * @return string The path, or '' when $majorId is not in the map.
 */
private function buildMajorPath($majorId, &$majorMap)
{
    if (!isset($majorMap[$majorId])) {
        return '';
    }

    $titles = [];
    $visited = [];
    $currentId = $majorId;
    while (isset($majorMap[$currentId]) && !isset($visited[$currentId])) {
        $visited[$currentId] = true;
        $node = $majorMap[$currentId];
        // Ancestors are discovered bottom-up, so prepend to keep root-first order.
        array_unshift($titles, $node['major_title']);
        if ($node['pid'] == 0 || $node['pid'] == 1) {
            break;
        }
        $currentId = $node['pid'];
    }

    return implode(' > ', $titles);
}
/**
 * Render the major list as prompt text for the AI, one "ID:<id> - <path>" entry per line.
 *
 * @param array $majorTree Entries carrying 'major_id' and 'full_path'.
 * @return string Newline-joined lines; '' for an empty list.
 */
private function buildMajorListPrompt($majorTree)
{
    $text = '';
    $isFirst = true;
    foreach ($majorTree as $entry) {
        if (!$isFirst) {
            $text .= "\n";
        }
        $text .= 'ID:' . $entry['major_id'] . ' - ' . $entry['full_path'];
        $isFirst = false;
    }
    return $text;
}
/**
 * Ask the chat-completions endpoint to map a free-text research field to major ids.
 *
 * The model is instructed to answer with a bare JSON array of ids. The first
 * "[...]" run made only of digits, commas and whitespace is extracted from the
 * reply and decoded.
 *
 * @param string $field           User-supplied description of the research field.
 * @param string $majorListPrompt Text listing the candidate majors (see buildMajorListPrompt()).
 * @return int[] Distinct positive ids in the order the model gave them; [] on any
 *               transport, encoding or parsing failure, or when nothing matched.
 */
private function matchFieldToMajor($field, $majorListPrompt)
{
    $systemPrompt = "你是一位医学领域分类专家。用户会提供一段研究领域的描述文本你需要从给定的标准领域列表中找出最匹配的1-3个领域。\n"
        . "请严格按照JSON数组格式返回匹配结果只返回major_id数组如 [12,34,56]。\n"
        . "如果没有合适的匹配,返回空数组 []。\n"
        . "不要返回任何其他内容只返回JSON数组。\n\n"
        . "标准领域列表:\n" . $majorListPrompt;
    $userPrompt = "请为以下研究领域描述匹配最合适的标准领域ID\n" . $field;

    $payload = json_encode([
        'model'       => 'gpt-4.1',
        'messages'    => [
            ['role' => 'system', 'content' => $systemPrompt],
            ['role' => 'user', 'content' => $userPrompt],
        ],
        'temperature' => 0.1,
        'max_tokens'  => 200,
    ]);
    if ($payload === false) {
        // json_encode fails on e.g. invalid UTF-8 in $field; do not post a broken body.
        return [];
    }

    $apiKey = Env::get("gpt.api_key1", Env::get("gpt.api_key", ""));
    // NOTE(review): this endpoint is plain HTTP, so the bearer token travels
    // unencrypted. Prefer an https:// URL if the gateway supports it. The former
    // CURLOPT_SSL_VERIFYPEER=false setting was dropped: it has no effect on an
    // http:// URL and would silently disable certificate checks after a switch to https.
    $url = 'http://chat.taimed.cn/v1/chat/completions';

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HTTPHEADER, [
        'Content-Type: application/json',
        'Authorization: Bearer ' . $apiKey,
    ]);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);
    $result = curl_exec($ch);
    $failed = curl_errno($ch) !== 0 || !is_string($result);
    curl_close($ch);
    if ($failed) {
        return [];
    }

    $res = json_decode($result, true);
    if (!isset($res['choices'][0]['message']['content'])) {
        return [];
    }
    $content = trim($res['choices'][0]['message']['content']);

    // Extract the JSON array from the reply.
    if (!preg_match('/\[[\d,\s]*\]/', $content, $matches)) {
        return [];
    }
    $ids = json_decode($matches[0], true);
    if (!is_array($ids)) {
        return [];
    }

    // The model may repeat an id or emit 0; keep each positive id once.
    $ids = array_filter(array_map('intval', $ids), function ($id) {
        return $id > 0;
    });
    return array_values(array_unique($ids));
}
/**
 * Insert the user's matched majors that are not already linked to the user.
 *
 * Existing links are read with state = 0. Ids that already exist, and repeated
 * ids within $majorIds, are skipped so the same pair is never inserted twice by
 * one call.
 *
 * @param int   $userId   User id stored in the user_id column.
 * @param array $majorIds Major ids to link.
 * @return int Number of rows inserted.
 */
private function saveMajorToUser($userId, $majorIds)
{
    $existing = $this->major_to_user_obj
        ->where('user_id', $userId)
        ->where('state', 0)
        ->column('major_id');

    // array_diff keeps duplicates from its first argument, so de-duplicate first.
    $toInsert = array_diff(array_unique($majorIds), $existing);
    $now = time();
    foreach ($toInsert as $majorId) {
        $this->major_to_user_obj->insert([
            'user_id'  => $userId,
            'major_id' => $majorId,
            'ctime'    => $now,
        ]);
    }
    return count($toInsert);
}
/**
 * Match one user's free-text field to standard majors and return the combined list.
 *
 * Request parameters:
 *   - user_id: reviewer id; an active (state = 0) reviewer-info row must exist.
 *   - field:   free-text research field; must be non-empty after trimming.
 *
 * The response lists the AI-matched majors that exist with major_state = 0,
 * merged with the majors already linked to the user. This endpoint does not
 * write to major_to_user: the saveMajorToUser() call was commented out in the
 * original code and remains disabled.
 *
 * @return mixed Result of jsonSuccess()/jsonError().
 */
public function processOneUser()
{
    $data = $this->request->param();
    if (!isset($data['user_id']) || $data['user_id'] == '') {
        return jsonError('user_id不能为空');
    }

    // Trim before validating so a whitespace-only field is rejected instead of
    // being sent to the AI as an empty description.
    $field = (isset($data['field']) && is_scalar($data['field'])) ? trim((string)$data['field']) : '';
    if ($field == '') {
        return jsonError('field不能为空');
    }

    $userId = intval($data['user_id']);
    $reviewerInfo = $this->user_reviewer_info_obj
        ->where('reviewer_id', $userId)
        ->where('state', 0)
        ->find();
    if (!$reviewerInfo) {
        return jsonError('未找到该用户的reviewer信息');
    }

    $majorTree = $this->getMajorTree();
    if (empty($majorTree)) {
        return jsonError('未获取到标准领域数据');
    }

    $majorListPrompt = $this->buildMajorListPrompt($majorTree);
    $matchedIds = $this->matchFieldToMajor($field, $majorListPrompt);
    if (empty($matchedIds)) {
        return jsonSuccess([
            'user_id'     => $userId,
            'field'       => $field,
            'matched_ids' => [],
            'inserted'    => 0,
            'msg'         => 'AI未匹配到合适的领域',
        ]);
    }

    // Keep only ids that really exist and are active.
    $validMajors = $this->major_obj
        ->where('major_id', 'in', $matchedIds)
        ->where('major_state', 0)
        ->column('major_id');
    $existing = $this->major_to_user_obj
        ->where('user_id', $userId)
        ->where('state', 0)
        ->column('major_id');
    $unionArray = array_unique(array_merge($validMajors, $existing));

    if (empty($unionArray)) {
        // Nothing valid was matched and nothing is linked yet; skip the IN () query.
        return jsonSuccess([
            'user_id' => $userId,
            'field'   => $field,
            'majors'  => [],
        ]);
    }

    $ms = $this->major_obj->where('major_id', 'in', $unionArray)->where('major_state', 0)->select();
    foreach ($ms as $k => $major) {
        $ms[$k]['shu'] = getMajorShu($major['major_id']);
        $ms[$k]['str'] = getMajorStr($major['major_id']);
    }

    return jsonSuccess([
        'user_id' => $userId,
        'field'   => $field,
        'majors'  => $ms,
    ]);
}
/**
 * Batch job: pick reviewers that have a field text and match each one to majors via the AI.
 *
 * Request parameters:
 *   - limit:          users per call; default 10, capped at 50, values below 1 fall back to 10.
 *   - skip_has_major: when truthy (default 1) users that already own an active
 *                     major_to_user row are excluded.
 *
 * Matched ids are intersected with the active majors and stored through
 * saveMajorToUser(). Users for whom nothing matched get no row, so with
 * skip_has_major on they will be selected again by a later call.
 *
 * @return mixed Result of jsonSuccess()/jsonError() with per-user details.
 */
public function batchProcess()
{
    $data = $this->request->param();
    $limit = isset($data['limit']) ? intval($data['limit']) : 10;
    $skipHasMajor = isset($data['skip_has_major']) ? intval($data['skip_has_major']) : 1;
    if ($limit < 1) {
        // A zero/negative LIMIT is either a no-op or invalid SQL; use the default.
        $limit = 10;
    }
    if ($limit > 50) {
        $limit = 50;
    }

    $query = $this->user_reviewer_info_obj
        ->alias('ri')
        ->field('ri.reviewer_id, ri.field')
        ->where('ri.state', 0)
        ->where('ri.field', '<>', '');
    if ($skipHasMajor) {
        // Use the closure form of a sub-query, which ThinkPHP documents for
        // IN / NOT IN. A buildSql() string passed as the value is handled as a
        // literal value list, so the exclusion would not run as a sub-query.
        $query = $query->where('ri.reviewer_id', 'not in', function ($sub) {
            $sub->name('major_to_user')->where('state', 0)->field('user_id');
        });
    }
    $users = $query->limit($limit)->select();
    if (empty($users)) {
        return jsonSuccess([
            'processed' => 0,
            'msg'       => '没有需要处理的用户',
        ]);
    }

    $majorTree = $this->getMajorTree();
    if (empty($majorTree)) {
        return jsonError('未获取到标准领域数据');
    }
    $majorListPrompt = $this->buildMajorListPrompt($majorTree);
    $validMajorIds = $this->major_obj->where('major_state', 0)->column('major_id');

    $results = [];
    $successCount = 0;
    $failCount = 0;
    foreach ($users as $user) {
        $field = trim($user['field']);
        if ($field == '') {
            continue;
        }

        $matchedIds = $this->matchFieldToMajor($field, $majorListPrompt);
        $matchedIds = array_intersect($matchedIds, $validMajorIds);

        $inserted = 0;
        if (!empty($matchedIds)) {
            $inserted = $this->saveMajorToUser($user['reviewer_id'], $matchedIds);
            $successCount++;
        } else {
            $failCount++;
        }

        $results[] = [
            'user_id'     => $user['reviewer_id'],
            // Truncate long texts so the response stays readable.
            'field'       => mb_substr($field, 0, 100),
            'matched_ids' => array_values($matchedIds),
            'inserted'    => $inserted,
        ];
    }

    return jsonSuccess([
        'processed'     => count($results),
        'success_count' => $successCount,
        'fail_count'    => $failCount,
        'details'       => $results,
    ]);
}
/**
 * Debug endpoint: return the leaf-major list produced by getMajorTree() and its size.
 *
 * @return mixed Result of jsonSuccess() with 'total' and 'list'.
 */
public function getMajorList()
{
    $leaves = $this->getMajorTree();
    $payload = [
        'total' => count($leaves),
        'list'  => $leaves,
    ];
    return jsonSuccess($payload);
}
/**
 * Read public/system/t_major.xlsx and return its header row plus a preview.
 *
 * Row 1 supplies the column names; every following row becomes a map keyed by
 * those names. Nothing is written to the database here: the method only
 * returns the headers, the row count and the first 20 rows.
 *
 * @return mixed Result of jsonSuccess()/jsonError().
 */
public function importMajorFromExcel()
{
    $path = ROOT_PATH . 'public' . DS . 'system' . DS . 't_major.xlsx';
    if (!file_exists($path)) {
        return jsonError('Excel文件不存在: public/system/t_major.xlsx');
    }

    $sheet = \PhpOffice\PhpSpreadsheet\IOFactory::load($path)->getActiveSheet();
    $lastRow = $sheet->getHighestRow();
    $columnTotal = \PhpOffice\PhpSpreadsheet\Cell\Coordinate::columnIndexFromString($sheet->getHighestColumn());

    // Read one worksheet row as [columnIndex => cellValue], columns numbered from 1.
    $readRow = function ($rowNumber) use ($sheet, $columnTotal) {
        $cells = [];
        for ($c = 1; $c <= $columnTotal; $c++) {
            $cells[$c] = $sheet->getCellByColumnAndRow($c, $rowNumber)->getValue();
        }
        return $cells;
    };

    $headers = $readRow(1);
    $records = [];
    for ($r = 2; $r <= $lastRow; $r++) {
        $record = [];
        foreach ($readRow($r) as $c => $value) {
            $record[$headers[$c]] = $value;
        }
        $records[] = $record;
    }

    return jsonSuccess([
        'headers' => array_values($headers),
        'total'   => count($records),
        'preview' => array_slice($records, 0, 20),
    ]);
}
// ========== CrossRef DOI 查询 & 撤稿检测 ==========
/**
 * Normalise a DOI by stripping resolver-URL and "doi:" prefixes.
 *
 * Handles http/https, doi.org / dx.doi.org / www.doi.org, any letter case, and a
 * "doi:" label placed either before or after the resolver URL.
 *
 * @param string $doi Raw DOI as typed or stored.
 * @return string The bare DOI (e.g. "10.1000/xyz"); may be '' when nothing remains.
 */
private function cleanDoi($doi)
{
    $doi = trim((string)$doi);
    $doi = preg_replace('/^doi:\s*/i', '', $doi);
    $doi = preg_replace('/^https?:\/\/(?:dx\.|www\.)?doi\.org\//i', '', $doi);
    // A "doi:" label can also follow the resolver prefix (".../doi:10.x").
    $doi = preg_replace('/^doi:\s*/i', '', $doi);
    return trim($doi);
}
/**
 * Request one work from the CrossRef REST API and return its raw "message" record.
 *
 * @param string $doi Cleaned DOI; it is URL-encoded into the request path.
 * @return array ['success' => true, 'message' => array] when CrossRef answers 200
 *               with a "message" key, otherwise ['success' => false, 'error' => string].
 */
private function fetchCrossRefData($doi)
{
    $handle = curl_init();
    curl_setopt_array($handle, [
        CURLOPT_URL            => 'https://api.crossref.org/works/' . urlencode($doi),
        CURLOPT_RETURNTRANSFER => true,
        // NOTE(review): certificate verification is switched off here; confirm this is intended.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_TIMEOUT        => 30,
        CURLOPT_HTTPHEADER     => [
            'User-Agent: TMRJournals/1.0 (mailto:publisher@tmrjournals.com)',
            'Accept: application/json',
        ],
    ]);

    $body = curl_exec($handle);
    $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    $transportFailed = curl_errno($handle) !== 0;
    $transportError = $transportFailed ? curl_error($handle) : '';
    curl_close($handle);

    if ($transportFailed) {
        return ['success' => false, 'error' => 'CURL错误: ' . $transportError];
    }
    if ($status == 404) {
        return ['success' => false, 'error' => 'DOI在CrossRef中未找到'];
    }
    if ($status != 200) {
        return ['success' => false, 'error' => 'CrossRef返回HTTP ' . $status];
    }

    $decoded = json_decode($body, true);
    if (isset($decoded['message'])) {
        return ['success' => true, 'message' => $decoded['message']];
    }
    return ['success' => false, 'error' => 'CrossRef返回数据格式异常'];
}
/**
 * Turn a CrossRef date object into "YYYY-MM-DD", "YYYY-MM" or "YYYY".
 *
 * Only the first entry of 'date-parts' is used; month and day are zero-padded to
 * two digits.
 *
 * @param array $dateObj CrossRef date structure holding a 'date-parts' list.
 * @return string The most precise date available, or '' when there is none.
 */
private function parseDateParts($dateObj)
{
    $parts = $dateObj['date-parts'][0] ?? null;
    if ($parts === null) {
        return '';
    }

    $year = $parts[0] ?? '';
    $month = isset($parts[1]) ? sprintf('%02d', $parts[1]) : '';
    $day = isset($parts[2]) ? sprintf('%02d', $parts[2]) : '';

    // Without both a year and a month only the (possibly empty) year can be reported.
    if (!$year || !$month) {
        return (string)$year;
    }
    return $day ? "{$year}-{$month}-{$day}" : "{$year}-{$month}";
}
/**
 * Convert a CrossRef author list into a flat, predictable structure.
 *
 * Each entry carries given, family, name, ORCID, sequence and a list of
 * affiliation names. 'name' is the record's own 'name' when present, otherwise
 * "given family" with no stray space when one of the two parts is missing.
 *
 * @param mixed $authorList The 'author' array of a CrossRef message.
 * @return array List of author maps; [] when the input is empty or not an array.
 */
private function parseAuthors($authorList)
{
    if (empty($authorList) || !is_array($authorList)) {
        return [];
    }

    $result = [];
    foreach ($authorList as $a) {
        $given = $a['given'] ?? '';
        $family = $a['family'] ?? '';

        $affiliations = [];
        if (isset($a['affiliation']) && is_array($a['affiliation'])) {
            foreach ($a['affiliation'] as $aff) {
                $affiliations[] = $aff['name'] ?? '';
            }
        }

        $result[] = [
            'given'       => $given,
            'family'      => $family,
            // trim() removes the separator left over when only one name part exists.
            'name'        => isset($a['name']) ? $a['name'] : trim($given . ' ' . $family),
            'ORCID'       => $a['ORCID'] ?? '',
            'sequence'    => $a['sequence'] ?? '',
            'affiliation' => $affiliations,
        ];
    }
    return $result;
}
/**
 * Decide from a CrossRef message whether the work is retracted or is a retraction notice.
 *
 * Signals checked, in order:
 *   1. 'update-to' entries whose type or label contains "retract".
 *   2. 'updated-by' entries whose type or label contains "retract"
 *      (presumably set on the retracted work itself — TODO confirm against the
 *      CrossRef API; harmless when the field is absent).
 *   3. The record's own type/subtype containing "retract".
 *   4. Any relation type containing "retract".
 *   5. A title that *starts* like a notice ("Retracted:", "Retraction Note: ...",
 *      "WITHDRAWN: ..."). Matching the words anywhere in the title would flag
 *      ordinary medical titles such as "alcohol withdrawal" or "clot retraction".
 *      This is still a heuristic.
 *
 * @param array $message CrossRef "message" record.
 * @return array ['is_retracted' => bool, 'retraction_detail' => array of the signals that fired].
 */
private function detectRetraction($message)
{
    $isRetracted = false;
    $retractionDetail = [];

    // 1 + 2. Update relations in both directions. Only the first retraction
    // entry of each field is recorded.
    $updateFields = [
        'update-to'  => 'retraction_notice',
        'updated-by' => 'retracted_by',
    ];
    foreach ($updateFields as $fieldName => $detailKey) {
        if (!isset($message[$fieldName]) || !is_array($message[$fieldName])) {
            continue;
        }
        foreach ($message[$fieldName] as $update) {
            $updateType = strtolower($update['type'] ?? '');
            $updateLabel = strtolower($update['label'] ?? '');
            if (strpos($updateType, 'retract') !== false || strpos($updateLabel, 'retract') !== false) {
                $isRetracted = true;
                $retractionDetail[$detailKey] = [
                    'type'  => $update['type'] ?? '',
                    'label' => $update['label'] ?? '',
                    'DOI'   => $update['DOI'] ?? '',
                    'date'  => isset($update['updated']) ? $this->parseDateParts($update['updated']) : '',
                ];
                break;
            }
        }
    }

    // 3. type / subtype
    $type = strtolower($message['type'] ?? '');
    $subtype = strtolower($message['subtype'] ?? '');
    if (strpos($type, 'retract') !== false || strpos($subtype, 'retract') !== false) {
        $isRetracted = true;
        $retractionDetail['is_retraction_notice'] = true;
    }

    // 4. relation
    if (isset($message['relation']) && is_array($message['relation'])) {
        foreach ($message['relation'] as $relType => $relations) {
            if (strpos(strtolower($relType), 'retract') !== false) {
                $isRetracted = true;
                $retractionDetail['relation'] = [$relType => $relations];
                break;
            }
        }
    }

    // 5. Title prefix.
    $noticePatterns = [
        '/^\W*(?:(?:notice|statement|note)\s+of\s+)?retract(?:ion|ed)\b/iu',
        '/^\W*withdraw(?:n|al)\b\s*(?:article|notice)?\s*[:\-\x{2013}\x{2014}\]\)]/iu',
    ];
    foreach ((array)($message['title'] ?? []) as $title) {
        if (!is_string($title)) {
            continue;
        }
        foreach ($noticePatterns as $pattern) {
            if (preg_match($pattern, $title) === 1) {
                $isRetracted = true;
                $retractionDetail['title_keyword'] = $title;
                break 2;
            }
        }
    }

    return ['is_retracted' => $isRetracted, 'retraction_detail' => $retractionDetail];
}
/**
 * Flatten a CrossRef "message" record into the structure returned by the DOI endpoints.
 *
 * Missing fields fall back to '' / [] / 0. The retraction verdict comes from
 * detectRetraction(); all dates are formatted by parseDateParts().
 *
 * @param string $doi     Cleaned DOI that was queried.
 * @param array  $message CrossRef "message" record.
 * @return array Structured metadata (basic info, authors, journal, dates, funders,
 *               licenses, retraction state, updates, relations, score).
 */
private function parseCrossRefMessage($doi, $message)
{
    // First element of a list-valued field, or '' when absent.
    $firstOf = function ($key) use ($message) {
        return isset($message[$key][0]) ? $message[$key][0] : '';
    };
    // Formatted date stored under $key of $holder, or '' when absent.
    $dateOf = function ($holder, $key) {
        return isset($holder[$key]) ? $this->parseDateParts($holder[$key]) : '';
    };

    $authors = $this->parseAuthors($message['author'] ?? []);

    // Publication year: first date field that carries a non-empty year.
    $year = '';
    foreach (['published-print', 'published-online', 'published', 'created'] as $dateKey) {
        if (!empty($message[$dateKey]['date-parts'][0][0])) {
            $year = (string)$message[$dateKey]['date-parts'][0][0];
            break;
        }
    }

    // Funding information.
    $funders = [];
    if (isset($message['funder']) && is_array($message['funder'])) {
        $funders = array_values(array_map(function ($funder) {
            return [
                'name'  => $funder['name'] ?? '',
                'DOI'   => $funder['DOI'] ?? '',
                'award' => $funder['award'] ?? [],
            ];
        }, $message['funder']));
    }

    // Licenses.
    $licenses = [];
    if (isset($message['license']) && is_array($message['license'])) {
        $licenses = array_values(array_map(function ($license) use ($dateOf) {
            return [
                'URL'        => $license['URL'] ?? '',
                'start_date' => $dateOf($license, 'start'),
            ];
        }, $message['license']));
    }

    // Every update relation (errata, corrections, retraction notices, ...).
    $updates = [];
    if (isset($message['update-to']) && is_array($message['update-to'])) {
        $updates = array_values(array_map(function ($update) use ($dateOf) {
            return [
                'type'  => $update['type'] ?? '',
                'label' => $update['label'] ?? '',
                'DOI'   => $update['DOI'] ?? '',
                'date'  => $dateOf($update, 'updated'),
            ];
        }, $message['update-to']));
    }

    $retraction = $this->detectRetraction($message);

    return [
        // Basic information
        'doi'   => $doi,
        'url'   => $message['URL'] ?? ('https://doi.org/' . $doi),
        'type'  => $message['type'] ?? '',
        'title' => $firstOf('title'),
        // Authors
        'authors'       => $authors,
        'author_string' => implode(', ', array_column($authors, 'name')),
        // Journal / source
        'journal' => [
            'title'       => $firstOf('container-title'),
            'short_title' => $firstOf('short-container-title'),
            'ISSN'        => $message['ISSN'] ?? [],
            'publisher'   => $message['publisher'] ?? '',
        ],
        // Volume / issue / pages
        'volume'         => $message['volume'] ?? '',
        'issue'          => $message['issue'] ?? '',
        'page'           => $message['page'] ?? '',
        'article_number' => $message['article-number'] ?? '',
        // Dates
        'dates' => [
            'published_print'  => $dateOf($message, 'published-print'),
            'published_online' => $dateOf($message, 'published-online'),
            'published'        => $dateOf($message, 'published'),
            'created'          => $dateOf($message, 'created'),
            'deposited'        => $dateOf($message, 'deposited'),
            'indexed'          => $dateOf($message, 'indexed'),
        ],
        'year' => $year,
        // Abstract and subjects
        'abstract' => $message['abstract'] ?? '',
        'subject'  => $message['subject'] ?? [],
        // Citation statistics
        'references_count'       => $message['references-count'] ?? 0,
        'is_referenced_by_count' => $message['is-referenced-by-count'] ?? 0,
        'funders'  => $funders,
        'licenses' => $licenses,
        // Retraction detection
        'is_retracted'      => $retraction['is_retracted'],
        'retraction_detail' => $retraction['retraction_detail'],
        'updates'  => $updates,
        'relation' => $message['relation'] ?? [],
        'score'    => $message['score'] ?? 0,
    ];
}
/**
 * Look up a single DOI in CrossRef and return its structured metadata.
 *
 * Request parameter: doi — the article DOI; resolver-URL and "doi:" prefixes are accepted.
 * Input that is empty after cleaning (for example just "doi:") is rejected
 * instead of being sent to CrossRef as an empty work id.
 *
 * @return mixed Result of jsonSuccess()/jsonError().
 */
public function queryDoi()
{
    $data = $this->request->param();
    $rawDoi = (isset($data['doi']) && is_scalar($data['doi'])) ? trim((string)$data['doi']) : '';
    $doi = $rawDoi === '' ? '' : $this->cleanDoi($rawDoi);
    if ($doi === '') {
        return jsonError('doi不能为空');
    }

    $res = $this->fetchCrossRefData($doi);
    if (!$res['success']) {
        return jsonError($res['error']);
    }

    return jsonSuccess($this->parseCrossRefMessage($doi, $res['message']));
}
/**
 * Look up several DOIs in CrossRef and return the metadata of each.
 *
 * Request parameter: dois — comma-separated DOI list, at most 50 entries.
 * Entries that are empty after cleaning are reported as failed without a
 * request. Consecutive requests are spaced 200 ms apart; there is no pause
 * after the last one.
 *
 * @return mixed Result of jsonSuccess()/jsonError() with 'total', 'retracted_count' and 'list'.
 */
public function batchQueryDois()
{
    $data = $this->request->param();
    $rawList = (isset($data['dois']) && is_scalar($data['dois'])) ? trim((string)$data['dois']) : '';
    if ($rawList == '') {
        return jsonError('dois不能为空');
    }
    $doiList = array_filter(array_map('trim', explode(',', $rawList)));
    if (empty($doiList)) {
        return jsonError('未提供有效的DOI');
    }
    if (count($doiList) > 50) {
        return jsonError('单次最多查询50个DOI');
    }

    $results = [];
    $retractedCount = 0;
    $requestSent = false;
    foreach ($doiList as $rawDoi) {
        $doi = $this->cleanDoi($rawDoi);
        if ($doi === '') {
            // e.g. a bare "doi:" entry; an empty id would query the works listing instead.
            $results[] = ['doi' => $rawDoi, 'success' => false, 'error' => 'DOI为空'];
            continue;
        }

        if ($requestSent) {
            usleep(200000);
        }
        $requestSent = true;

        $res = $this->fetchCrossRefData($doi);
        if (!$res['success']) {
            $results[] = ['doi' => $doi, 'success' => false, 'error' => $res['error']];
            continue;
        }

        $parsed = $this->parseCrossRefMessage($doi, $res['message']);
        $parsed['success'] = true;
        if ($parsed['is_retracted']) {
            $retractedCount++;
        }
        $results[] = $parsed;
    }

    return jsonSuccess([
        'total'           => count($results),
        'retracted_count' => $retractedCount,
        'list'            => $results,
    ]);
}
/**
 * Check every reference of a production article against CrossRef.
 *
 * Request parameter: p_article_id — production article id.
 * Only references with state = 0 and a non-empty refer_doi are checked. Each
 * list item carries the full CrossRef metadata including the retraction flag.
 *
 * @return mixed Result of jsonSuccess()/jsonError().
 */
public function checkArticleReferences()
{
    $data = $this->request->param();
    if (!isset($data['p_article_id']) || $data['p_article_id'] == '') {
        return jsonError('p_article_id不能为空');
    }
    $pArticleId = intval($data['p_article_id']);

    $refers = $this->production_article_refer_obj
        ->where('p_article_id', $pArticleId)
        ->where('state', 0)
        ->where('refer_doi', '<>', '')
        ->select();
    if (empty($refers)) {
        return jsonSuccess([
            'p_article_id'    => $pArticleId,
            'total_checked'   => 0,
            'retracted_count' => 0,
            'list'            => [],
        ]);
    }

    $checked = $this->collectReferenceChecks($refers);
    return jsonSuccess([
        'p_article_id'    => $pArticleId,
        'total_checked'   => count($checked['list']),
        'retracted_count' => $checked['retracted_count'],
        'error_count'     => $checked['error_count'],
        'list'            => $checked['list'],
    ]);
}
/**
 * Check an article's references against CrossRef, addressed by article_id.
 *
 * Request parameter: article_id — article id; the active (state = 0) production
 * record is resolved first and its references are then checked exactly as in
 * checkArticleReferences(), including the error_count figure.
 *
 * @return mixed Result of jsonSuccess()/jsonError().
 */
public function checkReferencesByArticleId()
{
    $data = $this->request->param();
    if (!isset($data['article_id']) || $data['article_id'] == '') {
        return jsonError('article_id不能为空');
    }
    $articleId = intval($data['article_id']);

    $pInfo = $this->production_article_obj
        ->where('article_id', $articleId)
        ->where('state', 0)
        ->find();
    if (!$pInfo) {
        return jsonError('未找到该文章的生产信息');
    }

    $refers = $this->production_article_refer_obj
        ->where('p_article_id', $pInfo['p_article_id'])
        ->where('state', 0)
        ->where('refer_doi', '<>', '')
        ->select();
    if (empty($refers)) {
        return jsonSuccess([
            'article_id'      => $articleId,
            'p_article_id'    => $pInfo['p_article_id'],
            'total_checked'   => 0,
            'retracted_count' => 0,
            'list'            => [],
        ]);
    }

    $checked = $this->collectReferenceChecks($refers);
    return jsonSuccess([
        'article_id'      => $articleId,
        'p_article_id'    => $pInfo['p_article_id'],
        'total_checked'   => count($checked['list']),
        'retracted_count' => $checked['retracted_count'],
        'error_count'     => $checked['error_count'],
        'list'            => $checked['list'],
    ]);
}
/**
 * Look up each reference row in CrossRef and tally retractions and failed lookups.
 *
 * Rows whose DOI is empty after cleaning are skipped. A 200 ms pause follows
 * every lookup, so the call takes at least 0.2 s per reference.
 *
 * @param array $refers Reference rows with p_refer_id, index, refer_doi and optionally refer_content.
 * @return array ['list' => array, 'retracted_count' => int, 'error_count' => int].
 */
private function collectReferenceChecks($refers)
{
    $list = [];
    $retractedCount = 0;
    $errorCount = 0;

    foreach ($refers as $refer) {
        $doi = $this->cleanDoi($refer['refer_doi']);
        if ($doi == '') {
            continue;
        }

        $item = [
            'p_refer_id'    => $refer['p_refer_id'],
            'index'         => $refer['index'],
            'refer_doi'     => $doi,
            'refer_content' => $refer['refer_content'] ?? '',
        ];

        $res = $this->fetchCrossRefData($doi);
        if (!$res['success']) {
            $item['crossref_success'] = false;
            $item['crossref_error'] = $res['error'];
            $errorCount++;
        } else {
            $parsed = $this->parseCrossRefMessage($doi, $res['message']);
            $item['crossref_success'] = true;
            $item['crossref'] = $parsed;
            if ($parsed['is_retracted']) {
                $retractedCount++;
            }
        }

        $list[] = $item;
        usleep(200000);
    }

    return [
        'list'            => $list,
        'retracted_count' => $retractedCount,
        'error_count'     => $errorCount,
    ];
}
// ========== DOI 网页抓取通讯作者邮箱 ==========
/**
 * Resolve a DOI through doi.org and download the publisher landing page.
 *
 * Up to 10 redirects are followed, restricted to http/https. Characters that
 * would change the URL's structure ("#", "?", "<", ">", spaces, quotes,
 * non-ASCII) are percent-encoded; existing "%xx" escapes are left untouched.
 *
 * @param string $doi Cleaned DOI.
 * @return array ['success' => true, 'html' => string, 'final_url' => string] on HTTP 200,
 *               otherwise ['success' => false, 'error' => string].
 */
private function fetchPageByDoi($doi)
{
    // Encode only what is unsafe inside a URL path so ordinary DOIs are sent unchanged.
    $encodedDoi = preg_replace_callback(
        '/[^A-Za-z0-9\-._~\/:;()@!$&\'*+,=%]/',
        function ($match) {
            return rawurlencode($match[0]);
        },
        (string)$doi
    );
    $url = 'https://doi.org/' . $encodedDoi;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
    // The redirect target is chosen by whoever registered the DOI; never leave http(s).
    curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    // NOTE(review): certificate verification is switched off here; confirm this is intended.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
    curl_setopt($ch, CURLOPT_HTTPHEADER, [
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language: en-US,en;q=0.5',
    ]);

    $html = curl_exec($ch);
    $finalUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    if (curl_errno($ch)) {
        $error = curl_error($ch);
        curl_close($ch);
        return ['success' => false, 'error' => 'CURL错误: ' . $error];
    }
    curl_close($ch);

    if ($httpCode != 200) {
        return ['success' => false, 'error' => 'HTTP ' . $httpCode];
    }
    return ['success' => true, 'html' => $html, 'final_url' => $finalUrl];
}
/**
 * Identify the publisher from the host of the final landing-page URL.
 *
 * A host matches a known domain only when it equals that domain or is one of
 * its sub-domains. Plain substring matching would classify unrelated hosts,
 * e.g. "excell.com" as cell or any "...group.com" as oxford.
 *
 * @param string $url Final URL after redirects.
 * @return string Publisher key such as 'mdpi', 'springer', 'elsevier', or 'unknown'.
 */
private function detectPublisher($url)
{
    $host = strtolower(parse_url($url, PHP_URL_HOST) ?: '');
    if ($host === '') {
        return 'unknown';
    }

    // "onlinelibrary.wiley.com" is covered by the wiley.com entry.
    $map = [
        'mdpi.com'          => 'mdpi',
        'springer.com'      => 'springer',
        'springerlink.com'  => 'springer',
        'nature.com'        => 'springer',
        'biomedcentral.com' => 'springer',
        'sciencedirect.com' => 'elsevier',
        'elsevier.com'      => 'elsevier',
        'wiley.com'         => 'wiley',
        'frontiersin.org'   => 'frontiers',
        'tandfonline.com'   => 'taylor_francis',
        'sagepub.com'       => 'sage',
        'oup.com'           => 'oxford',
        'plos.org'          => 'plos',
        'hindawi.com'       => 'hindawi',
        'cell.com'          => 'cell',
        'jci.org'           => 'jci',
        'asm.org'           => 'asm',
        'iucr.org'          => 'iucr',
        'rsc.org'           => 'rsc',
        'acs.org'           => 'acs',
    ];

    foreach ($map as $domain => $publisher) {
        $suffix = '.' . $domain;
        if ($host === $domain || substr($host, -strlen($suffix)) === $suffix) {
            return $publisher;
        }
    }
    return 'unknown';
}
/**
 * Collect the distinct e-mail addresses found in an HTML page, minus noise.
 *
 * Dropped are addresses that contain a system/publisher keyword (noreply,
 * support, info@, publisher domains, ...) and asset file names that merely look
 * like addresses, such as "logo@2x.png". The asset test looks at the ending
 * only, so a real address like "x@stu.jsu.edu" is not discarded because it
 * happens to contain ".js".
 *
 * @param string $html Page source.
 * @return string[] Addresses in order of first appearance.
 */
private function extractEmails($html)
{
    $all = [];
    if (preg_match_all('/[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/', (string)$html, $m)) {
        $all = array_unique($m[0]);
    }

    $systemKeywords = [
        'noreply', 'no-reply', 'support', 'info@', 'admin@', 'webmaster',
        'editor@', 'editorial@', 'help@', 'contact@', 'privacy@', 'service@',
        'marketing@', 'feedback@', 'copyright@', 'permissions@',
        'mdpi.com', 'springer.com', 'elsevier.com', 'wiley.com',
        'frontiersin.org', 'nature.com', 'oup.com', 'sagepub.com',
        'tandfonline.com', 'plos.org', 'hindawi.com', 'biomedcentral.com',
        'crossref.org', 'doi.org', 'example.com', 'sentry.io',
        'acs.org', 'rsc.org',
    ];

    $filtered = [];
    foreach ($all as $email) {
        $lower = strtolower($email);

        // File names such as "sprite@2x.png" match the address pattern; real
        // addresses never end in an asset extension.
        if (preg_match('/\.(?:png|jpe?g|gif|svg|webp|css|js)$/', $lower)) {
            continue;
        }

        $isSystem = false;
        foreach ($systemKeywords as $kw) {
            if (strpos($lower, $kw) !== false) {
                $isSystem = true;
                break;
            }
        }
        if (!$isSystem) {
            $filtered[] = $email;
        }
    }
    return $filtered;
}
/**
 * MDPI landing page: find corresponding authors and their e-mail addresses.
 *
 * Names are taken from text followed by "*" (first from a <span>, otherwise from
 * capitalised words). Addresses come from mailto: links that do not contain
 * "mdpi". Names and addresses are paired purely by position; when no name is
 * found every mailto address is returned with an empty name.
 *
 * @param string $html Page source.
 * @return array ['corresponding_authors' => list of ['name', 'email'], 'all_emails' => string[]].
 */
private function parseMdpiEmail($html)
{
    $allEmails = $this->extractEmails($html);

    // Names marked with "*": try the <span> form first, then bare capitalised names.
    $starredNames = [];
    $namePatterns = [
        '/<span[^>]*>\s*([^<]+?)\s*<\/span>\s*\*/',
        '/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\s*\*/',
    ];
    foreach ($namePatterns as $pattern) {
        if (preg_match_all($pattern, $html, $hit)) {
            $starredNames = array_map('trim', $hit[1]);
            break;
        }
    }

    // mailto: links, which are normally the corresponding authors' addresses.
    $linked = [];
    if (preg_match_all('/href=["\']mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})["\']/', $html, $hit)) {
        $linked = array_values(array_unique($hit[1]));
    }
    $linked = array_values(array_filter($linked, function ($address) {
        return stripos($address, 'mdpi') === false;
    }));

    $authors = [];
    foreach ($starredNames as $position => $name) {
        $authors[] = ['name' => $name, 'email' => $linked[$position] ?? ''];
    }
    if (empty($authors) && !empty($linked)) {
        foreach ($linked as $address) {
            $authors[] = ['name' => '', 'email' => $address];
        }
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
/**
 * Springer / Nature / BMC landing page: find corresponding authors and e-mail addresses.
 *
 * Three strategies are tried in order, each only when the previous one found nothing:
 *   1. mailto: links on elements carrying data-test="author-letter";
 *   2. the first mailto: link that follows "Correspondence ..." / "Corresponding author";
 *   3. any address appearing within 300 non-"<" characters after "Correspond...".
 * If an address was found, a name following "Correspondence to" / "Corresponding
 * author" is attached to the first entry when it is 3 to 79 bytes long.
 *
 * @param string $html Page source.
 * @return array ['corresponding_authors' => list of ['name', 'email'], 'all_emails' => string[]].
 */
private function parseSpringerEmail($html)
{
$result = ['corresponding_authors' => [], 'all_emails' => []];
$result['all_emails'] = $this->extractEmails($html);
// Springer: <a data-test="author-letter" href="mailto:xxx">
if (preg_match_all('/data-test=["\']author-letter["\'][^>]*href=["\']mailto:([^"\']+)["\']/', $html, $m)) {
foreach ($m[1] as $email) {
$result['corresponding_authors'][] = ['name' => '', 'email' => $email];
}
}
// Otherwise look in the "Correspondence to" / "Corresponding author" area.
if (empty($result['corresponding_authors'])) {
if (preg_match('/[Cc]orrespond(?:ence|ing\s+author)[^<]{0,50}<[^>]*>([^<]*<[^>]*>)*?[^<]*?href=["\']mailto:([^"\']+)["\']/', $html, $m)) {
$result['corresponding_authors'][] = ['name' => '', 'email' => $m[2]];
}
}
if (empty($result['corresponding_authors'])) {
$patterns = [
'/[Cc]orrespond[a-z]*[^<]{0,300}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/',
];
foreach ($patterns as $p) {
if (preg_match_all($p, $html, $m)) {
foreach ($m[1] as $email) {
// NOTE(review): extractEmails() drops system/publisher addresses, but the
// elseif below still accepts any syntactically valid address, so a filtered
// address is added anyway — confirm whether that fallback is intended.
$clean = $this->extractEmails($email);
if (!empty($clean)) {
$result['corresponding_authors'][] = ['name' => '', 'email' => $clean[0]];
} elseif (filter_var($email, FILTER_VALIDATE_EMAIL)) {
$result['corresponding_authors'][] = ['name' => '', 'email' => $email];
}
}
// Stop after the first pattern that matched, even if it added nothing.
break;
}
}
}
// Try to find the author's name.
if (!empty($result['corresponding_authors'])) {
if (preg_match('/[Cc]orrespond[a-z]*\s+(?:to|author)[:\s]*([A-Z][a-zA-Z\s.\-]+?)(?:\.|<|,)/', $html, $m)) {
$name = trim($m[1]);
if (strlen($name) > 2 && strlen($name) < 80) {
$result['corresponding_authors'][0]['name'] = $name;
}
}
}
return $result;
}
/**
 * Frontiers landing page: find corresponding authors and e-mail addresses.
 *
 * Primary source is each "*Correspondence: ..." block (up to the next <br, </p,
 * </div or newline): every address in the block is returned together with the
 * first comma/semicolon-separated piece that has a capital letter and no "@".
 * When no block yields an address, every mailto: link not containing
 * "frontiersin" is returned with its link text as the name.
 *
 * @param string $html Page source.
 * @return array ['corresponding_authors' => list of ['name', 'email'], 'all_emails' => string[]].
 */
private function parseFrontiersEmail($html)
{
    $allEmails = $this->extractEmails($html);
    $authors = [];

    // Frontiers: "*Correspondence: Name, email@xxx.com", or "*Correspondence:" followed by mailto:
    if (preg_match_all('/\*\s*[Cc]orrespondence:\s*(.*?)(?:<br|<\/p|<\/div|\n)/s', $html, $blocks)) {
        foreach ($blocks[1] as $rawBlock) {
            $block = strip_tags($rawBlock, '<a>');

            $addresses = [];
            if (preg_match_all('/([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $block, $found)) {
                $addresses = $found[1];
            }

            // The name is the first text piece that is not an address and has a capital letter.
            $name = '';
            foreach (preg_split('/[,;]/', strip_tags($block)) as $piece) {
                $piece = trim($piece);
                if (!$piece || strpos($piece, '@') !== false || !preg_match('/[A-Z]/', $piece)) {
                    continue;
                }
                $name = $piece;
                break;
            }

            foreach ($addresses as $address) {
                $authors[] = ['name' => $name, 'email' => $address];
            }
        }
    }

    if (empty($authors)
        && preg_match_all('/href=["\']mailto:([^"\']+)["\'][^>]*>([^<]*)</i', $html, $links)) {
        foreach ($links[1] as $position => $address) {
            $label = trim(strip_tags($links[2][$position]));
            if (stripos($address, 'frontiersin') !== false) {
                continue;
            }
            $authors[] = ['name' => $label ?: '', 'email' => $address];
        }
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
/**
 * Wiley landing page: find the corresponding author's e-mail address and name.
 *
 * The address is the first mailto: link after "Correspond..." that is followed
 * by a closing </p; failing that, the first address within 500 non-"<"
 * characters after "Correspond...". When an address was found, a capitalised
 * name ending at a comma after "Correspond..." is attached to it.
 *
 * @param string $html Page source.
 * @return array ['corresponding_authors' => list of ['name', 'email'], 'all_emails' => string[]].
 */
private function parseWileyEmail($html)
{
    $allEmails = $this->extractEmails($html);
    $authors = [];

    // Wiley: a "Correspondence" paragraph, or data-widget-def markup, containing mailto:
    if (preg_match('/[Cc]orrespond[a-z]*[:\s].*?href=["\']mailto:([^"\']+)["\'].*?<\/p/s', $html, $hit)) {
        $authors[] = ['name' => '', 'email' => $hit[1]];
    } elseif (preg_match('/[Cc]orrespond[a-z]*[^<]{0,500}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $html, $hit)) {
        $authors[] = ['name' => '', 'email' => $hit[1]];
    }

    if (!empty($authors)
        && preg_match('/[Cc]orrespond[a-z]*[:\s]*([A-Z][a-zA-Z.\-\s]{2,40}?),/', $html, $hit)) {
        $authors[0]['name'] = trim($hit[1]);
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
/**
 * Elsevier / ScienceDirect landing page: find corresponding authors.
 *
 * Tried in order until one step yields entries:
 *   1. mailto: links following an element whose class contains "corresponding" (address only);
 *   2. text of elements with a data-* attribute starting with "corresponding" (name only);
 *   3. the first address within 300 non-"<" characters after "Correspond...".
 *
 * @param string $html Page source.
 * @return array ['corresponding_authors' => list of ['name', 'email'], 'all_emails' => string[]].
 */
private function parseElsevierEmail($html)
{
    $allEmails = $this->extractEmails($html);
    $authors = [];

    // ScienceDirect: "Corresponding author" button/area plus mailto:
    if (preg_match_all('/class="[^"]*corresponding[^"]*"[^>]*>.*?href=["\']mailto:([^"\']+)["\']/si', $html, $hit)) {
        foreach ($hit[1] as $address) {
            $authors[] = ['name' => '', 'email' => $address];
        }
    } elseif (preg_match_all('/data-[a-z\-]*=["\']corresponding[^"\']*["\'][^>]*>([^<]+)/i', $html, $hit)) {
        foreach ($hit[1] as $label) {
            $authors[] = ['name' => trim(strip_tags($label)), 'email' => ''];
        }
    } elseif (preg_match('/[Cc]orrespond[a-z]*[^<]{0,300}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $html, $hit)) {
        $authors[] = ['name' => '', 'email' => $hit[1]];
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
/**
 * Taylor & Francis landing page: find the corresponding author.
 *
 * First the content of a <span> whose class contains "corresp" is inspected: the
 * address is the first e-mail in it and the name is the tag-stripped text before
 * that address. If this gives nothing, the text between "CONTACT" and the next
 * address is used as the name.
 *
 * @param string $html Page source.
 * @return array ['corresponding_authors' => list of ['name', 'email'], 'all_emails' => string[]].
 */
private function parseTaylorFrancisEmail($html)
{
    $allEmails = $this->extractEmails($html);
    $authors = [];

    // T&F: <span class="NLM_corresp"> or a "CONTACT" section
    if (preg_match('/class="[^"]*corresp[^"]*"[^>]*>(.*?)<\/span>/si', $html, $span)) {
        $inner = $span[1];
        if (preg_match('/([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $inner, $address)) {
            $beforeAddress = preg_replace('/[a-zA-Z0-9._%+\-]+@.*/', '', $inner);
            $authors[] = ['name' => trim(strip_tags($beforeAddress)), 'email' => $address[1]];
        }
    }

    if (empty($authors)
        && preg_match('/CONTACT\s+(.*?)([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/s', $html, $contact)) {
        $authors[] = ['name' => trim(strip_tags($contact[1])), 'email' => $contact[2]];
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
/**
 * PLOS landing page: collect the addresses announced as "* E-mail: xxx@yyy.com".
 *
 * @param string $html Page source.
 * @return array ['corresponding_authors' => list of ['name' => '', 'email'], 'all_emails' => string[]].
 */
private function parsePlosEmail($html)
{
    $allEmails = $this->extractEmails($html);

    $authors = [];
    $found = [];
    if (preg_match_all('/\*\s*E-?mail:\s*([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $html, $found)) {
        foreach ($found[1] as $address) {
            $authors[] = ['name' => '', 'email' => $address];
        }
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
/**
 * Publisher-independent fallback for finding corresponding-author addresses.
 *
 * Strategy 1 runs three patterns around "Correspondence" / "Corresponding
 * author" / "* E-mail" and stops at the first pattern that yields at least one
 * valid, not yet listed address (taken from the pattern's last capture group).
 * Strategy 2, used only when strategy 1 found nothing, returns every mailto:
 * link whose address contains none of the system keywords. Names are always ''.
 *
 * @param string $html Page source.
 * @return array ['corresponding_authors' => list of ['name' => '', 'email'], 'all_emails' => string[]].
 */
private function parseGenericEmail($html)
{
    $authors = [];

    // Strategy 1: addresses near "Correspondence" / "Corresponding author".
    $nearbyPatterns = [
        '/[Cc]orrespond(?:ing\s+author|ence)[:\s]*(?:<[^>]*>)*\s*(?:<[^>]*>)*\s*([^<]*?)\s*(?:<[^>]*>)*\s*(?:href=["\'])?mailto:([^"\'>\s]+)/s',
        '/[Cc]orrespond[a-z]*[^<]{0,500}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/',
        '/\*\s*(?:E-?mail|Correspondence)[:\s]*([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/',
    ];
    foreach ($nearbyPatterns as $pattern) {
        if (preg_match_all($pattern, $html, $groups)) {
            // The address is always the last capture group of the pattern.
            $candidates = end($groups);
            foreach ($candidates as $candidate) {
                if (!filter_var($candidate, FILTER_VALIDATE_EMAIL)) {
                    continue;
                }
                if (in_array($candidate, array_column($authors, 'email'), true)) {
                    continue;
                }
                $authors[] = ['name' => '', 'email' => $candidate];
            }
        }
        if (!empty($authors)) {
            break;
        }
    }

    // Strategy 2: every mailto: link that does not look like a system address.
    if (empty($authors)) {
        $linked = [];
        if (preg_match_all('/href=["\']mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})["\']/', $html, $groups)) {
            $linked = array_unique($groups[1]);
        }
        $blocked = ['noreply', 'support', 'info@', 'admin', 'editor', 'help@', 'contact@',
            'privacy', 'service', 'marketing', 'copyright', 'permission'];
        foreach ($linked as $address) {
            $lower = strtolower($address);
            $isBlocked = false;
            foreach ($blocked as $keyword) {
                if (strpos($lower, $keyword) !== false) {
                    $isBlocked = true;
                    break;
                }
            }
            if (!$isBlocked) {
                $authors[] = ['name' => '', 'email' => $address];
            }
        }
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $this->extractEmails($html)];
}
/**
 * Main entry: fetch the publisher page for a DOI and extract the
 * corresponding author(s) and e-mail addresses from it.
 *
 * The DOI is normalised with cleanDoi(), the page is fetched with
 * fetchPageByDoi(), and the final URL is passed to detectPublisher() to pick
 * a publisher-specific parser. If that parser finds no corresponding author,
 * the generic parser is used as a fallback for `corresponding_authors` only;
 * `all_emails` always comes from the parser that was selected first.
 *
 * @param string $doi Article DOI as supplied by the caller.
 * @return array On failure: ['success' => false, 'error' => string] (plus
 *               'doi' when the fetch failed). On success: ['success' => true,
 *               'doi', 'url', 'publisher', 'corresponding_authors', 'all_emails'].
 */
private function scrapeCorrespondingAuthor($doi)
{
    $doi = $this->cleanDoi($doi);
    if ($doi == '') {
        return ['success' => false, 'error' => 'DOI为空'];
    }

    $page = $this->fetchPageByDoi($doi);
    if (!$page['success']) {
        return ['success' => false, 'doi' => $doi, 'error' => $page['error']];
    }

    $finalUrl  = $page['final_url'];
    $html      = $page['html'];
    $publisher = $this->detectPublisher($finalUrl);

    // Remember whether the generic parser already ran, so the fallback below
    // does not repeat the identical parse on the same HTML.
    $usedGeneric = false;
    switch ($publisher) {
        case 'mdpi':
            $parsed = $this->parseMdpiEmail($html);
            break;
        case 'springer':
            $parsed = $this->parseSpringerEmail($html);
            break;
        case 'frontiers':
            $parsed = $this->parseFrontiersEmail($html);
            break;
        case 'wiley':
            $parsed = $this->parseWileyEmail($html);
            break;
        case 'elsevier':
            $parsed = $this->parseElsevierEmail($html);
            break;
        case 'taylor_francis':
            $parsed = $this->parseTaylorFrancisEmail($html);
            break;
        case 'plos':
            $parsed = $this->parsePlosEmail($html);
            break;
        default:
            $parsed      = $this->parseGenericEmail($html);
            $usedGeneric = true;
            break;
    }

    // If the publisher-specific parser found nothing, fall back to the generic one.
    if (!$usedGeneric && empty($parsed['corresponding_authors'])) {
        $generic = $this->parseGenericEmail($html);
        $parsed['corresponding_authors'] = $generic['corresponding_authors'];
    }

    return [
        'success'               => true,
        'doi'                   => $doi,
        'url'                   => $finalUrl,
        'publisher'             => $publisher,
        'corresponding_authors' => $parsed['corresponding_authors'],
        'all_emails'            => $parsed['all_emails'],
    ];
}
/**
 * Look up the corresponding-author e-mail for a single DOI.
 *
 * Request parameter:
 *   doi (string) - article DOI; required and must not be blank.
 *
 * Responds with jsonError() when the parameter is missing/blank or when
 * scraping fails, otherwise with jsonSuccess() wrapping the full result of
 * scrapeCorrespondingAuthor().
 */
public function getAuthorEmail()
{
    $data = $this->request->param();

    // A request such as "doi[]=x" delivers an array; reject anything that is
    // not a scalar before it reaches trim(), which would raise a type error.
    if (!isset($data['doi']) || !is_scalar($data['doi']) || trim($data['doi']) == '') {
        return jsonError('doi不能为空');
    }

    $result = $this->scrapeCorrespondingAuthor($data['doi']);
    if (!$result['success']) {
        return jsonError($result['error']);
    }

    return jsonSuccess($result);
}
/**
 * Look up corresponding-author e-mails for several DOIs in one call.
 *
 * Request parameter:
 *   dois (string) - comma-separated list of DOIs; at most 20 per request.
 *
 * Each DOI is scraped in turn with a 0.5 s pause between requests. The
 * response lists every per-DOI result (successful or not) together with
 * summary counters:
 *   total             - number of DOIs processed
 *   success_count     - DOIs whose page was fetched and parsed
 *   email_found_count - successful DOIs with at least one corresponding author
 *   list              - the individual scrapeCorrespondingAuthor() results
 */
public function batchGetAuthorEmails()
{
    $data = $this->request->param();

    // "dois[]=x" would deliver an array; reject non-scalars before trim()/explode().
    if (!isset($data['dois']) || !is_scalar($data['dois']) || trim($data['dois']) == '') {
        return jsonError('dois不能为空');
    }

    // Split, trim and drop empty entries; re-index so positions are 0..n-1.
    $doiList = array_values(array_filter(array_map('trim', explode(',', $data['dois']))));
    if (empty($doiList)) {
        return jsonError('未提供有效的DOI');
    }
    if (count($doiList) > 20) {
        return jsonError('单次最多查询20个DOI');
    }

    $results         = [];
    $successCount    = 0;
    $emailFoundCount = 0;
    $lastIndex       = count($doiList) - 1;

    foreach ($doiList as $index => $rawDoi) {
        $result = $this->scrapeCorrespondingAuthor($rawDoi);
        if ($result['success']) {
            $successCount++;
            if (!empty($result['corresponding_authors'])) {
                $emailFoundCount++;
            }
        }
        $results[] = $result;

        // Pause between consecutive requests only; sleeping after the last
        // DOI would just delay the response.
        if ($index < $lastIndex) {
            usleep(500000);
        }
    }

    return jsonSuccess([
        'total'             => count($results),
        'success_count'     => $successCount,
        'email_found_count' => $emailFoundCount,
        'list'              => $results,
    ]);
}
/**
 * Report how far the field -> major conversion currently covers reviewers.
 *
 * Response keys:
 *   total_reviewers    - reviewer-info rows with state = 0
 *   has_field          - of those, rows with a non-empty `field`
 *   has_major_to_user  - grouped count of user_id in major_to_user (state = 0)
 *   has_field_no_major - rows with a non-empty `field` whose reviewer_id does
 *                        not appear as user_id in major_to_user (state = 0)
 */
public function statistics()
{
    $totalReviewers = $this->user_reviewer_info_obj
        ->where('state', 0)
        ->count();

    $hasField = $this->user_reviewer_info_obj
        ->where('state', 0)
        ->where('field', '<>', '')
        ->count();

    $hasMajorToUser = Db::name('major_to_user')
        ->where('state', 0)
        ->group('user_id')
        ->count();

    // The sub-select is passed as a closure. NOTE(review): ThinkPHP's builder
    // treats a plain string given to IN / NOT IN as a comma-separated list of
    // literal values, so a buildSql() string would be compared as text
    // instead of being executed as a sub-query - confirm against the
    // framework version in use.
    $hasFieldNoMajor = $this->user_reviewer_info_obj
        ->alias('ri')
        ->where('ri.state', 0)
        ->where('ri.field', '<>', '')
        ->where('ri.reviewer_id', 'not in', function ($query) {
            $query->name('major_to_user')->where('state', 0)->field('user_id');
        })
        ->count();

    return jsonSuccess([
        'total_reviewers'    => $totalReviewers,
        'has_field'          => $hasField,
        'has_major_to_user'  => $hasMajorToUser,
        'has_field_no_major' => $hasFieldNoMajor,
    ]);
}
}