1475 lines
50 KiB
PHP
1475 lines
50 KiB
PHP
<?php
|
||
|
||
namespace app\api\controller;
|
||
|
||
use think\Db;
|
||
use think\Cache;
|
||
use think\Env;
|
||
|
||
class Agent extends Base
|
||
{
|
||
|
||
/**
 * Create the controller by handing the request straight to the Base constructor.
 *
 * @param \think\Request|null $request Request instance, forwarded unchanged.
 */
public function __construct(\think\Request $request = null)
{
    // No controller-specific setup: everything is delegated to the parent.
    parent::__construct($request);
}
|
||
|
||
/**
 * Collect every leaf major together with its full ancestor path (the vocabulary offered to the AI).
 *
 * Only rows with major_state = 0 and major_type = 0 are considered. A major is a leaf
 * when no other selected row names it as its `pid`. The result is cached under
 * 'agent_major_tree' (expiry argument 3600).
 *
 * @return array List of ['major_id' => ..., 'major_title' => ..., 'full_path' => ...].
 */
private function getMajorTree()
{
    $cacheKey = 'agent_major_tree';
    $cached = Cache::get($cacheKey);
    if ($cached) {
        return $cached;
    }

    $allMajors = $this->major_obj
        ->where('major_state', 0)
        ->where('major_type', 0)
        ->select();

    // One pass builds both the id => row lookup and the set of ids used as a parent,
    // so leaf detection below is a constant-time lookup instead of a nested scan.
    $majorMap = [];
    $parentIds = [];
    foreach ($allMajors as $m) {
        $majorMap[$m['major_id']] = $m;
        $parentIds[(string)$m['pid']] = true;
    }

    $result = [];
    foreach ($allMajors as $m) {
        if (isset($parentIds[(string)$m['major_id']])) {
            // Some other major hangs below this one: not a leaf.
            continue;
        }
        $result[] = [
            'major_id' => $m['major_id'],
            'major_title' => $m['major_title'],
            'full_path' => $this->buildMajorPath($m['major_id'], $majorMap),
        ];
    }

    Cache::set($cacheKey, $result, 3600);
    return $result;
}
|
||
|
||
/**
 * Build the " > "-separated title path from the topmost known ancestor down to a major.
 *
 * The walk stops at a row whose pid is 0 or 1, at a pid missing from the map, or
 * when an id is met a second time. The last case protects against cyclic pid data,
 * which previously caused unbounded recursion.
 *
 * @param mixed $majorId  Id of the major whose path is wanted.
 * @param array $majorMap Lookup of major_id => row ('pid', 'major_title'); taken by
 *                        reference for signature compatibility, never modified.
 * @return string The path, or '' when $majorId is not in the map.
 */
private function buildMajorPath($majorId, &$majorMap)
{
    $titles = [];
    $visited = [];
    $currentId = $majorId;

    while (isset($majorMap[$currentId]) && !isset($visited[$currentId])) {
        $visited[$currentId] = true;
        $node = $majorMap[$currentId];
        $titles[] = $node['major_title'];

        $parentId = $node['pid'];
        if ($parentId == 0 || $parentId == 1) {
            // pid 0 / 1 mark the top of the hierarchy.
            break;
        }
        $currentId = $parentId;
    }

    // Titles were gathered leaf-first; the path reads root-first.
    return implode(' > ', array_reverse($titles));
}
|
||
|
||
/**
 * Render the major list as prompt text for the AI: one "ID:<id> - <full path>" entry per line.
 *
 * @param array $majorTree Items carrying 'major_id' and 'full_path'.
 * @return string Entries joined by "\n" (no trailing newline); '' for an empty list.
 */
private function buildMajorListPrompt($majorTree)
{
    $entries = [];
    foreach ($majorTree as $node) {
        $entries[] = 'ID:' . $node['major_id'] . ' - ' . $node['full_path'];
    }

    return implode("\n", $entries);
}
|
||
|
||
/**
 * Ask the chat-completions endpoint to map a free-text research field onto standard major ids.
 *
 * The model is told to answer with a bare JSON array of ids. The first "[...]" made
 * only of digits, commas and whitespace is extracted from the reply and decoded.
 * Any failure (payload encoding, transport error, unexpected response shape,
 * unparsable reply) yields an empty array.
 *
 * @param string $field           User-supplied description of the research field.
 * @param string $majorListPrompt Candidate list, one "ID:<id> - <path>" per line.
 * @return int[] Distinct positive ids in the order the model returned them; [] when nothing usable came back.
 */
private function matchFieldToMajor($field, $majorListPrompt)
{
    $systemPrompt = "你是一位医学领域分类专家。用户会提供一段研究领域的描述文本,你需要从给定的标准领域列表中找出最匹配的1-3个领域。\n"
        . "请严格按照JSON数组格式返回匹配结果,只返回major_id数组,如 [12,34,56]。\n"
        . "如果没有合适的匹配,返回空数组 []。\n"
        . "不要返回任何其他内容,只返回JSON数组。\n\n"
        . "标准领域列表:\n" . $majorListPrompt;

    $userPrompt = "请为以下研究领域描述匹配最合适的标准领域ID:\n" . $field;

    $messages = [
        ['role' => 'system', 'content' => $systemPrompt],
        ['role' => 'user', 'content' => $userPrompt],
    ];

    $apiKey = Env::get("gpt.api_key1", Env::get("gpt.api_key", ""));
    $url = 'http://chat.taimed.cn/v1/chat/completions';

    $data = [
        'model' => 'gpt-4.1',
        'messages' => $messages,
        'temperature' => 0.1,
        'max_tokens' => 200,
    ];

    // json_encode() returns false for e.g. invalid UTF-8 in $field; never post that.
    $payload = json_encode($data);
    if ($payload === false) {
        return [];
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HTTPHEADER, [
        'Content-Type: application/json',
        'Authorization: Bearer ' . $apiKey,
    ]);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // NOTE(review): verification is switched off and the endpoint is plain http, so the
    // API key travels unencrypted. Consider moving the endpoint to https and dropping this.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);

    $result = curl_exec($ch);
    $failed = curl_errno($ch) !== 0;
    curl_close($ch);
    if ($failed) {
        return [];
    }

    $res = json_decode($result, true);
    if (!isset($res['choices'][0]['message']['content'])) {
        return [];
    }

    $content = trim($res['choices'][0]['message']['content']);
    // Pull out the JSON array even if the model wrapped it in extra text.
    if (preg_match('/\[[\d,\s]*\]/', $content, $matches)) {
        $ids = json_decode($matches[0], true);
        if (is_array($ids)) {
            // Normalise to distinct positive integers: the model may repeat an id,
            // and a repeated id would otherwise be stored twice downstream.
            $ids = array_filter(array_map('intval', $ids), function ($id) {
                return $id > 0;
            });
            return array_values(array_unique($ids));
        }
    }

    return [];
}
|
||
|
||
/**
 * Store user-to-major links that the user does not already have.
 *
 * Existing links are those rows of the major_to_user model with state = 0 for the
 * user. Input ids are de-duplicated first, so a repeated id is inserted only once.
 *
 * @param int   $userId   User receiving the majors.
 * @param array $majorIds Candidate major ids (duplicates tolerated).
 * @return int Number of rows inserted.
 */
private function saveMajorToUser($userId, $majorIds)
{
    $existing = $this->major_to_user_obj
        ->where('user_id', $userId)
        ->where('state', 0)
        ->column('major_id');

    // array_diff() alone keeps duplicates of the left operand, hence array_unique().
    $toInsert = array_diff(array_unique($majorIds), $existing);
    foreach ($toInsert as $majorId) {
        $this->major_to_user_obj->insert([
            'user_id' => $userId,
            'major_id' => $majorId,
            'ctime' => time(),
        ]);
    }

    return count($toInsert);
}
|
||
|
||
/**
 * Match one reviewer's free-text field to standard majors and return the combined list.
 *
 * Nothing is written: the AI suggestions that exist with major_state = 0 are merged
 * with the majors the user already has (state = 0 links), and the merged rows are
 * returned, each extended with 'shu' and 'str' from getMajorShu() / getMajorStr().
 *
 * Request parameters:
 * @param int    user_id Reviewer id (required).
 * @param string field   Research-field description (required, must not be blank).
 */
public function processOneUser()
{
    $data = $this->request->param();
    if (!isset($data['user_id']) || $data['user_id'] == '') {
        return jsonError('user_id不能为空');
    }
    // Validate the trimmed value so a whitespace-only field is rejected here
    // instead of being sent to the AI.
    $field = isset($data['field']) ? trim($data['field']) : '';
    if ($field === '') {
        return jsonError('field不能为空');
    }

    $userId = intval($data['user_id']);
    $reviewerInfo = $this->user_reviewer_info_obj
        ->where('reviewer_id', $userId)
        ->where('state', 0)
        ->find();

    if (!$reviewerInfo) {
        return jsonError('未找到该用户的reviewer信息');
    }

    $majorTree = $this->getMajorTree();
    if (empty($majorTree)) {
        return jsonError('未获取到标准领域数据');
    }

    $majorListPrompt = $this->buildMajorListPrompt($majorTree);
    $matchedIds = $this->matchFieldToMajor($field, $majorListPrompt);

    if (empty($matchedIds)) {
        return jsonSuccess([
            'user_id' => $userId,
            'field' => $field,
            'matched_ids' => [],
            'inserted' => 0,
            'msg' => 'AI未匹配到合适的领域',
        ]);
    }

    // Keep only suggested ids that really exist as active majors.
    $validMajors = $this->major_obj
        ->where('major_id', 'in', $matchedIds)
        ->where('major_state', 0)
        ->column('major_id');
    $existing = $this->major_to_user_obj
        ->where('user_id', $userId)
        ->where('state', 0)
        ->column('major_id');

    $unionArray = array_values(array_unique(array_merge($validMajors, $existing)));
    if (empty($unionArray)) {
        // Nothing valid and nothing pre-existing: skip the IN query on an empty list.
        return jsonSuccess([
            'user_id' => $userId,
            'field' => $field,
            'majors' => [],
        ]);
    }

    $ms = $this->major_obj
        ->where('major_id', 'in', $unionArray)
        ->where('major_state', 0)
        ->select();

    foreach ($ms as $k => $major) {
        $ms[$k]['shu'] = getMajorShu($major['major_id']);
        $ms[$k]['str'] = getMajorStr($major['major_id']);
    }

    return jsonSuccess([
        'user_id' => $userId,
        'field' => $field,
        'majors' => $ms,
    ]);
}
|
||
|
||
/**
 * Batch job: take reviewers that have a field text, match each with the AI and store the result.
 *
 * Request parameters:
 * @param int limit          Users per call; default 10, clamped to the range 1..50.
 * @param int skip_has_major Non-zero (default 1) skips users that already have a
 *                           state = 0 row in major_to_user.
 */
public function batchProcess()
{
    $data = $this->request->param();
    $limit = isset($data['limit']) ? intval($data['limit']) : 10;
    $skipHasMajor = isset($data['skip_has_major']) ? intval($data['skip_has_major']) : 1;

    // Clamp on both sides: intval() of junk input is 0 and negative values are
    // possible; neither must reach the LIMIT clause.
    $limit = max(1, min(50, $limit));

    $query = $this->user_reviewer_info_obj
        ->alias('ri')
        ->field('ri.reviewer_id, ri.field')
        ->where('ri.state', 0)
        ->where('ri.field', '<>', '');

    if ($skipHasMajor) {
        $subQuery = Db::name('major_to_user')->where('state', 0)->field('user_id')->buildSql();
        $query = $query->where('ri.reviewer_id', 'not in', $subQuery);
    }

    $users = $query->limit($limit)->select();

    if (empty($users)) {
        return jsonSuccess([
            'processed' => 0,
            'msg' => '没有需要处理的用户',
        ]);
    }

    $majorTree = $this->getMajorTree();
    if (empty($majorTree)) {
        return jsonError('未获取到标准领域数据');
    }
    $majorListPrompt = $this->buildMajorListPrompt($majorTree);

    // Ids the AI returns are only trusted when they exist as active majors.
    $validMajorIds = $this->major_obj->where('major_state', 0)->column('major_id');

    $results = [];
    $successCount = 0;
    $failCount = 0;

    foreach ($users as $user) {
        $field = trim($user['field']);
        if ($field == '') {
            // Whitespace-only field: nothing to match, not counted as processed.
            continue;
        }

        $matchedIds = $this->matchFieldToMajor($field, $majorListPrompt);
        $matchedIds = array_intersect($matchedIds, $validMajorIds);

        if (!empty($matchedIds)) {
            $inserted = $this->saveMajorToUser($user['reviewer_id'], $matchedIds);
            $results[] = [
                'user_id' => $user['reviewer_id'],
                'field' => mb_substr($field, 0, 100),
                'matched_ids' => array_values($matchedIds),
                'inserted' => $inserted,
            ];
            $successCount++;
        } else {
            $results[] = [
                'user_id' => $user['reviewer_id'],
                'field' => mb_substr($field, 0, 100),
                'matched_ids' => [],
                'inserted' => 0,
            ];
            $failCount++;
        }
    }

    return jsonSuccess([
        'processed' => count($results),
        'success_count' => $successCount,
        'fail_count' => $failCount,
        'details' => $results,
    ]);
}
|
||
|
||
/**
 * Debug endpoint: expose the leaf-major list produced by getMajorTree().
 *
 * Responds with the number of entries ('total') and the entries themselves ('list').
 */
public function getMajorList()
{
    $leaves = $this->getMajorTree();
    $payload = [
        'total' => count($leaves),
        'list' => $leaves,
    ];

    return jsonSuccess($payload);
}
|
||
|
||
/**
 * Read public/system/t_major.xlsx and return a preview of its contents.
 *
 * Despite the name, nothing is written to the database: the active sheet is read,
 * row 1 is used as column headers, and the response holds the headers, the number
 * of data rows and the first 20 rows keyed by header.
 *
 * A blank header cell is replaced by its column letter so several blank headers do
 * not overwrite one another in the per-row arrays. Reader failures are reported as
 * a JSON error instead of an uncaught exception.
 */
public function importMajorFromExcel()
{
    $file = ROOT_PATH . 'public' . DS . 'system' . DS . 't_major.xlsx';
    if (!file_exists($file)) {
        return jsonError('Excel文件不存在: public/system/t_major.xlsx');
    }

    try {
        $spreadsheet = \PhpOffice\PhpSpreadsheet\IOFactory::load($file);
        $sheet = $spreadsheet->getActiveSheet();
    } catch (\Exception $e) {
        return jsonError('Excel文件读取失败: ' . $e->getMessage());
    }

    $highestRow = $sheet->getHighestRow();
    $highestColumn = $sheet->getHighestColumn();
    $colCount = \PhpOffice\PhpSpreadsheet\Cell\Coordinate::columnIndexFromString($highestColumn);

    $headers = [];
    for ($col = 1; $col <= $colCount; $col++) {
        $header = $sheet->getCellByColumnAndRow($col, 1)->getValue();
        if ($header === null || (is_string($header) && trim($header) === '')) {
            // Fall back to the column letter (A, B, ...) for unnamed columns.
            $header = \PhpOffice\PhpSpreadsheet\Cell\Coordinate::stringFromColumnIndex($col);
        }
        $headers[$col] = $header;
    }

    $rows = [];
    for ($row = 2; $row <= $highestRow; $row++) {
        $rowData = [];
        for ($col = 1; $col <= $colCount; $col++) {
            $rowData[$headers[$col]] = $sheet->getCellByColumnAndRow($col, $row)->getValue();
        }
        $rows[] = $rowData;
    }

    return jsonSuccess([
        'headers' => array_values($headers),
        'total' => count($rows),
        'preview' => array_slice($rows, 0, 20),
    ]);
}
|
||
|
||
// ========== CrossRef DOI lookup & retraction detection ==========
|
||
|
||
/**
 * Normalise a DOI by stripping a resolver URL prefix and/or a "doi:" label.
 *
 * Handles http(s)://doi.org/ and http(s)://dx.doi.org/ in any letter case, then a
 * leading "doi:" (case-insensitive, optional spaces after the colon).
 *
 * @param mixed $doi Raw DOI text; null is treated as an empty string.
 * @return string Bare DOI, trimmed; '' when nothing remains.
 */
private function cleanDoi($doi)
{
    $doi = trim((string)$doi);
    $doi = preg_replace('/^https?:\/\/(?:dx\.)?doi\.org\//i', '', $doi);
    $doi = preg_replace('/^doi:\s*/i', '', $doi);
    return trim($doi);
}
|
||
|
||
/**
 * Fetch the raw `message` object for one DOI from the CrossRef works endpoint.
 *
 * @param string $doi Bare DOI (already cleaned).
 * @return array ['success' => true, 'message' => array] on success, otherwise
 *               ['success' => false, 'error' => string] for transport errors,
 *               HTTP 404, any other non-200 status, or a body without 'message'.
 */
private function fetchCrossRefData($doi)
{
    // rawurlencode() is the path-segment encoder: unlike urlencode() it encodes a
    // space as %20 rather than '+', which is only meaningful in query strings.
    $url = 'https://api.crossref.org/works/' . rawurlencode($doi);

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Certificate checks stay on: retraction data fetched over an unverified
    // connection could be tampered with in transit. If this fails on a host
    // without a CA bundle, configure curl.cainfo rather than disabling the check.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_HTTPHEADER, [
        'User-Agent: TMRJournals/1.0 (mailto:publisher@tmrjournals.com)',
        'Accept: application/json',
    ]);

    $result = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    if (curl_errno($ch)) {
        $error = curl_error($ch);
        curl_close($ch);
        return ['success' => false, 'error' => 'CURL错误: ' . $error];
    }
    curl_close($ch);

    if ($httpCode == 404) {
        return ['success' => false, 'error' => 'DOI在CrossRef中未找到'];
    }
    if ($httpCode != 200) {
        return ['success' => false, 'error' => 'CrossRef返回HTTP ' . $httpCode];
    }

    $data = json_decode($result, true);
    if (!isset($data['message'])) {
        return ['success' => false, 'error' => 'CrossRef返回数据格式异常'];
    }

    return ['success' => true, 'message' => $data['message']];
}
|
||
|
||
/**
 * Turn a CrossRef date object into a date string using its first `date-parts` entry.
 *
 * @param mixed $dateObj Structure expected to hold ['date-parts' => [[year, month?, day?]]].
 * @return string "Y-MM-DD", "Y-MM" or "Y" depending on which parts are present;
 *                '' when there is no first date-parts entry.
 */
private function parseDateParts($dateObj)
{
    $parts = $dateObj['date-parts'][0] ?? null;
    if ($parts === null) {
        return '';
    }

    $year = $parts[0] ?? '';
    $month = isset($parts[1]) ? sprintf('%02d', $parts[1]) : '';
    $day = isset($parts[2]) ? sprintf('%02d', $parts[2]) : '';

    // Without both a year and a month only the (possibly empty) year is reported.
    if (!$year || !$month) {
        return (string)$year;
    }

    return $day ? "{$year}-{$month}-{$day}" : "{$year}-{$month}";
}
|
||
|
||
/**
 * Normalise a CrossRef author list into a uniform structure.
 *
 * Each entry gets 'given', 'family', 'name', 'ORCID', 'sequence' (all '' when
 * absent) and 'affiliation' (list of affiliation names). 'name' is the source
 * 'name' when present, otherwise "<given> <family>" built from whichever exist.
 *
 * @param mixed $authorList CrossRef `author` value.
 * @return array Normalised authors; [] for an empty or non-array input.
 */
private function parseAuthors($authorList)
{
    if (!is_array($authorList) || empty($authorList)) {
        return [];
    }

    $authors = [];
    foreach ($authorList as $raw) {
        if (isset($raw['name'])) {
            $displayName = $raw['name'];
        } else {
            $givenPrefix = isset($raw['given']) ? $raw['given'] . ' ' : '';
            $displayName = $givenPrefix . ($raw['family'] ?? '');
        }

        $affiliations = [];
        if (isset($raw['affiliation']) && is_array($raw['affiliation'])) {
            foreach ($raw['affiliation'] as $affiliation) {
                $affiliations[] = $affiliation['name'] ?? '';
            }
        }

        $authors[] = [
            'given' => $raw['given'] ?? '',
            'family' => $raw['family'] ?? '',
            'name' => $displayName,
            'ORCID' => $raw['ORCID'] ?? '',
            'sequence' => $raw['sequence'] ?? '',
            'affiliation' => $affiliations,
        ];
    }

    return $authors;
}
|
||
|
||
/**
 * Decide from a CrossRef message whether the work is retracted or is a retraction notice.
 *
 * Signals checked, any of which sets is_retracted:
 *  1. `update-to` / `updated-by` entries whose type or label contains "retract".
 *     Detail keys: 'retraction_notice' (from update-to) and 'retracted_by' (from updated-by).
 *     NOTE(review): `updated-by` is assumed to be the field CrossRef puts on the
 *     updated work itself; confirm against the current works schema.
 *  2. `type` / `subtype` containing "retract" (detail 'is_retraction_notice').
 *  3. A `relation` key containing "retract" (detail 'relation').
 *  4. A title carrying a retraction/withdrawal marker (detail 'title_keyword').
 *     Only notice-style markers count ("RETRACTED: ...", "Retraction notice to ...",
 *     "... (Retracted)"). A bare substring match would flag ordinary titles such as
 *     "opioid withdrawal" or "clot retraction".
 *
 * @param array $message CrossRef `message` object.
 * @return array ['is_retracted' => bool, 'retraction_detail' => array]
 */
private function detectRetraction($message)
{
    $isRetracted = false;
    $retractionDetail = [];

    // 1. Update relations, in both directions.
    $updateFields = ['update-to' => 'retraction_notice', 'updated-by' => 'retracted_by'];
    foreach ($updateFields as $fieldName => $detailKey) {
        if (!isset($message[$fieldName]) || !is_array($message[$fieldName])) {
            continue;
        }
        foreach ($message[$fieldName] as $update) {
            if (!is_array($update)) {
                continue;
            }
            $updateType = strtolower($update['type'] ?? '');
            $updateLabel = strtolower($update['label'] ?? '');
            if (strpos($updateType, 'retract') !== false || strpos($updateLabel, 'retract') !== false) {
                $isRetracted = true;
                $retractionDetail[$detailKey] = [
                    'type' => $update['type'] ?? '',
                    'label' => $update['label'] ?? '',
                    'DOI' => $update['DOI'] ?? '',
                    'date' => isset($update['updated']) ? $this->parseDateParts($update['updated']) : '',
                ];
                break;
            }
        }
    }

    // 2. Work type / subtype.
    $type = strtolower($message['type'] ?? '');
    $subtype = strtolower($message['subtype'] ?? '');
    if (strpos($type, 'retract') !== false || strpos($subtype, 'retract') !== false) {
        $isRetracted = true;
        $retractionDetail['is_retraction_notice'] = true;
    }

    // 3. Relation keys.
    if (isset($message['relation']) && is_array($message['relation'])) {
        foreach ($message['relation'] as $relType => $relations) {
            if (strpos(strtolower($relType), 'retract') !== false) {
                $isRetracted = true;
                $retractionDetail['relation'] = [$relType => $relations];
                break;
            }
        }
    }

    // 4. Title markers.
    $markerPatterns = [
        // "RETRACTED: ...", "[Retracted] ...", "Withdrawn: ...", "Retracted article: ..."
        '/^\W*(?:retracted|withdrawn)\b/i',
        // "Retraction: ...", "Retraction notice to ...", "Withdrawal notice: ..."
        '/^\W*(?:retraction|withdrawal)\s*(?::|\s(?:notice|note|statement)\b)/i',
        // "Notice of retraction: ...", "Statement of Retraction"
        '/^\W*(?:notice|note|statement)\s+of\s+(?:retraction|withdrawal)\b/i',
        // "... (Retracted)", "... [Retracted article]"
        '/[\[(]\s*(?:retracted|withdrawn)(?:\s+article)?\s*[\])]/i',
    ];
    foreach ((array)($message['title'] ?? []) as $title) {
        if (!is_string($title)) {
            continue;
        }
        $flagged = false;
        foreach ($markerPatterns as $pattern) {
            if (preg_match($pattern, $title)) {
                $flagged = true;
                break;
            }
        }
        if ($flagged) {
            $isRetracted = true;
            $retractionDetail['title_keyword'] = $title;
            break;
        }
    }

    return ['is_retracted' => $isRetracted, 'retraction_detail' => $retractionDetail];
}
|
||
|
||
/**
 * Flatten a CrossRef `message` object into the structured record returned by the DOI endpoints.
 *
 * Missing fields fall back to '' / [] / 0 as appropriate. Dates are rendered through
 * parseDateParts(), authors through parseAuthors(), and the retraction verdict
 * comes from detectRetraction().
 *
 * @param string $doi     DOI the message was fetched for.
 * @param array  $message CrossRef `message` object.
 * @return array Structured metadata (key order is part of the JSON output).
 */
private function parseCrossRefMessage($doi, $message)
{
    // Authors
    $authors = $this->parseAuthors($message['author'] ?? []);

    // Journal / container
    $journal = [
        'title' => $message['container-title'][0] ?? '',
        'short_title'=> $message['short-container-title'][0] ?? '',
        'ISSN' => $message['ISSN'] ?? [],
        'publisher' => $message['publisher'] ?? '',
    ];

    // Dates: output key => CrossRef key
    $dateKeys = [
        'published_print' => 'published-print',
        'published_online' => 'published-online',
        'published' => 'published',
        'created' => 'created',
        'deposited' => 'deposited',
        'indexed' => 'indexed',
    ];
    $dates = [];
    foreach ($dateKeys as $outKey => $srcKey) {
        $dates[$outKey] = isset($message[$srcKey]) ? $this->parseDateParts($message[$srcKey]) : '';
    }

    // Year: first usable year in order of preference
    $year = '';
    foreach (['published-print', 'published-online', 'published', 'created'] as $srcKey) {
        $candidate = $message[$srcKey]['date-parts'][0][0] ?? null;
        if ($candidate) {
            $year = (string)$candidate;
            break;
        }
    }

    // Funding
    $funders = [];
    if (isset($message['funder']) && is_array($message['funder'])) {
        foreach ($message['funder'] as $funder) {
            $funders[] = [
                'name' => $funder['name'] ?? '',
                'DOI' => $funder['DOI'] ?? '',
                'award' => $funder['award'] ?? [],
            ];
        }
    }

    // Licenses
    $licenses = [];
    if (isset($message['license']) && is_array($message['license'])) {
        foreach ($message['license'] as $license) {
            $licenses[] = [
                'URL' => $license['URL'] ?? '',
                'start_date' => isset($license['start']) ? $this->parseDateParts($license['start']) : '',
            ];
        }
    }

    // Retraction verdict
    $retraction = $this->detectRetraction($message);

    // update-to: every update relation (errata, corrections, retraction notices, ...)
    $updates = [];
    if (isset($message['update-to']) && is_array($message['update-to'])) {
        foreach ($message['update-to'] as $entry) {
            $updates[] = [
                'type' => $entry['type'] ?? '',
                'label' => $entry['label'] ?? '',
                'DOI' => $entry['DOI'] ?? '',
                'date' => isset($entry['updated']) ? $this->parseDateParts($entry['updated']) : '',
            ];
        }
    }

    return [
        // Basics
        'doi' => $doi,
        'url' => $message['URL'] ?? ('https://doi.org/' . $doi),
        'type' => $message['type'] ?? '',
        'title' => $message['title'][0] ?? '',
        'authors' => $authors,
        'author_string' => implode(', ', array_column($authors, 'name')),
        'journal' => $journal,
        // Volume / issue / pages
        'volume' => $message['volume'] ?? '',
        'issue' => $message['issue'] ?? '',
        'page' => $message['page'] ?? '',
        'article_number' => $message['article-number'] ?? '',
        'dates' => $dates,
        'year' => $year,
        'abstract' => $message['abstract'] ?? '',
        'subject' => $message['subject'] ?? [],
        // Citation counts
        'references_count' => $message['references-count'] ?? 0,
        'is_referenced_by_count' => $message['is-referenced-by-count'] ?? 0,
        'funders' => $funders,
        'licenses' => $licenses,
        'is_retracted' => $retraction['is_retracted'],
        'retraction_detail' => $retraction['retraction_detail'],
        'updates' => $updates,
        'relation' => $message['relation'] ?? [],
        'score' => $message['score'] ?? 0,
    ];
}
|
||
|
||
/**
 * Look up a single DOI and return its structured CrossRef metadata.
 *
 * Request parameters:
 * @param string doi Article DOI; a doi.org URL or "doi:" prefix is accepted.
 */
public function queryDoi()
{
    $data = $this->request->param();
    if (!isset($data['doi']) || trim($data['doi']) == '') {
        return jsonError('doi不能为空');
    }

    $doi = $this->cleanDoi($data['doi']);
    if ($doi === '') {
        // Input such as "doi:" is empty once cleaned; querying would hit the
        // works listing endpoint instead of a single work.
        return jsonError('doi不能为空');
    }

    $res = $this->fetchCrossRefData($doi);
    if (!$res['success']) {
        return jsonError($res['error']);
    }

    return jsonSuccess($this->parseCrossRefMessage($doi, $res['message']));
}
|
||
|
||
/**
 * Look up several DOIs and return structured metadata for each.
 *
 * Entries are processed in input order. A failed lookup yields
 * ['doi', 'success' => false, 'error']; a successful one yields the parsed record
 * plus 'success' => true. Requests are spaced 200 ms apart.
 *
 * Request parameters:
 * @param string dois Comma-separated DOI list (at most 50 entries).
 */
public function batchQueryDois()
{
    $data = $this->request->param();
    if (!isset($data['dois']) || trim($data['dois']) == '') {
        return jsonError('dois不能为空');
    }

    $doiList = array_filter(array_map('trim', explode(',', $data['dois'])));
    if (empty($doiList)) {
        return jsonError('未提供有效的DOI');
    }
    if (count($doiList) > 50) {
        return jsonError('单次最多查询50个DOI');
    }

    $results = [];
    $retractedCount = 0;
    $requested = false;

    foreach ($doiList as $rawDoi) {
        $doi = $this->cleanDoi($rawDoi);
        if ($doi === '') {
            // Only a prefix was supplied (e.g. "doi:"): report it, do not query.
            $results[] = ['doi' => $rawDoi, 'success' => false, 'error' => '无效的DOI'];
            continue;
        }

        // Pause between consecutive requests only, not after the final one.
        if ($requested) {
            usleep(200000);
        }
        $res = $this->fetchCrossRefData($doi);
        $requested = true;

        if (!$res['success']) {
            $results[] = ['doi' => $doi, 'success' => false, 'error' => $res['error']];
            continue;
        }

        $parsed = $this->parseCrossRefMessage($doi, $res['message']);
        $parsed['success'] = true;
        if ($parsed['is_retracted']) {
            $retractedCount++;
        }
        $results[] = $parsed;
    }

    return jsonSuccess([
        'total' => count($results),
        'retracted_count' => $retractedCount,
        'list' => $results,
    ]);
}
|
||
|
||
/**
 * Check every reference of a production article that has a DOI.
 *
 * Each reference (state = 0, non-empty refer_doi) is looked up in CrossRef; the
 * response carries the parsed metadata or the lookup error per reference, plus
 * totals for checked, retracted and failed lookups. The response has the same
 * keys whether or not any references were found.
 *
 * Request parameters:
 * @param int p_article_id Production article id.
 */
public function checkArticleReferences()
{
    $data = $this->request->param();
    if (!isset($data['p_article_id']) || $data['p_article_id'] == '') {
        return jsonError('p_article_id不能为空');
    }

    $pArticleId = intval($data['p_article_id']);
    $refers = $this->production_article_refer_obj
        ->where('p_article_id', $pArticleId)
        ->where('state', 0)
        ->where('refer_doi', '<>', '')
        ->select();

    if (empty($refers)) {
        return jsonSuccess([
            'p_article_id' => $pArticleId,
            'total_checked' => 0,
            'retracted_count' => 0,
            'error_count' => 0,
            'list' => [],
        ]);
    }

    $list = [];
    $retractedCount = 0;
    $errorCount = 0;
    $requested = false;

    foreach ($refers as $refer) {
        $doi = $this->cleanDoi($refer['refer_doi']);
        if ($doi == '') {
            continue;
        }

        $item = [
            'p_refer_id' => $refer['p_refer_id'],
            'index' => $refer['index'],
            'refer_doi' => $doi,
            'refer_content' => $refer['refer_content'] ?? '',
        ];

        // Space consecutive CrossRef calls by 200 ms; no pause after the last one.
        if ($requested) {
            usleep(200000);
        }
        $res = $this->fetchCrossRefData($doi);
        $requested = true;

        if (!$res['success']) {
            $item['crossref_success'] = false;
            $item['crossref_error'] = $res['error'];
            $errorCount++;
        } else {
            $parsed = $this->parseCrossRefMessage($doi, $res['message']);
            $item['crossref_success'] = true;
            $item['crossref'] = $parsed;
            if ($parsed['is_retracted']) {
                $retractedCount++;
            }
        }

        $list[] = $item;
    }

    return jsonSuccess([
        'p_article_id' => $pArticleId,
        'total_checked' => count($list),
        'retracted_count' => $retractedCount,
        'error_count' => $errorCount,
        'list' => $list,
    ]);
}
|
||
|
||
/**
 * Check an article's references, addressed by article_id rather than production id.
 *
 * Resolves the production record (state = 0) for the article, then looks up every
 * reference with a DOI in CrossRef. The response reports `error_count` alongside
 * the other totals, matching checkArticleReferences().
 *
 * Request parameters:
 * @param int article_id Article id.
 */
public function checkReferencesByArticleId()
{
    $data = $this->request->param();
    if (!isset($data['article_id']) || $data['article_id'] == '') {
        return jsonError('article_id不能为空');
    }

    $articleId = intval($data['article_id']);
    $pInfo = $this->production_article_obj
        ->where('article_id', $articleId)
        ->where('state', 0)
        ->find();

    if (!$pInfo) {
        return jsonError('未找到该文章的生产信息');
    }

    $refers = $this->production_article_refer_obj
        ->where('p_article_id', $pInfo['p_article_id'])
        ->where('state', 0)
        ->where('refer_doi', '<>', '')
        ->select();

    if (empty($refers)) {
        return jsonSuccess([
            'article_id' => $articleId,
            'p_article_id' => $pInfo['p_article_id'],
            'total_checked' => 0,
            'retracted_count' => 0,
            'error_count' => 0,
            'list' => [],
        ]);
    }

    $list = [];
    $retractedCount = 0;
    $errorCount = 0;
    $requested = false;

    foreach ($refers as $refer) {
        $doi = $this->cleanDoi($refer['refer_doi']);
        if ($doi == '') {
            continue;
        }

        $item = [
            'p_refer_id' => $refer['p_refer_id'],
            'index' => $refer['index'],
            'refer_doi' => $doi,
            'refer_content' => $refer['refer_content'] ?? '',
        ];

        // Space consecutive CrossRef calls by 200 ms; no pause after the last one.
        if ($requested) {
            usleep(200000);
        }
        $res = $this->fetchCrossRefData($doi);
        $requested = true;

        if (!$res['success']) {
            $item['crossref_success'] = false;
            $item['crossref_error'] = $res['error'];
            $errorCount++;
        } else {
            $parsed = $this->parseCrossRefMessage($doi, $res['message']);
            $item['crossref_success'] = true;
            $item['crossref'] = $parsed;
            if ($parsed['is_retracted']) {
                $retractedCount++;
            }
        }

        $list[] = $item;
    }

    return jsonSuccess([
        'article_id' => $articleId,
        'p_article_id' => $pInfo['p_article_id'],
        'total_checked' => count($list),
        'retracted_count' => $retractedCount,
        'error_count' => $errorCount,
        'list' => $list,
    ]);
}
|
||
|
||
// ========== Scraping corresponding-author e-mail addresses from DOI landing pages ==========
|
||
|
||
/**
 * Resolve a DOI through doi.org and download the publisher landing page it redirects to.
 *
 * @param string $doi Bare DOI.
 * @return array ['success' => true, 'html' => string, 'final_url' => string] on HTTP 200,
 *               otherwise ['success' => false, 'error' => string].
 */
private function fetchPageByDoi($doi)
{
    // Percent-encode each path segment so characters such as '#', '?', '<' or spaces
    // inside a DOI cannot change the meaning of the URL; '/' separators are kept.
    $url = 'https://doi.org/' . implode('/', array_map('rawurlencode', explode('/', $doi)));

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
    // The DOI is caller-supplied and redirects are followed, so only web protocols
    // are allowed as redirect targets (no file://, gopher://, ...).
    curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    // Keep certificate verification on; if a host lacks a CA bundle, configure
    // curl.cainfo instead of disabling the check.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
    curl_setopt($ch, CURLOPT_HTTPHEADER, [
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language: en-US,en;q=0.5',
    ]);

    $html = curl_exec($ch);
    $finalUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    if (curl_errno($ch)) {
        $error = curl_error($ch);
        curl_close($ch);
        return ['success' => false, 'error' => 'CURL错误: ' . $error];
    }
    curl_close($ch);

    if ($httpCode != 200) {
        return ['success' => false, 'error' => 'HTTP ' . $httpCode];
    }

    return ['success' => true, 'html' => $html, 'final_url' => $finalUrl];
}
|
||
|
||
/**
 * Identify the publisher from the host of the final landing-page URL.
 *
 * The host must equal a known domain or be a sub-domain of it. A plain substring
 * test would misclassify unrelated hosts ("soup.com" contains "oup.com",
 * "iacs.org" contains "acs.org").
 *
 * @param string $url Final URL after redirects.
 * @return string Publisher key (e.g. 'springer', 'wiley'), or 'unknown'.
 */
private function detectPublisher($url)
{
    $host = strtolower(parse_url($url, PHP_URL_HOST) ?: '');
    if ($host === '') {
        return 'unknown';
    }

    // onlinelibrary.wiley.com is covered by the wiley.com entry.
    $map = [
        'mdpi.com' => 'mdpi',
        'springer.com' => 'springer',
        'springerlink.com' => 'springer',
        'nature.com' => 'springer',
        'biomedcentral.com' => 'springer',
        'sciencedirect.com' => 'elsevier',
        'elsevier.com' => 'elsevier',
        'wiley.com' => 'wiley',
        'frontiersin.org' => 'frontiers',
        'tandfonline.com' => 'taylor_francis',
        'sagepub.com' => 'sage',
        'oup.com' => 'oxford',
        'plos.org' => 'plos',
        'hindawi.com' => 'hindawi',
        'cell.com' => 'cell',
        'jci.org' => 'jci',
        'asm.org' => 'asm',
        'iucr.org' => 'iucr',
        'rsc.org' => 'rsc',
        'acs.org' => 'acs',
    ];

    foreach ($map as $domain => $publisher) {
        $suffix = '.' . $domain;
        if ($host === $domain || substr($host, -strlen($suffix)) === $suffix) {
            return $publisher;
        }
    }
    return 'unknown';
}
|
||
|
||
/**
 * Extract e-mail addresses from HTML and drop system / publisher / asset false positives.
 *
 * An address is discarded when
 *  - it ends in a static-asset extension (e.g. "logo@2x.png"). Only the END of the
 *    address is tested, so "a.jsmith@uni.edu" is not mistaken for a ".js" file;
 *  - it contains a system keyword such as "noreply" or "info@";
 *  - its domain is, or is a sub-domain of, a known publisher / infrastructure domain.
 *    Matching is on the domain boundary, so "x@group.com" is not caught by "oup.com".
 *
 * @param string $html Page source (or any text).
 * @return string[] Distinct addresses in order of first appearance.
 */
private function extractEmails($html)
{
    if (!preg_match_all('/[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/', (string)$html, $m)) {
        return [];
    }

    $systemKeywords = [
        'noreply', 'no-reply', 'support', 'info@', 'admin@', 'webmaster',
        'editor@', 'editorial@', 'help@', 'contact@', 'privacy@', 'service@',
        'marketing@', 'feedback@', 'copyright@', 'permissions@',
    ];
    $blockedDomains = [
        'mdpi.com', 'springer.com', 'elsevier.com', 'wiley.com',
        'frontiersin.org', 'nature.com', 'oup.com', 'sagepub.com',
        'tandfonline.com', 'plos.org', 'hindawi.com', 'biomedcentral.com',
        'crossref.org', 'doi.org', 'example.com', 'sentry.io',
        'acs.org', 'rsc.org',
    ];

    $filtered = [];
    foreach (array_unique($m[0]) as $email) {
        $lower = strtolower($email);

        // Asset file names that merely look like addresses.
        $skip = (bool)preg_match('/\.(?:png|jpe?g|gif|svg|webp|css|js|json)$/', $lower);

        if (!$skip) {
            foreach ($systemKeywords as $kw) {
                if (strpos($lower, $kw) !== false) {
                    $skip = true;
                    break;
                }
            }
        }

        if (!$skip) {
            $domain = substr($lower, strrpos($lower, '@') + 1);
            foreach ($blockedDomains as $blocked) {
                $suffix = '.' . $blocked;
                if ($domain === $blocked || substr($domain, -strlen($suffix)) === $suffix) {
                    $skip = true;
                    break;
                }
            }
        }

        if (!$skip) {
            $filtered[] = $email;
        }
    }

    return $filtered;
}
|
||
|
||
/**
 * Extract corresponding authors from an MDPI article page.
 *
 * Names are taken from text followed by "*" (first from <span> elements, otherwise
 * from capitalised word sequences). E-mails are taken from distinct mailto: links
 * that do not contain "mdpi". Names and e-mails are paired by position; when no
 * name is found, every mailto address is reported with an empty name.
 *
 * @param string $html Page source.
 * @return array ['corresponding_authors' => [['name', 'email'], ...], 'all_emails' => string[]]
 */
private function parseMdpiEmail($html)
{
    $allEmails = $this->extractEmails($html);

    // Candidate names: text immediately followed by the "*" marker.
    $corrNames = [];
    if (preg_match_all('/<span[^>]*>\s*([^<]+?)\s*<\/span>\s*\*/', $html, $spanHits)) {
        $corrNames = array_map('trim', $spanHits[1]);
    } elseif (preg_match_all('/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\s*\*/', $html, $textHits)) {
        $corrNames = array_map('trim', $textHits[1]);
    }

    // mailto: links, de-duplicated in order of appearance, without MDPI's own addresses.
    $mailtoEmails = [];
    if (preg_match_all('/href=["\']mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})["\']/', $html, $mailHits)) {
        foreach ($mailHits[1] as $address) {
            if (stripos($address, 'mdpi') === false && !in_array($address, $mailtoEmails, true)) {
                $mailtoEmails[] = $address;
            }
        }
    }

    $authors = [];
    foreach ($corrNames as $position => $name) {
        $authors[] = ['name' => $name, 'email' => $mailtoEmails[$position] ?? ''];
    }

    if (empty($authors) && !empty($mailtoEmails)) {
        foreach ($mailtoEmails as $address) {
            $authors[] = ['name' => '', 'email' => $address];
        }
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
|
||
|
||
/**
 * Extract corresponding authors from a Springer / Nature / BMC article page.
 *
 * Strategies are tried in order until one yields an address:
 *  1. mailto: links on elements marked data-test="author-letter";
 *  2. the first mailto: link following a "Correspondence" / "Corresponding author" label;
 *  3. a plain-text address within 300 non-"<" characters after "Correspond...".
 * When an address was found, a name following "Correspond... to/author" is attached
 * to the first entry if it is 3 to 79 bytes long.
 *
 * @param string $html Page source.
 * @return array ['corresponding_authors' => [['name', 'email'], ...], 'all_emails' => string[]]
 */
private function parseSpringerEmail($html)
{
    $allEmails = $this->extractEmails($html);
    $authors = [];

    // 1. <a data-test="author-letter" href="mailto:...">
    if (preg_match_all('/data-test=["\']author-letter["\'][^>]*href=["\']mailto:([^"\']+)["\']/', $html, $letterHits)) {
        foreach ($letterHits[1] as $address) {
            $authors[] = ['name' => '', 'email' => $address];
        }
    }

    // 2. mailto: inside a "Correspondence to" / "Corresponding author" region
    if (empty($authors)
        && preg_match('/[Cc]orrespond(?:ence|ing\s+author)[^<]{0,50}<[^>]*>([^<]*<[^>]*>)*?[^<]*?href=["\']mailto:([^"\']+)["\']/', $html, $regionHit)) {
        $authors[] = ['name' => '', 'email' => $regionHit[2]];
    }

    // 3. Plain-text address near the label
    if (empty($authors)
        && preg_match_all('/[Cc]orrespond[a-z]*[^<]{0,300}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $html, $textHits)) {
        foreach ($textHits[1] as $candidate) {
            $clean = $this->extractEmails($candidate);
            if (!empty($clean)) {
                $authors[] = ['name' => '', 'email' => $clean[0]];
            } elseif (filter_var($candidate, FILTER_VALIDATE_EMAIL)) {
                $authors[] = ['name' => '', 'email' => $candidate];
            }
        }
    }

    // Try to attach a name to the first author.
    if (!empty($authors)
        && preg_match('/[Cc]orrespond[a-z]*\s+(?:to|author)[:\s]*([A-Z][a-zA-Z\s.\-]+?)(?:\.|<|,)/', $html, $nameHit)) {
        $name = trim($nameHit[1]);
        $length = strlen($name);
        if ($length > 2 && $length < 80) {
            $authors[0]['name'] = $name;
        }
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
|
||
|
||
/**
 * Extract corresponding authors from a Frontiers article page.
 *
 * Primary source: "*Correspondence: Name, email" blocks, each ending at <br, </p,
 * </div or a newline. Every address in a block is reported with the block's name,
 * which is the first comma/semicolon-separated piece without "@" that contains an
 * upper-case letter. Fallback: every mailto: link not containing "frontiersin",
 * named by its link text.
 *
 * @param string $html Page source.
 * @return array ['corresponding_authors' => [['name', 'email'], ...], 'all_emails' => string[]]
 */
private function parseFrontiersEmail($html)
{
    $allEmails = $this->extractEmails($html);
    $authors = [];

    if (preg_match_all('/\*\s*[Cc]orrespondence:\s*(.*?)(?:<br|<\/p|<\/div|\n)/s', $html, $sections)) {
        foreach ($sections[1] as $section) {
            $section = strip_tags($section, '<a>');
            if (!preg_match_all('/([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $section, $found)) {
                // No address in this block, so there is nothing to report for it.
                continue;
            }

            $name = '';
            foreach (preg_split('/[,;]/', strip_tags($section)) as $piece) {
                $piece = trim($piece);
                if ($piece && strpos($piece, '@') === false && preg_match('/[A-Z]/', $piece)) {
                    $name = $piece;
                    break;
                }
            }

            foreach ($found[1] as $address) {
                $authors[] = ['name' => $name, 'email' => $address];
            }
        }
    }

    if (empty($authors)
        && preg_match_all('/href=["\']mailto:([^"\']+)["\'][^>]*>([^<]*)</i', $html, $links)) {
        foreach ($links[1] as $idx => $address) {
            $label = trim(strip_tags($links[2][$idx]));
            if (stripos($address, 'frontiersin') !== false) {
                continue;
            }
            $authors[] = ['name' => $label ?: '', 'email' => $address];
        }
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
|
||
|
||
/**
 * Parse a Wiley article page for the corresponding author.
 *
 * At most one author entry is produced: first from a mailto: link that
 * follows a "Correspond..." marker, otherwise from a plain-text address
 * within 500 non-"<" characters of such a marker. When an entry exists, a
 * capitalised phrase directly after the marker and ending at a comma is
 * used as its name.
 *
 * @param string $html Raw HTML of the article page.
 * @return array ['corresponding_authors' => list of ['name' => string, 'email' => string],
 *                'all_emails' => result of extractEmails($html)]
 */
private function parseWileyEmail($html)
{
    $allEmails = $this->extractEmails($html);
    $authors   = [];

    // Wiley: "Correspondence" paragraph (or data-widget-def markup) containing a mailto: link
    $mailtoPattern = '/[Cc]orrespond[a-z]*[:\s].*?href=["\']mailto:([^"\']+)["\'].*?<\/p/s';
    // Plain-text address close to the marker
    $inlinePattern = '/[Cc]orrespond[a-z]*[^<]{0,500}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/';

    if (preg_match($mailtoPattern, $html, $hit)) {
        $authors[] = ['name' => '', 'email' => $hit[1]];
    } elseif (preg_match($inlinePattern, $html, $hit)) {
        $authors[] = ['name' => '', 'email' => $hit[1]];
    }

    // Only look for a name once an address has been found.
    $namePattern = '/[Cc]orrespond[a-z]*[:\s]*([A-Z][a-zA-Z.\-\s]{2,40}?),/';
    if (!empty($authors) && preg_match($namePattern, $html, $hit)) {
        $authors[0]['name'] = trim($hit[1]);
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
|
||
|
||
/**
 * Parse an Elsevier / ScienceDirect article page for corresponding authors.
 *
 * Three strategies are tried in order and the first one that matches wins:
 *   1. mailto: links following an element whose class contains "corresponding";
 *   2. text of elements carrying a data-* attribute whose value starts with
 *      "corresponding" (these entries have a name but an empty email);
 *   3. a plain-text address within 300 non-"<" characters of a "Correspond..." marker.
 *
 * @param string $html Raw HTML of the article page.
 * @return array ['corresponding_authors' => list of ['name' => string, 'email' => string],
 *                'all_emails' => result of extractEmails($html)]
 */
private function parseElsevierEmail($html)
{
    $allEmails = $this->extractEmails($html);
    $authors   = [];

    // ScienceDirect: "Corresponding author" button/region followed by a mailto: link
    $classPattern = '/class="[^"]*corresponding[^"]*"[^>]*>.*?href=["\']mailto:([^"\']+)["\']/si';
    $dataPattern  = '/data-[a-z\-]*=["\']corresponding[^"\']*["\'][^>]*>([^<]+)/i';
    $textPattern  = '/[Cc]orrespond[a-z]*[^<]{0,300}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/';

    if (preg_match_all($classPattern, $html, $found)) {
        foreach ($found[1] as $address) {
            $authors[] = ['name' => '', 'email' => $address];
        }
    } elseif (preg_match_all($dataPattern, $html, $found)) {
        // Name-only entries: no address is available from this markup.
        foreach ($found[1] as $rawName) {
            $authors[] = ['name' => trim(strip_tags($rawName)), 'email' => ''];
        }
    } elseif (preg_match($textPattern, $html, $found)) {
        $authors[] = ['name' => '', 'email' => $found[1]];
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
|
||
|
||
/**
 * Parse a Taylor & Francis article page for the corresponding author.
 *
 * Strategy 1 reads the first element whose class contains "corresp"
 * (e.g. <span class="NLM_corresp">) and takes the first address inside it;
 * the text before that address becomes the name.
 * Strategy 2 (only if strategy 1 produced nothing) looks for the word
 * "CONTACT" followed, within 500 characters, by an address; the text in
 * between becomes the name.
 *
 * @param string $html Raw HTML of the article page.
 * @return array ['corresponding_authors' => list of ['name' => string, 'email' => string],
 *                'all_emails' => result of extractEmails($html)]
 */
private function parseTaylorFrancisEmail($html)
{
    $result = [
        'corresponding_authors' => [],
        'all_emails'            => $this->extractEmails($html),
    ];

    // T&F: <span class="NLM_corresp"> or "CONTACT" section
    if (preg_match('/class="[^"]*corresp[^"]*"[^>]*>(.*?)<\/span>/si', $html, $m)) {
        $block = $m[1];
        if (preg_match('/([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/', $block, $em)) {
            // Drop everything from the first address to the END of the block. The /s flag
            // matters: without it ".*" stops at the first newline and any later lines
            // (affiliation, footnotes) would be glued onto the name.
            $name = trim(strip_tags(preg_replace('/[a-zA-Z0-9._%+\-]+@.*/s', '', $block)));
            $result['corresponding_authors'][] = ['name' => $name, 'email' => $em[1]];
        }
    }

    if (empty($result['corresponding_authors'])) {
        // The gap between "CONTACT" and the address is capped at 500 characters so that an
        // unrelated "CONTACT" (navigation, footer) is not paired with a distant address,
        // which would also return a large slice of page text as the name.
        if (preg_match('/CONTACT\s+(.{0,500}?)([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/s', $html, $m)) {
            $name = trim(strip_tags($m[1]));
            $result['corresponding_authors'][] = ['name' => $name, 'email' => $m[2]];
        }
    }

    return $result;
}
|
||
|
||
/**
 * Parse a PLOS article page for corresponding-author addresses.
 *
 * Every occurrence of "* E-mail: address" (or "* Email: address") yields
 * one entry with an empty name.
 *
 * @param string $html Raw HTML of the article page.
 * @return array ['corresponding_authors' => list of ['name' => string, 'email' => string],
 *                'all_emails' => result of extractEmails($html)]
 */
private function parsePlosEmail($html)
{
    $allEmails = $this->extractEmails($html);

    // PLOS: "* E-mail: xxx@yyy.com"
    $marker  = '/\*\s*E-?mail:\s*([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/';
    $authors = [];
    if (preg_match_all($marker, $html, $matches)) {
        $authors = array_map(function ($address) {
            return ['name' => '', 'email' => $address];
        }, $matches[1]);
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
|
||
|
||
/**
 * Publisher-independent fallback for locating corresponding-author addresses.
 *
 * Strategy 1 tries three "correspondence" patterns in order and stops at
 * the first pattern that yields at least one address passing
 * FILTER_VALIDATE_EMAIL; duplicates are dropped.
 * Strategy 2 (only if strategy 1 found nothing) takes every distinct
 * mailto: address on the page, excluding those containing service-style
 * keywords such as "noreply" or "support".
 *
 * All entries are returned with an empty name.
 *
 * @param string $html Raw HTML of the page.
 * @return array ['corresponding_authors' => list of ['name' => string, 'email' => string],
 *                'all_emails' => result of extractEmails($html)]
 */
private function parseGenericEmail($html)
{
    $allEmails = $this->extractEmails($html);
    $authors   = [];

    // Strategy 1: addresses near "Correspondence" / "Corresponding author"
    $corrPatterns = [
        '/[Cc]orrespond(?:ing\s+author|ence)[:\s]*(?:<[^>]*>)*\s*(?:<[^>]*>)*\s*([^<]*?)\s*(?:<[^>]*>)*\s*(?:href=["\'])?mailto:([^"\'>\s]+)/s',
        '/[Cc]orrespond[a-z]*[^<]{0,500}?([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/',
        '/\*\s*(?:E-?mail|Correspondence)[:\s]*([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/',
    ];

    $taken = []; // addresses already added, keyed by the exact address string
    foreach ($corrPatterns as $pattern) {
        if (!preg_match_all($pattern, $html, $groups)) {
            continue;
        }

        // The address is always the last capture group of each pattern.
        $candidates = $groups[count($groups) - 1];
        foreach ($candidates as $candidate) {
            if (!filter_var($candidate, FILTER_VALIDATE_EMAIL) || isset($taken[$candidate])) {
                continue;
            }
            $taken[$candidate] = true;
            $authors[] = ['name' => '', 'email' => $candidate];
        }

        if (!empty($authors)) {
            break;
        }
    }

    // Strategy 2: every mailto: link on the page
    if (empty($authors)) {
        $mailtoEmails = [];
        if (preg_match_all('/href=["\']mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})["\']/', $html, $links)) {
            $mailtoEmails = array_unique($links[1]);
        }

        // Substrings that mark an address as a service mailbox rather than an author.
        $skip = ['noreply', 'support', 'info@', 'admin', 'editor', 'help@', 'contact@',
            'privacy', 'service', 'marketing', 'copyright', 'permission'];

        foreach ($mailtoEmails as $address) {
            $lower = strtolower($address);
            foreach ($skip as $keyword) {
                if (strpos($lower, $keyword) !== false) {
                    continue 2; // service mailbox: move on to the next address
                }
            }
            $authors[] = ['name' => '', 'email' => $address];
        }
    }

    return ['corresponding_authors' => $authors, 'all_emails' => $allEmails];
}
|
||
|
||
/**
 * Main entry point: fetch the publisher page for a DOI and extract the
 * corresponding author(s) and e-mail addresses.
 *
 * The page is parsed by a publisher-specific parser chosen from
 * detectPublisher(); unknown publishers go straight to parseGenericEmail().
 * When a publisher-specific parser yields no entry that carries an e-mail
 * address (an empty list, or name-only entries), the generic parser is
 * tried once and, if it finds addresses, its entries replace the result.
 *
 * @param string $doi Article DOI (normalised through cleanDoi()).
 * @return array On failure: ['success' => false, 'error' => string] (plus 'doi' when the
 *               fetch failed). On success: ['success' => true, 'doi', 'url', 'publisher',
 *               'corresponding_authors', 'all_emails'].
 */
private function scrapeCorrespondingAuthor($doi)
{
    $doi = $this->cleanDoi($doi);
    if ($doi == '') {
        return ['success' => false, 'error' => 'DOI为空'];
    }

    $page = $this->fetchPageByDoi($doi);
    if (!$page['success']) {
        return ['success' => false, 'doi' => $doi, 'error' => $page['error']];
    }

    $finalUrl  = $page['final_url'];
    $html      = $page['html'];
    $publisher = $this->detectPublisher($finalUrl);

    // Publisher key => name of the dedicated parser method.
    $parsers = [
        'mdpi'           => 'parseMdpiEmail',
        'springer'       => 'parseSpringerEmail',
        'frontiers'      => 'parseFrontiersEmail',
        'wiley'          => 'parseWileyEmail',
        'elsevier'       => 'parseElsevierEmail',
        'taylor_francis' => 'parseTaylorFrancisEmail',
        'plos'           => 'parsePlosEmail',
    ];

    $hasDedicatedParser = is_string($publisher) && isset($parsers[$publisher]);
    if ($hasDedicatedParser) {
        $method = $parsers[$publisher];
        $parsed = $this->$method($html);
    } else {
        $parsed = $this->parseGenericEmail($html);
    }

    // Fall back to the generic parser only when a dedicated parser ran (re-running the
    // generic parser on the same HTML cannot produce anything new) and it produced no
    // usable address. Name-only entries count as "no address".
    if ($hasDedicatedParser) {
        $hasEmail = false;
        foreach ($parsed['corresponding_authors'] as $author) {
            if (!empty($author['email'])) {
                $hasEmail = true;
                break;
            }
        }

        if (!$hasEmail) {
            $generic = $this->parseGenericEmail($html);
            if (!empty($generic['corresponding_authors'])) {
                $parsed['corresponding_authors'] = $generic['corresponding_authors'];
            }
        }
    }

    return [
        'success'               => true,
        'doi'                   => $doi,
        'url'                   => $finalUrl,
        'publisher'             => $publisher,
        'corresponding_authors' => $parsed['corresponding_authors'],
        'all_emails'            => $parsed['all_emails'],
    ];
}
|
||
|
||
/**
 * Look up the corresponding-author e-mail for a single DOI.
 *
 * Request parameter:
 *   doi (string) - article DOI; required and must not be blank.
 *
 * Responds with jsonError() when the DOI is missing or the lookup fails,
 * otherwise with jsonSuccess() wrapping the scrapeCorrespondingAuthor() result.
 */
public function getAuthorEmail()
{
    $params = $this->request->param();

    $doiMissing = !isset($params['doi']) || trim($params['doi']) == '';
    if ($doiMissing) {
        return jsonError('doi不能为空');
    }

    $outcome = $this->scrapeCorrespondingAuthor($params['doi']);

    return $outcome['success']
        ? jsonSuccess($outcome)
        : jsonError($outcome['error']);
}
|
||
|
||
/**
 * Look up corresponding-author e-mails for several DOIs in one request.
 *
 * Request parameter:
 *   dois - comma separated list of DOIs (an array of DOIs is accepted as
 *          well); at most 20 per request.
 *
 * Responds with jsonSuccess() carrying:
 *   total             - number of DOIs processed
 *   success_count     - lookups whose page could be fetched and parsed
 *   email_found_count - successful lookups with at least one author entry
 *                       that has a non-empty e-mail address
 *   list              - the per-DOI scrapeCorrespondingAuthor() results
 */
public function batchGetAuthorEmails()
{
    $data    = $this->request->param();
    $rawDois = isset($data['dois']) ? $data['dois'] : '';

    if (is_array($rawDois)) {
        // e.g. dois[]=a&dois[]=b ; ignore nested arrays so trim() only sees scalars
        $candidates = array_filter($rawDois, 'is_scalar');
    } else {
        $rawDois    = trim((string) $rawDois);
        $candidates = $rawDois === '' ? [] : explode(',', $rawDois);
    }
    if (empty($candidates)) {
        return jsonError('dois不能为空');
    }

    // array_values() gives sequential keys, needed to recognise the last item below.
    $doiList = array_values(array_filter(array_map('trim', $candidates)));
    if (empty($doiList)) {
        return jsonError('未提供有效的DOI');
    }
    if (count($doiList) > 20) {
        return jsonError('单次最多查询20个DOI');
    }

    $results         = [];
    $successCount    = 0;
    $emailFoundCount = 0;
    $lastIndex       = count($doiList) - 1;

    foreach ($doiList as $index => $rawDoi) {
        $result = $this->scrapeCorrespondingAuthor($rawDoi);

        if ($result['success']) {
            $successCount++;
            // Parsers may return name-only entries (empty 'email'); those must not be
            // counted as "e-mail found".
            if (!empty($result['corresponding_authors'])) {
                foreach ($result['corresponding_authors'] as $author) {
                    if (!empty($author['email'])) {
                        $emailFoundCount++;
                        break;
                    }
                }
            }
        }
        $results[] = $result;

        // Pause 0.5 s between requests; nothing follows the last one, so skip the wait there.
        if ($index < $lastIndex) {
            usleep(500000);
        }
    }

    return jsonSuccess([
        'total'             => count($results),
        'success_count'     => $successCount,
        'email_found_count' => $emailFoundCount,
        'list'              => $results,
    ]);
}
|
||
|
||
/**
 * Report how far the "field" to "major" conversion has progressed.
 *
 * Responds with jsonSuccess() carrying:
 *   total_reviewers    - reviewer rows with state = 0
 *   has_field          - of those, rows whose field is not an empty string
 *   has_major_to_user  - count over major_to_user rows with state = 0, grouped by user_id
 *   has_field_no_major - reviewers with a non-empty field whose reviewer_id does not
 *                        appear as user_id in major_to_user (state = 0)
 */
public function statistics()
{
    $totalReviewers = $this->user_reviewer_info_obj
        ->where('state', 0)
        ->count();

    $hasField = $this->user_reviewer_info_obj
        ->where('state', 0)
        ->where('field', '<>', '')
        ->count();

    // NOTE(review): this relies on count() honouring group() to yield the number of
    // distinct users; confirm against the installed ThinkPHP version.
    $hasMajorToUser = Db::name('major_to_user')
        ->where('state', 0)
        ->group('user_id')
        ->count();

    // The NOT IN sub-select is passed as a closure. A buildSql() string given as the value
    // of an IN / NOT IN condition is treated by the query builder as a comma separated
    // value list, not as a sub-query, so the condition would not filter anything out.
    $hasFieldNoMajor = $this->user_reviewer_info_obj
        ->alias('ri')
        ->where('ri.state', 0)
        ->where('ri.field', '<>', '')
        ->where('ri.reviewer_id', 'not in', function ($query) {
            $query->name('major_to_user')->where('state', 0)->field('user_id');
        })
        ->count();

    return jsonSuccess([
        'total_reviewers'    => $totalReviewers,
        'has_field'          => $hasField,
        'has_major_to_user'  => $hasMajorToUser,
        'has_field_no_major' => $hasFieldNoMajor,
    ]);
}
|
||
}
|