Files
tougao/application/common/service/AuthorBackgroundService.php
2026-06-05 11:14:10 +08:00

1379 lines
52 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common\service;
use think\Env;
/**
* 医学/护理期刊 — 青年编委/作者背景调查（ORCID 精准查询）
* 数据源：OpenAlex、ORCID、PubMed、Retraction Watch、Scopus
*/
class AuthorBackgroundService
{
// URL of the Retraction Watch CSV export fetched by downloadRetractionWatch().
const RW_CSV_URL = 'https://gitlab.com/crossref/retraction-watch-data/-/raw/main/retraction_watch.csv';
// Maximum age of the cached CSV. It is compared against (time() - filemtime()),
// so the unit is seconds (86400 s = 24 h) even though the name ends in "_H".
const RW_CACHE_H = 86400;
/** @var string Contact e-mail put into the User-Agent header and the PubMed E-utilities "email" parameter */
private $email = '';
/** @var string Elsevier Scopus API Key */
private $scopusApiKey = '';
/**
 * Reads the contact e-mail and the Scopus API key from configuration.
 *
 * E-mail: `author_bg.email`, falling back to `pubmed.email` (which carries a
 * hard-coded default address). Scopus key: the `scopus.api_key` Env entry,
 * falling back to the `scopus.api_key` config() entry.
 */
public function __construct()
{
    $contact = trim((string) Env::get('author_bg.email', ''));
    if ($contact === '') {
        $contact = trim((string) Env::get('pubmed.email', 'yananwang898@gmail.com'));
    }
    $this->email = $contact;

    $key = trim((string) Env::get('scopus.api_key', ''));
    // config() is only consulted when the Env entry is missing or blank.
    $this->scopusApiKey = $key !== '' ? $key : trim((string) config('scopus.api_key', ''));
}
/**
* Builds the complete background-check report as a plain array (intended to be
* serialised as JSON for a separate front end).
*
* Flow: normalise the inputs; when no valid ORCID is supplied, look one up by
* name (returning early when nothing or more than one candidate is found);
* then query ORCID, OpenAlex, PubMed, Retraction Watch and Scopus for that
* ORCID and assemble the result.
*
* @param mixed $orcid       ORCID iD or a string containing one (may be empty)
* @param mixed $lastName    family name; required when no valid ORCID is given
* @param mixed $firstName   given name (optional)
* @param mixed $institution institution; per the hint text it only ranks candidates
* @return array{ok:bool,msg?:string,data?:array} `need_select` is also set when several candidates match
*/
public function buildReport($orcid, $lastName, $firstName, $institution)
{
$orcidNorm = $this->normalizeOrcid($orcid);
$lastName = trim((string) $lastName);
$firstName = trim((string) $firstName);
$institution = trim((string) $institution);
// At least an ORCID or one name part is needed to do anything.
$hasQuery = ($orcidNorm !== '') || ($lastName !== '') || ($firstName !== '');
if (!$hasQuery) {
return ['ok' => false, 'msg' => '请提供 ORCID 或姓名'];
}
$orcidSource = 'provided';
// No usable ORCID: resolve one from the name.
if ($orcidNorm === '') {
// The name search needs a family name; a given name alone is rejected.
if ($lastName === '') {
return [
'ok' => false,
'msg' => '未提供 ORCID 时,需填写作者姓氏',
'data' => [
'orcid_required' => true,
'submitted' => [
'last_name' => $lastName,
'first_name' => $firstName,
'institution' => $institution,
],
'hint' => '请填写 ORCID或至少填写姓氏机构选填仅用于候选列表排序',
],
];
}
$search = $this->searchOrcidCandidates($lastName, $firstName, $institution);
$candidates = $search['candidates'] ?? [];
// Nothing found: ask the caller to supply the ORCID manually.
if (empty($candidates)) {
return [
'ok' => false,
'msg' => '未能按姓名检索到 ORCID请手动填写',
'data' => [
'orcid_required' => true,
'submitted' => [
'last_name' => $lastName,
'first_name' => $firstName,
'institution' => $institution,
],
'hint' => '已在 OpenAlex、ORCID 官网、Scopus 按姓名检索,未找到带 ORCID 的作者',
'lookup_attempts' => $search['attempts'] ?? [],
],
];
}
// Ambiguous: hand the candidate list back so the user can pick one.
if (count($candidates) > 1) {
return [
'ok' => false,
'need_select' => true,
'msg' => '匹配到 ' . count($candidates) . ' 位作者,请选择',
'data' => [
'candidates' => $candidates,
'submitted' => [
'last_name' => $lastName,
'first_name' => $firstName,
'institution' => $institution,
],
'lookup_attempts' => $search['attempts'] ?? [],
],
];
}
// Exactly one candidate: adopt its ORCID.
$orcidNorm = $candidates[0]['orcid'];
$orcidSource = 'name_search';
// When no given name was submitted, derive given/family name from the
// candidate's display name: the last whitespace-separated token becomes
// the family name, the rest the given name.
if ($firstName === '' && !empty($candidates[0]['display_name'])) {
$parts = preg_split('/\s+/u', trim($candidates[0]['display_name']));
if (count($parts) > 1) {
$lastName = array_pop($parts);
$firstName = implode(' ', $parts);
}
}
}
$way = $this->describeQueryWay($orcidSource);
$authorDisplay = trim("$firstName $lastName");
$orcidData = $this->orcidProfile($orcidNorm);
// Prefer the name from the ORCID profile over the submitted one.
if ($orcidData['name'] !== '') {
$authorDisplay = $orcidData['name'];
}
// Gather data from each source for the resolved ORCID.
$openalexAuthor = $this->resolveOpenAlexAuthor($orcidNorm, $firstName, $lastName, $institution);
$metrics = $this->openalexMetrics($openalexAuthor);
$pubmed = $this->pubmedSearch($lastName, $firstName, $institution, $orcidNorm, 50);
$rw = $this->searchRetractionsHybrid($orcidData['papers'], $firstName, $lastName, $institution, $authorDisplay);
$scopusUrl = $this->scopusDirectUrl($lastName, $firstName, $institution, $orcidNorm);
$scopusApi = $this->scopusApiSearch($orcidNorm, $lastName, $firstName, $institution);
$dups = $this->checkDuplicateTitles($this->papersForDupCheck($orcidData['papers'], $pubmed['papers']));
$worksCount = $this->resolveWorksCount($metrics, $orcidData, $pubmed, $scopusApi, $orcidNorm);
$risk = $this->riskLevel($rw, $metrics['h_index'], $worksCount);
// Only the first 10 papers of each list are returned in the report.
$orcidPapers = array_slice($orcidData['papers'], 0, 10);
$pubmedPapers = array_slice($pubmed['papers'], 0, 10);
// Attach a clickable link to every listed paper; each by-reference loop
// variable is unset afterwards so it cannot alias the last element.
foreach ($orcidPapers as &$p) {
$p['open_url'] = $this->paperOpenUrl($p);
}
unset($p);
foreach ($pubmedPapers as &$p) {
$p['open_url'] = $this->paperOpenUrl($p);
}
unset($p);
foreach ($dups as &$dg) {
foreach ($dg['papers'] as &$dp) {
$dp['open_url'] = $this->paperOpenUrl($dp);
}
unset($dp);
}
unset($dg);
return [
'ok' => true,
'data' => [
'report_at' => date('Y-m-d H:i:s'),
'query' => [
'way' => $way,
'orcid' => $orcidNorm,
'orcid_source' => $orcidSource,
'orcid_resolved' => $orcidSource !== 'provided',
'last_name' => $lastName,
'first_name' => $firstName,
'institution' => $institution,
],
'conclusion' => [
'risk_level' => $risk,
'notes' => [
'有 ORCID 时优先以 ORCID + OpenAlex 为准,指标更稳定。',
'撤稿数据来自 Retraction Watch有 DOI 作品按 DOI 精确比对;无 DOI 作品回退姓名/题目匹配(同名有风险,需人工核实)。',
'本报告不构成法律认定,重大决策请结合原始文献、单位证明及人工调查。',
],
],
'basic' => [
'display_name' => $authorDisplay,
'orcid' => $orcidNorm,
'orcid_url' => 'https://orcid.org/' . $orcidNorm,
'orcid_affiliations'=> $orcidData['affiliations'],
'openalex_institutions' => $metrics['institutions'],
'openalex_url' => $metrics['openalex_url'],
'scopus_id' => $metrics['scopus_id'],
'scopus_url' => $metrics['scopus_url'],
],
'scopus' => [
'search_url' => $scopusUrl,
'api' => $scopusApi,
],
'metrics' => [
// works_count is the largest count reported by any source (see resolveWorksCount()).
'works_count' => $worksCount,
'cited_by_count' => (int) $metrics['cited_by_count'],
'h_index' => (int) $metrics['h_index'],
'i10_index' => (int) $metrics['i10_index'],
'topics' => $metrics['topics'],
'pubmed_total' => (int) $pubmed['total'],
'pubmed_query' => $pubmed['query'],
'pubmed_url' => $pubmed['pubmed_url'],
],
'retraction_watch' => $rw,
'duplicates' => $dups,
'pubmed_papers' => $pubmedPapers,
'orcid_papers' => [
'total' => (int) $orcidData['papers_total'],
'papers' => $orcidPapers,
],
'sources' => ['OpenAlex', 'ORCID', 'PubMed', 'Scopus', 'Retraction Watch'],
],
];
}
/**
 * Extracts the first ORCID iD (dddd-dddd-dddd-dddX pattern) found anywhere in
 * the input, e.g. inside an https://orcid.org/... URL.
 *
 * @param mixed $raw value containing an ORCID iD
 * @return string the iD lower-cased (a trailing "X" becomes "x"), or '' when none is found
 */
public function normalizeOrcid($raw)
{
    $text = trim((string) $raw);
    $found = [];
    if ($text !== '' && preg_match('/(\d{4}-\d{4}-\d{4}-\d{3}[\dX])/i', $text, $found)) {
        return strtolower($found[1]);
    }

    return '';
}
/**
 * Looks up ORCID candidates by name in OpenAlex, the ORCID registry and
 * Scopus, in that order. The institution is not part of any search; it is
 * only passed on so matching affiliations can be flagged.
 *
 * Records without an ORCID, or whose name fails isAcceptableNameMatch(), are
 * dropped. Hits for the same ORCID from several sources are merged.
 *
 * @param mixed $lastName    family name (required; empty yields an empty result)
 * @param mixed $firstName   given name (optional)
 * @param mixed $institution institution used for flagging only
 * @return array{candidates:array,attempts:array} sorted candidates plus a per-source count of raw hits
 */
public function searchOrcidCandidates($lastName, $firstName, $institution)
{
    $lastName = trim((string) $lastName);
    $firstName = trim((string) $firstName);
    $institution = trim((string) $institution);
    if ($lastName === '') {
        return ['candidates' => [], 'attempts' => []];
    }

    $found = [];
    $log = [];

    // --- OpenAlex ---
    $authors = $this->openalexAuthorsByName($firstName, $lastName);
    $log[] = ['source' => 'openalex', 'count' => count($authors)];
    foreach ($authors as $author) {
        $id = $this->extractOrcidFromOpenAlexAuthor($author);
        if ($id === '') {
            continue;
        }
        $label = $author['display_name'] ?? '';
        if (!$this->isAcceptableNameMatch($label, $firstName, $lastName)) {
            continue;
        }
        $orgs = [];
        foreach ($author['last_known_institutions'] ?? [] as $inst) {
            $orgName = trim((string) ($inst['display_name'] ?? ''));
            if ($orgName !== '') {
                $orgs[] = $orgName;
            }
        }
        $this->addOrcidCandidate($found, $id, $label, $orgs, 'openalex', $institution);
    }

    // --- ORCID registry (keys may be hyphenated or underscored) ---
    $registryRows = $this->orcidRegistrySearch($lastName, $firstName);
    $log[] = ['source' => 'orcid_registry', 'count' => count($registryRows)];
    foreach ($registryRows as $record) {
        $id = $this->normalizeOrcid($record['orcid-id'] ?? $record['orcid_id'] ?? '');
        if ($id === '') {
            continue;
        }
        $given = trim((string) ($record['given-names'] ?? $record['given_names'] ?? ''));
        $family = trim((string) ($record['family-names'] ?? $record['family_names'] ?? ''));
        $label = trim($given . ' ' . $family);
        if (!$this->isAcceptableNameMatch($label, $firstName, $lastName, $given, $family)) {
            continue;
        }
        $orgs = $record['institution-name'] ?? $record['institution_name'] ?? [];
        if (!is_array($orgs)) {
            // A scalar institution is wrapped into a one-element list.
            $orgs = $orgs !== '' ? [$orgs] : [];
        }
        $this->addOrcidCandidate($found, $id, $label, $orgs, 'orcid_registry', $institution);
    }

    // --- Scopus (name-only search) ---
    $scopusResult = $this->scopusApiSearch('', $lastName, $firstName, $institution, true);
    $scopusRows = $scopusResult['entries'] ?? [];
    $log[] = ['source' => 'scopus', 'count' => count($scopusRows)];
    foreach ($scopusRows as $hit) {
        $id = $this->normalizeOrcid($hit['orcid'] ?? '');
        if ($id === '') {
            continue;
        }
        $label = $hit['name'] ?? '';
        if (!$this->isAcceptableNameMatch($label, $firstName, $lastName)) {
            continue;
        }
        $orgs = empty($hit['affiliation']) ? [] : [$hit['affiliation']];
        $this->addOrcidCandidate($found, $id, $label, $orgs, 'scopus', $institution);
    }

    return [
        'candidates' => $this->sortOrcidCandidates(array_values($found), $firstName, $lastName),
        'attempts' => $log,
    ];
}
/**
* Scores how well a candidate's name matches the queried given + family name.
* When $firstName is provided, a candidate whose given name does not match is
* rejected with a score of 0 (e.g. Yanan does not match Yuxuan).
*
* Structured path (used when both $familyName and $lastName are non-empty):
*   family name mismatch -> 0; no $firstName -> 60; candidate has no given
*   name -> 20; given name identical -> 180; prefix match -> 160; else 0.
* Display-name path (otherwise):
*   $lastName absent from the name -> 0; no $firstName -> 50; name equals
*   "first last" or "last first" -> 170; a token prefix-matches $firstName
*   -> 150; $firstName appears as a whole word -> 130; else 0.
*
* @param mixed $displayName candidate's full display name
* @param mixed $firstName   queried given name (may be empty)
* @param mixed $lastName    queried family name
* @param mixed $givenName   candidate's structured given name, if known
* @param mixed $familyName  candidate's structured family name, if known
* @return int score; 0 means rejected
*/
private function scoreCandidateNameMatch($displayName, $firstName, $lastName, $givenName = '', $familyName = '')
{
$firstName = strtolower(trim((string) $firstName));
$lastName = strtolower(trim((string) $lastName));
$givenName = strtolower(trim((string) $givenName));
$familyName = strtolower(trim((string) $familyName));
$displayName = trim((string) $displayName);
// Structured path: compare family and given names field by field.
if ($familyName !== '' && $lastName !== '') {
if (!$this->nameTokenMatches($familyName, $lastName)) {
return 0;
}
$score = 60;
if ($firstName === '') {
return $score;
}
// Candidate has no given name to verify against: 20 is below the
// threshold of 70 that isAcceptableNameMatch() applies when a first
// name was queried.
if ($givenName === '') {
return 20;
}
if ($givenName === $firstName) {
return $score + 120;
}
if ($this->nameTokenMatches($givenName, $firstName)) {
return $score + 100;
}
return 0;
}
// Display-name path: only the free-text name is available.
if ($displayName === '' || $lastName === '') {
return 0;
}
$nameLow = strtolower($displayName);
if (!$this->nameContainsToken($nameLow, $lastName)) {
return 0;
}
$score = 50;
if ($firstName === '') {
return $score;
}
// Exact full-name match in either order.
$targetA = $firstName . ' ' . $lastName;
$targetB = $lastName . ' ' . $firstName;
if ($nameLow === $targetA || $nameLow === $targetB) {
return $score + 120;
}
// Split on whitespace/commas and look for a token that prefix-matches the
// given name, skipping tokens that match the family name.
$tokens = preg_split('/[\s,]+/u', $nameLow);
$tokens = array_values(array_filter($tokens, function ($t) {
return $t !== '';
}));
$firstHit = false;
foreach ($tokens as $token) {
if ($this->nameTokenMatches($token, $lastName)) {
continue;
}
if ($this->nameTokenMatches($token, $firstName)) {
$firstHit = true;
$score += 100;
break;
}
}
// Fallback: the given name occurs as a whole word somewhere in the name.
if (!$firstHit && $this->nameContainsToken($nameLow, $firstName)) {
$firstHit = true;
$score += 80;
}
return $firstHit ? $score : 0;
}
/**
 * Decides whether a candidate name is close enough to the queried name.
 *
 * The score from scoreCandidateNameMatch() must reach 70 when a first name
 * was queried, or 40 when only the family name is known.
 *
 * @return bool
 */
private function isAcceptableNameMatch($displayName, $firstName, $lastName, $givenName = '', $familyName = '')
{
    $points = $this->scoreCandidateNameMatch($displayName, $firstName, $lastName, $givenName, $familyName);

    $threshold = 40;
    if (trim((string) $firstName) !== '') {
        $threshold = 70;
    }

    return $points >= $threshold;
}
/**
 * Case-insensitive prefix comparison of two name tokens: true when they are
 * equal or when either one starts with the other (so an initial such as "y"
 * matches "yanan").
 *
 * @return bool false when either token is empty after trimming
 */
private function nameTokenMatches($token, $target)
{
    $a = strtolower(trim((string) $token));
    $b = strtolower(trim((string) $target));
    if ($a === '' || $b === '') {
        return false;
    }

    // "Equal, or one is a prefix of the other" is the same as: both strings
    // agree over the length of the shorter one.
    return strncmp($a, $b, min(strlen($a), strlen($b))) === 0;
}
/**
 * Checks whether $token occurs in $haystack as a whole word
 * (case-insensitive, delimited by regex word boundaries).
 *
 * @return bool false when the token is empty after trimming
 */
private function nameContainsToken($haystack, $token)
{
    $needle = strtolower(trim((string) $token));
    if ($needle === '') {
        return false;
    }

    $pattern = '/\b' . preg_quote($needle, '/') . '\b/u';
    $subject = strtolower($haystack);

    return preg_match($pattern, $subject) === 1;
}
/**
 * Adds a hit to the candidate pool (keyed by normalised ORCID) or merges it
 * into the existing entry for that ORCID.
 *
 * The first non-empty name wins; affiliations and sources are accumulated
 * without duplicates; `institution_matched` is set once any affiliation
 * matches $institution. Hits without a valid ORCID are ignored.
 *
 * @param array  $pool         candidate pool, modified in place
 * @param mixed  $orcid        ORCID of the hit
 * @param mixed  $name         display name of the hit
 * @param array  $affiliations affiliation names of the hit
 * @param string $source       source tag ('openalex', 'orcid_registry', 'scopus')
 * @param string $institution  queried institution ('' disables the match flag)
 * @return void
 */
private function addOrcidCandidate(array &$pool, $orcid, $name, array $affiliations, $source, $institution)
{
    $key = $this->normalizeOrcid($orcid);
    if ($key === '') {
        return;
    }

    if (!isset($pool[$key])) {
        $pool[$key] = [
            'orcid' => $key,
            'display_name' => '',
            'affiliations' => [],
            'affiliations_text' => '',
            'sources' => [],
            'sources_text' => '',
            'institution_matched' => false,
            'orcid_url' => 'https://orcid.org/' . $key,
        ];
    }
    $entry = &$pool[$key];

    $label = trim((string) $name);
    if ($entry['display_name'] === '' && $label !== '') {
        $entry['display_name'] = $label;
    }

    foreach ($affiliations as $rawAff) {
        $affName = trim((string) $rawAff);
        if ($affName === '') {
            continue;
        }
        if (!in_array($affName, $entry['affiliations'], true)) {
            $entry['affiliations'][] = $affName;
        }
        if ($institution !== '' && $this->institutionMatches($affName, $institution)) {
            $entry['institution_matched'] = true;
        }
    }

    if (!in_array($source, $entry['sources'], true)) {
        $entry['sources'][] = $source;
    }
    // Drop the reference so later code cannot write into the pool by accident.
    unset($entry);
}
/**
 * Adds display fields to each candidate and sorts the list.
 *
 * Added per candidate: `name_match_score` (from scoreCandidateNameMatch() on
 * the display name), `name_matched` (score >= 70), `affiliations_text` and
 * `sources_text`.
 *
 * Sort order: name score descending, then candidates whose institution
 * matched, then display name ascending.
 *
 * @param array  $candidates candidate rows as built by addOrcidCandidate()
 * @param string $firstName  queried given name
 * @param string $lastName   queried family name
 * @return array the decorated, sorted candidates
 */
private function sortOrcidCandidates(array $candidates, $firstName = '', $lastName = '')
{
    // Source tag -> label; constant, so it is built once outside the loop.
    $srcMap = [
        'openalex' => 'OpenAlex',
        'orcid_registry' => 'ORCID',
        'scopus' => 'Scopus',
    ];

    foreach ($candidates as &$item) {
        $item['name_match_score'] = $this->scoreCandidateNameMatch(
            $item['display_name'] ?? '',
            $firstName,
            $lastName
        );
        $item['name_matched'] = $item['name_match_score'] >= 70;

        // Join with a visible separator; an empty glue string ran several
        // affiliations together into one unreadable word.
        $item['affiliations_text'] = implode('; ', $item['affiliations'] ?? []);

        $labels = [];
        foreach ($item['sources'] ?? [] as $s) {
            $labels[] = $srcMap[$s] ?? $s;
        }
        $item['sources_text'] = implode(' / ', $labels);
    }
    unset($item);

    usort($candidates, function ($a, $b) {
        $nameCmp = ($b['name_match_score'] ?? 0) <=> ($a['name_match_score'] ?? 0);
        if ($nameCmp !== 0) {
            return $nameCmp;
        }
        $instCmp = ($b['institution_matched'] ?? false) <=> ($a['institution_matched'] ?? false);
        if ($instCmp !== 0) {
            return $instCmp;
        }
        return strcmp($a['display_name'] ?? '', $b['display_name'] ?? '');
    });

    return $candidates;
}
/**
 * Loose, case-insensitive institution comparison: true when either name
 * contains the other as a substring.
 *
 * @return bool false when either name is empty after trimming
 */
private function institutionMatches($candidateInst, $targetInstitution)
{
    $wanted = strtolower(trim((string) $targetInstitution));
    $offered = strtolower(trim((string) $candidateInst));

    if ($wanted === '' || $offered === '') {
        return false;
    }
    if (strpos($offered, $wanted) !== false) {
        return true;
    }

    return strpos($wanted, $offered) !== false;
}
/**
 * Returns the label describing how the ORCID used for the report was
 * obtained.
 *
 * @param string $orcidSource 'provided' or 'name_search'
 * @return string label; a generic one for any other value
 */
private function describeQueryWay($orcidSource)
{
    if ($orcidSource === 'provided') {
        return 'ORCID 精准查询';
    }
    if ($orcidSource === 'name_search') {
        return '姓名自动匹配 ORCID';
    }

    return 'ORCID 查询';
}
/**
 * Pulls the ORCID out of an OpenAlex author record, reading the top-level
 * `orcid` field first and `ids.orcid` as a fallback.
 *
 * @param mixed $author decoded author record
 * @return string normalised ORCID, or '' when absent or $author is not an array
 */
private function extractOrcidFromOpenAlexAuthor($author)
{
    if (!is_array($author)) {
        return '';
    }

    if (isset($author['orcid'])) {
        $value = $author['orcid'];
    } else {
        $value = $author['ids']['orcid'] ?? '';
    }

    return $this->normalizeOrcid((string) $value);
}
/**
 * Searches OpenAlex authors by "first last" and returns up to 25 raw author
 * records.
 *
 * @param string $first given name
 * @param string $last  family name
 * @return array the `results` list; [] when no search term can be built, the request fails, or `results` is missing
 */
private function openalexAuthorsByName($first, $last)
{
    $term = trim("$first $last");
    if ($term === '' && $last !== '') {
        $term = $last;
    }
    if ($term === '') {
        return [];
    }

    $body = $this->httpGet('https://api.openalex.org/authors?search=' . urlencode($term) . '&per_page=25');
    if (!$body) {
        return [];
    }

    $decoded = json_decode($body, true);

    return $decoded['results'] ?? [];
}
/**
 * Queries the public ORCID expanded-search endpoint by family and/or given
 * name and returns up to 25 raw result rows.
 *
 * NOTE(review): the names are placed into the query unquoted, so a
 * multi-word name is presumably not searched as one phrase — confirm against
 * the ORCID search syntax.
 *
 * @param string $lastName  family name
 * @param string $firstName given name
 * @return array `expanded-result` (or `result`) rows; [] when both names are empty or the request fails
 */
private function orcidRegistrySearch($lastName, $firstName)
{
    $clauses = [];
    foreach (['family-name:' => $lastName, 'given-names:' => $firstName] as $field => $value) {
        if ($value !== '') {
            $clauses[] = $field . $value;
        }
    }
    if (empty($clauses)) {
        return [];
    }

    $endpoint = 'https://pub.orcid.org/v3.0/expanded-search/?q=';
    $body = $this->httpGet(
        $endpoint . urlencode(implode(' AND ', $clauses)) . '&rows=25',
        ['Accept: application/json']
    );
    if (!$body) {
        return [];
    }

    $decoded = json_decode($body, true);

    return $decoded['expanded-result'] ?? $decoded['result'] ?? [];
}
/**
 * Performs an HTTP GET and returns the response body.
 *
 * A User-Agent header containing the configured contact e-mail is appended to
 * $headers. cURL is used when available (redirects followed, peer
 * verification on, only 2xx accepted); otherwise file_get_contents() with a
 * stream context is used.
 *
 * @param string $url     request URL
 * @param array  $headers extra raw header lines
 * @param int    $timeout timeout in seconds
 * @return string|null the body, or null on failure
 */
private function httpGet($url, array $headers = [], $timeout = 25)
{
    $headers[] = 'User-Agent: MedicalAuthorCheck/1.0 (mailto:' . $this->email . ')';

    if (!function_exists('curl_init')) {
        // Stream fallback for installations without the cURL extension.
        $context = stream_context_create([
            'http' => ['method' => 'GET', 'header' => implode("\r\n", $headers), 'timeout' => $timeout],
            'ssl' => ['verify_peer' => true, 'verify_peer_name' => true],
        ]);
        $raw = @file_get_contents($url, false, $context);

        return $raw === false ? null : $raw;
    }

    $handle = curl_init($url);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($handle, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($handle, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($handle, CURLOPT_SSL_VERIFYPEER, true);
    $raw = curl_exec($handle);
    $status = (int) curl_getinfo($handle, CURLINFO_HTTP_CODE);
    curl_close($handle);

    if ($raw === false || $status < 200 || $status >= 300) {
        return null;
    }

    return $raw;
}
/**
 * Fetches the OpenAlex author record whose ORCID equals the given iD.
 *
 * @param string $orcid bare ORCID iD (the https://orcid.org/ prefix is added here)
 * @return array|null the first matching author record, or null when none is found or the request fails
 */
private function openalexAuthorByOrcid($orcid)
{
    $filter = 'orcid:' . urlencode('https://orcid.org/' . $orcid);
    $body = $this->httpGet('https://api.openalex.org/authors?filter=' . $filter . '&per_page=5');
    if (!$body) {
        return null;
    }

    $decoded = json_decode($body, true);

    return $decoded['results'][0] ?? null;
}
/**
 * Finds the OpenAlex author record for the person being checked.
 *
 * 1. Look the author up by ORCID.
 * 2. If that fails and the ORCID is valid, run a name search and accept only
 *    a record carrying the same ORCID (otherwise null).
 * 3. Without a valid ORCID, pick the best name match: highest name score,
 *    +50 when a last-known institution matches $institution, ties broken by
 *    the larger works_count.
 *
 * @param string $orcid       ORCID iD (may be empty)
 * @param string $firstName   given name
 * @param string $lastName    family name
 * @param string $institution institution used as a ranking bonus
 * @return array|null the selected author record, or null
 */
private function resolveOpenAlexAuthor($orcid, $firstName, $lastName, $institution)
{
    $direct = $this->openalexAuthorByOrcid($orcid);
    if ($direct) {
        return $direct;
    }

    $wantedOrcid = $this->normalizeOrcid($orcid);
    if ($wantedOrcid !== '') {
        // A valid ORCID is authoritative: never fall back to a name-only guess.
        foreach ($this->openalexAuthorsByName($firstName, $lastName) as $row) {
            if ($this->extractOrcidFromOpenAlexAuthor($row) === $wantedOrcid) {
                return $row;
            }
        }
        return null;
    }

    if ($lastName === '' && $firstName === '') {
        return null;
    }

    $top = null;
    $topScore = -1;
    foreach ($this->openalexAuthorsByName($firstName, $lastName) as $row) {
        $label = $row['display_name'] ?? '';
        if (!$this->isAcceptableNameMatch($label, $firstName, $lastName)) {
            continue;
        }

        $points = $this->scoreCandidateNameMatch($label, $firstName, $lastName);
        if ($institution !== '') {
            foreach ($row['last_known_institutions'] ?? [] as $inst) {
                if ($this->institutionMatches($inst['display_name'] ?? '', $institution)) {
                    $points += 50;
                    break;
                }
            }
        }

        $wins = $points > $topScore;
        if (!$wins && $points === $topScore && $top !== null) {
            // Equal score: prefer the more prolific author record.
            $wins = (int) ($row['works_count'] ?? 0) > (int) ($top['works_count'] ?? 0);
        }
        if ($wins) {
            $topScore = $points;
            $top = $row;
        }
    }

    return $top;
}
/**
 * Returns the largest publication count reported by any source.
 *
 * Considered: OpenAlex works_count, ORCID papers_total, PubMed total, and
 * one Scopus document_count — that of the entry with the same ORCID, or,
 * when no ORCID is known, that of the only entry if there is exactly one.
 *
 * @param array  $metrics   result of openalexMetrics()
 * @param array  $orcidData result of orcidProfile()
 * @param array  $pubmed    result of pubmedSearch()
 * @param array  $scopusApi result of scopusApiSearch()
 * @param string $orcid     ORCID iD of the author (may be empty)
 * @return int
 */
private function resolveWorksCount($metrics, $orcidData, $pubmed, $scopusApi, $orcid)
{
    $totals = [
        (int) ($metrics['works_count'] ?? 0),
        (int) ($orcidData['papers_total'] ?? 0),
        (int) ($pubmed['total'] ?? 0),
    ];

    $wanted = $this->normalizeOrcid($orcid);
    $rows = $scopusApi['entries'] ?? [];

    if ($wanted !== '') {
        foreach ($rows as $row) {
            if ($this->normalizeOrcid($row['orcid'] ?? '') === $wanted) {
                $totals[] = (int) ($row['document_count'] ?? 0);
                break;
            }
        }
    } elseif (count($rows) === 1) {
        $totals[] = (int) ($rows[0]['document_count'] ?? 0);
    }

    return max($totals);
}
/**
 * Flattens an OpenAlex author record into the metric fields used by the
 * report.
 *
 * @param array|null $author author record, or a falsy value when none was found
 * @return array fixed set of keys; `found` is false and all values are empty/zero when $author is falsy
 */
private function openalexMetrics($author)
{
    if (!$author) {
        return [
            'found' => false, 'display_name' => '', 'openalex_id' => '', 'orcid' => '',
            'works_count' => 0, 'cited_by_count' => 0, 'h_index' => 0, 'i10_index' => 0,
            'institutions' => [], 'topics' => [], 'openalex_url' => '', 'scopus_id' => '', 'scopus_url' => '',
        ];
    }

    $summary = $author['summary_stats'] ?? [];
    $openalexId = $author['id'] ?? '';
    $scopusAuthorId = $this->extractScopusId($author['ids']['scopus'] ?? '');

    $institutionNames = [];
    foreach ($author['last_known_institutions'] ?? [] as $inst) {
        $institutionNames[] = $inst['display_name'] ?? '';
    }

    // Only the first five topics are reported, each as "name (N篇)".
    $topicLabels = [];
    foreach (array_slice($author['topics'] ?? [], 0, 5) as $topic) {
        $topicLabels[] = ($topic['display_name'] ?? '') . ' (' . ($topic['count'] ?? 0) . '篇)';
    }

    $scopusLink = '';
    if ($scopusAuthorId) {
        $scopusLink = $this->scopusAuthorUrl($scopusAuthorId);
    }

    return [
        'found' => true,
        'display_name' => $author['display_name'] ?? '',
        'openalex_id' => $openalexId,
        // Keep only the part after the last "/" of the ORCID URL.
        'orcid' => preg_replace('#.*/#', '', $author['orcid'] ?? ''),
        'works_count' => (int) ($author['works_count'] ?? 0),
        'cited_by_count' => (int) ($author['cited_by_count'] ?? 0),
        'h_index' => (int) ($summary['h_index'] ?? 0),
        'i10_index' => (int) ($summary['i10_index'] ?? 0),
        'institutions' => $institutionNames,
        'topics' => $topicLabels,
        'openalex_url' => str_replace('https://openalex.org/', 'https://openalex.org/authors/', $openalexId),
        'scopus_id' => $scopusAuthorId,
        'scopus_url' => $scopusLink,
    ];
}
/**
 * Extracts a numeric Scopus author id from either a URL containing
 * `authorID=<digits>` or a bare string of at least eight digits.
 *
 * @param mixed $raw URL or id
 * @return string the digits, or '' when nothing matches
 */
private function extractScopusId($raw)
{
    if (!$raw) {
        return '';
    }

    $text = (string) $raw;
    $hit = [];
    if (preg_match('/authorID=(\d+)/i', $text, $hit) || preg_match('/^(\d{8,})$/', $text, $hit)) {
        return $hit[1];
    }

    return '';
}
/**
 * Builds the Scopus author-detail page URL for an author id.
 *
 * @param string $authorId Scopus author id
 * @return string
 */
private function scopusAuthorUrl($authorId)
{
    $prefix = 'https://www.scopus.com/authid/detail.uri?authorId=';

    return $prefix . urlencode($authorId);
}
/**
 * Builds a link to the Scopus author search page on the website.
 *
 * With an ORCID the search is by ORCID only; otherwise it is by family name,
 * given name and (optionally) affiliation.
 *
 * @param string $last        family name
 * @param string $first       given name
 * @param string $institution affiliation
 * @param string $orcid       ORCID iD ('' to search by name)
 * @return string|null the URL, or null when there is neither an ORCID nor any name
 */
private function scopusDirectUrl($last, $first, $institution, $orcid)
{
    $query = [
        'sort' => 'count-f', 'src' => 'al', 'selectionPageSearch' => 'anl',
        'origin' => 'searchauthorfreelookup', 'activeFlag' => 'true',
        'resultsPerPage' => '20', 'exactAuthorSearch' => 'false',
    ];

    if ($orcid === '') {
        if ($last === '' && $first === '') {
            return null;
        }
        $query['authorLastName'] = $last;
        $query['authorFirstName'] = $first;
        if ($institution !== '') {
            $query['affilname'] = $institution;
        }

        // One FIELD(value) clause per non-empty part, joined with AND.
        $clauses = [];
        foreach (['AUTHLASTNAME' => $last, 'AUTHFIRST' => $first, 'AFFIL' => $institution] as $field => $value) {
            if ($value !== '') {
                $clauses[] = $field . '(' . $value . ')';
            }
        }
        $query['s'] = implode(' AND ', $clauses);
    } else {
        $query['orcidId'] = $orcid;
        $query['s'] = 'AUTH--ORCID--ID(' . $orcid . ')';
    }

    return 'https://www.scopus.com/results/authorNamesList.uri' . '?'
        . http_build_query($query, '', '&', PHP_QUERY_RFC3986);
}
/**
 * Performs a GET against an Elsevier API, sending the API key in the
 * X-ELS-APIKey header and asking for JSON. Requires the cURL extension.
 *
 * @param string $url     request URL
 * @param string $apiKey  Elsevier API key
 * @param int    $timeout timeout in seconds
 * @return string|null the body on a 2xx response; null on failure or when cURL is unavailable
 */
private function httpGetElsevier($url, $apiKey, $timeout = 25)
{
    if (!function_exists('curl_init')) {
        return null;
    }

    $handle = curl_init($url);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($handle, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($handle, CURLOPT_HTTPHEADER, ['Accept: application/json', 'X-ELS-APIKey: ' . $apiKey]);
    curl_setopt($handle, CURLOPT_SSL_VERIFYPEER, true);
    $raw = curl_exec($handle);
    $status = (int) curl_getinfo($handle, CURLINFO_HTTP_CODE);
    curl_close($handle);

    if ($raw === false || $status < 200 || $status >= 300) {
        return null;
    }

    return $raw;
}
/**
 * Searches the Scopus Author Search API by ORCID or by name.
 *
 * With an ORCID the query is `ORCID(<id>)`. Otherwise it is built from the
 * family name, given name and — unless $nameOnly is set — the institution.
 * Name parts are stripped to letters, digits, whitespace and hyphens first.
 *
 * @param string $orcid       ORCID iD ('' to search by name)
 * @param string $last        family name
 * @param string $first       given name
 * @param string $institution affiliation filter (ignored when $nameOnly)
 * @param bool   $nameOnly    search by name only and request 25 instead of 10 rows
 * @return array{ok:bool,msg:string,entries:array} each entry has author_id, name, affiliation, orcid, document_count, cited_by_count, h_index and url
 */
private function scopusApiSearch($orcid, $last, $first, $institution, $nameOnly = false)
{
    $apiKey = $this->scopusApiKey;
    if (trim($apiKey) === '') {
        return ['ok' => false, 'msg' => '未配置 Scopus API Key', 'entries' => []];
    }

    if ($orcid !== '') {
        $query = 'ORCID(' . $orcid . ')';
    } else {
        $parts = [];
        if ($last !== '') {
            $parts[] = 'AUTHLASTNAME(' . preg_replace('/[^\pL\pN\s\-]/u', '', $last) . ')';
        }
        if ($first !== '') {
            $parts[] = 'AUTHFIRST(' . preg_replace('/[^\pL\pN\s\-]/u', '', $first) . ')';
        }
        if (!$nameOnly && $institution !== '') {
            $parts[] = 'AFFIL(' . $institution . ')';
        }
        if (empty($parts)) {
            return ['ok' => false, 'msg' => '缺少检索条件', 'entries' => []];
        }
        $query = implode(' AND ', $parts);
    }

    $count = $nameOnly ? 25 : 10;
    $url = 'https://api.elsevier.com/content/search/author?query=' . urlencode($query) . '&count=' . $count;
    $json = $this->httpGetElsevier($url, $apiKey);
    if (!$json) {
        return ['ok' => false, 'msg' => 'Scopus API 请求失败,请检查 Key 或网络', 'entries' => []];
    }

    $data = json_decode($json, true);
    // Tolerate a body that is not the expected JSON object.
    $rawEntries = is_array($data) ? ($data['search-results']['entry'] ?? []) : [];
    if (!is_array($rawEntries)) {
        $rawEntries = [];
    }

    $entries = [];
    foreach ($rawEntries as $e) {
        if (!is_array($e)) {
            continue;
        }
        // An empty result set comes back as one placeholder entry carrying an
        // "error" key. It is not an author, so it must not be counted or
        // returned as a blank match.
        if (isset($e['error'])) {
            continue;
        }

        $authorId = '';
        if (preg_match('/AUTHOR_ID:(\d+)/', (string) ($e['dc:identifier'] ?? ''), $m)) {
            $authorId = $m[1];
        }

        // Prefer the indexed name; otherwise combine given name and surname.
        $name = '';
        if (!empty($e['preferred-name'])) {
            $pn = $e['preferred-name'];
            $name = ($pn['ce:indexed-name'] ?? '')
                ?: trim(($pn['ce:given-name'] ?? '') . ' ' . ($pn['ce:surname'] ?? ''));
        }

        $aff = '';
        if (!empty($e['affiliation-current']['affiliation-name'])) {
            $aff = $e['affiliation-current']['affiliation-name'];
        } elseif (!empty($e['affiliation-current']['ip-doc']['afdispname'])) {
            $aff = $e['affiliation-current']['ip-doc']['afdispname'];
        }

        // The ORCID may appear under either key spelling.
        $entryOrcid = '';
        foreach (['orcid', 'ORCID'] as $orcidKey) {
            if (!empty($e[$orcidKey])) {
                $entryOrcid = $this->normalizeOrcid((string) $e[$orcidKey]);
                if ($entryOrcid !== '') {
                    break;
                }
            }
        }

        $entries[] = [
            'author_id' => $authorId,
            'name' => $name,
            'affiliation' => $aff,
            'orcid' => $entryOrcid,
            'document_count' => (int) ($e['document-count'] ?? 0),
            'cited_by_count' => (int) ($e['cited-by-count'] ?? 0),
            'h_index' => (int) ($e['h-index'] ?? 0),
            'url' => $authorId ? $this->scopusAuthorUrl($authorId) : '',
        ];
    }

    return ['ok' => true, 'msg' => '共匹配 ' . count($entries) . ' 位作者', 'entries' => $entries];
}
/**
 * Loads name, employers and works for an ORCID iD from the public ORCID API
 * (two requests: `/person` and `/works`).
 *
 * Each work group contributes its first work summary. Papers are sorted by
 * year, newest first.
 *
 * NOTE(review): employers are read from `activities-summary.employments` of
 * the `/person` response. It looks like that section may only be present in
 * `/record` or `/employments` responses, in which case `affiliations` would
 * always be empty — confirm against the ORCID API.
 *
 * @param string $orcid bare ORCID iD
 * @return array{name:string,affiliations:array,papers:array,papers_total:int}
 */
private function orcidProfile($orcid)
{
    $endpoint = "https://pub.orcid.org/v3.0/$orcid";
    $accept = ['Accept: application/json'];
    // A failed request is treated like an empty JSON object.
    $person = json_decode($this->httpGet("$endpoint/person", $accept) ?: '{}', true);
    $works = json_decode($this->httpGet("$endpoint/works", $accept) ?: '{}', true);

    $fullName = '';
    if (!empty($person['name'])) {
        $given = $person['name']['given-names']['value'] ?? '';
        $family = $person['name']['family-name']['value'] ?? '';
        $fullName = trim("$given $family");
    }

    $employers = [];
    foreach ($person['activities-summary']['employments']['affiliation-group'] ?? [] as $group) {
        $summary = $group['summaries'][0]['employment-summary'] ?? [];
        $orgName = $summary['organization']['name'] ?? '';
        if ($orgName) {
            $employers[] = $orgName;
        }
    }

    $papers = [];
    foreach ($works['group'] ?? [] as $workGroup) {
        $work = $workGroup['work-summary'][0] ?? [];

        // Keep the first non-empty DOI and PMID among the external ids.
        $ids = ['doi' => '', 'pmid' => ''];
        foreach ($work['external-ids']['external-id'] ?? [] as $external) {
            $kind = strtolower($external['external-id-type'] ?? '');
            if (isset($ids[$kind]) && $ids[$kind] === '') {
                $ids[$kind] = $external['external-id-value'] ?? '';
            }
        }

        $papers[] = [
            'title' => $work['title']['title']['value'] ?? '无标题',
            'year' => $work['publication-date']['year']['value'] ?? '',
            'journal' => $work['journal-title']['value'] ?? '',
            'doi' => $ids['doi'],
            'pmid' => $ids['pmid'],
            'url' => $work['url']['value'] ?? '',
        ];
    }

    usort($papers, function ($a, $b) {
        return (int) ($b['year'] ?? 0) <=> (int) ($a['year'] ?? 0);
    });

    return [
        'name' => $fullName,
        'affiliations' => $employers,
        'papers' => $papers,
        'papers_total' => count($papers),
    ];
}
/**
 * Runs a PubMed E-utilities `esearch` query.
 *
 * @param string $term   PubMed search term
 * @param int    $retmax maximum number of ids to return (0 = count only)
 * @param string $sort   optional sort order
 * @return array{count:int,ids:array,term:string} hit count, id list and the term echoed back
 */
private function pubmedEsearch($term, $retmax = 0, $sort = '')
{
    $requestUrl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='
        . urlencode($term) . '&retmode=json&retmax=' . $retmax
        . '&tool=MedicalAuthorCheck&email=' . urlencode($this->email);
    if ($sort !== '') {
        $requestUrl .= '&sort=' . urlencode($sort);
    }

    // A failed request is treated like an empty JSON object.
    $decoded = json_decode($this->httpGet($requestUrl) ?: '{}', true);
    $result = $decoded['esearchresult'] ?? [];

    return [
        'count' => (int) ($result['count'] ?? 0),
        'ids' => $result['idlist'] ?? [],
        'term' => $term,
    ];
}
/**
 * Fetches PubMed `esummary` records for the given PMIDs, 20 per request, and
 * maps each to the paper shape used by the report.
 *
 * A PMID missing from the response still yields a row, with a placeholder
 * title "PMID <id>" and otherwise empty fields.
 *
 * @param array $ids PMIDs
 * @return array list of papers (pmid, title, year, journal, doi, url)
 */
private function pubmedFetchSummaries(array $ids)
{
    $out = [];
    if (empty($ids)) {
        return $out;
    }

    $contact = urlencode($this->email);
    $endpoint = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=';

    foreach (array_chunk($ids, 20) as $batch) {
        $requestUrl = $endpoint . implode(',', $batch)
            . '&retmode=json&tool=MedicalAuthorCheck&email=' . $contact;
        $payload = json_decode($this->httpGet($requestUrl) ?: '{}', true);

        foreach ($batch as $id) {
            $record = $payload['result'][$id] ?? [];

            // The first article id of type "doi" is used.
            $doi = '';
            foreach ($record['articleids'] ?? [] as $articleId) {
                if (strtolower($articleId['idtype'] ?? '') === 'doi') {
                    $doi = $articleId['value'] ?? '';
                    break;
                }
            }

            $out[] = [
                'pmid' => $id,
                'title' => $record['title'] ?? ('PMID ' . $id),
                // The year is the first four characters of the pubdate string.
                'year' => substr($record['pubdate'] ?? '', 0, 4),
                'journal' => $record['fulljournalname'] ?? $record['source'] ?? '',
                'doi' => $doi,
                'url' => '',
            ];
        }
    }

    return $out;
}
/**
 * Builds the ordered list of PubMed search terms to try, most specific
 * first: ORCID, then name + affiliation, then name only.
 *
 * "Leading token" below is the first whitespace-separated word of $first.
 *
 * @param string $last        family name
 * @param string $first       given name(s)
 * @param string $institution affiliation
 * @param string $orcid       ORCID iD
 * @return array de-duplicated list of search terms
 */
private function pubmedBuildQueries($last, $first, $institution, $orcid)
{
    $terms = [];
    if ($orcid !== '') {
        $terms[] = $orcid . '[ORCID]';
    }

    $words = preg_split('/\s+/', trim($first));
    $lead = $words[0] ?? '';

    if ($last !== '') {
        // Name clauses: by leading token, and by the full quoted given name.
        $byLead = $lead !== '' ? $last . '[Author] AND ' . $lead . '[Author]' : null;
        $byFull = $first !== '' ? $last . '[Author] AND "' . $first . '"[Author]' : null;
        $affClause = ' AND ' . $institution . '[Affiliation]';

        if ($byLead !== null && $institution !== '') {
            $terms[] = $byLead . $affClause;
        }
        if ($byFull !== null && $institution !== '') {
            $terms[] = $byFull . $affClause;
        }
        if ($byLead !== null) {
            $terms[] = $byLead;
        }
        if ($byFull !== null) {
            $terms[] = $byFull;
        }
        if ($byFull === null) {
            // No given name at all: family name only.
            $terms[] = $last . '[Author]';
        }
    }

    return array_values(array_unique($terms));
}
/**
 * Searches PubMed for the author's papers.
 *
 * The terms from pubmedBuildQueries() are tried in order; the first with at
 * least one hit is used. Its newest $listMax papers are then fetched (sorted
 * by publication date, retried without sorting if that returns no ids).
 *
 * @param string $last        family name
 * @param string $first       given name
 * @param string $institution affiliation
 * @param string $orcid       ORCID iD
 * @param int    $listMax     maximum number of papers to list
 * @return array{total:int,papers:array,query:string,pubmed_url:string}
 */
private function pubmedSearch($last, $first, $institution, $orcid = '', $listMax = 10)
{
    $hit = ['count' => 0, 'ids' => [], 'term' => ''];
    foreach ($this->pubmedBuildQueries($last, $first, $institution, $orcid) as $candidateTerm) {
        // Count-only probe (retmax 0).
        $probe = $this->pubmedEsearch($candidateTerm, 0);
        if ($probe['count'] > 0) {
            $hit = $probe;
            break;
        }
    }

    $papers = [];
    if ($hit['count'] > 0) {
        $idList = $this->pubmedEsearch($hit['term'], $listMax, 'pub date')['ids'];
        if (empty($idList)) {
            $idList = $this->pubmedEsearch($hit['term'], $listMax)['ids'];
        }
        $papers = $this->pubmedFetchSummaries($idList);
    }

    return [
        'total' => $hit['count'],
        'papers' => $papers,
        'query' => $hit['term'],
        'pubmed_url' => 'https://pubmed.ncbi.nlm.nih.gov/?term=' . urlencode($hit['term']),
    ];
}
/**
 * Picks the best link for opening a paper: its own valid URL, else a
 * doi.org link, else the PubMed page for its PMID.
 *
 * The DOI may be bare ("10.1000/x"), prefixed with "doi:", or a full
 * doi.org / dx.doi.org URL; the prefix is removed before the link is built.
 *
 * @param array $p paper with optional `url`, `doi` and `pmid` keys
 * @return string the link, or '' when none can be built
 */
public function paperOpenUrl(array $p)
{
    if (!empty($p['url']) && filter_var($p['url'], FILTER_VALIDATE_URL)) {
        return $p['url'];
    }

    if (!empty($p['doi'])) {
        // ltrim() takes a character mask, not a prefix, so it mangled inputs
        // such as "http://dx.doi.org/10.x" (leaving "x.doi.org/10.x").
        // Strip the known prefixes explicitly instead.
        $doi = (string) preg_replace(
            '#^\s*(?:https?://(?:dx\.)?doi\.org/|doi:\s*)#i',
            '',
            (string) $p['doi']
        );
        $doi = ltrim(trim($doi), '/');
        if ($doi !== '') {
            return 'https://doi.org/' . $doi;
        }
    }

    if (!empty($p['pmid'])) {
        return 'https://pubmed.ncbi.nlm.nih.gov/' . $p['pmid'] . '/';
    }

    return '';
}
/**
 * Returns the path of the cached Retraction Watch CSV:
 * <root>/runtime/retraction_watch_cache.csv, where <root> is ROOT_PATH when
 * that constant is defined, otherwise three directories above this file.
 *
 * @return string
 */
private function rwCachePath()
{
    if (defined('ROOT_PATH')) {
        $base = rtrim(ROOT_PATH, '/\\');
    } else {
        $base = dirname(__DIR__, 3);
    }

    return implode(DIRECTORY_SEPARATOR, [$base, 'runtime', 'retraction_watch_cache.csv']);
}
/**
 * Makes sure a local copy of the Retraction Watch CSV exists.
 *
 * A cache file younger than RW_CACHE_H seconds is used as is. Otherwise the
 * CSV is downloaded; a failed or suspiciously small (< 1000 bytes) download
 * falls back to whatever cache file already exists.
 *
 * @return bool true when a cache file is available afterwards
 */
private function downloadRetractionWatch()
{
    $file = $this->rwCachePath();
    $folder = dirname($file);
    if (!is_dir($folder)) {
        @mkdir($folder, 0755, true);
    }

    $isFresh = is_file($file) && (time() - filemtime($file)) < self::RW_CACHE_H;
    if ($isFresh) {
        return true;
    }

    $payload = $this->httpGet(self::RW_CSV_URL, [], 60);
    $usable = $payload && strlen($payload) >= 1000;
    if ($usable) {
        return file_put_contents($file, $payload) !== false;
    }

    // Download failed: a stale cache is still better than none.
    return is_file($file);
}
/**
 * Canonicalises a DOI for comparison: lower-cased, trimmed, without a leading
 * doi.org / dx.doi.org URL prefix and without surrounding slashes.
 * Placeholder values ("unavailable", "na", "n/a") count as no DOI.
 *
 * @param mixed $raw DOI or DOI URL
 * @return string the bare DOI, or ''
 */
private function normalizeDoi($raw)
{
    $doi = strtolower(trim((string) $raw));
    if (in_array($doi, ['', 'unavailable', 'na', 'n/a'], true)) {
        return '';
    }

    $withoutPrefix = preg_replace('#^https?://(dx\.)?doi\.org/#i', '', $doi);

    return trim($withoutPrefix, '/');
}
/**
 * Indexes papers by normalised DOI. Papers without a DOI are left out; when
 * two papers share a DOI the later one wins.
 *
 * @param array $papers list of papers with an optional `doi` key
 * @return array map of normalised DOI => paper
 */
private function collectPaperDois(array $papers)
{
    $byDoi = [];
    foreach ($papers as $paper) {
        $key = $this->normalizeDoi($paper['doi'] ?? '');
        if ($key === '') {
            continue;
        }
        $byDoi[$key] = $paper;
    }

    return $byDoi;
}
/**
 * Tells whether a retraction reason mentions any of a fixed list of
 * misconduct keywords (case-insensitive substring match).
 *
 * @param mixed $reason reason text
 * @return bool
 */
private function isMisconductReason($reason)
{
    $text = strtolower((string) $reason);
    $markers = [
        'misconduct', 'fabrication', 'falsification', 'plagiarism',
        'fake peer', 'paper mill', 'ethical violation', 'breach of policy',
        'complaints about author', 'fraud', 'manipulation',
        '不端', '造假', '抄袭', '剽窃',
    ];

    $total = count($markers);
    for ($i = 0; $i < $total; $i++) {
        if (strpos($text, $markers[$i]) !== false) {
            return true;
        }
    }

    return false;
}
/**
 * Reduces a title to a comparison key: ASCII letters lower-cased, then every
 * character other than a-z, 0-9 and CJK ideographs (U+4E00–U+9FFF) removed.
 *
 * @param mixed $title title text
 * @return string the key; '' for an empty title or invalid UTF-8
 */
private function titleMatchKey($title)
{
    // Lower-case before filtering. The filter only keeps a-z, so filtering
    // first deleted every capital letter ("Hello" -> "ello") and gave titles
    // that differ only in capitalisation different keys. strtr() is used so
    // only ASCII letters are touched, whatever the locale.
    $lowered = strtr(
        trim((string) $title),
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
        'abcdefghijklmnopqrstuvwxyz'
    );

    $key = preg_replace('/[^a-z0-9\x{4e00}-\x{9fff}]+/u', '', $lowered);

    // preg_replace() returns null when the subject is not valid UTF-8.
    return $key === null ? '' : $key;
}
/**
 * Compares a Retraction Watch title with a paper's title using their
 * titleMatchKey() forms: a match when the keys are equal or one contains the
 * other.
 *
 * @param mixed $rwTitle title from the Retraction Watch record
 * @param array $paper   paper with an optional `title` key
 * @return bool false when either key is empty
 */
private function rwTitleMatchesPaper($rwTitle, array $paper)
{
    $recordKey = $this->titleMatchKey($rwTitle);
    $paperKey = $this->titleMatchKey($paper['title'] ?? '');

    if ($recordKey === '' || $paperKey === '') {
        return false;
    }
    if ($recordKey === $paperKey) {
        return true;
    }
    if (strpos($recordKey, $paperKey) !== false) {
        return true;
    }

    return strpos($paperKey, $recordKey) !== false;
}
/**
 * Collects the lower-cased name fragments used to recognise the author in a
 * Retraction Watch author field.
 *
 * Order: family name, full given name, each given-name word longer than one
 * byte, each display-name word longer than two bytes. Duplicates are removed
 * with array_unique(), so keys may have gaps, but the first token keeps
 * index 0.
 *
 * @param mixed  $first       given name
 * @param mixed  $last        family name
 * @param string $displayName full display name (optional)
 * @return array unique tokens
 */
private function nameTokens($first, $last, $displayName = '')
{
    $family = strtolower(trim((string) $last));
    $given = strtolower(trim((string) $first));

    $bag = [];
    if ($family) {
        $bag[] = $family;
    }
    if ($given) {
        $bag[] = $given;
        $givenWords = array_filter(preg_split('/\s+/', $given), function ($word) {
            return strlen($word) > 1;
        });
        // array_merge() renumbers numeric keys, i.e. appends in order.
        $bag = array_merge($bag, $givenWords);
    }
    if ($displayName) {
        $displayWords = array_filter(preg_split('/\s+/', strtolower($displayName)), function ($word) {
            return strlen($word) > 2;
        });
        $bag = array_merge($bag, $displayWords);
    }

    return array_unique($bag);
}
/**
 * Decides whether a Retraction Watch author field refers to the author
 * described by $tokens (see nameTokens()).
 *
 * The first token, when non-empty, must occur in the field. Beyond that, at
 * least two tokens of three or more bytes must occur — or one, when there is
 * only a single token.
 *
 * @param mixed $authorField author list text of the record
 * @param array $tokens      lower-cased name tokens
 * @return bool
 */
private function authorMatchesRw($authorField, array $tokens)
{
    $haystack = strtolower((string) $authorField);
    if ($haystack === '') {
        return false;
    }
    if (!empty($tokens[0]) && strpos($haystack, $tokens[0]) === false) {
        return false;
    }

    $matched = count(array_filter($tokens, function ($token) use ($haystack) {
        return strlen($token) >= 3 && strpos($haystack, $token) !== false;
    }));

    if (count($tokens) === 1) {
        return $matched >= 1;
    }

    return $matched >= 2;
}
/**
 * Chooses a detail link for a Retraction Watch CSV row. Preference order:
 * first http(s) URL in the semicolon-separated URLS column, doi.org link of
 * the original paper, doi.org link of the retraction notice, then a
 * retractionwatch.com search for the title.
 *
 * @param array $row CSV row, indexed by column position
 * @param array $col map of column header => position; fixed fallback positions are used for missing headers
 * @return string the link, or '' when the row offers nothing usable
 */
private function rwDetailUrl(array $row, array $col)
{
    $urlField = (string) ($row[$col['URLS'] ?? 7] ?? '');
    foreach (preg_split('/\s*;\s*/', $urlField) as $piece) {
        $candidate = trim($piece);
        if ($candidate !== '' && preg_match('#^https?://#i', $candidate)) {
            return $candidate;
        }
    }

    // Original paper DOI first, then the retraction notice DOI.
    foreach (['OriginalPaperDOI' => 12, 'RetractionDOI' => 9] as $header => $fallbackIndex) {
        $doi = $this->normalizeDoi($row[$col[$header] ?? $fallbackIndex] ?? '');
        if ($doi !== '') {
            return 'https://doi.org/' . $doi;
        }
    }

    $heading = trim($row[$col['Title'] ?? 1] ?? '');
    if ($heading === '') {
        return '';
    }

    return 'https://retractionwatch.com/?s=' . rawurlencode($heading);
}
/**
 * Convert one Retraction Watch CSV row into a report item.
 *
 * @param array      $row         CSV row.
 * @param array      $col         Header name => column index map; fixed
 *                                positions are used when a name is missing.
 * @param string     $matchType   How the row was matched ('doi', 'name' or
 *                                'name_loose'); unknown values are echoed
 *                                back as the label.
 * @param array|null $authorPaper Paper of the author linked to this row, if
 *                                any; its 'title' and 'year' are copied into
 *                                the item and it supplies a fallback URL.
 * @return array Item with keys record_id, title, nature, reason, date,
 *               misconduct, authors, doi, author_title, author_year, url,
 *               match_type and match_label.
 */
private function buildRwItem(array $row, array $col, $matchType, $authorPaper = null)
{
    // Read a cell by header name, falling back to a fixed column position.
    $cell = function ($name, $fallbackIndex) use ($row, $col) {
        return $row[$col[$name] ?? $fallbackIndex] ?? '';
    };

    $reasonText = $cell('Reason', 11);
    $originalDoi = $this->normalizeDoi($cell('OriginalPaperDOI', 12));
    $isMisconduct = $this->isMisconductReason($reasonText);

    // Prefer a link derived from the row; otherwise fall back to the linked paper.
    $link = $this->rwDetailUrl($row, $col);
    if (!$link && $authorPaper) {
        $link = $this->paperOpenUrl($authorPaper);
    }

    $labelFor = [
        'doi' => 'DOI 精确匹配(高可信度)',
        'name' => '姓名+题目匹配(参考,已关联 ORCID 无 DOI 作品)',
        'name_loose' => '姓名匹配(低可信度,存在同名误报风险)',
    ];

    return [
        'record_id' => $cell('Record ID', 0),
        'title' => $cell('Title', 1),
        'nature' => $cell('RetractionNature', 10),
        'reason' => $reasonText,
        'date' => $cell('RetractionDate', 5),
        'misconduct' => $isMisconduct,
        'authors' => $cell('Author', 6),
        'doi' => $originalDoi,
        'author_title' => $authorPaper['title'] ?? '',
        'author_year' => $authorPaper['year'] ?? '',
        'url' => $link,
        'match_type' => $matchType,
        'match_label' => $labelFor[$matchType] ?? $matchType,
    ];
}
/**
 * Tally retraction and misconduct items.
 *
 * An item counts as a retraction when its 'nature' contains "retraction"
 * (case-insensitive) and as misconduct when its 'misconduct' entry is truthy.
 *
 * @param array $items Report items.
 * @return array ['misconduct_count' => int, 'retraction_count' => int]
 */
private function countRwStats(array $items)
{
    $flaggedMisconduct = array_filter($items, function ($entry) {
        return !empty($entry['misconduct']);
    });
    $flaggedRetraction = array_filter($items, function ($entry) {
        return stripos($entry['nature'] ?? '', 'retraction') !== false;
    });

    return [
        'misconduct_count' => count($flaggedMisconduct),
        'retraction_count' => count($flaggedRetraction),
    ];
}
/**
 * Scan the cached Retraction Watch CSV for records related to the author's papers.
 *
 * The file is read in two passes. The first pass collects rows whose
 * OriginalPaperDOI equals the DOI of one of $papers (match type "doi"). The
 * second pass runs only when at least one paper has no DOI and name tokens
 * exist; it matches on the Author column and then either links the row to a
 * DOI-less paper by title ("name") or, failing that, keeps it as a loose name
 * match ("name_loose") subject to the institution filter and a size cap.
 *
 * @param array  $papers      Paper records; the 'doi' and 'title' entries are read here.
 * @param mixed  $first       Author given name(s), used to build name tokens.
 * @param mixed  $last        Author family name, used to build name tokens.
 * @param mixed  $institution Institution used to filter loose name matches; '' disables the filter.
 * @param string $displayName Optional display name contributing extra name tokens.
 * @return array Result with keys ok, msg, items, misconduct_count, retraction_count,
 *               checked_doi_count, no_doi_count, doi_match_count, name_match_count
 *               and name_loose_match_count. 'ok' is false when the database could
 *               not be downloaded or opened.
 */
private function searchRetractionsHybrid(array $papers, $first, $last, $institution, $displayName = '')
{
// Failure result returned when the database cannot be used.
$empty = [
'ok' => false, 'msg' => '', 'items' => [],
'misconduct_count' => 0, 'retraction_count' => 0,
'checked_doi_count' => 0, 'no_doi_count' => count($papers),
'doi_match_count' => 0, 'name_match_count' => 0, 'name_loose_match_count' => 0,
];
if (!$this->downloadRetractionWatch()) {
$empty['msg'] = '撤稿数据库暂不可用';
return $empty;
}
// NOTE(review): collectPaperDois() presumably returns papers keyed by normalized DOI — confirm.
$paperByDoi = $this->collectPaperDois($papers);
// Papers without a usable DOI are the only ones eligible for title linking below.
$noDoiPapers = [];
foreach ($papers as $p) {
if ($this->normalizeDoi($p['doi'] ?? '') === '') {
$noDoiPapers[] = $p;
}
}
$checkedCount = count($paperByDoi);
$noDoiCount = count($noDoiPapers);
$path = $this->rwCachePath();
$fp = fopen($path, 'r');
if (!$fp) {
$empty['msg'] = '撤稿数据库读取失败';
return $empty;
}
// Header name => column index; the numeric fallbacks used below apply when a name is absent.
$header = fgetcsv($fp);
$col = array_flip($header ?: []);
// DOI => position lookup so membership can be tested with isset().
$doiIndex = array_flip(array_keys($paperByDoi));
$tokens = $this->nameTokens($first, $last, $displayName);
$instLow = strtolower((string) $institution);
$items = [];
$seenKeys = [];
// Appends an item unless one with the same record id, DOI and title key was already added.
$addItem = function (array $item) use (&$items, &$seenKeys) {
$key = ($item['record_id'] ?? '') . '|' . ($item['doi'] ?? '') . '|' . $this->titleMatchKey($item['title'] ?? '');
if (isset($seenKeys[$key])) {
return;
}
$seenKeys[$key] = true;
$items[] = $item;
};
// Pass 1: exact DOI matches.
while (($row = fgetcsv($fp)) !== false) {
$origDoi = $this->normalizeDoi($row[$col['OriginalPaperDOI'] ?? 12] ?? '');
if ($origDoi !== '' && isset($doiIndex[$origDoi])) {
$addItem($this->buildRwItem($row, $col, 'doi', $paperByDoi[$origDoi]));
}
}
// Restart from the top of the file and skip the header row again.
rewind($fp);
fgetcsv($fp);
// Pass 2: name-based fallback, only when some papers lack a DOI.
if ($noDoiCount > 0 && !empty($tokens)) {
while (($row = fgetcsv($fp)) !== false) {
$origDoi = $this->normalizeDoi($row[$col['OriginalPaperDOI'] ?? 12] ?? '');
// Rows already handled by the DOI pass are not reconsidered.
if ($origDoi !== '' && isset($doiIndex[$origDoi])) {
continue;
}
$authors = $row[$col['Author'] ?? 6] ?? '';
if (!$this->authorMatchesRw($authors, $tokens)) {
continue;
}
$rwTitle = $row[$col['Title'] ?? 1] ?? '';
// Try to tie the row to one of the DOI-less papers by title; the first match wins.
$linkedPaper = null;
foreach ($noDoiPapers as $p) {
if ($this->rwTitleMatchesPaper($rwTitle, $p)) {
$linkedPaper = $p;
break;
}
}
if ($linkedPaper) {
$addItem($this->buildRwItem($row, $col, 'name', $linkedPaper));
continue;
}
// Author matched but no title did: drop the row when both institutions are
// known and neither string contains the other.
if ($instLow !== '') {
$inst = strtolower($row[$col['Institution'] ?? 4] ?? '');
if ($inst !== '' && strpos($inst, $instLow) === false && strpos($instLow, $inst) === false) {
continue;
}
}
// Loose matches are added only while the total item count (all match types) is below 50.
if (count($items) < 50) {
$addItem($this->buildRwItem($row, $col, 'name_loose', null));
}
}
}
fclose($fp);
// Per-match-type totals; anything that is neither 'doi' nor 'name' counts as loose.
$doiMatch = $nameMatch = $nameLooseMatch = 0;
foreach ($items as $it) {
if ($it['match_type'] === 'doi') {
$doiMatch++;
} elseif ($it['match_type'] === 'name') {
$nameMatch++;
} else {
$nameLooseMatch++;
}
}
$stats = $this->countRwStats($items);
return [
'ok' => true,
'msg' => 'DOI 比对 ' . $checkedCount . ' 篇,无 DOI 作品 ' . $noDoiCount . ' 篇已启用姓名回退',
'items' => $items,
'misconduct_count' => $stats['misconduct_count'],
'retraction_count' => $stats['retraction_count'],
'checked_doi_count' => $checkedCount,
'no_doi_count' => $noDoiCount,
'doi_match_count' => $doiMatch,
'name_match_count' => $nameMatch,
'name_loose_match_count' => $nameLooseMatch,
];
}
/**
 * Merge ORCID and PubMed papers into one list, tagging each with its origin.
 *
 * Every paper gets a 'source' entry of 'ORCID' or 'PubMed' (overwriting any
 * existing value); ORCID papers come first and the result is re-indexed.
 *
 * @param array $orcidPapers  Papers obtained from ORCID.
 * @param array $pubmedPapers Papers obtained from PubMed.
 * @return array Combined, sequentially indexed list.
 */
private function papersForDupCheck(array $orcidPapers, array $pubmedPapers)
{
    $bySource = [
        'ORCID' => $orcidPapers,
        'PubMed' => $pubmedPapers,
    ];

    $combined = [];
    foreach ($bySource as $label => $list) {
        foreach ($list as $paper) {
            $paper['source'] = $label;
            $combined[] = $paper;
        }
    }

    return $combined;
}
/**
 * Find groups of papers that share the same title after normalisation.
 *
 * Titles are compared on a key made of ASCII letters (folded to lower case),
 * ASCII digits and CJK ideographs in U+4E00–U+9FFF; everything else is
 * ignored. Papers with an empty title, a title starting with "PMID", or an
 * empty key are skipped.
 *
 * @param array $papers Paper records; only the 'title' entry is read.
 * @return array List of ['title' => string, 'papers' => array] for every key
 *               shared by two or more papers; 'title' is taken from the first
 *               paper of the group.
 */
private function checkDuplicateTitles(array $papers)
{
    $groups = [];
    foreach ($papers as $paper) {
        $title = trim($paper['title'] ?? '');
        if ($title === '' || strpos($title, 'PMID') === 0) {
            continue;
        }

        // Filter case-insensitively and fold afterwards. Filtering with a
        // lower-case-only class first deletes capitals, which made distinct
        // titles such as "A Study" / "B Study" collide and kept case variants
        // of one title apart. The cast covers the null preg_replace() returns
        // for invalid UTF-8.
        $key = strtolower((string) preg_replace('/[^a-z0-9\x{4e00}-\x{9fff}]+/iu', '', $title));
        if ($key === '') {
            continue;
        }

        $groups[$key][] = $paper;
    }

    $duplicates = [];
    foreach ($groups as $members) {
        if (count($members) >= 2) {
            $duplicates[] = ['title' => $members[0]['title'], 'papers' => $members];
        }
    }

    return $duplicates;
}
/**
 * Produce the risk summary line for the report.
 *
 * Checks are applied in order: any misconduct record, any retraction record,
 * a works count identical to integer 0, then an h-index of at least 10 or at
 * least 20 works; anything else gets the general label.
 *
 * @param array $rw     Retraction result; 'misconduct_count' and
 *                      'retraction_count' are read (missing entries count as 0).
 * @param mixed $hIndex Author h-index.
 * @param mixed $works  Number of works found.
 * @return string Risk label text.
 */
private function riskLevel(array $rw, $hIndex, $works)
{
    $misconductTotal = $rw['misconduct_count'] ?? 0;
    $retractionTotal = $rw['retraction_count'] ?? 0;

    if ($misconductTotal > 0) {
        $verdict = '高风险 — 存在学术不端相关撤稿记录,建议人工复核';
    } elseif ($retractionTotal > 0) {
        $verdict = '中风险 — 存在撤稿 / 关注声明,请核对是否与本人相关';
    } elseif ($works === 0) {
        $verdict = '待核实 — 未检索到论文,请核对 ORCID / 姓名拼写';
    } elseif ($hIndex >= 10 || $works >= 20) {
        $verdict = '低风险 — 学术产出指标正常';
    } else {
        $verdict = '一般 — 青年学者常见产出区间,建议结合研究方向综合判断';
    }

    return $verdict;
}
}