Files
tougao/application/common/BackgroundCheckService.php
2026-06-04 13:33:13 +08:00

706 lines
24 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
/**
 * Shared background-check service.
 *
 * Wraps read-only lookups against the OpenAlex and CrossRef REST APIs. Retraction Watch
 * records are read through CrossRef. The class also normalises the responses into plain
 * arrays (author summaries, work summaries, merged retraction lists, a risk assessment).
 *
 * Network-facing methods never throw; they report failure through a
 * `['success' => false, 'error' => string]` array (or an `error` key next to an empty list).
 */
class BackgroundCheckService
{
    private $openAlexBase = 'https://api.openalex.org';
    private $crossRefBase = 'https://api.crossref.org';
    // Contact address sent as the `mailto` query parameter and inside the User-Agent header.
    private $mailto = 'publisher@tmrjournals.com';

    // ===================== OpenAlex =====================

    /**
     * Perform a GET request against the OpenAlex API and decode the JSON body.
     *
     * @param string $path  API path starting with "/", appended to the OpenAlex base URL.
     * @param array  $query Query parameters; `mailto` is always added/overwritten.
     * @return array `['success' => true, 'data' => array]` on HTTP 200 with a JSON array/object body,
     *               otherwise `['success' => false, 'error' => string]`.
     */
    public function openAlexGet($path, $query = [])
    {
        $query['mailto'] = $this->mailto;
        // Explicit '&' so a customised arg_separator.output ini value cannot corrupt the URL.
        $url = $this->openAlexBase . $path . '?' . http_build_query($query, '', '&');
        $result = $this->httpGet($url, $this->defaultHeaders());
        if (!$result['success']) {
            return $result;
        }
        // Error responses (404, 429, ...) may carry a JSON body as well; without this check
        // such a payload would be handed to callers as if it were the requested entity.
        if ($result['http_code'] != 200) {
            return ['success' => false, 'error' => 'OpenAlex返回 HTTP ' . $result['http_code']];
        }
        $data = json_decode($result['body'], true);
        if (!is_array($data)) {
            return ['success' => false, 'error' => 'OpenAlex返回数据格式异常'];
        }
        return ['success' => true, 'data' => $data];
    }

    /**
     * Resolve a single OpenAlex author record.
     *
     * Lookup priority: `openalex_id`, then `orcid`, then a name search (optionally narrowed
     * by `affiliation`) that returns the most-cited match.
     *
     * @param array $params Keys read: `openalex_id`, `orcid`, `name`, `affiliation`.
     * @return array `['success' => true, 'data' => array]` or `['success' => false, 'error' => string]`.
     */
    public function resolveAuthor($params)
    {
        if (!empty($params['openalex_id'])) {
            $id = $this->extractOpenAlexId(trim($params['openalex_id']));
            $res = $this->openAlexGet('/authors/' . rawurlencode($id));
            if (!$res['success']) {
                return ['success' => false, 'error' => $res['error']];
            }
            return ['success' => true, 'data' => $res['data']];
        }

        if (!empty($params['orcid'])) {
            $orcid = $this->cleanOrcid($params['orcid']);
            // Encode the caller-supplied part so characters such as "?" or "#" cannot
            // alter the request path.
            $res = $this->openAlexGet('/authors/https://orcid.org/' . rawurlencode($orcid));
            if (!$res['success']) {
                return ['success' => false, 'error' => '未在 OpenAlex 找到该 ORCID 对应学者'];
            }
            return ['success' => true, 'data' => $res['data']];
        }

        $name = isset($params['name']) ? $this->sanitizeFilterValue($params['name']) : '';
        if (empty($params['name']) || $name === '') {
            return ['success' => false, 'error' => '请提供 openalex_id、orcid 或 name'];
        }

        $filter = 'display_name.search:' . $name;
        if (!empty($params['affiliation'])) {
            $affiliation = $this->sanitizeFilterValue($params['affiliation']);
            if ($affiliation !== '') {
                $filter .= ',last_known_institutions.display_name.search:' . $affiliation;
            }
        }

        $res = $this->openAlexGet('/authors', [
            'search'   => $params['name'],
            'filter'   => $filter,
            'sort'     => 'cited_by_count:desc',
            'per-page' => 1,
        ]);
        if (!$res['success']) {
            return ['success' => false, 'error' => $res['error']];
        }

        $results = $res['data']['results'] ?? [];
        if (empty($results)) {
            return ['success' => false, 'error' => '未找到匹配学者,请补充 affiliation 或使用 orcid'];
        }
        return ['success' => true, 'data' => $results[0]];
    }

    /**
     * Fetch up to 25 retracted works of an author from OpenAlex, newest first.
     *
     * @param string $openAlexId Author id used in the `authorships.author.id` filter.
     * @return array `['count' => int, 'list' => array, 'source' => 'openalex']`, or
     *               `['count' => 0, 'list' => [], 'error' => string]` on failure.
     *               `count` is the number of returned rows (at most one page of 25).
     */
    public function fetchRetractedWorksOpenAlex($openAlexId)
    {
        $res = $this->openAlexGet('/works', [
            'filter'   => 'authorships.author.id:' . $openAlexId . ',is_retracted:true',
            'sort'     => 'publication_date:desc',
            'per-page' => 25,
        ]);
        if (!$res['success']) {
            return ['count' => 0, 'list' => [], 'error' => $res['error']];
        }

        $list = [];
        foreach ($res['data']['results'] ?? [] as $work) {
            $list[] = $this->formatOpenAlexWork($work);
        }
        return ['count' => count($list), 'list' => $list, 'source' => 'openalex'];
    }

    /**
     * Fetch the most recent works of an author from OpenAlex.
     *
     * @param string $openAlexId Author id used in the `authorships.author.id` filter.
     * @param int    $limit      Number of works requested; clamped to 1..200 before being sent
     *                           as `per-page`.
     * @return array List of formatted works, each with an extra boolean `is_retracted`;
     *               an empty list when the request fails.
     */
    public function fetchRecentWorks($openAlexId, $limit = 5)
    {
        $perPage = min(max(intval($limit), 1), 200);
        $res = $this->openAlexGet('/works', [
            'filter'   => 'authorships.author.id:' . $openAlexId,
            'sort'     => 'publication_date:desc',
            'per-page' => $perPage,
        ]);
        if (!$res['success']) {
            return [];
        }

        $list = [];
        foreach ($res['data']['results'] ?? [] as $work) {
            $item = $this->formatOpenAlexWork($work);
            $item['is_retracted'] = !empty($work['is_retracted']);
            $list[] = $item;
        }
        return $list;
    }

    /**
     * Search OpenAlex authors by field / keyword.
     *
     * The keyword is first resolved to a topic id; when one is found it is added as a
     * `topics.id` filter. The keyword is always sent as the `search` parameter as well.
     *
     * @param string $keyword Field or keyword to search for.
     * @param array  $options Keys read: `min_h_index` (default 5), `limit` (default 10,
     *                        clamped to 1..30), `page` (default 1, minimum 1).
     * @return array `['success' => true, 'data' => [keyword, topic_id, page, limit, total, list]]`
     *               or `['success' => false, 'error' => string]`.
     */
    public function searchAuthorsByField($keyword, $options = [])
    {
        $minHIndex = intval($options['min_h_index'] ?? 5);
        $limit     = min(max(intval($options['limit'] ?? 10), 1), 30);
        $page      = max(intval($options['page'] ?? 1), 1);
        $topicId   = $this->resolveTopicId($keyword);

        $filters = [];
        if ($topicId !== '') {
            $filters[] = 'topics.id:' . $topicId;
        }
        if ($minHIndex > 0) {
            // NOTE(review): the filter is strictly greater-than, so authors whose h-index
            // equals min_h_index are excluded — confirm this is the intended meaning of "min".
            $filters[] = 'summary_stats.h_index:>' . $minHIndex;
        }

        $query = [
            'sort'     => 'cited_by_count:desc',
            'per-page' => $limit,
            'page'     => $page,
        ];
        if (!empty($filters)) {
            $query['filter'] = implode(',', $filters);
        }
        $query['search'] = $keyword;

        $res = $this->openAlexGet('/authors', $query);
        if (!$res['success']) {
            return ['success' => false, 'error' => $res['error']];
        }

        $authors = [];
        foreach ($res['data']['results'] ?? [] as $author) {
            $authors[] = $this->formatAuthorBrief($author);
        }

        return [
            'success' => true,
            'data'    => [
                'keyword'  => $keyword,
                'topic_id' => $topicId,
                'page'     => $page,
                'limit'    => $limit,
                'total'    => $res['data']['meta']['count'] ?? count($authors),
                'list'     => $authors,
            ],
        ];
    }

    /**
     * Resolve a keyword to the id of the OpenAlex topic with the most works.
     *
     * @param string $keyword Search text.
     * @return string Topic id without the openalex.org URL prefix, or '' when the request
     *                fails or nothing matches.
     */
    private function resolveTopicId($keyword)
    {
        $res = $this->openAlexGet('/topics', [
            'search'   => $keyword,
            'sort'     => 'works_count:desc',
            'per-page' => 1,
        ]);
        if (!$res['success']) {
            return '';
        }
        $results = $res['data']['results'] ?? [];
        if (empty($results)) {
            return '';
        }
        return $this->extractOpenAlexId($results[0]['id'] ?? '');
    }

    // ===================== CrossRef =====================

    /**
     * Normalise a DOI by removing surrounding whitespace, a leading doi.org / dx.doi.org
     * resolver URL (any letter case) and a leading "doi:" label.
     *
     * @param string $doi Raw DOI text.
     * @return string Bare DOI; '' when nothing remains.
     */
    public function cleanDoi($doi)
    {
        $doi = trim((string)$doi);
        $doi = preg_replace('/^https?:\/\/(dx\.)?doi\.org\//i', '', $doi);
        $doi = preg_replace('/^doi:\s*/i', '', $doi);
        return trim($doi);
    }

    /**
     * Fetch the CrossRef work record for a DOI.
     *
     * @param string $doi DOI in any form accepted by cleanDoi().
     * @return array `['success' => true, 'message' => array]` or
     *               `['success' => false, 'error' => string]`.
     */
    public function fetchCrossRefWork($doi)
    {
        $doi = $this->cleanDoi($doi);
        if ($doi === '') {
            return ['success' => false, 'error' => 'DOI为空'];
        }

        // rawurlencode: the DOI is a path segment, so a space must become %20, not "+".
        $url = $this->crossRefBase . '/works/' . rawurlencode($doi);
        $result = $this->httpGet($url, $this->defaultHeaders());
        if (!$result['success']) {
            return ['success' => false, 'error' => $result['error']];
        }
        if ($result['http_code'] == 404) {
            return ['success' => false, 'error' => 'DOI在CrossRef中未找到'];
        }
        if ($result['http_code'] != 200) {
            return ['success' => false, 'error' => 'CrossRef返回 HTTP ' . $result['http_code']];
        }

        $data = json_decode($result['body'], true);
        if (!is_array($data) || !isset($data['message'])) {
            return ['success' => false, 'error' => 'CrossRef返回数据格式异常'];
        }
        return ['success' => true, 'message' => $data['message']];
    }

    /**
     * Convert a CrossRef work message into a flat summary including retraction status.
     *
     * @param string $doi     DOI of the work (cleaned before use).
     * @param array  $message CrossRef `message` object of the work.
     * @return array Keys: doi, title, is_retracted, retraction_detail, journal, publisher,
     *               published_date, authors, url.
     */
    public function parseCrossRefRetractionDetail($doi, $message)
    {
        $cleanDoi   = $this->cleanDoi($doi);
        $retraction = $this->detectCrossRefRetraction($message);

        return [
            'doi'               => $cleanDoi,
            'title'             => isset($message['title'][0]) ? $message['title'][0] : '',
            'is_retracted'      => $retraction['is_retracted'],
            'retraction_detail' => $retraction['retraction_detail'],
            'journal'           => isset($message['container-title'][0]) ? $message['container-title'][0] : '',
            'publisher'         => $message['publisher'] ?? '',
            'published_date'    => $this->extractPublishedDate($message),
            'authors'           => $this->parseCrossRefAuthors($message['author'] ?? []),
            'url'               => $message['URL'] ?? ('https://doi.org/' . $cleanDoi),
        ];
    }

    /**
     * Attach CrossRef details to every retraction row that has a DOI.
     *
     * Each row gains a `crossref` key holding either `['success' => true, 'data' => array]`
     * or `['success' => false, 'error' => string]`. Requests are spaced 200 ms apart.
     *
     * @param array $retractionList Rows with an optional `doi` key.
     * @return array The rows, in the same order, with `crossref` added.
     */
    public function enrichRetractionsWithCrossRef($retractionList)
    {
        $enriched = [];
        $requested = false;

        foreach ($retractionList as $item) {
            $doi = $this->cleanDoi($item['doi'] ?? '');
            if ($doi === '') {
                $item['crossref'] = ['success' => false, 'error' => '无DOI'];
                $enriched[] = $item;
                continue;
            }

            // Throttle between consecutive requests only; no pointless pause after the last one.
            if ($requested) {
                usleep(200000);
            }
            $requested = true;

            $res = $this->fetchCrossRefWork($doi);
            if (!$res['success']) {
                $item['crossref'] = ['success' => false, 'error' => $res['error']];
            } else {
                $item['crossref'] = [
                    'success' => true,
                    'data'    => $this->parseCrossRefRetractionDetail($doi, $res['message']),
                ];
            }
            $enriched[] = $item;
        }
        return $enriched;
    }

    /**
     * Decide whether a CrossRef work message describes a retraction.
     *
     * Three signals are checked: `updated-by` / `update-to` entries whose type or label
     * contains "retract", the work's own `type` / `subtype`, and `relation` keys
     * containing "retract".
     *
     * @param array $message CrossRef `message` object.
     * @return array `['is_retracted' => bool, 'retraction_detail' => array]`; the detail always
     *               has `sources`, `retraction_notices`, `record_ids` and may additionally
     *               have `is_retraction_notice` and `relation`.
     */
    private function detectCrossRefRetraction($message)
    {
        $isRetracted = false;
        $retractionDetail = [
            'sources'            => [],
            'retraction_notices' => [],
            'record_ids'         => [],
        ];

        foreach (['updated-by', 'update-to'] as $field) {
            if (!isset($message[$field]) || !is_array($message[$field])) {
                continue;
            }
            foreach ($message[$field] as $update) {
                if (!is_array($update)) {
                    continue;
                }
                $updateType  = strtolower($update['type'] ?? '');
                $updateLabel = strtolower($update['label'] ?? '');
                if (strpos($updateType, 'retract') === false && strpos($updateLabel, 'retract') === false) {
                    continue;
                }

                $isRetracted = true;
                $source = $update['source'] ?? 'publisher';
                $retractionDetail['sources'][] = $source;

                $notice = [
                    'type'       => $update['type'] ?? '',
                    'label'      => $update['label'] ?? '',
                    'source'     => $source,
                    'notice_doi' => $update['DOI'] ?? '',
                    'date'       => isset($update['updated']) ? $this->parseDateParts($update['updated']) : '',
                    'record_id'  => $update['record-id'] ?? '',
                ];
                $retractionDetail['retraction_notices'][] = $notice;
                if (!empty($notice['record_id'])) {
                    $retractionDetail['record_ids'][] = $notice['record_id'];
                }
            }
        }

        $type    = strtolower($message['type'] ?? '');
        $subtype = strtolower($message['subtype'] ?? '');
        if (strpos($type, 'retract') !== false || strpos($subtype, 'retract') !== false) {
            $isRetracted = true;
            $retractionDetail['is_retraction_notice'] = true;
        }

        if (isset($message['relation']) && is_array($message['relation'])) {
            foreach ($message['relation'] as $relType => $relations) {
                if (strpos(strtolower($relType), 'retract') !== false) {
                    $isRetracted = true;
                    // Only the first matching relation type is recorded.
                    $retractionDetail['relation'] = [$relType => $relations];
                    break;
                }
            }
        }

        $retractionDetail['sources']    = array_values(array_unique($retractionDetail['sources']));
        $retractionDetail['record_ids'] = array_values(array_unique($retractionDetail['record_ids']));

        return ['is_retracted' => $isRetracted, 'retraction_detail' => $retractionDetail];
    }

    // ===================== Retraction Watch (via CrossRef) =====================

    /**
     * Search CrossRef for retraction records by author name (up to 25 rows).
     *
     * Every returned row is labelled `source => 'retraction_watch'`; `from_retraction_watch`
     * is true when an update source mentions Retraction Watch or a record id is present.
     *
     * @param string $authorName Author name passed as `query.author`.
     * @return array `['count' => int, 'list' => array, 'source' => 'retraction_watch']`, or
     *               `['count' => 0, 'list' => [], 'error' => string]` on failure.
     */
    public function fetchRetractionWatchByAuthor($authorName)
    {
        $url = $this->crossRefBase . '/works?' . http_build_query([
            'query.author' => $authorName,
            'filter'       => 'update-type:retraction',
            'rows'         => 25,
            'mailto'       => $this->mailto,
        ], '', '&');

        $result = $this->httpGet($url, $this->defaultHeaders());
        if (!$result['success']) {
            return ['count' => 0, 'list' => [], 'error' => $result['error']];
        }
        if ($result['http_code'] != 200) {
            return ['count' => 0, 'list' => [], 'error' => 'CrossRef返回 HTTP ' . $result['http_code']];
        }

        $data = json_decode($result['body'], true);
        if (!is_array($data)) {
            return ['count' => 0, 'list' => [], 'error' => 'CrossRef返回数据格式异常'];
        }
        $items = $data['message']['items'] ?? [];
        if (!is_array($items)) {
            $items = [];
        }

        $list = [];
        foreach ($items as $message) {
            if (!is_array($message)) {
                continue;
            }
            $parsed = $this->parseCrossRefRetractionDetail($message['DOI'] ?? '', $message);
            if (!$parsed['is_retracted']) {
                continue;
            }

            $rwSources = array_filter($parsed['retraction_detail']['sources'] ?? [], function ($s) {
                return is_string($s)
                    && (stripos($s, 'retraction-watch') !== false || stripos($s, 'retraction_watch') !== false);
            });

            $list[] = [
                'title'                 => $parsed['title'],
                'doi'                   => $parsed['doi'],
                'journal'               => $parsed['journal'],
                'publisher'             => $parsed['publisher'],
                'published_date'        => $parsed['published_date'],
                'is_retracted'          => true,
                'retraction_detail'     => $parsed['retraction_detail'],
                'from_retraction_watch' => !empty($rwSources) || !empty($parsed['retraction_detail']['record_ids']),
                'source'                => 'retraction_watch',
            ];
        }

        return [
            'count'  => count($list),
            'list'   => $list,
            'source' => 'retraction_watch',
        ];
    }

    /**
     * Merge OpenAlex and Retraction Watch retraction lists, de-duplicated by DOI.
     *
     * Rows are keyed by the lower-cased cleaned DOI; rows without a DOI are keyed by a hash
     * of their content. The first occurrence supplies title / journal / date; later
     * occurrences only add their `source`, the `from_retraction_watch` flag and a missing
     * `retraction_detail`.
     *
     * @param array $openAlexRetractions Result shaped like fetchRetractedWorksOpenAlex().
     * @param array $rwRetractions       Result shaped like fetchRetractionWatchByAuthor().
     * @param bool  $withCrossRefDetail  When true, each merged row is enriched through
     *                                   enrichRetractionsWithCrossRef() (network calls).
     * @return array Keys: count, openalex_count, rw_count, rw_only_count, list.
     */
    public function mergeRetractionRecords($openAlexRetractions, $rwRetractions, $withCrossRefDetail = false)
    {
        $doiMap = [];
        foreach ([$openAlexRetractions, $rwRetractions] as $sourceData) {
            foreach ($sourceData['list'] ?? [] as $item) {
                $doi = $this->cleanDoi($item['doi'] ?? '');
                $key = $doi !== '' ? strtolower($doi) : md5(json_encode($item));

                if (!isset($doiMap[$key])) {
                    $doiMap[$key] = [
                        'title'                 => $item['title'] ?? '',
                        'doi'                   => $doi,
                        'journal'               => $item['journal'] ?? '',
                        'publication_date'      => $item['publication_date'] ?? ($item['published_date'] ?? ''),
                        'sources'               => [],
                        'retraction_detail'     => $item['retraction_detail'] ?? [],
                        'from_retraction_watch' => !empty($item['from_retraction_watch']),
                    ];
                }

                $src = $item['source'] ?? 'unknown';
                if (!in_array($src, $doiMap[$key]['sources'], true)) {
                    $doiMap[$key]['sources'][] = $src;
                }
                if (!empty($item['from_retraction_watch'])) {
                    $doiMap[$key]['from_retraction_watch'] = true;
                }
                if (!empty($item['retraction_detail']) && empty($doiMap[$key]['retraction_detail'])) {
                    $doiMap[$key]['retraction_detail'] = $item['retraction_detail'];
                }
            }
        }

        $merged = array_values($doiMap);
        if ($withCrossRefDetail) {
            $merged = $this->enrichRetractionsWithCrossRef($merged);
        }

        // Rows flagged as Retraction Watch data that no second source reported.
        $rwOnlyCount = 0;
        foreach ($merged as $row) {
            if (!empty($row['from_retraction_watch']) && count($row['sources'] ?? []) <= 1) {
                $rwOnlyCount++;
            }
        }

        return [
            'count'          => count($merged),
            'openalex_count' => intval($openAlexRetractions['count'] ?? 0),
            'rw_count'       => intval($rwRetractions['count'] ?? 0),
            'rw_only_count'  => $rwOnlyCount,
            'list'           => $merged,
        ];
    }

    // ===================== Formatting =====================

    /**
     * Reduce an OpenAlex author record to a brief summary.
     *
     * @param array $author OpenAlex author object.
     * @return array Keys: openalex_id, name, orcid, works_count, cited_by_count, h_index,
     *               institutions (list of name/country), openalex_url.
     */
    public function formatAuthorBrief($author)
    {
        $institutions = [];
        foreach ($author['last_known_institutions'] ?? [] as $inst) {
            $institutions[] = [
                'name'    => $inst['display_name'] ?? '',
                'country' => $inst['country_code'] ?? '',
            ];
        }

        return [
            'openalex_id'    => $this->extractOpenAlexId($author['id'] ?? ''),
            'name'           => $author['display_name'] ?? '',
            'orcid'          => $this->extractOrcid($author['orcid'] ?? ''),
            'works_count'    => intval($author['works_count'] ?? 0),
            'cited_by_count' => intval($author['cited_by_count'] ?? 0),
            'h_index'        => intval($author['summary_stats']['h_index'] ?? 0),
            'institutions'   => $institutions,
            'openalex_url'   => $author['id'] ?? '',
        ];
    }

    /**
     * Extract bibliometric indicators from an OpenAlex author record.
     *
     * @param array $author OpenAlex author object.
     * @return array Keys: works_count, cited_by_count, h_index, i10_index,
     *               two_year_mean_cited (rounded to 2 decimals), level_label.
     */
    public function parseAuthorMetrics($author)
    {
        $stats = $author['summary_stats'] ?? [];
        return [
            'works_count'         => intval($author['works_count'] ?? 0),
            'cited_by_count'      => intval($author['cited_by_count'] ?? 0),
            'h_index'             => intval($stats['h_index'] ?? 0),
            'i10_index'           => intval($stats['i10_index'] ?? 0),
            'two_year_mean_cited' => round(floatval($stats['2yr_mean_citedness'] ?? 0), 2),
            'level_label'         => $this->getAcademicLevelLabel($stats),
        ];
    }

    /**
     * List up to eight research topics of an author.
     *
     * `x_concepts` is used when it yields at least one named entry; otherwise `topics` is used.
     *
     * @param array $author OpenAlex author object.
     * @return array List of `['name' => string, 'score' => float]` (score rounded to 3 decimals).
     */
    public function parseResearchTopics($author)
    {
        $topics = [];
        foreach ($author['x_concepts'] ?? [] as $concept) {
            if (empty($concept['display_name'])) {
                continue;
            }
            $topics[] = [
                'name'  => $concept['display_name'],
                'score' => round(floatval($concept['score'] ?? 0), 3),
            ];
        }

        if (empty($topics)) {
            foreach ($author['topics'] ?? [] as $topic) {
                if (empty($topic['display_name'])) {
                    continue;
                }
                $topics[] = [
                    'name'  => $topic['display_name'],
                    'score' => round(floatval($topic['score'] ?? 0), 3),
                ];
            }
        }
        return array_slice($topics, 0, 8);
    }

    /**
     * Derive a risk level from the retraction summary and the author's output volume.
     *
     * Base level: 0 retractions = low (10), 1 = medium (50), 2+ = high (80 plus 5 per
     * retraction, bonus capped at 20). A Retraction-Watch-only finding lifts "low" to
     * "medium"; a retraction rate of 5% or more lifts "medium" to "high".
     *
     * @param array $metrics     Keys read: `works_count`.
     * @param array $retractions Keys read: `count`, `rw_only_count`.
     * @return array Keys: level, level_label, score (max 100), retraction_count,
     *               retraction_rate (string with "%"), rw_only_count, reasons.
     */
    public function assessRisk($metrics, $retractions)
    {
        $retractionCount = intval($retractions['count'] ?? 0);
        $rwOnlyCount     = intval($retractions['rw_only_count'] ?? 0);
        $reasons = [];

        if ($retractionCount === 0) {
            $level = 'low';
            $score = 10;
            $reasons[] = 'OpenAlex 与 Retraction Watch 均未发现撤稿记录';
        } elseif ($retractionCount === 1) {
            $level = 'medium';
            $score = 50;
            $reasons[] = '发现 1 篇撤稿论文,建议人工核实撤稿原因';
        } else {
            $level = 'high';
            $score = 80 + min($retractionCount * 5, 20);
            $reasons[] = '发现 ' . $retractionCount . ' 篇撤稿论文,存在较高学术风险';
        }

        if ($rwOnlyCount > 0) {
            $reasons[] = 'Retraction Watch 额外发现 ' . $rwOnlyCount . ' 条 OpenAlex 未收录的撤稿记录';
            if ($level === 'low') {
                $level = 'medium';
                $score = max($score, 45);
            }
        }

        // A works count below 1 is treated as 1 so the rate never divides by zero.
        $worksCount     = max(intval($metrics['works_count'] ?? 0), 1);
        $retractionRate = round($retractionCount / $worksCount * 100, 2);
        if ($retractionCount > 0 && $retractionRate >= 5) {
            $reasons[] = '撤稿率 ' . $retractionRate . '%,比例偏高';
            if ($level === 'medium') {
                $level = 'high';
                $score = max($score, 70);
            }
        }

        return [
            'level'            => $level,
            'level_label'      => $this->getRiskLevelLabel($level),
            'score'            => min($score, 100),
            'retraction_count' => $retractionCount,
            'retraction_rate'  => $retractionRate . '%',
            'rw_only_count'    => $rwOnlyCount,
            'reasons'          => $reasons,
        ];
    }

    // ===================== Internal helpers =====================

    /**
     * Reduce an OpenAlex work record to the fields used by this service.
     *
     * @param array $work OpenAlex work object.
     * @return array Keys: title, doi, publication_date, journal, cited_by_count,
     *               openalex_url, source ('openalex').
     */
    private function formatOpenAlexWork($work)
    {
        return [
            'title'            => $work['display_name'] ?? '',
            'doi'              => $this->extractDoi($work),
            'publication_date' => $work['publication_date'] ?? '',
            'journal'          => $work['primary_location']['source']['display_name'] ?? '',
            'cited_by_count'   => intval($work['cited_by_count'] ?? 0),
            'openalex_url'     => $work['id'] ?? '',
            'source'           => 'openalex',
        ];
    }

    /**
     * Normalise the CrossRef `author` list.
     *
     * @param mixed $authorList CrossRef author array; anything empty or non-array yields [].
     * @return array List of `['given', 'family', 'name', 'orcid']`; `name` falls back to
     *               "given family" when CrossRef provides no `name`.
     */
    private function parseCrossRefAuthors($authorList)
    {
        if (empty($authorList) || !is_array($authorList)) {
            return [];
        }

        $result = [];
        foreach ($authorList as $a) {
            $result[] = [
                'given'  => $a['given'] ?? '',
                'family' => $a['family'] ?? '',
                'name'   => isset($a['name']) ? $a['name'] : trim(($a['given'] ?? '') . ' ' . ($a['family'] ?? '')),
                'orcid'  => $a['ORCID'] ?? '',
            ];
        }
        return $result;
    }

    /**
     * Pick the publication date of a CrossRef work.
     *
     * Tries `published-print`, `published-online`, `published` and `issued` in that order
     * and returns the first one that parses to a non-empty date, so works without a print
     * date still get one.
     *
     * @param array $message CrossRef `message` object.
     * @return string Date formatted by parseDateParts(), or ''.
     */
    private function extractPublishedDate($message)
    {
        foreach (['published-print', 'published-online', 'published', 'issued'] as $field) {
            if (!isset($message[$field])) {
                continue;
            }
            $date = $this->parseDateParts($message[$field]);
            if ($date !== '') {
                return $date;
            }
        }
        return '';
    }

    /**
     * Format a CrossRef date object (`['date-parts' => [[Y, M, D]]]`).
     *
     * @param mixed $dateObj CrossRef date object.
     * @return string "Y-MM-DD", "Y-MM" or "Y" depending on the parts present; '' when the
     *                first date-parts entry is missing.
     */
    private function parseDateParts($dateObj)
    {
        if (!isset($dateObj['date-parts'][0])) {
            return '';
        }
        $parts = $dateObj['date-parts'][0];
        $y = isset($parts[0]) ? $parts[0] : '';
        $m = isset($parts[1]) ? sprintf('%02d', $parts[1]) : '';
        $d = isset($parts[2]) ? sprintf('%02d', $parts[2]) : '';

        if ($y && $m && $d) {
            return "{$y}-{$m}-{$d}";
        }
        if ($y && $m) {
            return "{$y}-{$m}";
        }
        return (string)$y;
    }

    /**
     * Map an h-index to a seniority label.
     *
     * @param array $stats Keys read: `h_index`.
     * @return string Label for the thresholds >=50, >=30, >=15, >=5, >0, otherwise the
     *                "not enough public data" label.
     */
    private function getAcademicLevelLabel($stats)
    {
        $h = intval($stats['h_index'] ?? 0);
        if ($h >= 50) {
            return '国际顶尖学者';
        }
        if ($h >= 30) {
            return '资深专家';
        }
        if ($h >= 15) {
            return '活跃研究者';
        }
        if ($h >= 5) {
            return '青年学者';
        }
        if ($h > 0) {
            return '初入领域';
        }
        return '暂无足够公开数据';
    }

    /**
     * Translate a risk level key ('low' / 'medium' / 'high') into its display label.
     *
     * @param string $level Risk level key.
     * @return string Display label; the "unknown" label for any other key.
     */
    private function getRiskLevelLabel($level)
    {
        $map = ['low' => '低风险', 'medium' => '中风险', 'high' => '高风险'];
        return $map[$level] ?? '未知';
    }

    /**
     * Strip a leading openalex.org URL prefix from an id.
     *
     * @param string $id OpenAlex id, with or without the URL prefix.
     * @return string Bare id.
     */
    public function extractOpenAlexId($id)
    {
        return preg_replace('/^https?:\/\/openalex\.org\//', '', $id);
    }

    /**
     * Strip a leading orcid.org URL prefix from an ORCID.
     *
     * @param string $orcid ORCID, with or without the URL prefix.
     * @return string Bare ORCID; '' for an empty string.
     */
    public function extractOrcid($orcid)
    {
        if ($orcid === '') {
            return '';
        }
        return preg_replace('/^https?:\/\/orcid\.org\//', '', $orcid);
    }

    /**
     * Trim an ORCID and strip a leading orcid.org URL prefix.
     *
     * @param string $orcid Raw ORCID text.
     * @return string Bare, trimmed ORCID.
     */
    public function cleanOrcid($orcid)
    {
        $orcid = trim($orcid);
        $orcid = preg_replace('/^https?:\/\/orcid\.org\//', '', $orcid);
        return trim($orcid);
    }

    /**
     * Read the DOI of an OpenAlex work without the doi.org URL prefix.
     *
     * @param array $work OpenAlex work object.
     * @return string Bare DOI, or '' when the work has none.
     */
    private function extractDoi($work)
    {
        $doi = $work['doi'] ?? '';
        return preg_replace('/^https?:\/\/doi\.org\//', '', $doi);
    }

    /**
     * Make a value safe to embed in an OpenAlex `filter` expression.
     *
     * "," separates filters and "|" separates alternatives in the filter syntax, so both
     * are replaced by spaces; runs of whitespace are then collapsed.
     *
     * @param string $value Raw user-supplied text.
     * @return string Sanitised text, possibly ''.
     */
    private function sanitizeFilterValue($value)
    {
        $value = str_replace([',', '|'], ' ', (string)$value);
        return trim(preg_replace('/\s+/', ' ', $value));
    }

    /**
     * Request headers shared by all outgoing API calls.
     *
     * @return array List of raw header lines.
     */
    private function defaultHeaders()
    {
        return [
            'Accept: application/json',
            'User-Agent: TMRJournals-BackgroundCheck/1.0 (mailto:' . $this->mailto . ')',
        ];
    }

    /**
     * Perform an HTTP GET request with cURL.
     *
     * TLS certificates are verified. Redirects (HTTP/HTTPS only, at most 5) are followed,
     * so the reported status code is that of the final response.
     *
     * @param string $url     Absolute URL.
     * @param array  $headers Raw header lines.
     * @return array `['success' => true, 'body' => string, 'http_code' => int]` when a response
     *               was received (any status), or `['success' => false, 'error' => string]`
     *               on a transport error.
     */
    private function httpGet($url, $headers = [])
    {
        $ch = curl_init();
        if ($ch === false) {
            return ['success' => false, 'error' => 'HTTP请求失败: 无法初始化cURL'];
        }

        curl_setopt_array($ch, [
            CURLOPT_URL             => $url,
            CURLOPT_RETURNTRANSFER  => true,
            // Verify the peer certificate and host name; disabling this would allow
            // responses to be forged by a man-in-the-middle.
            CURLOPT_SSL_VERIFYPEER  => true,
            CURLOPT_SSL_VERIFYHOST  => 2,
            CURLOPT_FOLLOWLOCATION  => true,
            CURLOPT_MAXREDIRS       => 5,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_CONNECTTIMEOUT  => 10,
            CURLOPT_TIMEOUT         => 30,
            CURLOPT_HTTPHEADER      => $headers,
        ]);

        $body = curl_exec($ch);
        if ($body === false || curl_errno($ch)) {
            $error = curl_error($ch);
            curl_close($ch);
            return ['success' => false, 'error' => 'HTTP请求失败: ' . $error];
        }

        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);
        return ['success' => true, 'body' => $body, 'http_code' => $httpCode];
    }
}