tougao/application/common/BackgroundCheckService.php

<?php

namespace app\common;

/**
 * 背景调查公共服务
 * 封装 OpenAlex / CrossRef / Retraction Watch 数据查询
 */
class BackgroundCheckService
{
    private $openAlexBase = 'https://api.openalex.org';
    private $crossRefBase = 'https://api.crossref.org';
    private $mailto = 'publisher@tmrjournals.com';

    // ===================== OpenAlex =====================

    /**
     * Perform a GET request against the OpenAlex API and decode the JSON body.
     *
     * The configured contact address is always sent as the `mailto` query
     * parameter and inside the User-Agent header.
     *
     * @param string $path  API path beginning with "/", e.g. "/authors".
     * @param array  $query Query-string parameters; any `mailto` entry is overwritten.
     * @return array ['success' => true, 'data' => array] on success, otherwise
     *               ['success' => false, 'error' => string]; a non-2xx answer
     *               additionally carries 'http_code'.
     */
    public function openAlexGet($path, $query = [])
    {
        $query['mailto'] = $this->mailto;
        $url = $this->openAlexBase . $path . '?' . http_build_query($query);

        $result = $this->httpGet($url, [
            'Accept: application/json',
            'User-Agent: TMRJournals-BackgroundCheck/1.0 (mailto:' . $this->mailto . ')',
        ]);

        if (!$result['success']) {
            return $result;
        }

        $httpCode = intval($result['http_code'] ?? 200);
        $data = json_decode((string) $result['body'], true);

        // Error responses (404 unknown entity, 429 rate limit, ...) may still be
        // valid JSON, so the status code has to be checked before trusting the body.
        if ($httpCode < 200 || $httpCode >= 300) {
            $detail = '';
            if (is_array($data)) {
                if (isset($data['message']) && is_string($data['message'])) {
                    $detail = $data['message'];
                } elseif (isset($data['error']) && is_string($data['error'])) {
                    $detail = $data['error'];
                }
            }

            return [
                'success'   => false,
                'error'     => 'OpenAlex返回 HTTP ' . $httpCode . ($detail !== '' ? ': ' . $detail : ''),
                'http_code' => $httpCode,
            ];
        }

        if (!is_array($data)) {
            return ['success' => false, 'error' => 'OpenAlex返回数据格式异常'];
        }

        return ['success' => true, 'data' => $data];
    }

    /**
     * Resolve a single OpenAlex author record.
     *
     * Lookup priority: `openalex_id`, then `orcid`, then a name search
     * (optionally narrowed by `affiliation`) returning the most-cited match.
     *
     * @param array $params Any of: openalex_id, orcid, name, affiliation.
     * @return array ['success' => true, 'data' => array] with the raw author
     *               record, or ['success' => false, 'error' => string].
     */
    public function resolveAuthor($params)
    {
        if (!empty($params['openalex_id'])) {
            $id = preg_replace('/^https?:\/\/openalex\.org\//', '', trim($params['openalex_id']));
            $res = $this->openAlexGet('/authors/' . rawurlencode($id));
            if (!$res['success']) {
                return ['success' => false, 'error' => $res['error']];
            }
            // An error payload decodes to an array too; a real author always has an id.
            if (empty($res['data']['id'])) {
                return ['success' => false, 'error' => '未在 OpenAlex 找到该 openalex_id 对应学者'];
            }
            return ['success' => true, 'data' => $res['data']];
        }

        if (!empty($params['orcid'])) {
            $orcid = strtoupper($this->cleanOrcid($params['orcid']));
            // The value is placed into the request path, so only a well-formed
            // ORCID iD (16 characters in 4 groups, last one may be X) is accepted.
            if (!preg_match('/^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$/', $orcid)) {
                return ['success' => false, 'error' => 'ORCID 格式不正确'];
            }
            $res = $this->openAlexGet('/authors/https://orcid.org/' . $orcid);
            if (!$res['success'] || empty($res['data']['id'])) {
                return ['success' => false, 'error' => '未在 OpenAlex 找到该 ORCID 对应学者'];
            }
            return ['success' => true, 'data' => $res['data']];
        }

        $nameForFilter = $this->sanitizeOpenAlexFilterValue($params['name'] ?? '');
        if (empty($params['name']) || $nameForFilter === '') {
            return ['success' => false, 'error' => '请提供 openalex_id、orcid 或 name'];
        }

        $filter = 'display_name.search:' . $nameForFilter;
        $affiliation = $this->sanitizeOpenAlexFilterValue($params['affiliation'] ?? '');
        if ($affiliation !== '') {
            $filter .= ',last_known_institutions.display_name.search:' . $affiliation;
        }

        $res = $this->openAlexGet('/authors', [
            'search'   => $params['name'],
            'filter'   => $filter,
            'sort'     => 'cited_by_count:desc',
            'per-page' => 1,
        ]);

        if (!$res['success']) {
            return ['success' => false, 'error' => $res['error']];
        }

        $results = $res['data']['results'] ?? [];
        if (empty($results)) {
            return ['success' => false, 'error' => '未找到匹配学者，请补充 affiliation 或使用 orcid'];
        }

        return ['success' => true, 'data' => $results[0]];
    }

    /**
     * Make free text safe to embed as one OpenAlex filter value.
     *
     * In the filter syntax "," separates filters and "|" separates alternatives,
     * so a name such as "Zhang, Wei" would otherwise be split into two filters.
     *
     * @param mixed $value Raw user input.
     * @return string Trimmed text with filter separators replaced by spaces.
     */
    private function sanitizeOpenAlexFilterValue($value)
    {
        if (!is_scalar($value)) {
            return '';
        }
        $flat = str_replace([',', '|'], ' ', (string) $value);
        return trim(preg_replace('/\s+/', ' ', $flat));
    }

    /**
     * List works by the given author that OpenAlex marks as retracted.
     *
     * Requests a single page of 25 works, newest first.
     *
     * @param string $openAlexId Author identifier used in the works filter.
     * @return array ['count' => int, 'list' => array, 'source' => 'openalex'],
     *               or ['count' => 0, 'list' => [], 'error' => string] on failure.
     */
    public function fetchRetractedWorksOpenAlex($openAlexId)
    {
        $authorFilter = 'authorships.author.id:' . $openAlexId . ',is_retracted:true';
        $response = $this->openAlexGet('/works', [
            'filter'   => $authorFilter,
            'sort'     => 'publication_date:desc',
            'per-page' => 25,
        ]);

        if ($response['success']) {
            $works = [];
            foreach ($response['data']['results'] ?? [] as $rawWork) {
                $works[] = $this->formatOpenAlexWork($rawWork);
            }

            return ['count' => count($works), 'list' => $works, 'source' => 'openalex'];
        }

        return ['count' => 0, 'list' => [], 'error' => $response['error']];
    }

    /**
     * Fetch the author's most recently published works.
     *
     * @param string $openAlexId Author identifier used in the works filter.
     * @param int    $limit      Page size passed to OpenAlex as `per-page`.
     * @return array List of formatted works, each with an added boolean
     *               'is_retracted'; an empty list when the request fails.
     */
    public function fetchRecentWorks($openAlexId, $limit = 5)
    {
        $response = $this->openAlexGet('/works', [
            'filter'   => 'authorships.author.id:' . $openAlexId,
            'sort'     => 'publication_date:desc',
            'per-page' => $limit,
        ]);

        $recent = [];
        if (!$response['success']) {
            return $recent;
        }

        foreach ($response['data']['results'] ?? [] as $rawWork) {
            $formatted = $this->formatOpenAlexWork($rawWork);
            $formatted['is_retracted'] = !empty($rawWork['is_retracted']);
            $recent[] = $formatted;
        }

        return $recent;
    }

    /**
     * Search authors by research field / keyword (OpenAlex).
     *
     * The keyword is first resolved to an OpenAlex topic. When a topic is found,
     * authors are selected by that topic; only when no topic matches does the
     * keyword fall back to the `/authors` text search.
     *
     * @param string $keyword Field or keyword to look up.
     * @param array  $options Optional keys:
     *                        - min_h_index (int, default 5): minimum h-index, inclusive; 0 disables
     *                        - limit (int, default 10, clamped to 1..30)
     *                        - page (int, default 1, minimum 1)
     * @return array ['success' => true, 'data' => [keyword, topic_id, page, limit, total, list]]
     *               or ['success' => false, 'error' => string].
     */
    public function searchAuthorsByField($keyword, $options = [])
    {
        $minHIndex = intval($options['min_h_index'] ?? 5);
        $limit     = min(max(intval($options['limit'] ?? 10), 1), 30);
        $page      = max(intval($options['page'] ?? 1), 1);

        $topicId = $this->resolveTopicId($keyword);
        $filters = [];

        if ($topicId !== '') {
            $filters[] = 'topics.id:' . $topicId;
        }
        if ($minHIndex > 0) {
            // The ">" comparison is strict, so "at least N" is written as "> N-1".
            $filters[] = 'summary_stats.h_index:>' . ($minHIndex - 1);
        }

        $query = [
            'sort'     => 'cited_by_count:desc',
            'per-page' => $limit,
            'page'     => $page,
        ];

        if (!empty($filters)) {
            $query['filter'] = implode(',', $filters);
        }
        // `search` on /authors matches author names. Combining it with a topic
        // filter would keep only authors whose *name* contains the field keyword,
        // so it is used purely as a fallback when no topic could be resolved.
        if ($topicId === '') {
            $query['search'] = $keyword;
        }

        $res = $this->openAlexGet('/authors', $query);
        if (!$res['success']) {
            return ['success' => false, 'error' => $res['error']];
        }

        $authors = [];
        foreach ($res['data']['results'] ?? [] as $author) {
            $authors[] = $this->formatAuthorBrief($author);
        }

        return [
            'success' => true,
            'data'    => [
                'keyword'  => $keyword,
                'topic_id' => $topicId,
                'page'     => $page,
                'limit'    => $limit,
                'total'    => $res['data']['meta']['count'] ?? count($authors),
                'list'     => $authors,
            ],
        ];
    }

    /**
     * Look up the OpenAlex topic with the most works matching a keyword.
     *
     * @param string $keyword Search text sent to the /topics endpoint.
     * @return string Topic id without the openalex.org URL prefix, or '' when
     *                the request fails or nothing matches.
     */
    private function resolveTopicId($keyword)
    {
        $lookup = $this->openAlexGet('/topics', [
            'search'   => $keyword,
            'sort'     => 'works_count:desc',
            'per-page' => 1,
        ]);

        $matches = $lookup['success'] ? ($lookup['data']['results'] ?? []) : [];
        if (empty($matches)) {
            return '';
        }

        $topicUrl = $matches[0]['id'] ?? '';

        return $this->extractOpenAlexId($topicUrl);
    }

    // ===================== CrossRef =====================

    /**
     * Normalise a DOI to its bare "10.xxxx/..." form.
     *
     * Strips surrounding whitespace, a leading "doi:" label and a leading
     * doi.org / dx.doi.org resolver URL. Prefix matching is case-insensitive;
     * the DOI itself keeps its original case.
     *
     * @param mixed $doi Raw DOI text; non-scalar or null input yields ''.
     * @return string The bare DOI, or '' when nothing remains.
     */
    public function cleanDoi($doi)
    {
        if (!is_scalar($doi)) {
            return '';
        }

        $doi = trim((string) $doi);
        // Label first, then resolver URL, so "doi: https://doi.org/10.x" is fully unwrapped.
        $doi = preg_replace('/^doi:\s*/i', '', $doi);
        $doi = preg_replace('/^https?:\/\/(dx\.)?doi\.org\//i', '', $doi);

        return trim($doi);
    }

    /**
     * Fetch the CrossRef metadata record for a DOI.
     *
     * @param string $doi DOI in any form accepted by cleanDoi().
     * @return array ['success' => true, 'message' => array] with the CrossRef
     *               `message` object, or ['success' => false, 'error' => string].
     */
    public function fetchCrossRefWork($doi)
    {
        $doi = $this->cleanDoi($doi);
        if ($doi === '') {
            return ['success' => false, 'error' => 'DOI为空'];
        }

        // The DOI is a path segment: rawurlencode() encodes a space as %20,
        // whereas urlencode() would emit "+", which is a literal plus in a path.
        $url = $this->crossRefBase . '/works/' . rawurlencode($doi);
        $result = $this->httpGet($url, [
            'Accept: application/json',
            'User-Agent: TMRJournals-BackgroundCheck/1.0 (mailto:' . $this->mailto . ')',
        ]);

        if (!$result['success']) {
            return ['success' => false, 'error' => $result['error']];
        }

        if ($result['http_code'] == 404) {
            return ['success' => false, 'error' => 'DOI在CrossRef中未找到'];
        }
        if ($result['http_code'] != 200) {
            return ['success' => false, 'error' => 'CrossRef返回 HTTP ' . $result['http_code']];
        }

        $data = json_decode($result['body'], true);
        // Callers index into the message as an array, so reject anything else.
        if (!is_array($data) || !isset($data['message']) || !is_array($data['message'])) {
            return ['success' => false, 'error' => 'CrossRef返回数据格式异常'];
        }

        return ['success' => true, 'message' => $data['message']];
    }

    /**
     * Flatten a CrossRef work record into the fields used for retraction reports.
     *
     * @param string $doi     DOI of the work (any form accepted by cleanDoi()).
     * @param array  $message CrossRef `message` object for that work.
     * @return array Keys: doi, title, is_retracted, retraction_detail, journal,
     *               publisher, published_date, authors, url.
     */
    public function parseCrossRefRetractionDetail($doi, $message)
    {
        $retraction = $this->detectCrossRefRetraction($message);
        $bareDoi    = $this->cleanDoi($doi);

        // Online-only works carry no "published-print"; fall back through the
        // other date fields so those records still get a publication date.
        $publishedDate = '';
        foreach (['published-print', 'published-online', 'published', 'issued'] as $dateField) {
            if (!isset($message[$dateField])) {
                continue;
            }
            $publishedDate = $this->parseDateParts($message[$dateField]);
            if ($publishedDate !== '') {
                break;
            }
        }

        return [
            'doi'               => $bareDoi,
            'title'             => isset($message['title'][0]) ? $message['title'][0] : '',
            'is_retracted'      => $retraction['is_retracted'],
            'retraction_detail' => $retraction['retraction_detail'],
            'journal'           => isset($message['container-title'][0]) ? $message['container-title'][0] : '',
            'publisher'         => $message['publisher'] ?? '',
            'published_date'    => $publishedDate,
            'authors'           => $this->parseCrossRefAuthors($message['author'] ?? []),
            'url'               => $message['URL'] ?? ('https://doi.org/' . $bareDoi),
        ];
    }

    /**
     * Attach CrossRef details to each retraction record under a 'crossref' key.
     *
     * Records without a DOI are marked as failed without a lookup. After every
     * lookup the loop sleeps for 200 ms before moving on.
     *
     * @param array $retractionList Records that may carry a 'doi' key.
     * @return array The same records, each with 'crossref' =>
     *               ['success' => bool, 'data' | 'error' => ...].
     */
    public function enrichRetractionsWithCrossRef($retractionList)
    {
        $output = [];

        foreach ($retractionList as $record) {
            $bareDoi = $this->cleanDoi($record['doi'] ?? '');

            if ($bareDoi !== '') {
                $lookup = $this->fetchCrossRefWork($bareDoi);
                $record['crossref'] = $lookup['success']
                    ? [
                        'success' => true,
                        'data'    => $this->parseCrossRefRetractionDetail($bareDoi, $lookup['message']),
                    ]
                    : ['success' => false, 'error' => $lookup['error']];

                $output[] = $record;
                usleep(200000);
                continue;
            }

            $record['crossref'] = ['success' => false, 'error' => '无DOI'];
            $output[] = $record;
        }

        return $output;
    }

    /**
     * Inspect a CrossRef work record for signs of retraction.
     *
     * Three places are checked: entries in `updated-by` / `update-to` whose type
     * or label contains "retract", the work's own `type` / `subtype`, and the
     * first `relation` key containing "retract".
     *
     * @param array $message CrossRef `message` object.
     * @return array ['is_retracted' => bool, 'retraction_detail' => array] where
     *               the detail holds 'sources', 'retraction_notices',
     *               'record_ids' and, when applicable, 'is_retraction_notice'
     *               and 'relation'.
     */
    private function detectCrossRefRetraction($message)
    {
        $mentionsRetraction = function ($text) {
            return strpos(strtolower($text), 'retract') !== false;
        };

        $retracted = false;
        $sources   = [];
        $notices   = [];
        $recordIds = [];

        foreach (['updated-by', 'update-to'] as $field) {
            $hasUpdates = isset($message[$field]) && is_array($message[$field]);
            $updates    = $hasUpdates ? $message[$field] : [];

            foreach ($updates as $update) {
                $typeHit  = $mentionsRetraction($update['type'] ?? '');
                $labelHit = $mentionsRetraction($update['label'] ?? '');
                if (!$typeHit && !$labelHit) {
                    continue;
                }

                $retracted = true;
                $origin    = $update['source'] ?? 'publisher';
                $sources[] = $origin;

                $entry = [
                    'type'      => $update['type'] ?? '',
                    'label'     => $update['label'] ?? '',
                    'source'    => $origin,
                    'notice_doi'=> $update['DOI'] ?? '',
                    'date'      => isset($update['updated']) ? $this->parseDateParts($update['updated']) : '',
                    'record_id' => $update['record-id'] ?? '',
                ];
                $notices[] = $entry;

                if (!empty($entry['record_id'])) {
                    $recordIds[] = $entry['record_id'];
                }
            }
        }

        // Key order matters to consumers that serialise this array, so the
        // optional keys are appended after the three fixed ones.
        $detail = [
            'sources'            => array_values(array_unique($sources)),
            'retraction_notices' => $notices,
            'record_ids'         => array_values(array_unique($recordIds)),
        ];

        $typeHit    = $mentionsRetraction($message['type'] ?? '');
        $subtypeHit = $mentionsRetraction($message['subtype'] ?? '');
        if ($typeHit || $subtypeHit) {
            $retracted = true;
            $detail['is_retraction_notice'] = true;
        }

        if (isset($message['relation']) && is_array($message['relation'])) {
            foreach ($message['relation'] as $relationType => $targets) {
                if ($mentionsRetraction($relationType)) {
                    $retracted = true;
                    $detail['relation'] = [$relationType => $targets];
                    break;
                }
            }
        }

        return ['is_retracted' => $retracted, 'retraction_detail' => $detail];
    }

    // ===================== Retraction Watch (via CrossRef) =====================

    /**
     * Search CrossRef for retraction updates by author name and report those
     * that carry Retraction Watch evidence.
     *
     * Queries /works with `query.author` and the `update-type:retraction`
     * filter (25 rows). A record counts as coming from Retraction Watch when one
     * of its update sources names it or when it carries a record id.
     *
     * @param string $authorName Author name used for the CrossRef author query.
     * @return array ['count' => int, 'list' => array, 'source' => 'retraction_watch'],
     *               or ['count' => 0, 'list' => [], 'error' => string] on failure.
     */
    public function fetchRetractionWatchByAuthor($authorName)
    {
        $queryString = http_build_query([
            'query.author' => $authorName,
            'filter'       => 'update-type:retraction',
            'rows'         => 25,
            'mailto'       => $this->mailto,
        ]);

        $response = $this->httpGet($this->crossRefBase . '/works?' . $queryString, [
            'Accept: application/json',
            'User-Agent: TMRJournals-BackgroundCheck/1.0 (mailto:' . $this->mailto . ')',
        ]);

        if (!$response['success']) {
            return ['count' => 0, 'list' => [], 'error' => $response['error']];
        }

        if ($response['http_code'] != 200) {
            return ['count' => 0, 'list' => [], 'error' => 'CrossRef返回 HTTP ' . $response['http_code']];
        }

        $decoded = json_decode($response['body'], true);
        $works   = $decoded['message']['items'] ?? [];

        $records = [];
        foreach ($works as $work) {
            $detail = $this->parseCrossRefRetractionDetail($work['DOI'] ?? '', $work);
            if ($detail['is_retracted']) {
                $watchSources = array_filter($detail['retraction_detail']['sources'] ?? [], function ($name) {
                    return stripos($name, 'retraction-watch') !== false || stripos($name, 'retraction_watch') !== false;
                });
                $hasWatchEvidence = !empty($watchSources) || !empty($detail['retraction_detail']['record_ids']);

                $records[] = [
                    'title'             => $detail['title'],
                    'doi'               => $detail['doi'],
                    'journal'           => $detail['journal'],
                    'publisher'         => $detail['publisher'],
                    'published_date'    => $detail['published_date'],
                    'is_retracted'      => true,
                    'retraction_detail' => $detail['retraction_detail'],
                    'from_retraction_watch' => $hasWatchEvidence,
                    'source'            => 'retraction_watch',
                ];
            }
        }

        return [
            'count'  => count($records),
            'list'   => $records,
            'source' => 'retraction_watch',
        ];
    }

    /**
     * Merge OpenAlex and Retraction Watch retraction lists, de-duplicated by DOI.
     *
     * Records are keyed by lower-cased DOI; records without a DOI are keyed by a
     * hash of their content. The first record seen for a key supplies the
     * descriptive fields; later ones only add their source, the Retraction
     * Watch flag and a retraction detail if none was stored yet.
     *
     * @param array $openAlexRetractions Result shaped like ['count' => int, 'list' => array].
     * @param array $rwRetractions       Result shaped like ['count' => int, 'list' => array].
     * @param bool  $withCrossRefDetail  When true, each merged row is enriched via CrossRef.
     * @return array Keys: count, openalex_count, rw_count, rw_only_count, list.
     */
    public function mergeRetractionRecords($openAlexRetractions, $rwRetractions, $withCrossRefDetail = false)
    {
        $byKey = [];

        foreach ([$openAlexRetractions, $rwRetractions] as $batch) {
            foreach ($batch['list'] ?? [] as $record) {
                $doi = $this->cleanDoi($record['doi'] ?? '');
                $key = ($doi === '') ? md5(json_encode($record)) : strtolower($doi);
                $flaggedByWatch = !empty($record['from_retraction_watch']);

                if (!isset($byKey[$key])) {
                    $byKey[$key] = [
                        'title'            => $record['title'] ?? '',
                        'doi'              => $doi,
                        'journal'          => $record['journal'] ?? '',
                        'publication_date' => $record['publication_date'] ?? ($record['published_date'] ?? ''),
                        'sources'          => [],
                        'retraction_detail'=> $record['retraction_detail'] ?? [],
                        'from_retraction_watch' => $flaggedByWatch,
                    ];
                }

                $origin = $record['source'] ?? 'unknown';
                if (!in_array($origin, $byKey[$key]['sources'])) {
                    $byKey[$key]['sources'][] = $origin;
                }
                if ($flaggedByWatch) {
                    $byKey[$key]['from_retraction_watch'] = true;
                }
                if (empty($byKey[$key]['retraction_detail']) && !empty($record['retraction_detail'])) {
                    $byKey[$key]['retraction_detail'] = $record['retraction_detail'];
                }
            }
        }

        $rows = array_values($byKey);
        if ($withCrossRefDetail) {
            $rows = $this->enrichRetractionsWithCrossRef($rows);
        }

        // Rows flagged by Retraction Watch that no second source confirmed.
        $watchOnly = 0;
        foreach ($rows as $row) {
            $singleSource = count($row['sources'] ?? []) <= 1;
            if (!empty($row['from_retraction_watch']) && $singleSource) {
                $watchOnly++;
            }
        }

        return [
            'count'            => count($rows),
            'openalex_count'   => intval($openAlexRetractions['count'] ?? 0),
            'rw_count'         => intval($rwRetractions['count'] ?? 0),
            'rw_only_count'    => $watchOnly,
            'list'             => $rows,
        ];
    }

    // ===================== 格式化 =====================

    /**
     * Reduce an OpenAlex author record to the summary used in search listings.
     *
     * @param array $author Raw OpenAlex author object.
     * @return array Keys: openalex_id, name, orcid, works_count, cited_by_count,
     *               h_index, institutions (list of name/country), openalex_url.
     */
    public function formatAuthorBrief($author)
    {
        $authorUrl = $author['id'] ?? '';

        $affiliations = [];
        foreach ($author['last_known_institutions'] ?? [] as $institution) {
            $affiliations[] = [
                'name'    => $institution['display_name'] ?? '',
                'country' => $institution['country_code'] ?? '',
            ];
        }

        $summary = [];
        $summary['openalex_id']    = $this->extractOpenAlexId($authorUrl);
        $summary['name']           = $author['display_name'] ?? '';
        $summary['orcid']          = $this->extractOrcid($author['orcid'] ?? '');
        $summary['works_count']    = intval($author['works_count'] ?? 0);
        $summary['cited_by_count'] = intval($author['cited_by_count'] ?? 0);
        $summary['h_index']        = intval($author['summary_stats']['h_index'] ?? 0);
        $summary['institutions']   = $affiliations;
        $summary['openalex_url']   = $authorUrl;

        return $summary;
    }

    /**
     * Extract headline bibliometric figures from an OpenAlex author record.
     *
     * @param array $author Raw OpenAlex author object.
     * @return array Keys: works_count, cited_by_count, h_index, i10_index,
     *               two_year_mean_cited (rounded to 2 decimals), level_label.
     */
    public function parseAuthorMetrics($author)
    {
        $summaryStats = $author['summary_stats'] ?? [];
        $meanCited    = floatval($summaryStats['2yr_mean_citedness'] ?? 0);

        $metrics = [];
        $metrics['works_count']         = intval($author['works_count'] ?? 0);
        $metrics['cited_by_count']      = intval($author['cited_by_count'] ?? 0);
        $metrics['h_index']             = intval($summaryStats['h_index'] ?? 0);
        $metrics['i10_index']           = intval($summaryStats['i10_index'] ?? 0);
        $metrics['two_year_mean_cited'] = round($meanCited, 2);
        $metrics['level_label']         = $this->getAcademicLevelLabel($summaryStats);

        return $metrics;
    }

    /**
     * Build a short list of the author's research topics.
     *
     * Entries come from `x_concepts`; `topics` is consulted only when that
     * yields nothing. Entries without a display name are skipped.
     *
     * @param array $author Raw OpenAlex author object.
     * @return array At most 8 items of ['name' => string, 'score' => float]
     *               with the score rounded to 3 decimals.
     */
    public function parseResearchTopics($author)
    {
        $collect = function ($entries) {
            $picked = [];
            foreach ($entries as $entry) {
                if (!empty($entry['display_name'])) {
                    $picked[] = [
                        'name'  => $entry['display_name'],
                        'score' => round(floatval($entry['score'] ?? 0), 3),
                    ];
                }
            }
            return $picked;
        };

        $found = $collect($author['x_concepts'] ?? []);
        if (empty($found)) {
            $found = $collect($author['topics'] ?? []);
        }

        return array_slice($found, 0, 8);
    }

    /**
     * Derive a risk rating from the number of retractions found for an author.
     *
     * Base rating: no retraction is low (10), exactly one is medium (50), any
     * other count is high (80 plus 5 per retraction, capped at +20). A low
     * rating is raised to medium when Retraction Watch-only records exist, and
     * a medium rating is raised to high when the retraction rate reaches 5%.
     *
     * @param array $metrics     Author metrics; 'works_count' is used for the rate.
     * @param array $retractions Merged retraction result ('count', 'rw_only_count').
     * @return array Keys: level, level_label, score, retraction_count,
     *               retraction_rate (string with "%"), rw_only_count, reasons.
     */
    public function assessRisk($metrics, $retractions)
    {
        $total  = intval($retractions['count'] ?? 0);
        $rwOnly = intval($retractions['rw_only_count'] ?? 0);
        $notes  = [];

        switch ($total) {
            case 0:
                $level   = 'low';
                $score   = 10;
                $notes[] = 'OpenAlex 与 Retraction Watch 均未发现撤稿记录';
                break;
            case 1:
                $level   = 'medium';
                $score   = 50;
                $notes[] = '发现 1 篇撤稿论文，建议人工核实撤稿原因';
                break;
            default:
                $level   = 'high';
                $score   = 80 + min($total * 5, 20);
                $notes[] = '发现 ' . $total . ' 篇撤稿论文，存在较高学术风险';
                break;
        }

        if ($rwOnly > 0) {
            $notes[] = 'Retraction Watch 额外发现 ' . $rwOnly . ' 条 OpenAlex 未收录的撤稿记录';
            if ($level === 'low') {
                $level = 'medium';
                $score = max($score, 45);
            }
        }

        // A works count of zero is treated as one so the division is defined.
        $works = max(intval($metrics['works_count'] ?? 0), 1);
        $rate  = round($total / $works * 100, 2);
        if ($total > 0 && $rate >= 5) {
            $notes[] = '撤稿率 ' . $rate . '%，比例偏高';
            if ($level === 'medium') {
                $level = 'high';
                $score = max($score, 70);
            }
        }

        return [
            'level'            => $level,
            'level_label'      => $this->getRiskLevelLabel($level),
            'score'            => min($score, 100),
            'retraction_count' => $total,
            'retraction_rate'  => $rate . '%',
            'rw_only_count'    => $rwOnly,
            'reasons'          => $notes,
        ];
    }

    // ===================== 内部工具 =====================

    /**
     * Reduce an OpenAlex work record to the fields reported to callers.
     *
     * @param array $work Raw OpenAlex work object.
     * @return array Keys: title, doi (bare), publication_date, journal,
     *               cited_by_count, openalex_url, source ('openalex').
     */
    private function formatOpenAlexWork($work)
    {
        $venueName = $work['primary_location']['source']['display_name'] ?? '';
        $citations = intval($work['cited_by_count'] ?? 0);

        $formatted = [];
        $formatted['title']            = $work['display_name'] ?? '';
        $formatted['doi']              = $this->extractDoi($work);
        $formatted['publication_date'] = $work['publication_date'] ?? '';
        $formatted['journal']          = $venueName;
        $formatted['cited_by_count']   = $citations;
        $formatted['openalex_url']     = $work['id'] ?? '';
        $formatted['source']           = 'openalex';

        return $formatted;
    }

    /**
     * Normalise the CrossRef `author` list.
     *
     * @param mixed $authorList CrossRef author entries; anything empty or
     *                          non-array yields an empty list.
     * @return array List of ['given', 'family', 'name', 'orcid']; 'name' falls
     *               back to "given family" when the entry has no own name.
     */
    private function parseCrossRefAuthors($authorList)
    {
        $people = [];
        if (!is_array($authorList) || empty($authorList)) {
            return $people;
        }

        foreach ($authorList as $entry) {
            $given  = $entry['given'] ?? '';
            $family = $entry['family'] ?? '';

            $people[] = [
                'given'  => $given,
                'family' => $family,
                'name'   => $entry['name'] ?? trim($given . ' ' . $family),
                'orcid'  => $entry['ORCID'] ?? '',
            ];
        }

        return $people;
    }

    /**
     * Turn a CrossRef date object into "Y-m-d", "Y-m" or "Y".
     *
     * Only the first entry of `date-parts` is read; month and day are
     * zero-padded to two digits.
     *
     * @param array $dateObj CrossRef date object holding a `date-parts` list.
     * @return string The most precise date available, or '' when there is none.
     */
    private function parseDateParts($dateObj)
    {
        if (!isset($dateObj['date-parts'][0])) {
            return '';
        }

        $pieces = $dateObj['date-parts'][0];
        $pad = function ($value) {
            return sprintf('%02d', $value);
        };

        $year  = $pieces[0] ?? '';
        $month = isset($pieces[1]) ? $pad($pieces[1]) : '';
        $day   = isset($pieces[2]) ? $pad($pieces[2]) : '';

        // Without both a year and a month only the year (possibly empty) is usable.
        if (!$year || !$month) {
            return (string)$year;
        }

        $stamp = $year . '-' . $month;

        return $day ? $stamp . '-' . $day : $stamp;
    }

    /**
     * Map an author's h-index to a descriptive seniority label.
     *
     * @param array $stats OpenAlex `summary_stats`; only 'h_index' is read.
     * @return string Label for the highest threshold the h-index reaches.
     */
    private function getAcademicLevelLabel($stats)
    {
        $hIndex = intval($stats['h_index'] ?? 0);

        // Thresholds in descending order; the first one reached wins.
        $tiers = [
            50 => '国际顶尖学者',
            30 => '资深专家',
            15 => '活跃研究者',
            5  => '青年学者',
            1  => '初入领域',
        ];

        foreach ($tiers as $floor => $label) {
            if ($hIndex >= $floor) {
                return $label;
            }
        }

        return '暂无足够公开数据';
    }

    /**
     * Translate a risk level code into its display label.
     *
     * @param string $level One of 'low', 'medium', 'high'.
     * @return string The matching label, or the "unknown" label for other values.
     */
    private function getRiskLevelLabel($level)
    {
        $labels = [
            'low'    => '低风险',
            'medium' => '中风险',
            'high'   => '高风险',
        ];

        return isset($labels[$level]) ? $labels[$level] : '未知';
    }

    /**
     * Strip the leading "http(s)://openalex.org/" from an OpenAlex URL.
     *
     * @param string $id OpenAlex entity URL or bare id.
     * @return string The value without the URL prefix; other input is returned unchanged.
     */
    public function extractOpenAlexId($id)
    {
        $prefixPattern = '/^https?:\/\/openalex\.org\//';
        $bareId = preg_replace($prefixPattern, '', $id);

        return $bareId;
    }

    /**
     * Strip the leading "http(s)://orcid.org/" from an ORCID URL.
     *
     * @param string $orcid ORCID URL or bare iD; '' is returned as is.
     * @return string The value without the URL prefix.
     */
    public function extractOrcid($orcid)
    {
        return $orcid === ''
            ? ''
            : preg_replace('/^https?:\/\/orcid\.org\//', '', $orcid);
    }

    /**
     * Trim an ORCID value and drop a leading "http(s)://orcid.org/".
     *
     * @param string $orcid ORCID URL or bare iD, possibly padded with whitespace.
     * @return string The trimmed value without the URL prefix.
     */
    public function cleanOrcid($orcid)
    {
        $withoutPrefix = preg_replace('/^https?:\/\/orcid\.org\//', '', trim($orcid));

        return trim($withoutPrefix);
    }

    /**
     * Read a work's DOI and drop a leading "http(s)://doi.org/".
     *
     * @param array $work OpenAlex work object; a missing or null 'doi' yields ''.
     * @return string The DOI without the resolver prefix.
     */
    private function extractDoi($work)
    {
        $rawDoi = $work['doi'] ?? '';
        $resolverPrefix = '/^https?:\/\/doi\.org\//';

        return preg_replace($resolverPrefix, '', $rawDoi);
    }

    /**
     * Issue an HTTP GET request with cURL.
     *
     * Only transport failures are reported as errors; a response with any HTTP
     * status (including 4xx/5xx) is returned as success together with its
     * status code, and callers decide how to treat it.
     *
     * @param string   $url     Absolute URL to request.
     * @param string[] $headers Raw request header lines.
     * @return array ['success' => true, 'body' => string, 'http_code' => int]
     *               or ['success' => false, 'error' => string].
     */
    private function httpGet($url, $headers = [])
    {
        $ch = curl_init();
        if ($ch === false) {
            return ['success' => false, 'error' => 'HTTP请求失败: curl 初始化失败'];
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        // Certificate and host verification stay enabled: with them switched off,
        // anyone on the network path could forge the API responses this service
        // bases its assessments on. If a host fails here, its CA bundle is missing
        // (see the curl.cainfo ini setting); fix that instead of disabling the check.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        // Fail fast on an unreachable host instead of waiting out the full timeout.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

        $body = curl_exec($ch);

        if ($body === false || curl_errno($ch)) {
            $error = curl_error($ch);
            curl_close($ch);
            return ['success' => false, 'error' => 'HTTP请求失败: ' . $error];
        }

        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        return ['success' => true, 'body' => $body, 'http_code' => $httpCode];
    }
}