Files
tougao/application/api/controller/Author.php
2026-06-02 17:59:17 +08:00

303 lines
11 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
/**
* Created by PhpStorm.
* User: Administrator
* Date: 2026/6/2
* Time: 15:04
*/
namespace app\api\controller;
/**
 * API controller that looks up an author's h-index.
 *
 * The primary source is the public Scopus "free author lookup" form, which is
 * fetched and submitted with cURL. When Scopus answers with a login or CAPTCHA
 * page, the OpenAlex authors API is queried as a best-effort fallback.
 */
class Author
{
    /**
     * Look up an author's h-index and return the result as a JSON response.
     *
     * Reads the GET parameters `name` (required), `affil` (optional) and
     * `debug` (when equal to 1, a `debug` entry with request diagnostics is
     * added to the response).
     *
     * @return mixed The value produced by the framework's json() helper.
     */
    public function get_hindex()
    {
        // Cast before trimming: input() may hand back null for an absent
        // parameter (framework helper, not visible here) and trim(null) is
        // deprecated since PHP 8.1.
        $name = trim((string) input('get.name'));
        $affil = trim((string) input('get.affil'));
        $debug = ((int) input('get.debug', 0)) === 1;

        if (empty($name)) {
            return json(['code' => 0, 'msg' => '请输入作者姓名']);
        }

        // Create the cookie jar only after validation: tempnam() creates the
        // file on disk, so doing it earlier leaked one file per rejected request.
        $cookieFile = tempnam(sys_get_temp_dir(), 'scopus_cookie_');
        if ($cookieFile === false) {
            // httpRequest() skips cookie handling for an empty path.
            $cookieFile = '';
        }

        try {
            $ret = $this->lookupScopusHIndex($name, $affil, $debug, $cookieFile);
        } finally {
            // Single cleanup point for every exit path; best effort by design.
            if ($cookieFile !== '') {
                @unlink($cookieFile);
            }
        }

        return json($ret);
    }

    /**
     * Run the Scopus lookup flow and build the response payload.
     *
     * @param string $name       Author name, posted as the `authLast` field.
     * @param string $affil      Affiliation, posted as the `affil` field.
     * @param bool   $debug      Whether to attach diagnostics to the payload.
     * @param string $cookieFile Cookie jar path shared by both requests ('' disables cookies).
     *
     * @return array Response payload; `code` is 1 on success and 0 on failure.
     */
    private function lookupScopusHIndex($name, $affil, $debug, $cookieFile)
    {
        // 1) Fetch the free-lookup page to obtain the real submit URL and the hidden fields.
        $lookupUrl = 'https://www.scopus.com/freelookup/form/author.uri?zone=TopNavBar&origin=NO%20ORIGIN%20DEFINED';
        $lookupRes = $this->httpRequest($lookupUrl, null, true, '', $cookieFile);
        if (!$lookupRes['ok']) {
            return $this->withDebug(
                ['code' => 0, 'msg' => '访问 Scopus 失败:' . $lookupRes['msg']],
                $lookupRes,
                $debug
            );
        }

        $formInfo = $this->extractScopusLookupForm($lookupRes['body']);
        if (empty($formInfo['action'])) {
            return $this->withDebug(
                ['code' => 0, 'msg' => 'Scopus 页面结构已变化,未找到查询表单'],
                $lookupRes,
                $debug
            );
        }

        // 2) Build the query (name + affiliation) on top of the hidden fields and submit it.
        $postData = $formInfo['hidden_fields'];
        $postData['authLast'] = $name;
        $postData['affil'] = $affil;
        $searchRes = $this->httpRequest($formInfo['action'], $postData, true, $lookupUrl, $cookieFile);
        if (!$searchRes['ok']) {
            return $this->withDebug(
                ['code' => 0, 'msg' => '查询 Scopus 失败:' . $searchRes['msg']],
                $searchRes,
                $debug
            );
        }

        $blockMsg = $this->detectScopusBlocking($searchRes['body']);
        if (!empty($blockMsg)) {
            $ret = ['code' => 0, 'msg' => $blockMsg];
            $fallback = $this->fallbackByOpenAlex($name, $affil);
            if ($fallback !== null) {
                // The fallback payload wins, but the message still explains why it was used.
                $ret = array_merge($fallback, [
                    'msg' => $blockMsg . ',已自动降级 OpenAlex 结果',
                ]);
            }

            return $this->withDebug($ret, $searchRes, $debug);
        }

        // 3) Extract the h-index from the result page (a number close to the "h-index" keyword).
        $hIndex = $this->extractHIndexFromHtml($searchRes['body']);
        if ($hIndex === null) {
            return $this->withDebug(
                [
                    'code' => 0,
                    'msg' => '未从 Scopus 结果页解析到 H 指数(可能需要人工登录或页面结构调整)',
                ],
                $searchRes,
                $debug
            );
        }

        return $this->withDebug(
            [
                'code' => 1,
                'name' => $name,
                'affil' => $affil,
                'h_index_scopus' => $hIndex,
                'source' => 'scopus_freelookup',
            ],
            $searchRes,
            $debug
        );
    }

    /**
     * Append a `debug` entry describing an HTTP result when debugging is on.
     *
     * @param array $ret   Response payload built so far.
     * @param array $res   Result array as returned by httpRequest().
     * @param bool  $debug Whether diagnostics were requested.
     *
     * @return array The payload, with `debug` added as the last key when enabled.
     */
    private function withDebug(array $ret, array $res, $debug)
    {
        if ($debug) {
            $ret['debug'] = $this->buildDebugInfo($res['url'], $res['http_code'], $res['body']);
        }

        return $ret;
    }

    /**
     * Perform a GET or (when $postData is an array) a form-encoded POST with cURL.
     *
     * @param string     $url            Target URL.
     * @param array|null $postData       Fields to POST; any non-array value means GET.
     * @param bool       $followLocation Whether redirects are followed (at most 8).
     * @param string     $referer        Referer header value; omitted when empty.
     * @param string     $cookieFile     Cookie jar path used for reading and writing; omitted when empty.
     *
     * @return array{ok:bool,msg:string,body:string,http_code:int,url:string}
     *               `ok` is false on transport errors, HTTP status >= 400 or status 0.
     *               `body` is empty for transport errors.
     */
    private function httpRequest($url, $postData = null, $followLocation = true, $referer = '', $cookieFile = '')
    {
        $ch = curl_init();
        if ($ch === false) {
            return ['ok' => false, 'msg' => 'curl_init failed', 'body' => '', 'http_code' => 0, 'url' => (string) $url];
        }

        $options = [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            // NOTE(review): certificate and host verification are switched off, which
            // allows man-in-the-middle tampering with the scraped pages. Kept as-is so
            // hosts without a configured CA bundle keep working; enable verification
            // once the deployment's CA store is confirmed.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_FOLLOWLOCATION => $followLocation,
            CURLOPT_MAXREDIRS => 8,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
            // Empty string lets cURL advertise every encoding it can decode.
            CURLOPT_ENCODING => '',
            CURLOPT_HTTPHEADER => [
                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language: zh-CN,zh;q=0.9,en;q=0.8',
            ],
        ];
        if (!empty($referer)) {
            $options[CURLOPT_REFERER] = $referer;
        }
        if (!empty($cookieFile)) {
            $options[CURLOPT_COOKIEJAR] = $cookieFile;
            $options[CURLOPT_COOKIEFILE] = $cookieFile;
        }
        if (is_array($postData)) {
            $options[CURLOPT_POST] = true;
            $options[CURLOPT_POSTFIELDS] = http_build_query($postData);
        }
        curl_setopt_array($ch, $options);

        $body = curl_exec($ch);
        $error = curl_error($ch);
        $errno = curl_errno($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $finalUrl = (string) curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        curl_close($ch);

        if ($body === false || $error !== '') {
            // Prefer the numeric code; the text match is kept for builds whose
            // error wording is the only available signal.
            $tooManyRedirects = $errno === CURLE_TOO_MANY_REDIRECTS
                || (strpos($error, 'Maximum (') !== false && strpos($error, 'redirects followed') !== false);
            if ($tooManyRedirects) {
                $msg = 'Scopus 跳转过多(可能触发登录/验证页面),请稍后重试或先在浏览器登录 Scopus';
            } else {
                $msg = $error !== '' ? $error : 'cURL error ' . $errno;
            }

            return ['ok' => false, 'msg' => $msg, 'body' => '', 'http_code' => $httpCode, 'url' => $finalUrl];
        }

        if ($httpCode >= 400 || $httpCode === 0) {
            return ['ok' => false, 'msg' => 'HTTP ' . $httpCode, 'body' => (string) $body, 'http_code' => $httpCode, 'url' => $finalUrl];
        }

        return ['ok' => true, 'msg' => '', 'body' => (string) $body, 'http_code' => $httpCode, 'url' => $finalUrl];
    }

    /**
     * Detect whether a page is a login wall or a CAPTCHA challenge.
     *
     * The check is a plain, case-insensitive keyword search on the tag-stripped text.
     *
     * @param string $html Response body.
     *
     * @return string A user-facing message when blocking is detected, '' otherwise.
     */
    private function detectScopusBlocking($html)
    {
        if (empty($html)) {
            return '';
        }

        $text = strtolower(strip_tags($html));

        // 'institutional sign in' contains 'sign in', so one search covers both.
        if (strpos($text, 'sign in') !== false) {
            return 'Scopus 返回登录页,当前环境未授权访问作者详情页面';
        }
        if (strpos($text, 'captcha') !== false || strpos($text, 'are you a robot') !== false) {
            return 'Scopus 触发了人机验证,当前接口无法自动通过';
        }

        return '';
    }

    /**
     * Summarise an HTTP response for the optional `debug` output.
     *
     * @param string $finalUrl Effective URL after redirects.
     * @param int    $httpCode HTTP status code.
     * @param string $html     Response body.
     *
     * @return array{final_url:string,http_code:int,page_snippet:string,contains_signin:int,contains_captcha:int}
     *               `page_snippet` holds the first 300 characters of the page text.
     */
    private function buildDebugInfo($finalUrl, $httpCode, $html)
    {
        $normalized = $this->normalizeHtmlText($html);

        return [
            'final_url' => (string) $finalUrl,
            'http_code' => (int) $httpCode,
            'page_snippet' => mb_substr($normalized, 0, 300, 'UTF-8'),
            'contains_signin' => stripos($normalized, 'sign in') !== false ? 1 : 0,
            'contains_captcha' => stripos($normalized, 'captcha') !== false ? 1 : 0,
        ];
    }

    /**
     * Convert HTML into single-spaced, valid UTF-8 plain text.
     *
     * @param string $html Raw markup.
     *
     * @return string Text with tags removed, entities decoded and whitespace runs collapsed.
     */
    private function normalizeHtmlText($html)
    {
        $text = html_entity_decode(strip_tags((string) $html), ENT_QUOTES, 'UTF-8');

        // Replace malformed byte sequences first: with the /u modifier PCRE rejects
        // invalid UTF-8 and preg_replace() would return null, wiping the whole text.
        $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');

        $collapsed = preg_replace('/\s+/u', ' ', $text);

        return $collapsed === null ? $text : $collapsed;
    }

    /**
     * Extract the submit URL and hidden fields of the lookup form.
     *
     * The first <form> element that carries an `action` attribute is used.
     *
     * @param string $html Lookup page markup.
     *
     * @return array{action:string,hidden_fields:array<string,string>}
     *               `action` is '' when no suitable form is found.
     */
    private function extractScopusLookupForm($html)
    {
        $ret = [
            'action' => '',
            'hidden_fields' => [],
        ];
        if (empty($html)) {
            return $ret;
        }

        if (!preg_match('/<form[^>]*action=["\']([^"\']+)["\'][^>]*>.*?<\/form>/is', $html, $formMatch)) {
            return $ret;
        }

        // Attribute values are HTML-encoded in the markup ("&amp;" for "&");
        // decode them before using them as a URL or as posted values.
        $action = trim(html_entity_decode($formMatch[1], ENT_QUOTES, 'UTF-8'));
        if (strpos($action, '//') === 0) {
            // Protocol-relative URL: it already names its host.
            $action = 'https:' . $action;
        } elseif (!preg_match('/^https?:\/\//i', $action)) {
            // Relative actions are resolved against the site root.
            $action = 'https://www.scopus.com' . (substr($action, 0, 1) === '/' ? '' : '/') . $action;
        }
        $ret['action'] = $action;

        if (preg_match_all('/<input[^>]*type=["\']hidden["\'][^>]*>/is', $formMatch[0], $inputs)) {
            foreach ($inputs[0] as $inputTag) {
                if (!preg_match('/name=["\']([^"\']+)["\']/i', $inputTag, $nameMatch)) {
                    // An input without a name is never submitted.
                    continue;
                }
                $fieldName = trim(html_entity_decode($nameMatch[1], ENT_QUOTES, 'UTF-8'));
                $fieldVal = '';
                if (preg_match('/value=["\']([^"\']*)["\']/i', $inputTag, $valMatch)) {
                    $fieldVal = html_entity_decode($valMatch[1], ENT_QUOTES, 'UTF-8');
                }
                $ret['hidden_fields'][$fieldName] = $fieldVal;
            }
        }

        return $ret;
    }

    /**
     * Find an h-index value in a result page.
     *
     * Looks for a 1-3 digit number within 20 non-digit characters after, or
     * failing that before, the keyword "h-index" / "h index" / "hindex".
     *
     * @param string $html Result page markup.
     *
     * @return int|null The parsed value, or null when nothing matches.
     */
    private function extractHIndexFromHtml($html)
    {
        if (empty($html)) {
            return null;
        }

        $text = $this->normalizeHtmlText($html);

        // The digit guards stop a match from taking a slice of a longer number,
        // e.g. "2024 h-index" must not be read as 24.
        $patterns = [
            '/h[\-\s]?index[^0-9]{0,20}([0-9]{1,3})(?![0-9])/iu',
            '/(?<![0-9])([0-9]{1,3})[^0-9]{0,20}h[\-\s]?index/iu',
        ];
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $text, $m) === 1) {
                return (int) $m[1];
            }
        }

        return null;
    }

    /**
     * Query the OpenAlex authors API as a fallback source.
     *
     * @param string $name  Author name used as the search term.
     * @param string $affil Affiliation used to choose among the candidates.
     *
     * @return array|null Success payload (`code` = 1, `source` = 'openalex_fallback'),
     *                    or null when the request fails or yields no author.
     */
    private function fallbackByOpenAlex($name, $affil)
    {
        // OpenAlex pages with "per-page"; the previously sent "limit" is not one of
        // its documented query parameters.
        $url = 'https://api.openalex.org/authors?' . http_build_query([
            'search' => (string) $name,
            'per-page' => 8,
        ]);
        $res = $this->httpRequest($url, null, true);
        if (!$res['ok']) {
            return null;
        }

        $data = json_decode($res['body'], true);
        if (!is_array($data) || !isset($data['results']) || !is_array($data['results'])) {
            return null;
        }
        // Keep only well-formed author objects and re-index so [0] is always defined.
        $list = array_values(array_filter($data['results'], 'is_array'));
        if (empty($list)) {
            return null;
        }

        $match = $this->pickOpenAlexAuthor($list, $affil);
        $institutions = $this->openAlexInstitutionNames($match);

        return [
            'code' => 1,
            'name' => $match['display_name'] ?? $name,
            'affil' => !empty($institutions) ? $institutions[0] : $affil,
            // NOTE(review): OpenAlex is not known to publish a Scopus h-index, so this
            // is presumably always null; the key is kept for response compatibility.
            'h_index_scopus' => $match['summary_stats']['h_index_scopus'] ?? null,
            'h_index_openalex' => $match['summary_stats']['h_index'] ?? null,
            'source' => 'openalex_fallback',
        ];
    }

    /**
     * Choose the candidate whose affiliations mention the requested one.
     *
     * @param array  $list  Non-empty, zero-indexed list of OpenAlex author objects.
     * @param string $affil Requested affiliation; matched case-insensitively as a substring.
     *
     * @return array The first matching author, or the first candidate when
     *               $affil is empty or nothing matches.
     */
    private function pickOpenAlexAuthor(array $list, $affil)
    {
        // mb_strtolower: institution names are not limited to ASCII.
        $target = mb_strtolower(trim((string) $affil), 'UTF-8');
        if ($target === '') {
            return $list[0];
        }

        foreach ($list as $candidate) {
            foreach ($this->openAlexInstitutionNames($candidate) as $instName) {
                if (strpos(mb_strtolower($instName, 'UTF-8'), $target) !== false) {
                    return $candidate;
                }
            }
        }

        // Best effort: fall back to the top search hit.
        return $list[0];
    }

    /**
     * List the institution names attached to an OpenAlex author object.
     *
     * OpenAlex nests the name as `affiliations[].institution.display_name`; a
     * flat `affiliations[].display_name` is accepted as well for compatibility
     * with the shape this code originally expected.
     *
     * @param mixed $author Decoded author object.
     *
     * @return string[] Non-empty institution names in their original order.
     */
    private function openAlexInstitutionNames($author)
    {
        if (!is_array($author) || !isset($author['affiliations']) || !is_array($author['affiliations'])) {
            return [];
        }

        $names = [];
        foreach ($author['affiliations'] as $entry) {
            if (!is_array($entry)) {
                continue;
            }
            $instName = $entry['institution']['display_name'] ?? ($entry['display_name'] ?? '');
            if (is_string($instName) && $instName !== '') {
                $names[] = $instName;
            }
        }

        return $names;
    }
}