文献校对功能完善
This commit is contained in:
303
application/api/controller/Author.php
Normal file
303
application/api/controller/Author.php
Normal file
@@ -0,0 +1,303 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: Administrator
|
||||
* Date: 2026/6/2
|
||||
* Time: 15:04
|
||||
*/
|
||||
|
||||
namespace app\api\controller;
|
||||
|
||||
|
||||
/**
 * Author metrics controller.
 *
 * Exposes one endpoint, get_hindex(), which tries to read an author's h-index
 * from the Scopus free author lookup pages and, when Scopus answers with a
 * login or captcha page, falls back to the OpenAlex authors API.
 */
class Author
{
    /**
     * Look up an author's h-index.
     *
     * Request parameters (GET):
     *  - name  : author name (required)
     *  - affil : affiliation, used to narrow the lookup (optional)
     *  - debug : 1 to include diagnostic details about the last fetched page
     *
     * @return mixed Whatever the framework's json() helper returns for the result array.
     */
    public function get_hindex()
    {
        // input() may yield null when a parameter is absent; cast before trim().
        $name = trim((string) input('get.name'));
        $affil = trim((string) input('get.affil'));
        $debug = (int) input('get.debug', 0);

        // Validate before allocating the cookie jar so a rejected request
        // does not leave a temporary file behind.
        if ($name === '') {
            return json(['code' => 0, 'msg' => '请输入作者姓名']);
        }

        $cookieFile = tempnam(sys_get_temp_dir(), 'scopus_cookie_');
        if ($cookieFile === false) {
            // Without a cookie jar the requests still run, just without cookies.
            $cookieFile = '';
        }

        try {
            return json($this->lookupHIndex($name, $affil, $debug === 1, $cookieFile));
        } finally {
            // Single cleanup point for every exit path; removal is best-effort.
            if ($cookieFile !== '') {
                @unlink($cookieFile);
            }
        }
    }

    /**
     * Run the Scopus lookup flow and build the response payload.
     *
     * @param string $name       Author name.
     * @param string $affil      Affiliation (may be empty).
     * @param bool   $withDebug  Whether to attach a "debug" entry to the payload.
     * @param string $cookieFile Cookie jar path shared by the requests ('' for none).
     * @return array Response payload; "code" is 1 on success and 0 on failure.
     */
    private function lookupHIndex($name, $affil, $withDebug, $cookieFile)
    {
        // 1) Fetch the freelookup page to obtain the real submit URL and hidden fields.
        $lookupUrl = 'https://www.scopus.com/freelookup/form/author.uri?zone=TopNavBar&origin=NO%20ORIGIN%20DEFINED';
        $lookupRes = $this->httpRequest($lookupUrl, null, true, '', $cookieFile);
        if (!$lookupRes['ok']) {
            return $this->attachDebug(
                ['code' => 0, 'msg' => '访问 Scopus 失败:' . $lookupRes['msg']],
                $withDebug,
                $lookupRes
            );
        }

        $formInfo = $this->extractScopusLookupForm($lookupRes['body']);
        if (empty($formInfo['action'])) {
            return $this->attachDebug(
                ['code' => 0, 'msg' => 'Scopus 页面结构已变化,未找到查询表单'],
                $withDebug,
                $lookupRes
            );
        }

        // 2) Submit the query (name + affiliation) together with the hidden fields.
        $postData = $formInfo['hidden_fields'];
        $postData['authLast'] = $name;
        $postData['affil'] = $affil;

        $searchRes = $this->httpRequest($formInfo['action'], $postData, true, $lookupUrl, $cookieFile);
        if (!$searchRes['ok']) {
            return $this->attachDebug(
                ['code' => 0, 'msg' => '查询 Scopus 失败:' . $searchRes['msg']],
                $withDebug,
                $searchRes
            );
        }

        $blockMsg = $this->detectScopusBlocking($searchRes['body']);
        if (!empty($blockMsg)) {
            $ret = ['code' => 0, 'msg' => $blockMsg];
            $fallback = $this->fallbackByOpenAlex($name, $affil);
            if ($fallback !== null) {
                $ret = array_merge($fallback, [
                    'msg' => $blockMsg . ',已自动降级 OpenAlex 结果',
                ]);
            }

            return $this->attachDebug($ret, $withDebug, $searchRes);
        }

        // 3) Extract the h-index from the result page (a number next to "h-index").
        $hIndex = $this->extractHIndexFromHtml($searchRes['body']);
        if ($hIndex === null) {
            return $this->attachDebug(
                [
                    'code' => 0,
                    'msg' => '未从 Scopus 结果页解析到 H 指数(可能需要人工登录或页面结构调整)',
                ],
                $withDebug,
                $searchRes
            );
        }

        return $this->attachDebug(
            [
                'code' => 1,
                'name' => $name,
                'affil' => $affil,
                'h_index_scopus' => $hIndex,
                'source' => 'scopus_freelookup',
            ],
            $withDebug,
            $searchRes
        );
    }

    /**
     * Add a "debug" entry describing an HTTP result to a payload when requested.
     *
     * @param array $ret       Response payload.
     * @param bool  $withDebug Whether debug output was requested.
     * @param array $res       Result array as returned by httpRequest().
     * @return array The payload, with "debug" added when $withDebug is true.
     */
    private function attachDebug(array $ret, $withDebug, array $res)
    {
        if ($withDebug) {
            $ret['debug'] = $this->buildDebugInfo($res['url'], $res['http_code'], $res['body']);
        }

        return $ret;
    }

    /**
     * Perform a GET (or, when $postData is an array, a form POST) with cURL.
     *
     * @param string     $url            Target URL.
     * @param array|null $postData       Form fields to POST; null performs a GET.
     * @param bool       $followLocation Whether to follow redirects (at most 8).
     * @param string     $referer        Referer header value; '' sends none.
     * @param string     $cookieFile     Cookie jar path; '' disables cookie persistence.
     * @return array{ok:bool,msg:string,body:string,http_code:int,url:string}
     *         "ok" is false on a transport error, on HTTP status 0, or on status >= 400.
     */
    private function httpRequest($url, $postData = null, $followLocation = true, $referer = '', $cookieFile = '')
    {
        $ch = curl_init();
        $options = [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            // Keep certificate and host-name verification on: the response decides
            // what is reported to the user, so it must come from the real host.
            // If the server lacks a CA bundle, configure curl.cainfo in php.ini
            // instead of turning verification off.
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_FOLLOWLOCATION => $followLocation,
            CURLOPT_MAXREDIRS => 8,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
            // Empty string lets cURL advertise every encoding it supports.
            CURLOPT_ENCODING => '',
            CURLOPT_HTTPHEADER => [
                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language: zh-CN,zh;q=0.9,en;q=0.8',
            ],
        ];

        if (!empty($referer)) {
            $options[CURLOPT_REFERER] = $referer;
        }

        if (!empty($cookieFile)) {
            // Same file for reading and writing so cookies carry across requests.
            $options[CURLOPT_COOKIEJAR] = $cookieFile;
            $options[CURLOPT_COOKIEFILE] = $cookieFile;
        }

        if (is_array($postData)) {
            $options[CURLOPT_POST] = true;
            $options[CURLOPT_POSTFIELDS] = http_build_query($postData);
        }

        curl_setopt_array($ch, $options);
        $body = curl_exec($ch);
        $error = curl_error($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $finalUrl = (string) curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        curl_close($ch);

        if ($error) {
            // cURL reports hitting CURLOPT_MAXREDIRS as "Maximum (N) redirects followed".
            $tooManyRedirects = strpos($error, 'Maximum (') !== false
                && strpos($error, 'redirects followed') !== false;

            return [
                'ok' => false,
                'msg' => $tooManyRedirects
                    ? 'Scopus 跳转过多(可能触发登录/验证页面),请稍后重试或先在浏览器登录 Scopus'
                    : $error,
                'body' => '',
                'http_code' => $httpCode,
                'url' => $finalUrl,
            ];
        }

        if ($httpCode >= 400 || $httpCode === 0) {
            return ['ok' => false, 'msg' => 'HTTP ' . $httpCode, 'body' => (string) $body, 'http_code' => $httpCode, 'url' => $finalUrl];
        }

        return ['ok' => true, 'msg' => '', 'body' => (string) $body, 'http_code' => $httpCode, 'url' => $finalUrl];
    }

    /**
     * Detect a login or captcha page in a Scopus response.
     *
     * The check is a plain keyword search over the tag-stripped, lower-cased text.
     *
     * @param string $html Response body.
     * @return string A user-facing message when a blocking page is detected, '' otherwise.
     */
    private function detectScopusBlocking($html)
    {
        if (empty($html)) {
            return '';
        }

        $text = strtolower(strip_tags($html));

        // "institutional sign in" also contains "sign in", so one needle covers both.
        if (strpos($text, 'sign in') !== false) {
            return 'Scopus 返回登录页,当前环境未授权访问作者详情页面';
        }

        foreach (['captcha', 'are you a robot'] as $needle) {
            if (strpos($text, $needle) !== false) {
                return 'Scopus 触发了人机验证,当前接口无法自动通过';
            }
        }

        return '';
    }

    /**
     * Summarise a fetched page for the optional "debug" response entry.
     *
     * @param string $finalUrl Effective URL after redirects.
     * @param int    $httpCode HTTP status code.
     * @param string $html     Response body.
     * @return array final_url, http_code, a 300-character text snippet, and 0/1 flags
     *               telling whether the text contains "sign in" / "captcha".
     */
    private function buildDebugInfo($finalUrl, $httpCode, $html)
    {
        $plain = html_entity_decode(strip_tags((string) $html), ENT_QUOTES, 'UTF-8');
        // With the /u modifier preg_replace() returns null on malformed UTF-8;
        // fall back to the uncollapsed text rather than passing null on.
        $collapsed = preg_replace('/\s+/u', ' ', $plain);
        $normalized = $collapsed === null ? $plain : $collapsed;

        return [
            'final_url' => (string) $finalUrl,
            'http_code' => (int) $httpCode,
            'page_snippet' => mb_substr($normalized, 0, 300, 'UTF-8'),
            'contains_signin' => stripos($normalized, 'sign in') !== false ? 1 : 0,
            'contains_captcha' => stripos($normalized, 'captcha') !== false ? 1 : 0,
        ];
    }

    /**
     * Locate the lookup form in the Scopus page and collect its submit data.
     *
     * Among all <form> elements that carry an action attribute, the first one whose
     * action contains "author" is preferred; if none does, the first form is used.
     * The action is resolved against https://www.scopus.com when it is not absolute.
     *
     * @param string $html Page HTML.
     * @return array{action:string,hidden_fields:array<string,string>}
     *         "action" is '' when no form with an action attribute is found.
     */
    private function extractScopusLookupForm($html)
    {
        $ret = [
            'action' => '',
            'hidden_fields' => [],
        ];

        if (empty($html)) {
            return $ret;
        }

        $formPattern = '/<form[^>]*action=["\']([^"\']+)["\'][^>]*>.*?<\/form>/is';
        if (!preg_match_all($formPattern, $html, $forms, PREG_SET_ORDER)) {
            return $ret;
        }

        // Prefer the form that targets an author endpoint to avoid picking up
        // an unrelated form (e.g. a site-wide search box) that appears earlier.
        $formMatch = $forms[0];
        foreach ($forms as $candidate) {
            if (stripos($candidate[1], 'author') !== false) {
                $formMatch = $candidate;
                break;
            }
        }

        // Attribute values are HTML-escaped in the markup ("&amp;" etc.); decode
        // them so the URL and field values are sent as a browser would send them.
        $action = trim(html_entity_decode($formMatch[1], ENT_QUOTES, 'UTF-8'));
        if (strpos($action, '//') === 0) {
            // Protocol-relative URL.
            $action = 'https:' . $action;
        } elseif (!preg_match('/^https?:\/\//i', $action)) {
            $action = 'https://www.scopus.com' . (substr($action, 0, 1) === '/' ? '' : '/') . $action;
        }
        $ret['action'] = $action;

        if (preg_match_all('/<input[^>]*type=["\']hidden["\'][^>]*>/is', $formMatch[0], $inputs)) {
            foreach ($inputs[0] as $inputTag) {
                // The lookbehind keeps attributes such as data-name from matching.
                if (!preg_match('/(?<![\w-])name=["\']([^"\']+)["\']/i', $inputTag, $nameMatch)) {
                    continue;
                }

                $fieldVal = '';
                if (preg_match('/(?<![\w-])value=["\']([^"\']*)["\']/i', $inputTag, $valMatch)) {
                    $fieldVal = html_entity_decode($valMatch[1], ENT_QUOTES, 'UTF-8');
                }

                $fieldName = trim(html_entity_decode($nameMatch[1], ENT_QUOTES, 'UTF-8'));
                $ret['hidden_fields'][$fieldName] = $fieldVal;
            }
        }

        return $ret;
    }

    /**
     * Extract an h-index from page HTML.
     *
     * Looks for a 1-3 digit number within 20 non-digit characters after the words
     * "h-index" / "h index" / "hindex", then for such a number before them.
     *
     * @param string $html Page HTML.
     * @return int|null The first matching number, or null when nothing matches.
     */
    private function extractHIndexFromHtml($html)
    {
        if (empty($html)) {
            return null;
        }

        $plain = html_entity_decode(strip_tags($html), ENT_QUOTES, 'UTF-8');
        // preg_replace() with /u yields null on malformed UTF-8; keep the raw text then.
        $collapsed = preg_replace('/\s+/u', ' ', $plain);
        $text = $collapsed === null ? $plain : $collapsed;

        $patterns = [
            '/h[\-\s]?index[^0-9]{0,20}([0-9]{1,3})/iu',
            '/([0-9]{1,3})[^0-9]{0,20}h[\-\s]?index/iu',
        ];
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $text, $m)) {
                return (int) $m[1];
            }
        }

        return null;
    }

    /**
     * Fallback lookup through the OpenAlex authors API.
     *
     * @param string $name  Author name used as the search term.
     * @param string $affil Affiliation used to choose among the returned authors (may be empty).
     * @return array|null Result payload with "source" => "openalex_fallback", or null when
     *                    the request fails or returns no authors.
     */
    private function fallbackByOpenAlex($name, $affil)
    {
        // OpenAlex's page-size parameter is "per-page"; "limit" is not one of its parameters.
        $url = 'https://api.openalex.org/authors?' . http_build_query([
            'search' => $name,
            'per-page' => 8,
        ]);
        $res = $this->httpRequest($url, null, true);
        if (!$res['ok']) {
            return null;
        }

        $data = json_decode($res['body'], true);
        $list = (is_array($data) && isset($data['results']) && is_array($data['results']))
            ? $data['results']
            : [];

        $match = $this->pickOpenAlexAuthor($list, $affil);
        if ($match === null) {
            return null;
        }

        $firstAffil = '';
        if (!empty($match['affiliations']) && is_array($match['affiliations'])) {
            $firstAffil = $this->openAlexAffiliationName(reset($match['affiliations']));
        }

        return [
            'code' => 1,
            'name' => $match['display_name'] ?? $name,
            'affil' => $firstAffil !== '' ? $firstAffil : $affil,
            // NOTE(review): OpenAlex's documented summary_stats appears to expose only
            // h_index / i10_index / 2yr_mean_citedness, so this is expected to be null;
            // the key is kept so the response shape matches the Scopus path — confirm.
            'h_index_scopus' => $match['summary_stats']['h_index_scopus'] ?? null,
            'h_index_openalex' => $match['summary_stats']['h_index'] ?? null,
            'source' => 'openalex_fallback',
        ];
    }

    /**
     * Choose the author entry to report from an OpenAlex result list.
     *
     * With an empty affiliation the first entry is taken. Otherwise the first entry
     * having an affiliation whose name contains $affil (case-insensitive) is taken,
     * and the first entry is used when none matches.
     *
     * @param array  $list  Decoded "results" list.
     * @param string $affil Wanted affiliation (may be empty).
     * @return array|null The chosen entry, or null when the list has no usable entry.
     */
    private function pickOpenAlexAuthor(array $list, $affil)
    {
        $target = mb_strtolower(trim((string) $affil), 'UTF-8');
        $first = null;

        foreach ($list as $item) {
            if (!is_array($item)) {
                continue;
            }
            if ($first === null) {
                $first = $item;
            }
            if ($target === '') {
                break;
            }

            $affiliations = (isset($item['affiliations']) && is_array($item['affiliations']))
                ? $item['affiliations']
                : [];
            foreach ($affiliations as $affiliation) {
                $instName = mb_strtolower($this->openAlexAffiliationName($affiliation), 'UTF-8');
                if ($instName !== '' && strpos($instName, $target) !== false) {
                    return $item;
                }
            }
        }

        return $first;
    }

    /**
     * Read the institution name from one OpenAlex affiliation entry.
     *
     * Accepts the nested form (entry.institution.display_name) as well as a flat
     * entry.display_name.
     *
     * @param mixed $affiliation One element of an author's "affiliations" list.
     * @return string The name, or '' when it is not present.
     */
    private function openAlexAffiliationName($affiliation)
    {
        if (!is_array($affiliation)) {
            return '';
        }

        $name = $affiliation['institution']['display_name'] ?? ($affiliation['display_name'] ?? '');

        return is_string($name) ? $name : '';
    }
}
|
||||
Reference in New Issue
Block a user