文献校对功能完善

This commit is contained in:
wyn
2026-06-02 17:59:17 +08:00
parent ff7e373633
commit 991ca7ce8c
6 changed files with 726 additions and 6 deletions

View File

@@ -0,0 +1,303 @@
<?php
/**
* Created by PhpStorm.
* User: Administrator
* Date: 2026/6/2
* Time: 15:04
*/
namespace app\api\controller;
/**
 * Author metrics endpoint.
 *
 * Scrapes the Scopus "free author lookup" form to obtain an author's h-index and,
 * when Scopus answers with a login or CAPTCHA page, falls back to the OpenAlex API.
 *
 * input() and json() are helpers defined outside this file (presumably the
 * framework's request/response helpers — confirm against the framework in use).
 */
class Author
{
    /** Entry page of the Scopus free author lookup; also sent as the Referer of the search POST. */
    const SCOPUS_LOOKUP_URL = 'https://www.scopus.com/freelookup/form/author.uri?zone=TopNavBar&origin=NO%20ORIGIN%20DEFINED';

    /** Origin used to turn relative form actions scraped from the lookup page into absolute URLs. */
    const SCOPUS_ORIGIN = 'https://www.scopus.com';

    /**
     * Look up an author's h-index.
     *
     * Query parameters (GET):
     *  - name:  author name, required.
     *  - affil: affiliation, optional.
     *  - debug: when exactly 1, a "debug" entry describing the last fetched page is added.
     *
     * @return mixed Whatever json() returns for the result payload. The payload always has
     *               "code" (1 = success, 0 = failure); failures carry "msg", successes carry
     *               "name", "affil", "h_index_scopus" and "source".
     */
    public function get_hindex()
    {
        $name = $this->readQueryString('get.name');
        $affil = $this->readQueryString('get.affil');
        $rawDebug = input('get.debug', 0);
        $debug = is_scalar($rawDebug) && (int) $rawDebug === 1;

        // Validate BEFORE creating the cookie jar, so a rejected request leaves no temp file behind.
        if ($name === '') {
            return json(['code' => 0, 'msg' => '请输入作者姓名']);
        }

        $cookieFile = tempnam(sys_get_temp_dir(), 'scopus_cookie_');
        if ($cookieFile === false) {
            // httpRequest() skips cookie persistence when given an empty path.
            $cookieFile = '';
        }

        try {
            return json($this->lookupHIndex($name, $affil, $debug, $cookieFile));
        } finally {
            // Single cleanup point for every exit path, including exceptions.
            if ($cookieFile !== '' && is_file($cookieFile)) {
                @unlink($cookieFile);
            }
        }
    }

    /**
     * Read a request parameter as a trimmed string.
     *
     * @param string $key Parameter key understood by input(), e.g. "get.name".
     * @return string Trimmed value; '' when the parameter is missing or not a scalar (e.g. name[]=x).
     */
    private function readQueryString($key)
    {
        $value = input($key);

        return is_scalar($value) ? trim((string) $value) : '';
    }

    /**
     * Run the Scopus lookup flow and build the response payload.
     *
     * @param string $name       Author name (non-empty).
     * @param string $affil      Affiliation, may be ''.
     * @param bool   $debug      Whether to attach debug information.
     * @param string $cookieFile Cookie jar path shared by both Scopus requests, or ''.
     * @return array Response payload (see get_hindex()).
     */
    private function lookupHIndex($name, $affil, $debug, $cookieFile)
    {
        // 1) Fetch the free-lookup page to discover the real submit URL and the hidden fields.
        $lookupRes = $this->httpRequest(self::SCOPUS_LOOKUP_URL, null, true, '', $cookieFile);
        if (!$lookupRes['ok']) {
            return $this->withDebug(
                ['code' => 0, 'msg' => '访问 Scopus 失败:' . $lookupRes['msg']],
                $lookupRes,
                $debug
            );
        }

        $formInfo = $this->extractScopusLookupForm($lookupRes['body']);
        if (empty($formInfo['action'])) {
            return $this->withDebug(
                ['code' => 0, 'msg' => 'Scopus 页面结构已变化,未找到查询表单'],
                $lookupRes,
                $debug
            );
        }

        // 2) Submit the query (name + affiliation) together with the hidden fields.
        $postData = $formInfo['hidden_fields'];
        $postData['authLast'] = $name;
        $postData['affil'] = $affil;
        $searchRes = $this->httpRequest($formInfo['action'], $postData, true, self::SCOPUS_LOOKUP_URL, $cookieFile);
        if (!$searchRes['ok']) {
            return $this->withDebug(
                ['code' => 0, 'msg' => '查询 Scopus 失败:' . $searchRes['msg']],
                $searchRes,
                $debug
            );
        }

        $blockMsg = $this->detectScopusBlocking($searchRes['body']);
        if ($blockMsg !== '') {
            $ret = ['code' => 0, 'msg' => $blockMsg];
            $fallback = $this->fallbackByOpenAlex($name, $affil);
            if ($fallback !== null) {
                // The fallback payload wins (code => 1); only the message records the downgrade.
                $ret = array_merge($fallback, [
                    'msg' => $blockMsg . ',已自动降级 OpenAlex 结果'
                ]);
            }

            return $this->withDebug($ret, $searchRes, $debug);
        }

        // 3) Extract the h-index from the result page (a number close to the "h-index" keyword).
        $hIndex = $this->extractHIndexFromHtml($searchRes['body']);
        if ($hIndex === null) {
            return $this->withDebug(
                [
                    'code' => 0,
                    'msg' => '未从 Scopus 结果页解析到 H 指数(可能需要人工登录或页面结构调整)'
                ],
                $searchRes,
                $debug
            );
        }

        return $this->withDebug(
            [
                'code' => 1,
                'name' => $name,
                'affil' => $affil,
                'h_index_scopus' => $hIndex,
                'source' => 'scopus_freelookup',
            ],
            $searchRes,
            $debug
        );
    }

    /**
     * Append the "debug" entry to a payload when debugging is enabled.
     *
     * @param array $payload  Response payload.
     * @param array $response Result of httpRequest() the debug info should describe.
     * @param bool  $debug    Whether debug output was requested.
     * @return array The payload, with "debug" appended as the last key when enabled.
     */
    private function withDebug(array $payload, array $response, $debug)
    {
        if ($debug) {
            $payload['debug'] = $this->buildDebugInfo($response['url'], $response['http_code'], $response['body']);
        }

        return $payload;
    }

    /**
     * Perform a GET (or POST when $postData is an array) with cURL.
     *
     * @param string     $url            Target URL.
     * @param array|null $postData       Form fields; an array switches the request to POST.
     * @param bool       $followLocation Whether to follow redirects (at most 8).
     * @param string     $referer        Referer header, omitted when empty.
     * @param string     $cookieFile     Cookie jar path used for reading and writing, omitted when empty.
     * @return array{ok:bool,msg:string,body:string,http_code:int,url:string}
     *         "ok" is false on transport errors, HTTP status >= 400 or status 0.
     */
    private function httpRequest($url, $postData = null, $followLocation = true, $referer = '', $cookieFile = '')
    {
        $handle = curl_init();
        if ($handle === false) {
            return ['ok' => false, 'msg' => 'curl_init() failed', 'body' => '', 'http_code' => 0, 'url' => (string) $url];
        }

        $options = [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            // NOTE(review): TLS peer/host verification is switched off, which allows
            // man-in-the-middle tampering with the scraped pages. Kept as-is because enabling
            // it depends on the host's CA bundle; consider turning it on.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_FOLLOWLOCATION => $followLocation,
            CURLOPT_MAXREDIRS => 8,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
            // Empty string = advertise every encoding cURL supports and decode transparently.
            CURLOPT_ENCODING => '',
            CURLOPT_HTTPHEADER => [
                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language: zh-CN,zh;q=0.9,en;q=0.8',
            ],
        ];
        if (!empty($referer)) {
            $options[CURLOPT_REFERER] = $referer;
        }
        if (!empty($cookieFile)) {
            $options[CURLOPT_COOKIEJAR] = $cookieFile;
            $options[CURLOPT_COOKIEFILE] = $cookieFile;
        }
        if (is_array($postData)) {
            $options[CURLOPT_POST] = true;
            // Explicit '&' so the body does not depend on the arg_separator.output ini setting.
            $options[CURLOPT_POSTFIELDS] = http_build_query($postData, '', '&');
        }
        curl_setopt_array($handle, $options);

        $body = curl_exec($handle);
        $errno = curl_errno($handle);
        $error = curl_error($handle);
        $httpCode = (int) curl_getinfo($handle, CURLINFO_HTTP_CODE);
        $finalUrl = (string) curl_getinfo($handle, CURLINFO_EFFECTIVE_URL);
        curl_close($handle);

        if ($errno !== 0 || $error !== '') {
            if ($errno === CURLE_TOO_MANY_REDIRECTS) {
                // Detected via the error code rather than libcurl's English message text.
                $msg = 'Scopus 跳转过多(可能触发登录/验证页面),请稍后重试或先在浏览器登录 Scopus';
            } else {
                $msg = $error !== '' ? $error : (string) curl_strerror($errno);
            }

            return ['ok' => false, 'msg' => $msg, 'body' => '', 'http_code' => $httpCode, 'url' => $finalUrl];
        }

        $ok = $httpCode !== 0 && $httpCode < 400;

        return [
            'ok' => $ok,
            'msg' => $ok ? '' : 'HTTP ' . $httpCode,
            'body' => (string) $body,
            'http_code' => $httpCode,
            'url' => $finalUrl,
        ];
    }

    /**
     * Detect whether a Scopus response is a login wall or a CAPTCHA page.
     *
     * @param string $html Response body.
     * @return string User-facing reason, or '' when no blocking marker was found.
     */
    private function detectScopusBlocking($html)
    {
        if (empty($html)) {
            return '';
        }

        $text = strtolower(strip_tags($html));

        // 'sign in' also covers the longer phrase 'institutional sign in'.
        if (strpos($text, 'sign in') !== false) {
            return 'Scopus 返回登录页,当前环境未授权访问作者详情页面';
        }
        if (strpos($text, 'captcha') !== false || strpos($text, 'are you a robot') !== false) {
            return 'Scopus 触发了人机验证,当前接口无法自动通过';
        }

        return '';
    }

    /**
     * Summarise a fetched page for the optional "debug" response entry.
     *
     * @param string     $finalUrl Effective URL after redirects.
     * @param int|string $httpCode HTTP status code.
     * @param string     $html     Response body.
     * @return array{final_url:string,http_code:int,page_snippet:string,contains_signin:int,contains_captcha:int}
     */
    private function buildDebugInfo($finalUrl, $httpCode, $html)
    {
        $plain = html_entity_decode(strip_tags((string) $html), ENT_QUOTES, 'UTF-8');
        $normalized = preg_replace('/\s+/u', ' ', $plain);
        if (!is_string($normalized)) {
            // preg_replace() yields null when the subject is not valid UTF-8; report an empty
            // snippet instead of passing null on to the string functions below.
            $normalized = '';
        }

        return [
            'final_url' => (string) $finalUrl,
            'http_code' => (int) $httpCode,
            'page_snippet' => mb_substr($normalized, 0, 300, 'UTF-8'),
            'contains_signin' => stripos($normalized, 'sign in') !== false ? 1 : 0,
            'contains_captcha' => stripos($normalized, 'captcha') !== false ? 1 : 0,
        ];
    }

    /**
     * Extract the submit URL and the hidden fields of the author lookup form.
     *
     * Among all <form> elements that have an action attribute, the first one whose markup
     * mentions "author" is preferred; otherwise the first form is used.
     *
     * @param string $html Lookup page HTML.
     * @return array{action:string,hidden_fields:array<string,string>} "action" is '' when no form was found.
     */
    private function extractScopusLookupForm($html)
    {
        $ret = [
            'action' => '',
            'hidden_fields' => [],
        ];
        if (empty($html)) {
            return $ret;
        }

        $formPattern = '/<form[^>]*action=["\']([^"\']+)["\'][^>]*>.*?<\/form>/is';
        if (!preg_match_all($formPattern, $html, $forms, PREG_SET_ORDER)) {
            return $ret;
        }

        // Prefer the form that mentions "author" to reduce mismatches with unrelated forms.
        $chosen = $forms[0];
        foreach ($forms as $candidate) {
            if (stripos($candidate[0], 'author') !== false) {
                $chosen = $candidate;
                break;
            }
        }

        // Attribute values are HTML-escaped in the markup ("&amp;" etc.); decode before use.
        $action = html_entity_decode(trim($chosen[1]), ENT_QUOTES, 'UTF-8');
        if (substr($action, 0, 2) === '//') {
            // Protocol-relative URL: only the scheme is missing.
            $action = 'https:' . $action;
        } elseif (!preg_match('/^https?:\/\//i', $action)) {
            $action = self::SCOPUS_ORIGIN . (substr($action, 0, 1) === '/' ? '' : '/') . $action;
        }
        $ret['action'] = $action;

        if (preg_match_all('/<input[^>]*type=["\']hidden["\'][^>]*>/is', $chosen[0], $inputs)) {
            foreach ($inputs[0] as $inputTag) {
                // The look-behind keeps attributes such as data-name= from being read as name=.
                if (!preg_match('/(?<![\w-])name\s*=\s*(["\'])(.*?)\1/is', $inputTag, $nameMatch)) {
                    continue;
                }
                $fieldName = trim(html_entity_decode($nameMatch[2], ENT_QUOTES, 'UTF-8'));
                if ($fieldName === '') {
                    continue;
                }

                $fieldVal = '';
                if (preg_match('/(?<![\w-])value\s*=\s*(["\'])(.*?)\1/is', $inputTag, $valMatch)) {
                    $fieldVal = html_entity_decode($valMatch[2], ENT_QUOTES, 'UTF-8');
                }
                $ret['hidden_fields'][$fieldName] = $fieldVal;
            }
        }

        return $ret;
    }

    /**
     * Find the h-index in a result page.
     *
     * Looks for a 1-3 digit number within 20 non-digit characters after the "h-index"
     * keyword, then for one within 20 non-digit characters before it.
     *
     * @param string $html Result page HTML.
     * @return int|null The h-index, or null when none could be located.
     */
    private function extractHIndexFromHtml($html)
    {
        if (empty($html)) {
            return null;
        }

        $text = html_entity_decode(strip_tags($html), ENT_QUOTES, 'UTF-8');
        $text = preg_replace('/\s+/u', ' ', $text);
        if (!is_string($text)) {
            // Not valid UTF-8; the /u patterns below could not match either.
            return null;
        }

        // The digit boundaries stop a fragment of a longer number (e.g. "234" out of "1234")
        // from being reported as the h-index.
        $patterns = [
            '/h[\-\s]?index[^0-9]{0,20}([0-9]{1,3})(?![0-9])/iu',
            '/(?<![0-9])([0-9]{1,3})[^0-9]{0,20}h[\-\s]?index/iu',
        ];
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $text, $m)) {
                return (int) $m[1];
            }
        }

        return null;
    }

    /**
     * Fallback lookup through the OpenAlex authors API.
     *
     * Picks the first search hit whose institution names contain $affil (case-insensitive),
     * or simply the first hit when $affil is empty or nothing matches.
     *
     * @param string $name  Author name to search for.
     * @param string $affil Affiliation used to disambiguate hits, may be ''.
     * @return array|null Success payload with source "openalex_fallback", or null when the
     *                    request failed or returned no usable result.
     */
    private function fallbackByOpenAlex($name, $affil)
    {
        // OpenAlex pages with "per-page"; "limit" is not one of its documented parameters.
        $url = 'https://api.openalex.org/authors?' . http_build_query(
            ['search' => (string) $name, 'per-page' => 8],
            '',
            '&'
        );
        $res = $this->httpRequest($url, null, true);
        if (!$res['ok']) {
            return null;
        }

        $data = json_decode($res['body'], true);
        if (!is_array($data) || !isset($data['results']) || !is_array($data['results'])) {
            return null;
        }
        $list = array_values(array_filter($data['results'], 'is_array'));
        if (empty($list)) {
            return null;
        }

        $match = null;
        $targetAffil = mb_strtolower(trim((string) $affil), 'UTF-8');
        if ($targetAffil !== '') {
            foreach ($list as $item) {
                foreach ($this->openAlexInstitutionNames($item) as $instName) {
                    if (strpos(mb_strtolower($instName, 'UTF-8'), $targetAffil) !== false) {
                        $match = $item;
                        break 2;
                    }
                }
            }
        }
        if ($match === null) {
            $match = $list[0];
        }

        $institutions = $this->openAlexInstitutionNames($match);

        return [
            'code' => 1,
            'name' => $match['display_name'] ?? $name,
            'affil' => !empty($institutions) ? $institutions[0] : $affil,
            // NOTE(review): OpenAlex's summary_stats documents h_index / i10_index; an
            // "h_index_scopus" entry is presumably never present, so this stays null.
            // The key is kept so the response shape does not change — confirm with consumers.
            'h_index_scopus' => $match['summary_stats']['h_index_scopus'] ?? null,
            'h_index_openalex' => $match['summary_stats']['h_index'] ?? null,
            'source' => 'openalex_fallback',
        ];
    }

    /**
     * Collect the institution display names of one OpenAlex author record.
     *
     * Reads affiliations[].institution.display_name (the nested shape), the flat
     * affiliations[].display_name shape the earlier code expected, and
     * last_known_institutions[].display_name.
     *
     * @param array $author One entry of the OpenAlex "results" list.
     * @return string[] Non-empty names, affiliations first.
     */
    private function openAlexInstitutionNames(array $author)
    {
        $names = [];

        $affiliations = $author['affiliations'] ?? [];
        if (is_array($affiliations)) {
            foreach ($affiliations as $entry) {
                $candidate = $entry['institution']['display_name'] ?? ($entry['display_name'] ?? '');
                if (is_string($candidate) && $candidate !== '') {
                    $names[] = $candidate;
                }
            }
        }

        $lastKnown = $author['last_known_institutions'] ?? [];
        if (is_array($lastKnown)) {
            foreach ($lastKnown as $entry) {
                $candidate = $entry['display_name'] ?? '';
                if (is_string($candidate) && $candidate !== '') {
                    $names[] = $candidate;
                }
            }
        }

        return $names;
    }
}