diff --git a/application/api/controller/Author.php b/application/api/controller/Author.php
new file mode 100644
index 00000000..2ea83479
--- /dev/null
+++ b/application/api/controller/Author.php
@@ -0,0 +1,303 @@
+ 0, 'msg' => '请输入作者姓名']);
+        }
+
+        // 1) Fetch the freelookup page to obtain the real submit URL and the hidden form fields.
+        $lookupUrl = 'https://www.scopus.com/freelookup/form/author.uri?zone=TopNavBar&origin=NO%20ORIGIN%20DEFINED';
+        $lookupRes = $this->httpRequest($lookupUrl, null, true, '', $cookieFile);
+        if (!$lookupRes['ok']) {
+            @unlink($cookieFile);
+            $ret = ['code' => 0, 'msg' => '访问 Scopus 失败:' . $lookupRes['msg']];
+            if ($debug === 1) {
+                $ret['debug'] = $this->buildDebugInfo($lookupRes['url'], $lookupRes['http_code'], $lookupRes['body']);
+            }
+            return json($ret);
+        }
+
+        $formInfo = $this->extractScopusLookupForm($lookupRes['body']);
+        if (empty($formInfo['action'])) {
+            @unlink($cookieFile);
+            $ret = ['code' => 0, 'msg' => 'Scopus 页面结构已变化,未找到查询表单'];
+            if ($debug === 1) {
+                $ret['debug'] = $this->buildDebugInfo($lookupRes['url'], $lookupRes['http_code'], $lookupRes['body']);
+            }
+            return json($ret);
+        }
+
+        // 2) Build the query parameters (name + affiliation) and submit them together with the hidden fields.
+        $postData = $formInfo['hidden_fields'];
+        $postData['authLast'] = $name;
+        $postData['affil'] = $affil;
+
+        $searchRes = $this->httpRequest($formInfo['action'], $postData, true, $lookupUrl, $cookieFile);
+        if (!$searchRes['ok']) {
+            @unlink($cookieFile);
+            $ret = ['code' => 0, 'msg' => '查询 Scopus 失败:' . $searchRes['msg']];
+            if ($debug === 1) {
+                $ret['debug'] = $this->buildDebugInfo($searchRes['url'], $searchRes['http_code'], $searchRes['body']);
+            }
+            return json($ret);
+        }
+
+        $blockMsg = $this->detectScopusBlocking($searchRes['body']);
+        if (!empty($blockMsg)) {
+            @unlink($cookieFile);
+            $ret = ['code' => 0, 'msg' => $blockMsg];
+            $fallback = $this->fallbackByOpenAlex($name, $affil);
+            if ($fallback !== null) {
+                $ret = array_merge($fallback, [
+                    'msg' => $blockMsg . ',已自动降级 OpenAlex 结果'
+                ]);
+            }
+            if ($debug === 1) {
+                $ret['debug'] = $this->buildDebugInfo($searchRes['url'], $searchRes['http_code'], $searchRes['body']);
+            }
+            return json($ret);
+        }
+
+        // 3) Extract the h-index from the result page (prefers a number near the "h-index" keyword).
+        $hIndex = $this->extractHIndexFromHtml($searchRes['body']);
+        if ($hIndex === null) {
+            @unlink($cookieFile);
+            $ret = [
+                'code' => 0,
+                'msg' => '未从 Scopus 结果页解析到 H 指数(可能需要人工登录或页面结构调整)'
+            ];
+            if ($debug === 1) {
+                $ret['debug'] = $this->buildDebugInfo($searchRes['url'], $searchRes['http_code'], $searchRes['body']);
+            }
+            return json($ret);
+        }
+
+        @unlink($cookieFile);
+
+        $ret = [
+            'code' => 1,
+            'name' => $name,
+            'affil' => $affil,
+            'h_index_scopus' => $hIndex,
+            'source' => 'scopus_freelookup',
+        ];
+        if ($debug === 1) {
+            $ret['debug'] = $this->buildDebugInfo($searchRes['url'], $searchRes['http_code'], $searchRes['body']);
+        }
+        return json($ret);
+    }
+
+    /**
+     * Perform one HTTP request with cURL and return a normalized result array.
+     *
+     * A GET is sent by default; when $postData is an array it is URL-encoded
+     * and sent as a POST body. TLS peer and host verification are enabled so
+     * that a forged endpoint cannot feed content or cookies into the lookup.
+     *
+     * @param string     $url            Target URL.
+     * @param array|null $postData       Form fields to POST, or null for a GET.
+     * @param bool       $followLocation Whether redirects are followed (at most 8).
+     * @param string     $referer        Referer header value; not sent when empty.
+     * @param string     $cookieFile     File used both to read and to store cookies; not used when empty.
+     *
+     * @return array Keys: 'ok' (bool), 'msg' (string, empty on success), 'body' (string),
+     *               'http_code' (int), 'url' (string, effective URL after redirects).
+     */
+    private function httpRequest($url, $postData = null, $followLocation = true, $referer = '', $cookieFile = '')
+    {
+        $ch = curl_init();
+        if ($ch === false) {
+            // Without a handle none of the calls below can work; report it like any other transport failure.
+            return ['ok' => false, 'msg' => 'curl_init failed', 'body' => '', 'http_code' => 0, 'url' => (string) $url];
+        }
+
+        $options = [
+            CURLOPT_URL => $url,
+            CURLOPT_RETURNTRANSFER => true,
+            // Verify the certificate chain and that the certificate matches the host name.
+            CURLOPT_SSL_VERIFYPEER => true,
+            CURLOPT_SSL_VERIFYHOST => 2,
+            CURLOPT_FOLLOWLOCATION => $followLocation,
+            CURLOPT_MAXREDIRS => 8,
+            CURLOPT_TIMEOUT => 30,
+            CURLOPT_CONNECTTIMEOUT => 15,
+            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
+            // Empty string lets cURL advertise every content encoding it supports.
+            CURLOPT_ENCODING => '',
+            CURLOPT_HTTPHEADER => [
+                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+                'Accept-Language: zh-CN,zh;q=0.9,en;q=0.8',
+            ],
+        ];
+
+        if (!empty($referer)) {
+            $options[CURLOPT_REFERER] = $referer;
+        }
+
+        if (!empty($cookieFile)) {
+            // Same file for reading and writing so cookies carry over between requests.
+            $options[CURLOPT_COOKIEJAR] = $cookieFile;
+            $options[CURLOPT_COOKIEFILE] = $cookieFile;
+        }
+
+        if (is_array($postData)) {
+            $options[CURLOPT_POST] = true;
+            $options[CURLOPT_POSTFIELDS] = http_build_query($postData);
+        }
+
+        curl_setopt_array($ch, $options);
+        $body = curl_exec($ch);
+        $errno = curl_errno($ch);
+        $error = curl_error($ch);
+        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
+        $finalUrl = (string) curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
+        curl_close($ch);
+
+        if ($errno !== 0 || $body === false) {
+            // Use the numeric error code instead of matching libcurl's English error text.
+            if ($errno === CURLE_TOO_MANY_REDIRECTS) {
+                return [
+                    'ok' => false,
+                    'msg' => 'Scopus 跳转过多(可能触发登录/验证页面),请稍后重试或先在浏览器登录 Scopus',
+                    'body' => '',
+                    'http_code' => $httpCode,
+                    'url' => $finalUrl
+                ];
+            }
+            $msg = $error !== '' ? $error : 'cURL error ' . $errno;
+            return ['ok' => false, 'msg' => $msg, 'body' => '', 'http_code' => $httpCode, 'url' => $finalUrl];
+        }
+
+        if ($httpCode >= 400 || $httpCode === 0) {
+            return ['ok' => false, 'msg' => 'HTTP ' . $httpCode, 'body' => (string) $body, 'http_code' => $httpCode, 'url' => $finalUrl];
+        }
+
+        return ['ok' => true, 'msg' => '', 'body' => (string) $body, 'http_code' => $httpCode, 'url' => $finalUrl];
+    }
+
+    /**
+     * Check whether a response page looks like a login page or a CAPTCHA challenge.
+     *
+     * The check is a case-insensitive substring search on the tag-stripped page text.
+     *
+     * @param string $html Raw response body.
+     *
+     * @return string A user-facing message when blocking is detected, otherwise an empty string.
+     */
+    private function detectScopusBlocking($html)
+    {
+        if (empty($html)) {
+            return '';
+        }
+
+        $text = strtolower(strip_tags($html));
+        // 'sign in' also matches 'institutional sign in', so one search covers both phrases.
+        if (strpos($text, 'sign in') !== false) {
+            return 'Scopus 返回登录页,当前环境未授权访问作者详情页面';
+        }
+        if (strpos($text, 'captcha') !== false || strpos($text, 'are you a robot') !== false) {
+            return 'Scopus 触发了人机验证,当前接口无法自动通过';
+        }
+
+        return '';
+    }
+
+    /**
+     * Build a small diagnostic summary of a response for the optional 'debug' field.
+     *
+     * @param string $finalUrl Effective URL of the response.
+     * @param int    $httpCode HTTP status code of the response.
+     * @param string $html     Raw response body.
+     *
+     * @return array Keys: 'final_url', 'http_code', 'page_snippet' (first 300 characters of the
+     *               tag-stripped, whitespace-collapsed text), 'contains_signin' and
+     *               'contains_captcha' (1 or 0).
+     */
+    private function buildDebugInfo($finalUrl, $httpCode, $html)
+    {
+        $plain = html_entity_decode(strip_tags((string) $html), ENT_QUOTES, 'UTF-8');
+
+        // With the /u modifier preg_replace returns null when the subject is not valid UTF-8;
+        // fall back to a byte-wise match so the snippet and flags are still produced.
+        $normalized = preg_replace('/\s+/u', ' ', $plain);
+        if ($normalized === null) {
+            $normalized = (string) preg_replace('/\s+/', ' ', $plain);
+        }
+        $snippet = mb_substr($normalized, 0, 300, 'UTF-8');
+
+        return [
+            'final_url' => (string) $finalUrl,
+            'http_code' => (int) $httpCode,
+            'page_snippet' => $snippet,
+            'contains_signin' => stripos($normalized, 'sign in') !== false ? 1 : 0,
+            'contains_captcha' => stripos($normalized, 'captcha') !== false ? 1 : 0,
+        ];
+    }
+
+    private function extractScopusLookupForm($html)
+    {
+        $ret = [
+            'action' => '',
+            'hidden_fields' => [],
+        ];
+
+        if (empty($html)) {
+            return $ret;
+        }
+
+        // Prefer the form that mentions "author" to reduce false matches while parsing.
+        if (preg_match('/