0, 'msg' => '请输入作者姓名']);
        }

        // 1) Fetch the freelookup page to obtain the real submit URL and the hidden form fields.
        $lookupUrl = 'https://www.scopus.com/freelookup/form/author.uri?zone=TopNavBar&origin=NO%20ORIGIN%20DEFINED';
        $lookupRes = $this->httpRequest($lookupUrl, null, true, '', $cookieFile);
        if (!$lookupRes['ok']) {
            @unlink($cookieFile);
            $ret = ['code' => 0, 'msg' => '访问 Scopus 失败:' . $lookupRes['msg']];
            if ($debug === 1) {
                $ret['debug'] = $this->buildDebugInfo($lookupRes['url'], $lookupRes['http_code'], $lookupRes['body']);
            }
            return json($ret);
        }

        $formInfo = $this->extractScopusLookupForm($lookupRes['body']);
        if (empty($formInfo['action'])) {
            @unlink($cookieFile);
            $ret = ['code' => 0, 'msg' => 'Scopus 页面结构已变化,未找到查询表单'];
            if ($debug === 1) {
                $ret['debug'] = $this->buildDebugInfo($lookupRes['url'], $lookupRes['http_code'], $lookupRes['body']);
            }
            return json($ret);
        }

        // 2) Build the query (author name + affiliation) on top of the hidden fields and submit it.
        $postData = $formInfo['hidden_fields'];
        $postData['authLast'] = $name;
        $postData['affil'] = $affil;
        $searchRes = $this->httpRequest($formInfo['action'], $postData, true, $lookupUrl, $cookieFile);
        if (!$searchRes['ok']) {
            @unlink($cookieFile);
            $ret = ['code' => 0, 'msg' => '查询 Scopus 失败:' . $searchRes['msg']];
            if ($debug === 1) {
                $ret['debug'] = $this->buildDebugInfo($searchRes['url'], $searchRes['http_code'], $searchRes['body']);
            }
            return json($ret);
        }

        $blockMsg = $this->detectScopusBlocking($searchRes['body']);
        if (!empty($blockMsg)) {
            @unlink($cookieFile);
            $ret = ['code' => 0, 'msg' => $blockMsg];
            // Scopus refused to serve the result page: try OpenAlex and, if it answers,
            // return its payload with a message explaining the downgrade.
            $fallback = $this->fallbackByOpenAlex($name, $affil);
            if ($fallback !== null) {
                $ret = array_merge($fallback, [
                    'msg' => $blockMsg . ',已自动降级 OpenAlex 结果'
                ]);
            }
            if ($debug === 1) {
                $ret['debug'] = $this->buildDebugInfo($searchRes['url'], $searchRes['http_code'], $searchRes['body']);
            }
            return json($ret);
        }

        // 3) Extract the h-index from the result page (a number close to the "h-index" keyword).
        $hIndex = $this->extractHIndexFromHtml($searchRes['body']);
        if ($hIndex === null) {
            @unlink($cookieFile);
            $ret = [
                'code' => 0,
                'msg' => '未从 Scopus 结果页解析到 H 指数(可能需要人工登录或页面结构调整)'
            ];
            if ($debug === 1) {
                $ret['debug'] = $this->buildDebugInfo($searchRes['url'], $searchRes['http_code'], $searchRes['body']);
            }
            return json($ret);
        }

        @unlink($cookieFile);
        $ret = [
            'code' => 1,
            'name' => $name,
            'affil' => $affil,
            'h_index_scopus' => $hIndex,
            'source' => 'scopus_freelookup',
        ];
        if ($debug === 1) {
            $ret['debug'] = $this->buildDebugInfo($searchRes['url'], $searchRes['http_code'], $searchRes['body']);
        }
        return json($ret);
    }

    /**
     * Perform a GET (or, when $postData is an array, a form-encoded POST) request with cURL.
     *
     * @param string     $url            Target URL.
     * @param array|null $postData       Form fields; any non-array value results in a GET.
     * @param bool       $followLocation Whether redirects are followed (at most 8).
     * @param string     $referer        Referer header value; omitted when empty.
     * @param string     $cookieFile     Cookie jar path used for both reading and writing; omitted when empty.
     *
     * @return array{ok: bool, msg: string, body: string, http_code: int, url: string}
     *         `ok` is false on transport errors and on HTTP status 0 or >= 400.
     *         `url` is the effective URL after redirects.
     */
    private function httpRequest($url, $postData = null, $followLocation = true, $referer = '', $cookieFile = '')
    {
        $ch = curl_init();
        if ($ch === false) {
            return ['ok' => false, 'msg' => 'curl_init failed', 'body' => '', 'http_code' => 0, 'url' => (string) $url];
        }

        $options = [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            // Certificate and host verification stay enabled: session cookies and scraped
            // data travel over this connection. If the host lacks a CA bundle, configure
            // `curl.cainfo` in php.ini rather than switching verification off.
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_FOLLOWLOCATION => $followLocation,
            CURLOPT_MAXREDIRS => 8,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
            // Empty string = advertise every encoding libcurl supports and decode transparently.
            CURLOPT_ENCODING => '',
            CURLOPT_HTTPHEADER => [
                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language: zh-CN,zh;q=0.9,en;q=0.8',
            ],
        ];
        if (!empty($referer)) {
            $options[CURLOPT_REFERER] = $referer;
        }
        if (!empty($cookieFile)) {
            $options[CURLOPT_COOKIEJAR] = $cookieFile;
            $options[CURLOPT_COOKIEFILE] = $cookieFile;
        }
        if (is_array($postData)) {
            $options[CURLOPT_POST] = true;
            $options[CURLOPT_POSTFIELDS] = http_build_query($postData);
        }
        curl_setopt_array($ch, $options);

        $body = curl_exec($ch);
        $errno = curl_errno($ch);
        $error = curl_error($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $finalUrl = (string) curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        curl_close($ch);

        if ($body === false || $errno !== 0 || $error !== '') {
            // Use the error number instead of parsing libcurl's human-readable message.
            if ($errno === CURLE_TOO_MANY_REDIRECTS) {
                return [
                    'ok' => false,
                    'msg' => 'Scopus 跳转过多(可能触发登录/验证页面),请稍后重试或先在浏览器登录 Scopus',
                    'body' => '',
                    'http_code' => $httpCode,
                    'url' => $finalUrl
                ];
            }
            return [
                'ok' => false,
                'msg' => $error !== '' ? $error : 'cURL error ' . $errno,
                'body' => '',
                'http_code' => $httpCode,
                'url' => $finalUrl
            ];
        }

        if ($httpCode >= 400 || $httpCode === 0) {
            return ['ok' => false, 'msg' => 'HTTP ' . $httpCode, 'body' => (string) $body, 'http_code' => $httpCode, 'url' => $finalUrl];
        }

        return ['ok' => true, 'msg' => '', 'body' => (string) $body, 'http_code' => $httpCode, 'url' => $finalUrl];
    }

    /**
     * Inspect a response body for signs of a login wall or a CAPTCHA.
     *
     * The check is a plain substring search on the lower-cased, tag-stripped text.
     *
     * @param string $html Response body.
     *
     * @return string User-facing message describing the block, or '' when none is detected.
     */
    private function detectScopusBlocking($html)
    {
        if (empty($html)) {
            return '';
        }

        $text = strtolower(strip_tags($html));

        // 'sign in' also covers 'institutional sign in', so one test is enough.
        if (strpos($text, 'sign in') !== false) {
            return 'Scopus 返回登录页,当前环境未授权访问作者详情页面';
        }
        if (strpos($text, 'captcha') !== false || strpos($text, 'are you a robot') !== false) {
            return 'Scopus 触发了人机验证,当前接口无法自动通过';
        }

        return '';
    }

    /**
     * Build the diagnostic payload attached to responses when debugging is enabled.
     *
     * @param string $finalUrl Effective URL of the request.
     * @param int    $httpCode HTTP status code.
     * @param string $html     Response body.
     *
     * @return array{final_url: string, http_code: int, page_snippet: string, contains_signin: int, contains_captcha: int}
     *         `page_snippet` is the first 300 characters of the tag-stripped,
     *         entity-decoded, whitespace-collapsed body.
     */
    private function buildDebugInfo($finalUrl, $httpCode, $html)
    {
        $normalized = html_entity_decode(strip_tags((string) $html), ENT_QUOTES, 'UTF-8');

        // preg_replace() with /u yields null when the subject is not valid UTF-8;
        // keep the un-collapsed text in that case instead of passing null on.
        $collapsed = preg_replace('/\s+/u', ' ', $normalized);
        if ($collapsed !== null) {
            $normalized = $collapsed;
        }

        return [
            'final_url' => (string) $finalUrl,
            'http_code' => (int) $httpCode,
            'page_snippet' => mb_substr($normalized, 0, 300, 'UTF-8'),
            'contains_signin' => stripos($normalized, 'sign in') !== false ? 1 : 0,
            'contains_captcha' => stripos($normalized, 'captcha') !== false ? 1 : 0,
        ];
    }

    /**
     * Locate the author lookup form in the freelookup page.
     *
     * Among all `<form>` elements carrying an `action` attribute, the first one whose
     * markup mentions the `authLast` field (the field the caller posts) is preferred;
     * otherwise the first such form is used.
     *
     * @param string $html Page markup.
     *
     * @return array{action: string, hidden_fields: array<string, string>}
     *         `action` is an absolute URL, or '' when no form was found.
     *         `hidden_fields` maps each hidden input name to its entity-decoded value.
     */
    private function extractScopusLookupForm($html)
    {
        $ret = [
            'action' => '',
            'hidden_fields' => [],
        ];
        if (empty($html)) {
            return $ret;
        }

        // The pattern must be anchored on the <form ...> tag itself; matching a bare
        // `action=` would pick up any attribute of that name anywhere in the page.
        $formPattern = '/<form\b[^>]*\saction=["\']([^"\']+)["\'][^>]*>.*?<\/form>/is';
        if (!preg_match_all($formPattern, $html, $forms, PREG_SET_ORDER)) {
            return $ret;
        }

        $chosen = $forms[0];
        foreach ($forms as $form) {
            if (stripos($form[0], 'authLast') !== false) {
                $chosen = $form;
                break;
            }
        }

        // Attribute values are HTML-escaped in the markup (e.g. `&amp;` inside a URL).
        $action = trim(html_entity_decode($chosen[1], ENT_QUOTES, 'UTF-8'));
        if (strpos($action, '//') === 0) {
            // Protocol-relative URL.
            $action = 'https:' . $action;
        } elseif (!preg_match('/^https?:\/\//i', $action)) {
            $action = 'https://www.scopus.com' . (substr($action, 0, 1) === '/' ? '' : '/') . $action;
        }
        $ret['action'] = $action;

        // Match the whole <input ...> tag so that `name` is found regardless of whether
        // it is written before or after `type="hidden"`.
        if (preg_match_all('/<input\b[^>]*\stype=["\']hidden["\'][^>]*>/is', $chosen[0], $inputs)) {
            foreach ($inputs[0] as $inputTag) {
                if (!preg_match('/\sname=["\']([^"\']+)["\']/i', $inputTag, $nameMatch)) {
                    continue;
                }
                $fieldName = trim($nameMatch[1]);
                $fieldVal = '';
                if (preg_match('/\svalue=["\']([^"\']*)["\']/i', $inputTag, $valMatch)) {
                    $fieldVal = html_entity_decode($valMatch[1], ENT_QUOTES, 'UTF-8');
                }
                $ret['hidden_fields'][$fieldName] = $fieldVal;
            }
        }

        return $ret;
    }

    /**
     * Extract an h-index value from a result page.
     *
     * Looks for a standalone 1-3 digit number within 20 non-digit characters after
     * (preferred) or before the keyword "h-index" / "h index" / "hindex".
     *
     * @param string $html Page markup.
     *
     * @return int|null The parsed value, or null when nothing matched.
     */
    private function extractHIndexFromHtml($html)
    {
        if (empty($html)) {
            return null;
        }

        $text = html_entity_decode(strip_tags($html), ENT_QUOTES, 'UTF-8');
        // preg_replace() with /u yields null on invalid UTF-8; keep the raw text then.
        $collapsed = preg_replace('/\s+/u', ' ', $text);
        if ($collapsed !== null) {
            $text = $collapsed;
        }

        // The digit boundaries stop the capture from being a slice of a longer number
        // (e.g. the "202" of "h-index 2020" or the "024" of "2024 h-index").
        $patterns = [
            '/h[\-\s]?index[^0-9]{0,20}([0-9]{1,3})(?![0-9])/iu',
            '/(?<![0-9])([0-9]{1,3})[^0-9]{0,20}h[\-\s]?index/iu',
        ];
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $text, $m)) {
                return (int) $m[1];
            }
        }

        return null;
    }

    /**
     * Look the author up on OpenAlex when Scopus cannot be queried.
     *
     * Picks the first search result having an affiliation whose name contains $affil
     * (case-insensitive); when $affil is empty or nothing matches, the first result is used.
     *
     * @param string $name  Author name used as the search term.
     * @param string $affil Affiliation used to disambiguate results; may be empty.
     *
     * @return array|null Response payload (`code`, `name`, `affil`, `h_index_scopus`,
     *                    `h_index_openalex`, `source`), or null when the request failed
     *                    or returned no results.
     */
    private function fallbackByOpenAlex($name, $affil)
    {
        // OpenAlex pages with `per-page`; `limit` is not one of its query parameters.
        $url = 'https://api.openalex.org/authors?search=' . urlencode((string) $name) . '&per-page=8';
        $res = $this->httpRequest($url, null, true);
        if (!$res['ok']) {
            return null;
        }

        $data = json_decode($res['body'], true);
        $list = (is_array($data) && isset($data['results']) && is_array($data['results']))
            ? array_values($data['results'])
            : [];
        if (empty($list)) {
            return null;
        }

        $targetAffil = mb_strtolower(trim((string) $affil), 'UTF-8');
        $match = null;
        if ($targetAffil !== '') {
            foreach ($list as $item) {
                foreach ($this->openAlexInstitutionNames($item) as $instName) {
                    if (mb_strpos(mb_strtolower($instName, 'UTF-8'), $targetAffil, 0, 'UTF-8') !== false) {
                        $match = $item;
                        break 2;
                    }
                }
            }
        }
        if ($match === null) {
            $match = $list[0];
        }

        $instNames = $this->openAlexInstitutionNames($match);

        return [
            'code' => 1,
            'name' => $match['display_name'] ?? $name,
            'affil' => !empty($instNames) ? $instNames[0] : $affil,
            // NOTE(review): OpenAlex's documented summary_stats carries `h_index` and
            // `i10_index`; this lookup is kept for response-shape compatibility and is
            // expected to resolve to null.
            'h_index_scopus' => $match['summary_stats']['h_index_scopus'] ?? null,
            'h_index_openalex' => $match['summary_stats']['h_index'] ?? null,
            'source' => 'openalex_fallback',
        ];
    }

    /**
     * Collect the non-empty institution names of one OpenAlex author record.
     *
     * Each `affiliations` entry nests the institution under the `institution` key;
     * a flat `display_name` on the entry itself is accepted as well.
     *
     * @param mixed $author One element of the OpenAlex `results` list.
     *
     * @return string[] Institution display names, in the order they appear in the record.
     */
    private function openAlexInstitutionNames($author)
    {
        $names = [];
        if (!is_array($author) || empty($author['affiliations']) || !is_array($author['affiliations'])) {
            return $names;
        }

        foreach ($author['affiliations'] as $entry) {
            if (!is_array($entry)) {
                continue;
            }
            $candidate = $entry['institution']['display_name'] ?? ($entry['display_name'] ?? '');
            if (is_string($candidate) && $candidate !== '') {
                $names[] = $candidate;
            }
        }

        return $names;
    }
}