From cd7b148ad4a73c2a3a2bfc2add926d10e18c8c19 Mon Sep 17 00:00:00 2001 From: wangjinlei <751475802@qq.com> Date: Mon, 15 Jun 2026 16:36:03 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8D=87=E7=BA=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- application/api/controller/ExpertFinder.php | 2 +- application/common/ExpertFinderService.php | 120 ++++++++++++++++---- 2 files changed, 98 insertions(+), 24 deletions(-) diff --git a/application/api/controller/ExpertFinder.php b/application/api/controller/ExpertFinder.php index 19984547..ee8e5aac 100644 --- a/application/api/controller/ExpertFinder.php +++ b/application/api/controller/ExpertFinder.php @@ -298,7 +298,7 @@ class ExpertFinder extends Base */ public function dailyFetchAll() { - $perPage = max(10, intval($this->request->param('per_page', 10))); + $perPage = max(10, intval($this->request->param('per_page', 50))); $source = $this->request->param('source', 'pubmed'); $minYear = intval($this->request->param('min_year', date('Y') - 3)); diff --git a/application/common/ExpertFinderService.php b/application/common/ExpertFinderService.php index bd5a67e7..e92cb5c5 100644 --- a/application/common/ExpertFinderService.php +++ b/application/common/ExpertFinderService.php @@ -13,6 +13,9 @@ class ExpertFinderService private $ncbiBaseUrl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'; private $logFile; + /** @var bool|null */ + private static $schemaReady = null; + public function __construct() { $this->httpClient = new Client([ @@ -21,6 +24,54 @@ class ExpertFinderService 'verify' => false, ]); $this->logFile = ROOT_PATH . 'runtime' . DS . 'expert_finder.log'; + + try { + $this->ensureSchema(); + } catch (\Throwable $e) { + $this->log('[ExpertFinder] ensureSchema fail: ' . $e->getMessage()); + } + } + + /** + * 历史遗留数据迁移用:旧版每天按页抓取时使用的 per_page。 + * 用于把旧的 last_page 换算成新的 last_offset(last_offset = last_page × 此值)。 + */ + const MIGRATE_LEGACY_PER_PAGE = 10; + + /** + * 自动补全 expert_fetch 上缺失的 last_offset 列,并一次性回填历史进度(可重复执行)。 + * last_offset 为累计抓取偏移量(已扫到第几篇),与 per_page 解耦, + * 改 per_page 不会再导致翻页错位。 + */ + public function ensureSchema() + { + if (self::$schemaReady === true) { + return; + } + + $table = config('database.prefix') . 'expert_fetch'; + $columns = Db::query('SHOW COLUMNS FROM `' . $table . '`'); + $existing = []; + foreach ($columns as $col) { + $existing[$col['Field']] = true; + } + + if (!isset($existing['last_offset'])) { + Db::execute('ALTER TABLE `' . $table . '` ADD COLUMN `last_offset` INT NOT NULL DEFAULT 0 COMMENT \'累计抓取偏移量(与per_page解耦)\' AFTER `last_page`'); + $this->log('[ExpertFinder] schema patched: add last_offset'); + } + + // 一次性迁移:把旧 last_page 按历史 per_page 换算成 last_offset。 + // 只命中"未迁移"的遗留行(last_offset=0 且 last_page>0),幂等,不会重复执行。 + $affected = Db::execute( + 'UPDATE `' . $table . '` SET `last_offset` = `last_page` * ' . intval(self::MIGRATE_LEGACY_PER_PAGE) + . ' WHERE `last_offset` = 0 AND `last_page` > 0' + ); + if ($affected > 0) { + $this->log('[ExpertFinder] migrated last_offset from last_page for ' . $affected . ' rows (×' . self::MIGRATE_LEGACY_PER_PAGE . ')'); + } + + self::$schemaReady = true; } public function doFetchForField($field, $source = 'pubmed', $perPage = 100, $minYear = null) @@ -30,12 +81,13 @@ class ExpertFinderService } $fetchLog = $this->getFetchLog($field, $source); - $page = $fetchLog['last_page'] + 1; + // 基于累计偏移量(offset)的游标:改 per_page 也不会错位 + $offset = intval($fetchLog['last_offset'] ?? 0); if ($source === 'pmc') { - $result = $this->searchViaPMC($field, $perPage, $minYear, $page); + $result = $this->searchViaPMC($field, $perPage, $minYear, $offset); } else { - $result = $this->searchViaPubMed($field, $perPage, $minYear, $page); + $result = $this->searchViaPubMed($field, $perPage, $minYear, $offset); } if(!isset($result['total'])){ @@ -45,13 +97,15 @@ class ExpertFinderService } $saveResult = $this->saveExperts($result['experts'], $field, $source); - $nextPage = $result['has_more'] ? $page : $fetchLog['last_page']; - $totalPages = $result['total_pages'] ?? $fetchLog['total_pages']; - $this->updateFetchLog($field, $source, $nextPage, $totalPages); + // 抓到下一篇则前移一个窗口;抓完则保持当前 offset + $nextOffset = $result['has_more'] ? ($offset + $perPage) : $offset; + $totalPages = $result['total_pages'] ?? ($fetchLog['total_pages'] ?? 0); + $this->updateFetchLog($field, $source, $nextOffset, $totalPages, $perPage); return [ 'keyword' => $field, - 'page' => $page, + 'page' => $result['page'] ?? 1, + 'offset' => $offset, 'experts_found' => $result['total'], 'saved_new' => $saveResult['inserted'], 'saved_exist' => $saveResult['existing'], @@ -63,10 +117,12 @@ class ExpertFinderService public function searchExperts($keyword, $perPage, $minYear, $page, $source) { + // 交互式按页搜索:把页码换算成偏移量后走统一的 offset 逻辑 + $retstart = max(0, (intval($page) - 1) * intval($perPage)); if ($source === 'pmc') { - return $this->searchViaPMC($keyword, $perPage, $minYear, $page); + return $this->searchViaPMC($keyword, $perPage, $minYear, $retstart); } - return $this->searchViaPubMed($keyword, $perPage, $minYear, $page); + return $this->searchViaPubMed($keyword, $perPage, $minYear, $retstart); } public function saveExperts($experts, $field, $source) @@ -184,14 +240,25 @@ class ExpertFinderService ->find(); if (!$log) { - return ['last_page' => 0, 'total_pages' => 0, 'last_time' => 0]; + return ['last_page' => 0, 'last_offset' => 0, 'total_pages' => 0, 'last_time' => 0]; } return $log; } - public function updateFetchLog($field, $source, $lastPage, $totalPages) + /** + * 回写抓取进度。 + * @param int $lastOffset 累计偏移量(权威游标) + * @param int $totalPages 总页数(仅展示) + * @param int $perPage 本次窗口大小,用于换算展示用 last_page + */ + public function updateFetchLog($field, $source, $lastOffset, $totalPages, $perPage = 0) { + $lastOffset = max(0, intval($lastOffset)); + $perPage = intval($perPage); + // last_page 仅作展示:由偏移量换算(per_page 未知时退化为偏移量本身) + $lastPage = $perPage > 0 ? intval(floor($lastOffset / $perPage)) : $lastOffset; + $exists = Db::name('expert_fetch') ->where('field', $field) ->where('source', $source) @@ -201,6 +268,7 @@ class ExpertFinderService Db::name('expert_fetch') ->where('expert_fetch_id', $exists['expert_fetch_id']) ->update([ + 'last_offset' => $lastOffset, 'last_page' => $lastPage, 'total_pages' => $totalPages, 'last_time' => time(), @@ -209,6 +277,7 @@ class ExpertFinderService Db::name('expert_fetch')->insert([ 'field' => mb_substr($field, 0, 128), 'source' => mb_substr($source, 0, 128), + 'last_offset' => $lastOffset, 'last_page' => $lastPage, 'total_pages' => $totalPages, 'last_time' => time(), @@ -218,16 +287,16 @@ class ExpertFinderService // ==================== PubMed Search ==================== - private function searchViaPubMed($keyword, $perPage, $minYear, $page = 1) + private function searchViaPubMed($keyword, $perPage, $minYear, $retstart = 0) { set_time_limit(600); - $searchResult = $this->esearch('pubmed', $keyword, $perPage, $minYear, $page); + $searchResult = $this->esearch('pubmed', $keyword, $perPage, $minYear, $retstart); $ids = $searchResult['ids']; $totalArticles = $searchResult['total']; if (empty($ids)) { - return $this->buildPagedResult([], 0, 0, $totalArticles, $page, $perPage, 'pubmed'); + return $this->buildPagedResult([], 0, 0, $totalArticles, $retstart, $perPage, 'pubmed'); } $allAuthors = []; @@ -243,21 +312,21 @@ class ExpertFinderService $experts = $this->aggregateExperts($allAuthors); - return $this->buildPagedResult($experts, count($experts), count($ids), $totalArticles, $page, $perPage, 'pubmed'); + return $this->buildPagedResult($experts, count($experts), count($ids), $totalArticles, $retstart, $perPage, 'pubmed'); } // ==================== PMC Search ==================== - private function searchViaPMC($keyword, $perPage, $minYear, $page = 1) + private function searchViaPMC($keyword, $perPage, $minYear, $retstart = 0) { set_time_limit(600); - $searchResult = $this->esearch('pmc', $keyword, $perPage, $minYear, $page); + $searchResult = $this->esearch('pmc', $keyword, $perPage, $minYear, $retstart); $ids = $searchResult['ids']; $totalArticles = $searchResult['total']; if (empty($ids)) { - return $this->buildPagedResult([], 0, 0, $totalArticles, $page, $perPage, 'pmc'); + return $this->buildPagedResult([], 0, 0, $totalArticles, $retstart, $perPage, 'pmc'); } $allAuthors = []; @@ -273,15 +342,15 @@ class ExpertFinderService $experts = $this->aggregateExperts($allAuthors); - return $this->buildPagedResult($experts, count($experts), count($ids), $totalArticles, $page, $perPage, 'pmc'); + return $this->buildPagedResult($experts, count($experts), count($ids), $totalArticles, $retstart, $perPage, 'pmc'); } // ==================== NCBI API ==================== - private function esearch($db, $keyword, $perPage, $minYear, $page = 1) + private function esearch($db, $keyword, $perPage, $minYear, $retstart = 0) { $term = $keyword . ' AND ' . $minYear . ':' . date('Y') . '[pdat]'; - $retstart = ($page - 1) * $perPage; + $retstart = max(0, intval($retstart)); $response = $this->httpClient->get($this->ncbiBaseUrl . 'esearch.fcgi', [ 'query' => [ @@ -563,18 +632,23 @@ class ExpertFinderService return $experts; } - private function buildPagedResult($experts, $expertCount, $articlesScanned, $totalArticles, $page, $perPage, $source) + private function buildPagedResult($experts, $expertCount, $articlesScanned, $totalArticles, $retstart, $perPage, $source) { + $perPage = max(1, intval($perPage)); + $retstart = max(0, intval($retstart)); $totalPages = $totalArticles > 0 ? ceil($totalArticles / $perPage) : 0; + $page = intval(floor($retstart / $perPage)) + 1; return [ 'experts' => $experts, 'total' => $expertCount, 'articles_scanned' => $articlesScanned, 'total_articles' => $totalArticles, 'page' => $page, + 'offset' => $retstart, 'per_page' => $perPage, 'total_pages' => $totalPages, - 'has_more' => $page < $totalPages, + // 偏移量驱动:下一个窗口还在范围内才有更多 + 'has_more' => ($retstart + $perPage) < $totalArticles, 'source' => $source, ]; }