升级
This commit is contained in:
@@ -298,7 +298,7 @@ class ExpertFinder extends Base
|
||||
*/
|
||||
public function dailyFetchAll()
|
||||
{
|
||||
$perPage = max(10, intval($this->request->param('per_page', 10)));
|
||||
$perPage = max(10, intval($this->request->param('per_page', 50)));
|
||||
$source = $this->request->param('source', 'pubmed');
|
||||
$minYear = intval($this->request->param('min_year', date('Y') - 3));
|
||||
|
||||
|
||||
@@ -13,6 +13,9 @@ class ExpertFinderService
|
||||
private $ncbiBaseUrl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
|
||||
private $logFile;
|
||||
|
||||
/** @var bool|null */
|
||||
private static $schemaReady = null;
|
||||
|
||||
public function __construct()
|
||||
{
|
||||
$this->httpClient = new Client([
|
||||
@@ -21,6 +24,54 @@ class ExpertFinderService
|
||||
'verify' => false,
|
||||
]);
|
||||
$this->logFile = ROOT_PATH . 'runtime' . DS . 'expert_finder.log';
|
||||
|
||||
try {
|
||||
$this->ensureSchema();
|
||||
} catch (\Throwable $e) {
|
||||
$this->log('[ExpertFinder] ensureSchema fail: ' . $e->getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 历史遗留数据迁移用:旧版每天按页抓取时使用的 per_page。
|
||||
* 用于把旧的 last_page 换算成新的 last_offset(last_offset = last_page × 此值)。
|
||||
*/
|
||||
const MIGRATE_LEGACY_PER_PAGE = 10;
|
||||
|
||||
/**
 * Patch the expert_fetch table so it has a last_offset column, then backfill
 * that column from legacy last_page values. Safe to call repeatedly.
 *
 * last_offset is the cumulative fetch offset (how far the article scan has
 * progressed). It is decoupled from per_page, so changing per_page no longer
 * shifts the paging cursor.
 *
 * The whole routine is skipped once self::$schemaReady has been set to true.
 */
public function ensureSchema()
{
    if (self::$schemaReady === true) {
        return;
    }

    $tableName = config('database.prefix') . 'expert_fetch';

    // Scan the table's current columns for last_offset.
    $hasOffsetColumn = false;
    foreach (Db::query('SHOW COLUMNS FROM `' . $tableName . '`') as $column) {
        if ($column['Field'] === 'last_offset') {
            $hasOffsetColumn = true;
            break;
        }
    }

    if (!$hasOffsetColumn) {
        Db::execute('ALTER TABLE `' . $tableName . '` ADD COLUMN `last_offset` INT NOT NULL DEFAULT 0 COMMENT \'累计抓取偏移量(与per_page解耦)\' AFTER `last_page`');
        $this->log('[ExpertFinder] schema patched: add last_offset');
    }

    // Legacy backfill: convert the old last_page into last_offset using the
    // historical per_page. Only rows that still look un-migrated
    // (last_offset = 0 and last_page > 0) are matched, so rows that already
    // carry an offset are left alone on later runs.
    $legacyPerPage = intval(self::MIGRATE_LEGACY_PER_PAGE);
    $migratedRows = Db::execute(
        'UPDATE `' . $tableName . '` SET `last_offset` = `last_page` * ' . $legacyPerPage
        . ' WHERE `last_offset` = 0 AND `last_page` > 0'
    );

    if ($migratedRows > 0) {
        $this->log('[ExpertFinder] migrated last_offset from last_page for ' . $migratedRows . ' rows (×' . self::MIGRATE_LEGACY_PER_PAGE . ')');
    }

    self::$schemaReady = true;
}
|
||||
|
||||
public function doFetchForField($field, $source = 'pubmed', $perPage = 100, $minYear = null)
|
||||
@@ -30,12 +81,13 @@ class ExpertFinderService
|
||||
}
|
||||
|
||||
$fetchLog = $this->getFetchLog($field, $source);
|
||||
$page = $fetchLog['last_page'] + 1;
|
||||
// 基于累计偏移量(offset)的游标:改 per_page 也不会错位
|
||||
$offset = intval($fetchLog['last_offset'] ?? 0);
|
||||
|
||||
if ($source === 'pmc') {
|
||||
$result = $this->searchViaPMC($field, $perPage, $minYear, $page);
|
||||
$result = $this->searchViaPMC($field, $perPage, $minYear, $offset);
|
||||
} else {
|
||||
$result = $this->searchViaPubMed($field, $perPage, $minYear, $page);
|
||||
$result = $this->searchViaPubMed($field, $perPage, $minYear, $offset);
|
||||
}
|
||||
|
||||
if(!isset($result['total'])){
|
||||
@@ -45,13 +97,15 @@ class ExpertFinderService
|
||||
}
|
||||
$saveResult = $this->saveExperts($result['experts'], $field, $source);
|
||||
|
||||
$nextPage = $result['has_more'] ? $page : $fetchLog['last_page'];
|
||||
$totalPages = $result['total_pages'] ?? $fetchLog['total_pages'];
|
||||
$this->updateFetchLog($field, $source, $nextPage, $totalPages);
|
||||
// 抓到下一篇则前移一个窗口;抓完则保持当前 offset
|
||||
$nextOffset = $result['has_more'] ? ($offset + $perPage) : $offset;
|
||||
$totalPages = $result['total_pages'] ?? ($fetchLog['total_pages'] ?? 0);
|
||||
$this->updateFetchLog($field, $source, $nextOffset, $totalPages, $perPage);
|
||||
|
||||
return [
|
||||
'keyword' => $field,
|
||||
'page' => $page,
|
||||
'page' => $result['page'] ?? 1,
|
||||
'offset' => $offset,
|
||||
'experts_found' => $result['total'],
|
||||
'saved_new' => $saveResult['inserted'],
|
||||
'saved_exist' => $saveResult['existing'],
|
||||
@@ -63,10 +117,12 @@ class ExpertFinderService
|
||||
|
||||
/**
 * Interactive, page-based expert search.
 *
 * Converts the 1-based page number into an article offset and forwards that
 * offset to the source-specific search, whose last argument is an offset
 * (retstart) rather than a page number.
 *
 * @param string $keyword Search term.
 * @param int    $perPage Number of articles per page (window size).
 * @param int    $minYear Lower bound of the publication-year range.
 * @param int    $page    1-based page number; anything below 1 resolves to offset 0.
 * @param string $source  'pmc' selects the PMC search; any other value uses PubMed.
 * @return array Paged result as produced by the selected search method.
 */
public function searchExperts($keyword, $perPage, $minYear, $page, $source)
{
    // Offset of the first article on the requested page, never negative.
    $retstart = max(0, (intval($page) - 1) * intval($perPage));

    if ($source === 'pmc') {
        return $this->searchViaPMC($keyword, $perPage, $minYear, $retstart);
    }

    return $this->searchViaPubMed($keyword, $perPage, $minYear, $retstart);
}
|
||||
|
||||
public function saveExperts($experts, $field, $source)
|
||||
@@ -184,14 +240,25 @@ class ExpertFinderService
|
||||
->find();
|
||||
|
||||
if (!$log) {
|
||||
return ['last_page' => 0, 'total_pages' => 0, 'last_time' => 0];
|
||||
return ['last_page' => 0, 'last_offset' => 0, 'total_pages' => 0, 'last_time' => 0];
|
||||
}
|
||||
|
||||
return $log;
|
||||
}
|
||||
|
||||
public function updateFetchLog($field, $source, $lastPage, $totalPages)
|
||||
/**
|
||||
* 回写抓取进度。
|
||||
* @param int $lastOffset 累计偏移量(权威游标)
|
||||
* @param int $totalPages 总页数(仅展示)
|
||||
* @param int $perPage 本次窗口大小,用于换算展示用 last_page
|
||||
*/
|
||||
public function updateFetchLog($field, $source, $lastOffset, $totalPages, $perPage = 0)
|
||||
{
|
||||
$lastOffset = max(0, intval($lastOffset));
|
||||
$perPage = intval($perPage);
|
||||
// last_page 仅作展示:由偏移量换算(per_page 未知时退化为偏移量本身)
|
||||
$lastPage = $perPage > 0 ? intval(floor($lastOffset / $perPage)) : $lastOffset;
|
||||
|
||||
$exists = Db::name('expert_fetch')
|
||||
->where('field', $field)
|
||||
->where('source', $source)
|
||||
@@ -201,6 +268,7 @@ class ExpertFinderService
|
||||
Db::name('expert_fetch')
|
||||
->where('expert_fetch_id', $exists['expert_fetch_id'])
|
||||
->update([
|
||||
'last_offset' => $lastOffset,
|
||||
'last_page' => $lastPage,
|
||||
'total_pages' => $totalPages,
|
||||
'last_time' => time(),
|
||||
@@ -209,6 +277,7 @@ class ExpertFinderService
|
||||
Db::name('expert_fetch')->insert([
|
||||
'field' => mb_substr($field, 0, 128),
|
||||
'source' => mb_substr($source, 0, 128),
|
||||
'last_offset' => $lastOffset,
|
||||
'last_page' => $lastPage,
|
||||
'total_pages' => $totalPages,
|
||||
'last_time' => time(),
|
||||
@@ -218,16 +287,16 @@ class ExpertFinderService
|
||||
|
||||
// ==================== PubMed Search ====================
|
||||
|
||||
private function searchViaPubMed($keyword, $perPage, $minYear, $page = 1)
|
||||
private function searchViaPubMed($keyword, $perPage, $minYear, $retstart = 0)
|
||||
{
|
||||
set_time_limit(600);
|
||||
|
||||
$searchResult = $this->esearch('pubmed', $keyword, $perPage, $minYear, $page);
|
||||
$searchResult = $this->esearch('pubmed', $keyword, $perPage, $minYear, $retstart);
|
||||
$ids = $searchResult['ids'];
|
||||
$totalArticles = $searchResult['total'];
|
||||
|
||||
if (empty($ids)) {
|
||||
return $this->buildPagedResult([], 0, 0, $totalArticles, $page, $perPage, 'pubmed');
|
||||
return $this->buildPagedResult([], 0, 0, $totalArticles, $retstart, $perPage, 'pubmed');
|
||||
}
|
||||
|
||||
$allAuthors = [];
|
||||
@@ -243,21 +312,21 @@ class ExpertFinderService
|
||||
|
||||
$experts = $this->aggregateExperts($allAuthors);
|
||||
|
||||
return $this->buildPagedResult($experts, count($experts), count($ids), $totalArticles, $page, $perPage, 'pubmed');
|
||||
return $this->buildPagedResult($experts, count($experts), count($ids), $totalArticles, $retstart, $perPage, 'pubmed');
|
||||
}
|
||||
|
||||
// ==================== PMC Search ====================
|
||||
|
||||
private function searchViaPMC($keyword, $perPage, $minYear, $page = 1)
|
||||
private function searchViaPMC($keyword, $perPage, $minYear, $retstart = 0)
|
||||
{
|
||||
set_time_limit(600);
|
||||
|
||||
$searchResult = $this->esearch('pmc', $keyword, $perPage, $minYear, $page);
|
||||
$searchResult = $this->esearch('pmc', $keyword, $perPage, $minYear, $retstart);
|
||||
$ids = $searchResult['ids'];
|
||||
$totalArticles = $searchResult['total'];
|
||||
|
||||
if (empty($ids)) {
|
||||
return $this->buildPagedResult([], 0, 0, $totalArticles, $page, $perPage, 'pmc');
|
||||
return $this->buildPagedResult([], 0, 0, $totalArticles, $retstart, $perPage, 'pmc');
|
||||
}
|
||||
|
||||
$allAuthors = [];
|
||||
@@ -273,15 +342,15 @@ class ExpertFinderService
|
||||
|
||||
$experts = $this->aggregateExperts($allAuthors);
|
||||
|
||||
return $this->buildPagedResult($experts, count($experts), count($ids), $totalArticles, $page, $perPage, 'pmc');
|
||||
return $this->buildPagedResult($experts, count($experts), count($ids), $totalArticles, $retstart, $perPage, 'pmc');
|
||||
}
|
||||
|
||||
// ==================== NCBI API ====================
|
||||
|
||||
private function esearch($db, $keyword, $perPage, $minYear, $page = 1)
|
||||
private function esearch($db, $keyword, $perPage, $minYear, $retstart = 0)
|
||||
{
|
||||
$term = $keyword . ' AND ' . $minYear . ':' . date('Y') . '[pdat]';
|
||||
$retstart = ($page - 1) * $perPage;
|
||||
$retstart = max(0, intval($retstart));
|
||||
|
||||
$response = $this->httpClient->get($this->ncbiBaseUrl . 'esearch.fcgi', [
|
||||
'query' => [
|
||||
@@ -563,18 +632,23 @@ class ExpertFinderService
|
||||
return $experts;
|
||||
}
|
||||
|
||||
/**
 * Assemble the result envelope for one search window.
 *
 * Paging is offset-driven: the page number is derived from the offset, and
 * has_more is true only while the next window would still start inside the
 * total article count.
 *
 * @param array  $experts         Aggregated expert records for this window.
 * @param int    $expertCount     Number of experts, reported as 'total'.
 * @param int    $articlesScanned Number of articles processed in this window.
 * @param int    $totalArticles   Total number of articles matching the search.
 * @param int    $retstart        Offset of the first article in this window; clamped to >= 0.
 * @param int    $perPage         Window size; clamped to >= 1 to avoid division by zero.
 * @param string $source          Source label echoed back in the result.
 * @return array Keys: experts, total, articles_scanned, total_articles, page,
 *               offset, per_page, total_pages, has_more, source.
 */
private function buildPagedResult($experts, $expertCount, $articlesScanned, $totalArticles, $retstart, $perPage, $source)
{
    $perPage = max(1, intval($perPage));
    $retstart = max(0, intval($retstart));

    // ceil() yields a float; cast so total_pages is an integer like page.
    $totalPages = $totalArticles > 0 ? intval(ceil($totalArticles / $perPage)) : 0;

    // 1-based page number that contains the current offset.
    $page = intval(floor($retstart / $perPage)) + 1;

    return [
        'experts' => $experts,
        'total' => $expertCount,
        'articles_scanned' => $articlesScanned,
        'total_articles' => $totalArticles,
        'page' => $page,
        'offset' => $retstart,
        'per_page' => $perPage,
        'total_pages' => $totalPages,
        // Offset-driven: more remains only if the next window starts in range.
        'has_more' => ($retstart + $perPage) < $totalArticles,
        'source' => $source,
    ];
}
|
||||
|
||||
Reference in New Issue
Block a user