自动查重

This commit is contained in:
wangjinlei
2026-05-20 11:58:10 +08:00
parent 53e6ddbd9e
commit cfa3f791f4
11 changed files with 938 additions and 58 deletions

View File

@@ -0,0 +1,404 @@
<?php
namespace app\common;
use think\Db;
use think\Env;
use think\Exception;
use think\Queue;
/**
 * Summarises a user's primary research field (in Chinese) with an LLM and
 * stores the result in user_reviewer_info.field_ai.
 *
 * The summary is derived from the user's submitted articles and/or reviewer
 * profile. Work is chained through the queue: a UserFieldAiFill job processes
 * one user, then enqueueNextFieldAi() schedules the next pending user.
 */
class UserFieldAiService
{
    /** Queue used when the caller passes an empty queue name. */
    const QUEUE_NAME = 'UserFieldAi';

    /** Values written to user_reviewer_info.field_ai_status. */
    const STATUS_PENDING = 0;
    const STATUS_DONE = 1;
    const STATUS_INSUFFICIENT = 2;
    const STATUS_FAILED = 3;

    /** @var string Absolute path of the service's log file. */
    private $logFile;

    public function __construct()
    {
        $this->logFile = ROOT_PATH . 'runtime' . DS . 'user_field_ai.log';
    }

    /**
     * Starts the chain by enqueueing the first candidate with user_id > 0.
     *
     * @param bool   $force true to recompute users that already have a result
     * @param int    $delay seconds to delay the job; <= 0 pushes immediately
     * @param string $queue queue name; '' selects self::QUEUE_NAME
     * @return bool whether a first job was pushed
     */
    public function startChain($force = false, $delay = 1, $queue = '')
    {
        return $this->enqueueNextFieldAi($delay, $queue, 0, $force);
    }

    /**
     * Chain step: finds the next candidate with user_id > $afterUserId and
     * enqueues a job for it.
     *
     * @param int    $delay       seconds to delay the job; <= 0 pushes immediately
     * @param string $queue       queue name; '' selects self::QUEUE_NAME
     * @param int    $afterUserId exclusive lower bound of the user_id scan
     * @param bool   $force       true to include users that are already done
     * @return bool true when a job was enqueued, false when no candidate is left
     */
    public function enqueueNextFieldAi($delay = 1, $queue = '', $afterUserId = 0, $force = false)
    {
        if ($queue === '') {
            $queue = self::QUEUE_NAME;
        }
        $afterUserId = intval($afterUserId);
        $userId = $this->findNextPendingUserId($afterUserId, $force);
        if ($userId <= 0) {
            $this->log('[FieldAi] chain finished after user_id=' . $afterUserId . ' force=' . ($force ? '1' : '0'));
            return false;
        }
        $data = [
            'user_id' => $userId,
            'queue' => $queue,
            'force' => $force ? 1 : 0,
        ];
        $jobClass = 'app\\api\\job\\UserFieldAiFill@fire';
        if ($delay > 0) {
            Queue::later($delay, $jobClass, $data, $queue);
        } else {
            Queue::push($jobClass, $data, $queue);
        }
        $this->log('[FieldAi] enqueued user_id=' . $userId . ' queue=' . $queue);
        return true;
    }

    /**
     * Processes a single user (called from the queue job or synchronously for
     * debugging) and records the outcome in user_reviewer_info.
     *
     * Never throws for LLM/transport problems: those are caught, recorded as
     * STATUS_FAILED and reported through the 'error' key.
     *
     * @param int  $userId
     * @param bool $force true to recompute even if a result is already stored
     * @return array{ok:bool, skipped?:bool, insufficient?:bool, field_ai?:string, error?:string}
     */
    public function processUser($userId, $force = false)
    {
        $userId = intval($userId);
        if ($userId <= 0) {
            return ['ok' => false, 'error' => 'invalid user_id'];
        }
        $this->ensureReviewerInfoRow($userId);
        $uri = Db::name('user_reviewer_info')->where('reviewer_id', $userId)->find();
        if (!$uri) {
            return ['ok' => false, 'error' => 'reviewer_info missing'];
        }

        $alreadyDone = intval($uri['field_ai_status']) === self::STATUS_DONE
            && trim((string) $uri['field_ai']) !== '';
        if (!$force && $alreadyDone) {
            return ['ok' => true, 'skipped' => true, 'field_ai' => (string) $uri['field_ai']];
        }

        if (!$this->isEligible($userId, $uri)) {
            $this->updateFieldAi($userId, '', self::STATUS_INSUFFICIENT, 'insufficient profile/articles');
            return ['ok' => true, 'insufficient' => true];
        }

        try {
            $context = $this->buildContext($userId, $uri);
            $fieldAi = $this->summarizeWithLlm($context);
            if ($fieldAi === '') {
                throw new Exception('LLM returned empty field');
            }
            $this->updateFieldAi($userId, $fieldAi, self::STATUS_DONE, '');
            return ['ok' => true, 'field_ai' => $fieldAi];
        } catch (\Throwable $e) {
            // Record the failure so the user is picked up again by a later scan.
            $this->updateFieldAi($userId, '', self::STATUS_FAILED, mb_substr($e->getMessage(), 0, 500));
            $this->log('[FieldAi] user_id=' . $userId . ' fail: ' . $e->getMessage());
            return ['ok' => false, 'error' => $e->getMessage()];
        }
    }

    /**
     * Whether a user can be summarised: has at least one submitted article,
     * or has a sufficiently complete reviewer profile.
     *
     * @param int        $userId
     * @param array|null $uri user_reviewer_info row; loaded on demand when null
     * @return bool
     */
    public function isEligible($userId, $uri = null)
    {
        if ($this->hasSubmittedArticles($userId)) {
            return true;
        }
        if ($uri === null) {
            $uri = Db::name('user_reviewer_info')->where('reviewer_id', $userId)->find();
        }
        return $this->isReviewerProfileComplete($uri);
    }

    /**
     * Whether the user owns at least one article row with a non-empty title.
     *
     * @param int $userId
     * @return bool
     */
    public function hasSubmittedArticles($userId)
    {
        $n = Db::name('article')
            ->where('user_id', intval($userId))
            ->where('title', '<>', '')
            ->count();
        return $n > 0;
    }

    /**
     * A reviewer profile counts as "complete" when the number of filled
     * fields reaches the configured threshold
     * (user_field_ai.min_profile_fields, default 4, never below 3).
     *
     * Counted: seven text columns, the legacy 'major' column (when not '0'),
     * and the presence of any active major_to_user link.
     *
     * @param array|mixed $uri user_reviewer_info row
     * @return bool
     */
    public function isReviewerProfileComplete($uri)
    {
        if (!$uri || !is_array($uri)) {
            return false;
        }
        $minFilled = max(3, (int) Env::get('user_field_ai.min_profile_fields', 4));
        $keys = ['field', 'company', 'country', 'technical', 'introduction', 'department', 'website'];
        $filled = 0;
        foreach ($keys as $k) {
            if (!empty($uri[$k]) && trim((string) $uri[$k]) !== '') {
                $filled++;
            }
        }
        if (!empty($uri['major']) && trim((string) $uri['major']) !== '' && trim((string) $uri['major']) !== '0') {
            $filled++;
        }
        if ($filled >= $minFilled) {
            // Threshold already met; the extra lookup below cannot change the answer.
            return true;
        }
        // Guarded: a row without reviewer_id must not trigger a notice or a user_id=0 lookup.
        $reviewerId = intval($uri['reviewer_id'] ?? 0);
        if ($reviewerId > 0) {
            $majorCount = Db::name('major_to_user')->where('user_id', $reviewerId)->where('state', 0)->count();
            if ($majorCount > 0) {
                $filled++;
            }
        }
        return $filled >= $minFilled;
    }

    /**
     * Scans users in ascending user_id order, in batches, and returns the
     * first one that should be processed.
     *
     * Side effects: creates a missing user_reviewer_info row for every scanned
     * user, and (when not forcing) marks ineligible users STATUS_INSUFFICIENT.
     * The scan keeps going until a candidate is found or users run out, so a
     * single call may touch many rows.
     *
     * @param int  $afterUserId exclusive lower bound
     * @param bool $force       true to consider every user regardless of status
     * @return int user_id of the next candidate, or 0 when none is left
     */
    private function findNextPendingUserId($afterUserId, $force)
    {
        $batch = max(20, (int) Env::get('user_field_ai.scan_batch', 80));
        $cursor = intval($afterUserId);
        $settled = [self::STATUS_DONE, self::STATUS_INSUFFICIENT];

        while (true) {
            $query = Db::name('user')->alias('u')
                ->leftJoin('t_user_reviewer_info uri', 'uri.reviewer_id = u.user_id')
                ->where('u.user_id', '>', $cursor);
            if (!$force) {
                $query->where(function ($q) {
                    // NOTE(review): the 'null' string is presumably the framework's
                    // IS NULL shorthand (user without a reviewer_info row) - confirm
                    // against the installed ThinkPHP version.
                    $q->where('uri.field_ai_status', self::STATUS_PENDING)
                        ->whereOr('uri.field_ai_status', self::STATUS_FAILED)
                        ->whereOr('uri.reviewer_info_id', 'null');
                });
            }
            $ids = $query->order('u.user_id asc')->limit($batch)->column('u.user_id');
            if (empty($ids)) {
                return 0;
            }

            foreach ($ids as $uid) {
                $uid = intval($uid);
                // Always advance, so the next batch starts after everything seen here.
                $cursor = max($cursor, $uid);
                $this->ensureReviewerInfoRow($uid);
                $uri = Db::name('user_reviewer_info')->where('reviewer_id', $uid)->find();
                if (!$uri) {
                    // Row could not be created or read; skip instead of dereferencing null.
                    $this->log('[FieldAi] user_id=' . $uid . ' skipped: reviewer_info row unavailable');
                    continue;
                }
                if (!$force && in_array(intval($uri['field_ai_status']), $settled, true)) {
                    continue;
                }
                if ($this->isEligible($uid, $uri)) {
                    return $uid;
                }
                if (!$force) {
                    $this->updateFieldAi($uid, '', self::STATUS_INSUFFICIENT, 'auto skip: insufficient data');
                }
            }
        }
    }

    /**
     * Collects the data sent to the LLM: basic user info, reviewer profile and
     * the most recent articles (user_field_ai.max_articles, clamped to 1..10,
     * default 5).
     *
     * NOTE(review): realname and email are included in the payload that is
     * posted to the configured chat endpoint - confirm this is acceptable for
     * the deployment's privacy requirements.
     *
     * @param int   $userId
     * @param array $uri user_reviewer_info row
     * @return array{user:array, profile:array, articles:array}
     */
    private function buildContext($userId, array $uri)
    {
        $user = Db::name('user')->where('user_id', $userId)->field('user_id,realname,email')->find();
        $majorTitles = $this->resolveMajorTitles($userId, $uri);
        $maxArticles = max(1, min(10, (int) Env::get('user_field_ai.max_articles', 5)));

        // 'abstrart' is the column name as used by this query (sic).
        $articles = Db::name('article')
            ->where('user_id', $userId)
            ->where('title', '<>', '')
            ->order('article_id desc')
            ->limit($maxArticles)
            ->field('article_id,title,keywords,abstrart,journal_id,ctime')
            ->select();

        $journalNames = [];
        if (!empty($articles)) {
            $jids = array_unique(array_filter(array_column($articles, 'journal_id')));
            if (!empty($jids)) {
                $journalNames = Db::name('journal')->where('journal_id', 'in', $jids)->column('title', 'journal_id');
            }
        }

        $articleBlocks = [];
        foreach ($articles as $a) {
            $jid = intval($a['journal_id']);
            $articleBlocks[] = [
                'title' => (string) $a['title'],
                'journal' => isset($journalNames[$jid]) ? (string) $journalNames[$jid] : '',
                'keywords' => (string) ($a['keywords'] ?? ''),
                // Abstracts are capped to keep the prompt small.
                'abstract' => mb_substr(trim((string) ($a['abstrart'] ?? '')), 0, 800),
            ];
        }

        return [
            'user' => [
                'realname' => $user ? (string) $user['realname'] : '',
                'email' => $user ? (string) $user['email'] : '',
            ],
            'profile' => [
                'field' => trim((string) ($uri['field'] ?? '')),
                'technical' => trim((string) ($uri['technical'] ?? '')),
                'company' => trim((string) ($uri['company'] ?? '')),
                'department' => trim((string) ($uri['department'] ?? '')),
                'country' => trim((string) ($uri['country'] ?? '')),
                'introduction' => mb_substr(trim((string) ($uri['introduction'] ?? '')), 0, 1200),
                'website' => trim((string) ($uri['website'] ?? '')),
                'majors' => $majorTitles,
            ],
            'articles' => $articleBlocks,
        ];
    }

    /**
     * Resolves the titles of the user's majors: active major_to_user links
     * first, falling back to the comma-separated legacy 'major' column.
     *
     * @param int   $userId
     * @param array $uri user_reviewer_info row
     * @return string[] unique, trimmed, non-empty titles
     */
    private function resolveMajorTitles($userId, array $uri)
    {
        $titles = [];
        $ids = Db::name('major_to_user')->where('user_id', $userId)->where('state', 0)->column('major_id');
        if (!empty($ids)) {
            $titles = Db::name('reviewer_major')->where('major_id', 'in', $ids)->where('state', 0)->column('title');
        }
        if (empty($titles) && !empty($uri['major'])) {
            $legacy = array_filter(array_map('intval', explode(',', (string) $uri['major'])));
            if (!empty($legacy)) {
                $titles = Db::name('reviewer_major')->where('major_id', 'in', $legacy)->column('title');
            }
        }
        $trimmed = array_map(function ($title) {
            return trim((string) $title);
        }, (array) $titles);
        return array_values(array_unique(array_filter($trimmed)));
    }

    /**
     * Posts the context to the configured chat-completions endpoint and
     * returns the cleaned field summary ('' when none could be extracted).
     *
     * Configuration is read from user_field_ai.chat_url / chat_model /
     * chat_api_key, falling back to the expert_country_* and citation_* keys.
     *
     * @param array $context result of buildContext()
     * @return string
     * @throws Exception on missing configuration, encoding, transport or HTTP errors
     */
    private function summarizeWithLlm(array $context)
    {
        $url = trim((string) Env::get('user_field_ai.chat_url', Env::get('expert_country_chat_url', Env::get('citation_chat_url', ''))));
        $model = trim((string) Env::get('user_field_ai.chat_model', Env::get('expert_country_chat_model', Env::get('citation_chat_model', 'gpt-4.1'))));
        $apiKey = trim((string) Env::get('user_field_ai.chat_api_key', Env::get('expert_country_chat_api_key', Env::get('citation_chat_api_key', ''))));
        if ($url === '' || $model === '') {
            throw new Exception('user_field_ai chat not configured (chat_url / chat_model)');
        }

        // PARTIAL_OUTPUT: one malformed (non-UTF-8) string is substituted
        // instead of voiding the entire payload.
        $payloadJson = json_encode(
            $context,
            JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_PARTIAL_OUTPUT_ON_ERROR
        );
        if ($payloadJson === false) {
            throw new Exception('LLM payload encode failed: ' . json_last_error_msg());
        }

        // NOTE(review): "1~3" and the colons restore what looks like a garbled
        // prompt ("简洁13 个", "JSON{") - confirm the intended wording.
        $messages = [
            [
                'role' => 'system',
                'content' => '你是学术领域分类助手。根据用户的投稿与个人资料，用简体中文给出该用户最主要的研究领域总结。'
                    . '要求精确、简洁：1~3 个中文领域词或短短语，用顿号分隔；不要解释、不要英文、不要 JSON 以外的多余文字。'
                    . '只输出 JSON：{"field_ai":"..."}。',
            ],
            [
                'role' => 'user',
                'content' => "请根据以下 JSON 资料总结该用户的主要研究领域：\n" . $payloadJson,
            ],
        ];
        $body = json_encode(
            [
                'model' => $model,
                'temperature' => 0.2,
                'messages' => $messages,
            ],
            JSON_UNESCAPED_UNICODE | JSON_PARTIAL_OUTPUT_ON_ERROR
        );
        if ($body === false) {
            throw new Exception('LLM request encode failed: ' . json_last_error_msg());
        }

        $headers = ['Content-Type: application/json'];
        if ($apiKey !== '') {
            $headers[] = 'Authorization: Bearer ' . $apiKey;
        }

        $ch = curl_init();
        if ($ch === false) {
            throw new Exception('LLM curl error: curl_init failed');
        }
        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $body,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_TIMEOUT => max(30, (int) Env::get('user_field_ai.timeout', 90)),
            CURLOPT_HTTPHEADER => $headers,
        ]);
        $raw = curl_exec($ch);
        $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $err = curl_error($ch);
        curl_close($ch);

        if ($raw === false) {
            throw new Exception('LLM curl error: ' . $err);
        }
        if ($code < 200 || $code >= 300) {
            throw new Exception('LLM HTTP ' . $code . ': ' . mb_substr((string) $raw, 0, 400));
        }

        // Prefer the chat-completions shape; otherwise treat the body itself as the content.
        $data = json_decode($raw, true);
        if (is_array($data) && isset($data['choices'][0]['message']['content'])
            && is_scalar($data['choices'][0]['message']['content'])) {
            $content = trim((string) $data['choices'][0]['message']['content']);
        } else {
            $content = trim((string) $raw);
        }

        return $this->resolveFieldAi($content);
    }

    /**
     * Turns LLM content into a field summary: the JSON "field_ai" value when
     * present, otherwise the content itself if it is plain text.
     *
     * Content that still looks structured (contains braces) but has no usable
     * field_ai - e.g. an error object returned with HTTP 200 - yields '' so it
     * is never stored as a research field.
     *
     * @param string $content
     * @return string cleaned summary, or '' when nothing usable was found
     */
    private function resolveFieldAi($content)
    {
        $content = trim((string) $content);
        $fieldAi = $this->parseFieldAiFromContent($content);
        if ($fieldAi !== '' || $content === '') {
            return $fieldAi;
        }
        $plain = $this->stripCodeFences($content);
        if (strpbrk($plain, '{}') !== false) {
            return '';
        }
        return $this->cleanFieldAiText($plain);
    }

    /**
     * Extracts and cleans the "field_ai" value from JSON embedded in $content.
     * Accepts Markdown-fenced JSON and JSON surrounded by extra text; a list
     * value is joined with the ideographic comma.
     *
     * @param string $content
     * @return string cleaned value, or '' when no field_ai could be read
     */
    private function parseFieldAiFromContent($content)
    {
        $content = trim((string) $content);
        if ($content === '') {
            return '';
        }
        $content = $this->stripCodeFences($content);

        // Try the outermost {...} span first, then the content as a whole.
        $candidates = [];
        if (preg_match('/\{.*\}/s', $content, $m)) {
            $candidates[] = $m[0];
        }
        $candidates[] = $content;

        foreach ($candidates as $json) {
            $obj = json_decode($json, true);
            if (!is_array($obj) || empty($obj['field_ai'])) {
                continue;
            }
            $value = $obj['field_ai'];
            if (is_array($value)) {
                // The model answered with a list instead of a delimited string.
                $parts = [];
                foreach ($value as $item) {
                    $item = is_scalar($item) ? trim((string) $item) : '';
                    if ($item !== '') {
                        $parts[] = $item;
                    }
                }
                $value = implode('、', $parts);
            } elseif (!is_scalar($value)) {
                continue;
            }
            $clean = $this->cleanFieldAiText((string) $value);
            if ($clean !== '') {
                return $clean;
            }
        }
        return '';
    }

    /**
     * Removes Markdown code-fence markers (``` and ```lang) from $content.
     *
     * @param string $content
     * @return string
     */
    private function stripCodeFences($content)
    {
        $content = (string) $content;
        $stripped = preg_replace('/^```[a-zA-Z]*\s*|```$/m', '', $content);
        return $stripped === null ? trim($content) : trim($stripped);
    }

    /**
     * Normalises a summary: strips surrounding quotes/whitespace, removes all
     * inner whitespace and caps the length at 200 characters.
     *
     * @param string $text
     * @return string '' when the text is empty or not valid UTF-8
     */
    private function cleanFieldAiText($text)
    {
        $text = trim((string) $text);
        $text = trim($text, "\"' \t\n\r");
        $text = preg_replace('/\s+/u', '', $text);
        if ($text === null) {
            // preg_replace with /u yields null on malformed UTF-8; reject rather than store it.
            return '';
        }
        if (mb_strlen($text) > 200) {
            $text = mb_substr($text, 0, 200);
        }
        return $text;
    }

    /**
     * Makes sure a user_reviewer_info row exists for the user, inserting a
     * minimal one when missing. Non-positive ids are ignored.
     *
     * NOTE(review): check-then-insert is not atomic; concurrent workers could
     * insert twice unless reviewer_id is unique in the schema - confirm.
     *
     * @param int $userId
     * @return void
     */
    public function ensureReviewerInfoRow($userId)
    {
        $userId = intval($userId);
        if ($userId <= 0) {
            return;
        }
        $exists = Db::name('user_reviewer_info')->where('reviewer_id', $userId)->find();
        if ($exists) {
            return;
        }
        Db::name('user_reviewer_info')->insert([
            'reviewer_id' => $userId,
            'ctime' => time(),
            'state' => 0,
        ]);
    }

    /**
     * Writes field_ai, its status and the update time for one user; a
     * non-empty note is written to the log only (it is not stored in the row).
     *
     * @param int    $userId
     * @param string $fieldAi summary text, truncated to 512 characters
     * @param int    $status  one of the STATUS_* constants
     * @param string $note    optional log note
     * @return void
     */
    private function updateFieldAi($userId, $fieldAi, $status, $note)
    {
        $data = [
            'field_ai' => mb_substr(trim((string) $fieldAi), 0, 512),
            'field_ai_status' => intval($status),
            'field_ai_utime' => time(),
        ];
        Db::name('user_reviewer_info')->where('reviewer_id', $userId)->update($data);
        if ($note !== '') {
            $this->log('[FieldAi] user_id=' . $userId . ' status=' . $status . ' note=' . $note);
        }
    }

    /**
     * Appends one timestamped line to the service log (best effort).
     *
     * @param string $msg
     * @return void
     */
    public function log($msg)
    {
        // Keep each entry on a single line even when the message embeds a raw response.
        $msg = str_replace(["\r\n", "\r", "\n"], ' ', (string) $msg);
        $line = date('Y-m-d H:i:s') . ' ' . $msg . PHP_EOL;
        // Logging must never break processing, hence the suppressed error;
        // LOCK_EX serialises appends from concurrent workers.
        @file_put_contents($this->logFile, $line, FILE_APPEND | LOCK_EX);
    }
}