diff --git a/application/api/controller/EmailClient.php b/application/api/controller/EmailClient.php index d9da30cf..e1142acf 100644 --- a/application/api/controller/EmailClient.php +++ b/application/api/controller/EmailClient.php @@ -2817,7 +2817,28 @@ class EmailClient extends Base break; case 1: // 主编(预留,本期不实现) + break; case 4: // 作者(预留) + $query = Db::name("article_author")->alias('aa') /* keep the builder in $query as case 6 does, instead of discarding it; NOTE(review): confirm the code after this switch reads $query */ + ->join('t_user u', 'u.email = aa.email', 'inner') + ->join("t_article a","a.article_id = aa.article_id","left") + ->join('t_user_reviewer_info uri', 'uri.reviewer_id = u.user_id', 'left') + ->where('a.journal_id', $journalId) + ->where('u.email', '<>', '') + ->where('u.unsubscribed', 0); + break; + case 6: //获取往期的青年编委2025年以前的,中国人 + $now = strtotime('2025-01-01'); + $query = Db::name('user_to_yboard')->alias('y') + ->join('t_user u', 'u.user_id = y.user_id', 'inner') + ->join('t_user_reviewer_info uri', 'uri.reviewer_id = u.user_id', 'left') + ->where('y.journal_id', $journalId) + ->where('y.state', 0) + ->where('y.start_date', '<=', $now) + ->where('uri.country', 'China') + ->where('u.email', '<>', '') + ->where('u.unsubscribed', 0); + break;// default: return []; } diff --git a/application/api/controller/Plagiarism.php b/application/api/controller/Plagiarism.php index a19110e4..74068ffa 100644 --- a/application/api/controller/Plagiarism.php +++ b/application/api/controller/Plagiarism.php @@ -12,7 +12,7 @@ use think\Validate; * 论文查重(Turnitin / Crossref Similarity Check)控制器。 * * 触发方式:纯手工(编辑后台点"查重"按钮)。 - * 报告策略:在线 viewer URL 临时签名 + PDF 永久落盘 runtime/plagiarism/。 + * 报告策略:PDF 在 poll 完成时落盘;在线 viewer URL 通过 getReportUrl 按需生成(临时签名)。 * * 主要接口: * POST submit 触发查重 @@ -37,12 +37,14 @@ class Plagiarism extends Base * article_id 必填 * file_url 选填;不传则按 article_id 在 t_article_file 找 manuscirpt * editor_id 选填;触发人 user_id(前端拿不到也可以传 0) + * check_type 选填;full(默认全文)| body_only(正文)| both(各提交一条) */ public function submit() { $articleId = intval($this->request->param('article_id', 0)); $fileUrl =
trim($this->request->param('file_url', '')); $editorId = intval($this->request->param('editor_id', 0)); + $checkType = trim($this->request->param('check_type', 'full')); if ($articleId <= 0) { return jsonError('article_id required'); @@ -53,8 +55,12 @@ class Plagiarism extends Base $localPath = $fileUrl !== '' ? $svc->resolveFileUrlToLocal($fileUrl) : $svc->locateArticleManuscript($articleId); - $checkId = $svc->submit($articleId, $localPath, $editorId, 'manual'); - return jsonSuccess(['check_id' => $checkId]); + if (strtolower($checkType) === 'both') { + $ids = $svc->submitBoth($articleId, $localPath, $editorId, 'manual'); + return jsonSuccess($ids); + } + $checkId = $svc->submit($articleId, $localPath, $editorId, 'manual', $checkType); + return jsonSuccess(['check_id' => $checkId, 'check_type' => strtolower($checkType) ?: 'full']); } catch (\Throwable $e) { return jsonError($e->getMessage()); } @@ -257,10 +263,14 @@ class Plagiarism extends Base 'similarity_score' => floatval($r['similarity_score']), 'tii_report_status' => (string)$r['tii_report_status'], 'has_pdf' => !empty($r['pdf_local_path']), + 'local_pdf_url' => $r['pdf_local_path'], 'has_viewer_url' => !empty($r['view_only_url']) && intval($r['view_only_url_expire']) > time(), 'attempts' => intval($r['attempts']), 'error_msg' => (string)$r['error_msg'], 'source_file_name' => (string)$r['source_file_name'], + 'check_type' => (string)($r['check_type'] ?? 'full'), + 'check_type_label' => $this->checkTypeLabel($r['check_type'] ?? 'full'), + 'derived_file_path'=> (string)($r['derived_file_path'] ?? 
''), 'trigger_source' => (string)$r['trigger_source'], 'triggered_by' => intval($r['triggered_by']), 'ctime' => intval($r['ctime']), @@ -268,6 +278,15 @@ class Plagiarism extends Base ]; } + private function checkTypeLabel($checkType) + { + $t = strtolower(trim((string) $checkType)); + if ($t === 'body_only' || $t === 'body') { + return '正文查重'; + } + return '全文查重'; + } + private function stateLabel($state) { $map = [ diff --git a/application/api/controller/UserFieldAi.php b/application/api/controller/UserFieldAi.php new file mode 100644 index 00000000..78680155 --- /dev/null +++ b/application/api/controller/UserFieldAi.php @@ -0,0 +1,92 @@ +request->param('force', 0)) === 1; + $delay = max(0, intval($this->request->param('delay', 1))); + + $svc = new UserFieldAiService(); + $started = $svc->startChain($force, $delay); + + return jsonSuccess([ + 'started' => $started, + 'queue' => UserFieldAiService::QUEUE_NAME, + 'force' => $force, + 'msg' => $started ? 'chain enqueued' : 'no pending users', + ]); + } + + /** + * 同步处理单个用户(不调队列)。 + */ + public function processOne() + { + $userId = intval($this->request->param('user_id', 0)); + $force = intval($this->request->param('force', 0)) === 1; + if ($userId <= 0) { + return jsonError('user_id required'); + } + + $svc = new UserFieldAiService(); + $result = $svc->processUser($userId, $force); + if (empty($result['ok'])) { + return jsonError(isset($result['error']) ? 
$result['error'] : 'failed'); + } + return jsonSuccess($result); + } + + /** + * 预览:是否满足条件、当前 field_ai 状态。 + */ + public function preview() + { + $userId = intval($this->request->param('user_id', 0)); + if ($userId <= 0) { + return jsonError('user_id required'); + } + + $svc = new UserFieldAiService(); + $svc->ensureReviewerInfoRow($userId); + $uri = Db::name('user_reviewer_info')->where('reviewer_id', $userId)->find(); + + return jsonSuccess([ + 'user_id' => $userId, + 'has_articles' => $svc->hasSubmittedArticles($userId), + 'profile_complete' => $svc->isReviewerProfileComplete($uri), + 'eligible' => $svc->isEligible($userId, $uri), + 'field_ai' => $uri ? (string) $uri['field_ai'] : '', + 'field_ai_status' => $uri ? intval($uri['field_ai_status']) : 0, + 'field_ai_utime' => $uri ? intval($uri['field_ai_utime']) : 0, + 'field_ai_status_text' => $this->statusLabel($uri ? intval($uri['field_ai_status']) : 0), + ]); + } + + private function statusLabel($status) + { + $map = [ + UserFieldAiService::STATUS_PENDING => 'pending', + UserFieldAiService::STATUS_DONE => 'done', + UserFieldAiService::STATUS_INSUFFICIENT => 'insufficient', + UserFieldAiService::STATUS_FAILED => 'failed', + ]; + return isset($map[$status]) ? $map[$status] : 'unknown'; + } +} diff --git a/application/api/job/PlagiarismPoll.php b/application/api/job/PlagiarismPoll.php index c4313d2a..f4e1b214 100644 --- a/application/api/job/PlagiarismPoll.php +++ b/application/api/job/PlagiarismPoll.php @@ -23,16 +23,16 @@ class PlagiarismPoll public function fire(Job $job, $data) { -// $checkId = isset($data['check_id']) ? intval($data['check_id']) : 0; -// $attempt = isset($data['attempt']) ? intval($data['attempt']) : 1; -// -// if ($checkId <= 0) { -// $job->delete(); -// return; -// } + $checkId = isset($data['check_id']) ? intval($data['check_id']) : 0; + $attempt = isset($data['attempt']) ? 
intval($data['attempt']) : 1; + + if ($checkId <= 0) { + $job->delete(); + return; + } $svc = new PlagiarismService(); $svc->log("PlagiarismPoll job is running"); -// $svc->runPollStatus($checkId, $attempt); + $svc->runPollStatus($checkId, $attempt); $job->delete(); } } diff --git a/application/api/job/UserFieldAiFill.php b/application/api/job/UserFieldAiFill.php new file mode 100644 index 00000000..acb0dcd6 --- /dev/null +++ b/application/api/job/UserFieldAiFill.php @@ -0,0 +1,35 @@ + 0) { + $svc->processUser($userId, $force); + } + $job->delete(); + + $delay = max(0, (int) (isset($data['delay']) ? $data['delay'] : 1)); + $svc->enqueueNextFieldAi($delay, $queue, $userId, $force); + } +} diff --git a/application/common/ArticleParserService.php b/application/common/ArticleParserService.php index 2d4619c8..2996ed01 100644 --- a/application/common/ArticleParserService.php +++ b/application/common/ArticleParserService.php @@ -1153,12 +1153,12 @@ class ArticleParserService } /** - * 提取 Word 文档中的参考文献列表(仅返回数组,不做入库) - * @return array 每条为一个参考文献的纯文本字符串 + * 按段落提取 Word 全文行(供正文裁切、参考文献识别等复用) + * @return array */ - public static function getReferencesFromWord($filePath): array + public static function collectParagraphLines($filePath): array { - $othis = new self($filePath) ; + $othis = new self($filePath); if (empty($othis->sections)) { return []; } @@ -1166,13 +1166,26 @@ class ArticleParserService $lines = []; foreach ($othis->sections as $section) { foreach ($section->getElements() as $element) { - $text = $othis->getTextFromElement($element); - $text = trim((string)$text); - if ($text === '') continue; - $lines[] = $text; + $text = trim((string) $othis->getTextFromElement($element)); + if ($text === '') { + continue; + } + if (!mb_check_encoding($text, 'UTF-8')) { + $text = mb_convert_encoding($text, 'UTF-8', 'GBK'); + } + $lines[] = preg_replace('/\s+/u', ' ', $text); } } + return $lines; + } + /** + * 提取 Word 文档中的参考文献列表(仅返回数组,不做入库) + * @return array 每条为一个参考文献的纯文本字符串 + */ + 
public static function getReferencesFromWord($filePath): array + { + $lines = self::collectParagraphLines($filePath); if (empty($lines)) { return []; } diff --git a/application/common/ManuscriptBodyExtractor.php b/application/common/ManuscriptBodyExtractor.php new file mode 100644 index 00000000..cd44e0fd --- /dev/null +++ b/application/common/ManuscriptBodyExtractor.php @@ -0,0 +1,242 @@ + */ + private $lines = []; + + /** + * @return array{path:string, rel_path:string, line_count:int, ref_start:int, body_start:int, warnings:array} + */ + public function buildBodyOnlyDocx($sourcePath, $articleId = 0) + { + $sourcePath = trim((string) $sourcePath); + if (!is_file($sourcePath) || !is_readable($sourcePath)) { + throw new Exception('Manuscript not readable: ' . $sourcePath); + } + $ext = strtolower(pathinfo($sourcePath, PATHINFO_EXTENSION)); + if ($ext !== 'docx') { + throw new Exception('body_only check requires DOCX manuscript, got: ' . $ext); + } + + $this->lines = ArticleParserService::collectParagraphLines($sourcePath); + if (empty($this->lines)) { + throw new Exception('No text extracted from manuscript'); + } + + $refStart = $this->findReferenceStartIndex(); + $bodyStart = $this->findBodyStartIndex(); + $warnings = []; + + if ($refStart < 0) { + $warnings[] = 'references_heading_not_found; using document end'; + $refStart = count($this->lines); + } + if ($bodyStart >= $refStart) { + throw new Exception('Could not locate main body (front matter may include entire document)'); + } + + $bodyLines = array_slice($this->lines, $bodyStart, $refStart - $bodyStart); + $bodyLines = $this->normalizeBodyLines($bodyLines); + if (count($bodyLines) < 3) { + throw new Exception('Body text too short after extraction (' . count($bodyLines) . ' paragraphs)'); + } + + $relPath = $this->writeBodyDocx($bodyLines, $articleId); + $rootDir = rtrim(ROOT_PATH ?: dirname(dirname(__DIR__)), '/\\'); + $absPath = $rootDir . DIRECTORY_SEPARATOR . 
str_replace(['/', '\\'], DIRECTORY_SEPARATOR, $relPath); + + return [ + 'path' => $absPath, + 'rel_path' => $relPath, + 'line_count' => count($bodyLines), + 'ref_start' => $refStart, + 'body_start' => $bodyStart, + 'warnings' => $warnings, + ]; + } + + private function findReferenceStartIndex() + { + $stopKeywords = [ + 'acknowledgements', 'acknowledgments', 'funding', 'appendix', 'supplementary', + 'conflict of interest', 'competing interests', 'author contributions', + '致谢', '基金', '附录', '补充材料', '利益冲突', '作者贡献', + ]; + + foreach ($this->lines as $i => $line) { + $t = trim($line); + if ($t === '') { + continue; + } + if (preg_match('/^\s*(references|reference|bibliography|参考文献|文献)\b\s*[::]?\s*/iu', $t)) { + return $i; + } + $lower = strtolower($t); + foreach ($stopKeywords as $sk) { + $skLower = strtolower($sk); + if ($lower === $skLower || $lower === $skLower . ':' || $lower === $skLower . ':') { + if ($i > count($this->lines) * 0.4) { + return $i; + } + } + } + } + return -1; + } + + private function findBodyStartIndex() + { + $n = count($this->lines); + $introIdx = -1; + $keywordsIdx = -1; + + for ($i = 0; $i < $n; $i++) { + $t = trim($this->lines[$i]); + if ($t === '') { + continue; + } + if ($introIdx < 0 && $this->isIntroductionHeading($t)) { + $introIdx = $i; + } + if ($keywordsIdx < 0 && preg_match('/^\s*keywords?\b\s*[::]?/iu', $t)) { + $keywordsIdx = $i; + } + } + + if ($introIdx >= 0) { + return $introIdx; + } + + if ($keywordsIdx >= 0) { + $afterKw = $this->indexAfterKeywordsBlock($keywordsIdx); + if ($afterKw < $n) { + return $afterKw; + } + } + + return $this->indexAfterFrontMatterFallback(); + } + + private function isIntroductionHeading($t) + { + if (preg_match('/^\s*(introduction|background|materials and methods|materials & methods|methods and materials)\b\s*[::]?/iu', $t)) { + return true; + } + if (preg_match('/^\s*(引言|前言|背景|材料与方法|资料与方法|研究方法)\b\s*[::]?/iu', $t)) { + return true; + } + if (preg_match('/^\s*1[\.\s、]+(introduction|引言|前言)\b/iu', $t)) 
{ + return true; + } + return false; + } + + private function indexAfterKeywordsBlock($kwIdx) + { + $n = count($this->lines); + for ($i = $kwIdx + 1; $i < $n; $i++) { + $t = trim($this->lines[$i]); + if ($t === '') { + continue; + } + if ($this->isIntroductionHeading($t)) { + return $i; + } + if (preg_match('/^\s*abstract\b/iu', $t)) { + continue; + } + if (mb_strlen($t) >= 30 && !$this->looksLikeAffiliationLine($t)) { + return $i; + } + } + return min($kwIdx + 1, $n - 1); + } + + private function indexAfterFrontMatterFallback() + { + $n = count($this->lines); + $maxSkip = min(20, (int) floor($n * 0.15)); + for ($i = 0; $i < $maxSkip && $i < $n; $i++) { + $t = trim($this->lines[$i]); + if ($t === '') { + continue; + } + if ($this->isIntroductionHeading($t)) { + return $i; + } + } + return min(8, max(0, $n - 1)); + } + + private function looksLikeAffiliationLine($t) + { + if (preg_match('/@|mailto:|correspond|univ|university|hospital|institute|department|^\d+[\s,,]/iu', $t)) { + return true; + } + if (preg_match('/^\s*abstract\b/iu', $t) || preg_match('/^\s*keywords?\b/iu', $t)) { + return true; + } + return false; + } + + /** + * @param array $bodyLines + * @return array + */ + private function normalizeBodyLines(array $bodyLines) + { + $out = []; + foreach ($bodyLines as $line) { + $line = trim($line); + if ($line === '') { + continue; + } + if (preg_match('/^\{\s*ADDIN\s+EN\.REFLIST\s*\}$/i', $line)) { + continue; + } + $out[] = $line; + } + return $out; + } + + /** + * @param array $bodyLines + */ + private function writeBodyDocx(array $bodyLines, $articleId) + { + $rootDir = rtrim(ROOT_PATH ?: dirname(dirname(__DIR__)), '/\\'); + $dir = $rootDir . DIRECTORY_SEPARATOR . self::BODY_SUBDIR; + if (!is_dir($dir)) { + @mkdir($dir, 0755, true); + } + + $name = sprintf('body_article_%d_%s.docx', intval($articleId), date('Ymd_His')); + $absPath = $dir . DIRECTORY_SEPARATOR . 
$name; + + $phpWord = new PhpWord(); + $section = $phpWord->addSection(); + foreach ($bodyLines as $line) { + $section->addText($line); + } + $writer = IOFactory::createWriter($phpWord, 'Word2007'); + $writer->save($absPath); + + if (!is_file($absPath) || filesize($absPath) < 200) { + throw new Exception('Failed to write body-only docx'); + } + + return self::BODY_SUBDIR . '/' . $name; + } +} diff --git a/application/common/PlagiarismService.php b/application/common/PlagiarismService.php index 63286e98..dff25596 100644 --- a/application/common/PlagiarismService.php +++ b/application/common/PlagiarismService.php @@ -16,7 +16,7 @@ use think\Exception; * PlagiarismRun → 创建 submission + 上传文件 → 入队 PlagiarismWaitIngest * PlagiarismWaitIngest → 单次 GET submission 状态;就绪则入队 PlagiarismTriggerSimilarity,否则延迟再入队 * PlagiarismTriggerSimilarity → PUT similarity → state=2(比对中),入队 PlagiarismPoll - * PlagiarismPoll → 轮询 similarity,完成后下载 PDF → state=3(完成) + * PlagiarismPoll → 轮询 similarity,完成后下载 PDF → state=3(完成);在线 viewer URL 按需 getReportUrl 调用 refreshViewerUrlFor * 任意环节抛异常 → state=4(失败),写 error_msg * * Worker:请用 `queue:work` 消费队列 **plagiarism**(整条链与轮询均在此队列;若此前单独监听 PlagiarismRun / PlagiarismPoll,需改为 plagiarism)。 @@ -31,6 +31,9 @@ class PlagiarismService /** Run / WaitIngest / TriggerSimilarity / Poll 共用队列名 */ const QUEUE_CHAIN = 'plagiarism'; + const CHECK_TYPE_FULL = 'full'; + const CHECK_TYPE_BODY = 'body_only'; + const JOB_WAIT_INGEST = 'app\\api\\job\\PlagiarismWaitIngest'; const JOB_TRIGGER_SIM = 'app\\api\\job\\PlagiarismTriggerSimilarity'; const JOB_POLL = 'app\\api\\job\\PlagiarismPoll'; @@ -61,38 +64,83 @@ class PlagiarismService * @param string $filePath 本地可读的 PDF/DOCX 绝对路径 * @param int $triggeredBy 触发人 user_id(手工触发时编辑后台的 user_id) * @param string $source 'manual' / 'auto_xxx' + * @param string $checkType full | body_only * @return int check_id */ - public function submit($articleId, $filePath, $triggeredBy = 0, $source = 'manual') + public function submit($articleId, 
$filePath, $triggeredBy = 0, $source = 'manual', $checkType = self::CHECK_TYPE_FULL) { if (!is_file($filePath) || !is_readable($filePath)) { throw new Exception("File not readable: {$filePath}"); } + $checkType = $this->normalizeCheckType($checkType); + $uploadPath = $filePath; + $derivedRel = ''; + $sourceName = basename($filePath); + + if ($checkType === self::CHECK_TYPE_BODY) { + $built = (new ManuscriptBodyExtractor())->buildBodyOnlyDocx($filePath, $articleId); + $uploadPath = $built['path']; + $derivedRel = (string) $built['rel_path']; + $sourceName = basename($uploadPath); + if (!empty($built['warnings'])) { + $this->log('body_only warnings check article=' . $articleId . ' ' . implode('; ', $built['warnings'])); + } + } + $journalId = (int) Db::name('article') ->where('article_id', $articleId) ->value('journal_id'); - $this->log("plagiarism submit is running"); + $this->log("plagiarism submit type={$checkType} article={$articleId}"); $now = time(); - $checkId = Db::name('plagiarism_check')->insertGetId([ + $row = [ 'article_id' => $articleId, 'journal_id' => $journalId, 'triggered_by' => $triggeredBy, 'trigger_source' => $source, - 'state' => 1, // 上传中 - 'source_file_name' => basename($filePath), - 'source_file_size' => filesize($filePath) ?: 0, + 'check_type' => $checkType, + 'state' => 1, + 'source_file_name' => $sourceName, + 'source_file_size' => filesize($uploadPath) ?: 0, 'ctime' => $now, 'utime' => $now, - ]); + ]; + if ($derivedRel !== '') { + $row['derived_file_path'] = $derivedRel; + } + $checkId = Db::name('plagiarism_check')->insertGetId($row); Queue::push( 'app\\api\\job\\PlagiarismRun', - ['check_id' => $checkId, 'file_path' => $filePath], + ['check_id' => $checkId, 'file_path' => $uploadPath], self::QUEUE_CHAIN ); - return (int)$checkId; + return (int) $checkId; + } + + /** + * 同时提交全文 + 正文两次查重 + * @return array{full:int, body_only:int} + */ + public function submitBoth($articleId, $filePath, $triggeredBy = 0, $source = 'manual') + { + return [ 
+ 'full' => $this->submit($articleId, $filePath, $triggeredBy, $source, self::CHECK_TYPE_FULL), + 'body_only' => $this->submit($articleId, $filePath, $triggeredBy, $source, self::CHECK_TYPE_BODY), + ]; + } + + private function normalizeCheckType($checkType) + { + $t = strtolower(trim((string) $checkType)); + if ($t === '' || $t === self::CHECK_TYPE_FULL || $t === 'full') { + return self::CHECK_TYPE_FULL; + } + if ($t === self::CHECK_TYPE_BODY || $t === 'body' || $t === 'bodyonly') { + return self::CHECK_TYPE_BODY; + } + throw new Exception('invalid check_type, use full or body_only'); } /** @@ -252,7 +300,7 @@ class PlagiarismService return; } -// try { + try { $tii = new TurnitinService(); $statusResp = $tii->getSimilarityStatus($check['tii_submission_id']); $status = isset($statusResp['status']) ? strtoupper($statusResp['status']) : ''; @@ -267,17 +315,13 @@ class PlagiarismService $score = isset($statusResp['overall_match_percentage']) ? floatval($statusResp['overall_match_percentage']) : 0; - // 下载 PDF + 取在线查看 URL $localPdf = $this->downloadAndStorePdf($tii, $check['tii_submission_id'], $checkId); - $viewerInfo = $this->refreshViewerUrl($tii, $check['tii_submission_id']); $this->updateCheck($checkId, [ - 'state' => 3, - 'similarity_score' => $score, - 'pdf_local_path' => $localPdf, - 'view_only_url' => $viewerInfo['url'], - 'view_only_url_expire' => $viewerInfo['expire'], - 'error_msg' => '', + 'state' => 3, + 'similarity_score' => $score, + 'pdf_local_path' => $localPdf, + 'error_msg' => '', ]); return; } @@ -288,7 +332,6 @@ class PlagiarismService return; } - // PROCESSING 或其它中间态:继续轮询 if ($attempt >= self::MAX_POLL_ATTEMPTS) { $this->markFailed($checkId, '[poll] timeout after ' . $attempt . 
' attempts'); return; @@ -299,28 +342,27 @@ class PlagiarismService ['check_id' => $checkId, 'attempt' => $attempt + 1], self::QUEUE_CHAIN ); -// } catch (\Throwable $e) { -// // 网络抖动不要直接 fail,给一定容错次数 -// if ($attempt < self::MAX_POLL_ATTEMPTS) { -// Queue::later( -// self::POLL_INTERVAL, -// self::JOB_POLL, -// ['check_id' => $checkId, 'attempt' => $attempt + 1], -// self::QUEUE_CHAIN -// ); -// $this->updateCheck($checkId, [ -// 'attempts' => $attempt, -// 'error_msg' => '[poll] transient: ' . $e->getMessage(), -// ]); -// return; -// } -// $this->markFailed($checkId, '[poll] exhausted: ' . $e->getMessage()); -// throw $e; -// } + } catch (\Throwable $e) { + if ($attempt < self::MAX_POLL_ATTEMPTS) { + Queue::later( + self::POLL_INTERVAL, + self::JOB_POLL, + ['check_id' => $checkId, 'attempt' => $attempt + 1], + self::QUEUE_CHAIN + ); + $this->updateCheck($checkId, [ + 'attempts' => $attempt, + 'error_msg' => '[poll] transient: ' . $e->getMessage(), + ]); + return; + } + $this->markFailed($checkId, '[poll] exhausted: ' . $e->getMessage()); + throw $e; + } } /** - * 重新生成在线查看 URL(已有的过期了用) + * 按需获取/刷新 Turnitin 在线报告 URL(与 poll 解耦,避免 viewer-url 失败拖死查重完成)。 * * @return array{url:string, expire:int, local_pdf:string} */ @@ -345,6 +387,9 @@ class PlagiarismService // ---------- 内部 ---------- + /** + * 调用 Turnitin POST viewer-url;仅由 refreshViewerUrlFor / getReportUrl 触发。 + */ private function refreshViewerUrl($tii, $submissionId) { $resp = $tii->getViewerUrl($submissionId); diff --git a/application/common/UserFieldAiService.php b/application/common/UserFieldAiService.php new file mode 100644 index 00000000..672e43ed --- /dev/null +++ b/application/common/UserFieldAiService.php @@ -0,0 +1,404 @@ +logFile = ROOT_PATH . 'runtime' . DS . 
'user_field_ai.log'; + } + + /** + * 启动链式处理(从 user_id=0 之后找第一个待处理用户)。 + * + * @param bool $force true 时重算已生成用户 + * @return bool 是否已推入首条 job + */ + public function startChain($force = false, $delay = 1, $queue = '') + { + return $this->enqueueNextFieldAi($delay, $queue, 0, $force); + } + + /** + * 链式:找 user_id > $afterUserId 的下一位待处理用户并入队。 + */ + public function enqueueNextFieldAi($delay = 1, $queue = '', $afterUserId = 0, $force = false) + { + if ($queue === '') { + $queue = self::QUEUE_NAME; + } + $afterUserId = intval($afterUserId); + $userId = $this->findNextPendingUserId($afterUserId, $force); + if ($userId <= 0) { + $this->log('[FieldAi] chain finished after user_id=' . $afterUserId . ' force=' . ($force ? '1' : '0')); + return false; + } + + $data = [ + 'user_id' => $userId, + 'queue' => $queue, + 'force' => $force ? 1 : 0, + ]; + $jobClass = 'app\\api\\job\\UserFieldAiFill@fire'; + if ($delay > 0) { + Queue::later($delay, $jobClass, $data, $queue); + } else { + Queue::push($jobClass, $data, $queue); + } + $this->log('[FieldAi] enqueued user_id=' . $userId . ' queue=' . 
$queue); + return true; + } + + /** + * 处理单个用户(队列 Job 或同步调试)。 + * + * @return array{ok:bool, skipped?:bool, insufficient?:bool, field_ai?:string, error?:string} + */ + public function processUser($userId, $force = false) + { + $userId = intval($userId); + if ($userId <= 0) { + return ['ok' => false, 'error' => 'invalid user_id']; + } + + $this->ensureReviewerInfoRow($userId); + $uri = Db::name('user_reviewer_info')->where('reviewer_id', $userId)->find(); + if (!$uri) { + return ['ok' => false, 'error' => 'reviewer_info missing']; + } + + if (!$force && intval($uri['field_ai_status']) === self::STATUS_DONE && trim((string)$uri['field_ai']) !== '') { + return ['ok' => true, 'skipped' => true, 'field_ai' => (string)$uri['field_ai']]; + } + + if (!$this->isEligible($userId, $uri)) { + $this->updateFieldAi($userId, '', self::STATUS_INSUFFICIENT, 'insufficient profile/articles'); + return ['ok' => true, 'insufficient' => true]; + } + + try { + $context = $this->buildContext($userId, $uri); + $fieldAi = $this->summarizeWithLlm($context); + if ($fieldAi === '') { + throw new Exception('LLM returned empty field'); + } + $this->updateFieldAi($userId, $fieldAi, self::STATUS_DONE, ''); + return ['ok' => true, 'field_ai' => $fieldAi]; + } catch (\Throwable $e) { + $this->updateFieldAi($userId, '', self::STATUS_FAILED, mb_substr($e->getMessage(), 0, 500)); + $this->log('[FieldAi] user_id=' . $userId . ' fail: ' . 
$e->getMessage()); + return ['ok' => false, 'error' => $e->getMessage()]; + } + } + + /** + * 是否满足「可总结」:有投稿 或 审稿人资料较全。 + */ + public function isEligible($userId, $uri = null) + { + if ($this->hasSubmittedArticles($userId)) { + return true; + } + if ($uri === null) { + $uri = Db::name('user_reviewer_info')->where('reviewer_id', $userId)->find(); + } + return $this->isReviewerProfileComplete($uri); + } + + public function hasSubmittedArticles($userId) + { + $n = Db::name('article') + ->where('user_id', intval($userId)) + ->where('title', '<>', '') + ->count(); + return $n > 0; + } + + /** + * 审稿人资料字段填充数达到阈值视为「较全」。 + */ + public function isReviewerProfileComplete($uri) + { + if (!$uri || !is_array($uri)) { + return false; + } + $minFilled = max(3, (int) Env::get('user_field_ai.min_profile_fields', 4)); + $keys = ['field', 'company', 'country', 'technical', 'introduction', 'department', 'website']; + $filled = 0; + foreach ($keys as $k) { + if (!empty($uri[$k]) && trim((string)$uri[$k]) !== '') { + $filled++; + } + } + if (!empty($uri['major']) && trim((string)$uri['major']) !== '' && trim((string)$uri['major']) !== '0') { + $filled++; + } + $majorCount = Db::name('major_to_user')->where('user_id', intval($uri['reviewer_id']))->where('state', 0)->count(); + if ($majorCount > 0) { + $filled++; + } + return $filled >= $minFilled; + } + + private function findNextPendingUserId($afterUserId, $force) + { + $batch = max(20, (int) Env::get('user_field_ai.scan_batch', 80)); + $cursor = intval($afterUserId); + + while (true) { + $query = Db::name('user')->alias('u') + ->leftJoin('t_user_reviewer_info uri', 'uri.reviewer_id = u.user_id') + ->where('u.user_id', '>', $cursor); + if (!$force) { + $query->where(function ($q) { + $q->where('uri.field_ai_status', self::STATUS_PENDING) + ->whereOr('uri.field_ai_status', self::STATUS_FAILED) + ->whereOr('uri.reviewer_info_id', 'null'); + }); + } + $ids = $query->order('u.user_id asc')->limit($batch)->column('u.user_id'); + + if 
(empty($ids)) { + return 0; + } + + foreach ($ids as $uid) { + $uid = intval($uid); + $cursor = $uid; + $this->ensureReviewerInfoRow($uid); + $uri = Db::name('user_reviewer_info')->where('reviewer_id', $uid)->find(); + if (!$force && intval($uri['field_ai_status']) === self::STATUS_DONE) { + continue; + } + if (!$force && intval($uri['field_ai_status']) === self::STATUS_INSUFFICIENT) { + continue; + } + if ($this->isEligible($uid, $uri)) { + return $uid; + } + if (!$force) { + $this->updateFieldAi($uid, '', self::STATUS_INSUFFICIENT, 'auto skip: insufficient data'); + } + } + } + } + + private function buildContext($userId, array $uri) + { + $user = Db::name('user')->where('user_id', $userId)->field('user_id,realname,email,account')->find(); + $majorTitles = $this->resolveMajorTitles($userId, $uri); + + $maxArticles = max(1, min(10, (int) Env::get('user_field_ai.max_articles', 5))); + $articles = Db::name('article') + ->where('user_id', $userId) + ->where('title', '<>', '') + ->order('article_id desc') + ->limit($maxArticles) + ->field('article_id,title,keywords,abstrart,journal_id,ctime') + ->select(); + + $journalNames = []; + if (!empty($articles)) { + $jids = array_unique(array_filter(array_column($articles, 'journal_id'))); + if (!empty($jids)) { + $journalNames = Db::name('journal')->where('journal_id', 'in', $jids)->column('title', 'journal_id'); + } + } + + $articleBlocks = []; + foreach ($articles as $a) { + $jid = intval($a['journal_id']); + $articleBlocks[] = [ + 'title' => (string) $a['title'], + 'journal' => isset($journalNames[$jid]) ? (string) $journalNames[$jid] : '', + 'keywords' => (string) ($a['keywords'] ?? ''), + 'abstract' => mb_substr(trim((string) ($a['abstrart'] ?? '')), 0, 800), + ]; + } + + return [ + 'user' => [ + 'realname' => $user ? (string) $user['realname'] : '', + 'email' => $user ? (string) $user['email'] : '', + ], + 'profile' => [ + 'field' => trim((string) ($uri['field'] ?? 
'')), + 'technical' => trim((string) ($uri['technical'] ?? '')), + 'company' => trim((string) ($uri['company'] ?? '')), + 'department' => trim((string) ($uri['department'] ?? '')), + 'country' => trim((string) ($uri['country'] ?? '')), + 'introduction' => mb_substr(trim((string) ($uri['introduction'] ?? '')), 0, 1200), + 'website' => trim((string) ($uri['website'] ?? '')), + 'majors' => $majorTitles, + ], + 'articles' => $articleBlocks, + ]; + } + + private function resolveMajorTitles($userId, array $uri) + { + $titles = []; + $ids = Db::name('major_to_user')->where('user_id', $userId)->where('state', 0)->column('major_id'); + if (!empty($ids)) { + $titles = Db::name('reviewer_major')->where('major_id', 'in', $ids)->where('state', 0)->column('title'); + } + if (empty($titles) && !empty($uri['major'])) { + $legacy = array_filter(array_map('intval', explode(',', (string) $uri['major']))); + if (!empty($legacy)) { + $titles = Db::name('reviewer_major')->where('major_id', 'in', $legacy)->column('title'); + } + } + return array_values(array_unique(array_filter(array_map('trim', $titles)))); + } + + private function summarizeWithLlm(array $context) + { + $url = trim((string) Env::get('user_field_ai.chat_url', Env::get('expert_country_chat_url', Env::get('citation_chat_url', '')))); + $model = trim((string) Env::get('user_field_ai.chat_model', Env::get('expert_country_chat_model', Env::get('citation_chat_model', 'gpt-4.1')))); + $apiKey = trim((string) Env::get('user_field_ai.chat_api_key', Env::get('expert_country_chat_api_key', Env::get('citation_chat_api_key', '')))); + if ($url === '' || $model === '') { + throw new Exception('user_field_ai chat not configured (chat_url / chat_model)'); + } + + $payloadJson = json_encode($context, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES); + $messages = [ + [ + 'role' => 'system', + 'content' => '你是学术领域分类助手。根据用户的投稿与个人资料,用简体中文给出该用户最主要的研究领域总结。' + . '要求:精确、简洁,1~3 个中文领域词或短短语,用顿号分隔;不要解释、不要英文、不要 JSON 以外的多余文字。' + . 
'只输出 JSON:{"field_ai":"..."}。', + ], + [ + 'role' => 'user', + 'content' => "请根据以下 JSON 资料总结该用户的主要研究领域:\n" . $payloadJson, + ], + ]; + + $body = [ + 'model' => $model, + 'temperature' => 0.2, + 'messages' => $messages, + ]; + + $ch = curl_init(); + curl_setopt_array($ch, [ + CURLOPT_URL => $url, + CURLOPT_POST => true, + CURLOPT_POSTFIELDS => json_encode($body, JSON_UNESCAPED_UNICODE), + CURLOPT_RETURNTRANSFER => true, + CURLOPT_CONNECTTIMEOUT => 15, + CURLOPT_TIMEOUT => max(30, (int) Env::get('user_field_ai.timeout', 90)), + CURLOPT_HTTPHEADER => array_filter([ + 'Content-Type: application/json', + $apiKey !== '' ? 'Authorization: Bearer ' . $apiKey : null, + ]), + ]); + $raw = curl_exec($ch); + $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE); + $err = curl_error($ch); + curl_close($ch); + + if ($raw === false) { + throw new Exception('LLM curl error: ' . $err); + } + if ($code < 200 || $code >= 300) { + throw new Exception('LLM HTTP ' . $code . ': ' . mb_substr((string) $raw, 0, 400)); + } + + $data = json_decode($raw, true); + $content = ''; + if (is_array($data) && isset($data['choices'][0]['message']['content'])) { + $content = trim((string) $data['choices'][0]['message']['content']); + } elseif (is_string($raw)) { + $content = trim($raw); + } + + $fieldAi = $this->parseFieldAiFromContent($content); + if ($fieldAi === '' && $content !== '') { + $fieldAi = $this->cleanFieldAiText($content); + } + return $fieldAi; + } + + private function parseFieldAiFromContent($content) + { + $content = trim((string) $content); + if ($content === '') { + return ''; + } + $content = preg_replace('/^```[a-zA-Z]*\s*|```$/m', '', $content); + if (preg_match('/\{.*\}/s', $content, $m)) { + $obj = json_decode($m[0], true); + if (is_array($obj) && !empty($obj['field_ai'])) { + return $this->cleanFieldAiText((string) $obj['field_ai']); + } + } + $obj = json_decode($content, true); + if (is_array($obj) && !empty($obj['field_ai'])) { + return $this->cleanFieldAiText((string) 
$obj['field_ai']); + } + return ''; + } + + private function cleanFieldAiText($text) + { + $text = trim((string) $text); + $text = trim($text, "\"' \t\n\r"); + $text = preg_replace('/\s+/u', '', $text); + if (mb_strlen($text) > 200) { + $text = mb_substr($text, 0, 200); + } + return $text; + } + + public function ensureReviewerInfoRow($userId) + { + $exists = Db::name('user_reviewer_info')->where('reviewer_id', $userId)->find(); + if ($exists) { + return; + } + Db::name('user_reviewer_info')->insert([ + 'reviewer_id' => $userId, + 'ctime' => time(), + 'state' => 0, + ]); + } + + private function updateFieldAi($userId, $fieldAi, $status, $note) + { + $data = [ + 'field_ai' => mb_substr(trim((string) $fieldAi), 0, 512), + 'field_ai_status' => intval($status), + 'field_ai_utime' => time(), + ]; + Db::name('user_reviewer_info')->where('reviewer_id', $userId)->update($data); + if ($note !== '') { + $this->log('[FieldAi] user_id=' . $userId . ' status=' . $status . ' note=' . $note); + } + } + + public function log($msg) + { + $line = date('Y-m-d H:i:s') . ' ' . $msg . 
PHP_EOL; + @file_put_contents($this->logFile, $line, FILE_APPEND); + } +} diff --git a/sql/add_field_ai_to_user_reviewer_info.sql b/sql/add_field_ai_to_user_reviewer_info.sql new file mode 100644 index 00000000..78dfea26 --- /dev/null +++ b/sql/add_field_ai_to_user_reviewer_info.sql @@ -0,0 +1,5 @@ +-- 用户主领域 AI 总结(中文),由队列链式任务写入 +ALTER TABLE `t_user_reviewer_info` + ADD COLUMN `field_ai` VARCHAR(512) NOT NULL DEFAULT '' COMMENT 'AI总结的主要研究领域(中文)' AFTER `field`, + ADD COLUMN `field_ai_status` TINYINT NOT NULL DEFAULT 0 COMMENT '0待处理 1已生成 2资料不足跳过 3失败' AFTER `field_ai`, + ADD COLUMN `field_ai_utime` INT NOT NULL DEFAULT 0 COMMENT 'field_ai 更新时间' AFTER `field_ai_status`; diff --git a/sql/add_plagiarism_check_type.sql b/sql/add_plagiarism_check_type.sql new file mode 100644 index 00000000..f24ab004 --- /dev/null +++ b/sql/add_plagiarism_check_type.sql @@ -0,0 +1,4 @@ +-- 查重类型:全文 full / 正文 body_only(裁切题名、作者、参考文献后上传) +ALTER TABLE `t_plagiarism_check` + ADD COLUMN `check_type` VARCHAR(16) NOT NULL DEFAULT 'full' COMMENT 'full=全文 body_only=仅正文' AFTER `trigger_source`, + ADD COLUMN `derived_file_path` VARCHAR(255) NOT NULL DEFAULT '' COMMENT 'body_only 时生成的临时稿件相对路径' AFTER `source_file_size`;