自动查重

This commit is contained in:
wangjinlei
2026-05-13 18:02:09 +08:00
parent fa878334cd
commit f99dbc6397
6 changed files with 411 additions and 91 deletions

View File

@@ -12,10 +12,14 @@ use think\Exception;
* 并维护 t_plagiarism_check 状态机。
*
* 状态流:
* submit() → state=1上传中入队 PlagiarismRun
* PlagiarismRun.fire上传 + 触发 similarity → state=2比对中入队 PlagiarismPoll
* PlagiarismPoll.fire → 轮询 status完成后下载 PDF → state=3完成
* 任意环节抛异常 → state=4失败写 error_msg
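* submit() → state=1 (uploading); enqueues PlagiarismRun
* PlagiarismRun → creates the submission and uploads the file, then enqueues PlagiarismWaitIngest
* PlagiarismWaitIngest → a single GET of the submission status; when ready it enqueues PlagiarismTriggerSimilarity, otherwise it re-enqueues itself with a delay
* PlagiarismTriggerSimilarity → PUT similarity → state=2 (comparing); enqueues PlagiarismPoll
* PlagiarismPoll → polls similarity; once it is complete, downloads the PDF → state=3 (done)
* Any step throwing an exception → state=4 (failed); error_msg is written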
*
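* Worker: consume the **plagiarism** queue with `queue:work` (the whole chain, including polling, runs on this queue; if you previously listened to PlagiarismRun / PlagiarismPoll separately, switch to plagiarism).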
*/
class PlagiarismService
{
@@ -24,6 +28,13 @@ class PlagiarismService
*/
const REPORT_DIR = 'public/plagiarism';
/** Queue name shared by the Run / WaitIngest / TriggerSimilarity / Poll jobs. */
const QUEUE_CHAIN = 'plagiarism';
/** Job class name passed to Queue::push()/Queue::later() for a single ingest-status check. */
const JOB_WAIT_INGEST = 'app\\api\\job\\PlagiarismWaitIngest';
/** Job class name passed to Queue::push() to trigger similarity once ingest reports ready. */
const JOB_TRIGGER_SIM = 'app\\api\\job\\PlagiarismTriggerSimilarity';
/** Job class name passed to Queue::later() for polling the similarity result. */
const JOB_POLL = 'app\\api\\job\\PlagiarismPoll';
/**
* Polling interval. Turnitin usually produces a result within 1-5 minutes, so polling once every 30 seconds is a reasonable pace.
*/
@@ -76,78 +87,160 @@ class PlagiarismService
'utime' => $now,
]);
$this->log("submit service act");
// 入队执行:上传 + 触发 similarity
Queue::push(
'app\\api\\job\\PlagiarismRun',
['check_id' => $checkId, 'file_path' => $filePath],
'PlagiarismRun'
self::QUEUE_CHAIN
);
return (int)$checkId;
}
/**
 * Job entry point: create the Turnitin submission and upload the file.
 *
 * Only the first link of the chain runs here. After the upload a delayed
 * PlagiarismWaitIngest job is scheduled on the shared chain queue; similarity
 * is not triggered by this method. Nothing is caught here, so any exception
 * propagates to the caller.
 *
 * @param int    $checkId  check_id of the plagiarism_check row to process
 * @param string $filePath path handed to TurnitinService::uploadFile(); its basename is sent as the file name
 * @throws Exception when createSubmission() returns no usable "id"
 */
public function runUploadOnly($checkId, $filePath)
{
    $check = $this->mustGetCheck($checkId);
    $this->log('runUploadOnly start check_id=' . $checkId);

    $tii = new TurnitinService();

    // Submission title comes from the article row; fall back to a placeholder when it is empty.
    $title = (string) Db::name('article')
        ->where('article_id', $check['article_id'])
        ->value('title');
    if ($title === '') {
        $title = 'Article #' . $check['article_id'];
    }

    // The triggering editor is sent as both owner and submitter.
    $editor = 'editor_' . $check['triggered_by'];
    $payload = [
        'title'     => mb_substr($title, 0, 250),
        'owner'     => $editor,
        'submitter' => $editor,
        'metadata'  => [
            'article_id' => (string) $check['article_id'],
            'check_id'   => (string) $check['check_id'],
        ],
    ];
    $created = $tii->createSubmission($payload);

    $submissionId = '';
    if (isset($created['id'])) {
        $submissionId = $created['id'];
    }
    if ($submissionId === '') {
        throw new Exception('Turnitin createSubmission returned empty id: ' . json_encode($created));
    }

    // Stored before the upload; the ingest and similarity steps read tii_submission_id from the row.
    $this->updateCheck($checkId, [
        'tii_submission_id' => $submissionId,
        'raw_response'      => json_encode($created, JSON_UNESCAPED_UNICODE),
    ]);

    $tii->uploadFile($submissionId, $filePath, basename($filePath));
    $this->log('runUploadOnly uploaded submission_id=' . $submissionId);

    // First ingest check is delayed rather than pushed immediately.
    Queue::later(
        $this->ingestChainFirstDelaySec(),
        self::JOB_WAIT_INGEST,
        ['check_id' => $checkId, 'attempt' => 1],
        self::QUEUE_CHAIN
    );
}
/**
 * One ingest-status check, called by the PlagiarismWaitIngest job.
 *
 * The method performs a single request and never loops or sleeps; waiting is
 * done by re-enqueueing PlagiarismWaitIngest with a delay. Outcomes:
 *  - no tii_submission_id on the row        → markFailed
 *  - request throws, attempts left          → re-enqueue with attempt + 1
 *  - request throws, attempts exhausted     → markFailed
 *  - parsed state has "failed"              → markFailed
 *  - parsed state has "ready"               → enqueue PlagiarismTriggerSimilarity
 *  - not ready, attempts exhausted          → markFailed (timeout)
 *  - not ready, attempts left               → re-enqueue with attempt + 1
 *
 * @param int $checkId check_id of the plagiarism_check row
 * @param int $attempt 1-based counter of ingest checks performed so far
 */
public function runIngestPollStep($checkId, $attempt = 1)
{
    $check = $this->mustGetCheck($checkId);
    if (empty($check['tii_submission_id'])) {
        $this->markFailed($checkId, '[ingest] tii_submission_id empty');
        return;
    }

    $maxAttempts = $this->ingestChainMaxAttempts();
    $interval = $this->ingestChainPollIntervalSec();
    $exhausted = $attempt >= $maxAttempts;

    // Schedules the next ingest check; shared by the error path and the "not ready yet" path.
    $requeue = function () use ($checkId, $attempt, $interval) {
        Queue::later(
            $interval,
            self::JOB_WAIT_INGEST,
            ['check_id' => $checkId, 'attempt' => $attempt + 1],
            self::QUEUE_CHAIN
        );
    };

    $tii = new TurnitinService();
    try {
        $state = $tii->parseSubmissionIngestState($check['tii_submission_id']);
    } catch (\Throwable $e) {
        // A failed request is retried until the attempt budget is used up.
        if ($exhausted) {
            $this->markFailed($checkId, '[ingest] request failed after ' . $attempt . ' tries: ' . $e->getMessage());
        } else {
            $requeue();
        }
        return;
    }

    if (!empty($state['failed'])) {
        $this->markFailed($checkId, '[ingest] submission failed status=' . $state['status'] . ' ' . $state['snippet']);
    } elseif (!empty($state['ready'])) {
        // Hand the current attempt over so the similarity step can resume the count on a 409.
        Queue::push(self::JOB_TRIGGER_SIM, ['check_id' => $checkId, 'ingest_attempt' => $attempt], self::QUEUE_CHAIN);
    } elseif ($exhausted) {
        $lastStatus = $state['status'];
        if ($lastStatus === '') {
            $lastStatus = '(empty)';
        }
        $this->markFailed($checkId, '[ingest] timeout last_status=' . $lastStatus);
    } else {
        $requeue();
    }
}
/**
 * Trigger the similarity report after ingest is ready, then schedule PlagiarismPoll.
 *
 * When the trigger call throws and the message looks like a 409 "not completed yet"
 * conflict, the check is not failed: PlagiarismWaitIngest is re-enqueued with the next
 * attempt number, unless that would exceed the ingest attempt limit. Any other
 * exception is rethrown unchanged.
 *
 * @param int $checkId       check_id of the plagiarism_check row
 * @param int $ingestAttempt attempt number received from WaitIngest; used to continue counting on a 409
 * @throws \Throwable any non-409 error raised by TurnitinService::triggerSimilarity()
 */
public function runTriggerSimilarityOnly($checkId, $ingestAttempt = 1)
{
    $check = $this->mustGetCheck($checkId);
    if (empty($check['tii_submission_id'])) {
        $this->markFailed($checkId, '[similarity] tii_submission_id empty');
        return;
    }

    $tii = new TurnitinService();
    $submissionId = $check['tii_submission_id'];

    try {
        $simResp = $tii->triggerSimilarity($submissionId);
    } catch (\Throwable $e) {
        $message = $e->getMessage();

        // The conflict is recognised from the exception text only: it must mention
        // 409/CONFLICT and one of the "not completed" phrases.
        $mentionsConflict = stripos($message, '409') !== false
            || stripos($message, 'CONFLICT') !== false;
        $mentionsNotReady = stripos($message, 'not been completed') !== false
            || stripos($message, 'completed yet') !== false;

        if (!($mentionsConflict && $mentionsNotReady)) {
            throw $e;
        }

        $nextAttempt = $ingestAttempt + 1;
        if ($nextAttempt > $this->ingestChainMaxAttempts()) {
            $this->markFailed($checkId, '[similarity] still not ready after ingest attempts: ' . $message);
            return;
        }

        // Go back to ingest polling, waiting at least 20 seconds.
        Queue::later(
            max($this->ingestChainPollIntervalSec(), 20),
            self::JOB_WAIT_INGEST,
            ['check_id' => $checkId, 'attempt' => $nextAttempt],
            self::QUEUE_CHAIN
        );
        return;
    }

    // Similarity accepted: state 2 is "comparing" in the check state flow.
    $this->updateCheck($checkId, [
        'state'             => 2,
        'tii_report_status' => 'PROCESSING',
        'raw_response'      => json_encode($simResp, JSON_UNESCAPED_UNICODE),
    ]);

    Queue::later(
        self::POLL_INTERVAL,
        self::JOB_POLL,
        ['check_id' => $checkId, 'attempt' => 1],
        self::QUEUE_CHAIN
    );
}
/**
 * Legacy entry point kept for existing callers.
 *
 * @deprecated Use runUploadOnly(). This method only delegates to it: the submission is
 *             created and uploaded once, and the ingest wait / similarity trigger / polling
 *             run as separate jobs on the chain queue. It must not wait for ingest or
 *             trigger similarity synchronously.
 *
 * As in the previous implementation, a failure is recorded on the check row with an
 * "[upload] " prefix before the exception is rethrown.
 *
 * @param int    $checkId  check_id of the plagiarism_check row
 * @param string $filePath path of the file to upload
 * @throws \Throwable whatever runUploadOnly() throws, after the check was marked failed
 */
public function runUploadAndTrigger($checkId, $filePath)
{
    try {
        // Single delegation only: running the old inline flow as well would create and
        // upload a second Turnitin submission and enqueue on the legacy 'PlagiarismPoll' queue.
        $this->runUploadOnly($checkId, $filePath);
    } catch (\Throwable $e) {
        $this->markFailed($checkId, '[upload] ' . $e->getMessage());
        throw $e;
    }
}
/**
@@ -204,18 +297,18 @@ class PlagiarismService
}
Queue::later(
self::POLL_INTERVAL,
'app\\api\\job\\PlagiarismPoll',
self::JOB_POLL,
['check_id' => $checkId, 'attempt' => $attempt + 1],
'plagiarism'
self::QUEUE_CHAIN
);
} catch (\Throwable $e) {
// 网络抖动不要直接 fail给一定容错次数
if ($attempt < self::MAX_POLL_ATTEMPTS) {
Queue::later(
self::POLL_INTERVAL,
'app\\api\\job\\PlagiarismPoll',
self::JOB_POLL,
['check_id' => $checkId, 'attempt' => $attempt + 1],
'plagiarism'
self::QUEUE_CHAIN
);
$this->updateCheck($checkId, [
'attempts' => $attempt,
@@ -328,15 +421,30 @@ class PlagiarismService
Db::name('plagiarism_check')->where('check_id', $checkId)->update($data);
}
/**
 * Mark a check as failed: writes state 4 and the error message to the check row.
 *
 * The message is cut to 1000 characters before it is stored.
 * NOTE(review): visibility is public, presumably so the queue job classes can record
 * failures themselves; confirm against the job implementations.
 *
 * @param int    $checkId check_id of the plagiarism_check row
 * @param string $errMsg  human-readable failure reason
 */
public function markFailed($checkId, $errMsg)
{
    $this->log('markFailed check_id=' . $checkId);
    $this->updateCheck($checkId, [
        'state'     => 4,
        // Cast first so a non-string message cannot trip mb_substr().
        'error_msg' => mb_substr((string) $errMsg, 0, 1000),
    ]);
}
/**
 * Delay in seconds before the first PlagiarismWaitIngest job runs after the upload.
 *
 * Read from the env key turnitin.ingest_chain_first_delay (default 10); values
 * below 3 are raised to 3.
 *
 * @return int
 */
private function ingestChainFirstDelaySec()
{
    $configured = (int) Env::get('turnitin.ingest_chain_first_delay', 10);

    return $configured < 3 ? 3 : $configured;
}
/**
 * Delay in seconds between two ingest-status checks.
 *
 * Read from the env key turnitin.ingest_chain_poll_interval (default 15); values
 * below 5 are raised to 5.
 *
 * @return int
 */
private function ingestChainPollIntervalSec()
{
    $configured = (int) Env::get('turnitin.ingest_chain_poll_interval', 15);

    return $configured < 5 ? 5 : $configured;
}
/**
 * Maximum number of ingest checks before the check is marked failed.
 *
 * Read from the env key turnitin.ingest_chain_max_attempts (default 80); values
 * below 10 are raised to 10.
 *
 * @return int
 */
private function ingestChainMaxAttempts()
{
    $configured = (int) Env::get('turnitin.ingest_chain_max_attempts', 80);

    return $configured < 10 ? 10 : $configured;
}
/**
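* Finds the local absolute path of the submission's main manuscript file in t_article_file.
* In this system file_url may be a URL or a relative path; the caller is responsible for making sure it is readable.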