自动查重

This commit is contained in:
wangjinlei
2026-05-20 11:58:10 +08:00
parent 53e6ddbd9e
commit cfa3f791f4
11 changed files with 938 additions and 58 deletions

View File

@@ -16,7 +16,7 @@ use think\Exception;
* PlagiarismRun → 创建 submission + 上传文件 → 入队 PlagiarismWaitIngest
* PlagiarismWaitIngest → 单次 GET submission 状态;就绪则入队 PlagiarismTriggerSimilarity,否则延迟再入队
* PlagiarismTriggerSimilarity → PUT similarity → state=2(比对中),入队 PlagiarismPoll
* PlagiarismPoll → 轮询 similarity完成后下载 PDF → state=3完成
* PlagiarismPoll → 轮询 similarity完成后下载 PDF → state=3完成;在线 viewer URL 按需 getReportUrl 调用 refreshViewerUrlFor
* 任意环节抛异常 → state=4(失败),写 error_msg
*
* Worker:请用 `queue:work` 消费队列 **plagiarism**(整条链与轮询均在此队列;若此前单独监听 PlagiarismRun / PlagiarismPoll,需改为 plagiarism
@@ -31,6 +31,9 @@ class PlagiarismService
/** Run / WaitIngest / TriggerSimilarity / Poll 共用队列名 */
const QUEUE_CHAIN = 'plagiarism';
const CHECK_TYPE_FULL = 'full';
const CHECK_TYPE_BODY = 'body_only';
const JOB_WAIT_INGEST = 'app\\api\\job\\PlagiarismWaitIngest';
const JOB_TRIGGER_SIM = 'app\\api\\job\\PlagiarismTriggerSimilarity';
const JOB_POLL = 'app\\api\\job\\PlagiarismPoll';
@@ -61,38 +64,83 @@ class PlagiarismService
* @param string $filePath 本地可读的 PDF/DOCX 绝对路径
* @param int $triggeredBy 触发人 user_id手工触发时编辑后台的 user_id
* @param string $source 'manual' / 'auto_xxx'
* @param string $checkType full | body_only
* @return int check_id
*/
public function submit($articleId, $filePath, $triggeredBy = 0, $source = 'manual')
public function submit($articleId, $filePath, $triggeredBy = 0, $source = 'manual', $checkType = self::CHECK_TYPE_FULL)
{
if (!is_file($filePath) || !is_readable($filePath)) {
throw new Exception("File not readable: {$filePath}");
}
$checkType = $this->normalizeCheckType($checkType);
$uploadPath = $filePath;
$derivedRel = '';
$sourceName = basename($filePath);
if ($checkType === self::CHECK_TYPE_BODY) {
$built = (new ManuscriptBodyExtractor())->buildBodyOnlyDocx($filePath, $articleId);
$uploadPath = $built['path'];
$derivedRel = (string) $built['rel_path'];
$sourceName = basename($uploadPath);
if (!empty($built['warnings'])) {
$this->log('body_only warnings check article=' . $articleId . ' ' . implode('; ', $built['warnings']));
}
}
$journalId = (int) Db::name('article')
->where('article_id', $articleId)
->value('journal_id');
$this->log("plagiarism submit is running");
$this->log("plagiarism submit type={$checkType} article={$articleId}");
$now = time();
$checkId = Db::name('plagiarism_check')->insertGetId([
$row = [
'article_id' => $articleId,
'journal_id' => $journalId,
'triggered_by' => $triggeredBy,
'trigger_source' => $source,
'state' => 1, // 上传中
'source_file_name' => basename($filePath),
'source_file_size' => filesize($filePath) ?: 0,
'check_type' => $checkType,
'state' => 1,
'source_file_name' => $sourceName,
'source_file_size' => filesize($uploadPath) ?: 0,
'ctime' => $now,
'utime' => $now,
]);
];
if ($derivedRel !== '') {
$row['derived_file_path'] = $derivedRel;
}
$checkId = Db::name('plagiarism_check')->insertGetId($row);
Queue::push(
'app\\api\\job\\PlagiarismRun',
['check_id' => $checkId, 'file_path' => $filePath],
['check_id' => $checkId, 'file_path' => $uploadPath],
self::QUEUE_CHAIN
);
return (int)$checkId;
return (int) $checkId;
}
/**
 * Submit two plagiarism checks for the same manuscript: the whole document
 * first, then the body-only variant.
 *
 * Both submissions go through submit() with identical article, file,
 * trigger user and source; only the check type differs. The full-text
 * submission is issued first, so if the body-only submission throws, the
 * full-text one has already been made.
 *
 * @param int    $articleId   article the file belongs to
 * @param string $filePath    local manuscript file handed to submit()
 * @param int    $triggeredBy user_id of whoever triggered the checks
 * @param string $source      trigger source, e.g. 'manual'
 * @return array{full:int, body_only:int} check_id of each submission
 */
public function submitBoth($articleId, $filePath, $triggeredBy = 0, $source = 'manual')
{
    $fullCheckId = $this->submit($articleId, $filePath, $triggeredBy, $source, self::CHECK_TYPE_FULL);
    $bodyCheckId = $this->submit($articleId, $filePath, $triggeredBy, $source, self::CHECK_TYPE_BODY);

    return [
        'full'      => $fullCheckId,
        'body_only' => $bodyCheckId,
    ];
}
/**
 * Map a caller-supplied check type onto one of the CHECK_TYPE_* constants.
 *
 * The input is cast to string, trimmed and lower-cased before matching.
 * An empty value falls back to the full-text check; 'body' and 'bodyonly'
 * are accepted as aliases of the body-only check.
 *
 * @param mixed $checkType raw value from the caller
 * @return string self::CHECK_TYPE_FULL or self::CHECK_TYPE_BODY
 * @throws Exception when the value matches none of the accepted spellings
 */
private function normalizeCheckType($checkType)
{
    $normalized = strtolower(trim((string) $checkType));

    // Accepted spellings per target type; matched strictly as strings.
    $fullAliases = ['', self::CHECK_TYPE_FULL, 'full'];
    $bodyAliases = [self::CHECK_TYPE_BODY, 'body', 'bodyonly'];

    if (in_array($normalized, $fullAliases, true)) {
        return self::CHECK_TYPE_FULL;
    }
    if (!in_array($normalized, $bodyAliases, true)) {
        throw new Exception('invalid check_type, use full or body_only');
    }

    return self::CHECK_TYPE_BODY;
}
/**
@@ -252,7 +300,7 @@ class PlagiarismService
return;
}
// try {
try {
$tii = new TurnitinService();
$statusResp = $tii->getSimilarityStatus($check['tii_submission_id']);
$status = isset($statusResp['status']) ? strtoupper($statusResp['status']) : '';
@@ -267,17 +315,13 @@ class PlagiarismService
$score = isset($statusResp['overall_match_percentage'])
? floatval($statusResp['overall_match_percentage']) : 0;
// 下载 PDF + 取在线查看 URL
$localPdf = $this->downloadAndStorePdf($tii, $check['tii_submission_id'], $checkId);
$viewerInfo = $this->refreshViewerUrl($tii, $check['tii_submission_id']);
$this->updateCheck($checkId, [
'state' => 3,
'similarity_score' => $score,
'pdf_local_path' => $localPdf,
'view_only_url' => $viewerInfo['url'],
'view_only_url_expire' => $viewerInfo['expire'],
'error_msg' => '',
'state' => 3,
'similarity_score' => $score,
'pdf_local_path' => $localPdf,
'error_msg' => '',
]);
return;
}
@@ -288,7 +332,6 @@ class PlagiarismService
return;
}
// PROCESSING 或其它中间态:继续轮询
if ($attempt >= self::MAX_POLL_ATTEMPTS) {
$this->markFailed($checkId, '[poll] timeout after ' . $attempt . ' attempts');
return;
@@ -299,28 +342,27 @@ class PlagiarismService
['check_id' => $checkId, 'attempt' => $attempt + 1],
self::QUEUE_CHAIN
);
// } catch (\Throwable $e) {
// // 网络抖动不要直接 fail给一定容错次数
// if ($attempt < self::MAX_POLL_ATTEMPTS) {
// Queue::later(
// self::POLL_INTERVAL,
// self::JOB_POLL,
// ['check_id' => $checkId, 'attempt' => $attempt + 1],
// self::QUEUE_CHAIN
// );
// $this->updateCheck($checkId, [
// 'attempts' => $attempt,
// 'error_msg' => '[poll] transient: ' . $e->getMessage(),
// ]);
// return;
// }
// $this->markFailed($checkId, '[poll] exhausted: ' . $e->getMessage());
// throw $e;
// }
} catch (\Throwable $e) {
if ($attempt < self::MAX_POLL_ATTEMPTS) {
Queue::later(
self::POLL_INTERVAL,
self::JOB_POLL,
['check_id' => $checkId, 'attempt' => $attempt + 1],
self::QUEUE_CHAIN
);
$this->updateCheck($checkId, [
'attempts' => $attempt,
'error_msg' => '[poll] transient: ' . $e->getMessage(),
]);
return;
}
$this->markFailed($checkId, '[poll] exhausted: ' . $e->getMessage());
throw $e;
}
}
/**
* 重新生成在线查看 URL已有的过期了用
* 按需获取/刷新 Turnitin 在线报告 URL(与 poll 解耦,避免 viewer-url 失败拖死查重完成)。
*
* @return array{url:string, expire:int, local_pdf:string}
*/
@@ -345,6 +387,9 @@ class PlagiarismService
// ---------- 内部 ----------
/**
* 调用 Turnitin POST viewer-url,仅由 refreshViewerUrlFor / getReportUrl 触发。
*/
private function refreshViewerUrl($tii, $submissionId)
{
$resp = $tii->getViewerUrl($submissionId);