自动查重
This commit is contained in:
@@ -16,7 +16,7 @@ use think\Exception;
|
||||
* PlagiarismRun → 创建 submission + 上传文件 → 入队 PlagiarismWaitIngest
|
||||
* PlagiarismWaitIngest → 单次 GET submission 状态;就绪则入队 PlagiarismTriggerSimilarity,否则延迟再入队
|
||||
* PlagiarismTriggerSimilarity → PUT similarity → state=2(比对中),入队 PlagiarismPoll
|
||||
* PlagiarismPoll → 轮询 similarity,完成后下载 PDF → state=3(完成)
|
||||
* PlagiarismPoll → 轮询 similarity,完成后下载 PDF → state=3(完成);在线 viewer URL 按需 getReportUrl 调用 refreshViewerUrlFor
|
||||
* 任意环节抛异常 → state=4(失败),写 error_msg
|
||||
*
|
||||
* Worker:请用 `queue:work` 消费队列 **plagiarism**(整条链与轮询均在此队列;若此前单独监听 PlagiarismRun / PlagiarismPoll,需改为 plagiarism)。
|
||||
@@ -31,6 +31,9 @@ class PlagiarismService
|
||||
/** Run / WaitIngest / TriggerSimilarity / Poll 共用队列名 */
|
||||
const QUEUE_CHAIN = 'plagiarism';
|
||||
|
||||
/** Check type: the file passed to submit() is uploaded unchanged. */
const CHECK_TYPE_FULL = 'full';
|
||||
/** Check type: submit() uploads a body-only DOCX built by ManuscriptBodyExtractor instead of the original file. */
const CHECK_TYPE_BODY = 'body_only';
|
||||
|
||||
const JOB_WAIT_INGEST = 'app\\api\\job\\PlagiarismWaitIngest';
|
||||
const JOB_TRIGGER_SIM = 'app\\api\\job\\PlagiarismTriggerSimilarity';
|
||||
const JOB_POLL = 'app\\api\\job\\PlagiarismPoll';
|
||||
@@ -61,38 +64,83 @@ class PlagiarismService
|
||||
* @param string $filePath 本地可读的 PDF/DOCX 绝对路径
|
||||
* @param int $triggeredBy 触发人 user_id(手工触发时编辑后台的 user_id)
|
||||
* @param string $source 'manual' / 'auto_xxx'
|
||||
* @param string $checkType full | body_only
|
||||
* @return int check_id
|
||||
*/
|
||||
public function submit($articleId, $filePath, $triggeredBy = 0, $source = 'manual', $checkType = self::CHECK_TYPE_FULL)
{
    // Fail fast on the caller-supplied file before touching the database or the queue.
    if (!is_file($filePath) || !is_readable($filePath)) {
        throw new Exception("File not readable: {$filePath}");
    }

    // Accepts aliases; throws on an unknown value.
    $checkType = $this->normalizeCheckType($checkType);

    // By default the original file is what gets uploaded and recorded.
    $uploadPath = $filePath;
    $derivedRel = '';
    $sourceName = basename($filePath);

    if ($checkType === self::CHECK_TYPE_BODY) {
        // Body-only checks upload a derived DOCX instead of the original file.
        $built = (new ManuscriptBodyExtractor())->buildBodyOnlyDocx($filePath, $articleId);
        $uploadPath = isset($built['path']) ? (string) $built['path'] : '';
        $derivedRel = isset($built['rel_path']) ? (string) $built['rel_path'] : '';

        // Log extractor warnings first so they are kept even if validation below fails.
        if (!empty($built['warnings'])) {
            $this->log('body_only warnings check article=' . $articleId . ' ' . implode('; ', (array) $built['warnings']));
        }

        // The derived file is the one that is sized and enqueued, so it must be
        // validated too; otherwise a bad path only surfaces later inside the job.
        if ($uploadPath === '' || !is_file($uploadPath) || !is_readable($uploadPath)) {
            throw new Exception("Derived body-only file not readable: {$uploadPath}");
        }

        $sourceName = basename($uploadPath);
    }

    // value() result is cast to int, so an unknown article yields journal_id 0.
    $journalId = (int) Db::name('article')
        ->where('article_id', $articleId)
        ->value('journal_id');

    $this->log("plagiarism submit type={$checkType} article={$articleId}");

    $now = time();
    $row = [
        'article_id'       => $articleId,
        'journal_id'       => $journalId,
        'triggered_by'     => $triggeredBy,
        'trigger_source'   => $source,
        'check_type'       => $checkType,
        'state'            => 1, // uploading
        'source_file_name' => $sourceName,
        'source_file_size' => filesize($uploadPath) ?: 0,
        'ctime'            => $now,
        'utime'            => $now,
    ];

    // Only body-only checks have a derived file; leave the column unset otherwise.
    if ($derivedRel !== '') {
        $row['derived_file_path'] = $derivedRel;
    }

    $checkId = Db::name('plagiarism_check')->insertGetId($row);

    // Hand the file that will actually be uploaded to the first job of the chain.
    Queue::push(
        'app\\api\\job\\PlagiarismRun',
        ['check_id' => $checkId, 'file_path' => $uploadPath],
        self::QUEUE_CHAIN
    );

    return (int) $checkId;
}
|
||||
|
||||
/**
 * Submit the same manuscript for two checks: full text and body only.
 *
 * Calls submit() twice with identical arguments, first with CHECK_TYPE_FULL and
 * then with CHECK_TYPE_BODY. The calls are not wrapped in any error handling, so
 * an exception from the second call propagates after the first check has
 * already been inserted and enqueued.
 *
 * @return array{full:int, body_only:int} check ids keyed by check type
 */
public function submitBoth($articleId, $filePath, $triggeredBy = 0, $source = 'manual')
{
    $fullCheckId = $this->submit($articleId, $filePath, $triggeredBy, $source, self::CHECK_TYPE_FULL);
    $bodyCheckId = $this->submit($articleId, $filePath, $triggeredBy, $source, self::CHECK_TYPE_BODY);

    return [
        'full'      => $fullCheckId,
        'body_only' => $bodyCheckId,
    ];
}
|
||||
|
||||
/**
 * Map a caller-supplied check type onto one of the CHECK_TYPE_* constants.
 *
 * The input is cast to string, trimmed and lower-cased before matching.
 * An empty value falls back to the full check.
 *
 * @param mixed $checkType raw value, e.g. 'full', 'body_only', 'body', 'bodyonly'
 * @return string self::CHECK_TYPE_FULL or self::CHECK_TYPE_BODY
 * @throws Exception when the value matches no known alias
 */
private function normalizeCheckType($checkType)
{
    $key = strtolower(trim((string) $checkType));

    $fullAliases = ['', self::CHECK_TYPE_FULL, 'full'];
    $bodyAliases = [self::CHECK_TYPE_BODY, 'body', 'bodyonly'];

    if (in_array($key, $fullAliases, true)) {
        return self::CHECK_TYPE_FULL;
    }

    if (in_array($key, $bodyAliases, true)) {
        return self::CHECK_TYPE_BODY;
    }

    throw new Exception('invalid check_type, use full or body_only');
}
|
||||
|
||||
/**
|
||||
@@ -252,7 +300,7 @@ class PlagiarismService
|
||||
return;
|
||||
}
|
||||
|
||||
// try {
|
||||
try {
|
||||
$tii = new TurnitinService();
|
||||
$statusResp = $tii->getSimilarityStatus($check['tii_submission_id']);
|
||||
$status = isset($statusResp['status']) ? strtoupper($statusResp['status']) : '';
|
||||
@@ -267,17 +315,13 @@ class PlagiarismService
|
||||
$score = isset($statusResp['overall_match_percentage'])
|
||||
? floatval($statusResp['overall_match_percentage']) : 0;
|
||||
|
||||
// 下载 PDF + 取在线查看 URL
|
||||
$localPdf = $this->downloadAndStorePdf($tii, $check['tii_submission_id'], $checkId);
|
||||
$viewerInfo = $this->refreshViewerUrl($tii, $check['tii_submission_id']);
|
||||
|
||||
$this->updateCheck($checkId, [
|
||||
'state' => 3,
|
||||
'similarity_score' => $score,
|
||||
'pdf_local_path' => $localPdf,
|
||||
'view_only_url' => $viewerInfo['url'],
|
||||
'view_only_url_expire' => $viewerInfo['expire'],
|
||||
'error_msg' => '',
|
||||
'state' => 3,
|
||||
'similarity_score' => $score,
|
||||
'pdf_local_path' => $localPdf,
|
||||
'error_msg' => '',
|
||||
]);
|
||||
return;
|
||||
}
|
||||
@@ -288,7 +332,6 @@ class PlagiarismService
|
||||
return;
|
||||
}
|
||||
|
||||
// PROCESSING 或其它中间态:继续轮询
|
||||
if ($attempt >= self::MAX_POLL_ATTEMPTS) {
|
||||
$this->markFailed($checkId, '[poll] timeout after ' . $attempt . ' attempts');
|
||||
return;
|
||||
@@ -299,28 +342,27 @@ class PlagiarismService
|
||||
['check_id' => $checkId, 'attempt' => $attempt + 1],
|
||||
self::QUEUE_CHAIN
|
||||
);
|
||||
// } catch (\Throwable $e) {
|
||||
// // 网络抖动不要直接 fail,给一定容错次数
|
||||
// if ($attempt < self::MAX_POLL_ATTEMPTS) {
|
||||
// Queue::later(
|
||||
// self::POLL_INTERVAL,
|
||||
// self::JOB_POLL,
|
||||
// ['check_id' => $checkId, 'attempt' => $attempt + 1],
|
||||
// self::QUEUE_CHAIN
|
||||
// );
|
||||
// $this->updateCheck($checkId, [
|
||||
// 'attempts' => $attempt,
|
||||
// 'error_msg' => '[poll] transient: ' . $e->getMessage(),
|
||||
// ]);
|
||||
// return;
|
||||
// }
|
||||
// $this->markFailed($checkId, '[poll] exhausted: ' . $e->getMessage());
|
||||
// throw $e;
|
||||
// }
|
||||
} catch (\Throwable $e) {
|
||||
if ($attempt < self::MAX_POLL_ATTEMPTS) {
|
||||
Queue::later(
|
||||
self::POLL_INTERVAL,
|
||||
self::JOB_POLL,
|
||||
['check_id' => $checkId, 'attempt' => $attempt + 1],
|
||||
self::QUEUE_CHAIN
|
||||
);
|
||||
$this->updateCheck($checkId, [
|
||||
'attempts' => $attempt,
|
||||
'error_msg' => '[poll] transient: ' . $e->getMessage(),
|
||||
]);
|
||||
return;
|
||||
}
|
||||
$this->markFailed($checkId, '[poll] exhausted: ' . $e->getMessage());
|
||||
throw $e;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 重新生成在线查看 URL(已有的过期了用)
|
||||
* 按需获取/刷新 Turnitin 在线报告 URL(与 poll 解耦,避免 viewer-url 失败拖死查重完成)。
|
||||
*
|
||||
* @return array{url:string, expire:int, local_pdf:string}
|
||||
*/
|
||||
@@ -345,6 +387,9 @@ class PlagiarismService
|
||||
|
||||
// ---------- 内部 ----------
|
||||
|
||||
/**
|
||||
* 调用 Turnitin POST viewer-url;仅由 refreshViewerUrlFor / getReportUrl 触发。
|
||||
*/
|
||||
private function refreshViewerUrl($tii, $submissionId)
|
||||
{
|
||||
$resp = $tii->getViewerUrl($submissionId);
|
||||
|
||||
Reference in New Issue
Block a user