自动查重

This commit is contained in:
wangjinlei
2026-05-13 18:02:09 +08:00
parent fa878334cd
commit f99dbc6397
6 changed files with 411 additions and 91 deletions

View File

@@ -18,6 +18,11 @@ use think\Exception;
* API_KEY 生成的 Bearer token
* INTEGRATION_NAME Scope Name(创建 integration 时填的名字)
* INTEGRATION_VERSION 自定义版本号,便于审计 e.g. 1.0.0
* SUBMISSION_INGEST_MAX_WAIT 上传后轮询 submission 就绪的最长秒数,默认 600(仅 waitAfterUploadForSimilarity 同步用)
* SUBMISSION_INGEST_POLL_INTERVAL 同步轮询间隔秒数,默认 3
* INGEST_CHAIN_FIRST_DELAY 上传后首次 ingest 检查延迟秒数,默认 10(队列链)
* INGEST_CHAIN_POLL_INTERVAL ingest 链每步间隔秒数,默认 15
* INGEST_CHAIN_MAX_ATTEMPTS ingest 链最大步数,默认 80
*
* API 文档:https://developers.turnitin.com/docs/tca
*
@@ -36,8 +41,8 @@ class TurnitinService
public function __construct()
{
$this->baseUrl = rtrim(trim((string)Env::get('turnitin.base_url', '')), '/');
$this->apiKey = trim((string)Env::get('turnitin.api_key', ''));
$this->baseUrl = rtrim(trim((string)Env::get('turnitin.base_url', 'https://crossref-20794.turnitin.com/api/v1')), '/');
$this->apiKey = trim((string)Env::get('turnitin.api_key', 'c6315e8291a4433dae09ad5efdb8a89c'));
$this->integrationName = trim((string)Env::get('turnitin.integration_name', 'tmr'));
$this->integrationVersion = trim((string)Env::get('turnitin.integration_version', '1.0.0'));
@@ -80,11 +85,13 @@ class TurnitinService
/**
* 上传文件到 submission
* PUT /submissions/{id}/original/{filename}
*
* TCA 文档路径为 PUT /submissions/{id}/original;文件名仅通过 Content-Disposition 传递,
* 不要再拼在 URL 末尾;否则网关会 404(错误里常见 path 形如 //v1/submissions/.../original/xxx.docx)。
*
* @param string $submissionId
* @param string $filePath 本地 PDF/DOCX 路径
* @param string $filename 传给 Turnitin 的文件名(用于报告显示
* @param string $filename 传给 Turnitin 的展示文件名(默认取 basename
* @return array
*/
public function uploadFile($submissionId, $filePath, $filename = '')
@@ -95,15 +102,20 @@ class TurnitinService
if ($filename === '') {
$filename = basename($filePath);
}
// Content-Disposition 里避免未转义的双引号
$safeName = str_replace(['"', "\r", "\n"], '', $filename);
if ($safeName === '') {
$safeName = 'document.bin';
}
$body = file_get_contents($filePath);
return $this->request(
'PUT',
'/submissions/' . urlencode($submissionId) . '/original/' . rawurlencode($filename),
'/submissions/' . rawurlencode($submissionId) . '/original',
$body,
[
'Content-Type' => 'binary/octet-stream',
'Content-Disposition' => 'inline; filename="' . $filename . '"',
'Content-Type' => 'application/octet-stream',
'Content-Disposition' => 'attachment; filename="' . $safeName . '"',
]
);
}
@@ -114,9 +126,10 @@ class TurnitinService
*
* @param string $submissionId
* @param array $opts
* - generation_settings.search_repositories 默认 ['INTERNET','PUBLICATION','CROSSREF','CROSSREF_POSTED_CONTENT','SUBMITTED_WORK']
* - generation_settings.submission_auto_excludes bool
* - view_settings.exclude_quotes / exclude_bibliography / exclude_citations / exclude_abstract / exclude_methods bool
* - generation_settings.search_repositories 默认 ['INTERNET','PUBLICATION',...]
* - generation_settings.submission_auto_excludes **字符串数组**(如 [] 或具体仓库键),不可传 boolean,否则会 400
* - generation_settings.auto_exclude_self_matching_scope 可选,如 'GROUP_CONTEXT'
* - view_settings.exclude_* 布尔排除项(与 TCA 文档一致)
* - indexing_settings.add_to_index bool 是否把本文加进 SUBMITTED_WORK 索引(一般 true
* @return array
*/
@@ -125,7 +138,8 @@ class TurnitinService
$body = array_merge([
'generation_settings' => [
'search_repositories' => ['INTERNET', 'PUBLICATION', 'CROSSREF', 'CROSSREF_POSTED_CONTENT', 'SUBMITTED_WORK'],
'submission_auto_excludes' => true,
// 服务端类型为 List<String>,传 true 会 400(Cannot deserialize ... from Boolean)
'submission_auto_excludes' => [],
'auto_exclude_self_matching_scope' => 'GROUP_CONTEXT',
],
'view_settings' => [
@@ -140,11 +154,122 @@ class TurnitinService
return $this->request(
'PUT',
'/submissions/' . urlencode($submissionId) . '/similarity',
'/submissions/' . rawurlencode($submissionId) . '/similarity',
$body
);
}
/**
 * Fetch the details of a single submission.
 * GET /submissions/{id}
 *
 * Used after an upload to poll whether the submission has been processed.
 *
 * @param string $submissionId Submission id; it is URL-encoded before being placed in the path.
 * @return array Whatever request() returns for this call (per the original note: decoded JSON,
 *               commonly status=ok with a "message" entry carrying id/status — confirm against request()).
 */
public function getSubmission($submissionId)
{
    $path = sprintf('/submissions/%s', rawurlencode($submissionId));

    return $this->request('GET', $path);
}
/**
 * Perform a single GET /submissions/{id} and classify the submission's ingest state,
 * i.e. whether PUT /similarity may be called yet.
 *
 * This method never sleeps; it is meant to be called step by step (for example from a
 * queue chain) or from a polling loop.
 *
 * The status string is taken from the unwrapped payload, trimmed and upper-cased, then
 * compared against two fixed lists:
 *  - ready:  COMPLETE, COMPLETED, PROCESSED, READY, SUCCEEDED, COMPLETE_PROCESSING
 *  - failed: ERROR, FAILED, CANCELLED, CANCELED, DELETED
 * Any other value (including an empty status) yields ready=false and failed=false.
 *
 * @param string $submissionId
 * @return array{ready:bool, failed:bool, status:string, snippet:string, message:array}
 *         "snippet" is at most the first 400 characters of the JSON-encoded payload, for diagnostics.
 */
public function parseSubmissionIngestState($submissionId)
{
    $raw = $this->getSubmission($submissionId);
    $msg = self::unwrapSubmissionPayload($raw);
    $st = strtoupper(trim((string) self::pickSubmissionStatus($msg)));

    // json_encode() returns false when the payload cannot be encoded (e.g. malformed UTF-8).
    // JSON_PARTIAL_OUTPUT_ON_ERROR keeps the encodable part so the diagnostic snippet is not
    // lost, and the is_string() guard keeps a non-string result away from mb_substr().
    $encoded = json_encode($msg, JSON_UNESCAPED_UNICODE | JSON_PARTIAL_OUTPUT_ON_ERROR);
    $snippet = is_string($encoded) ? mb_substr($encoded, 0, 400) : '';

    $ready = [
        'COMPLETE', 'COMPLETED', 'PROCESSED', 'READY', 'SUCCEEDED',
        'COMPLETE_PROCESSING',
    ];
    $failed = ['ERROR', 'FAILED', 'CANCELLED', 'CANCELED', 'DELETED'];

    // Strict in_array() against these non-empty string lists is already false for ''.
    $readyFlag = in_array($st, $ready, true);
    $failedFlag = in_array($st, $failed, true);

    return [
        'ready' => $readyFlag,
        'failed' => $failedFlag,
        'status' => $st,
        'snippet' => $snippet,
        'message' => $msg,
    ];
}
/**
 * Block until the uploaded submission has finished being processed, polling
 * parseSubmissionIngestState() in a loop.
 *
 * This is the synchronous, blocking variant (the original note restricts it to CLI/debug use
 * and points production code at the PlagiarismWaitIngest queue chain instead).
 *
 * @param string $submissionId
 * @param int $maxWaitSec  Maximum number of seconds to wait, default 600 (10 minutes).
 *                         Values below 30 are raised to 30.
 * @param int $intervalSec Seconds between polls, default 3. Values below 1 are raised to 1.
 * @return void Returns normally as soon as the submission is reported ready.
 * @throws Exception When the submission reaches a failed status, or the deadline passes first.
 */
public function waitAfterUploadForSimilarity($submissionId, $maxWaitSec = 600, $intervalSec = 3)
{
    $deadline = time() + max(30, (int)$maxWaitSec);
    $intervalSec = max(1, (int)$intervalSec);
    $lastStatus = '';
    $lastSnippet = '';

    while (time() < $deadline) {
        $parsed = $this->parseSubmissionIngestState($submissionId);
        $lastStatus = $parsed['status'];
        $lastSnippet = $parsed['snippet'];

        if (!empty($parsed['ready'])) {
            return;
        }
        if (!empty($parsed['failed'])) {
            throw new Exception('Turnitin submission failed, status=' . $lastStatus . ' body=' . $lastSnippet);
        }

        // Never sleep past the deadline: with a long interval the previous unconditional
        // sleep($intervalSec) could block up to one full interval beyond $maxWaitSec.
        $remaining = $deadline - time();
        if ($remaining <= 0) {
            break;
        }
        sleep(min($intervalSec, $remaining));
    }

    throw new Exception(
        'Timeout waiting for Turnitin submission ingest (last status=' . ($lastStatus ?: '(empty)') . ') snippet=' . $lastSnippet
    );
}
/**
 * Normalise a decoded API response to the array that carries the submission fields.
 *
 * - Non-array input yields an empty array.
 * - If the array has a "message" entry that is itself an array, that inner array is returned.
 * - Otherwise the input array is returned unchanged.
 *
 * @param mixed $decoded
 * @return array
 */
private static function unwrapSubmissionPayload($decoded)
{
    if (is_array($decoded)) {
        $inner = isset($decoded['message']) ? $decoded['message'] : null;

        return is_array($inner) ? $inner : $decoded;
    }

    return [];
}
/**
 * Extract the first usable status string from a submission payload.
 *
 * The top-level array is searched first, then the nested "submission" array if present.
 * Within each, the keys are tried in this order: status, workflow_status,
 * submission_status, processing_status, paper_status. The first non-empty scalar value wins.
 *
 * @param array $msg
 * @return string The status as found (not trimmed or case-normalised), or '' when none is present.
 */
private static function pickSubmissionStatus(array $msg)
{
    $keys = ['status', 'workflow_status', 'submission_status', 'processing_status', 'paper_status'];

    $candidates = [$msg];
    if (isset($msg['submission']) && is_array($msg['submission'])) {
        $candidates[] = $msg['submission'];
    }

    foreach ($candidates as $m) {
        foreach ($keys as $k) {
            // Only scalar values are usable: casting a non-empty array to string would
            // return the literal "Array" (with a warning) and hide a later valid key.
            if (isset($m[$k]) && is_scalar($m[$k]) && !empty($m[$k])) {
                return (string)$m[$k];
            }
        }
    }

    return '';
}
/**
* 查询 similarity 状态
* GET /submissions/{id}/similarity
@@ -156,7 +281,7 @@ class TurnitinService
{
return $this->request(
'GET',
'/submissions/' . urlencode($submissionId) . '/similarity'
'/submissions/' . rawurlencode($submissionId) . '/similarity'
);
}