自动查重
This commit is contained in:
@@ -18,6 +18,11 @@ use think\Exception;
|
||||
* API_KEY 生成的 Bearer token
|
||||
* INTEGRATION_NAME Scope Name(创建 integration 时填的名字)
|
||||
* INTEGRATION_VERSION 自定义版本号,便于审计 e.g. 1.0.0
|
||||
* SUBMISSION_INGEST_MAX_WAIT 上传后轮询 submission 就绪的最长秒数,默认 600(仅 waitAfterUploadForSimilarity 同步用)
|
||||
* SUBMISSION_INGEST_POLL_INTERVAL 同步轮询间隔秒数,默认 3
|
||||
* INGEST_CHAIN_FIRST_DELAY 上传后首次 ingest 检查延迟秒数,默认 10(队列链)
|
||||
* INGEST_CHAIN_POLL_INTERVAL ingest 链每步间隔秒数,默认 15
|
||||
* INGEST_CHAIN_MAX_ATTEMPTS ingest 链最大步数,默认 80
|
||||
*
|
||||
* API 文档:https://developers.turnitin.com/docs/tca
|
||||
*
|
||||
@@ -36,8 +41,8 @@ class TurnitinService
|
||||
|
||||
public function __construct()
|
||||
{
|
||||
$this->baseUrl = rtrim(trim((string)Env::get('turnitin.base_url', '')), '/');
|
||||
$this->apiKey = trim((string)Env::get('turnitin.api_key', ''));
|
||||
$this->baseUrl = rtrim(trim((string)Env::get('turnitin.base_url', 'https://crossref-20794.turnitin.com/api/v1')), '/');
|
||||
$this->apiKey = trim((string)Env::get('turnitin.api_key', 'c6315e8291a4433dae09ad5efdb8a89c'));
|
||||
$this->integrationName = trim((string)Env::get('turnitin.integration_name', 'tmr'));
|
||||
$this->integrationVersion = trim((string)Env::get('turnitin.integration_version', '1.0.0'));
|
||||
|
||||
@@ -80,11 +85,13 @@ class TurnitinService
|
||||
|
||||
/**
|
||||
* 上传文件到 submission
|
||||
* PUT /submissions/{id}/original/{filename}
|
||||
*
|
||||
* TCA 文档路径为 PUT /submissions/{id}/original(文件名仅通过 Content-Disposition 传递,
|
||||
* 不要再拼在 URL 末尾;否则网关会 404,错误里常见 path 形如 //v1/submissions/.../original/xxx.docx)。
|
||||
*
|
||||
* @param string $submissionId
|
||||
* @param string $filePath 本地 PDF/DOCX 路径
|
||||
* @param string $filename 传给 Turnitin 的文件名(用于报告显示)
|
||||
* @param string $filename 传给 Turnitin 的展示文件名(默认取 basename)
|
||||
* @return array
|
||||
*/
|
||||
public function uploadFile($submissionId, $filePath, $filename = '')
|
||||
@@ -95,15 +102,20 @@ class TurnitinService
|
||||
if ($filename === '') {
|
||||
$filename = basename($filePath);
|
||||
}
|
||||
// Content-Disposition 里避免未转义的双引号
|
||||
$safeName = str_replace(['"', "\r", "\n"], '', $filename);
|
||||
if ($safeName === '') {
|
||||
$safeName = 'document.bin';
|
||||
}
|
||||
$body = file_get_contents($filePath);
|
||||
|
||||
return $this->request(
|
||||
'PUT',
|
||||
'/submissions/' . urlencode($submissionId) . '/original/' . rawurlencode($filename),
|
||||
'/submissions/' . rawurlencode($submissionId) . '/original',
|
||||
$body,
|
||||
[
|
||||
'Content-Type' => 'binary/octet-stream',
|
||||
'Content-Disposition' => 'inline; filename="' . $filename . '"',
|
||||
'Content-Type' => 'application/octet-stream',
|
||||
'Content-Disposition' => 'attachment; filename="' . $safeName . '"',
|
||||
]
|
||||
);
|
||||
}
|
||||
@@ -114,9 +126,10 @@ class TurnitinService
|
||||
*
|
||||
* @param string $submissionId
|
||||
* @param array $opts
|
||||
* - generation_settings.search_repositories 默认 ['INTERNET','PUBLICATION','CROSSREF','CROSSREF_POSTED_CONTENT','SUBMITTED_WORK']
|
||||
* - generation_settings.submission_auto_excludes bool
|
||||
* - view_settings.exclude_quotes / exclude_bibliography / exclude_citations / exclude_abstract / exclude_methods bool
|
||||
* - generation_settings.search_repositories 默认 ['INTERNET','PUBLICATION',...]
|
||||
* - generation_settings.submission_auto_excludes **字符串数组**(如 [] 或具体仓库键),不可传 boolean(否则会 400)
|
||||
* - generation_settings.auto_exclude_self_matching_scope 可选,如 'GROUP_CONTEXT'
|
||||
* - view_settings.exclude_* 布尔排除项(与 TCA 文档一致)
|
||||
* - indexing_settings.add_to_index bool 是否把本文加进 SUBMITTED_WORK 索引(一般 true)
|
||||
* @return array
|
||||
*/
|
||||
@@ -125,7 +138,8 @@ class TurnitinService
|
||||
$body = array_merge([
|
||||
'generation_settings' => [
|
||||
'search_repositories' => ['INTERNET', 'PUBLICATION', 'CROSSREF', 'CROSSREF_POSTED_CONTENT', 'SUBMITTED_WORK'],
|
||||
'submission_auto_excludes' => true,
|
||||
// 服务端类型为 List<String>,传 true 会 400:Cannot deserialize ... from Boolean
|
||||
'submission_auto_excludes' => [],
|
||||
'auto_exclude_self_matching_scope' => 'GROUP_CONTEXT',
|
||||
],
|
||||
'view_settings' => [
|
||||
@@ -140,11 +154,122 @@ class TurnitinService
|
||||
|
||||
return $this->request(
|
||||
'PUT',
|
||||
'/submissions/' . urlencode($submissionId) . '/similarity',
|
||||
'/submissions/' . rawurlencode($submissionId) . '/similarity',
|
||||
$body
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Fetch the details of a submission (used after upload to poll whether
 * parsing has finished).
 * GET /submissions/{id}
 *
 * @param string $submissionId Submission identifier; it is URL-encoded with
 *                             rawurlencode() before being placed in the path.
 * @return array Decoded JSON (commonly status=ok plus a message containing id/status)
 */
public function getSubmission($submissionId)
{
    // Build the resource path first so the request call stays readable.
    $path = '/submissions/' . rawurlencode($submissionId);

    return $this->request('GET', $path);
}
|
||||
|
||||
/**
 * Perform a single GET /submissions/{id} and classify the result, to decide
 * whether PUT /similarity may be called. Does not sleep, so it can be used
 * for step-by-step polling by a queue chain.
 *
 * The status is upper-cased and trimmed before being compared against the
 * ready / failed lists below; any other status leaves both flags false.
 *
 * @param string $submissionId
 * @return array{ready:bool, failed:bool, status:string, snippet:string, message:array}
 */
public function parseSubmissionIngestState($submissionId)
{
    $payload = self::unwrapSubmissionPayload($this->getSubmission($submissionId));
    $status = strtoupper(trim((string) self::pickSubmissionStatus($payload)));

    // Statuses treated as "ingest finished".
    $readyStatuses = [
        'COMPLETE', 'COMPLETED', 'PROCESSED', 'READY', 'SUCCEEDED',
        'COMPLETE_PROCESSING',
    ];
    // Statuses treated as terminal failures.
    $failedStatuses = ['ERROR', 'FAILED', 'CANCELLED', 'CANCELED', 'DELETED'];

    // An empty status never counts as ready or failed.
    $hasStatus = $status !== '';

    return [
        'ready' => $hasStatus && in_array($status, $readyStatuses, true),
        'failed' => $hasStatus && in_array($status, $failedStatuses, true),
        'status' => $status,
        // First 400 characters of the payload JSON, kept for diagnostics.
        'snippet' => mb_substr(json_encode($payload, JSON_UNESCAPED_UNICODE), 0, 400),
        'message' => $payload,
    ];
}
|
||||
|
||||
/**
 * After an upload, block until Turnitin has finished its asynchronous text
 * parsing (synchronous blocking version, for CLI/debugging only; production
 * should use the queue chain PlagiarismWaitIngest).
 *
 * @param string $submissionId
 * @param int $maxWaitSec  Maximum number of seconds to wait, default 600
 *                         (10 minutes); values below 30 are raised to 30.
 * @param int $intervalSec Polling interval in seconds, default 3; values
 *                         below 1 are raised to 1.
 * @throws Exception On timeout, or when the submission reaches a failed state
 */
public function waitAfterUploadForSimilarity($submissionId, $maxWaitSec = 600, $intervalSec = 3)
{
    $stopAt = time() + max(30, (int)$maxWaitSec);
    $pauseSec = max(1, (int)$intervalSec);

    // Remember the most recent observation for the timeout message.
    $status = '';
    $snippet = '';

    // Poll, then pause, until the deadline has passed.
    for (; time() < $stopAt; sleep($pauseSec)) {
        $state = $this->parseSubmissionIngestState($submissionId);
        $status = $state['status'];
        $snippet = $state['snippet'];

        if (!empty($state['ready'])) {
            return;
        }

        if (!empty($state['failed'])) {
            throw new Exception('Turnitin submission failed, status=' . $status . ' body=' . $snippet);
        }
    }

    $shownStatus = $status ? $status : '(empty)';

    throw new Exception(
        'Timeout waiting for Turnitin submission ingest (last status=' . $shownStatus . ') snippet=' . $snippet
    );
}
|
||||
|
||||
/**
 * Return the submission payload from a decoded response.
 *
 * When the response is an array carrying an array under "message", that
 * inner array is returned; any other array is returned unchanged; a
 * non-array value yields an empty array.
 *
 * @param mixed $decoded
 * @return array
 */
private static function unwrapSubmissionPayload($decoded)
{
    if (is_array($decoded)) {
        $inner = isset($decoded['message']) ? $decoded['message'] : null;

        return is_array($inner) ? $inner : $decoded;
    }

    return [];
}
|
||||
|
||||
/**
 * Extract the first usable status value from a submission payload.
 *
 * The payload itself is inspected first, then its nested "submission" array
 * when one is present. Within each, a fixed list of keys is checked in
 * priority order and the first non-empty scalar value wins.
 *
 * Non-scalar values (for example a nested array under "status") are skipped:
 * casting them to string would raise an "Array to string conversion"
 * diagnostic and produce the meaningless status "Array", hiding a usable
 * status stored under a later key.
 *
 * @param array $msg Unwrapped submission payload
 * @return string The status found, or '' when none of the keys holds one
 */
private static function pickSubmissionStatus(array $msg)
{
    $statusKeys = ['status', 'workflow_status', 'submission_status', 'processing_status', 'paper_status'];

    $candidates = [$msg];
    if (isset($msg['submission']) && is_array($msg['submission'])) {
        $candidates[] = $msg['submission'];
    }

    foreach ($candidates as $candidate) {
        foreach ($statusKeys as $key) {
            // Skip missing/empty entries and anything that cannot be safely cast to string.
            if (empty($candidate[$key]) || !is_scalar($candidate[$key])) {
                continue;
            }

            return (string)$candidate[$key];
        }
    }

    return '';
}
|
||||
|
||||
/**
|
||||
* 查询 similarity 状态
|
||||
* GET /submissions/{id}/similarity
|
||||
@@ -156,7 +281,7 @@ class TurnitinService
|
||||
{
|
||||
return $this->request(
|
||||
'GET',
|
||||
'/submissions/' . urlencode($submissionId) . '/similarity'
|
||||
'/submissions/' . rawurlencode($submissionId) . '/similarity'
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user