Files
tougao/application/common/TurnitinService.php
2026-05-13 18:02:09 +08:00

448 lines
16 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use think\Env;
use think\Exception;
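/**
 * Turnitin Core API (TCA) REST client wrapper.
 *
 * Intended for the Crossref Similarity Check channel (product_name=Crossref) as well as standard TCA integrations.
 *
 * Authentication: Authorization: Bearer <API_KEY>
 * X-Turnitin-Integration-Name / X-Turnitin-Integration-Version are sent for auditing.
 *
 * .env configuration ([turnitin] section):
 *   BASE_URL                         e.g. https://crossref-12345.turnitin.com/api/v1 (no trailing slash)
 *   API_KEY                          the generated Bearer token
 *   INTEGRATION_NAME                 Scope Name (the name entered when the integration was created)
 *   INTEGRATION_VERSION              custom version string, useful for auditing, e.g. 1.0.0
 *   SUBMISSION_INGEST_MAX_WAIT       maximum seconds to poll for submission readiness after upload, default 600
 *                                    (used only by the synchronous waitAfterUploadForSimilarity)
 *   SUBMISSION_INGEST_POLL_INTERVAL  synchronous polling interval in seconds, default 3
 *   INGEST_CHAIN_FIRST_DELAY         delay in seconds before the first ingest check after upload, default 10 (queue chain)
 *   INGEST_CHAIN_POLL_INTERVAL       seconds between ingest-chain steps, default 15
 *   INGEST_CHAIN_MAX_ATTEMPTS        maximum number of ingest-chain steps, default 80
 *
 * API documentation: https://developers.turnitin.com/docs/tca
 *
 * Notes:
 * - All methods return the raw decoded array; HTTP errors throw Exception.
 * - No business logic lives here (the business layer is PlagiarismService).
 * - The token is not cached (Bearer auth needs no login; every request carries it).
 */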
class TurnitinService
{
/** @var string API root URL; the constructor trims whitespace and any trailing slash. */
private $baseUrl;
/** @var string Bearer token sent in the Authorization header of every request. */
private $apiKey;
/** @var string Value sent in the X-Turnitin-Integration-Name header. */
private $integrationName;
/** @var string Value sent in the X-Turnitin-Integration-Version header. */
private $integrationVersion;
/** @var int Passed to cURL as CURLOPT_TIMEOUT (seconds) for each request. */
private $timeout = 60;
/**
 * Loads the connection settings from the [turnitin] section of .env.
 *
 * The API key intentionally has no built-in fallback: a credential must never
 * live in source control, and a non-empty fallback would make the
 * "not configured" check below unreachable when .env is missing.
 * NOTE(review): the key that used to be hard-coded here should be treated as
 * leaked and rotated on the Turnitin side.
 *
 * @throws Exception when BASE_URL or API_KEY resolves to an empty string
 */
public function __construct()
{
    $this->baseUrl = rtrim(trim((string)Env::get('turnitin.base_url', 'https://crossref-20794.turnitin.com/api/v1')), '/');
    // No default on purpose: a missing key must fail fast instead of silently using a baked-in secret.
    $this->apiKey = trim((string)Env::get('turnitin.api_key', ''));
    $this->integrationName = trim((string)Env::get('turnitin.integration_name', 'tmr'));
    $this->integrationVersion = trim((string)Env::get('turnitin.integration_version', '1.0.0'));
    if ($this->baseUrl === '' || $this->apiKey === '') {
        throw new Exception('Turnitin not configured: missing BASE_URL or API_KEY in .env [turnitin] section');
    }
}
// ==================== Public API ====================
/**
 * Liveness probe that also reports the capabilities enabled for the account.
 * GET /features-enabled
 *
 * @return mixed decoded response of the endpoint
 * @throws Exception on transport failure or a non-2xx HTTP status
 */
public function featuresEnabled()
{
    $endpoint = '/features-enabled';

    return $this->request('GET', $endpoint);
}
/**
 * Creates a submission; its id is required before a file can be uploaded.
 * POST /submissions
 *
 * @param array $meta Required fields:
 *   - title      paper title
 *   - owner      submission owner identifier (free-form string, e.g. the user_id of the submission system)
 *   - submitter  submitter identifier (same convention as owner)
 *   - eula       (optional) ['version' => '...', 'language' => 'en-US', 'accepted_timestamp' => ISO8601];
 *                may be omitted when features-enabled reports require_eula=false
 *   Optional fields:
 *   - extract_text_only  bool
 *   - metadata           array of custom key/values kept for later tracing
 *
 * @return array contains id (submission UUID), status, owner, ...
 * @throws Exception on transport failure or a non-2xx HTTP status
 */
public function createSubmission($meta)
{
    $created = $this->request('POST', '/submissions', $meta);

    return $created;
}
/**
 * Uploads a file to an existing submission.
 *
 * TCA documents the path as PUT /submissions/{id}/original. The file name is
 * passed only through Content-Disposition and must not be appended to the URL,
 * otherwise the gateway answers 404 (the error typically shows a path such as
 * //v1/submissions/.../original/xxx.docx).
 *
 * @param string $submissionId
 * @param string $filePath local PDF/DOCX path
 * @param string $filename display file name sent to Turnitin (defaults to basename of $filePath)
 * @return array
 * @throws Exception when the file is missing, unreadable or cannot be read, or when the HTTP call fails
 */
public function uploadFile($submissionId, $filePath, $filename = '')
{
    if (!is_file($filePath) || !is_readable($filePath)) {
        throw new Exception("File not found or not readable: {$filePath}");
    }
    if ($filename === '') {
        $filename = basename($filePath);
    }
    // Keep unescaped double quotes and line breaks out of the Content-Disposition header.
    $safeName = str_replace(['"', "\r", "\n"], '', $filename);
    if ($safeName === '') {
        $safeName = 'document.bin';
    }
    $body = file_get_contents($filePath);
    if ($body === false) {
        // The file can still vanish or fail to read after the checks above. Without this guard
        // `false` would be handed to request() as a raw body and an empty upload would be sent.
        throw new Exception("Failed to read file: {$filePath}");
    }
    return $this->request(
        'PUT',
        '/submissions/' . rawurlencode($submissionId) . '/original',
        $body,
        [
            'Content-Type' => 'application/octet-stream',
            'Content-Disposition' => 'attachment; filename="' . $safeName . '"',
        ]
    );
}
/**
 * Starts the similarity comparison for a submission.
 * PUT /submissions/{id}/similarity
 *
 * NOTE: defaults and $opts are combined with array_merge(), which is shallow:
 * a top-level section present in $opts (e.g. 'generation_settings') replaces the
 * default section wholesale, so nested defaults of that section are not retained.
 *
 * @param string $submissionId
 * @param array $opts
 *   - generation_settings.search_repositories               defaults to ['INTERNET','PUBLICATION',...]
 *   - generation_settings.submission_auto_excludes          **array of strings** ([] or specific repository keys);
 *                                                           a boolean is rejected with HTTP 400
 *   - generation_settings.auto_exclude_self_matching_scope  optional, e.g. 'GROUP_CONTEXT'
 *   - view_settings.exclude_*                               boolean exclusions (as in the TCA documentation)
 *   - indexing_settings.add_to_index                        bool, whether to add this paper to the SUBMITTED_WORK
 *                                                           index (usually true)
 * @return array
 * @throws Exception on transport failure or a non-2xx HTTP status
 */
public function triggerSimilarity($submissionId, $opts = [])
{
    $defaults = [
        'generation_settings' => [
            'search_repositories' => ['INTERNET', 'PUBLICATION', 'CROSSREF', 'CROSSREF_POSTED_CONTENT', 'SUBMITTED_WORK'],
            // The server expects List<String>; sending true yields 400 (Cannot deserialize ... from Boolean).
            'submission_auto_excludes' => [],
            'auto_exclude_self_matching_scope' => 'GROUP_CONTEXT',
        ],
        'view_settings' => [
            'exclude_quotes' => true,
            'exclude_bibliography' => true,
            'exclude_citations' => true,
        ],
        'indexing_settings' => [
            'add_to_index' => true,
        ],
    ];
    $settings = array_merge($defaults, $opts);
    $endpoint = '/submissions/' . rawurlencode($submissionId) . '/similarity';

    return $this->request('PUT', $endpoint, $settings);
}
/**
 * Fetches the submission details (polled after upload to see whether parsing has finished).
 * GET /submissions/{id}
 *
 * @param string $submissionId
 * @return array decoded JSON; commonly status=ok plus a message holding id/status
 * @throws Exception on transport failure or a non-2xx HTTP status
 */
public function getSubmission($submissionId)
{
    $endpoint = '/submissions/' . rawurlencode($submissionId);

    return $this->request('GET', $endpoint);
}
/**
 * Performs one GET /submissions/{id} and classifies the result, telling the caller whether
 * PUT /similarity may be issued. Never sleeps, so a queue chain can poll step by step.
 *
 * @param string $submissionId
 * @return array{ready:bool, failed:bool, status:string, snippet:string, message:array}
 *   - status   upper-cased, trimmed status string ('' when none was found)
 *   - snippet  first 400 characters of the JSON-encoded payload, for logs and errors
 *   - message  the unwrapped payload
 * @throws Exception when the underlying HTTP call fails
 */
public function parseSubmissionIngestState($submissionId)
{
    $readyStates = [
        'COMPLETE', 'COMPLETED', 'PROCESSED', 'READY', 'SUCCEEDED',
        'COMPLETE_PROCESSING',
    ];
    $failedStates = ['ERROR', 'FAILED', 'CANCELLED', 'CANCELED', 'DELETED'];

    $payload = self::unwrapSubmissionPayload($this->getSubmission($submissionId));
    $status = strtoupper(trim((string) self::pickSubmissionStatus($payload)));
    $hasStatus = $status !== '';

    return [
        'ready' => $hasStatus && in_array($status, $readyStates, true),
        'failed' => $hasStatus && in_array($status, $failedStates, true),
        'status' => $status,
        'snippet' => mb_substr(json_encode($payload, JSON_UNESCAPED_UNICODE), 0, 400),
        'message' => $payload,
    ];
}
/**
 * Blocks until Turnitin has finished its asynchronous text extraction for an uploaded file.
 * Synchronous variant for CLI/debugging only; production should use the queue chain
 * (PlagiarismWaitIngest).
 *
 * @param string $submissionId
 * @param int $maxWaitSec   maximum seconds to wait, default 600 (10 minutes); values below 30 are raised to 30
 * @param int $intervalSec  polling interval in seconds, default 3; values below 1 are raised to 1
 * @throws Exception on timeout or when the submission reaches a failed terminal state
 */
public function waitAfterUploadForSimilarity($submissionId, $maxWaitSec = 600, $intervalSec = 3)
{
    $deadline = time() + max(30, (int)$maxWaitSec);
    $pause = max(1, (int)$intervalSec);
    $status = '';
    $snippet = '';

    for (;;) {
        if (time() >= $deadline) {
            break;
        }
        $state = $this->parseSubmissionIngestState($submissionId);
        $status = $state['status'];
        $snippet = $state['snippet'];
        if (!empty($state['ready'])) {
            return;
        }
        if (!empty($state['failed'])) {
            throw new Exception('Turnitin submission failed, status=' . $status . ' body=' . $snippet);
        }
        sleep($pause);
    }

    $shownStatus = $status ?: '(empty)';
    throw new Exception(
        'Timeout waiting for Turnitin submission ingest (last status=' . $shownStatus . ') snippet=' . $snippet
    );
}
/**
 * Returns the inner 'message' array of a decoded response when there is one,
 * the decoded array itself otherwise, and [] for any non-array input.
 *
 * @param mixed $decoded
 * @return array
 */
private static function unwrapSubmissionPayload($decoded)
{
    if (!is_array($decoded)) {
        return [];
    }
    $inner = isset($decoded['message']) ? $decoded['message'] : null;

    return is_array($inner) ? $inner : $decoded;
}
/**
 * Finds the first non-empty status-like field. The payload itself is searched first,
 * then its nested 'submission' array when present; within each, keys are tried in a fixed order.
 *
 * @param array $msg
 * @return string the status as a string, or '' when no candidate field is set
 */
private static function pickSubmissionStatus(array $msg)
{
    $fields = ['status', 'workflow_status', 'submission_status', 'processing_status', 'paper_status'];

    $sources = [$msg];
    if (isset($msg['submission']) && is_array($msg['submission'])) {
        $sources[] = $msg['submission'];
    }

    foreach ($sources as $source) {
        foreach ($fields as $field) {
            if (empty($source[$field])) {
                continue;
            }
            return (string)$source[$field];
        }
    }

    return '';
}
/**
 * Queries the similarity report status.
 * GET /submissions/{id}/similarity
 *
 * The response carries status: PROCESSING / COMPLETE / ERROR.
 * When COMPLETE it also carries overall_match_percentage / time_requested / time_generated.
 *
 * @param string $submissionId
 * @return mixed decoded response
 * @throws Exception on transport failure or a non-2xx HTTP status
 */
public function getSimilarityStatus($submissionId)
{
    $endpoint = '/submissions/' . rawurlencode($submissionId) . '/similarity';

    return $this->request('GET', $endpoint);
}
/**
 * Obtains a temporary URL for viewing the report online.
 * POST /submissions/{id}/viewer-url
 *
 * The response contains viewer_url (valid for a few hours).
 *
 * NOTE: defaults and $viewer are combined with array_merge(), which is shallow:
 * passing a 'similarity' key replaces the whole default 'similarity' section.
 *
 * @param string $submissionId
 * @param array $viewer optional viewer settings, e.g. ['viewer_default_permission_set' => 'INSTRUCTOR']
 * @return mixed decoded response
 * @throws Exception on transport failure or a non-2xx HTTP status
 */
public function getViewerUrl($submissionId, $viewer = [])
{
    $body = array_merge([
        'viewer_default_permission_set' => 'INSTRUCTOR',
        'similarity' => [
            'default_mode' => 'MATCH_OVERVIEW',
            'view_settings' => ['save_changes' => true],
            'modes' => ['match_overview' => true, 'all_sources' => true],
        ],
        'locale' => 'en-US',
    ], $viewer);
    return $this->request(
        'POST',
        // rawurlencode (RFC 3986) is the correct encoder for a path segment; urlencode would
        // turn a space into '+', which is form encoding and is not decoded in URL paths.
        '/submissions/' . rawurlencode($submissionId) . '/viewer-url',
        $body
    );
}
/**
 * Requests generation of the PDF report (asynchronous; its status is polled separately).
 * POST /submissions/{id}/similarity/pdf
 *
 * The response contains id (the PDF report id).
 *
 * @param string $submissionId
 * @param array $opts extra body fields; merged over the default ['locale' => 'en-US']
 * @return mixed decoded response
 * @throws Exception on transport failure or a non-2xx HTTP status
 */
public function requestPdfReport($submissionId, $opts = [])
{
    $body = array_merge([
        'locale' => 'en-US',
    ], $opts);
    return $this->request(
        'POST',
        // Path segments are encoded with rawurlencode (RFC 3986), not the form-style urlencode.
        '/submissions/' . rawurlencode($submissionId) . '/similarity/pdf',
        $body
    );
}
/**
 * Queries the PDF report status.
 * GET /submissions/{id}/similarity/pdf/{pdf_id}/status
 *
 * status: PENDING / SUCCESS / FAILED
 *
 * @param string $submissionId
 * @param string $pdfId id returned by requestPdfReport
 * @return mixed decoded response
 * @throws Exception on transport failure or a non-2xx HTTP status
 */
public function getPdfReportStatus($submissionId, $pdfId)
{
    // Path segments are encoded with rawurlencode (RFC 3986), not the form-style urlencode.
    $endpoint = '/submissions/' . rawurlencode($submissionId)
        . '/similarity/pdf/' . rawurlencode($pdfId) . '/status';

    return $this->request('GET', $endpoint);
}
/**
 * Downloads the PDF report content (only callable once the status is SUCCESS).
 * GET /submissions/{id}/similarity/pdf/{pdf_id}
 *
 * @param string $submissionId
 * @param string $pdfId id returned by requestPdfReport
 * @return string raw PDF bytes; the caller is responsible for writing them to disk
 * @throws Exception on transport failure or a non-2xx HTTP status
 */
public function downloadPdfReport($submissionId, $pdfId)
{
    // Path segments are encoded with rawurlencode (RFC 3986), not the form-style urlencode.
    $endpoint = '/submissions/' . rawurlencode($submissionId)
        . '/similarity/pdf/' . rawurlencode($pdfId);

    return $this->request(
        'GET',
        $endpoint,
        null,
        [],
        true // raw response (no json_decode)
    );
}
// ==================== Internal HTTP layer ====================
/**
 * Single entry point for every HTTP call to the Turnitin API.
 *
 * @param string $method        GET/POST/PUT/DELETE
 * @param string $path          relative path starting with '/', appended to baseUrl
 * @param mixed  $body          array => JSON-encoded; string => sent as the raw body; null => no body
 * @param array  $extraHeaders  additional headers as name => value
 * @param bool   $rawResponse   true = return the raw response string, false = json_decode it
 * @return mixed the raw string when $rawResponse is true; [] for an empty body; the decoded JSON
 *               otherwise; the raw string when the body is not valid JSON
 * @throws Exception when the body cannot be JSON-encoded, cURL fails, or the status is not 2xx
 */
private function request($method, $path, $body = null, $extraHeaders = [], $rawResponse = false)
{
    $url = $this->baseUrl . $path;
    $headers = [
        'Authorization: Bearer ' . $this->apiKey,
        'X-Turnitin-Integration-Name: ' . $this->integrationName,
        'X-Turnitin-Integration-Version: ' . $this->integrationVersion,
    ];

    // HTTP header names are case-insensitive, so detect a caller-supplied Content-Type in any casing.
    $callerSetsContentType = false;
    foreach (array_keys($extraHeaders) as $headerName) {
        if (strcasecmp((string)$headerName, 'Content-Type') === 0) {
            $callerSetsContentType = true;
            break;
        }
    }

    $payload = null;
    if ($body !== null) {
        if (is_array($body)) {
            $payload = json_encode($body, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
            if ($payload === false) {
                // e.g. invalid UTF-8 in a field; without this check `false` would be sent as the body.
                throw new Exception(
                    "Turnitin request body is not JSON-encodable ({$method} {$path}): " . json_last_error_msg()
                );
            }
            $headers[] = 'Content-Type: application/json';
        } else {
            $payload = $body;
            if (!$callerSetsContentType) {
                $headers[] = 'Content-Type: application/octet-stream';
            }
        }
    }
    foreach ($extraHeaders as $k => $v) {
        $headers[] = $k . ': ' . $v;
    }

    $ch = curl_init();
    if ($ch === false) {
        throw new Exception("Turnitin curl error: unable to initialise cURL (url={$url})");
    }
    curl_setopt_array($ch, [
        CURLOPT_URL => $url,
        CURLOPT_CUSTOMREQUEST => strtoupper($method),
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HTTPHEADER => $headers,
        CURLOPT_TIMEOUT => $this->timeout,
        CURLOPT_CONNECTTIMEOUT => 15,
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
    ]);
    if ($payload !== null) {
        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
    }
    $resp = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $err = curl_error($ch);
    curl_close($ch);

    if ($resp === false) {
        throw new Exception("Turnitin curl error: {$err} (url={$url})");
    }
    if ($httpCode < 200 || $httpCode >= 300) {
        // Include the first 1k of the response body to ease troubleshooting.
        $excerpt = mb_substr((string)$resp, 0, 1000);
        throw new Exception("Turnitin HTTP {$httpCode} {$method} {$path}: {$excerpt}");
    }
    if ($rawResponse) {
        return $resp;
    }
    // Some responses may be 204 No Content.
    if ($resp === '') {
        return [];
    }
    $data = json_decode($resp, true);
    if (json_last_error() !== JSON_ERROR_NONE) {
        // Not JSON: hand the original text back unchanged.
        return $resp;
    }
    return $data;
}
}