448 lines
16 KiB
PHP
448 lines
16 KiB
PHP
<?php
|
||
|
||
namespace app\common;
|
||
|
||
use think\Env;
|
||
use think\Exception;
|
||
|
||
/**
 * REST client wrapper for the Turnitin Core API (TCA).
 *
 * Intended for the Crossref Similarity Check channel (product_name=Crossref)
 * as well as a standard TCA integration.
 *
 * Authentication: every request carries
 *   Authorization: Bearer <API_KEY>
 *   X-Turnitin-Integration-Name / X-Turnitin-Integration-Version (used for auditing)
 *
 * .env configuration ([turnitin] section) read by this class:
 *   BASE_URL             e.g. https://crossref-12345.turnitin.com/api/v1 (no trailing slash)
 *   API_KEY              the generated Bearer token (required, no built-in default)
 *   INTEGRATION_NAME     Scope Name (the name entered when the integration was created)
 *   INTEGRATION_VERSION  free-form version string for auditing, e.g. 1.0.0
 *
 * Related [turnitin] keys that this class does NOT read itself (presumably
 * consumed by the callers that drive polling -- confirm against them):
 *   SUBMISSION_INGEST_MAX_WAIT       max seconds to poll for submission readiness after upload,
 *                                    default 600 (synchronous waitAfterUploadForSimilarity only)
 *   SUBMISSION_INGEST_POLL_INTERVAL  synchronous polling interval in seconds, default 3
 *   INGEST_CHAIN_FIRST_DELAY         delay before the first ingest check after upload, default 10 (queue chain)
 *   INGEST_CHAIN_POLL_INTERVAL       delay between ingest chain steps in seconds, default 15
 *   INGEST_CHAIN_MAX_ATTEMPTS        maximum number of ingest chain steps, default 80
 *
 * API documentation: https://developers.turnitin.com/docs/tca
 *
 * Notes:
 *  - Methods return the decoded response (normally an array); HTTP errors throw Exception.
 *  - No business logic lives here (that belongs to PlagiarismService).
 *  - No token caching: a Bearer key needs no login and is sent with every request.
 */
class TurnitinService
{
    private $baseUrl;
    private $apiKey;
    private $integrationName;
    private $integrationVersion;
    /** Total per-request timeout in seconds (passed to CURLOPT_TIMEOUT). */
    private $timeout = 60;

    /**
     * Load connection settings from the environment.
     *
     * @throws Exception when the base URL or the API key resolves to an empty string
     */
    public function __construct()
    {
        $this->baseUrl = rtrim(trim((string)Env::get('turnitin.base_url', 'https://crossref-20794.turnitin.com/api/v1')), '/');
        // Deliberately no fallback value: a credential must come from .env and never
        // from source code. An unset key therefore trips the guard below.
        $this->apiKey = trim((string)Env::get('turnitin.api_key', ''));
        $this->integrationName = trim((string)Env::get('turnitin.integration_name', 'tmr'));
        $this->integrationVersion = trim((string)Env::get('turnitin.integration_version', '1.0.0'));

        if ($this->baseUrl === '' || $this->apiKey === '') {
            throw new Exception('Turnitin not configured: missing BASE_URL or API_KEY in .env [turnitin] section');
        }
    }

    // ==================== Public API ====================

    /**
     * Health probe / fetch the account's enabled capabilities.
     * GET /features-enabled
     *
     * @return mixed decoded response
     * @throws Exception on transport or HTTP error
     */
    public function featuresEnabled()
    {
        return $this->request('GET', '/features-enabled');
    }

    /**
     * Create a submission (its id is required before a file can be uploaded).
     * POST /submissions
     *
     * @param array $meta Required fields:
     *   - title      paper title
     *   - owner      submission owner identifier (custom string, e.g. the submitting system's user_id)
     *   - submitter  submitter identifier (same convention as owner)
     *   - eula       (optional) ['version' => '...', 'language' => 'en-US', 'accepted_timestamp' => ISO8601];
     *                may be omitted when features-enabled reports require_eula=false
     *   Optional fields:
     *   - extract_text_only  bool
     *   - metadata           array of custom key/values for later tracing
     *
     * @return mixed decoded response; contains id (submission UUID), status, owner, ...
     * @throws Exception on transport or HTTP error
     */
    public function createSubmission($meta)
    {
        return $this->request('POST', '/submissions', $meta);
    }

    /**
     * Upload a file to a submission.
     *
     * The TCA path is PUT /submissions/{id}/original. The file name travels only in the
     * Content-Disposition header and must not be appended to the URL; otherwise the
     * gateway answers 404, typically with a path like //v1/submissions/.../original/xxx.docx.
     *
     * @param string $submissionId
     * @param string $filePath  local PDF/DOCX path
     * @param string $filename  display name sent to Turnitin (defaults to basename of $filePath)
     * @return mixed decoded response
     * @throws Exception when the file is missing, unreadable or cannot be read, or on HTTP error
     */
    public function uploadFile($submissionId, $filePath, $filename = '')
    {
        if (!is_file($filePath) || !is_readable($filePath)) {
            throw new Exception("File not found or not readable: {$filePath}");
        }

        $filename = (string)$filename;
        if ($filename === '') {
            $filename = basename($filePath);
        }

        // Strip characters that would break out of the quoted Content-Disposition value
        // (double quote) or inject an extra header line (CR/LF).
        $safeName = str_replace(['"', "\r", "\n"], '', $filename);
        if ($safeName === '') {
            $safeName = 'document.bin';
        }

        $body = file_get_contents($filePath);
        if ($body === false) {
            // Without this check `false` would be handed to cURL as the request body.
            throw new Exception("Failed to read file: {$filePath}");
        }

        return $this->request(
            'PUT',
            self::submissionPath($submissionId, '/original'),
            $body,
            [
                'Content-Type' => 'application/octet-stream',
                'Content-Disposition' => 'attachment; filename="' . $safeName . '"',
            ]
        );
    }

    /**
     * Trigger similarity matching.
     * PUT /submissions/{id}/similarity
     *
     * $opts is merged over the defaults with array_merge(), i.e. shallowly: supplying a
     * top-level key such as 'generation_settings' replaces that whole default section
     * rather than merging into it.
     *
     * @param string $submissionId
     * @param array  $opts
     *   - generation_settings.search_repositories               defaults to ['INTERNET','PUBLICATION',...]
     *   - generation_settings.submission_auto_excludes          array of strings (e.g. [] or repository keys);
     *                                                            a boolean is rejected with HTTP 400
     *   - generation_settings.auto_exclude_self_matching_scope  optional, e.g. 'GROUP_CONTEXT'
     *   - view_settings.exclude_*                               boolean exclusions (as in the TCA docs)
     *   - indexing_settings.add_to_index                        bool, whether to add this paper to the
     *                                                            SUBMITTED_WORK index (normally true)
     * @return mixed decoded response
     * @throws Exception on transport or HTTP error
     */
    public function triggerSimilarity($submissionId, $opts = [])
    {
        $body = array_merge([
            'generation_settings' => [
                'search_repositories' => ['INTERNET', 'PUBLICATION', 'CROSSREF', 'CROSSREF_POSTED_CONTENT', 'SUBMITTED_WORK'],
                // Server-side type is List<String>; sending true yields
                // 400: Cannot deserialize ... from Boolean
                'submission_auto_excludes' => [],
                'auto_exclude_self_matching_scope' => 'GROUP_CONTEXT',
            ],
            'view_settings' => [
                'exclude_quotes' => true,
                'exclude_bibliography' => true,
                'exclude_citations' => true,
            ],
            'indexing_settings' => [
                'add_to_index' => true,
            ],
        ], $opts);

        return $this->request('PUT', self::submissionPath($submissionId, '/similarity'), $body);
    }

    /**
     * Fetch submission details (polled after upload to see whether parsing has finished).
     * GET /submissions/{id}
     *
     * @param string $submissionId
     * @return mixed decoded JSON (commonly status=ok plus a message holding id/status)
     * @throws Exception on transport or HTTP error
     */
    public function getSubmission($submissionId)
    {
        return $this->request('GET', self::submissionPath($submissionId));
    }

    /**
     * Perform one GET /submissions/{id} and classify whether PUT /similarity may be called.
     * Never sleeps, so a queue chain can poll step by step.
     *
     * @param string $submissionId
     * @return array{ready:bool, failed:bool, status:string, snippet:string, message:array}
     *         status is upper-cased and trimmed; snippet is at most the first 400 characters
     *         of the JSON-encoded payload
     * @throws Exception on transport or HTTP error
     */
    public function parseSubmissionIngestState($submissionId)
    {
        $raw = $this->getSubmission($submissionId);
        $msg = self::unwrapSubmissionPayload($raw);
        $st = strtoupper(trim((string) self::pickSubmissionStatus($msg)));
        $snippet = mb_substr(json_encode($msg, JSON_UNESCAPED_UNICODE), 0, 400);

        $ready = [
            'COMPLETE', 'COMPLETED', 'PROCESSED', 'READY', 'SUCCEEDED',
            'COMPLETE_PROCESSING',
        ];
        $failed = ['ERROR', 'FAILED', 'CANCELLED', 'CANCELED', 'DELETED'];

        // An empty status is neither ready nor failed: the caller keeps polling.
        $readyFlag = $st !== '' && in_array($st, $ready, true);
        $failedFlag = $st !== '' && in_array($st, $failed, true);

        return [
            'ready' => $readyFlag,
            'failed' => $failedFlag,
            'status' => $st,
            'snippet' => $snippet,
            'message' => $msg,
        ];
    }

    /**
     * Block until Turnitin has finished parsing an uploaded file.
     *
     * Synchronous variant, meant for CLI/debugging only; production should use the
     * queue chain (PlagiarismWaitIngest).
     *
     * @param string $submissionId
     * @param int    $maxWaitSec   maximum seconds to wait, default 600; values below 30 are raised to 30
     * @param int    $intervalSec  polling interval in seconds, default 3; values below 1 are raised to 1
     * @return void
     * @throws Exception on timeout or when the submission reaches a failed state
     */
    public function waitAfterUploadForSimilarity($submissionId, $maxWaitSec = 600, $intervalSec = 3)
    {
        $deadline = time() + max(30, (int)$maxWaitSec);
        $intervalSec = max(1, (int)$intervalSec);
        $lastStatus = '';
        $lastSnippet = '';

        while (time() < $deadline) {
            $parsed = $this->parseSubmissionIngestState($submissionId);
            $lastStatus = $parsed['status'];
            $lastSnippet = $parsed['snippet'];

            if (!empty($parsed['ready'])) {
                return;
            }
            if (!empty($parsed['failed'])) {
                throw new Exception('Turnitin submission failed, status=' . $lastStatus . ' body=' . $lastSnippet);
            }

            sleep($intervalSec);
        }

        throw new Exception(
            'Timeout waiting for Turnitin submission ingest (last status=' . ($lastStatus ?: '(empty)') . ') snippet=' . $lastSnippet
        );
    }

    /**
     * Return the submission payload, unwrapping a {"message": {...}} envelope when present.
     *
     * @param mixed $decoded decoded response body
     * @return array the 'message' element if it is an array, the input itself if it is an
     *               array without such an element, otherwise []
     */
    private static function unwrapSubmissionPayload($decoded)
    {
        if (!is_array($decoded)) {
            return [];
        }
        if (isset($decoded['message']) && is_array($decoded['message'])) {
            return $decoded['message'];
        }
        return $decoded;
    }

    /**
     * Find the first non-empty status-like field in the payload.
     *
     * The payload itself is searched first, then its 'submission' sub-array if present.
     *
     * @param array $msg unwrapped submission payload
     * @return string the status as found (not normalised), or '' when none is present
     */
    private static function pickSubmissionStatus(array $msg)
    {
        $candidates = [$msg];
        if (isset($msg['submission']) && is_array($msg['submission'])) {
            $candidates[] = $msg['submission'];
        }
        foreach ($candidates as $m) {
            foreach (['status', 'workflow_status', 'submission_status', 'processing_status', 'paper_status'] as $k) {
                // Skip non-scalar values: casting an array would produce the literal
                // "Array" (plus a notice) and be mistaken for a real status.
                if (!empty($m[$k]) && is_scalar($m[$k])) {
                    return (string)$m[$k];
                }
            }
        }
        return '';
    }

    /**
     * Build the relative API path for a submission.
     *
     * rawurlencode() is used because the id is a path segment: urlencode() would turn a
     * space into '+', which is only meaningful in query strings.
     *
     * @param string $submissionId
     * @param string $suffix already-encoded path remainder starting with '/', or ''
     * @return string e.g. /submissions/{id}/similarity
     */
    private static function submissionPath($submissionId, $suffix = '')
    {
        return '/submissions/' . rawurlencode((string)$submissionId) . $suffix;
    }

    /**
     * Query the similarity status.
     * GET /submissions/{id}/similarity
     *
     * Per the TCA docs the response status is PROCESSING / COMPLETE / ERROR; when COMPLETE
     * it carries overall_match_percentage / time_requested / time_generated.
     *
     * @param string $submissionId
     * @return mixed decoded response
     * @throws Exception on transport or HTTP error
     */
    public function getSimilarityStatus($submissionId)
    {
        return $this->request('GET', self::submissionPath($submissionId, '/similarity'));
    }

    /**
     * Obtain a temporary URL for viewing the report online.
     * POST /submissions/{id}/viewer-url
     *
     * Per the TCA docs the response contains viewer_url, valid for a few hours.
     * $viewer is merged over the defaults shallowly (array_merge): a supplied top-level
     * key such as 'similarity' replaces that whole default section.
     *
     * @param string $submissionId
     * @param array  $viewer optional viewer settings, e.g. ['viewer_default_permission_set' => 'INSTRUCTOR']
     * @return mixed decoded response
     * @throws Exception on transport or HTTP error
     */
    public function getViewerUrl($submissionId, $viewer = [])
    {
        $body = array_merge([
            'viewer_default_permission_set' => 'INSTRUCTOR',
            'similarity' => [
                'default_mode' => 'MATCH_OVERVIEW',
                'view_settings' => ['save_changes' => true],
                'modes' => ['match_overview' => true, 'all_sources' => true],
            ],
            'locale' => 'en-US',
        ], $viewer);

        return $this->request('POST', self::submissionPath($submissionId, '/viewer-url'), $body);
    }

    /**
     * Request generation of the PDF report (asynchronous; poll getPdfReportStatus()).
     * POST /submissions/{id}/similarity/pdf
     *
     * @param string $submissionId
     * @param array  $opts request fields merged over the default ['locale' => 'en-US']
     * @return mixed decoded response; contains id (the PDF report id)
     * @throws Exception on transport or HTTP error
     */
    public function requestPdfReport($submissionId, $opts = [])
    {
        $body = array_merge([
            'locale' => 'en-US',
        ], $opts);

        return $this->request('POST', self::submissionPath($submissionId, '/similarity/pdf'), $body);
    }

    /**
     * Query the PDF report status.
     * GET /submissions/{id}/similarity/pdf/{pdf_id}/status
     *
     * Per the TCA docs, status is PENDING / SUCCESS / FAILED.
     *
     * @param string $submissionId
     * @param string $pdfId id returned by requestPdfReport()
     * @return mixed decoded response
     * @throws Exception on transport or HTTP error
     */
    public function getPdfReportStatus($submissionId, $pdfId)
    {
        return $this->request(
            'GET',
            self::submissionPath($submissionId, '/similarity/pdf/' . rawurlencode((string)$pdfId) . '/status')
        );
    }

    /**
     * Download the PDF report content (callable only once status=SUCCESS).
     * GET /submissions/{id}/similarity/pdf/{pdf_id}
     *
     * @param string $submissionId
     * @param string $pdfId id returned by requestPdfReport()
     * @return string raw PDF bytes; the caller is responsible for writing them to disk
     * @throws Exception on transport or HTTP error
     */
    public function downloadPdfReport($submissionId, $pdfId)
    {
        return $this->request(
            'GET',
            self::submissionPath($submissionId, '/similarity/pdf/' . rawurlencode((string)$pdfId)),
            null,
            [],
            true // raw response (no json_decode)
        );
    }

    // ==================== Internal HTTP layer ====================

    /**
     * Single entry point for all HTTP calls.
     *
     * @param string $method       GET/POST/PUT/DELETE
     * @param string $path         relative path starting with '/', appended to the base URL
     * @param mixed  $body         array => JSON-encoded; string => sent as the raw body; null => no body
     * @param array  $extraHeaders additional headers as name => value
     * @param bool   $rawResponse  true => return the raw response string; false => json_decode it
     * @return mixed raw string when $rawResponse is true; [] for an empty body; the decoded
     *               value for valid JSON; otherwise the raw response string
     * @throws Exception when the body cannot be encoded, on cURL failure, or on a non-2xx status
     */
    private function request($method, $path, $body = null, $extraHeaders = [], $rawResponse = false)
    {
        $url = $this->baseUrl . $path;

        $headers = [
            'Authorization: Bearer ' . $this->apiKey,
            'X-Turnitin-Integration-Name: ' . $this->integrationName,
            'X-Turnitin-Integration-Version: ' . $this->integrationVersion,
        ];

        $payload = null;
        if ($body !== null) {
            if (is_array($body)) {
                $payload = json_encode($body, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
                if ($payload === false) {
                    // e.g. invalid UTF-8 in a title; fail here instead of sending `false`.
                    throw new Exception('Turnitin request body is not JSON-encodable (json error ' . json_last_error() . ')');
                }
                $headers[] = 'Content-Type: application/json';
            } else {
                $payload = $body;
                // Raw bodies default to octet-stream unless the caller names a type.
                if (!isset($extraHeaders['Content-Type'])) {
                    $headers[] = 'Content-Type: application/octet-stream';
                }
            }
        }
        foreach ($extraHeaders as $k => $v) {
            $headers[] = $k . ': ' . $v;
        }

        $ch = curl_init();
        if ($ch === false) {
            throw new Exception("Turnitin curl error: unable to initialise cURL (url={$url})");
        }
        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_CUSTOMREQUEST => strtoupper($method),
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_HTTPHEADER => $headers,
            CURLOPT_TIMEOUT => $this->timeout,
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_SSL_VERIFYHOST => 2,
        ]);
        if ($payload !== null) {
            curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
        }

        $resp = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $err = curl_error($ch);
        curl_close($ch);

        if ($resp === false) {
            throw new Exception("Turnitin curl error: {$err} (url={$url})");
        }
        if ($httpCode < 200 || $httpCode >= 300) {
            // Include the first 1000 characters of the response body to ease debugging.
            $excerpt = mb_substr((string)$resp, 0, 1000);
            throw new Exception("Turnitin HTTP {$httpCode} {$method} {$path}: {$excerpt}");
        }

        if ($rawResponse) {
            return $resp;
        }
        // Some responses may be 204 No Content.
        if ($resp === '' || $resp === null) {
            return [];
        }
        $data = json_decode($resp, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            // Not JSON: hand the body back verbatim.
            return $resp;
        }
        return $data;
    }
}
|