Files
tougao/application/common/TurnitinService.php
2026-06-03 13:30:27 +08:00

752 lines
27 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use think\Env;
use think\Exception;
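/**
 * REST client wrapper for the Turnitin Core API (TCA).
 *
 * Works with the Crossref Similarity Check channel (product_name=Crossref) as well as
 * standard TCA integrations.
 *
 * Authentication: Authorization: Bearer <API_KEY>
 * X-Turnitin-Integration-Name / X-Turnitin-Integration-Version are sent for auditing.
 *
 * .env configuration ([turnitin] section):
 *   BASE_URL                         e.g. https://crossref-12345.turnitin.com/api/v1 (no trailing slash)
 *   API_KEY                          the generated Bearer token
 *   INTEGRATION_NAME                 Scope Name (the name entered when the integration was created)
 *   INTEGRATION_VERSION              custom version string for auditing, e.g. 1.0.0
 *   SUBMISSION_INGEST_MAX_WAIT       maximum seconds to poll for submission readiness after upload,
 *                                    default 600 (used only by the synchronous waitAfterUploadForSimilarity)
 *   SUBMISSION_INGEST_POLL_INTERVAL  synchronous polling interval in seconds, default 3
 *   INGEST_CHAIN_FIRST_DELAY         delay in seconds before the first ingest check after upload,
 *                                    default 10 (queue chain)
 *   INGEST_CHAIN_POLL_INTERVAL       interval in seconds between ingest-chain steps, default 15
 *   INGEST_CHAIN_MAX_ATTEMPTS        maximum number of ingest-chain steps, default 80
 *   EXCLUDE_QUOTES / EXCLUDE_BIBLIOGRAPHY / EXCLUDE_CITATIONS
 *                                    0|1, default 0 (closer to a manual check in the Crossref web UI)
 *   VIEWER_DEFAULT_MODE              match_overview | all_sources, default all_sources
 *                                    (makes it easier to browse matches by source repository)
 *   ADD_TO_INDEX                     0|1, default 1
 *
 * API documentation: https://developers.turnitin.com/docs/tca
 *
 * Notes:
 *  - Every method returns the raw decoded array; HTTP errors throw an Exception.
 *  - No business logic lives here (that belongs in PlagiarismService).
 *  - No token caching (Bearer auth needs no login; every request carries the key).
 */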
class TurnitinService
{
private $baseUrl;
private $apiKey;
private $integrationName;
private $integrationVersion;
private $timeout = 60;
/**
 * Load connection settings from the [turnitin] section of .env.
 *
 * The base URL and API key have no built-in fallback: a secret must never be
 * committed to source control, and a hard-coded default would also make the
 * "not configured" guard below unreachable. Deployments must provide both
 * values through the environment.
 *
 * @throws Exception when turnitin.base_url or turnitin.api_key is missing or blank
 */
public function __construct()
{
    $this->baseUrl = rtrim(trim((string) Env::get('turnitin.base_url', '')), '/');
    $this->apiKey = trim((string) Env::get('turnitin.api_key', ''));
    $this->integrationName = trim((string) Env::get('turnitin.integration_name', 'tmr'));
    $this->integrationVersion = trim((string) Env::get('turnitin.integration_version', '1.0.0'));

    if ($this->baseUrl === '' || $this->apiKey === '') {
        throw new Exception('Turnitin not configured: missing BASE_URL or API_KEY in .env [turnitin] section');
    }
}
// ==================== Public API ====================
/**
 * Liveness probe that also returns the capabilities enabled for the account.
 *
 * Endpoint: GET /features-enabled
 *
 * @return mixed decoded response body as produced by request()
 * @throws Exception on transport failure or a non-2xx status
 */
public function featuresEnabled()
{
    $capabilities = $this->request('GET', '/features-enabled');

    return $capabilities;
}
/**
 * Create a submission; its id is needed before a file can be uploaded.
 *
 * Endpoint: POST /submissions
 *
 * @param array $meta Request body, sent as JSON. Required fields:
 *   - title      paper title
 *   - owner      identifier of the submission owner (custom string, e.g. the
 *                user_id of the manuscript system)
 *   - submitter  identifier of the submitter (same convention as owner)
 *   - eula       (optional) ['version' => '...', 'language' => 'en-US',
 *                'accepted_timestamp' => ISO8601]; may be omitted when
 *                features-enabled reports require_eula=false
 *   Optional fields:
 *   - extract_text_only  bool
 *   - metadata           array of custom key/values kept for later traceability
 *
 * @return array Response containing id (submission UUID), status, owner, ...
 * @throws Exception on transport failure or a non-2xx status
 */
public function createSubmission($meta)
{
    $created = $this->request('POST', '/submissions', $meta);

    return $created;
}
/**
 * Upload a file to an existing submission.
 *
 * The TCA path is PUT /submissions/{id}/original. The file name travels only in
 * the Content-Disposition header and must not be appended to the URL; otherwise
 * the gateway answers 404 (the error then typically shows a path such as
 * //v1/submissions/.../original/xxx.docx).
 *
 * @param string $submissionId
 * @param string $filePath local path of the PDF/DOCX to send
 * @param string $filename display name passed to Turnitin (defaults to the basename of $filePath)
 * @return array
 * @throws Exception when the file is missing, unreadable, or cannot be read, or when the request fails
 */
public function uploadFile($submissionId, $filePath, $filename = '')
{
    if (!is_file($filePath) || !is_readable($filePath)) {
        throw new Exception("File not found or not readable: {$filePath}");
    }
    if ($filename === '') {
        $filename = basename($filePath);
    }

    // Strip characters that would break the quoted filename or inject a header line.
    $safeName = str_replace(['"', "\r", "\n"], '', $filename);
    if ($safeName === '') {
        $safeName = 'document.bin';
    }

    // is_readable() can pass and the read can still fail (race, I/O error);
    // never forward `false` as the request body.
    $body = file_get_contents($filePath);
    if ($body === false) {
        throw new Exception("Failed to read file: {$filePath}");
    }

    return $this->request(
        'PUT',
        '/submissions/' . rawurlencode($submissionId) . '/original',
        $body,
        [
            'Content-Type' => 'application/octet-stream',
            'Content-Disposition' => 'attachment; filename="' . $safeName . '"',
        ]
    );
}
/**
 * Start the similarity comparison for a submission.
 *
 * Endpoint: PUT /submissions/{id}/similarity
 *
 * $opts is merged over defaultSimilarityPayload() one section at a time: when a
 * section (generation_settings, view_settings, indexing_settings) is an array on
 * both sides, the caller's keys override the matching defaults and the remaining
 * defaults in that section are kept. A plain top-level merge would let a partial
 * section such as ['view_settings' => ['exclude_quotes' => true]] discard every
 * other default of that section.
 *
 * @param string $submissionId
 * @param array $opts
 *   - generation_settings.search_repositories       default ['INTERNET','PUBLICATION',...]
 *   - generation_settings.submission_auto_excludes  array of strings ([] or specific keys);
 *                                                   a boolean is rejected with HTTP 400
 *   - generation_settings.auto_exclude_self_matching_scope  optional, e.g. 'GROUP_CONTEXT'
 *   - view_settings.exclude_*                       boolean exclusion flags (as in the TCA docs)
 *   - indexing_settings.add_to_index                bool, whether to add this paper to the
 *                                                   SUBMITTED_WORK index (usually true)
 * @return array
 * @throws Exception on transport failure or a non-2xx status
 */
public function triggerSimilarity($submissionId, $opts = [])
{
    $body = $this->defaultSimilarityPayload();
    foreach ($opts as $section => $value) {
        if (isset($body[$section]) && is_array($body[$section]) && is_array($value)) {
            // Caller keys win; list values (e.g. search_repositories) are replaced whole.
            $body[$section] = array_merge($body[$section], $value);
        } else {
            $body[$section] = $value;
        }
    }

    return $this->request(
        'PUT',
        '/submissions/' . rawurlencode($submissionId) . '/similarity',
        $body
    );
}
/**
 * Default parameters shared by PUT /similarity and the PDF export.
 *
 * Earlier versions hard-coded exclude_*=true, which made the overall similarity
 * lower than a manual check in the Crossref web UI (unrelated to the numbering
 * or type of the matched sources).
 *
 * @return array payload with generation_settings, view_settings and indexing_settings
 */
public function defaultSimilarityPayload()
{
    $generationSettings = [
        'search_repositories' => ['INTERNET', 'PUBLICATION', 'CROSSREF', 'CROSSREF_POSTED_CONTENT', 'SUBMITTED_WORK'],
        'submission_auto_excludes' => [],
    ];

    // The scope key is sent only when the configured value is non-blank.
    $selfMatchScope = trim((string) Env::get('turnitin.auto_exclude_self_matching_scope', 'GROUP_CONTEXT'));
    if ($selfMatchScope !== '') {
        $generationSettings['auto_exclude_self_matching_scope'] = $selfMatchScope;
    }

    $viewSettings = $this->defaultViewSettings();
    $indexingSettings = ['add_to_index' => $this->envBool('turnitin.add_to_index', true)];

    return [
        'generation_settings' => $generationSettings,
        'view_settings' => $viewSettings,
        'indexing_settings' => $indexingSettings,
    ];
}
/**
 * Build the exclude_* view flags from the environment; each defaults to false.
 *
 * @return array map of exclude_quotes / exclude_bibliography / exclude_citations to bool
 */
public function defaultViewSettings()
{
    $settings = [];
    foreach (['exclude_quotes', 'exclude_bibliography', 'exclude_citations'] as $flag) {
        $settings[$flag] = $this->envBool('turnitin.' . $flag, false);
    }

    return $settings;
}
/**
 * Extract the overall similarity (0-100) from a GET /similarity response.
 *
 * Looks at the top level, at a nested `message` array, and at a `similarity`
 * array inside it. The first numeric, non-negative candidate wins. A value
 * strictly between 0 and 1 is treated as a ratio and multiplied by 100; the
 * integer 1 stays 1 (1%). The result is capped at 100 and rounded to 2 decimals.
 *
 * @param array $statusResp decoded response
 * @return float 0.0 when no usable value is found
 */
public static function extractOverallMatchPercentage(array $statusResp)
{
    $inner = (isset($statusResp['message']) && is_array($statusResp['message']))
        ? $statusResp['message']
        : $statusResp;

    // Candidates in priority order.
    $rawValues = [
        $statusResp['overall_match_percentage'] ?? null,
        $statusResp['overall_match'] ?? null,
        $statusResp['similarity_percentage'] ?? null,
        $inner['overall_match_percentage'] ?? null,
        $inner['overall_match'] ?? null,
    ];
    if (isset($inner['similarity']) && is_array($inner['similarity'])) {
        $rawValues[] = $inner['similarity']['overall_match_percentage'] ?? null;
        $rawValues[] = $inner['similarity']['overall_match'] ?? null;
    }

    foreach ($rawValues as $value) {
        if ($value === null || $value === '' || !is_numeric($value)) {
            continue;
        }
        $number = floatval($value);
        if ($number < 0) {
            continue;
        }
        // Only a true fraction (e.g. 0.12 = 12%) is scaled; integer 1 means 1%.
        $scaled = ($number > 0 && $number < 1.0) ? $number * 100 : $number;

        return round(min($scaled, 100), 2);
    }

    return 0.0;
}
/**
 * Pull a best-effort per-source summary out of a GET /similarity response, for
 * list display (the full detail remains in the Turnitin online report).
 *
 * Source nodes are gathered from the response and from its nested `message`
 * array, then de-duplicated on repository + percentage + word count.
 *
 * @return array{score:float,sources:array<int,array<string,mixed>>}
 */
public static function parseSimilarityReportMeta(array $statusResp)
{
    $percentFields = ['percentage', 'match_percentage', 'overall_match_percentage', 'similarity_percentage'];
    $repositoryFields = ['repository', 'repository_name', 'collection', 'source_type', 'type', 'database', 'category'];

    $nodes = [];
    self::collectSimilaritySourceNodes($statusResp, $nodes, 0);
    if (isset($statusResp['message']) && is_array($statusResp['message'])) {
        self::collectSimilaritySourceNodes($statusResp['message'], $nodes, 0);
    }

    $sources = [];
    $fingerprints = [];
    foreach ($nodes as $entry) {
        if (!is_array($entry)) {
            continue;
        }

        // First numeric percentage-like field wins.
        $percent = null;
        foreach ($percentFields as $field) {
            if (isset($entry[$field]) && is_numeric($entry[$field])) {
                $percent = floatval($entry[$field]);
                break;
            }
        }

        // First non-empty repository-like field wins, normalised to upper case.
        $repository = '';
        foreach ($repositoryFields as $field) {
            if (!empty($entry[$field])) {
                $repository = strtoupper(trim((string) $entry[$field]));
                break;
            }
        }

        if (isset($entry['matched_word_count'])) {
            $wordCount = intval($entry['matched_word_count']);
        } elseif (isset($entry['word_count'])) {
            $wordCount = intval($entry['word_count']);
        } else {
            $wordCount = 0;
        }

        $fingerprint = $repository . '|' . ($percent === null ? '' : $percent) . '|' . $wordCount;
        if (isset($fingerprints[$fingerprint])) {
            continue;
        }
        $fingerprints[$fingerprint] = true;

        // Only fields that carry a value are emitted.
        $row = [];
        if ($repository !== '') {
            $row['repository'] = $repository;
        }
        if ($percent !== null) {
            $row['match_percentage'] = $percent;
        }
        if ($wordCount > 0) {
            $row['matched_word_count'] = $wordCount;
        }
        $sources[] = $row;
    }

    return [
        'score' => self::extractOverallMatchPercentage($statusResp),
        'sources' => $sources,
    ];
}
/**
 * Recursively collect every array node that carries a repository-like key.
 *
 * A node qualifies when any of repository / repository_name / collection /
 * source_type is non-empty. Recursion stops once $depth exceeds 8.
 *
 * @param array<string,mixed> $node
 * @param array<int,mixed> $out matching nodes are appended here
 * @param int $depth current recursion depth
 */
private static function collectSimilaritySourceNodes($node, array &$out, $depth)
{
    if (!is_array($node) || $depth > 8) {
        return;
    }

    foreach (['repository', 'repository_name', 'collection', 'source_type'] as $marker) {
        if (!empty($node[$marker])) {
            $out[] = $node;
            break;
        }
    }

    $nextDepth = $depth + 1;
    foreach ($node as $child) {
        if (!is_array($child)) {
            continue;
        }
        $isListOfArrays = isset($child[0]) && is_array($child[0]);
        if (!$isListOfArrays) {
            self::collectSimilaritySourceNodes($child, $out, $nextDepth);
            continue;
        }
        // A list of arrays is flattened: its elements are visited at the same
        // depth the list itself would have been.
        foreach ($child as $element) {
            self::collectSimilaritySourceNodes($element, $out, $nextDepth);
        }
    }
}
/**
 * Default view for the online Similarity Report (aligned with the "view by
 * source" layout of the Crossref back office).
 *
 * turnitin.viewer_default_mode is honoured only when it is match_overview or
 * all_sources; anything else falls back to all_sources.
 *
 * @return array with default_mode and the enabled modes
 */
public function defaultViewerSimilarityBlock()
{
    $allowedModes = ['match_overview', 'all_sources'];
    $requested = strtolower(trim((string) Env::get('turnitin.viewer_default_mode', 'all_sources')));
    $defaultMode = in_array($requested, $allowedModes, true) ? $requested : 'all_sources';

    return [
        'default_mode' => $defaultMode,
        'modes' => array_fill_keys($allowedModes, true),
    ];
}
/**
 * Read an environment value as a boolean.
 *
 * Real booleans pass through; otherwise the trimmed, lower-cased string is true
 * only for 1 / true / yes / on.
 *
 * @param string $name environment key
 * @param bool $default used when the key is absent
 * @return bool
 */
private function envBool($name, $default = false)
{
    $raw = Env::get($name, $default ? '1' : '0');
    if (is_bool($raw)) {
        return $raw;
    }

    $normalized = strtolower(trim((string) $raw));

    return in_array($normalized, ['1', 'true', 'yes', 'on'], true);
}
/**
 * Fetch submission details (polled after upload to see whether parsing finished).
 *
 * Endpoint: GET /submissions/{id}
 *
 * @return array decoded JSON; commonly status=ok with id/status inside `message`
 * @throws Exception on transport failure or a non-2xx status
 */
public function getSubmission($submissionId)
{
    $path = '/submissions/' . rawurlencode($submissionId);

    return $this->request('GET', $path);
}
/**
 * Perform one GET /submissions/{id} and classify the result: can PUT /similarity
 * be called yet? Never sleeps, so a queue chain can poll step by step.
 *
 * @return array{ready:bool, failed:bool, status:string, snippet:string, message:array}
 */
public function parseSubmissionIngestState($submissionId)
{
    $readyStates = [
        'COMPLETE', 'COMPLETED', 'PROCESSED', 'READY', 'SUCCEEDED',
        'COMPLETE_PROCESSING',
    ];
    $failedStates = ['ERROR', 'FAILED', 'CANCELLED', 'CANCELED', 'DELETED'];

    $payload = self::unwrapSubmissionPayload($this->getSubmission($submissionId));
    $status = strtoupper(trim((string) self::pickSubmissionStatus($payload)));

    return [
        'ready' => $status !== '' && in_array($status, $readyStates, true),
        'failed' => $status !== '' && in_array($status, $failedStates, true),
        'status' => $status,
        // First 400 characters of the payload, kept for diagnostics.
        'snippet' => mb_substr(json_encode($payload, JSON_UNESCAPED_UNICODE), 0, 400),
        'message' => $payload,
    ];
}
/**
 * Block until Turnitin has finished parsing an uploaded file. Synchronous
 * variant for CLI/debugging only; production should use the PlagiarismWaitIngest
 * queue chain.
 *
 * @param string $submissionId
 * @param int $maxWaitSec maximum seconds to wait, default 600 (10 minutes); values below 30 are raised to 30
 * @param int $intervalSec polling interval in seconds, default 3; values below 1 are raised to 1
 * @throws Exception on timeout or when the submission reaches a failed state
 */
public function waitAfterUploadForSimilarity($submissionId, $maxWaitSec = 600, $intervalSec = 3)
{
    $giveUpAt = time() + max(30, (int) $maxWaitSec);
    $pause = max(1, (int) $intervalSec);
    $status = '';
    $snippet = '';

    while (time() < $giveUpAt) {
        $state = $this->parseSubmissionIngestState($submissionId);
        $status = $state['status'];
        $snippet = $state['snippet'];

        if (!empty($state['ready'])) {
            return;
        }
        if (!empty($state['failed'])) {
            throw new Exception('Turnitin submission failed, status=' . $status . ' body=' . $snippet);
        }

        sleep($pause);
    }

    $shownStatus = $status ? $status : '(empty)';
    throw new Exception(
        'Timeout waiting for Turnitin submission ingest (last status=' . $shownStatus . ') snippet=' . $snippet
    );
}
/**
 * Return the nested `message` array when present, else the decoded value itself.
 *
 * @param mixed $decoded
 * @return array empty array when $decoded is not an array
 */
private static function unwrapSubmissionPayload($decoded)
{
    if (!is_array($decoded)) {
        return [];
    }

    $hasEnvelope = isset($decoded['message']) && is_array($decoded['message']);

    return $hasEnvelope ? $decoded['message'] : $decoded;
}
/**
 * Find the first non-empty status-like field, checking the payload itself and
 * then its nested `submission` array.
 *
 * @param array $msg
 * @return string empty string when no status field is set
 */
private static function pickSubmissionStatus(array $msg)
{
    $statusFields = ['status', 'workflow_status', 'submission_status', 'processing_status', 'paper_status'];

    $layers = [$msg];
    if (isset($msg['submission']) && is_array($msg['submission'])) {
        $layers[] = $msg['submission'];
    }

    foreach ($layers as $layer) {
        foreach ($statusFields as $field) {
            if (empty($layer[$field])) {
                continue;
            }

            return (string) $layer[$field];
        }
    }

    return '';
}
/**
 * Query the similarity status of a submission.
 *
 * Endpoint: GET /submissions/{id}/similarity
 *
 * The response carries status PROCESSING / COMPLETE / ERROR; once COMPLETE it
 * also has overall_match_percentage / time_requested / time_generated.
 *
 * @return mixed decoded response body as produced by request()
 * @throws Exception on transport failure or a non-2xx status
 */
public function getSimilarityStatus($submissionId)
{
    $path = '/submissions/' . rawurlencode($submissionId) . '/similarity';

    return $this->request('GET', $path);
}
/**
 * Obtain a temporary URL for viewing the report online.
 *
 * Endpoint: POST /submissions/{id}/viewer-url
 *
 * The response carries viewer_url (reported to stay valid for a few hours).
 *
 * TCA requires default_mode in lower case (e.g. match_overview); LTI-style
 * fields such as save_changes can trigger HTTP 400. The Crossref channel
 * normally uses ADMINISTRATOR/USER rather than INSTRUCTOR. Configurable in .env:
 *   turnitin.viewer_permission_set=ADMINISTRATOR
 *
 * Candidate request bodies from buildViewerUrlBodies() are tried in order; the
 * next one is attempted only when the previous attempt failed with HTTP 400.
 *
 * @param string $submissionId
 * @param array $viewer optional: viewer_user_id, triggered_by (mapped to editor_{id}),
 *                      or a complete request body override
 * @return mixed decoded response body as produced by request()
 * @throws Exception when the id is blank, the report is not COMPLETE, or every attempt fails
 */
public function getViewerUrl($submissionId, $viewer = [])
{
    $submissionId = trim((string) $submissionId);
    if ($submissionId === '') {
        throw new Exception('submissionId required for viewer-url');
    }

    // The status may sit at the top level or inside a `message` envelope; prefer
    // the nested one when it exists so an outer status is not mistaken for the
    // report state. request() may also return a plain string for non-JSON bodies.
    $statusResp = $this->getSimilarityStatus($submissionId);
    $statusValue = '';
    if (is_array($statusResp)) {
        if (isset($statusResp['message']) && is_array($statusResp['message'])
            && isset($statusResp['message']['status'])) {
            $statusValue = $statusResp['message']['status'];
        } elseif (isset($statusResp['status'])) {
            $statusValue = $statusResp['status'];
        }
    }
    $st = is_scalar($statusValue) ? strtoupper(trim((string) $statusValue)) : '';
    if ($st !== '' && $st !== 'COMPLETE') {
        throw new Exception('similarity report not ready for viewer-url, status=' . $st);
    }

    $path = '/submissions/' . rawurlencode($submissionId) . '/viewer-url';
    $lastError = null;
    foreach ($this->buildViewerUrlBodies($viewer) as $body) {
        try {
            return $this->request('POST', $path, $body);
        } catch (Exception $e) {
            $lastError = $e;
            // request() formats HTTP failures as "Turnitin HTTP <code> <method> ...".
            // Match the prefix only, so a 5xx whose body excerpt happens to
            // contain "HTTP 400" is not retried as if it were a 400.
            if (strpos($e->getMessage(), 'Turnitin HTTP 400 ') !== 0) {
                throw $e;
            }
        }
    }

    throw $lastError ?: new Exception('viewer-url failed');
}
/**
 * Produce candidate viewer-url request bodies in priority order (the caller
 * tries the next one when the previous attempt returned HTTP 400).
 *
 * When the overrides already name viewer_default_permission_set they are used
 * as the single body, with viewer_user_id filled in if missing. Otherwise three
 * variants are generated per permission set: with view_settings only, with the
 * mode block plus view_settings, and with no similarity block at all.
 *
 * @return array<int,array>
 */
private function buildViewerUrlBodies(array $viewerOverrides)
{
    if (isset($viewerOverrides['viewer_default_permission_set'])) {
        $explicit = $viewerOverrides;
        if (empty($explicit['viewer_user_id'])) {
            $explicit['viewer_user_id'] = $this->resolveViewerUserId($viewerOverrides);
        }

        return [$explicit];
    }

    $locale = trim((string) Env::get('turnitin.viewer_locale', 'en-US'));
    if (!$locale) {
        $locale = 'en-US';
    }

    $configuredSets = trim((string) Env::get('turnitin.viewer_permission_set', ''));
    if ($configuredSets === '') {
        $permissionSets = $this->defaultViewerPermissionSets();
    } else {
        $permissionSets = array_map('trim', explode(',', $configuredSets));
    }

    // viewer_user_id is mandatory for TCA; omitting it used to cause 400 Bad request.
    $identity = [
        'viewer_user_id' => $this->resolveViewerUserId($viewerOverrides),
        'locale' => $locale,
    ];
    $viewSettings = [
        'view_settings' => ['save_changes' => $this->envBool('turnitin.viewer_save_changes', false)],
    ];
    $modeBlock = $this->defaultViewerSimilarityBlock();

    $bodies = [];
    foreach ($permissionSets as $permissionSet) {
        if ($permissionSet === '') {
            continue;
        }
        $common = $identity + ['viewer_default_permission_set' => $permissionSet];

        $bodies[] = $common + ['similarity' => $viewSettings];
        $bodies[] = $common + ['similarity' => array_merge($modeBlock, $viewSettings)];
        $bodies[] = $common;
    }

    return $bodies;
}
/**
 * Resolve the mandatory viewer_user_id for viewer-url. It lives in the same
 * namespace as the owner/submitter of createSubmission (editor_{user_id}).
 *
 * Precedence: explicit viewer_user_id, then editor_id, then triggered_by, then
 * turnitin.viewer_user_id from the environment, then "<integration name>_viewer".
 *
 * @param array $opts may contain viewer_user_id, editor_id, triggered_by
 * @return string
 */
public function resolveViewerUserId(array $opts = [])
{
    if (!empty($opts['viewer_user_id'])) {
        return trim((string) $opts['viewer_user_id']);
    }

    // The person opening the report (the current editor) should match the one
    // the viewer-url was requested for, otherwise session auth tends to fail.
    foreach (['editor_id', 'triggered_by'] as $field) {
        $numericId = isset($opts[$field]) ? intval($opts[$field]) : 0;
        if ($numericId > 0) {
            return 'editor_' . $numericId;
        }
    }

    $configured = trim((string) Env::get('turnitin.viewer_user_id', ''));
    if ($configured !== '') {
        return $configured;
    }

    $prefix = trim((string) $this->integrationName);
    if ($prefix === '') {
        $prefix = 'tmr';
    }

    return $prefix . '_viewer';
}
/**
 * Permission sets to try, in order. Crossref Similarity Check usually does not
 * use INSTRUCTOR, so for a base URL containing "crossref" the commonly usable
 * roles come first.
 *
 * @return array<int,string>
 */
private function defaultViewerPermissionSets()
{
    $isCrossrefTenant = stripos($this->baseUrl, 'crossref') !== false;

    return $isCrossrefTenant
        ? ['ADMINISTRATOR', 'USER', 'EDITOR', 'INSTRUCTOR']
        : ['INSTRUCTOR', 'ADMINISTRATOR', 'USER'];
}
/**
 * Ask Turnitin to generate a PDF report (asynchronous; progress is polled
 * through getPdfReportStatus).
 *
 * Endpoint: POST /submissions/{id}/similarity/pdf
 *
 * @param string $submissionId
 * @param array $opts top-level keys that override the default locale / view_settings
 * @return array response containing id (the PDF report id)
 * @throws Exception on transport failure or a non-2xx status
 */
public function requestPdfReport($submissionId, $opts = [])
{
    $locale = trim((string) Env::get('turnitin.viewer_locale', 'en-US'));
    $defaults = [
        'locale' => $locale ? $locale : 'en-US',
        'view_settings' => $this->defaultViewSettings(),
    ];
    $path = '/submissions/' . rawurlencode($submissionId) . '/similarity/pdf';

    return $this->request('POST', $path, array_merge($defaults, $opts));
}
/**
 * Query the status of a PDF report.
 *
 * Endpoint: GET /submissions/{id}/similarity/pdf/{pdf_id}/status
 *
 * status: PENDING / SUCCESS / FAILED
 *
 * @param string $submissionId
 * @param string $pdfId id returned by requestPdfReport()
 * @return mixed decoded response body as produced by request()
 * @throws Exception on transport failure or a non-2xx status
 */
public function getPdfReportStatus($submissionId, $pdfId)
{
    // Path segments need rawurlencode (RFC 3986): urlencode would turn a space
    // into "+", which is only meaningful in query strings. This also matches
    // every other endpoint of this class.
    return $this->request(
        'GET',
        '/submissions/' . rawurlencode($submissionId) . '/similarity/pdf/' . rawurlencode($pdfId) . '/status'
    );
}
/**
 * Download the PDF report (call only after its status is SUCCESS).
 *
 * Endpoint: GET /submissions/{id}/similarity/pdf/{pdf_id}
 *
 * @param string $submissionId
 * @param string $pdfId id returned by requestPdfReport()
 * @return string raw PDF bytes; the caller is responsible for writing them to disk
 * @throws Exception on transport failure or a non-2xx status
 */
public function downloadPdfReport($submissionId, $pdfId)
{
    // Path segments need rawurlencode (RFC 3986): urlencode would turn a space
    // into "+", which is only meaningful in query strings. This also matches
    // every other endpoint of this class.
    return $this->request(
        'GET',
        '/submissions/' . rawurlencode($submissionId) . '/similarity/pdf/' . rawurlencode($pdfId),
        null,
        [],
        true // raw response, no json_decode
    );
}
// ==================== Internal HTTP layer ====================
/**
 * Single entry point for all HTTP calls.
 *
 * @param string $method GET/POST/PUT/DELETE
 * @param string $path relative path starting with "/", appended to baseUrl
 * @param mixed $body array is JSON-encoded; string is sent as the raw body; null sends no body
 * @param array $extraHeaders additional headers as name => value
 * @param bool $rawResponse true returns the raw response string, false json_decodes it
 * @return mixed decoded array; [] for an empty body; the raw string when
 *               $rawResponse is true or the body is not valid JSON
 * @throws Exception on cURL failure, an unencodable body, or a non-2xx status
 *                   (for HTTP failures the exception code is the HTTP status)
 */
private function request($method, $path, $body = null, $extraHeaders = [], $rawResponse = false)
{
    $url = $this->baseUrl . $path;
    $headers = [
        'Authorization: Bearer ' . $this->apiKey,
        'X-Turnitin-Integration-Name: ' . $this->integrationName,
        'X-Turnitin-Integration-Version: ' . $this->integrationVersion,
    ];

    $payload = null;
    if ($body !== null) {
        if (is_array($body)) {
            $payload = json_encode($body, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
            if ($payload === false) {
                // e.g. invalid UTF-8 in a title; fail here instead of sending `false`.
                throw new Exception('Turnitin request body is not JSON-encodable: ' . json_last_error_msg());
            }
            $headers[] = 'Content-Type: application/json';
        } else {
            $payload = $body;
            if (!isset($extraHeaders['Content-Type'])) {
                $headers[] = 'Content-Type: application/octet-stream';
            }
        }
    }
    foreach ($extraHeaders as $k => $v) {
        $headers[] = $k . ': ' . $v;
    }

    $ch = curl_init();
    if ($ch === false) {
        throw new Exception("Turnitin curl error: curl_init failed (url={$url})");
    }
    curl_setopt_array($ch, [
        CURLOPT_URL => $url,
        CURLOPT_CUSTOMREQUEST => strtoupper($method),
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HTTPHEADER => $headers,
        CURLOPT_TIMEOUT => $this->timeout,
        CURLOPT_CONNECTTIMEOUT => 15,
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
    ]);
    if ($payload !== null) {
        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
    }

    $resp = curl_exec($ch);
    $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $err = curl_error($ch);
    curl_close($ch);

    if ($resp === false) {
        throw new Exception("Turnitin curl error: {$err} (url={$url})");
    }
    if ($httpCode < 200 || $httpCode >= 300) {
        // Include the first 1000 characters of the body to ease debugging; the
        // status is also set as the exception code so callers need not parse text.
        $excerpt = mb_substr((string) $resp, 0, 1000);
        throw new Exception("Turnitin HTTP {$httpCode} {$method} {$path}: {$excerpt}", $httpCode);
    }

    if ($rawResponse) {
        return $resp;
    }
    // Some responses have no body (e.g. 204 No Content).
    if ($resp === '' || $resp === null) {
        return [];
    }

    $data = json_decode($resp, true);
    if (json_last_error() !== JSON_ERROR_NONE) {
        // Not JSON: hand the original text back unchanged.
        return $resp;
    }

    return $data;
}
}