修改自动推广的相关任务

This commit is contained in:
wangjinlei
2026-05-07 11:45:55 +08:00
parent a68742d2c2
commit b1e978ed73
7 changed files with 1221 additions and 0 deletions

View File

@@ -0,0 +1,217 @@
<?php
namespace app\api\controller;
use think\Db;
use think\Response;
use app\common\PlagiarismService;
/**
* 论文查重Turnitin / Crossref Similarity Check控制器。
*
* 触发方式:纯手工(编辑后台点"查重"按钮)。
* 报告策略:在线 viewer URL 临时签名 + PDF 永久落盘 runtime/plagiarism/。
*
* 主要接口:
* POST submit 触发查重
* GET getStatus 轮询单条查重状态(前端 ajax
* GET getList 列出某 article 的全部查重记录
* GET getReportUrl 获取/刷新在线查看 URL
* GET downloadReport 下载本地 PDF
* POST retry 重新触发(创建新行)
* GET features 探活(开发调试用)
*/
class Plagiarism extends Base
{
/**
 * Controller constructor; simply forwards the (optional) request object to the Base controller.
 *
 * @param \think\Request|null $request Request instance passed through to the parent constructor.
 */
public function __construct(\think\Request $request = null)
{
parent::__construct($request);
}
/**
 * Trigger a plagiarism check for an article.
 *
 * Request parameters:
 *   article_id  required, must be > 0
 *   file_url    optional; an http(s) URL or a storage-relative path. When omitted, the newest
 *               "manuscirpt" file of the article is located through PlagiarismService
 *   editor_id   optional; user_id of the triggering editor (0 when the front end cannot supply it)
 *
 * Responds with jsonSuccess(['check_id' => int]) on success, jsonError(message) otherwise.
 */
public function submit()
{
    $articleId = intval($this->request->param('article_id', 0));
    $fileUrl   = trim($this->request->param('file_url', ''));
    $editorId  = intval($this->request->param('editor_id', 0));
    if ($articleId <= 0) {
        return jsonError('article_id required');
    }

    // file_url comes straight from the HTTP client. PlagiarismService::resolveFileUrlToLocal()
    // returns any existing absolute path unchanged, so without this guard a caller could have an
    // arbitrary server file (e.g. /etc/passwd or .env) uploaded to Turnitin. Only http(s) URLs and
    // storage-relative paths are accepted here.
    if ($fileUrl !== '' && !preg_match('#^https?://#i', $fileUrl)) {
        $hasDriveLetter = preg_match('/^[a-zA-Z]:[\\\\\/]/', $fileUrl) === 1;
        $hasTraversal   = preg_match('#(^|[\\\\/])\.\.([\\\\/]|$)#', $fileUrl) === 1;
        if ($hasDriveLetter || $hasTraversal) {
            return jsonError('invalid file_url');
        }
        // A leading slash is treated as "relative to the storage root", never as a filesystem root.
        $fileUrl = ltrim($fileUrl, '/\\');
        if ($fileUrl === '') {
            return jsonError('invalid file_url');
        }
    }

    try {
        $svc = new PlagiarismService();
        $localPath = $fileUrl !== ''
            ? $svc->resolveFileUrlToLocal($fileUrl)
            : $svc->locateArticleManuscript($articleId);
        $checkId = $svc->submit($articleId, $localPath, $editorId, 'manual');
        return jsonSuccess(['check_id' => $checkId]);
    } catch (\Throwable $e) {
        return jsonError($e->getMessage());
    }
}
/**
 * Retry: delegates to submit(), which creates a brand-new check row, so earlier rows are kept as history.
 */
public function retry()
{
return $this->submit();
}
/**
 * Return the formatted state of a single plagiarism check.
 *
 * Request parameter: check_id (required, > 0).
 */
public function getStatus()
{
    $id = (int) $this->request->param('check_id', 0);
    if ($id < 1) {
        return jsonError('check_id required');
    }

    $record = Db::name('plagiarism_check')->where('check_id', $id)->find();
    if ($record) {
        return jsonSuccess($this->formatRow($record));
    }

    return jsonError('not found');
}
/**
 * List every plagiarism check of one article, newest first (ordered by check_id descending).
 *
 * Request parameter: article_id (required, > 0).
 */
public function getList()
{
    $wantedArticle = (int) $this->request->param('article_id', 0);
    if ($wantedArticle < 1) {
        return jsonError('article_id required');
    }

    $query   = Db::name('plagiarism_check')->where('article_id', $wantedArticle);
    $records = $query->order('check_id desc')->select();

    $formatted = [];
    foreach ($records as $record) {
        $formatted[] = $this->formatRow($record);
    }

    return jsonSuccess(['list' => $formatted]);
}
/**
 * Return the online viewer URL of a completed check, regenerating it when the stored one is
 * missing or will expire within the next 60 seconds.
 *
 * Request parameter: check_id (required, > 0).
 */
public function getReportUrl()
{
    $id = (int) $this->request->param('check_id', 0);
    if ($id < 1) {
        return jsonError('check_id required');
    }

    try {
        $record = Db::name('plagiarism_check')->where('check_id', $id)->find();
        if (!$record) {
            return jsonError('not found');
        }
        // Only state 3 (completed) has a report to look at.
        if ($record['state'] != 3) {
            return jsonError('check not completed yet, state=' . $record['state']);
        }

        $stillUsable = !empty($record['view_only_url'])
            && intval($record['view_only_url_expire']) >= time() + 60;

        if ($stillUsable) {
            $url    = $record['view_only_url'];
            $expire = intval($record['view_only_url_expire']);
        } else {
            $service = new PlagiarismService();
            $fresh   = $service->refreshViewerUrlFor($id);
            $url     = $fresh['url'];
            $expire  = $fresh['expire'];
        }

        return jsonSuccess([
            'view_only_url' => $url,
            'expire'        => $expire,
        ]);
    } catch (\Throwable $e) {
        return jsonError($e->getMessage());
    }
}
/**
 * Stream the locally stored PDF report to the browser as an attachment.
 *
 * Request parameter: check_id (required, > 0).
 * Responds with the PDF bytes, or jsonError(message) when the check or its file is unavailable.
 */
public function downloadReport()
{
    $checkId = intval($this->request->param('check_id', 0));
    if ($checkId <= 0) {
        return jsonError('check_id required');
    }
    $row = Db::name('plagiarism_check')->where('check_id', $checkId)->find();
    if (!$row || empty($row['pdf_local_path'])) {
        return jsonError('report not available');
    }

    // Referencing an undefined constant is an Error on PHP 8, so test with defined() before
    // falling back to the path derived from this file's location.
    $rootDir = (defined('ROOT_PATH') && ROOT_PATH) ? ROOT_PATH : dirname(dirname(dirname(__DIR__)));
    $abs = rtrim($rootDir, '/\\') . DIRECTORY_SEPARATOR
        . str_replace(['/', '\\'], DIRECTORY_SEPARATOR, $row['pdf_local_path']);
    if (!is_file($abs)) {
        return jsonError('pdf file missing on disk: ' . $row['pdf_local_path']);
    }

    // Read once and derive Content-Length from what was actually read, so a failed or partial
    // read can never be sent with a mismatching length header.
    $pdf = is_readable($abs) ? file_get_contents($abs) : false;
    if ($pdf === false) {
        return jsonError('pdf file not readable: ' . $row['pdf_local_path']);
    }

    $filename = sprintf('plagiarism_check_%d_article_%d.pdf', $row['check_id'], $row['article_id']);
    return Response::create($pdf, 'html', 200, [
        'Content-Type'        => 'application/pdf',
        'Content-Disposition' => 'attachment; filename="' . $filename . '"',
        'Content-Length'      => (string)strlen($pdf),
    ]);
}
/**
 * Turnitin liveness probe (development / debugging aid): returns whatever
 * TurnitinService::featuresEnabled() reports, or the exception message on failure.
 */
public function features()
{
    try {
        $client       = new \app\common\TurnitinService();
        $capabilities = $client->featuresEnabled();

        return jsonSuccess($capabilities);
    } catch (\Throwable $e) {
        return jsonError($e->getMessage());
    }
}
// ---------- 内部 ----------
/**
 * Convert a raw plagiarism_check row into the typed structure returned to the front end.
 *
 * @param array $r Row as read from the plagiarism_check table.
 * @return array Scalar-cast fields plus the derived flags has_pdf / has_viewer_url and state_label.
 */
private function formatRow($r)
{
    // The viewer URL only counts when it is present and its expiry timestamp is still in the future.
    $viewerUrlUsable = !empty($r['view_only_url'])
        && (int) $r['view_only_url_expire'] > time();

    return [
        'check_id'          => (int) $r['check_id'],
        'article_id'        => (int) $r['article_id'],
        'journal_id'        => (int) $r['journal_id'],
        'state'             => (int) $r['state'],
        'state_label'       => $this->stateLabel($r['state']),
        'similarity_score'  => (float) $r['similarity_score'],
        'tii_report_status' => (string) $r['tii_report_status'],
        'has_pdf'           => !empty($r['pdf_local_path']),
        'has_viewer_url'    => $viewerUrlUsable,
        'attempts'          => (int) $r['attempts'],
        'error_msg'         => (string) $r['error_msg'],
        'source_file_name'  => (string) $r['source_file_name'],
        'trigger_source'    => (string) $r['trigger_source'],
        'triggered_by'      => (int) $r['triggered_by'],
        'ctime'             => (int) $r['ctime'],
        'utime'             => (int) $r['utime'],
    ];
}
/**
 * Map a numeric check state to its display label; unknown states yield 'unknown'.
 *
 * @param int|string $state State value as stored in the row.
 * @return string
 */
private function stateLabel($state)
{
    $labels = [
        0 => '待上传',
        1 => '上传中',
        2 => '比对中',
        3 => '完成',
        4 => '失败',
    ];

    if (isset($labels[$state])) {
        return $labels[$state];
    }

    return 'unknown';
}
}

View File

@@ -0,0 +1,56 @@
<?php
namespace app\api\job;
use think\queue\Job;
use app\common\PlagiarismService;
use app\common\QueueJob;
/**
* 队列任务:轮询 Turnitin similarity 状态。
*
* 未完成会再次入队(链式延迟),完成后下载 PDF 报告并写本地永久保留。
*
* data:
* - check_id t_plagiarism_check.check_id
* - attempt 当前轮询次数(首次为 1
*
* 注意:单条 job 通常很短1 个 HTTP 请求),但会反复入队,常驻 worker 长时间运行
* 由 QueueJob 在进程超 6h 或致命 DB 错误时主动 exit(1) 让 supervisor 拉起新进程。
*/
class PlagiarismPoll
{
    /** @var QueueJob helper used for logging, exception handling and the init()/finnal() calls around a job */
    private $oQueueJob;

    public function __construct()
    {
        $this->oQueueJob = new QueueJob();
    }

    /**
     * Run one poll of the Turnitin similarity status for a check.
     *
     * @param Job   $job  Queue job handle; deleted on success and on invalid input.
     * @param array $data Expects 'check_id' (int) and optionally 'attempt' (int, defaults to 1).
     */
    public function fire(Job $job, $data)
    {
        $this->oQueueJob->init($job);

        $checkId = 0;
        if (isset($data['check_id'])) {
            $checkId = (int) $data['check_id'];
        }
        $attempt = 1;
        if (isset($data['attempt'])) {
            $attempt = (int) $data['attempt'];
        }

        // Nothing can be polled without a valid id: drop the job rather than retry it.
        if ($checkId <= 0) {
            $this->oQueueJob->log("PlagiarismPoll 无效的 check_id删除任务");
            $job->delete();
            return;
        }

        $context = "check_id={$checkId} attempt={$attempt}";

        try {
            $service = new PlagiarismService();
            $service->runPollStatus($checkId, $attempt);
            $this->oQueueJob->log("PlagiarismPoll 完成一次轮询 | {$context}");
            $job->delete();
        } catch (\Exception $e) {
            $this->oQueueJob->handleException($e, $job, $context);
        } catch (\Throwable $e) {
            $this->oQueueJob->handleException($e, $job, $context);
        } finally {
            $this->oQueueJob->finnal();
        }
    }
}

View File

@@ -0,0 +1,57 @@
<?php
namespace app\api\job;
use think\queue\Job;
use app\common\PlagiarismService;
use app\common\QueueJob;
/**
* 队列任务:上传论文到 Turnitin + 触发 similarity 检测。
*
* 完成后会自动入队 PlagiarismPoll 进行后续轮询。
*
* data:
* - check_id t_plagiarism_check.check_id
* - file_path 本地可读的 PDF/DOCX 绝对路径
*
* 注意:上传单个 PDF 可能耗时数十秒,常驻 worker 由 QueueJob 在进程超 6h 或致命 DB
* 错误时主动 exit(1) 让 supervisor 拉起新进程。
*/
class PlagiarismRun
{
    /** @var QueueJob helper used for logging, exception handling and the init()/finnal() calls around a job */
    private $oQueueJob;

    public function __construct()
    {
        $this->oQueueJob = new QueueJob();
    }

    /**
     * Upload the manuscript to Turnitin and trigger the similarity check.
     *
     * @param Job   $job  Queue job handle; deleted on success and on invalid input.
     * @param array $data Expects 'check_id' (int) and 'file_path' (string, local file to upload).
     */
    public function fire(Job $job, $data)
    {
        $this->oQueueJob->init($job);

        $checkId = 0;
        if (isset($data['check_id'])) {
            $checkId = (int) $data['check_id'];
        }
        $filePath = '';
        if (isset($data['file_path'])) {
            $filePath = (string) $data['file_path'];
        }

        // Both values are mandatory; an incomplete payload is dropped instead of retried.
        $payloadValid = $checkId > 0 && $filePath !== '';
        if (!$payloadValid) {
            $this->oQueueJob->log("PlagiarismRun 无效参数 check_id={$checkId} file_path={$filePath},删除任务");
            $job->delete();
            return;
        }

        $context = "check_id={$checkId}";

        try {
            $service = new PlagiarismService();
            $service->runUploadAndTrigger($checkId, $filePath);
            $this->oQueueJob->log("PlagiarismRun 完成 | {$context}");
            $job->delete();
        } catch (\Exception $e) {
            // runUploadAndTrigger() has already marked the check as failed before rethrowing.
            $this->oQueueJob->handleException($e, $job, $context);
        } catch (\Throwable $e) {
            $this->oQueueJob->handleException($e, $job, $context);
        } finally {
            $this->oQueueJob->finnal();
        }
    }
}

View File

@@ -0,0 +1,423 @@
<?php
namespace app\common;
use think\Db;
use think\Env;
use think\Queue;
use think\Exception;
/**
* 查重业务层:把 TurnitinService 的低层调用包装成"按 article 查重"的高层流程,
* 并维护 t_plagiarism_check 状态机。
*
* 状态流:
* submit() → state=1上传中入队 PlagiarismRun
* PlagiarismRun.fire → 上传 + 触发 similarity → state=2比对中入队 PlagiarismPoll
* PlagiarismPoll.fire → 轮询 status完成后下载 PDF → state=3完成
* 任意环节抛异常 → state=4失败写 error_msg
*/
class PlagiarismService
{
/**
 * Directory, relative to the project root, where downloaded PDF reports are written.
 */
const REPORT_DIR = 'runtime/plagiarism';
/**
 * Delay in seconds between two similarity-status polls (passed to Queue::later).
 */
const POLL_INTERVAL = 30;
/**
 * Maximum number of poll attempts before a check is marked failed (30 s x 60 = 30 minutes).
 */
const MAX_POLL_ATTEMPTS = 60;
// ---------- 顶层入口 ----------
/**
 * Submit a plagiarism check: insert the tracking row, enqueue the upload job, and return at once.
 *
 * @param int    $articleId   Article id (journal_id is looked up from the article table; 0 when absent)
 * @param string $filePath    Absolute path of a locally readable manuscript file
 * @param int    $triggeredBy user_id of the person who triggered the check (0 when unknown)
 * @param string $source      Trigger source label, e.g. 'manual' / 'auto_xxx'
 * @return int check_id of the newly inserted row
 * @throws Exception  When the file is not readable.
 * @throws \Throwable When the job cannot be enqueued (the row is marked failed first).
 */
public function submit($articleId, $filePath, $triggeredBy = 0, $source = 'manual')
{
    if (!is_file($filePath) || !is_readable($filePath)) {
        throw new Exception("File not readable: {$filePath}");
    }
    $journalId = (int) Db::name('article')
        ->where('article_id', $articleId)
        ->value('journal_id');
    $now = time();
    $checkId = (int) Db::name('plagiarism_check')->insertGetId([
        'article_id'       => $articleId,
        'journal_id'       => $journalId,
        'triggered_by'     => $triggeredBy,
        'trigger_source'   => $source,
        'state'            => 1, // uploading
        'source_file_name' => basename($filePath),
        'source_file_size' => filesize($filePath) ?: 0,
        'ctime'            => $now,
        'utime'            => $now,
    ]);

    // The row already says "uploading". If the job never reaches the queue nothing would ever
    // move it on, so record the failure instead of leaving it stuck in state 1.
    try {
        $pushed = Queue::push(
            'app\\api\\job\\PlagiarismRun',
            ['check_id' => $checkId, 'file_path' => $filePath],
            'plagiarism'
        );
        if ($pushed === false) {
            throw new Exception('Queue::push returned false');
        }
    } catch (\Throwable $e) {
        $this->markFailed($checkId, '[enqueue] ' . $e->getMessage());
        throw $e;
    }

    return $checkId;
}
/**
 * Job entry point: create the Turnitin submission, upload the file, trigger the similarity
 * check, then schedule the first PlagiarismPoll job.
 *
 * Any failure marks the check as failed (error prefixed with "[upload]") and is rethrown.
 *
 * @param int    $checkId  plagiarism_check.check_id
 * @param string $filePath Local file to upload
 * @throws \Throwable
 */
public function runUploadAndTrigger($checkId, $filePath)
{
    $record = $this->mustGetCheck($checkId);

    try {
        $client = new TurnitinService();

        // Step 1: create the submission, titled after the article (placeholder title when empty).
        $title = (string) Db::name('article')
            ->where('article_id', $record['article_id'])
            ->value('title');
        if ($title === '') {
            $title = 'Article #' . $record['article_id'];
        }
        $actor = 'editor_' . $record['triggered_by'];

        $created = $client->createSubmission([
            'title'     => mb_substr($title, 0, 250),
            'owner'     => $actor,
            'submitter' => $actor,
            'metadata'  => [
                'article_id' => (string)$record['article_id'],
                'check_id'   => (string)$record['check_id'],
            ],
        ]);

        $submissionId = '';
        if (isset($created['id'])) {
            $submissionId = $created['id'];
        }
        if ($submissionId === '') {
            throw new Exception('Turnitin createSubmission returned empty id: ' . json_encode($created));
        }
        $this->updateCheck($checkId, [
            'tii_submission_id' => $submissionId,
            'raw_response'      => json_encode($created, JSON_UNESCAPED_UNICODE),
        ]);

        // Step 2: upload the file under its own base name.
        $client->uploadFile($submissionId, $filePath, basename($filePath));

        // Step 3: start the similarity comparison and move the check to state 2 (comparing).
        $triggered = $client->triggerSimilarity($submissionId);
        $this->updateCheck($checkId, [
            'state'             => 2,
            'tii_report_status' => 'PROCESSING',
            'raw_response'      => json_encode($triggered, JSON_UNESCAPED_UNICODE),
        ]);

        // Step 4: schedule the first poll after one interval so Turnitin has time to work.
        $firstPoll = ['check_id' => $checkId, 'attempt' => 1];
        Queue::later(self::POLL_INTERVAL, 'app\\api\\job\\PlagiarismPoll', $firstPoll, 'plagiarism');
    } catch (\Throwable $e) {
        $this->markFailed($checkId, '[upload] ' . $e->getMessage());
        throw $e;
    }
}
/**
 * Job entry point: poll the similarity status once.
 *
 *  - COMPLETE: download the PDF, fetch a viewer URL and move the check to state 3.
 *  - ERROR:    mark the check failed.
 *  - anything else: schedule another poll until MAX_POLL_ATTEMPTS is reached, then mark failed.
 *
 * Exceptions are treated as transient while attempts remain (the next poll is scheduled and the
 * message stored); once attempts are exhausted the check is marked failed and the exception rethrown.
 *
 * @param int $checkId plagiarism_check.check_id
 * @param int $attempt 1-based number of this poll
 * @throws \Throwable
 */
public function runPollStatus($checkId, $attempt = 1)
{
    $record = $this->mustGetCheck($checkId);
    if (empty($record['tii_submission_id'])) {
        $this->markFailed($checkId, '[poll] tii_submission_id empty');
        return;
    }
    $submissionId = $record['tii_submission_id'];

    try {
        $client = new TurnitinService();
        $reply  = $client->getSimilarityStatus($submissionId);

        $status = '';
        if (isset($reply['status'])) {
            $status = strtoupper($reply['status']);
        }

        $this->updateCheck($checkId, [
            'tii_report_status' => $status,
            'attempts'          => $attempt,
            'raw_response'      => json_encode($reply, JSON_UNESCAPED_UNICODE),
        ]);

        if ($status === 'COMPLETE') {
            $score = 0;
            if (isset($reply['overall_match_percentage'])) {
                $score = (float) $reply['overall_match_percentage'];
            }
            // Fetch the PDF first, then the online viewer URL.
            $pdfPath = $this->downloadAndStorePdf($client, $submissionId, $checkId);
            $viewer  = $this->refreshViewerUrl($client, $submissionId);
            $this->updateCheck($checkId, [
                'state'                => 3,
                'similarity_score'     => $score,
                'pdf_local_path'       => $pdfPath,
                'view_only_url'        => $viewer['url'],
                'view_only_url_expire' => $viewer['expire'],
                'error_msg'            => '',
            ]);
            return;
        }

        if ($status === 'ERROR') {
            $reason = 'Turnitin reported ERROR';
            if (isset($reply['error_code'])) {
                $reason = (string) $reply['error_code'];
            }
            $this->markFailed($checkId, '[poll] ' . $reason);
            return;
        }

        // PROCESSING or any other intermediate status: keep polling unless the budget is spent.
        if ($attempt >= self::MAX_POLL_ATTEMPTS) {
            $this->markFailed($checkId, '[poll] timeout after ' . $attempt . ' attempts');
            return;
        }
        $this->enqueueNextPoll($checkId, $attempt + 1);
    } catch (\Throwable $e) {
        // Tolerate hiccups: keep retrying while attempts remain.
        $attemptsLeft = $attempt < self::MAX_POLL_ATTEMPTS;
        if (!$attemptsLeft) {
            $this->markFailed($checkId, '[poll] exhausted: ' . $e->getMessage());
            throw $e;
        }
        $this->enqueueNextPoll($checkId, $attempt + 1);
        $this->updateCheck($checkId, [
            'attempts'  => $attempt,
            'error_msg' => '[poll] transient: ' . $e->getMessage(),
        ]);
    }
}
/**
 * Schedule a PlagiarismPoll job on the 'plagiarism' queue after POLL_INTERVAL seconds.
 */
private function enqueueNextPoll($checkId, $nextAttempt)
{
    Queue::later(
        self::POLL_INTERVAL,
        'app\\api\\job\\PlagiarismPoll',
        ['check_id' => $checkId, 'attempt' => $nextAttempt],
        'plagiarism'
    );
}
/**
 * Generate a fresh online viewer URL for a check and persist it (used when the stored URL expired).
 *
 * @param int $checkId plagiarism_check.check_id
 * @return array{url:string, expire:int, local_pdf:string}
 * @throws Exception When the check does not exist or has no Turnitin submission id.
 */
public function refreshViewerUrlFor($checkId)
{
    $record = $this->mustGetCheck($checkId);
    if (empty($record['tii_submission_id'])) {
        throw new Exception('check has no tii_submission_id');
    }

    $viewer = $this->refreshViewerUrl(new TurnitinService(), $record['tii_submission_id']);
    $url    = $viewer['url'];
    $expire = $viewer['expire'];

    $this->updateCheck($checkId, [
        'view_only_url'        => $url,
        'view_only_url_expire' => $expire,
    ]);

    return [
        'url'       => $url,
        'expire'    => $expire,
        'local_pdf' => $record['pdf_local_path'],
    ];
}
// ---------- 内部 ----------
/**
 * Ask Turnitin for a viewer URL and pair it with a locally assumed expiry.
 *
 * The URL is read from 'viewer_url', falling back to 'url'; when neither is present the URL is
 * an empty string. The expiry is always "now + 2 hours", a deliberately conservative guess.
 *
 * @param TurnitinService $tii
 * @param string          $submissionId
 * @return array{url:string, expire:int}
 */
private function refreshViewerUrl($tii, $submissionId)
{
    $reply = $tii->getViewerUrl($submissionId);

    $viewerUrl = '';
    foreach (['viewer_url', 'url'] as $field) {
        if (isset($reply[$field])) {
            $viewerUrl = (string) $reply[$field];
            break;
        }
    }

    return ['url' => $viewerUrl, 'expire' => time() + 7200];
}
/**
 * Request the PDF report, wait for it to be generated, download it and store it under REPORT_DIR.
 *
 * @param TurnitinService $tii
 * @param string          $submissionId Turnitin submission id
 * @param int             $checkId      Used in the stored file name
 * @return string Path of the stored PDF relative to the project root (REPORT_DIR/<file>)
 * @throws Exception When generation fails or times out, the download is too small,
 *                   or the file cannot be written completely.
 */
private function downloadAndStorePdf($tii, $submissionId, $checkId)
{
    // 1. Ask Turnitin to generate the report.
    $req = $tii->requestPdfReport($submissionId);
    $pdfId = isset($req['id']) ? $req['id'] : '';
    if ($pdfId === '') {
        throw new Exception('requestPdfReport empty id: ' . json_encode($req));
    }

    // 2. Poll the generation status inline: up to 30 checks, 6 seconds apart (about 3 minutes).
    $maxLoops = 30;
    $ready = false;
    for ($i = 0; $i < $maxLoops; $i++) {
        $st = $tii->getPdfReportStatus($submissionId, $pdfId);
        $stCode = isset($st['status']) ? strtoupper($st['status']) : '';
        if ($stCode === 'SUCCESS') {
            $ready = true;
            break;
        }
        if ($stCode === 'FAILED') {
            throw new Exception('PDF report generation failed: ' . json_encode($st));
        }
        // No point sleeping after the final check.
        if ($i < $maxLoops - 1) {
            sleep(6);
        }
    }
    // Previously a timeout fell through to the download of a report that was not ready.
    if (!$ready) {
        throw new Exception('PDF report not ready after ' . $maxLoops . ' status checks');
    }

    // 3. Download.
    $binary = $tii->downloadPdfReport($submissionId, $pdfId);
    if (!is_string($binary) || strlen($binary) < 100) {
        throw new Exception('downloaded pdf is empty/too small');
    }

    // 4. Store on disk. defined() is required: an undefined constant is an Error on PHP 8.
    $rootDir = (defined('ROOT_PATH') && ROOT_PATH) ? ROOT_PATH : dirname(dirname(__DIR__));
    $absDir = rtrim($rootDir, '/\\') . DIRECTORY_SEPARATOR . self::REPORT_DIR;
    // The second is_dir() covers a concurrent worker creating the directory first.
    if (!is_dir($absDir) && !@mkdir($absDir, 0755, true) && !is_dir($absDir)) {
        throw new Exception('failed to create report dir ' . $absDir);
    }
    $filename = sprintf('check_%d_%s.pdf', $checkId, date('Ymd_His'));
    $absPath = $absDir . DIRECTORY_SEPARATOR . $filename;
    $bytes = file_put_contents($absPath, $binary, LOCK_EX);
    if ($bytes === false || $bytes !== strlen($binary)) {
        // Do not leave a truncated report behind.
        @unlink($absPath);
        throw new Exception('failed to save pdf to ' . $absPath);
    }
    return self::REPORT_DIR . '/' . $filename;
}
/**
 * Load a plagiarism_check row or throw when it does not exist.
 *
 * @param int $checkId
 * @return array
 * @throws Exception
 */
private function mustGetCheck($checkId)
{
    $found = Db::name('plagiarism_check')->where('check_id', $checkId)->find();
    if ($found) {
        return $found;
    }

    throw new Exception("plagiarism_check #{$checkId} not found");
}
/**
 * Update columns of one plagiarism_check row; utime is always set to the current time.
 *
 * @param int   $checkId
 * @param array $data    Column => value pairs to write
 */
private function updateCheck($checkId, array $data)
{
    $columns = $data;
    $columns['utime'] = time();

    Db::name('plagiarism_check')
        ->where('check_id', $checkId)
        ->update($columns);
}
/**
 * Move a check to state 4 (failed) and store the error message, truncated to 1000 characters.
 *
 * @param int    $checkId
 * @param string $errMsg
 */
private function markFailed($checkId, $errMsg)
{
    $failure = [
        'state'     => 4,
        'error_msg' => mb_substr($errMsg, 0, 1000),
    ];

    $this->updateCheck($checkId, $failure);
}
/**
 * Find the newest main manuscript of an article in the article_file table and resolve its
 * file_url to a local path via resolveFileUrlToLocal().
 *
 * The type name 'manuscirpt' is a historical misspelling that must match the stored data.
 *
 * @param int $articleId
 * @return string Local file path
 * @throws Exception When the article has no such file or the file_url cannot be resolved.
 */
public function locateArticleManuscript($articleId)
{
    $newestFile = Db::name('article_file')
        ->where('article_id', $articleId)
        ->where('type_name', 'manuscirpt')
        ->order('article_file_id desc')
        ->find();

    $hasUrl = $newestFile && !empty($newestFile['file_url']);
    if (!$hasUrl) {
        throw new Exception("article #{$articleId} has no manuscirpt file");
    }

    return $this->resolveFileUrlToLocal($newestFile['file_url']);
}
/**
 * Resolve a file_url (absolute path, http(s) URL, or storage-relative path) to a local file path.
 *
 * Resolution order:
 *   1. An absolute path that exists is returned unchanged. This branch trusts its input, so
 *      callers handling client-supplied values must filter absolute paths beforehand.
 *   2. An http(s) URL starting with env plagiarism.cdn_prefix is mapped under
 *      env plagiarism.static_root; otherwise (or when that file is missing) the URL is downloaded.
 *   3. A relative path is looked up under plagiarism.static_root.
 *
 * Paths containing a ".." segment are never mapped under static_root, and the CDN mapping is
 * skipped when static_root is not configured (it used to resolve against the filesystem root).
 *
 * @param string $fileUrl
 * @return string Local file path
 * @throws Exception When the value is empty or cannot be resolved.
 */
public function resolveFileUrlToLocal($fileUrl)
{
    $fileUrl = trim((string)$fileUrl);
    if ($fileUrl === '') {
        throw new Exception('empty file_url');
    }
    // Already an absolute path (Windows drive or leading slash) that exists.
    if (preg_match('/^([a-zA-Z]:[\\\\\/]|\/)/', $fileUrl) && is_file($fileUrl)) {
        return $fileUrl;
    }
    $staticRoot = trim((string)Env::get('plagiarism.static_root', ''));
    $cdnPrefix  = trim((string)Env::get('plagiarism.cdn_prefix', ''));

    // True when the path has a ".." segment that could climb out of static_root.
    $hasParentSegment = function ($path) {
        return preg_match('#(^|[\\\\/])\.\.([\\\\/]|$)#', $path) === 1;
    };

    // http(s) URL: try to strip the CDN prefix and map onto the local static root first.
    if (preg_match('#^https?://#i', $fileUrl)) {
        if ($staticRoot !== '' && $cdnPrefix !== '' && stripos($fileUrl, $cdnPrefix) === 0) {
            $rel = ltrim(substr($fileUrl, strlen($cdnPrefix)), '/');
            if (!$hasParentSegment($rel)) {
                $local = rtrim($staticRoot, '/\\') . DIRECTORY_SEPARATOR . $rel;
                if (is_file($local)) {
                    return $local;
                }
            }
        }
        // Fall back to downloading into the temporary directory.
        return $this->downloadRemoteFile($fileUrl);
    }

    // Relative path: prefix with static_root.
    if ($staticRoot !== '' && !$hasParentSegment($fileUrl)) {
        $local = rtrim($staticRoot, '/\\') . DIRECTORY_SEPARATOR . ltrim($fileUrl, '/\\');
        if (is_file($local)) {
            return $local;
        }
    }
    throw new Exception("cannot resolve file_url to local path: {$fileUrl} (set [plagiarism] STATIC_ROOT/CDN_PREFIX in .env)");
}
/**
 * Download a remote file into REPORT_DIR/tmp and return the local path.
 *
 * TLS certificates are verified (as TurnitinService::request() does), only http/https are
 * allowed including across redirects, and the file extension taken from the URL is restricted
 * to a short alphanumeric token (default 'pdf').
 *
 * @param string $url http(s) URL
 * @return string Absolute path of the downloaded file
 * @throws Exception When the target cannot be created or the download fails
 *                   (non-200 status, curl error, or fewer than 100 bytes received).
 */
private function downloadRemoteFile($url)
{
    // defined() is required: referencing an undefined constant is an Error on PHP 8.
    $rootDir = (defined('ROOT_PATH') && ROOT_PATH) ? ROOT_PATH : dirname(dirname(__DIR__));
    $tmpDir = rtrim($rootDir, '/\\') . DIRECTORY_SEPARATOR . self::REPORT_DIR . DIRECTORY_SEPARATOR . 'tmp';
    if (!is_dir($tmpDir) && !@mkdir($tmpDir, 0755, true) && !is_dir($tmpDir)) {
        throw new Exception("cannot create tmp dir {$tmpDir}");
    }

    // The extension comes from a remote URL; never let it carry separators or odd characters.
    $ext = pathinfo((string)parse_url($url, PHP_URL_PATH), PATHINFO_EXTENSION);
    if (!preg_match('/^[A-Za-z0-9]{1,10}$/', $ext)) {
        $ext = 'pdf';
    }
    $local = $tmpDir . DIRECTORY_SEPARATOR . md5($url) . '_' . time() . '.' . $ext;

    $fh = fopen($local, 'wb');
    if ($fh === false) {
        throw new Exception("cannot open {$local} for writing");
    }
    $ch = curl_init($url);
    if ($ch === false) {
        fclose($fh);
        @unlink($local);
        throw new Exception("download failed url={$url} (curl_init failed)");
    }
    curl_setopt_array($ch, [
        CURLOPT_FILE            => $fh,
        CURLOPT_FOLLOWLOCATION  => true,
        CURLOPT_MAXREDIRS       => 5,
        CURLOPT_PROTOCOLS       => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_CONNECTTIMEOUT  => 15,
        CURLOPT_TIMEOUT         => 120,
        CURLOPT_SSL_VERIFYPEER  => true,
        CURLOPT_SSL_VERIFYHOST  => 2,
    ]);
    $ok   = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $err  = curl_error($ch);
    curl_close($ch);
    fclose($fh);

    clearstatcache(true, $local);
    if (!$ok || $code !== 200 || filesize($local) < 100) {
        @unlink($local);
        $detail = $err !== '' ? " curl={$err}" : '';
        throw new Exception("download failed url={$url} http={$code}" . $detail);
    }
    return $local;
}
/**
 * Fetch one plagiarism_check row by id without throwing; returns whatever find() yields
 * (presumably an empty result when no row matches - see mustGetCheck() for the throwing variant).
 *
 * @param int $checkId
 */
public function getCheck($checkId)
{
return Db::name('plagiarism_check')->where('check_id', $checkId)->find();
}
}

View File

@@ -0,0 +1,322 @@
<?php
namespace app\common;
use think\Env;
use think\Exception;
/**
* Turnitin Core API (TCA) REST 客户端封装。
*
* 适用 Crossref Similarity Check 通道product_name=Crossref以及标准 TCA 接入。
*
* 鉴权Authorization: Bearer <API_KEY>
* X-Turnitin-Integration-Name / X-Turnitin-Integration-Version 用于审计
*
* .env 配置([turnitin] 段):
* BASE_URL 形如 https://crossref-12345.turnitin.com/api/v1不带尾斜杠
* API_KEY 生成的 Bearer token
* INTEGRATION_NAME Scope Name创建 integration 时填的名字)
* INTEGRATION_VERSION 自定义版本号,便于审计 e.g. 1.0.0
*
* API 文档https://developers.turnitin.com/docs/tca
*
* 注意:
* - 所有方法返回原始 decode 后的数组HTTP 错误抛 Exception
* - 不做任何业务层逻辑(业务层在 PlagiarismService 里)
* - 不缓存 tokenBearer 不需要登录,每次请求自带)
*/
class TurnitinService
{
// API base URL without a trailing slash (read from env turnitin.base_url).
private $baseUrl;
// Bearer token sent in the Authorization header (env turnitin.api_key).
private $apiKey;
// Value of the X-Turnitin-Integration-Name header (env turnitin.integration_name).
private $integrationName;
// Value of the X-Turnitin-Integration-Version header (env turnitin.integration_version).
private $integrationVersion;
// Total curl timeout in seconds (used as CURLOPT_TIMEOUT).
private $timeout = 60;
/**
 * Read the Turnitin settings from the environment.
 *
 * @throws Exception When the base URL or the API key is not configured.
 */
public function __construct()
{
    // Every setting is read as a trimmed string.
    $setting = function ($key, $fallback) {
        return trim((string) Env::get($key, $fallback));
    };

    $this->baseUrl            = rtrim($setting('turnitin.base_url', ''), '/');
    $this->apiKey             = $setting('turnitin.api_key', '');
    $this->integrationName    = $setting('turnitin.integration_name', 'tmr');
    $this->integrationVersion = $setting('turnitin.integration_version', '1.0.0');

    $configured = $this->baseUrl !== '' && $this->apiKey !== '';
    if (!$configured) {
        throw new Exception('Turnitin not configured: missing BASE_URL or API_KEY in .env [turnitin] section');
    }
}
// ==================== Public API ====================
/**
 * Liveness probe / account capabilities.
 * GET /features-enabled
 *
 * @return mixed Decoded response of request()
 * @throws Exception On curl or HTTP errors
 */
public function featuresEnabled()
{
return $this->request('GET', '/features-enabled');
}
/**
 * Create a submission; its id is needed before a file can be uploaded.
 * POST /submissions
 *
 * @param array $meta Sent as the JSON request body. Fields, per the original author's notes:
 *   required:
 *   - title      paper title
 *   - owner      submission owner identifier (custom string, e.g. the submission system's user_id)
 *   - submitter  submitter identifier (same convention as owner)
 *   - eula       (optional) ['version' => '...', 'language' => 'en-US', 'accepted_timestamp' => ISO8601];
 *                may be omitted when features-enabled reports require_eula=false
 *   optional:
 *   - extract_text_only bool
 *   - metadata          array of custom key/values kept for later tracing
 *
 * @return mixed Decoded response, expected to contain id (submission UUID), status, owner, ...;
 *               request() returns the raw body instead when it is not valid JSON
 * @throws Exception On curl or HTTP errors
 */
public function createSubmission($meta)
{
return $this->request('POST', '/submissions', $meta);
}
/**
 * Upload a file to an existing submission.
 * PUT /submissions/{id}/original/{filename}
 *
 * @param string $submissionId
 * @param string $filePath Local file to upload
 * @param string $filename Name reported to Turnitin (shown in the report); defaults to the
 *                         base name of $filePath
 * @return mixed Decoded response of request()
 * @throws Exception When the file is missing, unreadable or cannot be read, or on HTTP errors
 */
public function uploadFile($submissionId, $filePath, $filename = '')
{
    if (!is_file($filePath) || !is_readable($filePath)) {
        throw new Exception("File not found or not readable: {$filePath}");
    }
    if ($filename === '') {
        $filename = basename($filePath);
    }
    $body = file_get_contents($filePath);
    // A failed read yields false, which would otherwise be sent as an empty upload.
    if ($body === false) {
        throw new Exception("Failed to read file: {$filePath}");
    }

    // The name is embedded in a quoted header value: neutralise quotes, backslashes and line
    // breaks so it cannot terminate the value or inject another header.
    $headerSafeName = preg_replace('/[\r\n"\\\\]+/', '_', $filename);

    return $this->request(
        'PUT',
        '/submissions/' . urlencode($submissionId) . '/original/' . rawurlencode($filename),
        $body,
        [
            'Content-Type'        => 'binary/octet-stream',
            'Content-Disposition' => 'inline; filename="' . $headerSafeName . '"',
        ]
    );
}
/**
 * Trigger the similarity comparison.
 * PUT /submissions/{id}/similarity
 *
 * $opts is merged per section: for 'generation_settings', 'view_settings' and
 * 'indexing_settings' the keys given by the caller override the matching defaults while the
 * remaining defaults of that section are kept. Previously overriding one key of a section
 * dropped the others, e.g. search_repositories. Any other top-level key is passed through.
 *
 * @param string $submissionId
 * @param array  $opts
 *   - generation_settings.search_repositories      default ['INTERNET','PUBLICATION','CROSSREF','CROSSREF_POSTED_CONTENT','SUBMITTED_WORK']
 *   - generation_settings.submission_auto_excludes bool
 *   - view_settings.exclude_quotes / exclude_bibliography / exclude_citations / exclude_abstract / exclude_methods bool
 *   - indexing_settings.add_to_index               bool, whether to add this paper to the SUBMITTED_WORK index
 * @return mixed Decoded response of request()
 * @throws Exception On curl or HTTP errors
 */
public function triggerSimilarity($submissionId, $opts = [])
{
    $defaults = [
        'generation_settings' => [
            'search_repositories' => ['INTERNET', 'PUBLICATION', 'CROSSREF', 'CROSSREF_POSTED_CONTENT', 'SUBMITTED_WORK'],
            'submission_auto_excludes' => true,
            'auto_exclude_self_matching_scope' => 'GROUP_CONTEXT',
        ],
        'view_settings' => [
            'exclude_quotes' => true,
            'exclude_bibliography' => true,
            'exclude_citations' => true,
        ],
        'indexing_settings' => [
            'add_to_index' => true,
        ],
    ];

    $body = $defaults;
    foreach ($opts as $section => $value) {
        if (isset($defaults[$section]) && is_array($value)) {
            // One level deep only: list values such as search_repositories are replaced whole.
            $body[$section] = array_merge($defaults[$section], $value);
        } else {
            $body[$section] = $value;
        }
    }

    return $this->request(
        'PUT',
        '/submissions/' . urlencode($submissionId) . '/similarity',
        $body
    );
}
/**
 * Query the similarity status.
 * GET /submissions/{id}/similarity
 *
 * Per the original author's notes the response carries status PROCESSING / COMPLETE / ERROR,
 * and on COMPLETE also overall_match_percentage / time_requested / time_generated.
 *
 * @param string $submissionId
 * @return mixed Decoded response of request()
 * @throws Exception On curl or HTTP errors
 */
public function getSimilarityStatus($submissionId)
{
return $this->request(
'GET',
'/submissions/' . urlencode($submissionId) . '/similarity'
);
}
/**
 * Obtain a temporary URL for viewing the report online.
 * POST /submissions/{id}/viewer-url
 *
 * @param string $submissionId
 * @param array  $viewer Optional viewer settings, shallow-merged over the defaults,
 *                       e.g. ['viewer_default_permission_set' => 'INSTRUCTOR']
 * @return mixed Decoded response of request()
 * @throws Exception On curl or HTTP errors
 */
public function getViewerUrl($submissionId, $viewer = [])
{
    $defaults = [
        'viewer_default_permission_set' => 'INSTRUCTOR',
        'similarity' => [
            'default_mode'  => 'MATCH_OVERVIEW',
            'view_settings' => ['save_changes' => true],
            'modes'         => ['match_overview' => true, 'all_sources' => true],
        ],
        'locale' => 'en-US',
    ];
    $settings = array_merge($defaults, $viewer);
    $endpoint = '/submissions/' . urlencode($submissionId) . '/viewer-url';

    return $this->request('POST', $endpoint, $settings);
}
/**
 * Ask Turnitin to generate the PDF report; generation progress is checked separately
 * through getPdfReportStatus().
 * POST /submissions/{id}/similarity/pdf
 *
 * @param string $submissionId
 * @param array  $opts Extra body fields, shallow-merged over the default ['locale' => 'en-US']
 * @return mixed Decoded response of request(); callers read the PDF report id from 'id'
 * @throws Exception On curl or HTTP errors
 */
public function requestPdfReport($submissionId, $opts = [])
{
    $settings = array_merge(['locale' => 'en-US'], $opts);
    $endpoint = '/submissions/' . urlencode($submissionId) . '/similarity/pdf';

    return $this->request('POST', $endpoint, $settings);
}
/**
 * Query the generation status of a PDF report.
 * GET /submissions/{id}/similarity/pdf/{pdf_id}/status
 *
 * Per the original author's notes, status is one of PENDING / SUCCESS / FAILED.
 *
 * @param string $submissionId
 * @param string $pdfId PDF report id
 * @return mixed Decoded response of request()
 * @throws Exception On curl or HTTP errors
 */
public function getPdfReportStatus($submissionId, $pdfId)
{
return $this->request(
'GET',
'/submissions/' . urlencode($submissionId) . '/similarity/pdf/' . urlencode($pdfId) . '/status'
);
}
/**
 * Download the PDF report body (per the original author's note, only valid once the report
 * status is SUCCESS).
 * GET /submissions/{id}/similarity/pdf/{pdf_id}
 *
 * @param string $submissionId
 * @param string $pdfId PDF report id
 * @return string Raw response body (not JSON-decoded); the caller is responsible for storing it
 * @throws Exception On curl or HTTP errors
 */
public function downloadPdfReport($submissionId, $pdfId)
{
return $this->request(
'GET',
'/submissions/' . urlencode($submissionId) . '/similarity/pdf/' . urlencode($pdfId),
null,
[],
true // raw response: skip json_decode
);
}
// ==================== Internal HTTP layer ====================
/**
 * Single HTTP entry point for all Turnitin calls.
 *
 * @param string $method       GET/POST/PUT/DELETE
 * @param string $path         Path starting with "/", appended to the base URL
 * @param mixed  $body         array: sent JSON-encoded; string: sent as the raw body; null: no body
 * @param array  $extraHeaders Additional headers as name => value
 * @param bool   $rawResponse  true: return the raw body; false: json_decode it
 * @return mixed Decoded array; [] for an empty body; the raw string when $rawResponse is true
 *               or the body is not valid JSON
 * @throws Exception On an unencodable body, curl failure, or a non-2xx HTTP status
 */
private function request($method, $path, $body = null, $extraHeaders = [], $rawResponse = false)
{
    $url = $this->baseUrl . $path;
    $headers = [
        'Authorization: Bearer ' . $this->apiKey,
        'X-Turnitin-Integration-Name: ' . $this->integrationName,
        'X-Turnitin-Integration-Version: ' . $this->integrationVersion,
    ];
    $payload = null;
    if ($body !== null) {
        if (is_array($body)) {
            $payload = json_encode($body, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
            // json_encode() returns false on e.g. malformed UTF-8; sending that would silently
            // become an empty body and surface as an unrelated API error.
            if ($payload === false) {
                throw new Exception("Turnitin request body not JSON-encodable ({$method} {$path}): " . json_last_error_msg());
            }
            $headers[] = 'Content-Type: application/json';
        } else {
            $payload = $body;
            if (!isset($extraHeaders['Content-Type'])) {
                $headers[] = 'Content-Type: application/octet-stream';
            }
        }
    }
    foreach ($extraHeaders as $k => $v) {
        // Strip line breaks so a header value can never smuggle in an additional header.
        $headers[] = str_replace(["\r", "\n"], '', $k . ': ' . $v);
    }

    $ch = curl_init();
    if ($ch === false) {
        throw new Exception("Turnitin curl error: curl_init failed (url={$url})");
    }
    curl_setopt_array($ch, [
        CURLOPT_URL            => $url,
        CURLOPT_CUSTOMREQUEST  => strtoupper($method),
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HTTPHEADER     => $headers,
        CURLOPT_TIMEOUT        => $this->timeout,
        CURLOPT_CONNECTTIMEOUT => 15,
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
    ]);
    if ($payload !== null) {
        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
    }
    $resp = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $err = curl_error($ch);
    curl_close($ch);

    if ($resp === false) {
        throw new Exception("Turnitin curl error: {$err} (url={$url})");
    }
    if ($httpCode < 200 || $httpCode >= 300) {
        // Include the first 1000 characters of the body to ease troubleshooting.
        $excerpt = mb_substr((string)$resp, 0, 1000);
        throw new Exception("Turnitin HTTP {$httpCode} {$method} {$path}: {$excerpt}");
    }
    if ($rawResponse) {
        return $resp;
    }
    // Some responses have no body (e.g. 204 No Content).
    if ($resp === '' || $resp === null) {
        return [];
    }
    $data = json_decode($resp, true);
    if (json_last_error() !== JSON_ERROR_NONE) {
        // Not JSON: hand the body back unchanged.
        return $resp;
    }
    return $data;
}
}

View File

@@ -0,0 +1,44 @@
-- Plagiarism check job table (Turnitin TCA / Crossref Similarity Check).
--
-- An article may be checked any number of times; the most recent check is the one shown on
-- the editor's article detail page.
-- state: 0 pending upload -> 1 uploading -> 2 comparing -> 3 completed;
--        a failure at any step moves the row to 4 (failed).
-- A failed check is retried by creating a new row, so history is preserved.
--
-- Reports are kept permanently: pdf_local_path points at the PDF under runtime/plagiarism/.
-- view_only_url is a temporary signed Turnitin URL and must be regenerated once it expires.
--
-- This script is safe to re-run: it never drops the table, because the table holds that
-- permanent history. utf8mb4 is used so raw API responses and error messages containing
-- 4-byte characters (e.g. mathematical symbols in titles) can be stored.
CREATE TABLE IF NOT EXISTS `t_plagiarism_check` (
  `check_id` INT NOT NULL AUTO_INCREMENT,
  `article_id` INT NOT NULL DEFAULT 0 COMMENT '关联投稿 t_article.article_id',
  `journal_id` INT NOT NULL DEFAULT 0 COMMENT '所属期刊(冗余便于按期刊统计)',
  `triggered_by` INT NOT NULL DEFAULT 0 COMMENT '触发人 user_id手工触发时编辑的 user_id',
  `trigger_source` VARCHAR(32) NOT NULL DEFAULT 'manual' COMMENT 'manual/auto_initial_review/...',
  `state` TINYINT NOT NULL DEFAULT 0 COMMENT '0待上传 1上传中 2比对中 3完成 4失败',
  -- Entity ids on the Turnitin side
  `tii_submission_id` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'Turnitin submission UUID',
  `tii_report_status` VARCHAR(32) NOT NULL DEFAULT '' COMMENT 'PROCESSING/COMPLETE/ERROR',
  -- Results
  `similarity_score` DECIMAL(5,2) NOT NULL DEFAULT 0 COMMENT '总相似度 %(如 12.34',
  `view_only_url` VARCHAR(1024) NOT NULL DEFAULT '' COMMENT '在线查看报告 URL临时签名',
  `view_only_url_expire` INT NOT NULL DEFAULT 0 COMMENT '在线查看 URL 过期时间戳',
  `pdf_local_path` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '本地缓存的 PDF 报告相对路径',
  -- Source file metadata, recorded at submit time for tracing
  `source_file_name` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '原始 PDF 文件名',
  `source_file_size` INT NOT NULL DEFAULT 0 COMMENT '原始 PDF 字节数',
  -- Debugging and retries
  `attempts` INT NOT NULL DEFAULT 0 COMMENT '总轮询/重试次数',
  `error_msg` VARCHAR(1024) NOT NULL DEFAULT '' COMMENT '失败原因',
  `raw_response` MEDIUMTEXT COMMENT '最近一次 Turnitin API 原始返回(调试用)',
  `ctime` INT NOT NULL DEFAULT 0,
  `utime` INT NOT NULL DEFAULT 0,
  PRIMARY KEY (`check_id`),
  KEY `idx_article` (`article_id`, `state`),
  KEY `idx_state` (`state`),
  KEY `idx_tii_submission` (`tii_submission_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Turnitin 查重任务表';

102
test_plagiarism_e2e.php Normal file
View File

@@ -0,0 +1,102 @@
<?php
/**
* Turnitin TCA 端到端连通性测试。
*
* 用法(在项目根执行):
* php test_plagiarism_e2e.php features # 探活
* php test_plagiarism_e2e.php submit <article_id> # 用 article 主稿提交查重(手工触发)
* php test_plagiarism_e2e.php submit-file <pdf> # 用本地 PDF 提交(不绑定 article
* php test_plagiarism_e2e.php status <check_id> # 查询状态
* php test_plagiarism_e2e.php list <article_id> # 列出某 article 的查重记录
* php test_plagiarism_e2e.php viewer <check_id> # 取在线查看 URL
*
* 说明:
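* submit-file is meant for connectivity checks only, but it still inserts a t_plagiarism_check row (with article_id=0) and enqueues the job, exactly like submit.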
* submit 会写入 t_plagiarism_check并把 check_id 打回,再用 status 自己轮询。
*/
define('IS_CLI', true);
require __DIR__ . '/thinkphp/start.php';

use think\Db;
use app\common\PlagiarismService;
use app\common\TurnitinService;

if ($argc < 2) {
    echo "Usage: php test_plagiarism_e2e.php <command> [args...]\n";
    exit(1);
}

$cmd = $argv[1];

// Print the given usage line and exit with code 1 when the command's argument is missing.
$requireArgument = function ($usageLine) use ($argc) {
    if ($argc < 3) {
        echo $usageLine;
        exit(1);
    }
};

try {
    if ($cmd === 'features') {
        // Liveness probe: dump the account capabilities.
        $tii = new TurnitinService();
        print_r($tii->featuresEnabled());
    } elseif ($cmd === 'submit') {
        // Submit the article's main manuscript; the queue worker must be started separately.
        $requireArgument("Usage: ... submit <article_id>\n");
        $articleId = intval($argv[2]);
        $svc = new PlagiarismService();
        $local = $svc->locateArticleManuscript($articleId);
        echo "manuscript local path: {$local}\n";
        $checkId = $svc->submit($articleId, $local, 0, 'cli_test');
        echo "submitted, check_id = {$checkId}\n";
        echo "now run: php think queue:work --queue plagiarism --tries=1 -v\n";
    } elseif ($cmd === 'submit-file') {
        // Submit an arbitrary local file, not bound to an article (article_id = 0).
        $requireArgument("Usage: ... submit-file <pdf_path>\n");
        $path = $argv[2];
        if (!is_file($path)) {
            echo "file not exists: {$path}\n";
            exit(1);
        }
        $svc = new PlagiarismService();
        $checkId = $svc->submit(0, $path, 0, 'cli_test_file');
        echo "submitted, check_id = {$checkId}\n";
    } elseif ($cmd === 'status') {
        // Dump the raw row of one check.
        $requireArgument("Usage: ... status <check_id>\n");
        $row = Db::name('plagiarism_check')->where('check_id', intval($argv[2]))->find();
        print_r($row);
    } elseif ($cmd === 'list') {
        // Dump all checks of one article, newest first.
        $requireArgument("Usage: ... list <article_id>\n");
        $rows = Db::name('plagiarism_check')
            ->where('article_id', intval($argv[2]))
            ->order('check_id desc')
            ->select();
        print_r($rows);
    } elseif ($cmd === 'viewer') {
        // Generate and print a fresh online viewer URL.
        $requireArgument("Usage: ... viewer <check_id>\n");
        $svc = new PlagiarismService();
        print_r($svc->refreshViewerUrlFor(intval($argv[2])));
    } else {
        echo "unknown command: {$cmd}\n";
        exit(1);
    }
} catch (\Throwable $e) {
    echo "ERROR: " . $e->getMessage() . "\n" . $e->getTraceAsString() . "\n";
    exit(1);
}