/**
 * List every plagiarism check stored for one article, highest check_id first.
 *
 * Reads the `article_id` request parameter, loads the matching
 * `plagiarism_check` rows and passes each of them through formatRow().
 *
 * @return mixed result of jsonError() when article_id is missing or not
 *               positive, otherwise jsonSuccess() with ['list' => rows]
 */
public function getList()
{
    $requestedArticle = (int)$this->request->param('article_id', 0);
    if ($requestedArticle < 1) {
        return jsonError('article_id required');
    }

    $query = Db::name('plagiarism_check')->where('article_id', $requestedArticle);
    $records = $query->order('check_id desc')->select();

    // Shape each raw DB row into the API representation.
    $formatted = [];
    foreach ($records as $record) {
        $formatted[] = $this->formatRow($record);
    }

    return jsonSuccess(['list' => $formatted]);
}
/**
 * Send the locally stored PDF report of one check to the browser as a download.
 *
 * Looks the check up by the `check_id` request parameter, resolves its
 * `pdf_local_path` against the project root and returns the file content
 * with PDF download headers.
 *
 * The file is read exactly once and Content-Length is derived from the bytes
 * actually read, so the header can never disagree with the body. A read
 * failure is reported through jsonError() like every other error of this
 * controller instead of producing an empty or broken download.
 *
 * @return mixed jsonError() result on any failure, otherwise a Response
 *               carrying the PDF bytes
 */
public function downloadReport()
{
    $checkId = intval($this->request->param('check_id', 0));
    if ($checkId <= 0) {
        return jsonError('check_id required');
    }
    $row = Db::name('plagiarism_check')->where('check_id', $checkId)->find();
    if (!$row || empty($row['pdf_local_path'])) {
        return jsonError('report not available');
    }

    // pdf_local_path is stored relative to the project root with '/' separators.
    $rootDir = ROOT_PATH ?: dirname(dirname(dirname(__DIR__)));
    $abs = rtrim($rootDir, '/\\') . DIRECTORY_SEPARATOR
        . str_replace(['/', '\\'], DIRECTORY_SEPARATOR, $row['pdf_local_path']);
    if (!is_file($abs)) {
        return jsonError('pdf file missing on disk: ' . $row['pdf_local_path']);
    }

    // The file may vanish or be unreadable between is_file() and the read;
    // cover both the `false` return and a warning converted to an exception.
    try {
        $pdf = file_get_contents($abs);
    } catch (\Throwable $e) {
        return jsonError($e->getMessage());
    }
    if ($pdf === false) {
        return jsonError('pdf file not readable: ' . $row['pdf_local_path']);
    }

    $filename = sprintf('plagiarism_check_%d_article_%d.pdf', $row['check_id'], $row['article_id']);
    return Response::create($pdf, 'html', 200, [
        'Content-Type' => 'application/pdf',
        'Content-Disposition' => 'attachment; filename="' . $filename . '"',
        'Content-Length' => (string)strlen($pdf),
    ]);
}
/**
 * Queue entry point: upload the manuscript of one check and trigger similarity.
 *
 * Expects `check_id` and `file_path` in the payload. An invalid payload is
 * logged and the job deleted without touching the service. Otherwise the
 * work is delegated to PlagiarismService::runUploadAndTrigger(); failures
 * are handed to the queue helper's handleException().
 *
 * @param Job   $job  queue job being executed
 * @param mixed $data job payload
 * @return void
 */
public function fire(Job $job, $data)
{
    $this->oQueueJob->init($job);

    $checkId = intval($data['check_id'] ?? 0);
    $filePath = (string)($data['file_path'] ?? '');

    $payloadIsUsable = $checkId > 0 && $filePath !== '';
    if (!$payloadIsUsable) {
        $this->oQueueJob->log("PlagiarismRun 无效参数 check_id={$checkId} file_path={$filePath},删除任务");
        $job->delete();
        return;
    }

    $context = "check_id={$checkId}";
    try {
        (new PlagiarismService())->runUploadAndTrigger($checkId, $filePath);
        $this->oQueueJob->log("PlagiarismRun 完成 | check_id={$checkId}");
        $job->delete();
    } catch (\Exception $failure) {
        // Original author's note: the service has already marked the check as
        // failed, and handleException() exits the worker on fatal DB errors.
        $this->oQueueJob->handleException($failure, $job, $context);
    } catch (\Throwable $failure) {
        $this->oQueueJob->handleException($failure, $job, $context);
    } finally {
        $this->oQueueJob->finnal();
    }
}
$check['article_id']; + } + + $createResp = $tii->createSubmission([ + 'title' => mb_substr($articleTitle, 0, 250), + 'owner' => 'editor_' . $check['triggered_by'], + 'submitter' => 'editor_' . $check['triggered_by'], + 'metadata' => [ + 'article_id' => (string)$check['article_id'], + 'check_id' => (string)$check['check_id'], + ], + ]); + $submissionId = isset($createResp['id']) ? $createResp['id'] : ''; + if ($submissionId === '') { + throw new Exception('Turnitin createSubmission returned empty id: ' . json_encode($createResp)); + } + + $this->updateCheck($checkId, [ + 'tii_submission_id' => $submissionId, + 'raw_response' => json_encode($createResp, JSON_UNESCAPED_UNICODE), + ]); + + // 2. 上传文件 + $tii->uploadFile($submissionId, $filePath, basename($filePath)); + + // 3. 触发 similarity + $simResp = $tii->triggerSimilarity($submissionId); + + $this->updateCheck($checkId, [ + 'state' => 2, // 比对中 + 'tii_report_status' => 'PROCESSING', + 'raw_response' => json_encode($simResp, JSON_UNESCAPED_UNICODE), + ]); + + // 4. 排队首次轮询(晚一点开始,让 Turnitin 先处理) + Queue::later( + self::POLL_INTERVAL, + 'app\\api\\job\\PlagiarismPoll', + ['check_id' => $checkId, 'attempt' => 1], + 'plagiarism' + ); + } catch (\Throwable $e) { + $this->markFailed($checkId, '[upload] ' . $e->getMessage()); + throw $e; + } + } + + /** + * Job 调用:轮询 similarity 状态,完成后下载 PDF。未完成则重新入队。 + */ + public function runPollStatus($checkId, $attempt = 1) + { + $check = $this->mustGetCheck($checkId); + if (empty($check['tii_submission_id'])) { + $this->markFailed($checkId, '[poll] tii_submission_id empty'); + return; + } + + try { + $tii = new TurnitinService(); + $statusResp = $tii->getSimilarityStatus($check['tii_submission_id']); + $status = isset($statusResp['status']) ? 
strtoupper($statusResp['status']) : ''; + + $this->updateCheck($checkId, [ + 'tii_report_status' => $status, + 'attempts' => $attempt, + 'raw_response' => json_encode($statusResp, JSON_UNESCAPED_UNICODE), + ]); + + if ($status === 'COMPLETE') { + $score = isset($statusResp['overall_match_percentage']) + ? floatval($statusResp['overall_match_percentage']) : 0; + + // 下载 PDF + 取在线查看 URL + $localPdf = $this->downloadAndStorePdf($tii, $check['tii_submission_id'], $checkId); + $viewerInfo = $this->refreshViewerUrl($tii, $check['tii_submission_id']); + + $this->updateCheck($checkId, [ + 'state' => 3, + 'similarity_score' => $score, + 'pdf_local_path' => $localPdf, + 'view_only_url' => $viewerInfo['url'], + 'view_only_url_expire' => $viewerInfo['expire'], + 'error_msg' => '', + ]); + return; + } + + if ($status === 'ERROR') { + $errMsg = isset($statusResp['error_code']) ? (string)$statusResp['error_code'] : 'Turnitin reported ERROR'; + $this->markFailed($checkId, '[poll] ' . $errMsg); + return; + } + + // PROCESSING 或其它中间态:继续轮询 + if ($attempt >= self::MAX_POLL_ATTEMPTS) { + $this->markFailed($checkId, '[poll] timeout after ' . $attempt . ' attempts'); + return; + } + Queue::later( + self::POLL_INTERVAL, + 'app\\api\\job\\PlagiarismPoll', + ['check_id' => $checkId, 'attempt' => $attempt + 1], + 'plagiarism' + ); + } catch (\Throwable $e) { + // 网络抖动不要直接 fail,给一定容错次数 + if ($attempt < self::MAX_POLL_ATTEMPTS) { + Queue::later( + self::POLL_INTERVAL, + 'app\\api\\job\\PlagiarismPoll', + ['check_id' => $checkId, 'attempt' => $attempt + 1], + 'plagiarism' + ); + $this->updateCheck($checkId, [ + 'attempts' => $attempt, + 'error_msg' => '[poll] transient: ' . $e->getMessage(), + ]); + return; + } + $this->markFailed($checkId, '[poll] exhausted: ' . 
/**
 * Request a PDF similarity report, wait for it, download it and store it on disk.
 *
 * Steps: request generation, poll the report status inline (at most 30 status
 * checks, 6 seconds apart), download the binary, write it below REPORT_DIR.
 *
 * Unlike the previous version this method fails explicitly when the report
 * never reaches SUCCESS within the polling budget (it used to fall through
 * and try to download an unfinished report), and when the target directory
 * cannot be created.
 *
 * @param object     $tii          client exposing requestPdfReport(),
 *                                 getPdfReportStatus() and downloadPdfReport()
 * @param string     $submissionId Turnitin submission id
 * @param int|string $checkId      check id, used in the stored file name
 * @return string stored file path: REPORT_DIR . '/' . file name
 * @throws Exception when generation fails or times out, the download is
 *                   empty/too small, or the file cannot be written
 */
private function downloadAndStorePdf($tii, $submissionId, $checkId)
{
    // 1. Request generation.
    $req = $tii->requestPdfReport($submissionId);
    $pdfId = isset($req['id']) ? $req['id'] : '';
    if ($pdfId === '') {
        throw new Exception('requestPdfReport empty id: ' . json_encode($req));
    }

    // 2. Poll the PDF status inline.
    $maxLoops = 30;
    $ready = false;
    for ($i = 0; $i < $maxLoops; $i++) {
        $st = $tii->getPdfReportStatus($submissionId, $pdfId);
        $stCode = isset($st['status']) ? strtoupper($st['status']) : '';
        if ($stCode === 'SUCCESS') {
            $ready = true;
            break;
        }
        if ($stCode === 'FAILED') {
            throw new Exception('PDF report generation failed: ' . json_encode($st));
        }
        // No point in sleeping after the final status check.
        if ($i < $maxLoops - 1) {
            sleep(6);
        }
    }
    if (!$ready) {
        throw new Exception('PDF report not ready after ' . $maxLoops . ' status checks');
    }

    // 3. Download.
    $binary = $tii->downloadPdfReport($submissionId, $pdfId);
    if (!is_string($binary) || strlen($binary) < 100) {
        throw new Exception('downloaded pdf is empty/too small');
    }

    // 4. Write to disk.
    $rootDir = ROOT_PATH ?: dirname(dirname(__DIR__));
    $absDir = rtrim($rootDir, '/\\') . DIRECTORY_SEPARATOR . self::REPORT_DIR;
    // The warning is suppressed on purpose: a concurrent worker may create the
    // directory first, so the outcome is verified with is_dir() afterwards.
    if (!is_dir($absDir) && !@mkdir($absDir, 0755, true) && !is_dir($absDir)) {
        throw new Exception('failed to create report directory ' . $absDir);
    }
    $filename = sprintf('check_%d_%s.pdf', $checkId, date('Ymd_His'));
    $absPath = $absDir . DIRECTORY_SEPARATOR . $filename;
    $bytes = file_put_contents($absPath, $binary);
    if ($bytes === false || $bytes < 100) {
        throw new Exception('failed to save pdf to ' . $absPath);
    }
    return self::REPORT_DIR . '/' . $filename;
}
/**
 * Download a remote file into REPORT_DIR/tmp and return its local path.
 *
 * Hardening compared with the previous version:
 *  - TLS certificate and host verification are enabled (they were switched
 *    off, allowing a man-in-the-middle to substitute the manuscript);
 *  - only http/https are allowed, also after redirects, so a redirect cannot
 *    switch to file://, gopher:// and similar schemes;
 *  - failures of mkdir(), fopen() and curl_init() are detected instead of
 *    being passed on to cURL.
 *
 * NOTE(review): the URL can originate from a request parameter. Restricting
 * the scheme does not stop requests to internal hosts; consider a host
 * allow-list if that matters for this deployment.
 *
 * @param string $url http(s) URL of the file
 * @return string absolute path of the downloaded temporary file
 * @throws Exception when the temp file cannot be created or the download
 *                   fails (transfer error, non-200 status, body under 100 bytes)
 */
private function downloadRemoteFile($url)
{
    $rootDir = ROOT_PATH ?: dirname(dirname(__DIR__));
    $tmpDir = rtrim($rootDir, '/\\') . DIRECTORY_SEPARATOR . self::REPORT_DIR . DIRECTORY_SEPARATOR . 'tmp';
    // Warning suppressed on purpose; the result is verified via is_dir().
    if (!is_dir($tmpDir) && !@mkdir($tmpDir, 0755, true) && !is_dir($tmpDir)) {
        throw new Exception("cannot create temp dir {$tmpDir}");
    }

    // parse_url() may yield null/false for odd URLs; fall back to 'pdf'.
    $ext = pathinfo((string)parse_url($url, PHP_URL_PATH), PATHINFO_EXTENSION) ?: 'pdf';
    $local = $tmpDir . DIRECTORY_SEPARATOR . md5($url) . '_' . time() . '.' . $ext;

    $fh = fopen($local, 'wb');
    if ($fh === false) {
        throw new Exception("cannot open temp file {$local}");
    }
    $ch = curl_init($url);
    if ($ch === false) {
        fclose($fh);
        @unlink($local);
        throw new Exception("cannot initialise curl for url={$url}");
    }

    curl_setopt_array($ch, [
        CURLOPT_FILE => $fh,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_TIMEOUT => 120,
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
    ]);
    $ok = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);
    fclose($fh);

    // Make sure filesize() reports the size after the transfer.
    clearstatcache(true, $local);
    if (!$ok || $code !== 200 || filesize($local) < 100) {
        @unlink($local); // best-effort cleanup of the partial file
        throw new Exception("download failed url={$url} http={$code}");
    }
    return $local;
}
/**
 * Load the Turnitin connection settings from the environment.
 *
 * Every value is cast to string and trimmed; the base URL additionally loses
 * its trailing slashes. Integration name and version default to 'tmr' and
 * '1.0.0'.
 *
 * @throws Exception when the base URL or the API key is empty
 */
public function __construct()
{
    $setting = function ($key, $fallback) {
        return trim((string)Env::get($key, $fallback));
    };

    $this->baseUrl = rtrim($setting('turnitin.base_url', ''), '/');
    $this->apiKey = $setting('turnitin.api_key', '');
    $this->integrationName = $setting('turnitin.integration_name', 'tmr');
    $this->integrationVersion = $setting('turnitin.integration_version', '1.0.0');

    $configured = $this->baseUrl !== '' && $this->apiKey !== '';
    if (!$configured) {
        throw new Exception('Turnitin not configured: missing BASE_URL or API_KEY in .env [turnitin] section');
    }
}
/**
 * Upload a local file as the original document of a submission.
 * PUT /submissions/{id}/original/{filename}
 *
 * Changes compared with the previous version:
 *  - a failed read is reported instead of sending `false` as the body;
 *  - the filename placed in the Content-Disposition header has quotes,
 *    backslashes and control characters replaced, so a crafted file name
 *    cannot break out of the quoted value or inject extra header lines.
 *
 * NOTE(review): confirm against the Turnitin API reference that the upload
 * path takes a trailing filename segment; the path is kept as it was.
 *
 * @param string $submissionId Turnitin submission id
 * @param string $filePath     local path of the document to upload
 * @param string $filename     file name reported to Turnitin; defaults to
 *                             basename($filePath)
 * @return mixed decoded response of the request
 * @throws Exception when the file is missing, unreadable or the request fails
 */
public function uploadFile($submissionId, $filePath, $filename = '')
{
    if (!is_file($filePath) || !is_readable($filePath)) {
        throw new Exception("File not found or not readable: {$filePath}");
    }
    if ($filename === '') {
        $filename = basename($filePath);
    }

    $body = file_get_contents($filePath);
    if ($body === false) {
        throw new Exception("Failed to read file: {$filePath}");
    }

    // Header-safe variant of the name: no quote, backslash or control bytes.
    $headerFilename = str_replace(['"', '\\', "\r", "\n", "\0"], '_', $filename);

    return $this->request(
        'PUT',
        '/submissions/' . urlencode($submissionId) . '/original/' . rawurlencode($filename),
        $body,
        [
            'Content-Type' => 'binary/octet-stream',
            'Content-Disposition' => 'inline; filename="' . $headerFilename . '"',
        ]
    );
}
/**
 * Ask Turnitin for a temporary URL of the online report viewer.
 * POST /submissions/{id}/viewer-url
 *
 * The defaults below are merged with $viewer using array_merge(), i.e. a
 * top-level key supplied by the caller replaces the whole default entry
 * (nested arrays are not merged key by key).
 *
 * @param string $submissionId Turnitin submission id
 * @param array  $viewer       optional overrides for the request body
 * @return mixed decoded response of the request
 */
public function getViewerUrl($submissionId, $viewer = [])
{
    $defaults = [
        'viewer_default_permission_set' => 'INSTRUCTOR',
        'similarity' => [
            'default_mode' => 'MATCH_OVERVIEW',
            'view_settings' => ['save_changes' => true],
            'modes' => ['match_overview' => true, 'all_sources' => true],
        ],
        'locale' => 'en-US',
    ];
    $endpoint = sprintf('/submissions/%s/viewer-url', urlencode($submissionId));

    return $this->request('POST', $endpoint, array_merge($defaults, $viewer));
}
/**
 * Fetch the generated PDF report as a raw string.
 * GET /submissions/{id}/similarity/pdf/{pdf_id}
 *
 * The request is made in raw-response mode, so the body is returned without
 * JSON decoding; storing it is left to the caller.
 *
 * @param string $submissionId Turnitin submission id
 * @param string $pdfId        id returned by requestPdfReport()
 * @return mixed raw response body
 */
public function downloadPdfReport($submissionId, $pdfId)
{
    $reportPath = sprintf(
        '/submissions/%s/similarity/pdf/%s',
        urlencode($submissionId),
        urlencode($pdfId)
    );
    $noBody = null;
    $noExtraHeaders = [];
    $wantRawBody = true; // skip json_decode

    return $this->request('GET', $reportPath, $noBody, $noExtraHeaders, $wantRawBody);
}
-- Plagiarism check task table (Turnitin TCA / Crossref Similarity Check)
--
-- One article may be checked several times; the most recent check of an
-- article is the one shown on the editor detail page.
-- State flow: 0 pending upload -> 1 uploading -> 2 comparing -> 3 done.
-- 4 (failed) is an alternative end state, not a successor of 3.
-- A failed check can be triggered again; that creates a new row (history is kept).
--
-- Reports are kept permanently: pdf_local_path points at the local PDF under
-- runtime/plagiarism/; view_only_url is a temporary signed Turnitin URL that
-- expires after a few hours and has to be regenerated afterwards.
--
-- This script is safe to re-run: it never drops the table, because doing so
-- would erase the check history that is meant to be permanent.
-- utf8mb4 is used so that 4-byte characters in file names, error messages
-- and the raw API response can be stored.

CREATE TABLE IF NOT EXISTS `t_plagiarism_check` (
  `check_id`             INT NOT NULL AUTO_INCREMENT,
  `article_id`           INT NOT NULL DEFAULT 0 COMMENT '关联投稿 t_article.article_id',
  `journal_id`           INT NOT NULL DEFAULT 0 COMMENT '所属期刊(冗余便于按期刊统计)',
  `triggered_by`         INT NOT NULL DEFAULT 0 COMMENT '触发人 user_id(手工触发时编辑的 user_id)',
  `trigger_source`       VARCHAR(32) NOT NULL DEFAULT 'manual' COMMENT 'manual/auto_initial_review/...',
  `state`                TINYINT NOT NULL DEFAULT 0 COMMENT '0待上传 1上传中 2比对中 3完成 4失败',

  -- Entity ids on the Turnitin side
  `tii_submission_id`    VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'Turnitin submission UUID',
  `tii_report_status`    VARCHAR(32) NOT NULL DEFAULT '' COMMENT 'PROCESSING/COMPLETE/ERROR',

  -- Result
  `similarity_score`     DECIMAL(5,2) NOT NULL DEFAULT 0 COMMENT '总相似度 %(如 12.34)',
  `view_only_url`        VARCHAR(1024) NOT NULL DEFAULT '' COMMENT '在线查看报告 URL(临时签名)',
  `view_only_url_expire` INT NOT NULL DEFAULT 0 COMMENT '在线查看 URL 过期时间戳',
  `pdf_local_path`       VARCHAR(255) NOT NULL DEFAULT '' COMMENT '本地缓存的 PDF 报告相对路径',

  -- Source file metadata (recorded at upload time for traceability)
  `source_file_name`     VARCHAR(255) NOT NULL DEFAULT '' COMMENT '原始 PDF 文件名',
  `source_file_size`     INT NOT NULL DEFAULT 0 COMMENT '原始 PDF 字节数',

  -- Debugging and retries
  `attempts`             INT NOT NULL DEFAULT 0 COMMENT '总轮询/重试次数',
  `error_msg`            VARCHAR(1024) NOT NULL DEFAULT '' COMMENT '失败原因',
  `raw_response`         MEDIUMTEXT COMMENT '最近一次 Turnitin API 原始返回(调试用)',

  `ctime`                INT NOT NULL DEFAULT 0,
  `utime`                INT NOT NULL DEFAULT 0,
  PRIMARY KEY (`check_id`),
  KEY `idx_article` (`article_id`, `state`),
  KEY `idx_state` (`state`),
  KEY `idx_tii_submission` (`tii_submission_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Turnitin 查重任务表';
'/thinkphp/start.php'; + +use think\Db; +use app\common\PlagiarismService; +use app\common\TurnitinService; + +if ($argc < 2) { + echo "Usage: php test_plagiarism_e2e.php [args...]\n"; + exit(1); +} +$cmd = $argv[1]; + +try { + switch ($cmd) { + case 'features': { + $tii = new TurnitinService(); + print_r($tii->featuresEnabled()); + break; + } + case 'submit': { + if ($argc < 3) { + echo "Usage: ... submit \n"; + exit(1); + } + $articleId = intval($argv[2]); + $svc = new PlagiarismService(); + $local = $svc->locateArticleManuscript($articleId); + echo "manuscript local path: {$local}\n"; + $checkId = $svc->submit($articleId, $local, 0, 'cli_test'); + echo "submitted, check_id = {$checkId}\n"; + echo "now run: php think queue:work --queue plagiarism --tries=1 -v\n"; + break; + } + case 'submit-file': { + if ($argc < 3) { + echo "Usage: ... submit-file \n"; + exit(1); + } + $path = $argv[2]; + if (!is_file($path)) { + echo "file not exists: {$path}\n"; + exit(1); + } + $svc = new PlagiarismService(); + $checkId = $svc->submit(0, $path, 0, 'cli_test_file'); + echo "submitted, check_id = {$checkId}\n"; + break; + } + case 'status': { + if ($argc < 3) { + echo "Usage: ... status \n"; + exit(1); + } + $row = Db::name('plagiarism_check')->where('check_id', intval($argv[2]))->find(); + print_r($row); + break; + } + case 'list': { + if ($argc < 3) { + echo "Usage: ... list \n"; + exit(1); + } + $rows = Db::name('plagiarism_check')->where('article_id', intval($argv[2]))->order('check_id desc')->select(); + print_r($rows); + break; + } + case 'viewer': { + if ($argc < 3) { + echo "Usage: ... viewer \n"; + exit(1); + } + $svc = new PlagiarismService(); + print_r($svc->refreshViewerUrlFor(intval($argv[2]))); + break; + } + default: + echo "unknown command: {$cmd}\n"; + exit(1); + } +} catch (\Throwable $e) { + echo "ERROR: " . $e->getMessage() . "\n" . $e->getTraceAsString() . "\n"; + exit(1); +}