424 lines
15 KiB
PHP
424 lines
15 KiB
PHP
<?php
|
||
|
||
namespace app\common;
|
||
|
||
use think\Db;
|
||
use think\Env;
|
||
use think\Queue;
|
||
use think\Exception;
|
||
|
||
/**
|
||
* 查重业务层:把 TurnitinService 的低层调用包装成"按 article 查重"的高层流程,
|
||
* 并维护 t_plagiarism_check 状态机。
|
||
*
|
||
* 状态流:
|
||
* submit() → state=1(上传中),入队 PlagiarismRun
|
||
* PlagiarismRun.fire → 上传 + 触发 similarity → state=2(比对中),入队 PlagiarismPoll
|
||
* PlagiarismPoll.fire → 轮询 status,完成后下载 PDF → state=3(完成)
|
||
* 任意环节抛异常 → state=4(失败),写 error_msg
|
||
*/
|
||
class PlagiarismService
|
||
{
|
||
/**
|
||
* 报告 PDF 本地保存目录(相对于项目根,永久保留)
|
||
*/
|
||
const REPORT_DIR = 'public/plagiarism';
|
||
|
||
/**
|
||
* 轮询间隔(秒)。Turnitin 一般 1-5 分钟出结果,30 秒一次比较合适
|
||
*/
|
||
const POLL_INTERVAL = 30;
|
||
|
||
/**
|
||
* 最长轮询次数(30s × 60 = 30 分钟)
|
||
*/
|
||
const MAX_POLL_ATTEMPTS = 60;
|
||
|
||
// ---------- 顶层入口 ----------
|
||
|
||
/**
|
||
* 提交查重(入队,立即返回 check_id)
|
||
*
|
||
* @param int $articleId 投稿 ID
|
||
* @param string $filePath 本地可读的 PDF/DOCX 绝对路径
|
||
* @param int $triggeredBy 触发人 user_id(手工触发时编辑后台的 user_id)
|
||
* @param string $source 'manual' / 'auto_xxx'
|
||
* @return int check_id
|
||
*/
|
||
public function submit($articleId, $filePath, $triggeredBy = 0, $source = 'manual')
|
||
{
|
||
if (!is_file($filePath) || !is_readable($filePath)) {
|
||
throw new Exception("File not readable: {$filePath}");
|
||
}
|
||
|
||
$journalId = (int) Db::name('article')
|
||
->where('article_id', $articleId)
|
||
->value('journal_id');
|
||
|
||
$now = time();
|
||
$checkId = Db::name('plagiarism_check')->insertGetId([
|
||
'article_id' => $articleId,
|
||
'journal_id' => $journalId,
|
||
'triggered_by' => $triggeredBy,
|
||
'trigger_source' => $source,
|
||
'state' => 1, // 上传中
|
||
'source_file_name' => basename($filePath),
|
||
'source_file_size' => filesize($filePath) ?: 0,
|
||
'ctime' => $now,
|
||
'utime' => $now,
|
||
]);
|
||
|
||
// 入队执行:上传 + 触发 similarity
|
||
Queue::push(
|
||
'app\\api\\job\\PlagiarismRun',
|
||
['check_id' => $checkId, 'file_path' => $filePath],
|
||
'plagiarism'
|
||
);
|
||
|
||
return (int)$checkId;
|
||
}
|
||
|
||
/**
|
||
* Job 调用:上传文件到 Turnitin 并触发 similarity,然后入队 PlagiarismPoll
|
||
*/
|
||
public function runUploadAndTrigger($checkId, $filePath)
|
||
{
|
||
$check = $this->mustGetCheck($checkId);
|
||
|
||
try {
|
||
$tii = new TurnitinService();
|
||
|
||
// 1. 创建 submission
|
||
$articleTitle = (string) Db::name('article')
|
||
->where('article_id', $check['article_id'])
|
||
->value('title');
|
||
if ($articleTitle === '') {
|
||
$articleTitle = 'Article #' . $check['article_id'];
|
||
}
|
||
|
||
$createResp = $tii->createSubmission([
|
||
'title' => mb_substr($articleTitle, 0, 250),
|
||
'owner' => 'editor_' . $check['triggered_by'],
|
||
'submitter' => 'editor_' . $check['triggered_by'],
|
||
'metadata' => [
|
||
'article_id' => (string)$check['article_id'],
|
||
'check_id' => (string)$check['check_id'],
|
||
],
|
||
]);
|
||
$submissionId = isset($createResp['id']) ? $createResp['id'] : '';
|
||
if ($submissionId === '') {
|
||
throw new Exception('Turnitin createSubmission returned empty id: ' . json_encode($createResp));
|
||
}
|
||
|
||
$this->updateCheck($checkId, [
|
||
'tii_submission_id' => $submissionId,
|
||
'raw_response' => json_encode($createResp, JSON_UNESCAPED_UNICODE),
|
||
]);
|
||
|
||
// 2. 上传文件
|
||
$tii->uploadFile($submissionId, $filePath, basename($filePath));
|
||
|
||
// 3. 触发 similarity
|
||
$simResp = $tii->triggerSimilarity($submissionId);
|
||
|
||
$this->updateCheck($checkId, [
|
||
'state' => 2, // 比对中
|
||
'tii_report_status' => 'PROCESSING',
|
||
'raw_response' => json_encode($simResp, JSON_UNESCAPED_UNICODE),
|
||
]);
|
||
|
||
// 4. 排队首次轮询(晚一点开始,让 Turnitin 先处理)
|
||
Queue::later(
|
||
self::POLL_INTERVAL,
|
||
'app\\api\\job\\PlagiarismPoll',
|
||
['check_id' => $checkId, 'attempt' => 1],
|
||
'plagiarism'
|
||
);
|
||
} catch (\Throwable $e) {
|
||
$this->markFailed($checkId, '[upload] ' . $e->getMessage());
|
||
throw $e;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Job 调用:轮询 similarity 状态,完成后下载 PDF。未完成则重新入队。
|
||
*/
|
||
public function runPollStatus($checkId, $attempt = 1)
|
||
{
|
||
$check = $this->mustGetCheck($checkId);
|
||
if (empty($check['tii_submission_id'])) {
|
||
$this->markFailed($checkId, '[poll] tii_submission_id empty');
|
||
return;
|
||
}
|
||
|
||
try {
|
||
$tii = new TurnitinService();
|
||
$statusResp = $tii->getSimilarityStatus($check['tii_submission_id']);
|
||
$status = isset($statusResp['status']) ? strtoupper($statusResp['status']) : '';
|
||
|
||
$this->updateCheck($checkId, [
|
||
'tii_report_status' => $status,
|
||
'attempts' => $attempt,
|
||
'raw_response' => json_encode($statusResp, JSON_UNESCAPED_UNICODE),
|
||
]);
|
||
|
||
if ($status === 'COMPLETE') {
|
||
$score = isset($statusResp['overall_match_percentage'])
|
||
? floatval($statusResp['overall_match_percentage']) : 0;
|
||
|
||
// 下载 PDF + 取在线查看 URL
|
||
$localPdf = $this->downloadAndStorePdf($tii, $check['tii_submission_id'], $checkId);
|
||
$viewerInfo = $this->refreshViewerUrl($tii, $check['tii_submission_id']);
|
||
|
||
$this->updateCheck($checkId, [
|
||
'state' => 3,
|
||
'similarity_score' => $score,
|
||
'pdf_local_path' => $localPdf,
|
||
'view_only_url' => $viewerInfo['url'],
|
||
'view_only_url_expire' => $viewerInfo['expire'],
|
||
'error_msg' => '',
|
||
]);
|
||
return;
|
||
}
|
||
|
||
if ($status === 'ERROR') {
|
||
$errMsg = isset($statusResp['error_code']) ? (string)$statusResp['error_code'] : 'Turnitin reported ERROR';
|
||
$this->markFailed($checkId, '[poll] ' . $errMsg);
|
||
return;
|
||
}
|
||
|
||
// PROCESSING 或其它中间态:继续轮询
|
||
if ($attempt >= self::MAX_POLL_ATTEMPTS) {
|
||
$this->markFailed($checkId, '[poll] timeout after ' . $attempt . ' attempts');
|
||
return;
|
||
}
|
||
Queue::later(
|
||
self::POLL_INTERVAL,
|
||
'app\\api\\job\\PlagiarismPoll',
|
||
['check_id' => $checkId, 'attempt' => $attempt + 1],
|
||
'plagiarism'
|
||
);
|
||
} catch (\Throwable $e) {
|
||
// 网络抖动不要直接 fail,给一定容错次数
|
||
if ($attempt < self::MAX_POLL_ATTEMPTS) {
|
||
Queue::later(
|
||
self::POLL_INTERVAL,
|
||
'app\\api\\job\\PlagiarismPoll',
|
||
['check_id' => $checkId, 'attempt' => $attempt + 1],
|
||
'plagiarism'
|
||
);
|
||
$this->updateCheck($checkId, [
|
||
'attempts' => $attempt,
|
||
'error_msg' => '[poll] transient: ' . $e->getMessage(),
|
||
]);
|
||
return;
|
||
}
|
||
$this->markFailed($checkId, '[poll] exhausted: ' . $e->getMessage());
|
||
throw $e;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 重新生成在线查看 URL(已有的过期了用)
|
||
*
|
||
* @return array{url:string, expire:int, local_pdf:string}
|
||
*/
|
||
public function refreshViewerUrlFor($checkId)
|
||
{
|
||
$check = $this->mustGetCheck($checkId);
|
||
if (empty($check['tii_submission_id'])) {
|
||
throw new Exception('check has no tii_submission_id');
|
||
}
|
||
$tii = new TurnitinService();
|
||
$info = $this->refreshViewerUrl($tii, $check['tii_submission_id']);
|
||
$this->updateCheck($checkId, [
|
||
'view_only_url' => $info['url'],
|
||
'view_only_url_expire' => $info['expire'],
|
||
]);
|
||
return [
|
||
'url' => $info['url'],
|
||
'expire' => $info['expire'],
|
||
'local_pdf' => $check['pdf_local_path'],
|
||
];
|
||
}
|
||
|
||
// ---------- 内部 ----------
|
||
|
||
private function refreshViewerUrl($tii, $submissionId)
|
||
{
|
||
$resp = $tii->getViewerUrl($submissionId);
|
||
$url = '';
|
||
if (isset($resp['viewer_url'])) {
|
||
$url = (string)$resp['viewer_url'];
|
||
} elseif (isset($resp['url'])) {
|
||
$url = (string)$resp['url'];
|
||
}
|
||
// 默认 2 小时过期,保守起见
|
||
return ['url' => $url, 'expire' => time() + 7200];
|
||
}
|
||
|
||
/**
|
||
* 触发生成 + 轮询 + 下载 PDF 到本地,返回相对路径
|
||
*/
|
||
private function downloadAndStorePdf($tii, $submissionId, $checkId)
|
||
{
|
||
// 1. 请求生成
|
||
$req = $tii->requestPdfReport($submissionId);
|
||
$pdfId = isset($req['id']) ? $req['id'] : '';
|
||
if ($pdfId === '') {
|
||
throw new Exception('requestPdfReport empty id: ' . json_encode($req));
|
||
}
|
||
|
||
// 2. 内联轮询 PDF 状态(最多 3 分钟,每 6 秒一次)
|
||
$maxLoops = 30;
|
||
for ($i = 0; $i < $maxLoops; $i++) {
|
||
$st = $tii->getPdfReportStatus($submissionId, $pdfId);
|
||
$stCode = isset($st['status']) ? strtoupper($st['status']) : '';
|
||
if ($stCode === 'SUCCESS') {
|
||
break;
|
||
}
|
||
if ($stCode === 'FAILED') {
|
||
throw new Exception('PDF report generation failed: ' . json_encode($st));
|
||
}
|
||
sleep(6);
|
||
}
|
||
// 3. 下载
|
||
$binary = $tii->downloadPdfReport($submissionId, $pdfId);
|
||
if (!is_string($binary) || strlen($binary) < 100) {
|
||
throw new Exception('downloaded pdf is empty/too small');
|
||
}
|
||
|
||
// 4. 落盘
|
||
$rootDir = ROOT_PATH ?: dirname(dirname(__DIR__));
|
||
$absDir = rtrim($rootDir, '/\\') . DIRECTORY_SEPARATOR . self::REPORT_DIR;
|
||
if (!is_dir($absDir)) {
|
||
@mkdir($absDir, 0755, true);
|
||
}
|
||
$filename = sprintf('check_%d_%s.pdf', $checkId, date('Ymd_His'));
|
||
$absPath = $absDir . DIRECTORY_SEPARATOR . $filename;
|
||
$bytes = file_put_contents($absPath, $binary);
|
||
if ($bytes === false || $bytes < 100) {
|
||
throw new Exception('failed to save pdf to ' . $absPath);
|
||
}
|
||
return self::REPORT_DIR . '/' . $filename;
|
||
}
|
||
|
||
private function mustGetCheck($checkId)
|
||
{
|
||
$row = Db::name('plagiarism_check')->where('check_id', $checkId)->find();
|
||
if (!$row) {
|
||
throw new Exception("plagiarism_check #{$checkId} not found");
|
||
}
|
||
return $row;
|
||
}
|
||
|
||
private function updateCheck($checkId, array $data)
|
||
{
|
||
$data['utime'] = time();
|
||
Db::name('plagiarism_check')->where('check_id', $checkId)->update($data);
|
||
}
|
||
|
||
private function markFailed($checkId, $errMsg)
|
||
{
|
||
$this->updateCheck($checkId, [
|
||
'state' => 4,
|
||
'error_msg' => mb_substr($errMsg, 0, 1000),
|
||
]);
|
||
}
|
||
|
||
/**
|
||
* 从 t_article_file 找到投稿主稿(manuscirpt)的本地绝对路径。
|
||
* file_url 在系统里可能是 URL 或相对路径,调用方负责保证可读。
|
||
*
|
||
* @return string 文件绝对路径,找不到时抛异常
|
||
*/
|
||
public function locateArticleManuscript($articleId)
|
||
{
|
||
$row = Db::name('article_file')
|
||
->where('article_id', $articleId)
|
||
->where('type_name', 'manuscirpt') // 历史拼写
|
||
->order('article_file_id desc')
|
||
->find();
|
||
if (!$row || empty($row['file_url'])) {
|
||
throw new Exception("article #{$articleId} has no manuscirpt file");
|
||
}
|
||
return $this->resolveFileUrlToLocal($row['file_url']);
|
||
}
|
||
|
||
/**
|
||
* 把 file_url(可能是 http URL 或相对路径)解析成本地绝对路径。
|
||
* 不同环境部署可能有差异,这里用 .env 配置的 STATIC_ROOT 作前缀。
|
||
*/
|
||
public function resolveFileUrlToLocal($fileUrl)
|
||
{
|
||
$fileUrl = trim((string)$fileUrl);
|
||
if ($fileUrl === '') {
|
||
throw new Exception('empty file_url');
|
||
}
|
||
// 已是绝对路径
|
||
if (preg_match('/^([a-zA-Z]:[\\\\\/]|\/)/', $fileUrl) && is_file($fileUrl)) {
|
||
return $fileUrl;
|
||
}
|
||
|
||
$staticRoot = trim((string)Env::get('plagiarism.static_root', ''));
|
||
$cdnPrefix = trim((string)Env::get('plagiarism.cdn_prefix', ''));
|
||
|
||
// 是 http URL:先试着剥掉 cdn 前缀,映射到本地
|
||
if (preg_match('#^https?://#i', $fileUrl)) {
|
||
if ($cdnPrefix !== '' && stripos($fileUrl, $cdnPrefix) === 0) {
|
||
$rel = ltrim(substr($fileUrl, strlen($cdnPrefix)), '/');
|
||
$local = rtrim($staticRoot, '/\\') . DIRECTORY_SEPARATOR . $rel;
|
||
if (is_file($local)) {
|
||
return $local;
|
||
}
|
||
}
|
||
// 实在不行,下载到 runtime/plagiarism/tmp 临时目录
|
||
return $this->downloadRemoteFile($fileUrl);
|
||
}
|
||
|
||
// 相对路径:拼 static_root
|
||
if ($staticRoot !== '') {
|
||
$local = rtrim($staticRoot, '/\\') . DIRECTORY_SEPARATOR . ltrim($fileUrl, '/\\');
|
||
if (is_file($local)) {
|
||
return $local;
|
||
}
|
||
}
|
||
|
||
throw new Exception("cannot resolve file_url to local path: {$fileUrl} (set [plagiarism] STATIC_ROOT/CDN_PREFIX in .env)");
|
||
}
|
||
|
||
private function downloadRemoteFile($url)
|
||
{
|
||
$rootDir = ROOT_PATH ?: dirname(dirname(__DIR__));
|
||
$tmpDir = rtrim($rootDir, '/\\') . DIRECTORY_SEPARATOR . self::REPORT_DIR . DIRECTORY_SEPARATOR . 'tmp';
|
||
if (!is_dir($tmpDir)) {
|
||
@mkdir($tmpDir, 0755, true);
|
||
}
|
||
$ext = pathinfo(parse_url($url, PHP_URL_PATH), PATHINFO_EXTENSION) ?: 'pdf';
|
||
$local = $tmpDir . DIRECTORY_SEPARATOR . md5($url) . '_' . time() . '.' . $ext;
|
||
|
||
$ch = curl_init($url);
|
||
$fh = fopen($local, 'wb');
|
||
curl_setopt_array($ch, [
|
||
CURLOPT_FILE => $fh,
|
||
CURLOPT_FOLLOWLOCATION => true,
|
||
CURLOPT_TIMEOUT => 120,
|
||
CURLOPT_SSL_VERIFYPEER => false,
|
||
]);
|
||
$ok = curl_exec($ch);
|
||
$code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
||
curl_close($ch);
|
||
fclose($fh);
|
||
if (!$ok || $code !== 200 || filesize($local) < 100) {
|
||
@unlink($local);
|
||
throw new Exception("download failed url={$url} http={$code}");
|
||
}
|
||
return $local;
|
||
}
|
||
|
||
public function getCheck($checkId)
|
||
{
|
||
return Db::name('plagiarism_check')->where('check_id', $checkId)->find();
|
||
}
|
||
}
|