修改自动推广的相关任务
This commit is contained in:
423
application/common/PlagiarismService.php
Normal file
423
application/common/PlagiarismService.php
Normal file
@@ -0,0 +1,423 @@
|
||||
<?php
|
||||
|
||||
namespace app\common;
|
||||
|
||||
use think\Db;
|
||||
use think\Env;
|
||||
use think\Queue;
|
||||
use think\Exception;
|
||||
|
||||
/**
|
||||
* 查重业务层:把 TurnitinService 的低层调用包装成"按 article 查重"的高层流程,
|
||||
* 并维护 t_plagiarism_check 状态机。
|
||||
*
|
||||
* 状态流:
|
||||
* submit() → state=1(上传中),入队 PlagiarismRun
|
||||
* PlagiarismRun.fire → 上传 + 触发 similarity → state=2(比对中),入队 PlagiarismPoll
|
||||
* PlagiarismPoll.fire → 轮询 status,完成后下载 PDF → state=3(完成)
|
||||
* 任意环节抛异常 → state=4(失败),写 error_msg
|
||||
*/
|
||||
class PlagiarismService
|
||||
{
|
||||
/**
 * Delay, in seconds, between two similarity-status polls. Turnitin usually
 * delivers a result within 1-5 minutes, so one poll every 30 seconds is a
 * reasonable pace.
 */
const POLL_INTERVAL = 30;

/**
 * Upper bound on the number of poll attempts (30 s x 60 = 30 minutes).
 */
const MAX_POLL_ATTEMPTS = 60;

/**
 * Directory in which downloaded report PDFs are stored, relative to the
 * project root. Reports are kept permanently.
 */
const REPORT_DIR = 'runtime/plagiarism';
|
||||
|
||||
// ---------- 顶层入口 ----------
|
||||
|
||||
/**
 * Register a plagiarism check for an article and queue it for processing.
 *
 * Inserts a plagiarism_check row in state 1 (uploading), pushes a
 * PlagiarismRun job onto the "plagiarism" queue and returns right away;
 * the upload itself happens in the queued job.
 *
 * @param int    $articleId   Article (submission) ID.
 * @param string $filePath    Absolute path of a locally readable PDF/DOCX file.
 * @param int    $triggeredBy user_id of whoever triggered the check (the
 *                            back-office editor for manual runs).
 * @param string $source      'manual' / 'auto_xxx'.
 * @return int check_id of the newly created row.
 * @throws Exception  When $filePath is not a readable regular file.
 * @throws \Throwable When the job cannot be enqueued; the row is marked
 *                    failed (state 4) before the error is re-thrown.
 */
public function submit($articleId, $filePath, $triggeredBy = 0, $source = 'manual')
{
    if (!is_file($filePath) || !is_readable($filePath)) {
        throw new Exception("File not readable: {$filePath}");
    }

    // The (int) cast turns a missing value into 0, so an unknown article
    // does not abort the submission here.
    $journalId = (int) Db::name('article')
        ->where('article_id', $articleId)
        ->value('journal_id');

    $now = time();
    $checkId = Db::name('plagiarism_check')->insertGetId([
        'article_id'       => $articleId,
        'journal_id'       => $journalId,
        'triggered_by'     => $triggeredBy,
        'trigger_source'   => $source,
        'state'            => 1, // uploading
        'source_file_name' => basename($filePath),
        'source_file_size' => filesize($filePath) ?: 0,
        'ctime'            => $now,
        'utime'            => $now,
    ]);

    // Hand the actual work (upload + similarity trigger) to the queue.
    try {
        Queue::push(
            'app\\api\\job\\PlagiarismRun',
            ['check_id' => $checkId, 'file_path' => $filePath],
            'plagiarism'
        );
    } catch (\Throwable $e) {
        // Without this the row would sit in state 1 forever because no job
        // exists to move it on. Only rows still in state 1 are touched so a
        // more specific failure already recorded by the job itself (possible
        // if the queue runs jobs synchronously - TODO confirm driver) is not
        // overwritten.
        Db::name('plagiarism_check')
            ->where('check_id', $checkId)
            ->where('state', 1)
            ->update([
                'state'     => 4,
                'error_msg' => mb_substr('[enqueue] ' . $e->getMessage(), 0, 1000),
                'utime'     => time(),
            ]);
        throw $e;
    }

    return (int)$checkId;
}
|
||||
|
||||
/**
 * Queue-job entry point: create a Turnitin submission for the check, upload
 * the file, trigger the similarity report and schedule the first status poll.
 *
 * On any failure inside the Turnitin workflow the check is marked failed
 * (state 4) with an "[upload] " prefixed message and the error is re-thrown.
 *
 * @param int    $checkId  Primary key of the plagiarism_check row.
 * @param string $filePath Path of the file to upload.
 * @return void
 * @throws Exception  When the check row does not exist or Turnitin returns
 *                    no submission id.
 * @throws \Throwable Whatever the database, queue or Turnitin calls raise.
 */
public function runUploadAndTrigger($checkId, $filePath)
{
    $record = $this->mustGetCheck($checkId);

    try {
        $turnitin = new TurnitinService();

        // Step 1: create the submission. The article title is used as the
        // submission title, with a generic fallback when it is empty.
        $title = (string) Db::name('article')
            ->where('article_id', $record['article_id'])
            ->value('title');
        if ($title === '') {
            $title = 'Article #' . $record['article_id'];
        }
        $editorTag = 'editor_' . $record['triggered_by'];

        $created = $turnitin->createSubmission([
            'title'     => mb_substr($title, 0, 250),
            'owner'     => $editorTag,
            'submitter' => $editorTag,
            'metadata'  => [
                'article_id' => (string)$record['article_id'],
                'check_id'   => (string)$record['check_id'],
            ],
        ]);

        if (!isset($created['id']) || $created['id'] === '') {
            throw new Exception('Turnitin createSubmission returned empty id: ' . json_encode($created));
        }
        $submissionId = $created['id'];

        // Persist the submission id immediately so later steps (and the
        // poll job) can find it even if the upload below fails.
        $this->updateCheck($checkId, [
            'tii_submission_id' => $submissionId,
            'raw_response'      => json_encode($created, JSON_UNESCAPED_UNICODE),
        ]);

        // Step 2: upload the file.
        $turnitin->uploadFile($submissionId, $filePath, basename($filePath));

        // Step 3: trigger the similarity report and move to state 2.
        $triggered = $turnitin->triggerSimilarity($submissionId);
        $this->updateCheck($checkId, [
            'state'             => 2, // comparing
            'tii_report_status' => 'PROCESSING',
            'raw_response'      => json_encode($triggered, JSON_UNESCAPED_UNICODE),
        ]);

        // Step 4: schedule the first poll with a delay so Turnitin has time
        // to start processing.
        $firstPoll = ['check_id' => $checkId, 'attempt' => 1];
        Queue::later(self::POLL_INTERVAL, 'app\\api\\job\\PlagiarismPoll', $firstPoll, 'plagiarism');
    } catch (\Throwable $e) {
        $this->markFailed($checkId, '[upload] ' . $e->getMessage());
        throw $e;
    }
}
|
||||
|
||||
/**
 * Queue-job entry point: poll the similarity status of a check.
 *
 * - COMPLETE: download the PDF report, fetch a viewer URL, move to state 3.
 * - ERROR: mark the check failed (state 4).
 * - anything else: re-queue another poll after POLL_INTERVAL seconds until
 *   MAX_POLL_ATTEMPTS is reached, then mark the check failed.
 *
 * Errors raised while polling are treated as transient: the poll is
 * re-queued and the message stored in error_msg. Only once the attempt
 * budget is used up is the check marked failed and the error re-thrown.
 *
 * @param int $checkId Primary key of the plagiarism_check row.
 * @param int $attempt 1-based number of this poll attempt.
 * @return void
 * @throws Exception  When the check row does not exist.
 * @throws \Throwable The last polling error once all attempts are exhausted.
 */
public function runPollStatus($checkId, $attempt = 1)
{
    $record = $this->mustGetCheck($checkId);
    if (empty($record['tii_submission_id'])) {
        $this->markFailed($checkId, '[poll] tii_submission_id empty');
        return;
    }
    $submissionId = $record['tii_submission_id'];

    try {
        $turnitin = new TurnitinService();
        $reply = $turnitin->getSimilarityStatus($submissionId);

        $status = '';
        if (isset($reply['status'])) {
            $status = strtoupper($reply['status']);
        }

        // Record every poll so progress is visible from the table.
        $this->updateCheck($checkId, [
            'tii_report_status' => $status,
            'attempts'          => $attempt,
            'raw_response'      => json_encode($reply, JSON_UNESCAPED_UNICODE),
        ]);

        if ($status === 'ERROR') {
            $reason = 'Turnitin reported ERROR';
            if (isset($reply['error_code'])) {
                $reason = (string)$reply['error_code'];
            }
            $this->markFailed($checkId, '[poll] ' . $reason);
            return;
        }

        if ($status === 'COMPLETE') {
            $score = 0;
            if (isset($reply['overall_match_percentage'])) {
                $score = floatval($reply['overall_match_percentage']);
            }

            // Store the PDF locally, then obtain the online viewer URL.
            $pdfPath = $this->downloadAndStorePdf($turnitin, $submissionId, $checkId);
            $viewer = $this->refreshViewerUrl($turnitin, $submissionId);

            $this->updateCheck($checkId, [
                'state'                => 3,
                'similarity_score'     => $score,
                'pdf_local_path'       => $pdfPath,
                'view_only_url'        => $viewer['url'],
                'view_only_url_expire' => $viewer['expire'],
                'error_msg'            => '',
            ]);
            return;
        }

        // PROCESSING or any other intermediate status: poll again unless
        // the attempt budget is exhausted.
        if ($attempt >= self::MAX_POLL_ATTEMPTS) {
            $this->markFailed($checkId, '[poll] timeout after ' . $attempt . ' attempts');
            return;
        }

        $nextPoll = ['check_id' => $checkId, 'attempt' => $attempt + 1];
        Queue::later(self::POLL_INTERVAL, 'app\\api\\job\\PlagiarismPoll', $nextPoll, 'plagiarism');
    } catch (\Throwable $e) {
        // A network hiccup should not fail the check outright; retry while
        // attempts remain.
        if ($attempt < self::MAX_POLL_ATTEMPTS) {
            $retryPoll = ['check_id' => $checkId, 'attempt' => $attempt + 1];
            Queue::later(self::POLL_INTERVAL, 'app\\api\\job\\PlagiarismPoll', $retryPoll, 'plagiarism');
            $this->updateCheck($checkId, [
                'attempts'  => $attempt,
                'error_msg' => '[poll] transient: ' . $e->getMessage(),
            ]);
            return;
        }

        $this->markFailed($checkId, '[poll] exhausted: ' . $e->getMessage());
        throw $e;
    }
}
|
||||
|
||||
/**
 * Generate a fresh online viewer URL for a check (for use once the stored
 * one has expired) and persist it on the check row.
 *
 * @param int $checkId Primary key of the plagiarism_check row.
 * @return array{url:string, expire:int, local_pdf:string} New viewer URL,
 *         its expiry timestamp, and the stored pdf_local_path of the check.
 * @throws Exception When the check does not exist or has no
 *                   tii_submission_id.
 */
public function refreshViewerUrlFor($checkId)
{
    $record = $this->mustGetCheck($checkId);
    if (empty($record['tii_submission_id'])) {
        throw new Exception('check has no tii_submission_id');
    }

    $viewer = $this->refreshViewerUrl(new TurnitinService(), $record['tii_submission_id']);

    $fields = [
        'view_only_url'        => $viewer['url'],
        'view_only_url_expire' => $viewer['expire'],
    ];
    $this->updateCheck($checkId, $fields);

    return [
        'url'       => $viewer['url'],
        'expire'    => $viewer['expire'],
        'local_pdf' => $record['pdf_local_path'],
    ];
}
|
||||
|
||||
// ---------- 内部 ----------
|
||||
|
||||
/**
 * Ask Turnitin for a viewer URL and pair it with an expiry timestamp.
 *
 * The URL is read from the "viewer_url" field of the response, falling back
 * to "url"; it is an empty string when neither field is set.
 *
 * @param object $tii          Client exposing getViewerUrl($submissionId).
 * @param string $submissionId Turnitin submission id.
 * @return array{url:string, expire:int}
 */
private function refreshViewerUrl($tii, $submissionId)
{
    $reply = $tii->getViewerUrl($submissionId);

    $link = '';
    foreach (['viewer_url', 'url'] as $field) {
        if (isset($reply[$field])) {
            $link = (string)$reply[$field];
            break;
        }
    }

    // The expiry is not read from the response; assume a conservative
    // lifetime of 2 hours.
    return ['url' => $link, 'expire' => time() + 7200];
}
|
||||
|
||||
/**
 * Request the PDF report for a submission, wait until it is generated,
 * download it and store it under REPORT_DIR.
 *
 * The wait is done inline and blocks the calling job for up to roughly
 * 3 minutes (30 status checks, 6 seconds apart).
 *
 * @param object $tii          Client exposing requestPdfReport(),
 *                             getPdfReportStatus() and downloadPdfReport().
 * @param string $submissionId Turnitin submission id.
 * @param int    $checkId      Check id, used in the stored file name.
 * @return string Path of the stored PDF relative to the project root.
 * @throws Exception When the report request returns no id, generation fails
 *                   or does not finish in time, the download is empty or
 *                   too small, or the file cannot be written.
 */
private function downloadAndStorePdf($tii, $submissionId, $checkId)
{
    // 1. Ask Turnitin to generate the PDF.
    $req = $tii->requestPdfReport($submissionId);
    $pdfId = isset($req['id']) ? $req['id'] : '';
    if ($pdfId === '') {
        throw new Exception('requestPdfReport empty id: ' . json_encode($req));
    }

    // 2. Poll the generation status inline.
    $maxLoops = 30;
    $ready = false;
    for ($i = 0; $i < $maxLoops; $i++) {
        $st = $tii->getPdfReportStatus($submissionId, $pdfId);
        $stCode = isset($st['status']) ? strtoupper($st['status']) : '';
        if ($stCode === 'SUCCESS') {
            $ready = true;
            break;
        }
        if ($stCode === 'FAILED') {
            throw new Exception('PDF report generation failed: ' . json_encode($st));
        }
        // No point in sleeping after the final check.
        if ($i < $maxLoops - 1) {
            sleep(6);
        }
    }
    if (!$ready) {
        // Previously the code fell through and tried to download a report
        // that was never confirmed as generated.
        throw new Exception('PDF report not ready after ' . $maxLoops . ' status checks');
    }

    // 3. Download.
    $binary = $tii->downloadPdfReport($submissionId, $pdfId);
    if (!is_string($binary) || strlen($binary) < 100) {
        throw new Exception('downloaded pdf is empty/too small');
    }

    // 4. Write to disk. ROOT_PATH is only read when it is actually defined,
    // otherwise the project root is derived from this file's location.
    $rootDir = (defined('ROOT_PATH') && ROOT_PATH) ? ROOT_PATH : dirname(dirname(__DIR__));
    $absDir = rtrim($rootDir, '/\\') . DIRECTORY_SEPARATOR . self::REPORT_DIR;
    // The second is_dir() covers a concurrent job creating the directory
    // between the first check and mkdir().
    if (!is_dir($absDir) && !@mkdir($absDir, 0755, true) && !is_dir($absDir)) {
        throw new Exception('failed to create report dir ' . $absDir);
    }

    $filename = sprintf('check_%d_%s.pdf', $checkId, date('Ymd_His'));
    $absPath = $absDir . DIRECTORY_SEPARATOR . $filename;
    $bytes = file_put_contents($absPath, $binary);
    if ($bytes === false || $bytes !== strlen($binary)) {
        // Do not leave a truncated report behind.
        @unlink($absPath);
        throw new Exception('failed to save pdf to ' . $absPath);
    }

    return self::REPORT_DIR . '/' . $filename;
}
|
||||
|
||||
/**
 * Load a plagiarism_check row by id, failing loudly when it is missing.
 *
 * @param int $checkId Primary key of the plagiarism_check row.
 * @return array The row as returned by the query builder.
 * @throws Exception When no row with this check_id exists.
 */
private function mustGetCheck($checkId)
{
    $record = Db::name('plagiarism_check')->where('check_id', $checkId)->find();
    if ($record) {
        return $record;
    }

    throw new Exception("plagiarism_check #{$checkId} not found");
}
|
||||
|
||||
/**
 * Update columns of a plagiarism_check row, always refreshing utime.
 *
 * @param int   $checkId Primary key of the plagiarism_check row.
 * @param array $data    Column => value pairs to write; any 'utime' entry
 *                       is replaced with the current timestamp.
 * @return void
 */
private function updateCheck($checkId, array $data)
{
    $fields = array_replace($data, ['utime' => time()]);

    Db::name('plagiarism_check')
        ->where('check_id', $checkId)
        ->update($fields);
}
|
||||
|
||||
/**
 * Put a check into the failed state (4) and store the error message,
 * truncated to 1000 characters.
 *
 * @param int    $checkId Primary key of the plagiarism_check row.
 * @param string $errMsg  Human-readable failure description.
 * @return void
 */
private function markFailed($checkId, $errMsg)
{
    $truncated = mb_substr($errMsg, 0, 1000);

    $this->updateCheck($checkId, ['state' => 4, 'error_msg' => $truncated]);
}
|
||||
|
||||
/**
 * Find the local path of an article's main manuscript file.
 *
 * Picks the newest article_file row (highest article_file_id) whose
 * type_name is 'manuscirpt' (historical spelling used in the data) and
 * resolves its file_url via resolveFileUrlToLocal(). file_url may be a URL
 * or a relative path.
 *
 * @param int $articleId Article ID.
 * @return string Local file path.
 * @throws Exception When the article has no manuscript file or the
 *                   file_url cannot be resolved.
 */
public function locateArticleManuscript($articleId)
{
    $query = Db::name('article_file')
        ->where('article_id', $articleId)
        ->where('type_name', 'manuscirpt') // historical spelling
        ->order('article_file_id desc');
    $file = $query->find();

    $hasUrl = $file && !empty($file['file_url']);
    if (!$hasUrl) {
        throw new Exception("article #{$articleId} has no manuscirpt file");
    }

    return $this->resolveFileUrlToLocal($file['file_url']);
}
|
||||
|
||||
/**
 * Turn a file_url (absolute path, http(s) URL or relative path) into a
 * local file path.
 *
 * Resolution order:
 *  1. An absolute path that exists on disk is returned unchanged.
 *  2. An http(s) URL starting with the configured CDN prefix is mapped onto
 *     the configured static root; if that file does not exist (or the URL
 *     has another prefix) the file is downloaded to a temporary location.
 *  3. Anything else is treated as a path relative to the static root.
 *
 * Deployments differ, so both prefixes come from the environment settings
 * `plagiarism.static_root` and `plagiarism.cdn_prefix`.
 *
 * @param string $fileUrl Stored file_url value.
 * @return string Path of an existing local file.
 * @throws Exception When $fileUrl is empty, cannot be mapped to an existing
 *                   file, or the remote download fails.
 */
public function resolveFileUrlToLocal($fileUrl)
{
    $fileUrl = trim((string)$fileUrl);
    if ($fileUrl === '') {
        throw new Exception('empty file_url');
    }

    // Already an absolute path (Windows drive or leading slash) that exists.
    if (preg_match('/^([a-zA-Z]:[\\\\\/]|\/)/', $fileUrl) && is_file($fileUrl)) {
        return $fileUrl;
    }

    $staticRoot = trim((string)Env::get('plagiarism.static_root', ''));
    $cdnPrefix  = trim((string)Env::get('plagiarism.cdn_prefix', ''));

    // http(s) URL: try stripping the CDN prefix and mapping to a local file.
    if (preg_match('#^https?://#i', $fileUrl)) {
        // Both settings are required for the mapping; with an empty static
        // root the candidate would be probed at the filesystem root.
        if ($staticRoot !== '' && $cdnPrefix !== '' && stripos($fileUrl, $cdnPrefix) === 0) {
            $rel = ltrim(substr($fileUrl, strlen($cdnPrefix)), '/');
            $local = rtrim($staticRoot, '/\\') . DIRECTORY_SEPARATOR . $rel;
            if (is_file($local)) {
                return $local;
            }
        }
        // Last resort: download into the runtime/plagiarism/tmp directory.
        return $this->downloadRemoteFile($fileUrl);
    }

    // Relative path (or a non-existent "absolute" one): prepend static root.
    if ($staticRoot !== '') {
        $local = rtrim($staticRoot, '/\\') . DIRECTORY_SEPARATOR . ltrim($fileUrl, '/\\');
        if (is_file($local)) {
            return $local;
        }
    }

    throw new Exception("cannot resolve file_url to local path: {$fileUrl} (set [plagiarism] STATIC_ROOT/CDN_PREFIX in .env)");
}
|
||||
|
||||
/**
 * Download a remote file into REPORT_DIR/tmp and return its local path.
 *
 * The file name is derived from the md5 of the URL plus the current
 * timestamp; the extension is taken from the URL path when it is a short
 * alphanumeric token, otherwise "pdf" is used. The temporary file is NOT
 * removed by this class after use.
 *
 * @param string $url http(s) URL to fetch.
 * @return string Absolute path of the downloaded file.
 * @throws Exception When the temp directory or file cannot be created, the
 *                   transfer fails, the HTTP status is not 200, or the
 *                   result is smaller than 100 bytes.
 */
private function downloadRemoteFile($url)
{
    // ROOT_PATH is only read when it is actually defined; otherwise the
    // project root is derived from this file's location.
    $rootDir = (defined('ROOT_PATH') && ROOT_PATH) ? ROOT_PATH : dirname(dirname(__DIR__));
    $tmpDir = rtrim($rootDir, '/\\') . DIRECTORY_SEPARATOR . self::REPORT_DIR . DIRECTORY_SEPARATOR . 'tmp';
    // The second is_dir() covers a concurrent job creating the directory.
    if (!is_dir($tmpDir) && !@mkdir($tmpDir, 0755, true) && !is_dir($tmpDir)) {
        throw new Exception("cannot create temp dir: {$tmpDir}");
    }

    // The extension comes from an external URL, so only accept a plain
    // alphanumeric token before using it in a file name.
    $ext = pathinfo((string)parse_url($url, PHP_URL_PATH), PATHINFO_EXTENSION);
    if (!preg_match('/^[A-Za-z0-9]{1,10}$/', (string)$ext)) {
        $ext = 'pdf';
    }
    $local = $tmpDir . DIRECTORY_SEPARATOR . md5($url) . '_' . time() . '.' . $ext;

    $ch = curl_init($url);
    if ($ch === false) {
        throw new Exception("download failed url={$url} http=0");
    }
    $fh = @fopen($local, 'wb');
    if ($fh === false) {
        curl_close($ch);
        throw new Exception("cannot open temp file for writing: {$local}");
    }

    $ok = false;
    $code = 0;
    try {
        curl_setopt_array($ch, [
            CURLOPT_FILE           => $fh,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_TIMEOUT        => 120,
            // NOTE(review): TLS peer verification is disabled, which allows
            // man-in-the-middle tampering with the downloaded manuscript.
            // Kept as-is to avoid breaking existing deployments; enable it
            // once the servers' CA bundle is confirmed.
            CURLOPT_SSL_VERIFYPEER => false,
        ]);
        $ok = curl_exec($ch);
        $code = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
    } finally {
        // Release both handles even if a cURL call throws.
        curl_close($ch);
        fclose($fh);
    }

    // Make sure filesize() sees the size after the write, not a cached stat.
    clearstatcache(true, $local);
    if (!$ok || $code !== 200 || filesize($local) < 100) {
        @unlink($local);
        throw new Exception("download failed url={$url} http={$code}");
    }

    return $local;
}
|
||||
|
||||
/**
 * Fetch a plagiarism_check row by id.
 *
 * Unlike mustGetCheck() this does not throw when the row is missing; the
 * query builder's own "not found" result is passed through to the caller.
 *
 * @param int $checkId Primary key of the plagiarism_check row.
 * @return mixed Result of the query builder's find() call.
 */
public function getCheck($checkId)
{
    $query = Db::name('plagiarism_check')->where('check_id', $checkId);

    return $query->find();
}
|
||||
}
|
||||
Reference in New Issue
Block a user