修改自动推广的相关任务

This commit is contained in:
wangjinlei
2026-05-07 11:45:55 +08:00
parent a68742d2c2
commit b1e978ed73
7 changed files with 1221 additions and 0 deletions

View File

@@ -0,0 +1,423 @@
<?php
namespace app\common;
use think\Db;
use think\Env;
use think\Queue;
use think\Exception;
/**
 * Plagiarism-check business layer: wraps the low-level TurnitinService calls into a
 * high-level "check one article" workflow and maintains the t_plagiarism_check
 * state machine.
 *
 * State flow:
 *   submit()             -> state=1 (uploading), enqueues PlagiarismRun
 *   PlagiarismRun.fire   -> upload + trigger similarity -> state=2 (comparing), enqueues PlagiarismPoll
 *   PlagiarismPoll.fire  -> polls the status; on completion downloads the PDF -> state=3 (done)
 *   any step that throws -> state=4 (failed), error_msg is written
 */
class PlagiarismService
{
    /**
     * Local directory for report PDFs, relative to the project root (kept permanently).
     */
    const REPORT_DIR = 'runtime/plagiarism';

    /**
     * Seconds between similarity-status polls. The original author notes Turnitin
     * usually produces a result within 1-5 minutes, so 30 seconds is a reasonable pace.
     */
    const POLL_INTERVAL = 30;

    /**
     * Maximum number of status polls (30s x 60 = 30 minutes).
     */
    const MAX_POLL_ATTEMPTS = 60;

    /**
     * Values written to the plagiarism_check.state column.
     */
    const STATE_UPLOADING = 1;
    const STATE_COMPARING = 2;
    const STATE_DONE = 3;
    const STATE_FAILED = 4;

    /**
     * Inline PDF-generation polling: at most PDF_POLL_MAX_LOOPS status checks,
     * PDF_POLL_SLEEP seconds apart (roughly 3 minutes in total).
     */
    const PDF_POLL_MAX_LOOPS = 30;
    const PDF_POLL_SLEEP = 6;

    // ---------- Top-level entry points ----------

    /**
     * Submit a plagiarism check: inserts the check row, enqueues the upload job and
     * returns immediately with the new check_id.
     *
     * @param int    $articleId   Article (submission) ID
     * @param string $filePath    Locally readable PDF/DOCX path (absolute)
     * @param int    $triggeredBy user_id of whoever triggered the check (the back-office
     *                            editor's user_id for manual triggers)
     * @param string $source      'manual' / 'auto_xxx'
     * @return int check_id
     * @throws Exception   when the file is missing or unreadable
     * @throws \Throwable  rethrown when the job cannot be enqueued (the row is marked failed first)
     */
    public function submit($articleId, $filePath, $triggeredBy = 0, $source = 'manual')
    {
        if (!is_file($filePath) || !is_readable($filePath)) {
            throw new Exception("File not readable: {$filePath}");
        }

        // value() yields null for an unknown article, which the cast turns into 0.
        $journalId = (int) Db::name('article')
            ->where('article_id', $articleId)
            ->value('journal_id');

        $now = time();
        $checkId = Db::name('plagiarism_check')->insertGetId([
            'article_id'       => $articleId,
            'journal_id'       => $journalId,
            'triggered_by'     => $triggeredBy,
            'trigger_source'   => $source,
            'state'            => self::STATE_UPLOADING,
            'source_file_name' => basename($filePath),
            'source_file_size' => filesize($filePath) ?: 0,
            'ctime'            => $now,
            'utime'            => $now,
        ]);

        // Enqueue the job that uploads the file and triggers similarity. If enqueueing
        // fails, nothing will ever advance the row, so mark it failed instead of leaving
        // it stuck in the "uploading" state.
        try {
            Queue::push(
                'app\\api\\job\\PlagiarismRun',
                ['check_id' => $checkId, 'file_path' => $filePath],
                'plagiarism'
            );
        } catch (\Throwable $e) {
            $this->markFailed($checkId, '[submit] enqueue failed: ' . $e->getMessage());
            throw $e;
        }

        return (int) $checkId;
    }

    /**
     * Called by the job: creates the Turnitin submission, uploads the file, triggers
     * similarity, then enqueues the first PlagiarismPoll.
     *
     * @param int    $checkId  plagiarism_check row ID
     * @param string $filePath Local path of the file to upload
     * @throws \Throwable any failure is recorded via markFailed() and then rethrown
     */
    public function runUploadAndTrigger($checkId, $filePath)
    {
        $check = $this->mustGetCheck($checkId);
        try {
            $tii = new TurnitinService();

            // 1. Create the submission; fall back to a synthetic title when the article has none.
            $articleTitle = (string) Db::name('article')
                ->where('article_id', $check['article_id'])
                ->value('title');
            if ($articleTitle === '') {
                $articleTitle = 'Article #' . $check['article_id'];
            }
            $createResp = $tii->createSubmission([
                'title'     => mb_substr($articleTitle, 0, 250),
                'owner'     => 'editor_' . $check['triggered_by'],
                'submitter' => 'editor_' . $check['triggered_by'],
                'metadata'  => [
                    'article_id' => (string) $check['article_id'],
                    'check_id'   => (string) $check['check_id'],
                ],
            ]);
            $submissionId = isset($createResp['id']) ? $createResp['id'] : '';
            if ($submissionId === '') {
                throw new Exception('Turnitin createSubmission returned empty id: ' . json_encode($createResp));
            }
            $this->updateCheck($checkId, [
                'tii_submission_id' => $submissionId,
                'raw_response'      => json_encode($createResp, JSON_UNESCAPED_UNICODE),
            ]);

            // 2. Upload the file.
            $tii->uploadFile($submissionId, $filePath, basename($filePath));

            // 3. Trigger similarity.
            $simResp = $tii->triggerSimilarity($submissionId);
            $this->updateCheck($checkId, [
                'state'             => self::STATE_COMPARING,
                'tii_report_status' => 'PROCESSING',
                'raw_response'      => json_encode($simResp, JSON_UNESCAPED_UNICODE),
            ]);

            // 4. Queue the first poll with a delay so Turnitin has time to start processing.
            $this->schedulePoll($checkId, 1);
        } catch (\Throwable $e) {
            $this->markFailed($checkId, '[upload] ' . $e->getMessage());
            throw $e;
        }
    }

    /**
     * Called by the job: polls the similarity status. On COMPLETE it downloads the PDF,
     * fetches a viewer URL and marks the check done; on ERROR it marks the check failed;
     * otherwise it re-enqueues itself until MAX_POLL_ATTEMPTS is reached.
     *
     * Exceptions are treated as transient (the poll is re-enqueued) until the attempt
     * budget is exhausted, at which point the check is marked failed and the exception
     * is rethrown.
     *
     * @param int $checkId plagiarism_check row ID
     * @param int $attempt 1-based poll attempt counter
     * @throws \Throwable only once the attempt budget is exhausted
     */
    public function runPollStatus($checkId, $attempt = 1)
    {
        $check = $this->mustGetCheck($checkId);
        if (empty($check['tii_submission_id'])) {
            $this->markFailed($checkId, '[poll] tii_submission_id empty');
            return;
        }
        try {
            $tii = new TurnitinService();
            $statusResp = $tii->getSimilarityStatus($check['tii_submission_id']);
            $status = isset($statusResp['status']) ? strtoupper($statusResp['status']) : '';
            $this->updateCheck($checkId, [
                'tii_report_status' => $status,
                'attempts'          => $attempt,
                'raw_response'      => json_encode($statusResp, JSON_UNESCAPED_UNICODE),
            ]);

            if ($status === 'COMPLETE') {
                $score = isset($statusResp['overall_match_percentage'])
                    ? floatval($statusResp['overall_match_percentage']) : 0;
                // Download the PDF and fetch the online viewer URL.
                $localPdf = $this->downloadAndStorePdf($tii, $check['tii_submission_id'], $checkId);
                $viewerInfo = $this->refreshViewerUrl($tii, $check['tii_submission_id']);
                $this->updateCheck($checkId, [
                    'state'                => self::STATE_DONE,
                    'similarity_score'     => $score,
                    'pdf_local_path'       => $localPdf,
                    'view_only_url'        => $viewerInfo['url'],
                    'view_only_url_expire' => $viewerInfo['expire'],
                    'error_msg'            => '', // clears any earlier "[poll] transient" note
                ]);
                return;
            }

            if ($status === 'ERROR') {
                $errMsg = isset($statusResp['error_code']) ? (string) $statusResp['error_code'] : 'Turnitin reported ERROR';
                $this->markFailed($checkId, '[poll] ' . $errMsg);
                return;
            }

            // PROCESSING or any other intermediate status: keep polling within the budget.
            if ($attempt >= self::MAX_POLL_ATTEMPTS) {
                $this->markFailed($checkId, '[poll] timeout after ' . $attempt . ' attempts');
                return;
            }
            $this->schedulePoll($checkId, $attempt + 1);
        } catch (\Throwable $e) {
            // Do not fail outright on e.g. a network hiccup; tolerate errors up to the attempt budget.
            if ($attempt < self::MAX_POLL_ATTEMPTS) {
                $this->schedulePoll($checkId, $attempt + 1);
                $this->updateCheck($checkId, [
                    'attempts'  => $attempt,
                    'error_msg' => '[poll] transient: ' . $e->getMessage(),
                ]);
                return;
            }
            $this->markFailed($checkId, '[poll] exhausted: ' . $e->getMessage());
            throw $e;
        }
    }

    /**
     * Regenerate the online viewer URL (for when the stored one has expired) and
     * persist it on the check row.
     *
     * @param int $checkId plagiarism_check row ID
     * @return array{url:string, expire:int, local_pdf:string}
     * @throws Exception when the check does not exist or has no tii_submission_id
     */
    public function refreshViewerUrlFor($checkId)
    {
        $check = $this->mustGetCheck($checkId);
        if (empty($check['tii_submission_id'])) {
            throw new Exception('check has no tii_submission_id');
        }
        $tii = new TurnitinService();
        $info = $this->refreshViewerUrl($tii, $check['tii_submission_id']);
        $this->updateCheck($checkId, [
            'view_only_url'        => $info['url'],
            'view_only_url_expire' => $info['expire'],
        ]);
        return [
            'url'       => $info['url'],
            'expire'    => $info['expire'],
            'local_pdf' => $check['pdf_local_path'],
        ];
    }

    // ---------- Internals ----------

    /**
     * Ask Turnitin for a viewer URL and pair it with an assumed expiry time.
     *
     * Reads 'viewer_url' from the response, falling back to 'url'; yields an empty
     * string when neither key is present.
     *
     * @param object $tii          Client exposing getViewerUrl($submissionId)
     * @param string $submissionId Turnitin submission ID
     * @return array{url:string, expire:int}
     */
    private function refreshViewerUrl($tii, $submissionId)
    {
        $resp = $tii->getViewerUrl($submissionId);
        $url = '';
        if (isset($resp['viewer_url'])) {
            $url = (string) $resp['viewer_url'];
        } elseif (isset($resp['url'])) {
            $url = (string) $resp['url'];
        }
        // Expiry is assumed to be 2 hours from now (a conservative default).
        return ['url' => $url, 'expire' => time() + 7200];
    }

    /**
     * Request PDF generation, poll until it is ready, download it and store it under
     * REPORT_DIR.
     *
     * @param object $tii          Client exposing requestPdfReport(), getPdfReportStatus()
     *                             and downloadPdfReport()
     * @param string $submissionId Turnitin submission ID
     * @param int    $checkId      Used to build the file name
     * @return string Path of the stored PDF relative to the project root
     * @throws Exception on an empty report id, a FAILED status, a polling timeout,
     *                   an empty/too-small download, or a write failure
     */
    private function downloadAndStorePdf($tii, $submissionId, $checkId)
    {
        // 1. Request generation.
        $req = $tii->requestPdfReport($submissionId);
        $pdfId = isset($req['id']) ? $req['id'] : '';
        if ($pdfId === '') {
            throw new Exception('requestPdfReport empty id: ' . json_encode($req));
        }

        // 2. Poll the PDF status inline. Sleeping *before* every check except the first
        //    avoids a pointless wait after the final check.
        $ready = false;
        for ($i = 0; $i < self::PDF_POLL_MAX_LOOPS; $i++) {
            if ($i > 0) {
                sleep(self::PDF_POLL_SLEEP);
            }
            $st = $tii->getPdfReportStatus($submissionId, $pdfId);
            $stCode = isset($st['status']) ? strtoupper($st['status']) : '';
            if ($stCode === 'SUCCESS') {
                $ready = true;
                break;
            }
            if ($stCode === 'FAILED') {
                throw new Exception('PDF report generation failed: ' . json_encode($st));
            }
        }
        if (!$ready) {
            // Without this guard a timed-out poll would fall through to the download
            // and surface later as a misleading "empty/too small" error.
            throw new Exception('PDF report not ready after ' . self::PDF_POLL_MAX_LOOPS . ' status checks');
        }

        // 3. Download.
        $binary = $tii->downloadPdfReport($submissionId, $pdfId);
        if (!is_string($binary) || strlen($binary) < 100) {
            throw new Exception('downloaded pdf is empty/too small');
        }

        // 4. Write to disk.
        $absDir = $this->projectRoot() . DIRECTORY_SEPARATOR . self::REPORT_DIR;
        $this->ensureDir($absDir);
        $filename = sprintf('check_%d_%s.pdf', $checkId, date('Ymd_His'));
        $absPath = $absDir . DIRECTORY_SEPARATOR . $filename;
        $bytes = file_put_contents($absPath, $binary);
        if ($bytes === false || $bytes < 100) {
            throw new Exception('failed to save pdf to ' . $absPath);
        }
        return self::REPORT_DIR . '/' . $filename;
    }

    /**
     * Enqueue a delayed PlagiarismPoll job for the given attempt number.
     *
     * @param int $checkId plagiarism_check row ID
     * @param int $attempt Attempt number the queued job will run as
     */
    private function schedulePoll($checkId, $attempt)
    {
        Queue::later(
            self::POLL_INTERVAL,
            'app\\api\\job\\PlagiarismPoll',
            ['check_id' => $checkId, 'attempt' => $attempt],
            'plagiarism'
        );
    }

    /**
     * Project root without a trailing separator. Uses ROOT_PATH when it is defined and
     * non-empty; otherwise falls back to two directories above this file.
     *
     * @return string
     */
    private function projectRoot()
    {
        // defined() is required: referencing an undefined constant never reaches a "?:" fallback.
        $root = (defined('ROOT_PATH') && ROOT_PATH) ? ROOT_PATH : dirname(dirname(__DIR__));
        return rtrim($root, '/\\');
    }

    /**
     * Create a directory (recursively) if it does not exist yet.
     *
     * @param string $dir Absolute directory path
     * @throws Exception when the directory cannot be created
     */
    private function ensureDir($dir)
    {
        if (is_dir($dir)) {
            return;
        }
        // The second is_dir() covers another process creating the directory concurrently.
        if (!@mkdir($dir, 0755, true) && !is_dir($dir)) {
            throw new Exception("cannot create directory: {$dir}");
        }
    }

    /**
     * Load a check row or throw.
     *
     * @param int $checkId plagiarism_check row ID
     * @return array The row
     * @throws Exception when no such row exists
     */
    private function mustGetCheck($checkId)
    {
        $row = Db::name('plagiarism_check')->where('check_id', $checkId)->find();
        if (!$row) {
            throw new Exception("plagiarism_check #{$checkId} not found");
        }
        return $row;
    }

    /**
     * Update columns on a check row, always refreshing utime.
     *
     * @param int   $checkId plagiarism_check row ID
     * @param array $data    column => value pairs
     */
    private function updateCheck($checkId, array $data)
    {
        $data['utime'] = time();
        Db::name('plagiarism_check')->where('check_id', $checkId)->update($data);
    }

    /**
     * Put a check into the failed state with an error message (truncated to 1000 characters).
     *
     * @param int    $checkId plagiarism_check row ID
     * @param string $errMsg  Human-readable failure reason
     */
    private function markFailed($checkId, $errMsg)
    {
        $this->updateCheck($checkId, [
            'state'     => self::STATE_FAILED,
            'error_msg' => mb_substr($errMsg, 0, 1000),
        ]);
    }

    /**
     * Find the local path of an article's main manuscript via the article_file table,
     * using the most recent row (highest article_file_id) of type 'manuscirpt'.
     * file_url may be a URL or a relative path; it is resolved by resolveFileUrlToLocal().
     *
     * @param int $articleId Article ID
     * @return string Local file path
     * @throws Exception when no manuscript row exists or it cannot be resolved
     */
    public function locateArticleManuscript($articleId)
    {
        $row = Db::name('article_file')
            ->where('article_id', $articleId)
            ->where('type_name', 'manuscirpt') // historical misspelling stored in the data; do not "fix"
            ->order('article_file_id desc')
            ->find();
        if (!$row || empty($row['file_url'])) {
            throw new Exception("article #{$articleId} has no manuscirpt file");
        }
        return $this->resolveFileUrlToLocal($row['file_url']);
    }

    /**
     * Resolve a file_url (absolute path, http(s) URL, or relative path) to a local file.
     *
     * Resolution order:
     *   1. an absolute path that exists is returned unchanged;
     *   2. an http(s) URL starting with the configured CDN prefix is mapped under
     *      static_root; if that file does not exist (or the settings are missing) the
     *      URL is downloaded to a temporary file;
     *   3. anything else is treated as a path relative to static_root.
     *
     * Settings are read via Env::get('plagiarism.static_root') and
     * Env::get('plagiarism.cdn_prefix').
     *
     * @param string $fileUrl Value of the file_url column
     * @return string Local file path
     * @throws Exception when the value is empty or cannot be resolved
     */
    public function resolveFileUrlToLocal($fileUrl)
    {
        $fileUrl = trim((string) $fileUrl);
        if ($fileUrl === '') {
            throw new Exception('empty file_url');
        }

        // Already an absolute path (Windows drive letter or leading slash).
        if (preg_match('/^([a-zA-Z]:[\\\\\/]|\/)/', $fileUrl) && is_file($fileUrl)) {
            return $fileUrl;
        }

        $staticRoot = trim((string) Env::get('plagiarism.static_root', ''));
        $cdnPrefix  = trim((string) Env::get('plagiarism.cdn_prefix', ''));

        // http(s) URL: first try stripping the CDN prefix and mapping onto local storage.
        if (preg_match('#^https?://#i', $fileUrl)) {
            // static_root must be set too, otherwise the path would be resolved against
            // the filesystem root.
            if ($staticRoot !== '' && $cdnPrefix !== '' && stripos($fileUrl, $cdnPrefix) === 0) {
                $rel = ltrim(substr($fileUrl, strlen($cdnPrefix)), '/');
                $local = rtrim($staticRoot, '/\\') . DIRECTORY_SEPARATOR . $rel;
                if (is_file($local)) {
                    return $local;
                }
            }
            // Last resort: download into the runtime/plagiarism/tmp directory.
            return $this->downloadRemoteFile($fileUrl);
        }

        // Relative path: prepend static_root.
        if ($staticRoot !== '') {
            $local = rtrim($staticRoot, '/\\') . DIRECTORY_SEPARATOR . ltrim($fileUrl, '/\\');
            if (is_file($local)) {
                return $local;
            }
        }

        throw new Exception("cannot resolve file_url to local path: {$fileUrl} (set [plagiarism] STATIC_ROOT/CDN_PREFIX in .env)");
    }

    /**
     * Download a remote file over HTTP(S) into REPORT_DIR/tmp and return its local path.
     *
     * TLS certificates are verified by default. Setting plagiarism.insecure_ssl to
     * 1/true/on/yes in the environment disables peer verification for hosts that
     * cannot validate the remote certificate.
     *
     * @param string $url http(s) URL
     * @return string Local path of the downloaded file
     * @throws Exception when the temp file cannot be created or the download fails
     *                   (cURL error, non-200 status, or fewer than 100 bytes received)
     */
    private function downloadRemoteFile($url)
    {
        $tmpDir = $this->projectRoot() . DIRECTORY_SEPARATOR . self::REPORT_DIR . DIRECTORY_SEPARATOR . 'tmp';
        $this->ensureDir($tmpDir);

        // The extension comes from a remote URL, so accept only a short alphanumeric one.
        $ext = pathinfo((string) parse_url($url, PHP_URL_PATH), PATHINFO_EXTENSION);
        if (!preg_match('/^[A-Za-z0-9]{1,10}$/', $ext)) {
            $ext = 'pdf';
        }
        $local = $tmpDir . DIRECTORY_SEPARATOR . md5($url) . '_' . time() . '.' . $ext;

        $fh = fopen($local, 'wb');
        if ($fh === false) {
            throw new Exception("cannot open temp file for writing: {$local}");
        }
        $ch = curl_init($url);
        if ($ch === false) {
            fclose($fh);
            @unlink($local);
            throw new Exception("curl_init failed url={$url}");
        }

        $insecureFlag = strtolower(trim((string) Env::get('plagiarism.insecure_ssl', '')));
        $insecure = in_array($insecureFlag, ['1', 'true', 'on', 'yes'], true);

        $ok = false;
        $code = 0;
        $curlErr = '';
        try {
            curl_setopt_array($ch, [
                CURLOPT_FILE            => $fh,
                CURLOPT_FOLLOWLOCATION  => true,
                CURLOPT_MAXREDIRS       => 5,
                // Never let the URL or a redirect reach file://, gopher://, etc.
                CURLOPT_PROTOCOLS       => CURLPROTO_HTTP | CURLPROTO_HTTPS,
                CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
                CURLOPT_CONNECTTIMEOUT  => 15,
                CURLOPT_TIMEOUT         => 120,
                CURLOPT_SSL_VERIFYPEER  => !$insecure,
            ]);
            $ok = curl_exec($ch);
            $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $curlErr = (string) curl_error($ch);
        } finally {
            // Release both handles even if a cURL call throws.
            curl_close($ch);
            fclose($fh);
        }

        clearstatcache(true, $local);
        if (!$ok || $code !== 200 || filesize($local) < 100) {
            @unlink($local);
            $detail = $curlErr !== '' ? " err={$curlErr}" : '';
            throw new Exception("download failed url={$url} http={$code}" . $detail);
        }
        return $local;
    }

    /**
     * Fetch a check row by ID without throwing.
     *
     * @param int $checkId plagiarism_check row ID
     * @return mixed Whatever the query builder's find() returns for this ID
     */
    public function getCheck($checkId)
    {
        return Db::name('plagiarism_check')->where('check_id', $checkId)->find();
    }
}