Files
tougao/application/common/PlagiarismService.php
2026-05-20 14:46:58 +08:00

597 lines
22 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use think\Db;
use think\Env;
use think\Queue;
use think\Exception;
/**
* 查重业务层:把 TurnitinService 的低层调用包装成"按 article 查重"的高层流程,
* 并维护 t_plagiarism_check 状态机。
*
* 状态流:
* submit() → state=1上传中入队 PlagiarismRun
* PlagiarismRun → 创建 submission + 上传文件 → 入队 PlagiarismWaitIngest
* PlagiarismWaitIngest → 单次 GET submission 状态;就绪则入队 PlagiarismTriggerSimilarity否则延迟再入队
* PlagiarismTriggerSimilarity → PUT similarity → state=2比对中入队 PlagiarismPoll
* PlagiarismPoll → 轮询 similarity完成后下载 PDF → state=3完成在线 viewer URL 按需 getReportUrl 调用 refreshViewerUrlFor
* 任意环节抛异常 → state=4失败写 error_msg
*
* Worker请用 `queue:work` 消费队列 **plagiarism**(整条链与轮询均在此队列;若此前单独监听 PlagiarismRun / PlagiarismPoll需改为 plagiarism
*/
class PlagiarismService
{
/**
* 报告 PDF 本地保存目录(相对于项目根,永久保留)
*/
const REPORT_DIR = 'public/plagiarism';
/** Run / WaitIngest / TriggerSimilarity / Poll 共用队列名 */
const QUEUE_CHAIN = 'plagiarism';
const CHECK_TYPE_FULL = 'full';
const CHECK_TYPE_BODY = 'body_only';
const JOB_WAIT_INGEST = 'app\\api\\job\\PlagiarismWaitIngest';
const JOB_TRIGGER_SIM = 'app\\api\\job\\PlagiarismTriggerSimilarity';
const JOB_POLL = 'app\\api\\job\\PlagiarismPoll';
/**
* 轮询间隔。Turnitin 一般 1-5 分钟出结果30 秒一次比较合适
*/
const POLL_INTERVAL = 30;
/**
* 最长轮询次数30s × 60 = 30 分钟)
*/
const MAX_POLL_ATTEMPTS = 60;
private $logFile;
public function __construct()
{
$this->logFile = ROOT_PATH . 'runtime' . DS . 'plagiarism_task.log';
}
// ---------- 顶层入口 ----------
/**
* 提交查重(入队,立即返回 check_id
*
* @param int $articleId 投稿 ID
* @param string $filePath 本地可读的 PDF/DOCX 绝对路径
* @param int $triggeredBy 触发人 user_id手工触发时编辑后台的 user_id
* @param string $source 'manual' / 'auto_xxx'
* @param string $checkType full | body_only
* @return int check_id
*/
public function submit($articleId, $filePath, $triggeredBy = 0, $source = 'manual', $checkType = self::CHECK_TYPE_FULL)
{
if (!is_file($filePath) || !is_readable($filePath)) {
throw new Exception("File not readable: {$filePath}");
}
$checkType = $this->normalizeCheckType($checkType);
$uploadPath = $filePath;
$derivedRel = '';
$sourceName = basename($filePath);
if ($checkType === self::CHECK_TYPE_BODY) {
$built = (new ManuscriptBodyExtractor())->buildBodyOnlyDocx($filePath, $articleId);
$uploadPath = $built['path'];
$derivedRel = (string) $built['rel_path'];
$sourceName = basename($uploadPath);
if (!empty($built['warnings'])) {
$this->log('body_only warnings check article=' . $articleId . ' ' . implode('; ', $built['warnings']));
}
}
$journalId = (int) Db::name('article')
->where('article_id', $articleId)
->value('journal_id');
$this->log("plagiarism submit type={$checkType} article={$articleId}");
$now = time();
$row = [
'article_id' => $articleId,
'journal_id' => $journalId,
'triggered_by' => $triggeredBy,
'trigger_source' => $source,
'check_type' => $checkType,
'state' => 1,
'source_file_name' => $sourceName,
'source_file_size' => filesize($uploadPath) ?: 0,
'ctime' => $now,
'utime' => $now,
];
if ($derivedRel !== '') {
$row['derived_file_path'] = $derivedRel;
}
$checkId = Db::name('plagiarism_check')->insertGetId($row);
Queue::push(
'app\\api\\job\\PlagiarismRun',
['check_id' => $checkId, 'file_path' => $uploadPath],
self::QUEUE_CHAIN
);
return (int) $checkId;
}
/**
* 同时提交全文 + 正文两次查重
* @return array{full:int, body_only:int}
*/
public function submitBoth($articleId, $filePath, $triggeredBy = 0, $source = 'manual')
{
return [
'full' => $this->submit($articleId, $filePath, $triggeredBy, $source, self::CHECK_TYPE_FULL),
'body_only' => $this->submit($articleId, $filePath, $triggeredBy, $source, self::CHECK_TYPE_BODY),
];
}
private function normalizeCheckType($checkType)
{
$t = strtolower(trim((string) $checkType));
if ($t === '' || $t === self::CHECK_TYPE_FULL || $t === 'full') {
return self::CHECK_TYPE_FULL;
}
if ($t === self::CHECK_TYPE_BODY || $t === 'body' || $t === 'bodyonly') {
return self::CHECK_TYPE_BODY;
}
throw new Exception('invalid check_type, use full or body_only');
}
/**
* Job 调用:仅创建 submission + 上传文件,随后由 PlagiarismWaitIngest 链式轮询 ingest再 PlagiarismTriggerSimilarity。
*/
public function runUploadOnly($checkId, $filePath)
{
$check = $this->mustGetCheck($checkId);
$this->log('runUploadOnly start check_id=' . $checkId);
$tii = new TurnitinService();
$articleTitle = (string) Db::name('article')
->where('article_id', $check['article_id'])
->value('title');
if ($articleTitle === '') {
$articleTitle = 'Article #' . $check['article_id'];
}
$createResp = $tii->createSubmission([
'title' => mb_substr($articleTitle, 0, 250),
'owner' => 'editor_' . $check['triggered_by'],
'submitter' => 'editor_' . $check['triggered_by'],
'metadata' => [
'article_id' => (string) $check['article_id'],
'check_id' => (string) $check['check_id'],
],
]);
$submissionId = isset($createResp['id']) ? $createResp['id'] : '';
if ($submissionId === '') {
throw new Exception('Turnitin createSubmission returned empty id: ' . json_encode($createResp));
}
$this->updateCheck($checkId, [
'tii_submission_id' => $submissionId,
'raw_response' => json_encode($createResp, JSON_UNESCAPED_UNICODE),
]);
$tii->uploadFile($submissionId, $filePath, basename($filePath));
$firstDelay = $this->ingestChainFirstDelaySec();
Queue::later(
$firstDelay,
self::JOB_WAIT_INGEST,
['check_id' => $checkId, 'attempt' => 1],
self::QUEUE_CHAIN
);
}
/**
* 单次 ingest 检查(由 PlagiarismWaitIngest 调用)。不在本方法内 sleep 长循环。
*/
public function runIngestPollStep($checkId, $attempt = 1)
{
$check = $this->mustGetCheck($checkId);
if (empty($check['tii_submission_id'])) {
$this->markFailed($checkId, '[ingest] tii_submission_id empty');
return;
}
$this->log("runIngestPollStep is running");
$maxAttempts = $this->ingestChainMaxAttempts();
$interval = $this->ingestChainPollIntervalSec();
$tii = new TurnitinService();
try {
$parsed = $tii->parseSubmissionIngestState($check['tii_submission_id']);
} catch (\Throwable $e) {
if ($attempt >= $maxAttempts) {
$this->markFailed($checkId, '[ingest] request failed after ' . $attempt . ' tries: ' . $e->getMessage());
return;
}
Queue::later($interval, self::JOB_WAIT_INGEST, ['check_id' => $checkId, 'attempt' => $attempt + 1], self::QUEUE_CHAIN);
return;
}
if (!empty($parsed['failed'])) {
$this->markFailed($checkId, '[ingest] submission failed status=' . $parsed['status'] . ' ' . $parsed['snippet']);
return;
}
if (!empty($parsed['ready'])) {
Queue::push(self::JOB_TRIGGER_SIM, ['check_id' => $checkId, 'ingest_attempt' => $attempt], self::QUEUE_CHAIN);
return;
}
if ($attempt >= $maxAttempts) {
$this->markFailed($checkId, '[ingest] timeout last_status=' . ($parsed['status'] !== '' ? $parsed['status'] : '(empty)'));
return;
}
Queue::later($interval, self::JOB_WAIT_INGEST, ['check_id' => $checkId, 'attempt' => $attempt + 1], self::QUEUE_CHAIN);
}
/**
* 在 ingest 就绪后触发 similarity并入队 PlagiarismPoll。
* 若仍返回 409则重新入队 PlagiarismWaitIngest不抛异常避免误标失败
*
* @param int $ingestAttempt 来自 WaitIngest 的 attempt供 409 时继续轮询
*/
public function runTriggerSimilarityOnly($checkId, $ingestAttempt = 1)
{
$check = $this->mustGetCheck($checkId);
if (empty($check['tii_submission_id'])) {
$this->markFailed($checkId, '[similarity] tii_submission_id empty');
return;
}
$this->log("runTriggerSimilarityOnly is running");
$tii = new TurnitinService();
$sid = $check['tii_submission_id'];
try {
$simResp = $tii->triggerSimilarity($sid);
} catch (\Throwable $e) {
$msg = $e->getMessage();
$is409 = (stripos($msg, '409') !== false || stripos($msg, 'CONFLICT') !== false)
&& (stripos($msg, 'not been completed') !== false || stripos($msg, 'completed yet') !== false);
if ($is409) {
$maxAttempts = $this->ingestChainMaxAttempts();
$next = $ingestAttempt + 1;
if ($next > $maxAttempts) {
$this->markFailed($checkId, '[similarity] still not ready after ingest attempts: ' . $msg);
return;
}
$delay = max($this->ingestChainPollIntervalSec(), 20);
Queue::later($delay, self::JOB_WAIT_INGEST, ['check_id' => $checkId, 'attempt' => $next], self::QUEUE_CHAIN);
return;
}
throw $e;
}
$this->updateCheck($checkId, [
'state' => 2,
'tii_report_status' => 'PROCESSING',
'raw_response' => json_encode($simResp, JSON_UNESCAPED_UNICODE),
]);
Queue::later(
self::POLL_INTERVAL,
self::JOB_POLL,
['check_id' => $checkId, 'attempt' => 1],
self::QUEUE_CHAIN
);
}
/**
* @deprecated 与 runUploadOnly 等价;长耗时 ingest 已拆到队列 PlagiarismWaitIngest勿在本方法内同步 wait。
*/
public function runUploadAndTrigger($checkId, $filePath)
{
$this->runUploadOnly($checkId, $filePath);
}
/**
* Job 调用:轮询 similarity 状态,完成后下载 PDF。未完成则重新入队。
*/
public function runPollStatus($checkId, $attempt = 1)
{
$check = $this->mustGetCheck($checkId);
if (empty($check['tii_submission_id'])) {
$this->markFailed($checkId, '[poll] tii_submission_id empty');
return;
}
try {
$tii = new TurnitinService();
$statusResp = $tii->getSimilarityStatus($check['tii_submission_id']);
$status = isset($statusResp['status']) ? strtoupper($statusResp['status']) : '';
$this->updateCheck($checkId, [
'tii_report_status' => $status,
'attempts' => $attempt,
'raw_response' => json_encode($statusResp, JSON_UNESCAPED_UNICODE),
]);
if ($status === 'COMPLETE') {
$score = isset($statusResp['overall_match_percentage'])
? floatval($statusResp['overall_match_percentage']) : 0;
$localPdf = $this->downloadAndStorePdf($tii, $check['tii_submission_id'], $checkId);
$this->updateCheck($checkId, [
'state' => 3,
'similarity_score' => $score,
'pdf_local_path' => $localPdf,
'error_msg' => '',
]);
return;
}
if ($status === 'ERROR') {
$errMsg = isset($statusResp['error_code']) ? (string)$statusResp['error_code'] : 'Turnitin reported ERROR';
$this->markFailed($checkId, '[poll] ' . $errMsg);
return;
}
if ($attempt >= self::MAX_POLL_ATTEMPTS) {
$this->markFailed($checkId, '[poll] timeout after ' . $attempt . ' attempts');
return;
}
Queue::later(
self::POLL_INTERVAL,
self::JOB_POLL,
['check_id' => $checkId, 'attempt' => $attempt + 1],
self::QUEUE_CHAIN
);
} catch (\Throwable $e) {
if ($attempt < self::MAX_POLL_ATTEMPTS) {
Queue::later(
self::POLL_INTERVAL,
self::JOB_POLL,
['check_id' => $checkId, 'attempt' => $attempt + 1],
self::QUEUE_CHAIN
);
$this->updateCheck($checkId, [
'attempts' => $attempt,
'error_msg' => '[poll] transient: ' . $e->getMessage(),
]);
return;
}
$this->markFailed($checkId, '[poll] exhausted: ' . $e->getMessage());
throw $e;
}
}
/**
* 按需获取/刷新 Turnitin 在线报告 URL与 poll 解耦,避免 viewer-url 失败拖死查重完成)。
*
* @return array{url:string, expire:int, local_pdf:string}
*/
public function refreshViewerUrlFor($checkId)
{
$check = $this->mustGetCheck($checkId);
if (empty($check['tii_submission_id'])) {
throw new Exception('check has no tii_submission_id');
}
$tii = new TurnitinService();
$info = $this->refreshViewerUrl($tii, $check['tii_submission_id']);
$this->updateCheck($checkId, [
'view_only_url' => $info['url'],
'view_only_url_expire' => $info['expire'],
]);
return [
'url' => $info['url'],
'expire' => $info['expire'],
'local_pdf' => $check['pdf_local_path'],
];
}
// ---------- 内部 ----------
/**
* 调用 Turnitin POST viewer-url仅由 refreshViewerUrlFor / getReportUrl 触发。
*/
private function refreshViewerUrl($tii, $submissionId)
{
$resp = $tii->getViewerUrl($submissionId);
$url = '';
if (isset($resp['viewer_url'])) {
$url = (string) $resp['viewer_url'];
} elseif (isset($resp['url'])) {
$url = (string) $resp['url'];
} elseif (isset($resp['launch_url'])) {
$url = (string) $resp['launch_url'];
}
if ($url === '') {
throw new Exception('viewer-url response has no url: ' . json_encode($resp, JSON_UNESCAPED_UNICODE));
}
// 默认 2 小时过期,保守起见
return ['url' => $url, 'expire' => time() + 7200];
}
/**
* 触发生成 + 轮询 + 下载 PDF 到本地,返回相对路径
*/
private function downloadAndStorePdf($tii, $submissionId, $checkId)
{
// 1. 请求生成
$req = $tii->requestPdfReport($submissionId);
$pdfId = isset($req['id']) ? $req['id'] : '';
if ($pdfId === '') {
throw new Exception('requestPdfReport empty id: ' . json_encode($req));
}
// 2. 内联轮询 PDF 状态(最多 3 分钟,每 6 秒一次)
$maxLoops = 30;
for ($i = 0; $i < $maxLoops; $i++) {
$st = $tii->getPdfReportStatus($submissionId, $pdfId);
$stCode = isset($st['status']) ? strtoupper($st['status']) : '';
if ($stCode === 'SUCCESS') {
break;
}
if ($stCode === 'FAILED') {
throw new Exception('PDF report generation failed: ' . json_encode($st));
}
sleep(6);
}
// 3. 下载
$binary = $tii->downloadPdfReport($submissionId, $pdfId);
if (!is_string($binary) || strlen($binary) < 100) {
throw new Exception('downloaded pdf is empty/too small');
}
// 4. 落盘
$rootDir = ROOT_PATH ?: dirname(dirname(__DIR__));
$absDir = rtrim($rootDir, '/\\') . DIRECTORY_SEPARATOR . self::REPORT_DIR;
if (!is_dir($absDir)) {
@mkdir($absDir, 0755, true);
}
$filename = sprintf('check_%d_%s.pdf', $checkId, date('Ymd_His'));
$absPath = $absDir . DIRECTORY_SEPARATOR . $filename;
$bytes = file_put_contents($absPath, $binary);
if ($bytes === false || $bytes < 100) {
throw new Exception('failed to save pdf to ' . $absPath);
}
return self::REPORT_DIR . '/' . $filename;
}
private function mustGetCheck($checkId)
{
$row = Db::name('plagiarism_check')->where('check_id', $checkId)->find();
if (!$row) {
throw new Exception("plagiarism_check #{$checkId} not found");
}
return $row;
}
private function updateCheck($checkId, array $data)
{
$data['utime'] = time();
Db::name('plagiarism_check')->where('check_id', $checkId)->update($data);
}
public function markFailed($checkId, $errMsg)
{
$this->log('markFailed check_id=' . $checkId);
$this->updateCheck($checkId, [
'state' => 4,
'error_msg' => mb_substr($errMsg, 0, 1000),
]);
}
private function ingestChainFirstDelaySec()
{
return max(3, (int) Env::get('turnitin.ingest_chain_first_delay', 10));
}
private function ingestChainPollIntervalSec()
{
return max(60, (int) Env::get('turnitin.ingest_chain_poll_interval', 15));
}
private function ingestChainMaxAttempts()
{
return max(10, (int) Env::get('turnitin.ingest_chain_max_attempts', 80));
}
/**
* 从 t_article_file 找到投稿主稿manuscirpt的本地绝对路径。
* file_url 在系统里可能是 URL 或相对路径,调用方负责保证可读。
*
* @return string 文件绝对路径,找不到时抛异常
*/
public function locateArticleManuscript($articleId)
{
$row = Db::name('article_file')
->where('article_id', $articleId)
->where('type_name', 'manuscirpt') // 历史拼写
->order('file_id desc')
->find();
if (!$row || empty($row['file_url'])) {
throw new Exception("article #{$articleId} has no manuscirpt file");
}
return $this->resolveFileUrlToLocal($row['file_url']);
}
/**
* 把 file_url可能是 http URL 或相对路径)解析成本地绝对路径。
* 不同环境部署可能有差异,这里用 .env 配置的 STATIC_ROOT 作前缀。
*/
public function resolveFileUrlToLocal($fileUrl)
{
$fileUrl = trim((string)$fileUrl);
if ($fileUrl === '') {
throw new Exception('empty file_url');
}
// 已是绝对路径
if (preg_match('/^([a-zA-Z]:[\\\\\/]|\/)/', $fileUrl) && is_file($fileUrl)) {
return $fileUrl;
}
$staticRoot = trim((string)Env::get('plagiarism.static_root', ''));
$cdnPrefix = trim((string)Env::get('plagiarism.cdn_prefix', ''));
// 是 http URL先试着剥掉 cdn 前缀,映射到本地
if (preg_match('#^https?://#i', $fileUrl)) {
if ($cdnPrefix !== '' && stripos($fileUrl, $cdnPrefix) === 0) {
$rel = ltrim(substr($fileUrl, strlen($cdnPrefix)), '/');
$local = rtrim($staticRoot, '/\\') . DIRECTORY_SEPARATOR . $rel;
if (is_file($local)) {
return $local;
}
}
// 实在不行,下载到 runtime/plagiarism/tmp 临时目录
return $this->downloadRemoteFile($fileUrl);
}
// 相对路径:拼 static_root
if ($staticRoot !== '') {
$local = rtrim($staticRoot, '/\\') . DIRECTORY_SEPARATOR . ltrim($fileUrl, '/\\');
if (is_file($local)) {
return $local;
}
}
throw new Exception("cannot resolve file_url to local path: {$fileUrl} (set [plagiarism] STATIC_ROOT/CDN_PREFIX in .env)");
}
private function downloadRemoteFile($url)
{
$rootDir = ROOT_PATH ?: dirname(dirname(__DIR__));
$tmpDir = rtrim($rootDir, '/\\') . DIRECTORY_SEPARATOR . self::REPORT_DIR . DIRECTORY_SEPARATOR . 'tmp';
if (!is_dir($tmpDir)) {
@mkdir($tmpDir, 0755, true);
}
$ext = pathinfo(parse_url($url, PHP_URL_PATH), PATHINFO_EXTENSION) ?: 'pdf';
$local = $tmpDir . DIRECTORY_SEPARATOR . md5($url) . '_' . time() . '.' . $ext;
$ch = curl_init($url);
$fh = fopen($local, 'wb');
curl_setopt_array($ch, [
CURLOPT_FILE => $fh,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_TIMEOUT => 120,
CURLOPT_SSL_VERIFYPEER => false,
]);
$ok = curl_exec($ch);
$code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
fclose($fh);
if (!$ok || $code !== 200 || filesize($local) < 100) {
@unlink($local);
throw new Exception("download failed url={$url} http={$code}");
}
return $local;
}
public function getCheck($checkId)
{
return Db::name('plagiarism_check')->where('check_id', $checkId)->find();
}
public function log($msg)
{
$line = date('Y-m-d H:i:s') . ' ' . $msg . PHP_EOL;
@file_put_contents($this->logFile, $line, FILE_APPEND);
}
}