From 4940db73fe8427fb1de4e697d1f99d058402c2ea Mon Sep 17 00:00:00 2001 From: wangjinlei <751475802@qq.com> Date: Wed, 20 May 2026 14:46:58 +0800 Subject: [PATCH] =?UTF-8?q?=E8=87=AA=E5=8A=A8=E6=9F=A5=E9=87=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env | 4 + application/api/controller/Plagiarism.php | 14 + .../common/ManuscriptBodyExtractor.php | 256 +++++++++++++----- application/common/PlagiarismService.php | 9 +- application/common/TurnitinService.php | 95 ++++++- 5 files changed, 290 insertions(+), 88 deletions(-) diff --git a/.env b/.env index 2f67571d..b96e1f03 100644 --- a/.env +++ b/.env @@ -33,6 +33,10 @@ UNSUBSCRIBE_BASE_URL=https://submission.tmrjournals.com/api/Unsubscribe/index [yboard] APPLY_URL="https://submission.tmrjournals.com/youthBoardRegister" +[turnitin] +viewer_permission_set=ADMINISTRATOR +viewer_locale=en-US + [plagiarism] static_root="/home/wwwroot/api.tmrjournals.com/public" diff --git a/application/api/controller/Plagiarism.php b/application/api/controller/Plagiarism.php index 74068ffa..c76965bd 100644 --- a/application/api/controller/Plagiarism.php +++ b/application/api/controller/Plagiarism.php @@ -197,16 +197,30 @@ class Plagiarism extends Base if ($needRefresh) { $svc = new PlagiarismService(); $info = $svc->refreshViewerUrlFor($checkId); + if ($info['url'] === '') { + return jsonError('Turnitin returned empty viewer_url'); + } return jsonSuccess([ 'view_only_url' => $info['url'], 'expire' => $info['expire'], + 'has_pdf' => !empty($info['local_pdf']), ]); } return jsonSuccess([ 'view_only_url' => $row['view_only_url'], 'expire' => intval($row['view_only_url_expire']), + 'has_pdf' => !empty($row['pdf_local_path']), ]); } catch (\Throwable $e) { + if (!empty($row['pdf_local_path'])) { + return jsonSuccess([ + 'view_only_url' => '', + 'expire' => 0, + 'has_pdf' => true, + 'viewer_error' => $e->getMessage(), + 'hint' => '在线报告暂不可用,请使用 downloadReport 下载 PDF', + ]); + } return jsonError($e->getMessage()); } } diff --git a/application/common/ManuscriptBodyExtractor.php b/application/common/ManuscriptBodyExtractor.php index cd44e0fd..3115bf2d 100644 --- a/application/common/ManuscriptBodyExtractor.php +++ b/application/common/ManuscriptBodyExtractor.php @@ -2,19 +2,33 @@ namespace app\common; -use PhpOffice\PhpWord\IOFactory; -use PhpOffice\PhpWord\PhpWord; +use DOMDocument; +use DOMElement; +use DOMXPath; use think\Exception; +use ZipArchive; /** - * 从投稿 Word 稿件生成「仅正文」版本:去掉文前题名/作者/单位等,去掉参考文献及之后内容。 + * 从投稿 Word 生成「仅正文」docx:在 document.xml 上按块裁切,保留表格/图片/样式; + * 边界识别仅用可见文本(w:t),不读取域指令(Zotero/EndNote 的 JSON)。 */ class ManuscriptBodyExtractor { const BODY_SUBDIR = 'public/plagiarism/body_only'; + const W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'; + + /** @var DOMDocument */ + private $dom; + + /** @var DOMElement */ + private $bodyNode; + + /** @var array */ + private $blocks = []; + /** @var array */ - private $lines = []; + private $blockTexts = []; /** * @return array{path:string, rel_path:string, line_count:int, ref_start:int, body_start:int, warnings:array} @@ -30,9 +44,9 @@ class ManuscriptBodyExtractor throw new Exception('body_only check requires DOCX manuscript, got: ' . $ext); } - $this->lines = ArticleParserService::collectParagraphLines($sourcePath); - if (empty($this->lines)) { - throw new Exception('No text extracted from manuscript'); + $this->loadDocumentBlocks($sourcePath); + if (empty($this->blocks)) { + throw new Exception('No content blocks in manuscript'); } $refStart = $this->findReferenceStartIndex(); @@ -41,32 +55,181 @@ class ManuscriptBodyExtractor if ($refStart < 0) { $warnings[] = 'references_heading_not_found; using document end'; - $refStart = count($this->lines); + $refStart = count($this->blocks); } if ($bodyStart >= $refStart) { throw new Exception('Could not locate main body (front matter may include entire document)'); } - $bodyLines = array_slice($this->lines, $bodyStart, $refStart - $bodyStart); - $bodyLines = $this->normalizeBodyLines($bodyLines); - if (count($bodyLines) < 3) { - throw new Exception('Body text too short after extraction (' . count($bodyLines) . ' paragraphs)'); + $kept = 0; + for ($i = $bodyStart; $i < $refStart; $i++) { + if (trim($this->blockTexts[$i]) !== '') { + $kept++; + } + } + if ($kept < 3) { + throw new Exception('Body content too short after extraction (' . $kept . ' non-empty blocks)'); } - $relPath = $this->writeBodyDocx($bodyLines, $articleId); + $relPath = $this->sliceDocxToNewFile($sourcePath, $articleId, $bodyStart, $refStart); $rootDir = rtrim(ROOT_PATH ?: dirname(dirname(__DIR__)), '/\\'); $absPath = $rootDir . DIRECTORY_SEPARATOR . str_replace(['/', '\\'], DIRECTORY_SEPARATOR, $relPath); return [ 'path' => $absPath, 'rel_path' => $relPath, - 'line_count' => count($bodyLines), + 'line_count' => $kept, 'ref_start' => $refStart, 'body_start' => $bodyStart, 'warnings' => $warnings, ]; } + private function loadDocumentBlocks($sourcePath) + { + $zip = new ZipArchive(); + if ($zip->open($sourcePath) !== true) { + throw new Exception('Cannot open docx: ' . $sourcePath); + } + $xml = $zip->getFromName('word/document.xml'); + $zip->close(); + if ($xml === false || $xml === '') { + throw new Exception('word/document.xml missing in docx'); + } + + $this->dom = new DOMDocument(); + $this->dom->preserveWhiteSpace = false; + $this->dom->formatOutput = false; + if (@$this->dom->loadXML($xml) === false) { + throw new Exception('Invalid word/document.xml'); + } + + $xpath = new DOMXPath($this->dom); + $xpath->registerNamespace('w', self::W_NS); + $body = $xpath->query('//w:body')->item(0); + if (!$body instanceof DOMElement) { + throw new Exception('w:body not found'); + } + + $this->bodyNode = $body; + $this->blocks = []; + $this->blockTexts = []; + + foreach ($body->childNodes as $child) { + if ($child->nodeType !== XML_ELEMENT_NODE) { + continue; + } + /** @var DOMElement $child */ + if ($child->localName === 'sectPr') { + continue; + } + $this->blocks[] = $child; + $this->blockTexts[] = $this->extractVisibleTextFromBlock($child); + } + } + + /** + * 仅拼接 w:t 可见文本,忽略 w:instrText 等域指令(避免 Zotero JSON 参与裁切判断)。 + */ + private function extractVisibleTextFromBlock(DOMElement $block) + { + $xpath = new DOMXPath($block->ownerDocument); + $xpath->registerNamespace('w', self::W_NS); + $nodes = $xpath->query('.//w:t', $block); + if (!$nodes || $nodes->length === 0) { + return ''; + } + $parts = []; + foreach ($nodes as $node) { + $parts[] = $node->textContent; + } + $text = preg_replace('/\s+/u', ' ', implode('', $parts)); + return trim((string) $text); + } + + private function sliceDocxToNewFile($sourcePath, $articleId, $bodyStart, $refStart) + { + $rootDir = rtrim(ROOT_PATH ?: dirname(dirname(__DIR__)), '/\\'); + $dir = $rootDir . DIRECTORY_SEPARATOR . self::BODY_SUBDIR; + if (!is_dir($dir)) { + @mkdir($dir, 0755, true); + } + + $name = sprintf('body_article_%d_%s.docx', intval($articleId), date('Ymd_His')); + $absPath = $dir . DIRECTORY_SEPARATOR . $name; + + if (!copy($sourcePath, $absPath)) { + throw new Exception('Failed to copy source docx'); + } + + $n = count($this->blocks); + + $zip = new ZipArchive(); + if ($zip->open($absPath) !== true) { + throw new Exception('Cannot open output docx'); + } + + $xml = $zip->getFromName('word/document.xml'); + if ($xml === false) { + $zip->close(); + throw new Exception('document.xml missing in output docx'); + } + + $outDom = new DOMDocument(); + $outDom->preserveWhiteSpace = false; + $outDom->formatOutput = false; + if (@$outDom->loadXML($xml) === false) { + $zip->close(); + throw new Exception('Invalid document.xml in output docx'); + } + + $xpath = new DOMXPath($outDom); + $xpath->registerNamespace('w', self::W_NS); + $body = $xpath->query('//w:body')->item(0); + if (!$body instanceof DOMElement) { + $zip->close(); + throw new Exception('w:body not found in output docx'); + } + + $children = []; + foreach ($body->childNodes as $child) { + if ($child->nodeType === XML_ELEMENT_NODE) { + $children[] = $child; + } + } + + $blockIdx = 0; + foreach ($children as $child) { + if (!($child instanceof DOMElement)) { + continue; + } + if ($child->localName === 'sectPr') { + continue; + } + if ($blockIdx < $bodyStart || $blockIdx >= $refStart) { + if ($child->parentNode) { + $child->parentNode->removeChild($child); + } + } + $blockIdx++; + } + + if ($blockIdx !== $n) { + $zip->close(); + @unlink($absPath); + throw new Exception('Document block count mismatch during slice'); + } + + $zip->addFromString('word/document.xml', $outDom->saveXML()); + $zip->close(); + + if (!is_file($absPath) || filesize($absPath) < 200) { + throw new Exception('Failed to write body-only docx'); + } + + return self::BODY_SUBDIR . '/' . $name; + } + private function findReferenceStartIndex() { $stopKeywords = [ @@ -75,7 +238,7 @@ class ManuscriptBodyExtractor '致谢', '基金', '附录', '补充材料', '利益冲突', '作者贡献', ]; - foreach ($this->lines as $i => $line) { + foreach ($this->blockTexts as $i => $line) { $t = trim($line); if ($t === '') { continue; @@ -87,7 +250,7 @@ class ManuscriptBodyExtractor foreach ($stopKeywords as $sk) { $skLower = strtolower($sk); if ($lower === $skLower || $lower === $skLower . ':' || $lower === $skLower . ':') { - if ($i > count($this->lines) * 0.4) { + if ($i > count($this->blockTexts) * 0.4) { return $i; } } @@ -98,12 +261,12 @@ class ManuscriptBodyExtractor private function findBodyStartIndex() { - $n = count($this->lines); + $n = count($this->blockTexts); $introIdx = -1; $keywordsIdx = -1; for ($i = 0; $i < $n; $i++) { - $t = trim($this->lines[$i]); + $t = trim($this->blockTexts[$i]); if ($t === '') { continue; } @@ -145,9 +308,9 @@ class ManuscriptBodyExtractor private function indexAfterKeywordsBlock($kwIdx) { - $n = count($this->lines); + $n = count($this->blockTexts); for ($i = $kwIdx + 1; $i < $n; $i++) { - $t = trim($this->lines[$i]); + $t = trim($this->blockTexts[$i]); if ($t === '') { continue; } @@ -166,10 +329,10 @@ class ManuscriptBodyExtractor private function indexAfterFrontMatterFallback() { - $n = count($this->lines); + $n = count($this->blockTexts); $maxSkip = min(20, (int) floor($n * 0.15)); for ($i = 0; $i < $maxSkip && $i < $n; $i++) { - $t = trim($this->lines[$i]); + $t = trim($this->blockTexts[$i]); if ($t === '') { continue; } @@ -190,53 +353,4 @@ class ManuscriptBodyExtractor } return false; } - - /** - * @param array $bodyLines - * @return array - */ - private function normalizeBodyLines(array $bodyLines) - { - $out = []; - foreach ($bodyLines as $line) { - $line = trim($line); - if ($line === '') { - continue; - } - if (preg_match('/^\{\s*ADDIN\s+EN\.REFLIST\s*\}$/i', $line)) { - continue; - } - $out[] = $line; - } - return $out; - } - - /** - * @param array $bodyLines - */ - private function writeBodyDocx(array $bodyLines, $articleId) - { - $rootDir = rtrim(ROOT_PATH ?: dirname(dirname(__DIR__)), '/\\'); - $dir = $rootDir . DIRECTORY_SEPARATOR . self::BODY_SUBDIR; - if (!is_dir($dir)) { - @mkdir($dir, 0755, true); - } - - $name = sprintf('body_article_%d_%s.docx', intval($articleId), date('Ymd_His')); - $absPath = $dir . DIRECTORY_SEPARATOR . $name; - - $phpWord = new PhpWord(); - $section = $phpWord->addSection(); - foreach ($bodyLines as $line) { - $section->addText($line); - } - $writer = IOFactory::createWriter($phpWord, 'Word2007'); - $writer->save($absPath); - - if (!is_file($absPath) || filesize($absPath) < 200) { - throw new Exception('Failed to write body-only docx'); - } - - return self::BODY_SUBDIR . '/' . $name; - } } diff --git a/application/common/PlagiarismService.php b/application/common/PlagiarismService.php index dff25596..4f22290d 100644 --- a/application/common/PlagiarismService.php +++ b/application/common/PlagiarismService.php @@ -395,9 +395,14 @@ class PlagiarismService $resp = $tii->getViewerUrl($submissionId); $url = ''; if (isset($resp['viewer_url'])) { - $url = (string)$resp['viewer_url']; + $url = (string) $resp['viewer_url']; } elseif (isset($resp['url'])) { - $url = (string)$resp['url']; + $url = (string) $resp['url']; + } elseif (isset($resp['launch_url'])) { + $url = (string) $resp['launch_url']; + } + if ($url === '') { + throw new Exception('viewer-url response has no url: ' . json_encode($resp, JSON_UNESCAPED_UNICODE)); } // 默认 2 小时过期,保守起见 return ['url' => $url, 'expire' => time() + 7200]; diff --git a/application/common/TurnitinService.php b/application/common/TurnitinService.php index c858516e..3604442f 100644 --- a/application/common/TurnitinService.php +++ b/application/common/TurnitinService.php @@ -291,25 +291,90 @@ class TurnitinService * * 返回 viewer_url(数小时有效) * - * @param array $viewer 可选 viewer 设置 e.g. ['viewer_default_permission_set' => 'INSTRUCTOR'] + * TCA 要求 default_mode 为小写(如 match_overview);save_changes 等 LTI 字段会导致 400。 + * Crossref 通道常用 ADMINISTRATOR/USER,非 INSTRUCTOR。可在 .env 配置: + * turnitin.viewer_permission_set=ADMINISTRATOR + * + * @param array $viewer 可选,覆盖默认 viewer 请求体字段 */ public function getViewerUrl($submissionId, $viewer = []) { - $body = array_merge([ - 'viewer_default_permission_set' => 'INSTRUCTOR', - 'similarity' => [ - 'default_mode' => 'MATCH_OVERVIEW', - 'view_settings' => ['save_changes' => true], - 'modes' => ['match_overview' => true, 'all_sources' => true], - ], - 'locale' => 'en-US', - ], $viewer); + $submissionId = trim((string) $submissionId); + if ($submissionId === '') { + throw new Exception('submissionId required for viewer-url'); + } - return $this->request( - 'POST', - '/submissions/' . urlencode($submissionId) . '/viewer-url', - $body - ); + $path = '/submissions/' . rawurlencode($submissionId) . '/viewer-url'; + $lastError = null; + + foreach ($this->buildViewerUrlBodies($viewer) as $body) { + try { + return $this->request('POST', $path, $body); + } catch (Exception $e) { + $lastError = $e; + if (strpos($e->getMessage(), 'HTTP 400') === false) { + throw $e; + } + } + } + + throw $lastError ?: new Exception('viewer-url failed'); + } + + /** + * 按优先级生成若干合法请求体(前者失败且为 400 时尝试后者)。 + * + * @return array + */ + private function buildViewerUrlBodies(array $viewerOverrides) + { + if (!empty($viewerOverrides)) { + return [$viewerOverrides]; + } + + $locale = trim((string) Env::get('turnitin.viewer_locale', 'en-US')) ?: 'en-US'; + $configured = trim((string) Env::get('turnitin.viewer_permission_set', '')); + $permissionSets = $configured !== '' + ? array_map('trim', explode(',', $configured)) + : $this->defaultViewerPermissionSets(); + + $bodies = []; + foreach ($permissionSets as $perm) { + if ($perm === '') { + continue; + } + $bodies[] = [ + 'viewer_default_permission_set' => $perm, + 'locale' => $locale, + 'similarity' => [ + 'default_mode' => 'match_overview', + 'modes' => [ + 'match_overview' => true, + 'all_sources' => true, + ], + ], + ]; + // 最简请求体(部分 Crossref 租户只接受 permission + locale) + $bodies[] = [ + 'viewer_default_permission_set' => $perm, + 'locale' => $locale, + ]; + } + + return $bodies; + } + + /** + * Crossref Similarity Check 通常不用 INSTRUCTOR;按常见可用角色排序尝试。 + * + * @return array + */ + private function defaultViewerPermissionSets() + { + if (stripos($this->baseUrl, 'crossref') !== false) { + return ['ADMINISTRATOR', 'USER', 'EDITOR', 'INSTRUCTOR']; + } + return ['INSTRUCTOR', 'ADMINISTRATOR', 'USER']; } /**