自动查重

2026-05-20 14:46:58 +08:00
parent cfa3f791f4
commit 4940db73fe
5 changed files with 290 additions and 88 deletions
--- a/application/api/controller/Plagiarism.php
+++ b/application/api/controller/Plagiarism.php
@@ -197,16 +197,30 @@ class Plagiarism extends Base
            if ($needRefresh) {
                $svc = new PlagiarismService();
                $info = $svc->refreshViewerUrlFor($checkId);
+                if ($info['url'] === '') {
+                    return jsonError('Turnitin returned empty viewer_url');
+                }
                return jsonSuccess([
                    'view_only_url' => $info['url'],
                    'expire'        => $info['expire'],
+                    'has_pdf'       => !empty($info['local_pdf']),
                ]);
            }
            return jsonSuccess([
                'view_only_url' => $row['view_only_url'],
                'expire'        => intval($row['view_only_url_expire']),
+                'has_pdf'       => !empty($row['pdf_local_path']),
            ]);
        } catch (\Throwable $e) {
+            if (!empty($row['pdf_local_path'])) {
+                return jsonSuccess([
+                    'view_only_url' => '',
+                    'expire'        => 0,
+                    'has_pdf'       => true,
+                    'viewer_error'  => $e->getMessage(),
+                    'hint'          => '在线报告暂不可用，请使用 downloadReport 下载 PDF',
+                ]);
+            }
            return jsonError($e->getMessage());
        }
    }
--- a/application/common/ManuscriptBodyExtractor.php
+++ b/application/common/ManuscriptBodyExtractor.php
@@ -2,19 +2,33 @@

 namespace app\common;

-use PhpOffice\PhpWord\IOFactory;
-use PhpOffice\PhpWord\PhpWord;
+use DOMDocument;
+use DOMElement;
+use DOMXPath;
 use think\Exception;
+use ZipArchive;

 /**
- * 从投稿 Word 稿件生成「仅正文」版本：去掉文前题名/作者/单位等，去掉参考文献及之后内容。
+ * 从投稿 Word 生成「仅正文」docx：在 document.xml 上按块裁切，保留表格/图片/样式；
+ * 边界识别仅用可见文本（w:t），不读取域指令（Zotero/EndNote 的 JSON）。
 */
 class ManuscriptBodyExtractor
 {
    const BODY_SUBDIR = 'public/plagiarism/body_only';

+    const W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';
+
+    /** @var DOMDocument */
+    private $dom;
+
+    /** @var DOMElement */
+    private $bodyNode;
+
+    /** @var array<int,DOMElement> */
+    private $blocks = [];
+
    /** @var array<int,string> */
-    private $lines = [];
+    private $blockTexts = [];

    /**
     * @return array{path:string, rel_path:string, line_count:int, ref_start:int, body_start:int, warnings:array}
@@ -30,9 +44,9 @@ class ManuscriptBodyExtractor
            throw new Exception('body_only check requires DOCX manuscript, got: ' . $ext);
        }

-        $this->lines = ArticleParserService::collectParagraphLines($sourcePath);
-        if (empty($this->lines)) {
-            throw new Exception('No text extracted from manuscript');
+        $this->loadDocumentBlocks($sourcePath);
+        if (empty($this->blocks)) {
+            throw new Exception('No content blocks in manuscript');
        }

        $refStart = $this->findReferenceStartIndex();
@@ -41,32 +55,181 @@ class ManuscriptBodyExtractor

        if ($refStart < 0) {
            $warnings[] = 'references_heading_not_found; using document end';
-            $refStart = count($this->lines);
+            $refStart = count($this->blocks);
        }
        if ($bodyStart >= $refStart) {
            throw new Exception('Could not locate main body (front matter may include entire document)');
        }

-        $bodyLines = array_slice($this->lines, $bodyStart, $refStart - $bodyStart);
-        $bodyLines = $this->normalizeBodyLines($bodyLines);
-        if (count($bodyLines) < 3) {
-            throw new Exception('Body text too short after extraction (' . count($bodyLines) . ' paragraphs)');
+        $kept = 0;
+        for ($i = $bodyStart; $i < $refStart; $i++) {
+            if (trim($this->blockTexts[$i]) !== '') {
+                $kept++;
+            }
+        }
+        if ($kept < 3) {
+            throw new Exception('Body content too short after extraction (' . $kept . ' non-empty blocks)');
        }

-        $relPath = $this->writeBodyDocx($bodyLines, $articleId);
+        $relPath = $this->sliceDocxToNewFile($sourcePath, $articleId, $bodyStart, $refStart);
        $rootDir = rtrim(ROOT_PATH ?: dirname(dirname(__DIR__)), '/\\');
        $absPath = $rootDir . DIRECTORY_SEPARATOR . str_replace(['/', '\\'], DIRECTORY_SEPARATOR, $relPath);

        return [
            'path'       => $absPath,
            'rel_path'   => $relPath,
-            'line_count' => count($bodyLines),
+            'line_count' => $kept,
            'ref_start'  => $refStart,
            'body_start' => $bodyStart,
            'warnings'   => $warnings,
        ];
    }

+    /**
+     * Parse word/document.xml of $sourcePath and fill $this->blocks / $this->blockTexts with the
+     * element children of w:body (w:sectPr excluded) and their visible text, index-aligned.
+     * @throws Exception if the archive, document.xml or w:body cannot be read
+     */
+    private function loadDocumentBlocks($sourcePath)
+    {
+        $archive = new ZipArchive();
+        if ($archive->open($sourcePath) !== true) {
+            throw new Exception('Cannot open docx: ' . $sourcePath);
+        }
+        $documentXml = $archive->getFromName('word/document.xml');
+        $archive->close();
+        if ($documentXml === false || $documentXml === '') {
+            throw new Exception('word/document.xml missing in docx');
+        }
+
+        $this->dom = new DOMDocument();
+        $this->dom->preserveWhiteSpace = false;
+        $this->dom->formatOutput = false;
+        if (@$this->dom->loadXML($documentXml) === false) {
+            throw new Exception('Invalid word/document.xml');
+        }
+
+        $finder = new DOMXPath($this->dom);
+        $finder->registerNamespace('w', self::W_NS);
+        $bodyElement = $finder->query('//w:body')->item(0);
+        if (!$bodyElement instanceof DOMElement) {
+            throw new Exception('w:body not found');
+        }
+
+        $this->bodyNode = $bodyElement;
+        $this->blocks = [];
+        $this->blockTexts = [];
+        foreach ($bodyElement->childNodes as $node) {
+            // Keep element nodes only; the section properties are not a content block.
+            if ($node->nodeType === XML_ELEMENT_NODE && $node->localName !== 'sectPr') {
+                $this->blocks[] = $node;
+                $this->blockTexts[] = $this->extractVisibleTextFromBlock($node);
+            }
+        }
+    }
+
+    /**
+     * Visible text of one body block: every descendant w:t concatenated, whitespace runs collapsed
+     * to one space, trimmed ('' if there is no w:t). Field code (w:instrText) is never read.
+     */
+    private function extractVisibleTextFromBlock(DOMElement $block)
+    {
+        $finder = new DOMXPath($block->ownerDocument);
+        $finder->registerNamespace('w', self::W_NS);
+        $textNodes = $finder->query('.//w:t', $block);
+        if (!$textNodes || $textNodes->length === 0) {
+            return '';
+        }
+        $joined = '';
+        foreach ($textNodes as $textNode) {
+            $joined .= $textNode->textContent;
+        }
+        return trim((string) preg_replace('/\s+/u', ' ', $joined));
+    }
+
+    /**
+     * Copy the source docx and remove from the copy's word/document.xml every body block outside
+     * [$bodyStart, $refStart); w:sectPr is always kept. On any failure the copy is deleted again.
+     *
+     * @return string path of the new file relative to the project root
+     * @throws Exception when the copy cannot be created, parsed or rewritten
+     */
+    private function sliceDocxToNewFile($sourcePath, $articleId, $bodyStart, $refStart)
+    {
+        $rootDir = rtrim(ROOT_PATH ?: dirname(dirname(__DIR__)), '/\\');
+        $dir = $rootDir . DIRECTORY_SEPARATOR . self::BODY_SUBDIR;
+        // is_dir() is re-checked after mkdir() so losing a creation race is not treated as an error.
+        if (!is_dir($dir) && !@mkdir($dir, 0755, true) && !is_dir($dir)) {
+            throw new Exception('Cannot create output directory for body-only docx');
+        }
+        $name = sprintf('body_article_%d_%s.docx', intval($articleId), date('Ymd_His'));
+        $absPath = $dir . DIRECTORY_SEPARATOR . $name;
+        if (!copy($sourcePath, $absPath)) {
+            throw new Exception('Failed to copy source docx');
+        }
+
+        $zip = new ZipArchive();
+        $zipOpen = false;
+        try {
+            if ($zip->open($absPath) !== true) {
+                throw new Exception('Cannot open output docx');
+            }
+            $zipOpen = true;
+            $xml = $zip->getFromName('word/document.xml');
+            if ($xml === false) {
+                throw new Exception('document.xml missing in output docx');
+            }
+
+            $outDom = new DOMDocument();
+            $outDom->preserveWhiteSpace = false;
+            $outDom->formatOutput = false;
+            if (@$outDom->loadXML($xml) === false) {
+                throw new Exception('Invalid document.xml in output docx');
+            }
+
+            $xpath = new DOMXPath($outDom);
+            $xpath->registerNamespace('w', self::W_NS);
+            $body = $xpath->query('//w:body')->item(0);
+            if (!$body instanceof DOMElement) {
+                throw new Exception('w:body not found in output docx');
+            }
+
+            // Snapshot the children first: removing nodes while walking the live list skips siblings.
+            $blockIdx = 0;
+            foreach (iterator_to_array($body->childNodes) as $child) {
+                if (!($child instanceof DOMElement) || $child->localName === 'sectPr') {
+                    continue;
+                }
+                if ($blockIdx < $bodyStart || $blockIdx >= $refStart) {
+                    $body->removeChild($child);
+                }
+                $blockIdx++;
+            }
+            // The indices were computed on $this->blocks; refuse to slice a document that disagrees.
+            if ($blockIdx !== count($this->blocks)) {
+                throw new Exception('Document block count mismatch during slice');
+            }
+
+            // A failed write would leave the untouched full copy in place, so both results are checked.
+            $written = $zip->addFromString('word/document.xml', $outDom->saveXML());
+            $zipOpen = false;
+            if (!$zip->close() || !$written) {
+                throw new Exception('Failed to write body-only docx');
+            }
+            if (!is_file($absPath) || filesize($absPath) < 200) {
+                throw new Exception('Failed to write body-only docx');
+            }
+        } catch (\Throwable $e) {
+            if ($zipOpen) {
+                $zip->close();
+            }
+            @unlink($absPath);
+            throw $e;
+        }
+
+        return self::BODY_SUBDIR . '/' . $name;
+    }
+
    private function findReferenceStartIndex()
    {
        $stopKeywords = [
@@ -75,7 +238,7 @@ class ManuscriptBodyExtractor
            '致谢', '基金', '附录', '补充材料', '利益冲突', '作者贡献',
        ];

-        foreach ($this->lines as $i => $line) {
+        foreach ($this->blockTexts as $i => $line) {
            $t = trim($line);
            if ($t === '') {
                continue;
@@ -87,7 +250,7 @@ class ManuscriptBodyExtractor
            foreach ($stopKeywords as $sk) {
                $skLower = strtolower($sk);
                if ($lower === $skLower || $lower === $skLower . ':' || $lower === $skLower . '：') {
-                    if ($i > count($this->lines) * 0.4) {
+                    if ($i > count($this->blockTexts) * 0.4) {
                        return $i;
                    }
                }
@@ -98,12 +261,12 @@ class ManuscriptBodyExtractor

    private function findBodyStartIndex()
    {
-        $n = count($this->lines);
+        $n = count($this->blockTexts);
        $introIdx = -1;
        $keywordsIdx = -1;

        for ($i = 0; $i < $n; $i++) {
-            $t = trim($this->lines[$i]);
+            $t = trim($this->blockTexts[$i]);
            if ($t === '') {
                continue;
            }
@@ -145,9 +308,9 @@ class ManuscriptBodyExtractor

    private function indexAfterKeywordsBlock($kwIdx)
    {
-        $n = count($this->lines);
+        $n = count($this->blockTexts);
        for ($i = $kwIdx + 1; $i < $n; $i++) {
-            $t = trim($this->lines[$i]);
+            $t = trim($this->blockTexts[$i]);
            if ($t === '') {
                continue;
            }
@@ -166,10 +329,10 @@ class ManuscriptBodyExtractor

    private function indexAfterFrontMatterFallback()
    {
-        $n = count($this->lines);
+        $n = count($this->blockTexts);
        $maxSkip = min(20, (int) floor($n * 0.15));
        for ($i = 0; $i < $maxSkip && $i < $n; $i++) {
-            $t = trim($this->lines[$i]);
+            $t = trim($this->blockTexts[$i]);
            if ($t === '') {
                continue;
            }
@@ -190,53 +353,4 @@ class ManuscriptBodyExtractor
        }
        return false;
    }
-
-    /**
-     * @param array<int,string> $bodyLines
-     * @return array<int,string>
-     */
-    private function normalizeBodyLines(array $bodyLines)
-    {
-        $out = [];
-        foreach ($bodyLines as $line) {
-            $line = trim($line);
-            if ($line === '') {
-                continue;
-            }
-            if (preg_match('/^\{\s*ADDIN\s+EN\.REFLIST\s*\}$/i', $line)) {
-                continue;
-            }
-            $out[] = $line;
-        }
-        return $out;
-    }
-
-    /**
-     * @param array<int,string> $bodyLines
-     */
-    private function writeBodyDocx(array $bodyLines, $articleId)
-    {
-        $rootDir = rtrim(ROOT_PATH ?: dirname(dirname(__DIR__)), '/\\');
-        $dir = $rootDir . DIRECTORY_SEPARATOR . self::BODY_SUBDIR;
-        if (!is_dir($dir)) {
-            @mkdir($dir, 0755, true);
-        }
-
-        $name = sprintf('body_article_%d_%s.docx', intval($articleId), date('Ymd_His'));
-        $absPath = $dir . DIRECTORY_SEPARATOR . $name;
-
-        $phpWord = new PhpWord();
-        $section = $phpWord->addSection();
-        foreach ($bodyLines as $line) {
-            $section->addText($line);
-        }
-        $writer = IOFactory::createWriter($phpWord, 'Word2007');
-        $writer->save($absPath);
-
-        if (!is_file($absPath) || filesize($absPath) < 200) {
-            throw new Exception('Failed to write body-only docx');
-        }
-
-        return self::BODY_SUBDIR . '/' . $name;
-    }
 }
--- a/application/common/PlagiarismService.php
+++ b/application/common/PlagiarismService.php
@@ -395,9 +395,14 @@ class PlagiarismService
        $resp = $tii->getViewerUrl($submissionId);
        $url = '';
        if (isset($resp['viewer_url'])) {
-            $url = (string)$resp['viewer_url'];
+            $url = (string) $resp['viewer_url'];
        } elseif (isset($resp['url'])) {
-            $url = (string)$resp['url'];
+            $url = (string) $resp['url'];
+        } elseif (isset($resp['launch_url'])) {
+            $url = (string) $resp['launch_url'];
+        }
+        if ($url === '') {
+            throw new Exception('viewer-url response has no url: ' . json_encode($resp, JSON_UNESCAPED_UNICODE));
        }
        // 默认 2 小时过期，保守起见
        return ['url' => $url, 'expire' => time() + 7200];
--- a/application/common/TurnitinService.php
+++ b/application/common/TurnitinService.php
@@ -291,25 +291,90 @@ class TurnitinService
     *
     * 返回 viewer_url（数小时有效）
     *
-     * @param array $viewer 可选 viewer 设置 e.g. ['viewer_default_permission_set' => 'INSTRUCTOR']
+     * TCA 要求 default_mode 为小写（如 match_overview）；save_changes 等 LTI 字段会导致 400。
+     * Crossref 通道常用 ADMINISTRATOR/USER，非 INSTRUCTOR。可在 .env 配置：
+     *   turnitin.viewer_permission_set=ADMINISTRATOR
+     *
+     * @param array $viewer 可选，覆盖默认 viewer 请求体字段
     */
    public function getViewerUrl($submissionId, $viewer = [])
    {
-        $body = array_merge([
-            'viewer_default_permission_set' => 'INSTRUCTOR',
-            'similarity' => [
-                'default_mode' => 'MATCH_OVERVIEW',
-                'view_settings' => ['save_changes' => true],
-                'modes' => ['match_overview' => true, 'all_sources' => true],
-            ],
-            'locale' => 'en-US',
-        ], $viewer);
+        $submissionId = trim((string) $submissionId);
+        if ($submissionId === '') {
+            throw new Exception('submissionId required for viewer-url');
+        }

-        return $this->request(
-            'POST',
-            '/submissions/' . urlencode($submissionId) . '/viewer-url',
-            $body
-        );
+        $path = '/submissions/' . rawurlencode($submissionId) . '/viewer-url';
+        $lastError = null;
+
+        foreach ($this->buildViewerUrlBodies($viewer) as $body) {
+            try {
+                return $this->request('POST', $path, $body);
+            } catch (Exception $e) {
+                $lastError = $e;
+                if (strpos($e->getMessage(), 'HTTP 400') === false) {
+                    throw $e;
+                }
+            }
+        }
+
+        throw $lastError ?: new Exception('viewer-url failed');
+    }
+
+    /**
+     * Candidate viewer-url request bodies in priority order (getViewerUrl() tries the next one
+     * only after an HTTP 400). Non-empty $viewerOverrides are returned verbatim as the only body.
+     *
+     * @return array<int,array>
+     */
+    private function buildViewerUrlBodies(array $viewerOverrides)
+    {
+        if (!empty($viewerOverrides)) {
+            return [$viewerOverrides];
+        }
+
+        $locale = trim((string) Env::get('turnitin.viewer_locale', 'en-US')) ?: 'en-US';
+        // Comma-separated setting; blank entries are dropped and, if none remain, defaults are used.
+        $configured = explode(',', (string) Env::get('turnitin.viewer_permission_set', ''));
+        $permissionSets = array_values(array_filter(array_map('trim', $configured), 'strlen'));
+        if (empty($permissionSets)) {
+            $permissionSets = $this->defaultViewerPermissionSets();
+        }
+
+        $bodies = [];
+        foreach ($permissionSets as $perm) {
+            $bodies[] = [
+                'viewer_default_permission_set' => $perm,
+                'locale'                        => $locale,
+                'similarity'                    => [
+                    'default_mode' => 'match_overview',
+                    'modes'        => [
+                        'match_overview' => true,
+                        'all_sources'    => true,
+                    ],
+                ],
+            ];
+            // Minimal body (author's note: some Crossref tenants only accept permission + locale).
+            $bodies[] = [
+                'viewer_default_permission_set' => $perm,
+                'locale'                        => $locale,
+            ];
+        }
+
+        return $bodies;
+    }
+
+    /**
+     * Permission sets to try, in order. Author's note: Crossref Similarity Check usually does not use INSTRUCTOR, so it goes last when baseUrl contains "crossref".
+     *
+     * @return array<int,string>
+     */
+    private function defaultViewerPermissionSets()
+    {
+        if (stripos($this->baseUrl, 'crossref') !== false) {
+            return ['ADMINISTRATOR', 'USER', 'EDITOR', 'INSTRUCTOR'];
+        }
+        return ['INSTRUCTOR', 'ADMINISTRATOR', 'USER'];
    }

    /**