tougao/application/common/ManuscriptBodyExtractor.php

<?php

namespace app\common;

use DOMDocument;
use DOMElement;
use DOMXPath;
use think\Exception;
use ZipArchive;

/**
 * Builds a "body only" DOCX from a submitted Word manuscript.
 *
 * The top-level children of w:body in word/document.xml are treated as blocks;
 * blocks before the detected body start and from the detected references start
 * onwards are removed, while the remaining blocks are left untouched so tables,
 * images and styles are preserved.
 *
 * Boundary detection only looks at visible text (w:t). Field instructions
 * (w:instrText, e.g. the JSON payload written by Zotero/EndNote) are never read.
 */
class ManuscriptBodyExtractor
{
    const BODY_SUBDIR = 'public/plagiarism/body_only';

    const W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';

    /** @var DOMDocument Parsed word/document.xml of the source manuscript. */
    private $dom;

    /** @var DOMElement The w:body element of the source manuscript. */
    private $bodyNode;

    /** @var array<int,DOMElement> Top-level body blocks (everything except sectPr). */
    private $blocks = [];

    /** @var array<int,string> Whitespace-normalised visible text, parallel to $blocks. */
    private $blockTexts = [];

    /**
     * Extracts the main body of a DOCX manuscript into a new DOCX file.
     *
     * @param string $sourcePath Path of the manuscript; must be a readable .docx file.
     * @param int    $articleId  Only used to build the output file name.
     *
     * @return array{path:string, rel_path:string, line_count:int, ref_start:int, body_start:int, warnings:array}
     *
     * @throws Exception When the manuscript cannot be read or parsed, when no usable
     *                   body range is found, or when the output file cannot be written.
     */
    public function buildBodyOnlyDocx($sourcePath, $articleId = 0)
    {
        $sourcePath = trim((string) $sourcePath);
        if (!is_file($sourcePath) || !is_readable($sourcePath)) {
            throw new Exception('Manuscript not readable: ' . $sourcePath);
        }
        $ext = strtolower(pathinfo($sourcePath, PATHINFO_EXTENSION));
        if ($ext !== 'docx') {
            throw new Exception('body_only check requires DOCX manuscript, got: ' . $ext);
        }

        $this->loadDocumentBlocks($sourcePath);
        if (empty($this->blocks)) {
            throw new Exception('No content blocks in manuscript');
        }

        $refStart = $this->findReferenceStartIndex();
        $bodyStart = $this->findBodyStartIndex();
        $warnings = [];

        if ($refStart < 0) {
            // No references heading: keep everything up to the end of the document.
            $warnings[] = 'references_heading_not_found; using document end';
            $refStart = count($this->blocks);
        }
        if ($bodyStart >= $refStart) {
            throw new Exception('Could not locate main body (front matter may include entire document)');
        }

        // Count the blocks in the kept range that carry visible text.
        $kept = 0;
        for ($i = $bodyStart; $i < $refStart; $i++) {
            if (trim($this->blockTexts[$i]) !== '') {
                $kept++;
            }
        }
        if ($kept < 3) {
            throw new Exception('Body content too short after extraction (' . $kept . ' non-empty blocks)');
        }

        $relPath = $this->sliceDocxToNewFile($sourcePath, $articleId, $bodyStart, $refStart);
        $absPath = $this->resolveRootDir()
            . DIRECTORY_SEPARATOR
            . str_replace(['/', '\\'], DIRECTORY_SEPARATOR, $relPath);

        return [
            'path'       => $absPath,
            'rel_path'   => $relPath,
            'line_count' => $kept,
            'ref_start'  => $refStart,
            'body_start' => $bodyStart,
            'warnings'   => $warnings,
        ];
    }

    /**
     * Returns the project root without a trailing slash.
     *
     * Uses the ROOT_PATH constant when it is defined and non-empty (it is not
     * declared in this file; presumably the framework bootstrap defines it),
     * otherwise falls back to the directory two levels above this file. The
     * defined() guard matters: referencing an undefined constant is an Error on
     * PHP 8, so a bare `ROOT_PATH ?: ...` would never reach its fallback.
     *
     * @return string
     */
    private function resolveRootDir()
    {
        $root = defined('ROOT_PATH') ? constant('ROOT_PATH') : '';
        if (!$root) {
            $root = dirname(dirname(__DIR__));
        }
        return rtrim((string) $root, '/\\');
    }

    /**
     * Parses the text of a word/document.xml part.
     *
     * libxml diagnostics are collected internally instead of being emitted as PHP
     * warnings, and network access during parsing is disabled because the
     * manuscript is an uploaded file.
     *
     * @param string|false $xml          Raw XML as returned by ZipArchive::getFromName().
     * @param string       $missingError Message used when the part is absent or empty.
     * @param string       $invalidError Message used when the XML is not well-formed.
     *
     * @return DOMDocument
     *
     * @throws Exception
     */
    private function parseDocumentXml($xml, $missingError, $invalidError)
    {
        // An empty string must be rejected here: loadXML('') raises a ValueError on PHP 8.
        if ($xml === false || $xml === '') {
            throw new Exception($missingError);
        }

        $dom = new DOMDocument();
        $dom->preserveWhiteSpace = false;
        $dom->formatOutput = false;

        $previous = libxml_use_internal_errors(true);
        $loaded = $dom->loadXML($xml, LIBXML_NONET);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($loaded === false) {
            throw new Exception($invalidError);
        }
        return $dom;
    }

    /**
     * Finds the first w:body element of a parsed document.
     *
     * @return DOMElement|null
     */
    private function locateBody(DOMDocument $dom)
    {
        $xpath = new DOMXPath($dom);
        $xpath->registerNamespace('w', self::W_NS);
        $found = $xpath->query('//w:body');
        $body = $found ? $found->item(0) : null;
        return $body instanceof DOMElement ? $body : null;
    }

    /**
     * Snapshots the block-level children of a w:body element.
     *
     * Every element child except sectPr counts as a block. Loading and slicing
     * both use this method so that block indices computed on the source document
     * address the same nodes in the output copy. A plain array is returned because
     * removing nodes while iterating the live childNodes list would skip siblings.
     *
     * @return array<int,DOMElement>
     */
    private function collectBlockElements(DOMElement $body)
    {
        $elements = [];
        foreach ($body->childNodes as $child) {
            if ($child instanceof DOMElement && $child->localName !== 'sectPr') {
                $elements[] = $child;
            }
        }
        return $elements;
    }

    /**
     * Reads word/document.xml from the manuscript and fills $blocks / $blockTexts.
     *
     * @param string $sourcePath Path of the source docx.
     *
     * @throws Exception When the archive cannot be opened or the document part is
     *                   missing, empty, malformed, or has no w:body.
     */
    private function loadDocumentBlocks($sourcePath)
    {
        $zip = new ZipArchive();
        if ($zip->open($sourcePath) !== true) {
            throw new Exception('Cannot open docx: ' . $sourcePath);
        }
        $xml = $zip->getFromName('word/document.xml');
        $zip->close();

        $this->dom = $this->parseDocumentXml(
            $xml,
            'word/document.xml missing in docx',
            'Invalid word/document.xml'
        );

        $body = $this->locateBody($this->dom);
        if ($body === null) {
            throw new Exception('w:body not found');
        }

        $this->bodyNode = $body;
        $this->blocks = $this->collectBlockElements($body);
        $this->blockTexts = [];
        foreach ($this->blocks as $block) {
            $this->blockTexts[] = $this->extractVisibleTextFromBlock($block);
        }
    }

    /**
     * Concatenates the visible text (w:t descendants) of one block.
     *
     * Field instructions such as w:instrText are ignored so that reference-manager
     * JSON never takes part in boundary detection. Runs of whitespace collapse to a
     * single space and the result is trimmed.
     *
     * @return string Empty string when the block has no w:t descendants.
     */
    private function extractVisibleTextFromBlock(DOMElement $block)
    {
        $text = '';
        foreach ($block->getElementsByTagNameNS(self::W_NS, 't') as $node) {
            $text .= $node->textContent;
        }
        if ($text === '') {
            return '';
        }
        return trim((string) preg_replace('/\s+/u', ' ', $text));
    }

    /**
     * Copies the manuscript into the body-only directory and strips every block
     * outside [$bodyStart, $refStart) from the copy.
     *
     * The copy is deleted again when any later step fails, so a failed run does
     * not leave a full-text manuscript behind under the body-only directory.
     *
     * @param string $sourcePath Path of the source docx.
     * @param int    $articleId  Used in the output file name.
     * @param int    $bodyStart  Index of the first block to keep.
     * @param int    $refStart   Index of the first block to drop again (exclusive end).
     *
     * @return string Output path relative to the project root, using '/' separators.
     *
     * @throws Exception
     */
    private function sliceDocxToNewFile($sourcePath, $articleId, $bodyStart, $refStart)
    {
        $dir = $this->resolveRootDir() . DIRECTORY_SEPARATOR . self::BODY_SUBDIR;
        // The second is_dir() covers a concurrent request creating the directory first.
        if (!is_dir($dir) && !@mkdir($dir, 0755, true) && !is_dir($dir)) {
            throw new Exception('Failed to create output directory: ' . $dir);
        }

        $name = sprintf('body_article_%d_%s.docx', intval($articleId), date('Ymd_His'));
        $absPath = $dir . DIRECTORY_SEPARATOR . $name;

        if (!copy($sourcePath, $absPath)) {
            throw new Exception('Failed to copy source docx');
        }

        try {
            $this->rewriteDocumentXml($absPath, $bodyStart, $refStart);
        } catch (\Exception $e) {
            @unlink($absPath);
            throw $e;
        }

        return self::BODY_SUBDIR . '/' . $name;
    }

    /**
     * Rewrites word/document.xml inside an existing docx so that only the blocks
     * in [$bodyStart, $refStart) remain; sectPr children of w:body are kept.
     *
     * @param string $docxPath  Path of the docx to modify in place.
     * @param int    $bodyStart Index of the first block to keep.
     * @param int    $refStart  Exclusive end index of the kept range.
     *
     * @throws Exception When the archive cannot be read, parsed or written, or when
     *                   its block count differs from the loaded source document.
     */
    private function rewriteDocumentXml($docxPath, $bodyStart, $refStart)
    {
        $zip = new ZipArchive();
        if ($zip->open($docxPath) !== true) {
            throw new Exception('Cannot open output docx');
        }

        try {
            $outDom = $this->parseDocumentXml(
                $zip->getFromName('word/document.xml'),
                'document.xml missing in output docx',
                'Invalid document.xml in output docx'
            );

            $body = $this->locateBody($outDom);
            if ($body === null) {
                throw new Exception('w:body not found in output docx');
            }

            $outBlocks = $this->collectBlockElements($body);
            // Indices were computed on the source document; refuse to cut if the
            // copy does not expose the same number of blocks.
            if (count($outBlocks) !== count($this->blocks)) {
                throw new Exception('Document block count mismatch during slice');
            }

            foreach ($outBlocks as $idx => $block) {
                if ($idx < $bodyStart || $idx >= $refStart) {
                    $body->removeChild($block);
                }
            }

            $serialized = $outDom->saveXML();
            if ($serialized === false
                || $zip->addFromString('word/document.xml', $serialized) !== true
            ) {
                throw new Exception('Failed to write body-only docx');
            }
        } catch (\Exception $e) {
            $zip->close();
            throw $e;
        }

        // close() is where the archive is actually rewritten, so its result matters.
        if ($zip->close() !== true) {
            throw new Exception('Failed to write body-only docx');
        }

        clearstatcache(true, $docxPath);
        if (!is_file($docxPath) || filesize($docxPath) < 200) {
            throw new Exception('Failed to write body-only docx');
        }
    }

    /**
     * Finds the block where the back matter starts.
     *
     * The first block that begins with a references heading wins. Otherwise a
     * block consisting solely of a stop keyword (optionally followed by a colon)
     * is accepted, but only once past 40% of the document so that the same word
     * in the front matter does not truncate the body.
     *
     * @return int Block index, or -1 when no boundary is found.
     */
    private function findReferenceStartIndex()
    {
        $stopKeywords = [
            'acknowledgements', 'acknowledgments', 'funding', 'appendix', 'supplementary',
            'conflict of interest', 'competing interests', 'author contributions',
            '致谢', '基金', '附录', '补充材料', '利益冲突', '作者贡献',
        ];

        // Accepted spellings: the bare keyword, or the keyword followed by an
        // ASCII or a full-width colon.
        $stopLookup = [];
        foreach ($stopKeywords as $keyword) {
            $base = mb_strtolower($keyword, 'UTF-8');
            $stopLookup[$base] = true;
            $stopLookup[$base . ':'] = true;
            $stopLookup[$base . '：'] = true;
        }

        $minStopIndex = count($this->blockTexts) * 0.4;

        foreach ($this->blockTexts as $i => $line) {
            $t = trim($line);
            if ($t === '') {
                continue;
            }
            if (preg_match('/^\s*(references|reference|bibliography|参考文献|文献)\b\s*[:：]?\s*/iu', $t)) {
                return $i;
            }
            if ($i > $minStopIndex && isset($stopLookup[mb_strtolower($t, 'UTF-8')])) {
                return $i;
            }
        }
        return -1;
    }

    /**
     * Finds the block where the main body starts.
     *
     * Order of preference:
     *  1. the first introduction-style heading after the Keywords line. Structured
     *     abstracts contain paragraphs such as "Background: ..." that also look
     *     like introduction headings; taking the first match anywhere would pull
     *     the abstract and keywords into the body;
     *  2. the first introduction-style heading anywhere;
     *  3. the first body-looking block after the Keywords line;
     *  4. the fixed front-matter fallback.
     *
     * @return int Block index.
     */
    private function findBodyStartIndex()
    {
        $n = count($this->blockTexts);
        $firstIntroIdx = -1;
        $introAfterKeywordsIdx = -1;
        $keywordsIdx = -1;

        for ($i = 0; $i < $n; $i++) {
            $t = trim($this->blockTexts[$i]);
            if ($t === '') {
                continue;
            }
            if ($this->isIntroductionHeading($t)) {
                if ($firstIntroIdx < 0) {
                    $firstIntroIdx = $i;
                }
                if ($keywordsIdx >= 0 && $introAfterKeywordsIdx < 0) {
                    $introAfterKeywordsIdx = $i;
                }
            }
            if ($keywordsIdx < 0 && preg_match('/^\s*keywords?\b\s*[:：]?/iu', $t)) {
                $keywordsIdx = $i;
            }
        }

        if ($introAfterKeywordsIdx >= 0) {
            return $introAfterKeywordsIdx;
        }
        if ($firstIntroIdx >= 0) {
            return $firstIntroIdx;
        }

        if ($keywordsIdx >= 0) {
            $afterKw = $this->indexAfterKeywordsBlock($keywordsIdx);
            if ($afterKw < $n) {
                return $afterKw;
            }
        }

        return $this->indexAfterFrontMatterFallback();
    }

    /**
     * Tells whether a block starts like an introduction / background / methods
     * heading, in English or Chinese, optionally numbered "1".
     *
     * @param string $t Trimmed visible text of a block.
     *
     * @return bool
     */
    private function isIntroductionHeading($t)
    {
        $patterns = [
            '/^\s*(introduction|background|materials and methods|materials & methods|methods and materials)\b\s*[:：]?/iu',
            '/^\s*(引言|前言|背景|材料与方法|资料与方法|研究方法)\b\s*[:：]?/iu',
            '/^\s*1[\.\s、]+(introduction|引言|前言)\b/iu',
        ];
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $t)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Picks the body start among the blocks that follow the Keywords line.
     *
     * Returns the first introduction heading, or the first block of at least 30
     * characters that neither starts with "abstract" nor looks like an
     * affiliation line. When nothing qualifies, the block right after the
     * Keywords line is used (clamped to the last block).
     *
     * @param int $kwIdx Index of the Keywords block.
     *
     * @return int Block index.
     */
    private function indexAfterKeywordsBlock($kwIdx)
    {
        $n = count($this->blockTexts);
        for ($i = $kwIdx + 1; $i < $n; $i++) {
            $t = trim($this->blockTexts[$i]);
            if ($t === '') {
                continue;
            }
            if ($this->isIntroductionHeading($t)) {
                return $i;
            }
            if (preg_match('/^\s*abstract\b/iu', $t)) {
                continue;
            }
            if (mb_strlen($t, 'UTF-8') >= 30 && !$this->looksLikeAffiliationLine($t)) {
                return $i;
            }
        }
        return min($kwIdx + 1, $n - 1);
    }

    /**
     * Last-resort body start when the document has neither an introduction-style
     * heading nor a usable Keywords line: skip the first eight blocks, clamped to
     * the last block of short documents.
     *
     * This method is only reached when no block matches isIntroductionHeading(),
     * so scanning the leading blocks for such a heading here could never succeed.
     *
     * @return int Block index.
     */
    private function indexAfterFrontMatterFallback()
    {
        $n = count($this->blockTexts);
        return min(8, max(0, $n - 1));
    }

    /**
     * Heuristic test for front-matter lines: e-mail/correspondence markers,
     * institution words, a leading affiliation number, or an Abstract/Keywords
     * label.
     *
     * @param string $t Trimmed visible text of a block.
     *
     * @return bool
     */
    private function looksLikeAffiliationLine($t)
    {
        if (preg_match('/@|mailto:|correspond|univ|university|hospital|institute|department|^\d+[\s,，]/iu', $t)) {
            return true;
        }
        return (bool) (preg_match('/^\s*abstract\b/iu', $t) || preg_match('/^\s*keywords?\b/iu', $t));
    }
}