*/
    private $blocks = [];

    /** @var array Visible text of each entry in $blocks, index-aligned with it. */
    private $blockTexts = [];

    /**
     * Write a copy of a DOCX manuscript that keeps only the blocks between the
     * detected body start and the detected references/back-matter start.
     *
     * @param string $sourcePath Path to the source .docx file.
     * @param int    $articleId  Used only to build the output file name.
     *
     * @return array{path:string, rel_path:string, line_count:int, ref_start:int, body_start:int, warnings:array}
     *
     * @throws Exception When the source is unreadable, is not a .docx, has no
     *                   blocks, the body cannot be located or is too short, or
     *                   the sliced file cannot be written.
     */
    public function buildBodyOnlyDocx($sourcePath, $articleId = 0)
    {
        $sourcePath = trim((string) $sourcePath);
        if (!is_file($sourcePath) || !is_readable($sourcePath)) {
            throw new Exception('Manuscript not readable: ' . $sourcePath);
        }

        $ext = strtolower(pathinfo($sourcePath, PATHINFO_EXTENSION));
        if ($ext !== 'docx') {
            throw new Exception('body_only check requires DOCX manuscript, got: ' . $ext);
        }

        $this->loadDocumentBlocks($sourcePath);
        if (empty($this->blocks)) {
            throw new Exception('No content blocks in manuscript');
        }

        $refStart = $this->findReferenceStartIndex();
        $bodyStart = $this->findBodyStartIndex();

        $warnings = [];
        if ($refStart < 0) {
            // No references/back-matter heading: keep everything up to the last block.
            $warnings[] = 'references_heading_not_found; using document end';
            $refStart = count($this->blocks);
        }
        if ($bodyStart >= $refStart) {
            throw new Exception('Could not locate main body (front matter may include entire document)');
        }

        // Count the non-empty blocks that will survive the slice.
        $kept = 0;
        for ($i = $bodyStart; $i < $refStart; $i++) {
            if (trim($this->blockTexts[$i]) !== '') {
                $kept++;
            }
        }
        if ($kept < 3) {
            throw new Exception('Body content too short after extraction (' . $kept . ' non-empty blocks)');
        }

        $relPath = $this->sliceDocxToNewFile($sourcePath, $articleId, $bodyStart, $refStart);
        $absPath = $this->resolveProjectRootDir()
            . DIRECTORY_SEPARATOR
            . str_replace(['/', '\\'], DIRECTORY_SEPARATOR, $relPath);

        return [
            'path' => $absPath,
            'rel_path' => $relPath,
            'line_count' => $kept,
            'ref_start' => $refStart,
            'body_start' => $bodyStart,
            'warnings' => $warnings,
        ];
    }

    /**
     * Resolve the directory that output paths are relative to.
     *
     * Uses the ROOT_PATH constant when it is defined and non-empty, otherwise
     * the directory two levels above this file. The defined() guard matters:
     * referencing an undefined constant throws on PHP 8, so a bare
     * `ROOT_PATH ?: ...` never reaches its fallback in that case.
     *
     * @return string Root directory without a trailing slash or backslash.
     */
    private function resolveProjectRootDir()
    {
        $root = defined('ROOT_PATH') ? ROOT_PATH : '';
        if (!$root) {
            $root = dirname(dirname(__DIR__));
        }

        return rtrim((string) $root, '/\\');
    }

    /**
     * Parse word/document.xml from the given DOCX and fill $blocks / $blockTexts
     * with the element children of w:body (the top-level w:sectPr is skipped).
     *
     * @param string $sourcePath Path to the .docx file.
     *
     * @throws Exception When the archive cannot be opened, document.xml is
     *                   missing or invalid, or no w:body element exists.
     */
    private function loadDocumentBlocks($sourcePath)
    {
        $zip = new ZipArchive();
        if ($zip->open($sourcePath) !== true) {
            throw new Exception('Cannot open docx: ' . $sourcePath);
        }
        $xml = $zip->getFromName('word/document.xml');
        $zip->close();
        if ($xml === false || $xml === '') {
            throw new Exception('word/document.xml missing in docx');
        }

        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = false;
        $this->dom->formatOutput = false;
        // Parser warnings are suppressed; failure is reported through the exception below.
        if (@$this->dom->loadXML($xml) === false) {
            throw new Exception('Invalid word/document.xml');
        }

        $xpath = new DOMXPath($this->dom);
        $xpath->registerNamespace('w', self::W_NS);
        $body = $xpath->query('//w:body')->item(0);
        if (!$body instanceof DOMElement) {
            throw new Exception('w:body not found');
        }

        $this->bodyNode = $body;
        $this->blocks = [];
        $this->blockTexts = [];
        foreach ($body->childNodes as $child) {
            if ($child->nodeType !== XML_ELEMENT_NODE) {
                continue;
            }
            /** @var DOMElement $child */
            if ($child->localName === 'sectPr') {
                continue;
            }
            $this->blocks[] = $child;
            $this->blockTexts[] = $this->extractVisibleTextFromBlock($child);
        }
    }

    /**
     * Concatenate only the visible w:t text of a block, ignoring field
     * instructions such as w:instrText (so that Zotero JSON does not take part
     * in the slicing decisions).
     *
     * @param DOMElement $block A direct child of w:body.
     *
     * @return string Text with whitespace runs collapsed to one space and trimmed.
     */
    private function extractVisibleTextFromBlock(DOMElement $block)
    {
        $xpath = new DOMXPath($block->ownerDocument);
        $xpath->registerNamespace('w', self::W_NS);
        $nodes = $xpath->query('.//w:t', $block);
        if (!$nodes || $nodes->length === 0) {
            return '';
        }

        $parts = [];
        foreach ($nodes as $node) {
            $parts[] = $node->textContent;
        }
        $text = preg_replace('/\s+/u', ' ', implode('', $parts));

        // preg_replace() yields null on failure; the cast turns that into ''.
        return trim((string) $text);
    }

    /**
     * Copy the source DOCX into the body-only output directory and remove from
     * its document.xml every block outside [$bodyStart, $refStart).
     *
     * The copy is deleted again on every failure path so that an unsliced
     * manuscript is never left behind (or returned) as a "body-only" file.
     *
     * @param string $sourcePath Path to the source .docx file.
     * @param int    $articleId  Used in the output file name.
     * @param int    $bodyStart  Index of the first block to keep.
     * @param int    $refStart   Index of the first block to drop after the body.
     *
     * @return string Output path relative to the project root, '/'-separated.
     *
     * @throws Exception When the directory, the copy, the XML rewrite or the
     *                   archive write-back fails.
     */
    private function sliceDocxToNewFile($sourcePath, $articleId, $bodyStart, $refStart)
    {
        $dir = $this->resolveProjectRootDir() . DIRECTORY_SEPARATOR . self::BODY_SUBDIR;
        // is_dir() is re-checked after mkdir() so that another process creating
        // the directory at the same moment is not treated as a failure.
        if (!is_dir($dir) && !@mkdir($dir, 0755, true) && !is_dir($dir)) {
            throw new Exception('Cannot create output directory: ' . $dir);
        }

        $name = sprintf('body_article_%d_%s.docx', intval($articleId), date('Ymd_His'));
        $absPath = $dir . DIRECTORY_SEPARATOR . $name;
        if (!copy($sourcePath, $absPath)) {
            throw new Exception('Failed to copy source docx');
        }

        $zip = new ZipArchive();
        if ($zip->open($absPath) !== true) {
            @unlink($absPath);
            throw new Exception('Cannot open output docx');
        }

        try {
            $xml = $zip->getFromName('word/document.xml');
            if ($xml === false) {
                throw new Exception('document.xml missing in output docx');
            }

            $outDom = new DOMDocument();
            $outDom->preserveWhiteSpace = false;
            $outDom->formatOutput = false;
            if (@$outDom->loadXML($xml) === false) {
                throw new Exception('Invalid document.xml in output docx');
            }

            $xpath = new DOMXPath($outDom);
            $xpath->registerNamespace('w', self::W_NS);
            $body = $xpath->query('//w:body')->item(0);
            if (!$body instanceof DOMElement) {
                throw new Exception('w:body not found in output docx');
            }

            // Snapshot the blocks first: removing nodes while iterating the
            // live childNodes list would skip siblings.
            $candidates = [];
            foreach ($body->childNodes as $child) {
                if ($child->nodeType === XML_ELEMENT_NODE && $child->localName !== 'sectPr') {
                    $candidates[] = $child;
                }
            }

            // Indices must line up with the blocks seen by loadDocumentBlocks().
            if (count($candidates) !== count($this->blocks)) {
                throw new Exception('Document block count mismatch during slice');
            }

            foreach ($candidates as $blockIdx => $child) {
                if ($blockIdx < $bodyStart || $blockIdx >= $refStart) {
                    $body->removeChild($child);
                }
            }

            $slicedXml = $outDom->saveXML();
            if ($slicedXml === false || $zip->addFromString('word/document.xml', $slicedXml) !== true) {
                throw new Exception('Failed to write body-only docx');
            }
        } catch (Exception $e) {
            // Close before unlinking so the handle does not block deletion.
            $zip->close();
            @unlink($absPath);
            throw $e;
        }

        // The archive is only rewritten on close(); a failure here would leave
        // the full, unsliced copy on disk.
        if ($zip->close() !== true) {
            @unlink($absPath);
            throw new Exception('Failed to write body-only docx');
        }

        clearstatcache(true, $absPath);
        if (!is_file($absPath) || filesize($absPath) < 200) {
            @unlink($absPath);
            throw new Exception('Failed to write body-only docx');
        }

        return self::BODY_SUBDIR . '/' . $name;
    }

    /**
     * Find the first block that starts the references or other back matter.
     *
     * A block starting with a references/bibliography heading word matches
     * anywhere in the document. A block that consists solely of a back-matter
     * keyword (optionally followed by an ASCII or fullwidth colon) matches only
     * when it lies past the first 40% of the blocks.
     *
     * @return int Block index, or -1 when nothing matches.
     */
    private function findReferenceStartIndex()
    {
        $stopKeywords = [
            'acknowledgements',
            'acknowledgments',
            'funding',
            'appendix',
            'supplementary',
            'conflict of interest',
            'competing interests',
            'author contributions',
            '致谢',
            '基金',
            '附录',
            '补充材料',
            '利益冲突',
            '作者贡献',
        ];

        // Accepted spellings of a stop heading: bare, with ':' and with the
        // fullwidth colon U+FF1A (written as UTF-8 bytes).
        $stopForms = [];
        foreach ($stopKeywords as $keyword) {
            $keyword = mb_strtolower($keyword, 'UTF-8');
            $stopForms[$keyword] = true;
            $stopForms[$keyword . ':'] = true;
            $stopForms[$keyword . "\xEF\xBC\x9A"] = true;
        }

        $minStopIndex = count($this->blockTexts) * 0.4;
        foreach ($this->blockTexts as $i => $line) {
            $t = trim($line);
            if ($t === '') {
                continue;
            }
            if (preg_match('/^\s*(references|reference|bibliography|参考文献|文献)\b\s*[:\x{FF1A}]?\s*/iu', $t)) {
                return $i;
            }
            // mb_strtolower() keeps multibyte text intact; strtolower() is byte-wise.
            $lower = mb_strtolower($t, 'UTF-8');
            if ($i > $minStopIndex && isset($stopForms[$lower])) {
                return $i;
            }
        }

        return -1;
    }

    /**
     * Find the first block of the main body.
     *
     * Preference order: the first introduction-style heading, then the first
     * substantial block after a "Keywords" line, then a fixed fallback.
     *
     * @return int Block index.
     */
    private function findBodyStartIndex()
    {
        $n = count($this->blockTexts);
        $introIdx = -1;
        $keywordsIdx = -1;
        for ($i = 0; $i < $n; $i++) {
            $t = trim($this->blockTexts[$i]);
            if ($t === '') {
                continue;
            }
            if ($introIdx < 0 && $this->isIntroductionHeading($t)) {
                $introIdx = $i;
            }
            if ($keywordsIdx < 0 && preg_match('/^\s*keywords?\b\s*[:\x{FF1A}]?/iu', $t)) {
                $keywordsIdx = $i;
            }
        }

        if ($introIdx >= 0) {
            return $introIdx;
        }
        if ($keywordsIdx >= 0) {
            $afterKw = $this->indexAfterKeywordsBlock($keywordsIdx);
            if ($afterKw < $n) {
                return $afterKw;
            }
        }

        return $this->indexAfterFrontMatterFallback();
    }

    /**
     * Tell whether a block's text starts with an introduction, background or
     * methods heading, in English or Chinese, optionally numbered "1.".
     *
     * @param string $t Block text.
     *
     * @return bool
     */
    private function isIntroductionHeading($t)
    {
        if (preg_match('/^\s*(introduction|background|materials and methods|materials & methods|methods and materials)\b\s*[:\x{FF1A}]?/iu', $t)) {
            return true;
        }
        if (preg_match('/^\s*(引言|前言|背景|材料与方法|资料与方法|研究方法)\b\s*[:\x{FF1A}]?/iu', $t)) {
            return true;
        }
        if (preg_match('/^\s*1[\.\s、]+(introduction|引言|前言)\b/iu', $t)) {
            return true;
        }

        return false;
    }

    /**
     * Starting after the "Keywords" block, return the first block that is an
     * introduction heading, or that is at least 30 characters long and does not
     * look like an affiliation/abstract/keywords line.
     *
     * @param int $kwIdx Index of the keywords block.
     *
     * @return int Block index; when nothing qualifies, the block right after
     *             the keywords block, capped at the last block.
     */
    private function indexAfterKeywordsBlock($kwIdx)
    {
        $n = count($this->blockTexts);
        for ($i = $kwIdx + 1; $i < $n; $i++) {
            $t = trim($this->blockTexts[$i]);
            if ($t === '') {
                continue;
            }
            if ($this->isIntroductionHeading($t)) {
                return $i;
            }
            if (preg_match('/^\s*abstract\b/iu', $t)) {
                continue;
            }
            if (mb_strlen($t) >= 30 && !$this->looksLikeAffiliationLine($t)) {
                return $i;
            }
        }

        return min($kwIdx + 1, $n - 1);
    }

    /**
     * Last-resort body start: scan the leading blocks (at most 20, and at most
     * 15% of the document) for an introduction heading, otherwise return block
     * 8 capped at the last block.
     *
     * @return int Block index.
     */
    private function indexAfterFrontMatterFallback()
    {
        $n = count($this->blockTexts);
        $maxSkip = min(20, (int) floor($n * 0.15));
        for ($i = 0; $i < $maxSkip && $i < $n; $i++) {
            $t = trim($this->blockTexts[$i]);
            if ($t === '') {
                continue;
            }
            if ($this->isIntroductionHeading($t)) {
                return $i;
            }
        }

        return min(8, max(0, $n - 1));
    }

    /**
     * Heuristic for front-matter lines: e-mail or correspondence markers,
     * institution words, a leading number followed by whitespace or a comma
     * (ASCII or fullwidth), or an "Abstract"/"Keywords" heading.
     *
     * @param string $t Block text.
     *
     * @return bool
     */
    private function looksLikeAffiliationLine($t)
    {
        // "univ" already covers "university"; \x{FF0C} is the fullwidth comma.
        if (preg_match('/@|mailto:|correspond|univ|hospital|institute|department|^\d+[\s,\x{FF0C}]/iu', $t)) {
            return true;
        }
        if (preg_match('/^\s*abstract\b/iu', $t) || preg_match('/^\s*keywords?\b/iu', $t)) {
            return true;
        }

        return false;
    }
}