Files
tougao/application/common/ManuscriptBodyExtractor.php
2026-05-20 14:46:58 +08:00

357 lines
11 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use DOMDocument;
use DOMElement;
use DOMXPath;
use think\Exception;
use ZipArchive;
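/**
 * Builds a "body only" .docx from a submitted Word manuscript by slicing
 * word/document.xml block by block, so tables, images and styles are preserved.
 * Boundary detection uses only visible text (w:t); field instructions
 * (e.g. the JSON embedded by Zotero/EndNote) are never read.
 */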
class ManuscriptBodyExtractor
{
const BODY_SUBDIR = 'public/plagiarism/body_only';
const W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';
/** @var DOMDocument */
private $dom;
/** @var DOMElement */
private $bodyNode;
/** @var array<int,DOMElement> */
private $blocks = [];
/** @var array<int,string> */
private $blockTexts = [];
/**
 * Build a body-only copy of a DOCX manuscript.
 *
 * The document's top-level blocks are indexed, the start of the main body and
 * the start of the references/back matter are located from visible text, and
 * a new docx containing only the blocks in [body_start, ref_start) is written.
 *
 * @param string $sourcePath Path to the source .docx file.
 * @param int    $articleId  Article id; only used in the output file name.
 * @return array{path:string, rel_path:string, line_count:int, ref_start:int, body_start:int, warnings:array}
 *         "line_count" is the number of non-empty blocks that were kept;
 *         "ref_start" / "body_start" are block indexes into the source document.
 * @throws Exception When the file is unreadable, is not a .docx, has no blocks,
 *                   or no usable body range (at least 3 non-empty blocks) is found.
 */
public function buildBodyOnlyDocx($sourcePath, $articleId = 0)
{
    $sourcePath = trim((string) $sourcePath);
    if (!is_file($sourcePath) || !is_readable($sourcePath)) {
        throw new Exception('Manuscript not readable: ' . $sourcePath);
    }
    $ext = strtolower(pathinfo($sourcePath, PATHINFO_EXTENSION));
    if ($ext !== 'docx') {
        throw new Exception('body_only check requires DOCX manuscript, got: ' . $ext);
    }

    $this->loadDocumentBlocks($sourcePath);
    if (empty($this->blocks)) {
        throw new Exception('No content blocks in manuscript');
    }

    $refStart = $this->findReferenceStartIndex();
    $bodyStart = $this->findBodyStartIndex();
    $warnings = [];
    if ($refStart < 0) {
        // No references/back-matter heading: keep everything up to the end.
        $warnings[] = 'references_heading_not_found; using document end';
        $refStart = count($this->blocks);
    }
    if ($bodyStart >= $refStart) {
        throw new Exception('Could not locate main body (front matter may include entire document)');
    }

    // Count the non-empty blocks inside the kept range.
    $kept = 0;
    for ($i = $bodyStart; $i < $refStart; $i++) {
        if (trim($this->blockTexts[$i]) !== '') {
            $kept++;
        }
    }
    if ($kept < 3) {
        throw new Exception('Body content too short after extraction (' . $kept . ' non-empty blocks)');
    }

    $relPath = $this->sliceDocxToNewFile($sourcePath, $articleId, $bodyStart, $refStart);

    // Referencing an undefined constant is an Error on PHP 8 (and the literal
    // string 'ROOT_PATH' on PHP 7), so the fallback is only reachable when
    // the constant is checked with defined() first.
    $root = (defined('ROOT_PATH') && ROOT_PATH) ? ROOT_PATH : dirname(dirname(__DIR__));
    $rootDir = rtrim($root, '/\\');
    $absPath = $rootDir . DIRECTORY_SEPARATOR . str_replace(['/', '\\'], DIRECTORY_SEPARATOR, $relPath);

    return [
        'path' => $absPath,
        'rel_path' => $relPath,
        'line_count' => $kept,
        'ref_start' => $refStart,
        'body_start' => $bodyStart,
        'warnings' => $warnings,
    ];
}
/**
 * Read word/document.xml from the docx and index the top-level body blocks.
 *
 * Fills $this->dom, $this->bodyNode, $this->blocks (every element child of
 * w:body except w:sectPr) and $this->blockTexts (visible text of each block,
 * same indexes as $this->blocks).
 *
 * @param string $sourcePath Path to the source .docx file.
 * @throws Exception When the archive cannot be opened, document.xml is missing
 *                   or invalid, or no w:body element exists.
 */
private function loadDocumentBlocks($sourcePath)
{
    // Reset first so a failure below cannot leave blocks from an earlier
    // document paired with a newer DOM.
    $this->dom = null;
    $this->bodyNode = null;
    $this->blocks = [];
    $this->blockTexts = [];

    $zip = new ZipArchive();
    if ($zip->open($sourcePath) !== true) {
        throw new Exception('Cannot open docx: ' . $sourcePath);
    }
    $xml = $zip->getFromName('word/document.xml');
    $zip->close();
    if ($xml === false || $xml === '') {
        throw new Exception('word/document.xml missing in docx');
    }

    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false;
    $dom->formatOutput = false;
    // The manuscript is user-supplied: LIBXML_NONET forbids the parser from
    // fetching anything over the network while loading it.
    if (@$dom->loadXML($xml, LIBXML_NONET) === false) {
        throw new Exception('Invalid word/document.xml');
    }

    $xpath = new DOMXPath($dom);
    $xpath->registerNamespace('w', self::W_NS);
    $body = $xpath->query('//w:body')->item(0);
    if (!$body instanceof DOMElement) {
        throw new Exception('w:body not found');
    }

    $this->dom = $dom;
    $this->bodyNode = $body;
    foreach ($body->childNodes as $child) {
        if ($child->nodeType !== XML_ELEMENT_NODE) {
            continue;
        }
        /** @var DOMElement $child */
        if ($child->localName === 'sectPr') {
            // Section properties are not content blocks and are never indexed.
            continue;
        }
        $this->blocks[] = $child;
        $this->blockTexts[] = $this->extractVisibleTextFromBlock($child);
    }
}
/**
 * Return the visible text of one block.
 *
 * Only w:t descendants are concatenated; field instructions such as
 * w:instrText are not read, so Zotero-style JSON never influences the
 * boundary detection. Whitespace runs are collapsed to a single space and the
 * result is trimmed.
 *
 * @param DOMElement $block A top-level child of w:body.
 * @return string Collapsed visible text, or '' when the block has no w:t.
 */
private function extractVisibleTextFromBlock(DOMElement $block)
{
    $finder = new DOMXPath($block->ownerDocument);
    $finder->registerNamespace('w', self::W_NS);
    $textRuns = $finder->query('.//w:t', $block);
    if (!$textRuns || $textRuns->length === 0) {
        return '';
    }

    $joined = '';
    foreach ($textRuns as $run) {
        $joined .= $run->textContent;
    }

    $collapsed = preg_replace('/\s+/u', ' ', $joined);

    return trim((string) $collapsed);
}
/**
 * Copy the source docx and trim its word/document.xml to the body blocks.
 *
 * The copy keeps every other part of the package (styles, media, relations)
 * untouched; only top-level body blocks outside [$bodyStart, $refStart) are
 * removed. w:sectPr children of w:body are always kept and are not counted as
 * blocks, matching how the blocks were indexed when the document was loaded.
 *
 * @param string $sourcePath Path to the source .docx file.
 * @param int    $articleId  Article id; only used in the output file name.
 * @param int    $bodyStart  Index of the first block to keep.
 * @param int    $refStart   Index of the first block to drop after the body.
 * @return string Path of the new file relative to the project root
 *                (BODY_SUBDIR . '/' . file name).
 * @throws Exception On any I/O or structure problem; the partially written
 *                   output file is removed before the exception propagates.
 */
private function sliceDocxToNewFile($sourcePath, $articleId, $bodyStart, $refStart)
{
    // Referencing an undefined constant is an Error on PHP 8 (and the literal
    // string 'ROOT_PATH' on PHP 7), so check with defined() to make the
    // fallback reachable.
    $root = (defined('ROOT_PATH') && ROOT_PATH) ? ROOT_PATH : dirname(dirname(__DIR__));
    $rootDir = rtrim($root, '/\\');
    $dir = $rootDir . DIRECTORY_SEPARATOR . self::BODY_SUBDIR;
    // The trailing is_dir() tolerates another request creating the directory
    // between the first check and mkdir().
    if (!is_dir($dir) && !@mkdir($dir, 0755, true) && !is_dir($dir)) {
        throw new Exception('Cannot create output directory: ' . $dir);
    }

    $name = sprintf('body_article_%d_%s.docx', intval($articleId), date('Ymd_His'));
    $absPath = $dir . DIRECTORY_SEPARATOR . $name;
    if (!copy($sourcePath, $absPath)) {
        throw new Exception('Failed to copy source docx');
    }

    $expectedBlocks = count($this->blocks);
    $zip = new ZipArchive();
    $zipOpen = false;
    try {
        if ($zip->open($absPath) !== true) {
            throw new Exception('Cannot open output docx');
        }
        $zipOpen = true;

        $xml = $zip->getFromName('word/document.xml');
        if ($xml === false) {
            throw new Exception('document.xml missing in output docx');
        }

        $outDom = new DOMDocument();
        $outDom->preserveWhiteSpace = false;
        $outDom->formatOutput = false;
        // User-supplied XML: never let the parser reach out to the network.
        if (@$outDom->loadXML($xml, LIBXML_NONET) === false) {
            throw new Exception('Invalid document.xml in output docx');
        }

        $xpath = new DOMXPath($outDom);
        $xpath->registerNamespace('w', self::W_NS);
        $body = $xpath->query('//w:body')->item(0);
        if (!$body instanceof DOMElement) {
            throw new Exception('w:body not found in output docx');
        }

        // Snapshot the element children first: removing nodes while iterating
        // the live childNodes list would skip siblings.
        $children = [];
        foreach ($body->childNodes as $child) {
            if ($child->nodeType === XML_ELEMENT_NODE) {
                $children[] = $child;
            }
        }

        $blockIdx = 0;
        foreach ($children as $child) {
            if ($child->localName === 'sectPr') {
                continue;
            }
            if ($blockIdx < $bodyStart || $blockIdx >= $refStart) {
                $body->removeChild($child);
            }
            $blockIdx++;
        }
        // The indexes were computed on the earlier parse; refuse to write if
        // this parse does not see the same number of blocks.
        if ($blockIdx !== $expectedBlocks) {
            throw new Exception('Document block count mismatch during slice');
        }

        $newXml = $outDom->saveXML();
        if ($newXml === false || !$zip->addFromString('word/document.xml', $newXml)) {
            throw new Exception('Failed to write body-only docx');
        }

        // close() is what actually rewrites the archive; its result matters.
        $zipOpen = false;
        if (!$zip->close()) {
            throw new Exception('Failed to write body-only docx');
        }
        if (!is_file($absPath) || filesize($absPath) < 200) {
            throw new Exception('Failed to write body-only docx');
        }
    } catch (\Exception $e) {
        // Release the handle, then drop the unusable copy so failed runs do
        // not accumulate files in the output directory.
        if ($zipOpen) {
            @$zip->close();
        }
        @unlink($absPath);
        throw $e;
    }

    return self::BODY_SUBDIR . '/' . $name;
}
/**
 * Find the index of the first block that ends the main body.
 *
 * A block ends the body when either:
 *  - its text starts with a references heading word (references, reference,
 *    bibliography, 参考文献, 文献) followed by a word boundary; or
 *  - it lies past the first 40% of blocks and its whole text is one of the
 *    back-matter keywords, optionally followed by a single ASCII or
 *    full-width colon.
 *
 * @return int Block index, or -1 when no such block exists.
 */
private function findReferenceStartIndex()
{
    // All entries are already lower-case.
    $stopKeywords = [
        'acknowledgements', 'acknowledgments', 'funding', 'appendix', 'supplementary',
        'conflict of interest', 'competing interests', 'author contributions',
        '致谢', '基金', '附录', '补充材料', '利益冲突', '作者贡献',
    ];
    $total = count($this->blockTexts);

    foreach ($this->blockTexts as $i => $line) {
        $t = trim($line);
        if ($t === '') {
            continue;
        }
        if (preg_match('/^\s*(references?|bibliography|参考文献|文献)\b/iu', $t)) {
            return $i;
        }
        // Back-matter headings early in the document are ignored, so the
        // keyword test is only worth running past the 40% mark.
        if ($i <= $total * 0.4) {
            continue;
        }
        // mb_strtolower keeps multi-byte characters intact; one trailing
        // ASCII or full-width colon is dropped before the exact comparison.
        $bare = preg_replace('/[::]$/u', '', mb_strtolower($t, 'UTF-8'));
        if (in_array($bare, $stopKeywords, true)) {
            return $i;
        }
    }

    return -1;
}
/**
 * Locate the first block of the main body.
 *
 * Preference order: the first introduction-style heading anywhere in the
 * document; otherwise the block chosen after the first "Keyword(s)" line;
 * otherwise the front-matter fallback.
 *
 * @return int Block index where the body starts.
 */
private function findBodyStartIndex()
{
    $total = count($this->blockTexts);
    $firstKeywords = -1;

    foreach ($this->blockTexts as $pos => $raw) {
        $text = trim($raw);
        if ($text === '') {
            continue;
        }
        // An introduction heading always wins, so the scan can stop here.
        if ($this->isIntroductionHeading($text)) {
            return $pos;
        }
        if ($firstKeywords < 0 && preg_match('/^\s*keywords?\b\s*[:]?/iu', $text)) {
            $firstKeywords = $pos;
        }
    }

    if ($firstKeywords >= 0) {
        $candidate = $this->indexAfterKeywordsBlock($firstKeywords);
        if ($candidate < $total) {
            return $candidate;
        }
    }

    return $this->indexAfterFrontMatterFallback();
}
/**
 * Tell whether a block's text starts like an introduction/methods heading.
 *
 * Three patterns are tried in order: English headings, Chinese headings, and
 * a numbered "1. Introduction" style heading.
 *
 * @param string $t Visible text of a block.
 * @return bool True when any pattern matches at the start of the text.
 */
private function isIntroductionHeading($t)
{
    $headingPatterns = [
        '/^\s*(introduction|background|materials and methods|materials & methods|methods and materials)\b\s*[:]?/iu',
        '/^\s*(引言|前言|背景|材料与方法|资料与方法|研究方法)\b\s*[:]?/iu',
        '/^\s*1[\.\s、]+(introduction|引言|前言)\b/iu',
    ];

    foreach ($headingPatterns as $pattern) {
        if (preg_match($pattern, $t)) {
            return true;
        }
    }

    return false;
}
/**
 * Pick the body start when only a "Keywords" line was found.
 *
 * Scans forward from the block after the keywords line and returns the first
 * block that is an introduction heading, or that is at least 30 characters
 * long, does not start with "abstract" and does not look like an affiliation
 * line. Empty blocks are skipped.
 *
 * @param int $kwIdx Index of the keywords block.
 * @return int Chosen block index; when nothing qualifies, the block right
 *             after the keywords line, capped at the last block.
 */
private function indexAfterKeywordsBlock($kwIdx)
{
    $total = count($this->blockTexts);
    $pos = $kwIdx;

    while (++$pos < $total) {
        $text = trim($this->blockTexts[$pos]);
        if ($text === '') {
            continue;
        }
        if ($this->isIntroductionHeading($text)) {
            return $pos;
        }

        $startsWithAbstract = preg_match('/^\s*abstract\b/iu', $text);
        if (!$startsWithAbstract && mb_strlen($text) >= 30 && !$this->looksLikeAffiliationLine($text)) {
            return $pos;
        }
    }

    return min($kwIdx + 1, $total - 1);
}
/**
 * Last-resort guess for the body start.
 *
 * Looks for an introduction heading within the first 15% of blocks (at most
 * 20); when none is found, returns block 8, capped at the last block.
 *
 * @return int Block index where the body is assumed to start.
 */
private function indexAfterFrontMatterFallback()
{
    $total = count($this->blockTexts);
    $window = min(20, (int) floor($total * 0.15));

    foreach (array_slice($this->blockTexts, 0, $window) as $pos => $raw) {
        $text = trim($raw);
        if ($text !== '' && $this->isIntroductionHeading($text)) {
            return $pos;
        }
    }

    return min(8, max(0, $total - 1));
}
/**
 * Tell whether a block's text looks like front matter rather than body text.
 *
 * True for text containing an e-mail/affiliation marker anywhere (or starting
 * with a number followed by whitespace or a comma), and for text that starts
 * with "abstract" or "keyword(s)".
 *
 * @param string $t Visible text of a block.
 * @return bool
 */
private function looksLikeAffiliationLine($t)
{
    $hasAffiliationMarker = preg_match('/@|mailto:|correspond|univ|university|hospital|institute|department|^\d+[\s,]/iu', $t);
    if ($hasAffiliationMarker) {
        return true;
    }

    $startsWithAbstract = preg_match('/^\s*abstract\b/iu', $t);
    if ($startsWithAbstract) {
        return true;
    }

    return (bool) preg_match('/^\s*keywords?\b/iu', $t);
}
}