Files
tougao/application/common/ManuscriptBodyExtractor.php
2026-05-20 11:58:10 +08:00

243 lines
7.7 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
use PhpOffice\PhpWord\IOFactory;
use PhpOffice\PhpWord\PhpWord;
use think\Exception;
/**
 * Builds a "body only" copy of a submitted Word manuscript: the front matter
 * (title, authors, affiliations, ...) is dropped, and so is the reference list
 * together with everything that follows it.
 */
class ManuscriptBodyExtractor
{
    /** Output directory for generated files, relative to the project root. */
    const BODY_SUBDIR = 'public/plagiarism/body_only';

    /** @var array<int,string> Paragraph lines of the manuscript being processed. */
    private $lines = [];

    /**
     * Extracts the main body of a DOCX manuscript and writes it to a new DOCX file.
     *
     * `ref_start` and `body_start` are indexes into the extracted paragraph list;
     * when no references heading is found, `ref_start` equals the paragraph count
     * and a warning is added.
     *
     * @param string     $sourcePath Path of the source manuscript (must be a readable .docx file).
     * @param int|string $articleId  Article id embedded in the generated file name.
     * @return array{path:string, rel_path:string, line_count:int, ref_start:int, body_start:int, warnings:array}
     * @throws Exception When the file is unreadable, is not DOCX, yields no text,
     *                   has no locatable body, or the output cannot be written.
     */
    public function buildBodyOnlyDocx($sourcePath, $articleId = 0)
    {
        $sourcePath = trim((string) $sourcePath);
        if (!is_file($sourcePath) || !is_readable($sourcePath)) {
            throw new Exception('Manuscript not readable: ' . $sourcePath);
        }
        $ext = strtolower(pathinfo($sourcePath, PATHINFO_EXTENSION));
        if ($ext !== 'docx') {
            throw new Exception('body_only check requires DOCX manuscript, got: ' . $ext);
        }

        // Re-index so that the positions found below are valid array_slice() offsets.
        $collected = ArticleParserService::collectParagraphLines($sourcePath);
        $this->lines = is_array($collected) ? array_values($collected) : [];
        if (empty($this->lines)) {
            throw new Exception('No text extracted from manuscript');
        }

        $refStart = $this->findReferenceStartIndex();
        $bodyStart = $this->findBodyStartIndex();
        $warnings = [];
        if ($refStart < 0) {
            $warnings[] = 'references_heading_not_found; using document end';
            $refStart = count($this->lines);
        }
        if ($bodyStart >= $refStart) {
            throw new Exception('Could not locate main body (front matter may include entire document)');
        }

        $bodyLines = array_slice($this->lines, $bodyStart, $refStart - $bodyStart);
        $bodyLines = $this->normalizeBodyLines($bodyLines);
        if (count($bodyLines) < 3) {
            throw new Exception('Body text too short after extraction (' . count($bodyLines) . ' paragraphs)');
        }

        $relPath = $this->writeBodyDocx($bodyLines, $articleId);
        $absPath = $this->resolveRootDir()
            . DIRECTORY_SEPARATOR
            . str_replace(['/', '\\'], DIRECTORY_SEPARATOR, $relPath);

        return [
            'path' => $absPath,
            'rel_path' => $relPath,
            'line_count' => count($bodyLines),
            'ref_start' => $refStart,
            'body_start' => $bodyStart,
            'warnings' => $warnings,
        ];
    }

    /**
     * Returns the project root without a trailing slash.
     *
     * Uses the ROOT_PATH constant when it is defined and non-empty; otherwise
     * falls back to the directory two levels above this file. The defined()
     * guard matters: reading an undefined constant is an Error on PHP 8.
     *
     * @return string
     */
    private function resolveRootDir()
    {
        $root = (defined('ROOT_PATH') && ROOT_PATH) ? ROOT_PATH : dirname(dirname(__DIR__));

        return rtrim($root, '/\\');
    }

    /**
     * Finds the index of the line where the back matter starts.
     *
     * The first line that begins with a references heading wins. A line that
     * consists solely of a stop keyword (optionally followed by an ASCII or
     * full-width colon) also ends the body, but only when it lies past the
     * first 40% of the document.
     *
     * @return int Line index, or -1 when nothing matched.
     */
    private function findReferenceStartIndex()
    {
        $stopKeywords = [
            'acknowledgements', 'acknowledgments', 'funding', 'appendix', 'supplementary',
            'conflict of interest', 'competing interests', 'author contributions',
            '致谢', '基金', '附录', '补充材料', '利益冲突', '作者贡献',
        ];
        // Stop keywords are only trusted in the later part of the document.
        $minStopIndex = count($this->lines) * 0.4;

        foreach ($this->lines as $i => $line) {
            $t = trim($line);
            if ($t === '') {
                continue;
            }
            if (preg_match('/^\s*(references|reference|bibliography|参考文献|文献)\b\s*[::]?\s*/iu', $t)) {
                return $i;
            }
            if ($i <= $minStopIndex) {
                continue;
            }
            // Multibyte-safe case folding; the line may contain non-ASCII text.
            $lower = mb_strtolower($t, 'UTF-8');
            // Drop one trailing colon (ASCII or full-width) before comparing.
            $heading = preg_replace('/[::]$/u', '', $lower);
            if ($heading === null) {
                // preg_replace() failed (e.g. invalid UTF-8); compare the raw text.
                $heading = $lower;
            }
            if (in_array($heading, $stopKeywords, true)) {
                return $i;
            }
        }

        return -1;
    }

    /**
     * Finds the index of the first line of the main body.
     *
     * Preference order: the first introduction-style heading anywhere in the
     * document, then the first body-like line after an English "Keyword(s)"
     * line, then a fixed-offset fallback.
     *
     * @return int
     */
    private function findBodyStartIndex()
    {
        $n = count($this->lines);
        $introIdx = -1;
        $keywordsIdx = -1;
        for ($i = 0; $i < $n; $i++) {
            $t = trim($this->lines[$i]);
            if ($t === '') {
                continue;
            }
            if ($introIdx < 0 && $this->isIntroductionHeading($t)) {
                $introIdx = $i;
            }
            if ($keywordsIdx < 0 && preg_match('/^\s*keywords?\b\s*[::]?/iu', $t)) {
                $keywordsIdx = $i;
            }
        }

        if ($introIdx >= 0) {
            return $introIdx;
        }
        if ($keywordsIdx >= 0) {
            $afterKw = $this->indexAfterKeywordsBlock($keywordsIdx);
            // Defensive bound check; indexAfterKeywordsBlock() already clamps to $n - 1.
            if ($afterKw < $n) {
                return $afterKw;
            }
        }

        return $this->indexAfterFrontMatterFallback();
    }

    /**
     * Tells whether a trimmed line starts with an introduction-style heading
     * (English or Chinese), optionally numbered "1" for the introduction itself.
     *
     * @param string $t Trimmed line text.
     * @return bool
     */
    private function isIntroductionHeading($t)
    {
        $patterns = [
            '/^\s*(introduction|background|materials and methods|materials & methods|methods and materials)\b\s*[::]?/iu',
            '/^\s*(引言|前言|背景|材料与方法|资料与方法|研究方法)\b\s*[::]?/iu',
            '/^\s*1[\.\s、]+(introduction|引言|前言)\b/iu',
        ];
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $t)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns the index of the first body-like line after the keywords line.
     *
     * A line qualifies when it is an introduction heading, or when it is at
     * least 30 characters long and does not look like an affiliation line.
     * Lines starting with "abstract" are skipped. When nothing qualifies, the
     * line right after the keywords line is used, clamped to the last line.
     *
     * @param int $kwIdx Index of the keywords line.
     * @return int
     */
    private function indexAfterKeywordsBlock($kwIdx)
    {
        $n = count($this->lines);
        for ($i = $kwIdx + 1; $i < $n; $i++) {
            $t = trim($this->lines[$i]);
            if ($t === '') {
                continue;
            }
            if ($this->isIntroductionHeading($t)) {
                return $i;
            }
            if (preg_match('/^\s*abstract\b/iu', $t)) {
                continue;
            }
            if (mb_strlen($t) >= 30 && !$this->looksLikeAffiliationLine($t)) {
                return $i;
            }
        }

        return min($kwIdx + 1, $n - 1);
    }

    /**
     * Last-resort body start used when neither an introduction heading nor a
     * keywords line was found.
     *
     * This is only reached when no line in the document is an introduction
     * heading, so scanning the leading lines for one could never succeed; a
     * fixed window of 8 lines is skipped instead, clamped to the last line.
     *
     * @return int
     */
    private function indexAfterFrontMatterFallback()
    {
        return min(8, max(0, count($this->lines) - 1));
    }

    /**
     * Heuristic: tells whether a line looks like front matter (e-mail address,
     * correspondence note, institution name, numbered affiliation, or an
     * abstract/keywords heading) rather than body text.
     *
     * @param string $t Trimmed line text.
     * @return bool
     */
    private function looksLikeAffiliationLine($t)
    {
        if (preg_match('/@|mailto:|correspond|univ|university|hospital|institute|department|^\d+[\s,]/iu', $t)) {
            return true;
        }
        if (preg_match('/^\s*abstract\b/iu', $t) || preg_match('/^\s*keywords?\b/iu', $t)) {
            return true;
        }

        return false;
    }

    /**
     * Trims every line and drops empty lines as well as lines that consist
     * solely of an "{ ADDIN EN.REFLIST }" field marker.
     *
     * @param array<int,string> $bodyLines
     * @return array<int,string>
     */
    private function normalizeBodyLines(array $bodyLines)
    {
        $out = [];
        foreach ($bodyLines as $line) {
            $line = trim($line);
            if ($line === '') {
                continue;
            }
            if (preg_match('/^\{\s*ADDIN\s+EN\.REFLIST\s*\}$/i', $line)) {
                continue;
            }
            $out[] = $line;
        }

        return $out;
    }

    /**
     * Writes the body lines to a new DOCX file, one paragraph per line.
     *
     * @param array<int,string> $bodyLines
     * @param int|string        $articleId Article id embedded in the file name.
     * @return string Path of the written file relative to the project root.
     * @throws Exception When the output directory or the file cannot be created.
     */
    private function writeBodyDocx(array $bodyLines, $articleId)
    {
        $dir = $this->resolveRootDir() . DIRECTORY_SEPARATOR . self::BODY_SUBDIR;
        // Warning suppressed on purpose: a concurrent request may create the
        // directory between the two checks, which the final is_dir() tolerates.
        if (!is_dir($dir) && !@mkdir($dir, 0755, true) && !is_dir($dir)) {
            throw new Exception('Failed to create body-only output directory: ' . $dir);
        }

        $name = sprintf('body_article_%d_%s.docx', intval($articleId), date('Ymd_His'));
        $absPath = $dir . DIRECTORY_SEPARATOR . $name;

        $phpWord = new PhpWord();
        $section = $phpWord->addSection();
        foreach ($bodyLines as $line) {
            // NOTE(review): PhpWord is documented to write text unescaped unless
            // Settings::setOutputEscapingEnabled(true) is set. Confirm that lines
            // containing "&" or "<" still produce a valid document here.
            $section->addText($line);
        }
        $writer = IOFactory::createWriter($phpWord, 'Word2007');
        $writer->save($absPath);

        // Sanity check: a file under 200 bytes cannot be a real DOCX package.
        if (!is_file($absPath) || filesize($absPath) < 200) {
            throw new Exception('Failed to write body-only docx');
        }

        return self::BODY_SUBDIR . '/' . $name;
    }
}