243 lines
7.7 KiB
PHP
243 lines
7.7 KiB
PHP
<?php
|
||
|
||
namespace app\common;
|
||
|
||
use PhpOffice\PhpWord\IOFactory;
|
||
use PhpOffice\PhpWord\PhpWord;
|
||
use think\Exception;
|
||
|
||
/**
 * Builds a "body only" version of a submitted Word manuscript: the front
 * matter (title, authors, affiliations, ...) is removed, and so is everything
 * from the references heading onwards.
 */
class ManuscriptBodyExtractor
{
    /** Output directory for generated files, relative to the project root. */
    const BODY_SUBDIR = 'public/plagiarism/body_only';

    /** @var array<int,string> Lines of the manuscript currently being processed. */
    private $lines = [];

    /**
     * Extracts the main body of a DOCX manuscript and writes it to a new DOCX.
     *
     * @param string $sourcePath Path of the source manuscript (must be a readable .docx file).
     * @param int    $articleId  Article id, used only in the generated file name.
     *
     * @return array{path:string, rel_path:string, line_count:int, ref_start:int, body_start:int, warnings:array}
     *
     * @throws Exception When the source is unreadable or not DOCX, when no text
     *                   is extracted, when no body range can be located, when
     *                   the body has fewer than 3 paragraphs, or when the
     *                   output file cannot be written.
     */
    public function buildBodyOnlyDocx($sourcePath, $articleId = 0)
    {
        $sourcePath = trim((string) $sourcePath);
        if (!is_file($sourcePath) || !is_readable($sourcePath)) {
            throw new Exception('Manuscript not readable: ' . $sourcePath);
        }

        $ext = strtolower(pathinfo($sourcePath, PATHINFO_EXTENSION));
        if ($ext !== 'docx') {
            throw new Exception('body_only check requires DOCX manuscript, got: ' . $ext);
        }

        // Re-index so that positional access (0..n-1) and foreach keys agree
        // in the finder methods below.
        $collected = ArticleParserService::collectParagraphLines($sourcePath);
        $this->lines = is_array($collected) ? array_values($collected) : [];
        if (empty($this->lines)) {
            throw new Exception('No text extracted from manuscript');
        }

        $warnings = [];
        $refStart = $this->findReferenceStartIndex();
        $bodyStart = $this->findBodyStartIndex();

        if ($refStart < 0) {
            // No references heading: keep everything up to the end of the document.
            $warnings[] = 'references_heading_not_found; using document end';
            $refStart = count($this->lines);
        }
        if ($bodyStart >= $refStart) {
            throw new Exception('Could not locate main body (front matter may include entire document)');
        }

        $bodyLines = $this->normalizeBodyLines(
            array_slice($this->lines, $bodyStart, $refStart - $bodyStart)
        );
        if (count($bodyLines) < 3) {
            throw new Exception('Body text too short after extraction (' . count($bodyLines) . ' paragraphs)');
        }

        $relPath = $this->writeBodyDocx($bodyLines, $articleId);
        $absPath = $this->projectRoot()
            . DIRECTORY_SEPARATOR
            . str_replace(['/', '\\'], DIRECTORY_SEPARATOR, $relPath);

        return [
            'path' => $absPath,
            'rel_path' => $relPath,
            'line_count' => count($bodyLines),
            'ref_start' => $refStart,
            'body_start' => $bodyStart,
            'warnings' => $warnings,
        ];
    }

    /**
     * Resolves the project root without a trailing slash.
     *
     * ROOT_PATH is used when it is defined and non-empty; otherwise the
     * directory two levels above this file is used. The defined() guard matters
     * because reading an undefined constant throws an Error on PHP 8.
     *
     * @return string
     */
    private function projectRoot()
    {
        $root = (defined('ROOT_PATH') && ROOT_PATH) ? ROOT_PATH : dirname(dirname(__DIR__));

        return rtrim($root, '/\\');
    }

    /**
     * Finds the index of the first line where the body should stop.
     *
     * A line starting with a references/bibliography heading word stops the
     * body wherever it occurs. A line that consists solely of a back-matter
     * heading (acknowledgements, funding, ...), optionally followed by an ASCII
     * or full-width colon, stops the body only when it lies past the first 40%
     * of the document.
     *
     * NOTE(review): the references test is a prefix match, so a body paragraph
     * beginning with e.g. "Reference values ..." also matches.
     *
     * @return int Line index, or -1 when no stop heading is found.
     */
    private function findReferenceStartIndex()
    {
        $stopKeywords = [
            'acknowledgements', 'acknowledgments', 'funding', 'appendix', 'supplementary',
            'conflict of interest', 'competing interests', 'author contributions',
            '致谢', '基金', '附录', '补充材料', '利益冲突', '作者贡献',
        ];

        // Pre-compute every accepted spelling: bare keyword, keyword + ':' and
        // keyword + full-width colon (U+FF1A, written as raw UTF-8 bytes).
        $stopHeadings = [];
        foreach ($stopKeywords as $keyword) {
            $keyword = mb_strtolower($keyword, 'UTF-8');
            $stopHeadings[$keyword] = true;
            $stopHeadings[$keyword . ':'] = true;
            $stopHeadings[$keyword . "\xEF\xBC\x9A"] = true;
        }

        // Back-matter headings are only trusted in the later part of the document.
        $minStopIndex = count($this->lines) * 0.4;

        foreach ($this->lines as $i => $line) {
            $text = trim($line);
            if ($text === '') {
                continue;
            }
            if (preg_match('/^\s*(references?|bibliography|参考文献|文献)\b/iu', $text)) {
                return $i;
            }
            if ($i > $minStopIndex && isset($stopHeadings[mb_strtolower($text, 'UTF-8')])) {
                return $i;
            }
        }

        return -1;
    }

    /**
     * Finds the index of the first body line.
     *
     * Preference order: the first introduction-style heading; otherwise the
     * first plausible body line after the first "Keywords" line; otherwise the
     * fixed front-matter fallback.
     *
     * @return int
     */
    private function findBodyStartIndex()
    {
        $total = count($this->lines);
        $keywordsIdx = -1;

        for ($i = 0; $i < $total; $i++) {
            $text = trim($this->lines[$i]);
            if ($text === '') {
                continue;
            }
            // The first introduction heading wins outright, so stop scanning.
            if ($this->isIntroductionHeading($text)) {
                return $i;
            }
            if ($keywordsIdx < 0 && preg_match('/^\s*keywords?\b/iu', $text)) {
                $keywordsIdx = $i;
            }
        }

        if ($keywordsIdx >= 0) {
            $afterKeywords = $this->indexAfterKeywordsBlock($keywordsIdx);
            if ($afterKeywords < $total) {
                return $afterKeywords;
            }
        }

        return $this->indexAfterFrontMatterFallback();
    }

    /**
     * Tells whether a line starts with a heading that opens the main body
     * (introduction / background / methods, in English or Chinese, or an
     * introduction heading numbered "1").
     *
     * @param string $t Trimmed line text.
     *
     * @return bool
     */
    private function isIntroductionHeading($t)
    {
        $plainHeading = '/^\s*(introduction|background|materials and methods|materials & methods|methods and materials'
            . '|引言|前言|背景|材料与方法|资料与方法|研究方法)\b/iu';
        if (preg_match($plainHeading, $t)) {
            return true;
        }

        // Numbered form such as "1. Introduction" or "1、引言".
        return (bool) preg_match('/^\s*1[\.\s、]+(introduction|引言|前言)\b/iu', $t);
    }

    /**
     * Finds the first line after the keywords line that looks like body text:
     * an introduction heading, or a line of at least 30 characters that does
     * not look like an affiliation/abstract/keywords line.
     *
     * @param int $kwIdx Index of the keywords line.
     *
     * @return int Index of the line found; when none qualifies, the line right
     *             after the keywords line, clamped to the last line.
     */
    private function indexAfterKeywordsBlock($kwIdx)
    {
        $total = count($this->lines);

        for ($i = $kwIdx + 1; $i < $total; $i++) {
            $text = trim($this->lines[$i]);
            if ($text === '') {
                continue;
            }
            if ($this->isIntroductionHeading($text)) {
                return $i;
            }
            if (preg_match('/^\s*abstract\b/iu', $text)) {
                continue;
            }
            if (mb_strlen($text) >= 30 && !$this->looksLikeAffiliationLine($text)) {
                return $i;
            }
        }

        return min($kwIdx + 1, $total - 1);
    }

    /**
     * Last-resort body start when neither an introduction heading nor a usable
     * keywords line was found.
     *
     * Scans the first min(20, 15% of the document) lines for an introduction
     * heading; failing that, returns index 8 clamped to the last line.
     *
     * NOTE(review): the only caller visible here reaches this method after a
     * full scan found no introduction heading, so the loop is presumably a
     * safety net; the window size does not influence the final fallback index.
     *
     * @return int
     */
    private function indexAfterFrontMatterFallback()
    {
        $total = count($this->lines);
        $window = min(20, (int) floor($total * 0.15));

        for ($i = 0; $i < $window && $i < $total; $i++) {
            $text = trim($this->lines[$i]);
            if ($text !== '' && $this->isIntroductionHeading($text)) {
                return $i;
            }
        }

        return min(8, max(0, $total - 1));
    }

    /**
     * Heuristic: does this line look like front matter (e-mail, correspondence
     * note, institution name, numbered affiliation, abstract or keywords line)?
     *
     * @param string $t Trimmed line text.
     *
     * @return bool
     */
    private function looksLikeAffiliationLine($t)
    {
        // "univ" also covers "university". A leading affiliation index may be
        // followed by whitespace, an ASCII comma or a full-width comma (U+FF0C).
        $affiliation = '/@|mailto:|correspond|univ|hospital|institute|department|^\d+[\s,\x{FF0C}]/iu';
        if (preg_match($affiliation, $t)) {
            return true;
        }

        return (bool) preg_match('/^\s*(abstract|keywords?)\b/iu', $t);
    }

    /**
     * Trims body lines and drops empty ones and EndNote reference-list field markers.
     *
     * @param array<int,string> $bodyLines
     *
     * @return array<int,string>
     */
    private function normalizeBodyLines(array $bodyLines)
    {
        $kept = [];
        foreach ($bodyLines as $raw) {
            $text = trim($raw);
            if ($text === '' || preg_match('/^\{\s*ADDIN\s+EN\.REFLIST\s*\}$/i', $text)) {
                continue;
            }
            $kept[] = $text;
        }

        return $kept;
    }

    /**
     * Writes the body lines, one paragraph per line, to a new DOCX file under
     * BODY_SUBDIR in the project root.
     *
     * @param array<int,string> $bodyLines
     * @param int               $articleId Used in the file name.
     *
     * @return string Path of the written file relative to the project root.
     *
     * @throws Exception When the output directory cannot be created or the
     *                   written file is missing or implausibly small.
     */
    private function writeBodyDocx(array $bodyLines, $articleId)
    {
        $dir = $this->projectRoot() . DIRECTORY_SEPARATOR . self::BODY_SUBDIR;
        // '@' keeps a concurrent-creation warning quiet; the outcome is checked
        // explicitly instead of being ignored.
        if (!is_dir($dir) && !@mkdir($dir, 0755, true) && !is_dir($dir)) {
            throw new Exception('Cannot create output directory: ' . $dir);
        }

        $name = sprintf('body_article_%d_%s.docx', intval($articleId), date('Ymd_His'));
        $absPath = $dir . DIRECTORY_SEPARATOR . $name;

        // PHPWord writes text verbatim unless output escaping is enabled, so
        // body text containing '&' or '<' would yield invalid document XML.
        // Enable escaping for this write only and restore the global setting.
        $canEscape = method_exists('PhpOffice\PhpWord\Settings', 'setOutputEscapingEnabled');
        $previousEscaping = false;
        if ($canEscape) {
            $previousEscaping = \PhpOffice\PhpWord\Settings::isOutputEscapingEnabled();
            \PhpOffice\PhpWord\Settings::setOutputEscapingEnabled(true);
        }

        try {
            $phpWord = new PhpWord();
            $section = $phpWord->addSection();
            foreach ($bodyLines as $line) {
                $section->addText($line);
            }
            IOFactory::createWriter($phpWord, 'Word2007')->save($absPath);
        } catch (\Exception $e) {
            if ($canEscape) {
                \PhpOffice\PhpWord\Settings::setOutputEscapingEnabled($previousEscaping);
            }
            throw $e;
        }
        if ($canEscape) {
            \PhpOffice\PhpWord\Settings::setOutputEscapingEnabled($previousEscaping);
        }

        // A valid DOCX (zip container) is never this small.
        if (!is_file($absPath) || filesize($absPath) < 200) {
            throw new Exception('Failed to write body-only docx');
        }

        return self::BODY_SUBDIR . '/' . $name;
    }
}
|