自动查重

This commit is contained in:
wangjinlei
2026-05-20 11:58:10 +08:00
parent 53e6ddbd9e
commit cfa3f791f4
11 changed files with 938 additions and 58 deletions

View File

@@ -1153,12 +1153,12 @@ class ArticleParserService
}
/**
* 提取 Word 文档中的参考文献列表(仅返回数组,不做入库
* @return array 每条为一个参考文献的纯文本字符串
* 按段落提取 Word 全文行(供正文裁切、参考文献识别等复用)
* @return array<int,string>
*/
public static function getReferencesFromWord($filePath): array
public static function collectParagraphLines($filePath): array
{
$othis = new self($filePath) ;
$othis = new self($filePath);
if (empty($othis->sections)) {
return [];
}
@@ -1166,13 +1166,26 @@ class ArticleParserService
$lines = [];
foreach ($othis->sections as $section) {
foreach ($section->getElements() as $element) {
$text = $othis->getTextFromElement($element);
$text = trim((string)$text);
if ($text === '') continue;
$lines[] = $text;
$text = trim((string) $othis->getTextFromElement($element));
if ($text === '') {
continue;
}
if (!mb_check_encoding($text, 'UTF-8')) {
$text = mb_convert_encoding($text, 'UTF-8', 'GBK');
}
$lines[] = preg_replace('/\s+/u', ' ', $text);
}
}
return $lines;
}
/**
* 提取 Word 文档中的参考文献列表(仅返回数组,不做入库)
* @return array 每条为一个参考文献的纯文本字符串
*/
public static function getReferencesFromWord($filePath): array
{
$lines = self::collectParagraphLines($filePath);
if (empty($lines)) {
return [];
}