自动查重
This commit is contained in:
@@ -1153,12 +1153,12 @@ class ArticleParserService
     }
 
     /**
-     * 提取 Word 文档中的参考文献列表(仅返回数组,不做入库)
-     * @return array 每条为一个参考文献的纯文本字符串
+     * 按段落提取 Word 全文行(供正文裁切、参考文献识别等复用)
+     * @return array<int,string>
      */
-    public static function getReferencesFromWord($filePath): array
+    public static function collectParagraphLines($filePath): array
     {
-        $othis = new self($filePath) ;
+        $othis = new self($filePath);
         if (empty($othis->sections)) {
             return [];
         }
@@ -1166,13 +1166,26 @@ class ArticleParserService
         $lines = [];
         foreach ($othis->sections as $section) {
             foreach ($section->getElements() as $element) {
-                $text = $othis->getTextFromElement($element);
-                $text = trim((string)$text);
-                if ($text === '') continue;
-                $lines[] = $text;
+                $text = trim((string) $othis->getTextFromElement($element));
+                if ($text === '') {
+                    continue;
+                }
+                if (!mb_check_encoding($text, 'UTF-8')) {
+                    $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
+                }
+                $lines[] = preg_replace('/\s+/u', ' ', $text);
             }
         }
+        return $lines;
+    }
 
+    /**
+     * 提取 Word 文档中的参考文献列表(仅返回数组,不做入库)
+     * @return array 每条为一个参考文献的纯文本字符串
+     */
+    public static function getReferencesFromWord($filePath): array
+    {
+        $lines = self::collectParagraphLines($filePath);
         if (empty($lines)) {
             return [];
         }
Reference in New Issue
Block a user