自动推广
This commit is contained in:
@@ -1151,6 +1151,123 @@ class ArticleParserService
|
||||
]
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Extract the reference list from a Word document (returns an array only; nothing is persisted).
 *
 * Processing steps:
 *  1. Flatten every element of every section into a list of trimmed, non-empty text lines.
 *  2. Locate the first line that starts with a reference heading
 *     ("References", "Reference", "Bibliography", "参考文献", "文献"), optionally followed by a colon.
 *     Text that follows the heading on the same line is kept as the start of the first entry,
 *     unless it is only the EndNote field code "{ ADDIN EN.REFLIST }".
 *  3. Collect the following lines until a line that consists solely of a known closing heading
 *     (acknowledgements, funding, appendix, ...), optionally followed by an ASCII or full-width colon.
 *  4. If at least one collected line starts with a textual number ("1.", "1)", "[1]", "1]"),
 *     lines without such a prefix are treated as continuations and appended to the current entry.
 *     If no line carries a textual number (e.g. Word auto-numbering or an author-year list),
 *     every line is returned as its own entry.
 *
 * NOTE(review): the first heading match wins, so a body paragraph that happens to begin with the
 * word "Reference" before the real list would be taken as the start — confirm this is acceptable.
 *
 * @param mixed $filePath Forwarded unchanged to the class constructor (presumably the path of the
 *                        Word file — confirm against the constructor).
 * @return string[] One whitespace-collapsed plain-text string per reference; empty array when the
 *                  document has no sections, no text, no reference heading, or no entries.
 */
public static function getReferencesFromWord($filePath): array
{
    $parser = new self($filePath);
    if (empty($parser->sections)) {
        return [];
    }

    // Step 1: flatten the document into trimmed, non-empty lines.
    $lines = [];
    foreach ($parser->sections as $section) {
        foreach ($section->getElements() as $element) {
            $text = trim((string) $parser->getTextFromElement($element));
            if ($text !== '') {
                $lines[] = $text;
            }
        }
    }
    if (empty($lines)) {
        return [];
    }

    // A prefix match is enough, so "References { ADDIN ... }" and "References 1. ..." are recognised.
    // \x{FF1A} is the full-width colon commonly used after Chinese headings.
    $headingPattern = '/^\s*(?:references|reference|bibliography|参考文献|文献)\b\s*[:\x{FF1A}]?\s*/iu';
    // EndNote / Word field code that may stand alone or directly follow the heading.
    $fieldCodePattern = '/^\{\s*ADDIN\s+EN\.REFLIST\s*\}$/i';
    // Start of a numbered entry: "[12]", "12.", "12)" or "12]". The "(?!\d)" after the dot keeps
    // continuation lines that begin with a DOI or decimal ("10.1016/...") from opening a new entry.
    $entryStartPattern = '/^\s*(?:\[\d+\]|\d+\s*(?:\.(?!\d)|\))|\d+\s*\])/u';

    // Step 2: find the heading line and whatever follows it on the same line.
    $startIdx = -1;
    $startRemainder = '';
    foreach ($lines as $i => $line) {
        if (preg_match($headingPattern, $line, $m) !== 1) {
            continue;
        }
        $startIdx = $i;
        // The match is anchored at the start, so its byte length is the offset of the remainder.
        $remainder = trim(substr($line, strlen($m[0])));
        if ($remainder !== '' && preg_match($fieldCodePattern, $remainder) !== 1) {
            $startRemainder = $remainder;
        }
        break;
    }
    if ($startIdx < 0) {
        return [];
    }

    // Step 3: collect the reference region; stop at the first closing section heading.
    // All keywords are already lower-case, so only the document line needs case folding.
    $stopKeywords = [
        'acknowledgements', 'acknowledgments', 'funding', 'appendix', 'supplementary',
        'conflict of interest', 'competing interests', 'author contributions',
        '致谢', '基金', '附录', '补充材料', '利益冲突', '作者贡献',
    ];

    $raw = [];
    if ($startRemainder !== '') {
        $raw[] = $startRemainder;
    }
    foreach (array_slice($lines, $startIdx + 1) as $line) {
        if (preg_match($fieldCodePattern, $line) === 1) {
            continue;
        }
        // Multibyte-safe case folding; byte-wise strtolower() is locale dependent before PHP 8.
        $lower = mb_strtolower($line, 'UTF-8');
        foreach ($stopKeywords as $keyword) {
            if ($lower === $keyword || $lower === $keyword . ':' || $lower === $keyword . "\u{FF1A}") {
                break 2;
            }
        }
        $raw[] = $line;
    }
    if (empty($raw)) {
        return [];
    }

    // Collapse whitespace runs; if preg_replace() fails (e.g. invalid UTF-8) keep the raw text
    // instead of silently turning the entry into an empty string.
    $normalize = static function (string $text): string {
        $collapsed = preg_replace('/\s+/u', ' ', $text);
        return trim($collapsed ?? $text);
    };

    // Step 4: decide whether the list carries textual numbering at all.
    $hasNumbering = false;
    foreach ($raw as $line) {
        if (preg_match($entryStartPattern, $line) === 1) {
            $hasNumbering = true;
            break;
        }
    }

    $refs = [];

    if (!$hasNumbering) {
        // Without textual numbers there is no way to tell continuations apart, and gluing the
        // whole section into one string would break the "one string per reference" contract.
        foreach ($raw as $line) {
            $entry = $normalize($line);
            if ($entry !== '') {
                $refs[] = $entry;
            }
        }
        return $refs;
    }

    // Numbered list: a numbered line opens a new entry, any other line continues the current one.
    $current = '';
    foreach ($raw as $line) {
        if (preg_match($entryStartPattern, $line) === 1) {
            if ($current !== '') {
                $entry = $normalize($current);
                if ($entry !== '') {
                    $refs[] = $entry;
                }
            }
            $current = $line;
        } else {
            $current = $current === '' ? $line : $current . ' ' . $line;
        }
    }
    if ($current !== '') {
        $entry = $normalize($current);
        if ($entry !== '') {
            $refs[] = $entry;
        }
    }

    return $refs;
}
|
||||
/**
|
||||
* 核心解码方法
|
||||
* @param string $str 待解码字符串
|
||||
|
||||
Reference in New Issue
Block a user