自动推广

This commit is contained in:
wangjinlei
2026-04-03 11:45:45 +08:00
parent 22947a56a4
commit a802b2e923
11 changed files with 2240 additions and 36 deletions

View File

@@ -1151,6 +1151,123 @@ class ArticleParserService
]
];
}
/**
 * Extract the reference list from a Word document (returns an array only; nothing is persisted).
 *
 * Processing steps:
 *  1. Flatten every element of every section into trimmed, non-empty text lines.
 *  2. Find the first line that starts with a references heading
 *     (references / reference / bibliography / 参考文献 / 文献).
 *  3. Collect the following lines until a closing-section heading (acknowledgements, funding, ...).
 *  4. Merge the collected lines into individual reference entries.
 *
 * @param mixed $filePath Forwarded unchanged to the class constructor (presumably the path of the Word file).
 * @return array Each item is the whitespace-collapsed plain text of one reference; empty when
 *               the document has no sections, no text, or no references heading.
 */
public static function getReferencesFromWord($filePath): array
{
    $othis = new self($filePath);
    if (empty($othis->sections)) {
        return [];
    }

    // Flatten the document into trimmed, non-empty text lines.
    $lines = [];
    foreach ($othis->sections as $section) {
        foreach ($section->getElements() as $element) {
            $text = trim((string)$othis->getTextFromElement($element));
            if ($text !== '') {
                $lines[] = $text;
            }
        }
    }
    if (empty($lines)) {
        return [];
    }

    // A heading only has to match at the start of the line, so "References { ADDIN ... }" and
    // "References 1. ..." are still recognised. Both the ASCII and the full-width colon are
    // consumed; otherwise "参考文献:" would leave ":" behind as a bogus first entry.
    $headingPattern = '/^\s*(references|reference|bibliography|参考文献|文献)\b\s*[::]?\s*/iu';
    // EndNote / Word field code that may follow the heading or sit on its own line.
    $fieldCodePattern = '/^\{\s*ADDIN\s+EN\.REFLIST\s*\}$/i';

    // Locate the start of the references section; the heading line may already carry
    // the first reference after the heading text.
    $startIdx = -1;
    $startRemainder = '';
    foreach ($lines as $i => $line) {
        if (!preg_match($headingPattern, $line, $m)) {
            continue;
        }
        $startIdx = $i;
        // The pattern is anchored at the start, so the matched prefix can be cut off by byte length.
        $remainder = trim(substr($line, strlen($m[0])));
        if ($remainder !== '' && !preg_match($fieldCodePattern, $remainder)) {
            $startRemainder = $remainder;
        }
        break;
    }
    if ($startIdx < 0) {
        return [];
    }

    // Headings that usually follow the reference list; reaching one ends the collection.
    // All entries are lower-case because they are compared against the lower-cased line.
    $stopKeywords = [
        'acknowledgements', 'acknowledgments', 'funding', 'appendix', 'supplementary',
        'conflict of interest', 'competing interests', 'author contributions',
        '致谢', '基金', '附录', '补充材料', '利益冲突', '作者贡献',
    ];

    $raw = [];
    if ($startRemainder !== '') {
        $raw[] = $startRemainder;
    }
    $total = count($lines);
    for ($i = $startIdx + 1; $i < $total; $i++) {
        $line = $lines[$i];
        // Skip EndNote / Word field codes.
        if (preg_match($fieldCodePattern, $line)) {
            continue;
        }
        // A stop heading is the bare keyword, optionally followed by one ASCII or full-width colon.
        $heading = strtolower($line);
        $heading = preg_replace('/[::]$/u', '', $heading) ?? $heading;
        if (in_array($heading, $stopKeywords, true)) {
            break;
        }
        $raw[] = $line;
    }
    if (empty($raw)) {
        return [];
    }

    // Collapse whitespace runs to single spaces; fall back to the raw text if PCRE fails
    // (e.g. invalid UTF-8) so that a reference is never silently turned into an empty string.
    $normalize = static function (string $text): string {
        $collapsed = preg_replace('/\s+/u', ' ', $text);
        return trim($collapsed ?? $text);
    };

    // A new entry starts with "N." / "N)" / "[N]" / "N]".
    $markerPattern = '/^\s*(\[\d+\]|\d+\s*[\.\)]|\d+\s*\])\s*/u';
    $hasMarkers = false;
    foreach ($raw as $line) {
        if (preg_match($markerPattern, $line)) {
            $hasMarkers = true;
            break;
        }
    }

    // Without any textual numbering (author-date styles, or Word auto-numbering that is not
    // part of the text) there is nothing to tell a continuation line from a new entry.
    // Treat every paragraph as one reference instead of gluing the whole list into one string.
    if (!$hasMarkers) {
        return array_map($normalize, $raw);
    }

    // Numbered list: a marker opens a new entry, any other line continues the current one.
    $refs = [];
    $current = '';
    foreach ($raw as $line) {
        if (preg_match($markerPattern, $line)) {
            if ($current !== '') {
                $refs[] = $normalize($current);
            }
            $current = $line;
        } else {
            $current = $current === '' ? $line : $current . ' ' . $line;
        }
    }
    if ($current !== '') {
        $refs[] = $normalize($current);
    }
    return $refs;
}
/**
* 核心解码方法
* @param string $str 待解码字符串