自动推广

2026-04-03 11:45:45 +08:00
parent 22947a56a4
commit a802b2e923
11 changed files with 2240 additions and 36 deletions
--- a/application/common/ArticleParserService.php
+++ b/application/common/ArticleParserService.php
@@ -1151,6 +1151,123 @@ class ArticleParserService
            ]
        ];
    }
+
+    /**
+     * Extract the reference list from a Word document (returns an array only; nothing is stored).
+     *
+     * The document is flattened into non-empty text lines, the first line beginning with a
+     * reference heading is located, the lines after it are collected until a known closing
+     * heading, and wrapped lines are merged so that each array item is one reference.
+     * Lists whose lines carry no numeric marker are returned one paragraph per item.
+     *
+     * @param mixed $filePath Handed unchanged to the class constructor (presumably a Word file
+     *                        path; confirm against the constructor).
+     * @return array Plain-text reference strings; empty when no reference section is found.
+     */
+    public static function getReferencesFromWord($filePath): array
+    {
+        $othis = new self($filePath);
+        if (empty($othis->sections)) {
+            return [];
+        }
+
+        // Flatten every element of every section into trimmed, non-empty text lines.
+        $lines = [];
+        foreach ($othis->sections as $section) {
+            foreach ($section->getElements() as $element) {
+                $text = trim((string)$othis->getTextFromElement($element));
+                if ($text !== '') {
+                    $lines[] = $text;
+                }
+            }
+        }
+
+        $headingPattern = '/^\s*(references|reference|bibliography|参考文献|文献)\b\s*[:：]?\s*/iu';
+        $fieldCodePattern = '/^\{\s*ADDIN\s+EN\.REFLIST\s*\}$/i';
+        $entryPattern = '/^\s*(\[\d+\]|\d+\s*[\.\)]|\d+\s*\])\s*/u';
+
+        // Find the heading. A prefix match is enough, so "References { ADDIN ... }" and
+        // "References 1. ..." are both recognised; text after the heading may be the first entry.
+        $startIdx = -1;
+        $raw = [];
+        foreach ($lines as $i => $line) {
+            if (preg_match($headingPattern, $line, $m) !== 1) {
+                continue;
+            }
+            $startIdx = $i;
+            // $m[0] is the matched heading prefix, so the remainder is whatever follows it.
+            $remainder = trim(substr($line, strlen($m[0])));
+            // An EndNote field code sitting on the heading line is not reference content.
+            if ($remainder !== '' && !preg_match($fieldCodePattern, $remainder)) {
+                $raw[] = $remainder;
+            }
+            break;
+        }
+
+        if ($startIdx < 0) {
+            return [];
+        }
+
+        // Headings that commonly follow the reference list. A line stops the scan only when it
+        // equals a keyword exactly, optionally followed by a half- or full-width colon.
+        $stopKeywords = [
+            'acknowledgements', 'acknowledgments', 'funding', 'appendix', 'supplementary',
+            'conflict of interest', 'competing interests', 'author contributions',
+            '致谢', '基金', '附录', '补充材料', '利益冲突', '作者贡献',
+        ];
+        // Precompute every accepted spelling once so each line needs a single lookup.
+        $stopLines = [];
+        foreach ($stopKeywords as $keyword) {
+            $stopLines[$keyword] = true;
+            $stopLines[$keyword . ':'] = true;
+            $stopLines[$keyword . '：'] = true;
+        }
+
+        // Collect the lines after the heading until a closing heading is reached.
+        foreach (array_slice($lines, $startIdx + 1) as $line) {
+            // Skip EndNote / Word field codes.
+            if (preg_match($fieldCodePattern, $line)) {
+                continue;
+            }
+            if (isset($stopLines[strtolower($line)])) {
+                break;
+            }
+            $raw[] = $line;
+        }
+
+        if (empty($raw)) {
+            return [];
+        }
+
+        // Collapse whitespace runs to one space. preg_replace() returns null when the subject is
+        // not valid UTF-8; fall back to the untouched text instead of emitting an empty entry.
+        $collapse = static function (string $text): string {
+            $collapsed = preg_replace('/\s+/u', ' ', $text);
+            return trim($collapsed === null ? $text : $collapsed);
+        };
+
+        // Entries are split on a leading "1.", "1)", "[1]" or "1]". When no line carries such a
+        // marker (for example when list numbering is not present in the extracted text), every
+        // line is kept as its own entry rather than merging the whole list into one string.
+        $hasMarkers = !empty(preg_grep($entryPattern, $raw));
+
+        $refs = [];
+        $current = '';
+        foreach ($raw as $line) {
+            $startsEntry = !$hasMarkers || preg_match($entryPattern, $line) === 1;
+            if ($startsEntry && $current !== '') {
+                $refs[] = $collapse($current);
+                $current = $line;
+            } else {
+                // First line, or a wrapped continuation of the current entry.
+                $current = $current === '' ? $line : $current . ' ' . $line;
+            }
+        }
+        // $raw is non-empty here, so the last entry is always pending.
+        $refs[] = $collapse($current);
+
+        return $refs;
+    }
    /**
     * 核心解码方法
     * @param string $str 待解码字符串