升级

2025-11-03 09:37:14 +08:00
parent 65ab2379f8
commit c53ca3aa1f
1 changed files with 764 additions and 0 deletions
--- a/application/common/ArticleParserService.php
+++ b/application/common/ArticleParserService.php
@@ -0,0 +1,764 @@
+<?php
+namespace app\common;
+use PhpOffice\PhpWord\IOFactory;
+use think\Exception;
+use ZipArchive;
+use RecursiveIteratorIterator;
+use RecursiveDirectoryIterator;
+use PhpOffice\PhpWord\Settings;
+use PhpOffice\PhpWord\Element\TextRun;
+use DOMDocument;
+use DOMXPath;
+// use BadMethodCallException;
+class ArticleParserService
+{
+    private $phpWord;
+    private $sections;
+
+    /**
+     * Load a Word document and cache its sections for the extraction methods.
+     *
+     * @param string $filePath Path of the document to parse.
+     * @throws Exception When the file does not exist, or when the reader fails to load it.
+     */
+    public function __construct($filePath = '')
+    {
+        if (!file_exists($filePath)) {
+            throw new Exception("文档不存在：{$filePath}");
+        }
+        try {
+            // Key setting: keep "read data only" off so the section structure is preserved.
+            $reader = IOFactory::createReader();
+            $reader->setReadDataOnly(false);
+            Settings::setCompatibility(false);
+            Settings::setOutputEscapingEnabled(true); // avoid XML escaping conflicts
+
+            // Parse the file once and reuse the result.
+            $this->phpWord = $reader->load($filePath);
+            $this->sections = $this->phpWord->getSections();
+        } catch (\Exception $e) {
+            // A value returned from a constructor is discarded, so a load failure has to be
+            // raised; otherwise the object is left without sections and fails later on.
+            throw new Exception($e->getMessage(), (int) $e->getCode(), $e);
+        }
+    }
+
+    // 上传并解析文档的入口方法
+    /**
+     * Entry point: validate the uploaded file, parse it and return the extracted metadata.
+     *
+     * Status codes: 1 success, 2 missing path, 3 file not found, 4 file unreadable,
+     * 5 title not found, 6 the document could not be loaded or parsed.
+     *
+     * @param string $sFileUrl Local path of the uploaded document.
+     * @return string JSON string with `status`, `msg` and, on success, `data`
+     *                (title, author, report, company, corresponding, plus the fields
+     *                returned by extractFromWord()).
+     */
+    public static function uploadAndParse($sFileUrl){
+        // Required-value check
+        if(empty($sFileUrl)){
+            return json_encode(['status' => 2,'msg' => 'Please upload the submission file']);
+        }
+
+        // The file must exist and be readable
+        if (!file_exists($sFileUrl)) {
+            return json_encode(['status' => 3, 'msg' => 'The uploaded file does not exist']);
+        }
+        if (!is_readable($sFileUrl)) {
+            return json_encode(['status' => 4, 'msg' => 'The uploaded file is unreadable']);
+        }
+
+        try {
+            // Parse the document
+            $oDealFile = new self($sFileUrl);
+            // Title
+            $sTitle = $oDealFile->getTitle();
+            if(empty($sTitle)){
+                return json_encode(['status' => 5, 'msg' => 'Article title retrieval failed']);
+            }
+            // Authors
+            $aParam = ['title' => $sTitle];
+            $aAuthor = $oDealFile->getAuthors($aParam);
+            $aParam['author'] = empty($aAuthor['author']) ? [] : $aAuthor['author']; // all authors
+            $aParam['report'] = empty($aAuthor['report']) ? [] : $aAuthor['report']; // corresponding author names
+            // Institutions
+            $aParam['company'] = $oDealFile->getCompany($aParam);
+            // Corresponding author details
+            $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam);
+            // Keywords, abstract and funding; `+=` never overwrites the keys set above
+            $aContent = $oDealFile->extractFromWord();
+            $aParam += empty($aContent['data']) ? [] : $aContent['data'];
+        } catch (\Exception $e) {
+            // Report load/parse failures in the same JSON shape as every other failure
+            // instead of letting the exception escape from this entry point.
+            return json_encode(['status' => 6, 'msg' => 'Document parsing failed: ' . $e->getMessage()]);
+        }
+        return json_encode(['status' => 1,'msg' => 'success','data' => $aParam]);
+    }
+
+    // 提取文章标题
+    /**
+     * Return the document title: the first element whose trimmed text is longer
+     * than 10 characters.
+     *
+     * @return string The trimmed title text, or '' when no element qualifies.
+     */
+    private function getTitle(){
+        $minimumLength = 10; // shorter lines are not considered a title
+        foreach ($this->sections as $section) {
+            foreach ($section->getElements() as $element) {
+                $candidate = trim($this->getTextFromElement($element));
+                if (mb_strlen($candidate) > $minimumLength) {
+                    return $candidate;
+                }
+            }
+        }
+        return '';
+    }
+    // 提取作者
+ //    private function getAuthors($aParam = []) {
+ //        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
+ //        $sAuthorContent = $this->getNextParagraphAfterText($title);
+ //        if (empty($sAuthorContent)) {
+ //            return ['author' => [], 'report' => []];
+ //        }
+
+ //        //编码修复
+ //        $possibleEncodings = [
+ //            'Windows-1252', 'UTF-8', 'GBK', 'GB2312', 
+ //            'Latin-1', 'ISO-8859-1', 'CP1252'
+ //        ];
+ //        $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
+ //        $sAuthorContent = $encodedContent ?: $sAuthorContent;
+
+ //        //清理不可见字符
+ //        $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
+
+ //        //修复特殊符号乱码
+ //        $symbolMap = [
+ //            'â€ ' => '†', 'â  ' => '†', 'â' => '†', '?†' => '†',
+ //            'ï¼š' => ':', 'ï¼Œ' => ',', 'â€”' => '-',
+ //            '啊' => '' // 针对性移除异常字符“啊”（若为固定乱码）
+ //        ];
+ //        $sAuthorContent = strtr($sAuthorContent, $symbolMap);
+
+ //        //格式标准化
+ //        $sAuthorContent = str_replace(['，', ';', '；', '、'], ',', $sAuthorContent); // 统一分隔符
+ //        $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
+ //        $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
+ //        $sAuthorContent = trim($sAuthorContent);
+
+ //        // 处理作者
+ //        $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
+ //        $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
+ //        $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
+ //        $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式（防止被拆分）
+ //        //标记上标内的逗号+空格（多编号）
+ //        $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
+ //        // 原有步骤2：正则匹配（扩展上标符号支持，保持原有逻辑）
+ //        $pattern = '/
+ //            ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格）
+ //            \s*                         # 姓名与上标间空格
+ //            (                           # 上标组（扩展符号支持）
+ //                \d+                     # 起始数字
+ //                (?:[†#*,]|<SEP>\d+)*    # 允许：†#*符号、逗号、<SEP>+数字（兼容1,†、1,*等）
+ //            )
+ //            \s*,?                       # 作者间逗号（可选）
+ //            (?=\s|$)                    # 确保后面是空格或结尾
+ //        /ux';
+
+ //        preg_match_all($pattern, $tempStr, $matches);
+ //        $authorList = [];
+ //        if(!empty($matches[1])){
+ //            foreach ($matches[1] as $i => $name) {
+ //                $name = trim($name);
+ //                $superscript = trim($matches[2][$i]);
+ //                $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
+ //                $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
+ //                // 修复符号与数字间的空格（如原始"1 *"被误处理为"1*"的情况，保持原样）
+ //                $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
+ //                if (!empty($name)) {
+ //                    $authorList[] = [
+ //                        'name' => $name,
+ //                        'superscript' => $superscript
+ //                    ];
+ //                }
+ //            }
+ //        }else {
+ //            // 按“两个或多个连续空格”拆分（姓名之间的分隔）
+ //            $authorList = array_filter(
+ //                array_map('trim', 
+ //                    preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
+ //                )
+ //            );
+ //        }
+        
+
+ //        // //处理作者
+ //        // $authorList = [];
+ //        // // 新正则：匹配“姓名+上标”整体，允许上标含逗号（如1,†）
+ //        // // 逻辑：姓名以字母/中文开头，上标以数字开头、以符号/数字结尾
+ //        // // if (preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*([\d,†#*]+)/u', $sAuthorContent, $matches)) {
+ //        // if(preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*(\d[\d,†#\s*]*)/u', $sAuthorContent, $matches)){
+ //        //     for ($i = 0; $i < count($matches[1]); $i++) {
+ //        //         $authorList[] = trim($matches[1][$i] . $matches[2][$i]);
+ //        //     }
+ //        // } else {
+ //        //     // 按“两个或多个连续空格”拆分（姓名之间的分隔）
+ //        //     $authorList = array_filter(
+ //        //         array_map('trim', 
+ //        //             preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
+ //        //         )
+ //        //     );
+ //        // }
+ //        $aAuthorData = [];
+ //        $aReport = [];
+ //        $namePattern = '/
+ //            (?:[A-Za-z\s·\-\']+|                # 英文姓名（支持空格、连字符）
+ //             [\x{4e00}-\x{9fa5}]+|             # 中文姓名
+ //             [\x{1800}-\x{18AF}]+|             # 蒙古文姓名
+ //             [A-Z]\.)                           # 单字母缩写（如 J.）
+ //        /ux';
+ // var_dump($authorList);exit;
+ //        foreach ($authorList as $authorStr) {
+ //            if (empty($authorStr)) continue;
+ //            var_dump($authorList);exit;
+ //            //分离姓名与上标（支持上标含逗号，如1,†）
+ //            $superscript = '';
+ //            // 新正则：匹配以数字开头、含逗号/符号的完整上标（如1,†、2*#）
+ //            $authorStr = trim(trim($authorStr,','),' ');
+ //            // if (preg_match('/([\d,†#*]+)$/u', $authorStr, $supMatch)) {
+ //            // if(preg_match('/\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)){
+ //            // if (preg_match('/.*?\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)) {
+ //            // if (preg_match('/.*?\s*([\d,\x{2020}#* ]+?)\s*$/u', $authorStr, $supMatch)) {
+ //            // if (preg_match('/^(.+?)\D*?(\d[\d,#*†,\s]*)$/u', $authorStr, $supMatch)) {
+ //            //     $superscript = $supMatch[1];
+ //            //     // 移除上标，保留纯姓名（避免残留符号）
+ //            //     $nameStr = trim(preg_replace('/' . preg_quote($superscript, '/') . '$/', '', $authorStr));
+ //            // } else {
+ //            //     $nameStr = $authorStr;
+ //            // }
+ //            $pattern = '/^(.+?)\s*(\d[\d,#*†\s]*?)\s*$/u';
+ //            if (preg_match($pattern, $authorStr, $supMatch)) {
+ //                $nameStr = empty($supMatch[1]) ? '' : trim($supMatch[1]); // 姓名部分："Liguo Zhang"
+ //                $superscript = empty($supMatch[2]) ? $nameStr : $nameStr.trim($supMatch[2]); // 上标部分："1 
+ //                // echo "姓名: $nameStr, 上标: $superscript\n";
+ //            } else {
+ //                $nameStr = $authorStr;
+ //            }
+ //            //验证姓名合法性（过滤无效内容）
+ //            if (!preg_match($namePattern, $nameStr)) {
+ //                continue;
+ //            }
+ //            //解析上标信息（正确识别1,†中的机构编号和符号）
+ //            $companyId = '';
+ //            $isSuper = 0;
+ //            $isReport = 0;
+ //            if (!empty($superscript)) {
+ //                // 提取机构编号（忽略上标中的逗号，如1,† → 提取1）
+ //                if (preg_match('/(\d+)/', $superscript, $numMatch)) {
+ //                    $companyId = $numMatch[1];
+ //                }
+ //                // 识别特殊符号（#为超级作者，*†为通讯作者）
+ //                $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
+ //                $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
+ //            }
+ //            if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
+ //                $nameStr = trim($match[1]);
+ //            }
+ //            $aAuthorData[] = [
+ //                'name' => $nameStr,
+ //                'company_id' => $companyId,
+ //                'is_super' => $isSuper,
+ //                'is_report' => $isReport
+ //            ];
+ //            if ($isReport) {
+ //                $aReport[] = $nameStr;
+ //            }
+ //        }
+ //           var_dump($aAuthorData);exit;
+ //        return ['author' => $aAuthorData,'report' => array_unique($aReport)];
+ //    }
+    /**
+     * Parse the author line (the paragraph that follows the title) into author records.
+     *
+     * Each author is expected as "Name" followed by a superscript such as "1", "1,2",
+     * "1*", "1,†" or "2#": digits are institution numbers, "#" sets is_super and
+     * "*" / "†" set is_report. When no "name + digits" pair is found, the line is split on
+     * commas / runs of separators and each piece is used as both name and superscript.
+     *
+     * @param array $aParam Optional 'title'; when absent the title is looked up again.
+     * @return array ['author' => list of ['name', 'company_id' (list of digit strings),
+     *               'is_super', 'is_report'], 'report' => list of corresponding author names]
+     */
+    private function getAuthors($aParam = []) {
+        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
+        $sAuthorContent = $this->getNextParagraphAfterText($title);
+        if (empty($sAuthorContent)) {
+            return ['author' => [], 'report' => []];
+        }
+
+        // Transcode only when the text is not valid UTF-8. Guessing among several source
+        // encodings can turn valid UTF-8 into mojibake, and a candidate list containing an
+        // unknown encoding name makes mb_convert_encoding() throw on PHP 8.
+        if (!mb_check_encoding($sAuthorContent, 'UTF-8')) {
+            $encodedContent = mb_convert_encoding($sAuthorContent, 'UTF-8', 'Windows-1252');
+            $sAuthorContent = $encodedContent ?: $sAuthorContent;
+        }
+
+        // Strip control and zero-width characters; keep the text if the pattern cannot run
+        $stripped = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
+        if ($stripped !== null) {
+            $sAuthorContent = $stripped;
+        }
+
+        // Repair known mojibake sequences
+        $symbolMap = [
+            'â€ ' => '†', 'â  ' => '†', 'â' => '†', '?†' => '†',
+            'ï¼š' => ':', 'ï¼Œ' => ',', 'â€”' => '-',
+            '啊' => '' // drop the stray character "啊" (treated as a fixed garbling artifact)
+        ];
+        $sAuthorContent = strtr($sAuthorContent, $symbolMap);
+
+        // Normalise separators and whitespace
+        $sAuthorContent = str_replace(['，', ';', '；', '、'], ',', $sAuthorContent); // one separator
+        $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // "and" -> comma
+        $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // collapse whitespace
+        $sAuthorContent = trim($sAuthorContent);
+
+        $content = str_replace("\xC2\xA0", ' ', $sAuthorContent); // non-breaking space -> space
+        // Join "1 *" into "1*"; /u keeps the multi-byte dagger a single class member
+        $joined = preg_replace('/(\d+)\s*([*#†])/u', '$1$2', $content);
+        if ($joined !== null) {
+            $content = $joined;
+        }
+        // Protect commas between institution numbers ("1, 2") from the author split
+        $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
+        $pattern = '/
+            ([A-Za-z\s\.\-]+?)          # name (initials and spaces allowed)
+            \s*                         # optional space between name and superscript
+            (                           # superscript group
+                \d+                     # leading institution number
+                (?:[†#*,]|<SEP>\d+)*    # markers, commas, or <SEP> + number (1,† 1,* ...)
+            )
+            \s*,?                       # optional comma between authors
+            (?=\s|$)                    # must be followed by whitespace or the end
+        /ux';
+
+        $authorList = [];
+        preg_match_all($pattern, $tempStr, $matches);
+        if (!empty($matches[1])) {
+            foreach ($matches[1] as $i => $name) {
+                $name = trim($name);
+                if (empty($name)) {
+                    continue;
+                }
+                $superscript = str_replace('<SEP>', ',', trim($matches[2][$i])); // restore commas
+                $authorList[] = [
+                    'name' => $name,
+                    'superscript' => preg_replace('/,$/', '', $superscript), // drop trailing comma
+                ];
+            }
+        } else {
+            // No "name + number" pairs: split on commas or runs of separator characters and
+            // use each piece as both the name and the text scanned for markers.
+            $pieces = preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent);
+            foreach ((array) $pieces as $piece) {
+                $piece = trim($piece);
+                if (empty($piece)) {
+                    continue;
+                }
+                $authorList[] = ['name' => $piece, 'superscript' => $piece];
+            }
+        }
+
+        $aAuthorData = [];
+        $aReport = [];
+        foreach ($authorList as $author) {
+            $nameStr = $author['name'];
+            $superscript = $author['superscript'];
+
+            // Institution numbers: every digit run in the superscript
+            preg_match_all('/\d+/', $superscript, $numMatch);
+            // "#" marks a super author, "*" or "†" a corresponding author
+            $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
+            $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
+
+            // Keep only the leading Latin-letter part of the name when there is one
+            if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
+                $nameStr = trim($match[1]);
+            }
+            $aAuthorData[] = [
+                'name' => $nameStr,
+                'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
+                'is_super' => $isSuper,
+                'is_report' => $isReport
+            ];
+            if ($isReport) {
+                $aReport[] = $nameStr;
+            }
+        }
+        // Reindex after de-duplication so the list has no key gaps and encodes as a JSON array
+        return ['author' => $aAuthorData,'report' => array_values(array_unique($aReport))];
+    }
+
+    // 获取机构
+    /**
+     * Extract the numbered institution lines that follow the author paragraph.
+     *
+     * @param array $aParam Optional 'title' and 'authors' (raw author paragraph text);
+     *                      missing values are looked up in the document.
+     * @return array Map of institution number (digit string) => institution name;
+     *               empty when nothing is found.
+     */
+    private function getCompany($aParam = []){
+        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
+        // Raw author paragraph: the anchor after which institutions are listed
+        $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];
+        $sCompany = $this->getContentAfterText($sAuthorContent);
+        if(empty($sCompany)){
+            return [];
+        }
+        // Transcode only when the text is not valid UTF-8. Guessing among several source
+        // encodings can turn valid UTF-8 into mojibake, and a candidate list containing an
+        // unknown encoding name makes mb_convert_encoding() throw on PHP 8.
+        if (!mb_check_encoding($sCompany, 'UTF-8')) {
+            $encodedContent = mb_convert_encoding($sCompany, 'UTF-8', 'Windows-1252');
+            $sCompany = $encodedContent ?: $sCompany;
+        }
+
+        // One institution per line; only lines that start with a number are kept
+        $sCompany = str_replace(["\r\n", "\r"], "\n", $sCompany);
+        $aCompany = [];
+        foreach (explode("\n", $sCompany) as $line) {
+            if (!preg_match('/^(\d+)\s*(.+)$/', trim($line), $match)) {
+                continue;
+            }
+            if (empty($match[1]) || empty($match[2])) {
+                continue;
+            }
+            // Drop the dot after the number ("1. Dept") and any surrounding dots/spaces
+            $aCompany[$match[1]] = ltrim(trim(ltrim($match[2]),'.'),' ');
+        }
+        return $aCompany;
+    }
+
+    // 提取通讯作者（含E-mail、地址、电话）
+    /**
+     * Extract contact details (e-mail, postal address, telephone) of the corresponding authors.
+     *
+     * The text after the institution paragraph is split on "*"; a fragment is kept only when
+     * it mentions one of the names in $aParam['report'].
+     *
+     * @param array $aParam 'report' (corresponding author names) is required; 'title' is optional.
+     * @return array List of ['name', 'email', 'postal_address', 'tel']; missing fields are ''.
+     */
+    private function getCorrespondingAuthors($aParam = []){
+        $aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
+        if(empty($aCorrespondingAuthor)){
+            return [];
+        }
+
+        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
+        $sAuthorContent = $this->getNextParagraphAfterText($title);
+        $sCompany = $this->getNextParagraphAfterText($sAuthorContent); // raw institution paragraph
+        if (empty($sCompany)) {
+            // Fallback: rebuild an anchor from the parsed institution names
+            $aCompany = $this->getCompany($aParam);
+            $sCompany = implode(' ', array_values($aCompany));
+        }
+
+        // Everything that follows the institutions
+        $corrText = $this->getContentAfterText($sCompany);
+        // Transcode only when the text is not valid UTF-8. Guessing among several source
+        // encodings can turn valid UTF-8 into mojibake, and a candidate list containing an
+        // unknown encoding name makes mb_convert_encoding() throw on PHP 8.
+        if (!mb_check_encoding($corrText, 'UTF-8')) {
+            $encodedContent = mb_convert_encoding($corrText, 'UTF-8', 'Windows-1252');
+            $corrText = $encodedContent ?: $corrText;
+        }
+
+        // Normalise full-width punctuation and collapse all whitespace to single spaces
+        $corrText = str_replace(['：', '＠'], [':', '@'], $corrText);
+        $corrText = preg_replace('/\s+/', ' ', $corrText);
+
+        // One fragment per "*" marker
+        $corrBlocks = preg_split('/\s*\*\s*/', $corrText);
+        $corrBlocks = array_filter(array_map('trim', $corrBlocks));
+
+        $aCorresponding = [];
+        foreach ($corrBlocks as $block) {
+            $sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
+            if (empty($sName)) {
+                continue;
+            }
+            preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
+            preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
+            preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
+
+            $aCorresponding[] = [
+                'name' => $sName,
+                // The address is captured up to the next whitespace, so strip sentence
+                // punctuation and brackets that stick to it ("a@b.com.", "(a@b.com)").
+                'email' => isset($email[2]) ? trim($email[2], " \t.,;:()<>[]") : '',
+                'postal_address' => isset($address[2]) ? trim($address[2]) : '',
+                'tel' => isset($tel[2]) ? trim($tel[2]) : ''
+            ];
+        }
+        return $aCorresponding;
+    }
+
+    //匹配通讯作者姓名
+    /**
+     * Find which known corresponding author a text fragment refers to.
+     *
+     * A name matches when it occurs in the fragment, case-insensitively, either as given
+     * or with its space-separated parts reversed ("Liguo Zhang" / "Zhang Liguo").
+     *
+     * @param string $block     Text fragment to search.
+     * @param array  $corrNames Candidate author names.
+     * @return string The first matching candidate exactly as passed in, or '' when none matches.
+     */
+    private function matchCorrespondingName($block, $corrNames)
+    {
+        foreach ($corrNames as $name) {
+            // A blank name would be an empty search needle; it can never identify an author
+            if (trim((string) $name) === '') {
+                continue;
+            }
+            // Multi-byte aware comparison so accented and non-Latin names compare correctly
+            if (mb_stripos($block, $name, 0, 'UTF-8') !== false) {
+                return $name;
+            }
+            $nameParts = explode(' ', $name);
+            if (count($nameParts) >= 2) {
+                $reversedName = implode(' ', array_reverse($nameParts));
+                if (mb_stripos($block, $reversedName, 0, 'UTF-8') !== false) {
+                    return $name;
+                }
+            }
+        }
+        return '';
+    }
+
+    // 获取目标文本的下一个段落
+    /**
+     * Return the text of the first non-empty element that follows the element
+     * containing $targetText (case-insensitive substring match).
+     *
+     * @param string $targetText Text that identifies the anchor element.
+     * @return string Text of the following element, or '' when the anchor is empty,
+     *                is not found, or is the last non-empty element.
+     */
+    private function getNextParagraphAfterText($targetText){
+        // An empty needle is not a usable anchor: stripos() warns on it before PHP 8 and
+        // matches every paragraph from PHP 8 on.
+        if ((string) $targetText === '') {
+            return '';
+        }
+
+        $found = false;
+        foreach ($this->sections as $section) {
+            foreach ($section->getElements() as $element) {
+                $text = $this->getTextFromElement($element);
+                if(empty($text)){
+                    continue;
+                }
+                if ($found) {
+                    return $text;
+                }
+                if (stripos($text, $targetText) !== false) {
+                    $found = true;
+                }
+            }
+        }
+        return '';
+    }
+
+    // 获取目标文本后的所有内容
+    /**
+     * Collect the text of the elements that follow the element matching $targetText,
+     * stopping at the first element that contains an abstract/keyword heading.
+     *
+     * The anchor is matched loosely: both texts are reduced to lower-case ASCII letters and
+     * digits, and an element matches when similar_text() covers more than half of the target.
+     *
+     * @param string $targetText Text that identifies the anchor element.
+     * @return string Collected element texts joined by "\n" (at most 200), or '' when the
+     *                anchor is not found or contains no ASCII letters/digits.
+     */
+    private function getContentAfterText($targetText){
+        $stopKeywords = ['关键词', 'Key words', 'Keywords', '摘要', 'Abstract'];
+        $maxLines = 200;
+
+        // The cleaned target does not change per element, so compute it once
+        $cleanTarget = preg_replace('/[^a-zA-Z0-9]/', '', strtolower((string) $targetText));
+        $targetLength = strlen($cleanTarget);
+        if ($targetLength === 0) {
+            return ''; // nothing to anchor on; no element could ever match
+        }
+
+        $found = false;
+        $content = [];
+        foreach ($this->sections as $section) {
+            foreach ($section->getElements() as $element) {
+                if (count($content) >= $maxLines) {
+                    break 2;
+                }
+
+                $text = trim($this->getTextFromElement($element));
+                if (empty($text)) {
+                    continue;
+                }
+                if (!$found) {
+                    // Compare on letters and digits only
+                    $cleanText = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($text));
+                    if (similar_text($cleanText, $cleanTarget) / $targetLength > 0.5) {
+                        $found = true;
+                    }
+                    continue; // the anchor itself is never part of the result
+                }
+
+                // A stop keyword ends the collection for the whole document
+                foreach ($stopKeywords as $kw) {
+                    if (stripos($text, $kw) !== false) {
+                        break 3;
+                    }
+                }
+
+                $content[] = $text;
+            }
+        }
+        return implode("\n", $content);
+    }
+
+    // 统一提取元素文本
+    /**
+     * Extract the plain text of a PHPWord element, recursing into containers.
+     *
+     * PreserveText, Table and Cell results are returned as assembled from their parts;
+     * all other results have control characters replaced by spaces and whitespace collapsed.
+     *
+     * @param mixed $element    PHPWord element (PreserveText, Table, Cell, Text, Link, Field,
+     *                          or any container exposing getElements()).
+     * @param int   $lineNumber Unused; kept so existing callers keep working.
+     * @return string Extracted text; '' for elements that carry none.
+     */
+    private function getTextFromElement($element,$lineNumber = 0){
+        $text = '';
+        // PreserveText: a list of literal pieces and field codes
+        if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
+            // getText() is the public accessor for the pieces, so no reflection is needed;
+            // the cast covers a single string or null
+            foreach ((array) $element->getText() as $part) {
+                if (strpos($part, 'HYPERLINK') !== false) {
+                    // Decode HTML entities (&quot; -> ") and keep only the mailto address
+                    $decoded = html_entity_decode($part);
+                    if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
+                        $text .= $match[1] . ' ';
+                    }
+                } else {
+                    $text .= $part;
+                }
+            }
+            return $text;
+        }
+        // Tables (an e-mail address may sit in a cell): join cells with a space so that
+        // neighbouring cells do not fuse into a single word
+        if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
+            $cellTexts = [];
+            foreach ($element->getRows() as $row) {
+                foreach ($row->getCells() as $cell) {
+                    $cellTexts[] = $this->getTextFromElement($cell);
+                }
+            }
+            return implode(' ', $cellTexts);
+        }
+        if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
+            foreach ($element->getElements() as $child) {
+                $text .= $this->getTextFromElement($child);
+            }
+            return $text;
+        }
+
+        // Any other container: concatenate the children (runs of one paragraph)
+        if (method_exists($element, 'getElements')) {
+            foreach ($element->getElements() as $child) {
+                $text .= $this->getTextFromElement($child);
+            }
+        }
+
+        // Plain or formatted text run
+        if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
+            $text .= $element->getText();
+        }
+
+        // Hyperlink: the target first (it may be an e-mail address), then the label
+        if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
+            $target = $element->getTarget();
+            if (strpos($target, 'mailto:') === 0) {
+                $text .= str_replace('mailto:', '', $target) . ' '; // strip the mailto: prefix
+            }
+            $text .= $element->getText() . ' ';
+        }
+
+        // Field: call only an accessor that exists, so an element without getContent()
+        // cannot abort the whole parse.
+        // NOTE(review): PHPWord's Field appears to expose getText(), not getContent() - confirm.
+        if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
+            $fieldText = '';
+            if (method_exists($element, 'getContent')) {
+                $fieldText = $element->getContent();
+            } elseif (method_exists($element, 'getText')) {
+                $fieldText = $element->getText();
+            }
+            if (is_object($fieldText)) {
+                $fieldText = $this->getTextFromElement($fieldText); // text given as an element
+            }
+            $text .= $fieldText . ' ';
+        }
+        if ($element instanceof \PhpOffice\PhpWord\Element\Note && method_exists($element, 'getContent')) {
+            $text .= $element->getContent() . ' ';
+        }
+
+        // Replace control characters with spaces. This must run in UTF-8 mode: as raw bytes,
+        // 0x80-0x9F are continuation bytes of multi-byte characters (e.g. the dagger), and
+        // replacing them corrupts the text.
+        $cleaned = preg_replace('/[\x{00}-\x{1F}\x{7F}-\x{9F}]/u', ' ', $text);
+        if ($cleaned === null) {
+            // Not valid UTF-8: strip ASCII control bytes only, which is safe in any encoding
+            $cleaned = preg_replace('/[\x00-\x1F\x7F]/', ' ', $text);
+        }
+        $text = preg_replace('/\s+/', ' ', $cleaned); // collapse whitespace runs
+
+        return $text;
+    }
+
+    /**
+     * 从 Word 文档提取摘要和关键词
+     * @return array 提取结果
+     */
+    /**
+     * Extract the abstract, keywords and funding statement from the document.
+     *
+     * - abstract: text between "Abstract" and "Keywords:" (or the end of the text);
+     * - keywords: the rest of the paragraph containing "Keywords:", split on "," and ";";
+     * - fund: text following the first funding phrase (e.g. "Supported by"), cut at
+     *   "Peer review" when that appears on the same line.
+     *
+     * @return array ['status' => 1, 'msg' => string, 'data' => ['abstract', 'abstrart',
+     *               'keywords' (list), 'fund']]; 'abstrart' is the legacy misspelled key,
+     *               kept for existing consumers.
+     */
+    public function extractFromWord() {
+        $sContent = '';
+        $sFundContent = '';
+        foreach ($this->sections as $section) {
+            foreach ($section->getElements() as $element) {
+                $textContent = $this->getTextFromElement($element);
+                if(empty($textContent)){
+                    continue;
+                }
+                // Transcode only when the text is not valid UTF-8. Guessing among several
+                // source encodings can turn valid UTF-8 into mojibake, an unknown encoding
+                // name throws on PHP 8, and a failed conversion would drop the paragraph.
+                $utf8Text = $textContent;
+                if (!mb_check_encoding($textContent, 'UTF-8')) {
+                    $converted = mb_convert_encoding($textContent, 'UTF-8', 'Windows-1252');
+                    if (is_string($converted)) {
+                        $utf8Text = $converted;
+                    }
+                }
+                $sContent .= $utf8Text;
+                // Mark the end of the keyword paragraph so the list can be delimited later
+                if(stripos($textContent, 'Keywords:') !== false){
+                    $sContent .= "Keywords-End-Flag";
+                }
+                if(empty($sFundContent)){
+                    // Search the text gathered so far: when the funding phrase stands alone in
+                    // one paragraph, the statement is picked up from the next one.
+                    $aFund = $this->getMatchedFundPhrases($sContent);
+                    if(!empty($aFund[0])){
+                        $position = stripos($sContent, $aFund[0]);
+                        $sFundContent = substr($sContent, $position);
+                        $sFundContent = trim(str_ireplace($aFund[0], '', $sFundContent));
+                        if (preg_match('/^(.*?)Peer review/', $sFundContent, $matches)) {
+                            $sFundContent = $matches[1]; // keep only what precedes "Peer review"
+                        }
+                    }
+                }
+                $sContent .= "\n";
+            }
+        }
+        // Collapse every whitespace run that follows a non-space character into one space
+        $textContent = preg_replace('/(\S)\s+/', '$1 ', $sContent);
+        $textContent = trim($textContent);
+
+        // Abstract
+        $abstract = '';
+        $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords:|$)/i';
+        if (preg_match($abstractPattern, $textContent, $abstractMatches)) {
+            $abstract = trim($abstractMatches[1]);
+            $abstract = preg_replace('/\n+/', ' ', $abstract);
+        }
+
+        // Keywords: everything between "Keywords:" and the end-of-paragraph marker
+        $keywords = [];
+        $keywordPattern = '/Keywords:\s*(.*?)\s*Keywords-End-Flag/s';
+        if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
+            $keywordStr = trim($keywordMatches[1]);
+
+            // Tidy the list: no line breaks, no trailing separators
+            $keywordStr = preg_replace('/\n+/', ' ', $keywordStr);
+            $keywordStr = rtrim($keywordStr, ';,. ');
+            $keywordStr = trim($keywordStr);
+
+            // Split, drop blanks, and reindex so the result always encodes as a JSON array
+            $keywords = preg_split('/[,;]\s*/', $keywordStr);
+            $keywords = array_values(array_filter(array_map('trim', $keywords), function($item) {
+                return !empty($item) && !ctype_space($item);
+            }));
+        }
+        return [
+            'status' => 1,
+            'msg' => '提取成功',
+            'data' => [
+                'abstract' => $abstract,
+                'abstrart' => $abstract, // legacy misspelled key, kept for existing consumers
+                'keywords' => $keywords,
+                'fund' => $sFundContent
+            ]
+        ];
+    }
+    /**
+     * List the funding phrases ("Supported by", "Funded by", ...) that occur in $content.
+     *
+     * Matching ignores case; each phrase is reported once, in its canonical spelling,
+     * ordered by first occurrence in the text.
+     *
+     * @param string $content Text to scan.
+     * @return array List of canonical phrases; empty when $content is empty or has none.
+     */
+    private function getMatchedFundPhrases($content = '') {
+        if (empty($content)) {
+            return [];
+        }
+
+        // Canonical funding phrases
+        $fundPhrases = [
+            'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
+            'Funding was provided by', 'Funded in part by'
+        ];
+
+        // One alternation of the quoted phrases, captured so the hit text is available
+        $alternatives = [];
+        foreach ($fundPhrases as $phrase) {
+            $alternatives[] = preg_quote($phrase, '#');
+        }
+        $pattern = '#('.implode('|', $alternatives).')#i';
+        if (!preg_match_all($pattern, $content, $hits)) {
+            return [];
+        }
+
+        // Map every hit back to its canonical spelling, keeping first-seen order
+        $found = [];
+        foreach ($hits[1] as $hit) {
+            foreach ($fundPhrases as $canonical) {
+                if (strcasecmp($hit, $canonical) !== 0) {
+                    continue;
+                }
+                if (!in_array($canonical, $found, true)) {
+                    $found[] = $canonical;
+                }
+                break;
+            }
+        }
+        return $found;
+    }
+    //日志打印
+    private function log($msg){ // Debug log hook; currently a no-op because its only statement is commented out.
+        // echo date('[Y-m-d H:i:s] ') . $msg . "\n";
+    }
+}