tougao/application/common/ArticleParserService.php

<?php
namespace app\common;
use PhpOffice\PhpWord\IOFactory;
use think\Exception;
use ZipArchive;
use RecursiveIteratorIterator;
use RecursiveDirectoryIterator;
use PhpOffice\PhpWord\Settings;
use PhpOffice\PhpWord\Element\TextRun;
use DOMDocument;
use DOMXPath;
// use BadMethodCallException;
class ArticleParserService
{
    private $phpWord;
    private $sections;

    /**
     * Load a Word document and cache its sections for the parsing methods.
     *
     * The file is first loaded unchanged. If that throws, a temporary copy
     * produced by removeEmfFromDocx() is loaded instead, and that temporary
     * copy is removed again whether or not the second load succeeds.
     *
     * When $filePath does not exist nothing is loaded and $phpWord/$sections
     * keep their default (null) values; no exception is raised in that case.
     *
     * @param string $filePath Path to the DOCX file to parse.
     * @throws \Exception When the EMF-free copy cannot be built or cannot be loaded either.
     */
    public function __construct($filePath = '')
    {
        // A constructor cannot hand a value back to `new`, so a missing file
        // simply leaves the instance without a document.
        if (!file_exists($filePath)) {
            return;
        }

        // Global PhpWord settings used for both load attempts
        // (output escaping is enabled to avoid XML escaping conflicts).
        Settings::setCompatibility(false);
        Settings::setOutputEscapingEnabled(true);

        try {
            // Read the full document rather than "data only" so the section
            // structure is kept (per the original author's note).
            $reader = IOFactory::createReader();
            $reader->setReadDataOnly(false);
            $this->phpWord = $reader->load($filePath);
        } catch (\Exception $e) {
            // Fallback: retry with a copy of the archive that has its EMF images removed.
            $processedFilePath = $this->removeEmfFromDocx($filePath);
            try {
                $reader = IOFactory::createReader();
                $reader->setReadDataOnly(false);
                $this->phpWord = $reader->load($processedFilePath);
            } finally {
                // Never leave the temporary copy behind, even when loading it fails.
                if (is_file($processedFilePath)) {
                    unlink($processedFilePath);
                }
            }
        }

        $this->sections = $this->phpWord->getSections();
    }
    /**
     * Build a copy of a DOCX archive with every EMF image file removed.
     *
     * The archive is unpacked into a fresh "docx_temp_*" directory under
     * ROOT_PATH/runtime, every file whose extension is "emf" is deleted, and
     * the remaining files are packed into "<temp dir>_processed.docx". The
     * temporary directory is handed to deleteDir() whether or not packing
     * succeeds.
     *
     * @param string $docxPath Path to the original DOCX file.
     * @return string Path of the processed DOCX; the caller is responsible for deleting it.
     * @throws \Exception When the archive cannot be opened or extracted, or the processed file cannot be written.
     */
    private function removeEmfFromDocx($docxPath){
        $zip = new ZipArchive();
        if ($zip->open($docxPath) !== true) {
            throw new \Exception("无法打开 DOCX 文件：{$docxPath}");
        }

        // 1. Create the temporary extraction directory.
        $tempDir = rtrim(ROOT_PATH,'/').'/runtime/'.uniqid('docx_temp_');
        if (!mkdir($tempDir, 0700, true)) {
            $zip->close();
            throw new \Exception("无法创建临时目录：{$tempDir}");
        }

        try {
            // 2. Unpack the DOCX.
            $extracted = $zip->extractTo($tempDir);
            $zip->close();
            if (!$extracted) {
                throw new \Exception("无法解压 DOCX 文件：{$docxPath}");
            }

            // 3. Collect the EMF files first, then delete them, so the
            //    directory is not modified while it is being iterated.
            $emfFiles = [];
            $iterator = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($tempDir));
            foreach ($iterator as $file) {
                if ($file->isFile() && strtolower($file->getExtension()) === 'emf') {
                    $emfFiles[] = $file->getPathname();
                }
            }
            foreach ($emfFiles as $emfFile) {
                unlink($emfFile);
            }

            // 4. Re-pack what is left into a new DOCX next to the temp directory.
            $processedPath = $tempDir . '_processed.docx';
            $newZip = new ZipArchive();
            if ($newZip->open($processedPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
                throw new \Exception("无法创建处理后的 DOCX 文件");
            }
            $this->addFilesToZip($tempDir, $newZip);
            // ZipArchive only reads the added files and writes the archive on
            // close(), so this must succeed before the directory is removed.
            if (!$newZip->close()) {
                throw new \Exception("无法写入处理后的 DOCX 文件");
            }

            return $processedPath;
        } finally {
            // 5. Remove the extraction directory on success and on failure alike.
            $this->deleteDir($tempDir);
        }
    }

    /**
     * Recursively add every file below a directory to a ZipArchive.
     *
     * Each entry name is the file path with "<parent of $dir>/" stripped, so
     * files directly inside the directory passed to the outermost call are
     * stored under that directory's own name, while files in nested
     * directories are stored under the name of their immediate parent only.
     *
     * NOTE(review): this layout does not mirror the directory tree (e.g. a
     * file at word/media/x.png is stored as media/x.png). It is kept as-is
     * here because the DOCX fallback load may rely on it; confirm how the
     * reader behaves before changing the entry names.
     *
     * @param string     $dir Directory whose files are added.
     * @param ZipArchive $zip Open archive receiving the files.
     */
    private function addFilesToZip($dir, $zip)
    {
        $entries = scandir($dir);
        if ($entries === false) {
            // Unreadable directory: nothing to add (avoids iterating over false).
            return;
        }

        $stripPrefix = dirname($dir) . '/';
        foreach ($entries as $entry) {
            if ($entry === '.' || $entry === '..') {
                continue;
            }

            $entryPath = $dir . '/' . $entry;
            if (is_dir($entryPath)) {
                $this->addFilesToZip($entryPath, $zip);
                continue;
            }

            // Name inside the archive: the path without the parent-directory prefix.
            $zip->addFile($entryPath, str_replace($stripPrefix, '', $entryPath));
        }
    }

    /**
     * Recursively delete a temporary DOCX extraction directory.
     *
     * The directory is only accepted when its name starts with "docx_temp_"
     * and its resolved path lies inside ROOT_PATH/runtime; anything else is
     * refused and left untouched. The name check applies to the top-level
     * directory only, so its sub-directories (whatever they are called) are
     * removed as well.
     *
     * @param string $dir Directory to delete.
     * @return bool True when the directory and all of its contents were removed, false otherwise.
     */
    private function deleteDir($dir){
        // 1. Must be a non-empty path to an existing directory.
        if (trim($dir) === '' || !is_dir($dir)) {
            return false;
        }

        // 2. Only directories created as "docx_temp_*" may be deleted.
        $dir = rtrim($dir, '/\\');
        if (strpos(basename($dir), 'docx_temp_') !== 0) {
            return false;
        }

        // 3. The resolved path must sit inside the runtime directory. The
        //    trailing separator keeps siblings such as "runtime_x" from
        //    passing the prefix comparison.
        $realDir = realpath($dir);
        $realRuntimeDir = realpath(rtrim(ROOT_PATH, '/') . '/runtime');
        if ($realDir === false || $realRuntimeDir === false) {
            return false;
        }
        $runtimePrefix = rtrim($realRuntimeDir, DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR;
        if (strpos($realDir, $runtimePrefix) !== 0) {
            return false;
        }

        // 4. The root has been validated; remove the whole tree below it.
        return $this->purgeTempTree($realDir);
    }

    /**
     * Delete a directory tree whose root was already validated by deleteDir().
     *
     * Entries whose resolved path is not inside the directory being processed
     * (for example symbolic links pointing elsewhere) are skipped, which also
     * prevents the directory itself from being removed.
     *
     * @param string $realPath Resolved (realpath) directory to remove.
     * @return bool True when the directory and everything below it was removed.
     */
    private function purgeTempTree($realPath)
    {
        // Errors are suppressed on purpose: cleanup is best-effort and
        // failures are reported through the return value.
        $entries = @scandir($realPath);
        if ($entries === false) {
            return false;
        }

        $allRemoved = true;
        foreach ($entries as $entry) {
            if ($entry === '.' || $entry === '..') {
                continue;
            }

            $realChild = realpath($realPath . DIRECTORY_SEPARATOR . $entry);
            if ($realChild === false || strpos($realChild, $realPath . DIRECTORY_SEPARATOR) !== 0) {
                $allRemoved = false;
                continue;
            }

            if (is_dir($realChild)) {
                if (!$this->purgeTempTree($realChild)) {
                    $allRemoved = false;
                }
            } elseif (!@unlink($realChild)) {
                $allRemoved = false;
            }
        }

        // Only an emptied directory can be removed.
        return $allRemoved && @rmdir($realPath);
    }

    /**
     * Entry point: validate an uploaded manuscript file and parse its metadata.
     *
     * Status codes in the returned JSON:
     *   1 = success ("data" holds title, author, report, company,
     *       corresponding, plus whatever extractFromWord() returns under "data"),
     *   2 = no file path given,
     *   3 = file does not exist,
     *   4 = file is not readable,
     *   5 = the document could not be loaded, or no title was found.
     *
     * @param string $sFileUrl Path of the uploaded document.
     * @return string JSON-encoded result with "status", "msg" and, on success, "data".
     */
    public static function uploadAndParse($sFileUrl){
        // Required-value check.
        if(empty($sFileUrl)){
            return json_encode(['status' => 2,'msg' => 'Please upload the submission file']);
        }

        // The file must exist and be readable.
        if (!file_exists($sFileUrl)) {
            return json_encode(['status' => 3, 'msg' => 'The uploaded file does not exist']);
        }
        if (!is_readable($sFileUrl)) {
            return json_encode(['status' => 4, 'msg' => 'The uploaded file is unreadable']);
        }

        // Load the document. The constructor can throw when the file cannot be
        // parsed; report that the same way as every other failure here.
        try {
            $oDealFile = new self($sFileUrl);
        } catch (\Exception $e) {
            return json_encode(['status' => 5, 'msg' => $e->getMessage()]);
        }

        // Title.
        $sTitle = $oDealFile->getTitle();
        if(empty($sTitle)){
            return json_encode(['status' => 5, 'msg' => 'Article title retrieval failed']);
        }

        // Authors: the full list plus the names flagged as corresponding authors.
        $aParam = ['title' => $sTitle];
        $aAuthor = $oDealFile->getAuthors($aParam);
        $aParam['author'] = empty($aAuthor['author']) ? [] : $aAuthor['author'];
        $aParam['report'] = empty($aAuthor['report']) ? [] : $aAuthor['report'];

        // Affiliations.
        $aParam['company'] = $oDealFile->getCompany($aParam);

        // Corresponding-author details.
        $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam);

        // Keywords and abstract; keys already present in $aParam are kept.
        $aContent = $oDealFile->extractFromWord();
        $aParam += empty($aContent['data']) ? [] : $aContent['data'];

        return json_encode(['status' => 1,'msg' => 'success','data' => $aParam]);
    }

    /**
     * Pick the article title from the loaded document.
     *
     * The title is the text of the first element, in document order, whose
     * trimmed text is longer than 10 characters. Text that is not valid UTF-8
     * is re-encoded from GBK.
     *
     * @return string The title, or '' when no sections are loaded or no element qualifies.
     */
    private function getTitle(){
        if (empty($this->sections)) {
            return '';
        }

        $found = '';
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $candidate = trim($this->getTextFromElement($element));
                // Titles are expected to be reasonably long; the first match wins.
                if (mb_strlen($candidate) > 10) {
                    $found = $candidate;
                    break 2;
                }
            }
        }

        if ($found !== '' && !mb_check_encoding($found, 'UTF-8')) {
            $found = mb_convert_encoding($found, 'UTF-8', 'GBK');
        }

        return $found;
    }
    // 提取作者
 //    private function getAuthors($aParam = []) {
 //        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
 //        $sAuthorContent = $this->getNextParagraphAfterText($title);
 //        if (empty($sAuthorContent)) {
 //            return ['author' => [], 'report' => []];
 //        }

 //        //编码修复
 //        $possibleEncodings = [
 //            'Windows-1252', 'UTF-8', 'GBK', 'GB2312',
 //            'Latin-1', 'ISO-8859-1', 'CP1252'
 //        ];
 //        $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
 //        $sAuthorContent = $encodedContent ?: $sAuthorContent;

 //        //清理不可见字符
 //        $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);

 //        //修复特殊符号乱码
 //        $symbolMap = [
 //            'â€ ' => '†', 'â  ' => '†', 'â' => '†', '?†' => '†',
 //            'ï¼š' => ':', 'ï¼Œ' => ',', 'â€”' => '-',
 //            '啊' => '' // 针对性移除异常字符“啊”（若为固定乱码）
 //        ];
 //        $sAuthorContent = strtr($sAuthorContent, $symbolMap);

 //        //格式标准化
 //        $sAuthorContent = str_replace(['，', ';', '；', '、'], ',', $sAuthorContent); // 统一分隔符
 //        $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
 //        $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
 //        $sAuthorContent = trim($sAuthorContent);

 //        // 处理作者
 //        $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
 //        $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
 //        $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
 //        $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式（防止被拆分）
 //        //标记上标内的逗号+空格（多编号）
 //        $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
 //        // 原有步骤2：正则匹配（扩展上标符号支持，保持原有逻辑）
 //        $pattern = '/
 //            ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格）
 //            \s*                         # 姓名与上标间空格
 //            (                           # 上标组（扩展符号支持）
 //                \d+                     # 起始数字
 //                (?:[†#*,]|<SEP>\d+)*    # 允许：†#*符号、逗号、<SEP>+数字（兼容1,†、1,*等）
 //            )
 //            \s*,?                       # 作者间逗号（可选）
 //            (?=\s|$)                    # 确保后面是空格或结尾
 //        /ux';

 //        preg_match_all($pattern, $tempStr, $matches);
 //        $authorList = [];
 //        if(!empty($matches[1])){
 //            foreach ($matches[1] as $i => $name) {
 //                $name = trim($name);
 //                $superscript = trim($matches[2][$i]);
 //                $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
 //                $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
 //                // 修复符号与数字间的空格（如原始"1 *"被误处理为"1*"的情况，保持原样）
 //                $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
 //                if (!empty($name)) {
 //                    $authorList[] = [
 //                        'name' => $name,
 //                        'superscript' => $superscript
 //                    ];
 //                }
 //            }
 //        }else {
 //            // 按“两个或多个连续空格”拆分（姓名之间的分隔）
 //            $authorList = array_filter(
 //                array_map('trim',
 //                    preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
 //                )
 //            );
 //        }


 //        // //处理作者
 //        // $authorList = [];
 //        // // 新正则：匹配“姓名+上标”整体，允许上标含逗号（如1,†）
 //        // // 逻辑：姓名以字母/中文开头，上标以数字开头、以符号/数字结尾
 //        // // if (preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*([\d,†#*]+)/u', $sAuthorContent, $matches)) {
 //        // if(preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*(\d[\d,†#\s*]*)/u', $sAuthorContent, $matches)){
 //        //     for ($i = 0; $i < count($matches[1]); $i++) {
 //        //         $authorList[] = trim($matches[1][$i] . $matches[2][$i]);
 //        //     }
 //        // } else {
 //        //     // 按“两个或多个连续空格”拆分（姓名之间的分隔）
 //        //     $authorList = array_filter(
 //        //         array_map('trim',
 //        //             preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
 //        //         )
 //        //     );
 //        // }
 //        $aAuthorData = [];
 //        $aReport = [];
 //        $namePattern = '/
 //            (?:[A-Za-z\s·\-\']+|                # 英文姓名（支持空格、连字符）
 //             [\x{4e00}-\x{9fa5}]+|             # 中文姓名
 //             [\x{1800}-\x{18AF}]+|             # 蒙古文姓名
 //             [A-Z]\.)                           # 单字母缩写（如 J.）
 //        /ux';
 // var_dump($authorList);exit;
 //        foreach ($authorList as $authorStr) {
 //            if (empty($authorStr)) continue;
 //            var_dump($authorList);exit;
 //            //分离姓名与上标（支持上标含逗号，如1,†）
 //            $superscript = '';
 //            // 新正则：匹配以数字开头、含逗号/符号的完整上标（如1,†、2*#）
 //            $authorStr = trim(trim($authorStr,','),' ');
 //            // if (preg_match('/([\d,†#*]+)$/u', $authorStr, $supMatch)) {
 //            // if(preg_match('/\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)){
 //            // if (preg_match('/.*?\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)) {
 //            // if (preg_match('/.*?\s*([\d,\x{2020}#* ]+?)\s*$/u', $authorStr, $supMatch)) {
 //            // if (preg_match('/^(.+?)\D*?(\d[\d,#*†,\s]*)$/u', $authorStr, $supMatch)) {
 //            //     $superscript = $supMatch[1];
 //            //     // 移除上标，保留纯姓名（避免残留符号）
 //            //     $nameStr = trim(preg_replace('/' . preg_quote($superscript, '/') . '$/', '', $authorStr));
 //            // } else {
 //            //     $nameStr = $authorStr;
 //            // }
 //            $pattern = '/^(.+?)\s*(\d[\d,#*†\s]*?)\s*$/u';
 //            if (preg_match($pattern, $authorStr, $supMatch)) {
 //                $nameStr = empty($supMatch[1]) ? '' : trim($supMatch[1]); // 姓名部分："Liguo Zhang"
 //                $superscript = empty($supMatch[2]) ? $nameStr : $nameStr.trim($supMatch[2]); // 上标部分："1
 //                // echo "姓名: $nameStr, 上标: $superscript\n";
 //            } else {
 //                $nameStr = $authorStr;
 //            }
 //            //验证姓名合法性（过滤无效内容）
 //            if (!preg_match($namePattern, $nameStr)) {
 //                continue;
 //            }
 //            //解析上标信息（正确识别1,†中的机构编号和符号）
 //            $companyId = '';
 //            $isSuper = 0;
 //            $isReport = 0;
 //            if (!empty($superscript)) {
 //                // 提取机构编号（忽略上标中的逗号，如1,† → 提取1）
 //                if (preg_match('/(\d+)/', $superscript, $numMatch)) {
 //                    $companyId = $numMatch[1];
 //                }
 //                // 识别特殊符号（#为超级作者，*†为通讯作者）
 //                $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
 //                $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
 //            }
 //            if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
 //                $nameStr = trim($match[1]);
 //            }
 //            $aAuthorData[] = [
 //                'name' => $nameStr,
 //                'company_id' => $companyId,
 //                'is_super' => $isSuper,
 //                'is_report' => $isReport
 //            ];
 //            if ($isReport) {
 //                $aReport[] = $nameStr;
 //            }
 //        }
 //           var_dump($aAuthorData);exit;
 //        return ['author' => $aAuthorData,'report' => array_unique($aReport)];
 //    }

    /**
     * Split an author line into authors with their superscript markers.
     *
     * A name is made of letters (any script), spaces, dots, hyphens,
     * apostrophes and middle dots. A superscript is the run of digits, commas
     * and marker symbols (#, *, †, ‡, §) that follows a name. Authors are
     * separated by ", " (" and " is treated the same way) or simply by a new
     * name starting after a superscript. Any other character is ignored.
     *
     * Each returned author has:
     *   - name:        trimmed author name,
     *   - superscript: raw marker string, e.g. "1,2*",
     *   - company_id:  unique affiliation numbers found in the superscript,
     *   - is_super:    1 when the superscript contains "#", else 0,
     *   - is_report:   1 when the superscript contains "*" or "†", else 0.
     *
     * @param string $str Raw author line.
     * @return array List of author arrays as described above; [] for empty input.
     */
    private function parseAuthorsWithoutRegex($str = '') {
        if (empty($str)) {
            return [];
        }

        // Normalise the encoding, blank out known mojibake fragments and the
        // full-width comma, and turn full-width digits into ASCII digits.
        $str = mb_convert_encoding($str, 'UTF-8', 'auto');
        $str = str_replace(["\xC2\xA0", 'ï¼', 'ï¿½', '，', '１', '２', '３', '４', '５', '６', '７', '８', '９', '０'],
                          [' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], $str);
        $str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str));

        // The three patterns below are byte-safe (no /u), so they cannot fail
        // on malformed input.
        // "2, 3" -> "2,3": a comma+space between two digits belongs to one superscript.
        $str = preg_replace('/(?<=[0-9]), (?=[0-9])/', ',', $str);
        // "1 *" -> "1*": join a digit with the marker symbol that follows it.
        $str = preg_replace('/(?<=[0-9]) (?=#|\*|†|‡|§)/', '', $str);
        // Collapse runs of spaces.
        $str = trim(preg_replace('/ {2,}/', ' ', $str));

        $nameExtras = [' ', '.', '-', "'", '’', '·'];
        $markerChars = ['#', '*', '†', ',', '‡', '§'];

        $authors = [];
        $currentName = '';
        $currentSuperscript = '';
        $inName = true;
        $len = mb_strlen($str, 'UTF-8');
        for ($i = 0; $i < $len; $i++) {
            $char = mb_substr($str, $i, 1, 'UTF-8');

            // ", " ends the current author.
            if ($char === ',' && $i + 1 < $len && mb_substr($str, $i + 1, 1, 'UTF-8') === ' ') {
                if ($currentName !== '') {
                    $authors[] = [
                        'name' => trim($currentName),
                        'superscript' => trim(rtrim($currentSuperscript, ','))
                    ];
                }
                $currentName = '';
                $currentSuperscript = '';
                $inName = true;
                $i++; // skip the space after the comma
                continue;
            }

            // Any Unicode letter (or combining mark) counts as part of a name,
            // so accented Latin and non-Latin names are kept intact.
            $isNameChar = in_array($char, $nameExtras, true)
                || preg_match('/^[\p{L}\p{M}]$/u', $char) === 1;

            if ($isNameChar) {
                if ($inName) {
                    $currentName .= $char;
                } else {
                    // A name character right after a superscript starts the next author.
                    $authors[] = [
                        'name' => trim($currentName),
                        'superscript' => trim(rtrim($currentSuperscript, ','))
                    ];
                    $currentName = $char;
                    $currentSuperscript = '';
                    $inName = true;
                }
            } elseif (ctype_digit($char) || in_array($char, $markerChars, true)) {
                $inName = false;
                $currentSuperscript .= $char;
            }
            // Every other character is ignored.
        }

        // Flush the last author.
        if ($currentName !== '') {
            $authors[] = [
                'name' => trim($currentName),
                'superscript' => trim(rtrim($currentSuperscript, ','))
            ];
        }

        // Derive affiliation numbers and role flags from each superscript.
        foreach ($authors as $idx => $author) {
            $superscript = $author['superscript'];
            preg_match_all('/[0-9]+/', $superscript, $numbers);
            $authors[$idx]['company_id'] = array_values(array_unique(array_map('intval', $numbers[0])));
            $authors[$idx]['is_super'] = strpos($superscript, '#') !== false ? 1 : 0;
            $authors[$idx]['is_report'] = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
        }

        return $authors;
    }
    /**
     * Extract the authors from the paragraph that follows the title.
     *
     * The paragraph text is normalised (encoding, invisible characters, known
     * garbled symbols, separators) and then handed to
     * parseAuthorsWithoutRegex().
     *
     * @param array $aParam Optional context; "title" is used when present, otherwise getTitle() is called.
     * @return array ['author' => list of author arrays, 'report' => list of unique names flagged is_report].
     */
    private function getAuthors($aParam = []) {
        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = $this->getNextParagraphAfterText($title);
        if (empty($sAuthorContent)) {
            return ['author' => [], 'report' => []];
        }

        // Encoding repair: only touch text that is not already valid UTF-8.
        // Re-encoding valid UTF-8 from a guessed single-byte charset garbles
        // it, and an unknown name in the source list is an error on PHP 8.
        // GBK matches the fallback used for the title and for author names.
        if (!mb_check_encoding($sAuthorContent, 'UTF-8')) {
            $sAuthorContent = mb_convert_encoding($sAuthorContent, 'UTF-8', 'GBK');
        }

        // Strip control and zero-width characters; keep the text unchanged if
        // the pattern cannot be applied.
        $cleaned = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
        if ($cleaned !== null) {
            $sAuthorContent = $cleaned;
        }

        // Repair known garbled symbols.
        $symbolMap = [
            'â€ ' => '†', 'â  ' => '†', 'â' => '†', '?†' => '†',
            'ï¼š' => ':', 'ï¼Œ' => ',', 'â€”' => '-',
            '啊' => '' // drops this character, which the original author saw as a recurring artefact
        ];
        $sAuthorContent = strtr($sAuthorContent, $symbolMap);

        // Normalise the format.
        $sAuthorContent = str_replace(['，', ';', '；', '、'], ',', $sAuthorContent); // one separator style
        $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // "and" becomes a comma
        $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // collapse whitespace
        $sAuthorContent = trim($sAuthorContent);

        $aAuthor = $this->parseAuthorsWithoutRegex($sAuthorContent);
        if(empty($aAuthor)){
            return ['author' => [],'report' => []];
        }

        $aReport = $aAuthorData = [];
        foreach ($aAuthor as $value) {
            // Skip entries that carry neither a name nor a superscript.
            if(empty($value['name']) && empty($value['superscript'])){
                continue;
            }
            if(!mb_check_encoding($value['name'], 'UTF-8')){
                $value['name'] = mb_convert_encoding($value['name'], 'UTF-8', 'GBK');
            }
            if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){
                $aReport[] = $value['name'];
            }
            $aAuthorData[] = $value;
        }

        // array_unique() keeps the original keys; re-index so the list has no
        // gaps and is JSON-encoded as an array rather than an object.
        return ['author' => $aAuthorData,'report' => array_values(array_unique($aReport))];
    }
//     private function getAuthors($aParam = []) {
//         $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
//         $sAuthorContent = $this->getNextParagraphAfterText($title);
//         if (empty($sAuthorContent)) {
//             return ['author' => [], 'report' => []];
//         }

//         //编码修复
//         $possibleEncodings = [
//             'Windows-1252', 'UTF-8', 'GBK', 'GB2312',
//             'Latin-1', 'ISO-8859-1', 'CP1252'
//         ];
//         $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings));
//         $sAuthorContent = $encodedContent ?: $sAuthorContent;

//         //清理不可见字符
//         $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);

//         //修复特殊符号乱码
//         $symbolMap = [
//             'â€ ' => '†', 'â  ' => '†', 'â' => '†', '?†' => '†',
//             'ï¼š' => ':', 'ï¼Œ' => ',', 'â€”' => '-',
//             '啊' => '' // 针对性移除异常字符“啊”（若为固定乱码）
//         ];
//         $sAuthorContent = strtr($sAuthorContent, $symbolMap);

//         //格式标准化
//         $sAuthorContent = str_replace(['，', ';', '；', '、'], ',', $sAuthorContent); // 统一分隔符
//         $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号
//         $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格
//         $sAuthorContent = trim($sAuthorContent);
// var_dump($this->parseAuthorsWithoutRegex($sAuthorContent));exit;
//         // 关键预处理：兼容"and"分隔符、清理乱码、统一空格
// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto');
// $content = str_replace(["\xC2\xA0", 'ï¼', 'ï¿½', '，'], ' ', $content); // 清理乱码和全角符号
// $content = preg_replace('/\band\b/i', ',', $content); // 将 "and" 转为逗号（统一分隔符）
// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并数字与符号间的空格（如"1 *"→"1*"）
// $content = trim(preg_replace('/\s+/', ' ', $content)); // 合并连续空格

// // 标记上标内的逗号（多编号处理）
// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);

// // 核心正则（保持原有结构，扩展符号支持）
// $pattern = '/
//     ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格、连字符）
//     \s*                         # 姓名与上标间的空格（允许0或多个）
//     (                           # 上标组（扩展兼容所有符号）
//         \d+                     # 起始数字（至少1个数字）
//         (?:[†#*,]|<SEP>\d+)*    # 允许：符号（†#*）、逗号、<SEP>+数字（多编号）
//     )
//     \s*,?                       # 作者间的逗号（可选，允许逗号前有空格）
//     (?=\s|$)                    # 确保后面是空格或字符串结尾（避免跨作者匹配）
// /ux';

// preg_match_all($pattern, $tempStr, $matches);

// // 解析结果并格式化
// $authorList = [];
// if (!empty($matches[1])) {
//     foreach ($matches[1] as $i => $name) {
//         $name = trim($name);
//         $superscript = trim($matches[2][$i]);
//         $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
//         $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾多余逗号
//         if (!empty($name)) {
//             $authorList[] = [
//                 'name' => $name,
//                 'superscript' => $superscript
//             ];
//         }
//     }
// }

// // 输出结果
// echo "<pre>";
// print_r($authorList);
// echo "</pre>";
// exit;

//         // 处理作者
//         $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确
//         $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格
//         $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#"
//         $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式（防止被拆分）

//         //标记上标内的逗号+空格（多编号）
//         $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1<SEP>$2', $content);
//         // 原有步骤2：正则匹配（扩展上标符号支持，保持原有逻辑）
//         $pattern = '/
//             ([A-Za-z\s\.\-]+?)          # 姓名（支持缩写、空格）
//             \s*                         # 姓名与上标间空格
//             (                           # 上标组（扩展符号支持）
//                 \d+                     # 起始数字
//                 (?:[†#*,]|<SEP>\d+)*    # 允许：†#*符号、逗号、<SEP>+数字（兼容1,†、1,*等）
//             )
//             \s*,?                       # 作者间逗号（可选）
//             (?=\s|$)                    # 确保后面是空格或结尾
//         /ux';

//         preg_match_all($pattern, $tempStr, $matches);
//         var_dump($matches);exit;
//         $authorList = [];
//         if(!empty($matches[1])){
//             foreach ($matches[1] as $i => $name) {
//                 $name = trim($name);
//                 $superscript = trim($matches[2][$i]);
//                 $superscript = str_replace('<SEP>', ',', $superscript); // 恢复多编号逗号
//                 $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号
//                 // 修复符号与数字间的空格（如原始"1 *"被误处理为"1*"的情况，保持原样）
//                 $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript);
//                 if (!empty($name)) {
//                     $authorList[] = [
//                         'name' => $name,
//                         'superscript' => $superscript
//                     ];
//                 }
//             }
//         }else {
//             // 按“两个或多个连续空格”拆分（姓名之间的分隔）
//             $authorList = array_filter(
//                 array_map('trim',
//                     preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent)
//                 )
//             );
//         }


//         // //处理作者
//         $aAuthorData = [];
//         $aReport = [];
//         $namePattern = '/
//             (?:[A-Za-z\s·\-\']+|                # 英文姓名（支持空格、连字符）
//              [\x{4e00}-\x{9fa5}]+|             # 中文姓名
//              [\x{1800}-\x{18AF}]+|             # 蒙古文姓名
//              [A-Z]\.)                           # 单字母缩写（如 J.）
//         /ux';

//         foreach ($authorList as $authorStr){
//             if (empty($authorStr)) continue;

//             //获取下标
//             $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript'];
//             $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name'];

//             $companyId = [];
//             $isSuper = 0;
//             $isReport = 0;
//             if (!empty($superscript)) {
//                 // 提取机构编号（忽略上标中的逗号，如1,† → 提取1）
//                 preg_match_all('/\d+/', $superscript, $numMatch);
//                 // 识别特殊符号（#为超级作者，*†为通讯作者）
//                 $isSuper = strpos($superscript, '#') !== false ? 1 : 0;
//                 $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
//             }
//             if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) {
//                 $nameStr = trim($match[1]);
//             }
//             $aAuthorData[] = [
//                 'name' => $nameStr,
//                 'company_id' => empty($numMatch[0]) ? [] : $numMatch[0],
//                 'is_super' => $isSuper,
//                 'is_report' => $isReport
//             ];
//             if ($isReport) {
//                 $aReport[] = $nameStr;
//             }
//         }
//         return ['author' => $aAuthorData,'report' => array_unique($aReport)];
//     }

    /**
     * Extract the numbered affiliation list that follows the author paragraph.
     *
     * The paragraphs after the author line are grouped by their leading number;
     * a line without a leading number is appended to the most recent numbered
     * entry. Each grouped entry is then cleaned (whitespace, trailing dot,
     * trailing "*Email" part) before being returned.
     *
     * @param array $aParam Optional overrides: 'title' (document title) and
     *                      'authors' (author paragraph text). Missing values
     *                      are looked up from the document.
     * @return array Map of affiliation number => cleaned affiliation text;
     *               empty array when nothing follows the author paragraph.
     */
    private function getCompany($aParam = []){
        // Resolve the title and author paragraph, preferring caller-supplied values.
        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];
        // Return type 1 = list of paragraphs found after the author line.
        $allLines = $this->getContentAfterText($sAuthorContent,1);
        if(empty($allLines)){
            return [];
        }

        // Group the lines by leading number, merging continuation lines.
        $grouped = [];
        $currentNumber = null;
        foreach ($allLines as $line) {
            $line = trim($line);
            if (empty($line)) continue;

            // A leading run of ASCII digits, if present, is the entry number.
            $number = (string) substr($line, 0, strspn($line, '0123456789'));
            if ($number !== '') {
                $currentNumber = $number;
                // Drop the '.', '*' and spaces that separate the number from the text.
                $rest = (string) substr($line, strlen($number));
                $grouped[$currentNumber] = trim(ltrim($rest, '.* '));
                continue;
            }

            // No leading number: continuation of the current entry (ignored
            // when no numbered entry has been seen yet).
            if ($currentNumber !== null) {
                $grouped[$currentNumber] .= ' ' . $line;
            }
        }

        // Clean every grouped entry.
        $aCompany = [];
        foreach ($grouped as $number => $institution) {
            $institution = preg_replace('/\s+/', ' ', $institution); // collapse whitespace
            $institution = rtrim($institution, '.');
            $institution = preg_replace('/^\d+\s+/', '', $institution); // strip a repeated leading number
            $institution = trim($institution);
            // Keep only the part up to "<word> <digits>, <word>" when that shape is present.
            preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);
            $institution = trim($institutionmatches[1] ?? $institution);
            // Cut off a trailing "*Email ..." part.
            if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) {
                $institution = trim($matches[1]);
            }
            // Same encoding fallback as the rest of this class: only convert
            // when the text is not already valid UTF-8.
            if(!empty($institution) && !mb_check_encoding($institution, 'UTF-8')){
                $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
            }
            $aCompany[$number] = $institution;
        }
        return $aCompany;
    }

    /**
     * Extract contact details (e-mail, postal address, phone) of the
     * corresponding authors.
     *
     * The text after the affiliation paragraph is split on "*" and each block
     * is matched against the given corresponding-author names. When that
     * yields nothing, a "Corresponding Authors: ..." sentence is parsed instead.
     *
     * @param array $aParam 'report' => list of corresponding-author names
     *                      (required, otherwise nothing is extracted);
     *                      'title' => optional document title override.
     * @return array List of ['name', 'email', 'postal_address', 'tel'] arrays;
     *               empty array when no names were given or nothing matched.
     */
    private function getCorrespondingAuthors($aParam = []){
        $aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
        if(empty($aCorrespondingAuthor)){
            return [];
        }

        // Locate the author paragraph and the affiliation paragraph after it.
        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = $this->getNextParagraphAfterText($title);
        $sCompany = $this->getNextParagraphAfterText($sAuthorContent); // raw text of the affiliation paragraph
        if (empty($sCompany)) {
            // Fallback: join the parsed affiliation names.
            $aCompany = $this->getCompany($aParam);
            $sCompany = implode(' ', array_values($aCompany));
        }

        // Everything after the affiliation paragraph.
        $corrText = $this->getContentAfterText($sCompany);
        // Only convert when the text is not valid UTF-8 (same GBK fallback as
        // the rest of this class). Guessing among several single-byte
        // encodings can re-encode text that is already UTF-8.
        if (!empty($corrText) && !mb_check_encoding($corrText, 'UTF-8')) {
            $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
        }

        // Normalise full-width punctuation and collapse whitespace.
        $corrText = str_replace(['：', '＠'], [':', '@'], $corrText);
        $corrText = preg_replace('/\s+/', ' ', $corrText);

        // One block per "*"-separated chunk.
        $corrBlocks = preg_split('/\s*\*\s*/', $corrText);
        $corrBlocks = array_filter(array_map('trim', $corrBlocks));

        $aCorresponding = [];
        foreach ($corrBlocks as $block) {
            // Skip blocks that mention none of the corresponding authors.
            $sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
            if (empty($sName)) {
                continue;
            }
            preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
            preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
            preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
            $aCorresponding[] = [
                'name' => $sName,
                // The e-mail pattern captures up to the next whitespace, so
                // sentence punctuation glued to the address is trimmed off.
                'email' => isset($email[2]) ? rtrim(trim($email[2]), '.,;') : '',
                'postal_address' => isset($address[2]) ? trim($address[2]) : '',
                'tel' => isset($tel[2]) ? trim($tel[2]) : ''
            ];
        }

        if(empty($aCorresponding)){
            // Fallback format: "Corresponding Authors: Name, E-mail: x@y; ..."
            $pattern = '/Corresponding Authors: (.*?)(?=$|;)/s';
            preg_match($pattern, $corrText, $match);
            if (!empty($match[1])) {
                $authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/';
                preg_match_all($authorPattern, $match[1], $authors);
                if(!empty($authors[1])){
                    foreach ($authors[1] as $i => $rawName) {
                        // Same keys as the primary path so callers get one row shape.
                        $aCorresponding[] = [
                            'name' => empty($rawName) ? '' : trim($rawName),
                            'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i]),
                            'postal_address' => '',
                            'tel' => ''
                        ];
                    }
                }
            }
        }
        return $aCorresponding;
    }

    /**
     * Find the first corresponding-author name that occurs in a text block.
     *
     * The comparison is case-insensitive and also tries the name with its
     * space-separated parts in reverse order ("Li Wei" / "Wei Li").
     *
     * @param string $block     Text block to search.
     * @param array  $corrNames Candidate corresponding-author names.
     * @return string The matching entry of $corrNames as given, or '' when
     *                none of them occurs in the block.
     */
    private function matchCorrespondingName($block, $corrNames)
    {
        $block = (string) $block;
        foreach ($corrNames as $name) {
            $needle = trim((string) $name);
            // An empty needle would match every block (PHP 8) or raise a
            // warning (PHP 7) and hide the remaining candidates.
            if ($needle === '') {
                continue;
            }
            // mb_stripos folds case for non-ASCII letters too (e.g. "É"/"é").
            if (mb_stripos($block, $needle, 0, 'UTF-8') !== false) {
                return $name;
            }
            $nameParts = explode(' ', $needle);
            if (count($nameParts) >= 2) {
                $reversedName = implode(' ', array_reverse($nameParts));
                if (mb_stripos($block, $reversedName, 0, 'UTF-8') !== false) {
                    return $name;
                }
            }
        }
        return '';
    }

    /**
     * Return the text of the first non-blank element that follows the element
     * containing the target text.
     *
     * Elements are visited section by section; matching is a case-insensitive
     * substring test on the extracted element text.
     *
     * @param string $targetText Text to look for.
     * @return string Text of the following element, or '' when the target is
     *                blank, is not found, or nothing follows it.
     */
    private function getNextParagraphAfterText($targetText){
        // A blank target cannot identify a paragraph: as a stripos() needle it
        // matches everything on PHP 8 and raises a warning on PHP 7.
        if (!is_scalar($targetText) || trim((string) $targetText) === '') {
            return '';
        }
        $targetText = (string) $targetText;

        $found = false;
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $text = (string) $this->getTextFromElement($element);
                // Skip elements with no visible text (including whitespace-only
                // ones) so they are never returned as "the next paragraph".
                if (trim($text) === '') {
                    continue;
                }
                if ($found) {
                    return $text;
                }
                if (stripos($text, $targetText) !== false) {
                    $found = true;
                }
            }
        }
        return '';
    }

    /**
     * Collect the element texts that follow the element matching the target text.
     *
     * The target element is located fuzzily: both texts are lower-cased and
     * reduced to ASCII letters/digits, and an element matches when
     * similar_text() covers more than 50% of the reduced target. Collection
     * stops at the first element containing a stop keyword or after 200
     * collected entries.
     *
     * Because the reduction keeps ASCII letters and digits only, a target
     * without any of them can never match and yields an empty result.
     *
     * @param string $targetText  Text of the element after which to collect.
     * @param int    $return_type 1 = return the list of texts; any other value
     *                            = return them joined with "\n" as one string.
     * @return array|string Collected texts ([] or '' when nothing was found).
     */
    private function getContentAfterText($targetText,$return_type = 2){
        $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract','ABSTRACT'];
        $maxLines = 200;
        $content = [];

        // Normalise the target once; it does not change while scanning.
        $cleanTarget = preg_replace('/[^a-zA-Z0-9]/', '', strtolower((string) $targetText));
        $targetLength = strlen($cleanTarget);

        // With an empty reduced target no element can match, so skip the scan.
        if ($targetLength > 0) {
            $found = false;
            $lineNumber = 0;
            foreach ($this->sections as $section) {
                foreach ($section->getElements() as $element) {
                    $lineNumber++;
                    if (count($content) >= $maxLines) {
                        break 2;
                    }

                    $text = trim($this->getTextFromElement($element,$lineNumber));
                    if (empty($text)) continue;

                    if (!$found) {
                        // Compare on ASCII letters/digits only.
                        $cleanText = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($text));
                        if (similar_text($cleanText, $cleanTarget) / $targetLength > 0.5) {
                            $found = true;
                        }
                        continue;
                    }

                    // A stop keyword ends the collection for the whole document.
                    foreach ($stopKeywords as $kw) {
                        if (stripos($text, $kw) !== false) {
                            break 3;
                        }
                    }

                    $content[] = $text;
                }
            }
        }

        if($return_type == 1){
            return $content;
        }
        $content = implode("\n", $content);
        if(!empty($content) && !mb_check_encoding($content, 'UTF-8')){
            $content = mb_convert_encoding($content, 'UTF-8', 'GBK');
        }
        return $content;
    }

    /**
     * Extract the plain text of a PhpWord element, recursing into containers.
     *
     * PreserveText, Table and Cell elements are handled first and return their
     * concatenated text directly. For every other element the text of nested
     * children, Text, Link and Field/Note elements is accumulated, control
     * characters are replaced by spaces and runs of spaces are collapsed.
     *
     * @param object $element    Element to read.
     * @param int    $lineNumber Accepted for call-site compatibility; not used here.
     * @return string Extracted text ('' when the element carries none).
     */
    private function getTextFromElement($element,$lineNumber = 0){
        $text = '';
        // PreserveText: read the private "text" parts via reflection.
        if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
            $reflection = new \ReflectionClass($element);
            $property = $reflection->getProperty('text');
            $property->setAccessible(true);
            $textParts = $property->getValue($element);
            foreach ($textParts as $part) {
                if (strpos($part, 'HYPERLINK') !== false) {
                    // Decode HTML entities (&quot; -> ") and keep only the
                    // address that follows "mailto:".
                    $decoded = html_entity_decode($part);
                    if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
                        $text .= $match[1] . ' ';
                    }
                } else {
                    // Plain text part.
                    $text .= $part;
                }
            }
            return $text;
        }
        // Tables and cells (e-mail addresses may live inside a table).
        if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
            foreach ($element->getRows() as $row) {
                foreach ($row->getCells() as $cell) {
                    $text .= $this->getTextFromElement($cell);
                }
            }
            return $text;
        }
        if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
            foreach ($element->getElements() as $child) {
                $text .= $this->getTextFromElement($child);
            }
            return $text;
        }

        // Any other container: recurse into its children.
        if (method_exists($element, 'getElements')) {
            foreach ($element->getElements() as $child) {
                $text .= $this->getTextFromElement($child);
            }
        }

        // Text runs (formatted or not).
        if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
            $text .= $element->getText();
        }

        // Hyperlinks: a mailto: target is emitted before the link text.
        if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
            $target = $element->getTarget();
            if (strpos($target, 'mailto:') === 0) {
                $text .= str_replace('mailto:', '', $target) . ' ';
            }
            $text .= $element->getText() . ' ';
        }

        // Fields and notes.
        // NOTE(review): confirm that the installed PhpWord version provides
        // Field::getContent() and an Element\Note class.
        if ($element instanceof \PhpOffice\PhpWord\Element\Field) {
            $text .= $element->getContent() . ' ';
        }
        if ($element instanceof \PhpOffice\PhpWord\Element\Note) {
            $text .= $element->getContent() . ' ';
        }

        // Replace control characters (U+0000-U+001F, U+007F-U+009F) by spaces.
        // The pattern must run in UTF-8 mode: in byte mode the 0x7F-0x9F range
        // also hits continuation bytes of multi-byte characters and destroys them.
        $cleaned = preg_replace('/[\x{00}-\x{1F}\x{7F}-\x{9F}]/u', ' ', $text);
        if ($cleaned === null) {
            // Subject is not valid UTF-8: strip ASCII control bytes only so
            // the multi-byte sequences stay intact for the GBK fallback below.
            $cleaned = preg_replace('/[\x00-\x1F\x7F]/', ' ', $text);
        }
        // Tabs and line breaks are already spaces here; collapse the runs.
        $text = preg_replace('/ {2,}/', ' ', $cleaned);
        if(!empty($text) && !mb_check_encoding($text, 'UTF-8')){
            $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
        }
        return $text;
    }

    /**
     * Extract the abstract, the keywords and the funding statement from the
     * loaded Word document.
     *
     * The text of all elements is concatenated (one line per element). An
     * internal "Keywords-End-Flag" marker is appended after every element that
     * contains "Keywords:" so the keyword list can be delimited afterwards.
     *
     * @return array ['status' => 1, 'msg' => string, 'data' => [
     *                   'abstrart' => string,   // key spelling kept as-is for existing callers
     *                   'keywords' => string[], // sequentially indexed list
     *                   'fund'     => string,
     *               ]]
     */
    public function extractFromWord() {
        $endFlag = 'Keywords-End-Flag';
        $sContent = '';
        $sFundContent = '';
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $textContent = $this->getTextFromElement($element);
                if(empty($textContent)){
                    continue;
                }
                // Only convert text that is not valid UTF-8 (same GBK fallback
                // as the rest of this class); guessing among several
                // single-byte encodings can re-encode text that is already UTF-8.
                if (!mb_check_encoding($textContent, 'UTF-8')) {
                    $textContent = mb_convert_encoding($textContent, 'UTF-8', 'GBK');
                }
                $sContent .= $textContent;
                if(stripos($textContent, 'Keywords:') !== false){
                    $sContent .= $endFlag;
                }
                // Funding statement: everything from the first funding phrase
                // to the end of the text gathered so far, cut at "Peer review".
                if(empty($sFundContent)){
                    $aFund = $this->getMatchedFundPhrases($sContent);
                    if(!empty($aFund[0])){
                        $position = stripos($sContent, $aFund[0]);
                        $sFundContent = substr($sContent, $position);
                        $sFundContent = trim(str_ireplace($aFund[0], '', $sFundContent));
                        if (preg_match('/^(.*?)Peer review/', $sFundContent, $matches)) {
                            $sFundContent = $matches[1];
                        }
                    }
                }
                $sContent .= "\n";
            }
        }
        // The marker is internal and must not appear in the output.
        $sFundContent = str_replace($endFlag, '', $sFundContent);

        if(!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')){
            $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
        }
        // Collapse every whitespace run (line breaks included) to one space.
        $textContent = preg_replace('/(\S)\s+/', '$1 ', $sContent);
        $textContent = trim($textContent);

        // Removes the ":" / "：" that follows a heading such as "Abstract:".
        $stripLabelColon = static function ($value) {
            return trim((string) preg_replace('/^\s*(?::|：)\s*/', '', (string) $value));
        };

        // Turns the raw text captured after "Keywords" into a clean list.
        $parseKeywords = static function ($raw) use ($stripLabelColon, $endFlag) {
            $raw = str_replace($endFlag, '', (string) $raw);
            $raw = preg_replace('/\n+/', ' ', trim($raw));
            $raw = $stripLabelColon($raw);
            $raw = trim(rtrim($raw, ';,. ')); // drop trailing separators
            $parts = array_map('trim', preg_split('/[,;]\s*/', $raw));
            // array_values(): keep a sequential list even when empty items
            // (e.g. from ";;") were filtered out.
            return array_values(array_filter($parts, static function ($item) {
                return $item !== '';
            }));
        };

        // Abstract: text between "Abstract" and the next "Keywords" (or the end).
        $abstract = '';
        $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords|$)/i';
        if (preg_match($abstractPattern, $textContent, $abstractMatches)) {
            $abstract = preg_replace('/\n+/', ' ', trim($abstractMatches[1]));
            $abstract = $stripLabelColon($abstract);
        }

        // Keywords: first the marker-delimited form, then everything between
        // "Keywords" and "Introduction" (or the end of the text).
        $keywords = [];
        if (preg_match('/Keywords\s*(.*?)\s*Keywords-End-Flag/s', $textContent, $keywordMatches)) {
            $keywords = $parseKeywords($keywordMatches[1]);
        }
        if(empty($keywords)){
            if (preg_match('/Keywords\s*([\s\S]*?)(?=Introduction|$)/i', $textContent, $keywordMatches)) {
                $keywords = $parseKeywords($keywordMatches[1]);
            }
        }
        return [
            'status' => 1,
            'msg' => '提取成功',
            'data' => [
                'abstrart' => $abstract,
                'keywords' => $keywords,
                'fund' => $sFundContent
            ]
        ];
    }
    /**
     * List the funding phrases ("Supported by", "Funded by", ...) that occur
     * in the given text.
     *
     * Matching ignores case; each phrase is reported once, in its canonical
     * spelling, ordered by first occurrence in the text.
     *
     * @param string $content Text to scan.
     * @return array Canonical phrases found (empty array when none or when
     *               $content is empty).
     */
    private function getMatchedFundPhrases($content = '') {
        $result = [];
        if (!empty($content)) {
            // Canonical spellings of the supported funding phrases.
            $canonical = [
                'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
                'Funding was provided by', 'Funded in part by'
            ];

            // One case-insensitive alternation, "#" as delimiter, with a
            // capturing group so the matched wording is available.
            $alternatives = [];
            foreach ($canonical as $phrase) {
                $alternatives[] = preg_quote($phrase, '#');
            }
            $regex = '#(' . implode('|', $alternatives) . ')#i';
            preg_match_all($regex, $content, $hits);

            // Map each hit back to its canonical spelling, keeping only the
            // first occurrence of every phrase.
            $seen = [];
            $found = empty($hits[1]) ? [] : $hits[1];
            foreach ($found as $hit) {
                foreach ($canonical as $phrase) {
                    if (strcasecmp($hit, $phrase) !== 0) {
                        continue;
                    }
                    if (!isset($seen[$phrase])) {
                        $seen[$phrase] = true;
                        $result[] = $phrase;
                    }
                    break;
                }
            }
        }
        return $result;
    }
    /**
     * Logging hook. Currently a no-op: the only statement in the body is
     * commented out, so $msg is ignored.
     *
     * @param string $msg Message to log.
     */
    private function log($msg){
        // echo date('[Y-m-d H:i:s] ') . $msg . "\n";
    }
}