tougao/application/common/ArticleParserService.php

<?php
namespace app\common;
use PhpOffice\PhpWord\IOFactory;
use think\Exception;
use ZipArchive;
use RecursiveIteratorIterator;
use RecursiveDirectoryIterator;
use PhpOffice\PhpWord\Settings;
use PhpOffice\PhpWord\Element\TextRun;
use DOMDocument;
use DOMXPath;
// use BadMethodCallException;
class ArticleParserService
{
    private $phpWord;
    private $sections;

    /**
     * Load a Word document and cache its sections for the extraction methods.
     *
     * The file is loaded once as-is. If that load throws, a copy with every
     * EMF image removed is built by removeEmfFromDocx() and loaded instead;
     * the temporary copy is always deleted afterwards.
     *
     * A constructor cannot hand a value back to `new`, so a missing file is
     * signalled by leaving the section list empty (getTitle() then returns '')
     * instead of "returning" a JSON string that nobody can receive.
     *
     * @param string $filePath Path of the document to parse.
     * @throws \Exception When the document cannot be loaded even from the EMF-free copy.
     */
    public function __construct($filePath = '')
    {
        // Always leave the object in an iterable state, even when nothing can be loaded.
        $this->sections = [];
        if (!file_exists($filePath)) {
            return;
        }

        // Keep the original reader configuration: full structure, no compatibility
        // mode, output escaping enabled.
        Settings::setCompatibility(false);
        Settings::setOutputEscapingEnabled(true);

        try {
            $reader = IOFactory::createReader();
            $reader->setReadDataOnly(false);
            $this->phpWord = $reader->load($filePath);
        } catch (\Exception $e) {
            // Fallback: retry with a copy of the archive that has no EMF images.
            $processedFilePath = $this->removeEmfFromDocx($filePath);
            try {
                $reader = IOFactory::createReader();
                $reader->setReadDataOnly(false);
                $this->phpWord = $reader->load($processedFilePath);
            } finally {
                // Remove the temporary copy whether or not the second load succeeded.
                if (is_file($processedFilePath)) {
                    unlink($processedFilePath);
                }
            }
        }

        $this->sections = $this->phpWord->getSections();
    }
    /**
     * Build a copy of a DOCX archive in which every EMF image file is removed.
     *
     * The archive is extracted into a uniquely named `docx_temp_*` directory
     * under `ROOT_PATH/runtime`, all files with an `.emf` extension are deleted,
     * and the remaining tree is zipped again next to that directory. The
     * extraction directory is removed on every exit path.
     *
     * @param string $docxPath Path of the original DOCX file.
     * @return string Path of the repackaged temporary DOCX; the caller deletes it.
     * @throws \Exception When the archive cannot be opened, extracted or rewritten.
     */
    private function removeEmfFromDocx($docxPath){
        $zip = new ZipArchive();
        if ($zip->open($docxPath) !== true) {
            throw new \Exception("无法打开 DOCX 文件：{$docxPath}");
        }

        // 1. Create the scratch directory used for extraction.
        $tempDir = rtrim(ROOT_PATH,'/').'/runtime/'.uniqid('docx_temp_');
        if (!is_dir($tempDir) && !mkdir($tempDir, 0700, true)) {
            $zip->close();
            throw new \Exception("无法创建临时目录：{$tempDir}");
        }

        try {
            // 2. Extract the archive.
            $extracted = $zip->extractTo($tempDir);
            $zip->close();
            if (!$extracted) {
                throw new \Exception("无法解压 DOCX 文件：{$docxPath}");
            }

            // 3. Delete every EMF file, wherever it sits in the tree.
            $iterator = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($tempDir));
            foreach ($iterator as $file) {
                if ($file->isFile() && strtolower($file->getExtension()) === 'emf') {
                    unlink($file->getPathname());
                }
            }

            // 4. Repackage the remaining files as a DOCX.
            $processedPath = $tempDir . '_processed.docx';
            $newZip = new ZipArchive();
            if ($newZip->open($processedPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
                throw new \Exception("无法创建处理后的 DOCX 文件");
            }
            $this->addFilesToZip($tempDir, $newZip);
            // close() is where the archive is actually written, so its result matters.
            if (!$newZip->close()) {
                throw new \Exception("无法创建处理后的 DOCX 文件");
            }
        } finally {
            // 5. Never leave the extraction directory behind, even on failure.
            $this->deleteDir($tempDir);
        }

        return $processedPath;
    }

    /**
     * Add every file below a directory to an open ZipArchive.
     *
     * Entry names are always relative to $dir itself and use forward slashes,
     * so `$dir/word/media/a.png` is stored as `word/media/a.png`. (Computing the
     * name relative to the parent of the directory currently being scanned put
     * root files under an extra top-level folder and flattened deeper levels.)
     * Empty directories are not added.
     *
     * @param string     $dir Directory whose contents are added.
     * @param ZipArchive $zip Archive opened for writing.
     */
    private function addFilesToZip($dir, $zip)
    {
        $baseDir = rtrim($dir, '/\\');
        if (!is_dir($baseDir)) {
            return;
        }

        // Everything after "<baseDir><separator>" is the path inside the archive.
        $prefixLength = strlen($baseDir) + 1;
        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($baseDir, \FilesystemIterator::SKIP_DOTS)
        );

        foreach ($iterator as $file) {
            if (!$file->isFile()) {
                continue;
            }
            $absolutePath = $file->getPathname();
            // ZIP entry names use "/" regardless of the host separator.
            $relativePath = str_replace('\\', '/', substr($absolutePath, $prefixLength));
            $zip->addFile($absolutePath, $relativePath);
        }
    }

    /**
     * Recursively delete a temporary extraction directory.
     *
     * Deletion is refused unless the resolved path lies inside
     * `ROOT_PATH/runtime` and at least one path segment below that directory
     * starts with `docx_temp_`. Checking the segments (rather than only the
     * last path component) lets the recursion descend into sub-directories such
     * as `word/` while still confining deletion to extraction trees.
     *
     * @param string $dir Directory to delete.
     * @return bool True only when the directory and everything in it was removed.
     */
    private function deleteDir($dir){
        // 1. Basic validation: non-blank string naming an existing directory.
        if (trim($dir) === '' || !is_dir($dir)) {
            return false;
        }

        // 2. Normalise the path (drop trailing separators).
        $dir = rtrim($dir, DIRECTORY_SEPARATOR);

        // 3. Containment check against the resolved runtime directory. The
        //    trailing separator prevents "runtime_other" from matching "runtime".
        $runtimeDir = rtrim(ROOT_PATH, '/') . '/runtime';
        $realDir = realpath($dir);
        $realRuntimeDir = realpath($runtimeDir);
        if ($realDir === false || $realRuntimeDir === false) {
            return false;
        }
        $runtimePrefix = rtrim($realRuntimeDir, DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR;
        if (strpos($realDir . DIRECTORY_SEPARATOR, $runtimePrefix) !== 0) {
            return false;
        }

        // 4. The path below runtime/ must pass through a docx_temp_* directory.
        $relative = (string)substr($realDir, strlen($runtimePrefix));
        $insideTempTree = false;
        foreach (explode(DIRECTORY_SEPARATOR, $relative) as $segment) {
            if (strpos($segment, 'docx_temp_') === 0) {
                $insideTempTree = true;
                break;
            }
        }
        if (!$insideTempTree) {
            return false;
        }

        // 5. List the directory; errors are suppressed because cleanup is best-effort.
        $entries = @scandir($dir);
        if ($entries === false) {
            return false;
        }

        $isFullyDeleted = true;
        $realDirPrefix = $realDir . DIRECTORY_SEPARATOR;

        // 6. Remove children first.
        foreach ($entries as $entry) {
            if ($entry === '.' || $entry === '..') {
                continue;
            }

            $entryPath = $dir . DIRECTORY_SEPARATOR . $entry;
            $realEntryPath = realpath($entryPath);

            // A child must resolve to a location inside this directory; anything
            // else (e.g. a symlink pointing elsewhere) is left alone.
            if ($realEntryPath === false || strpos($realEntryPath, $realDirPrefix) !== 0) {
                $isFullyDeleted = false;
                continue;
            }

            if (is_dir($realEntryPath)) {
                if (!$this->deleteDir($realEntryPath)) {
                    $isFullyDeleted = false;
                }
            } elseif (!@unlink($realEntryPath)) {
                $isFullyDeleted = false;
            }
        }

        // 7. Remove the directory itself, but only once it is empty.
        $remaining = @scandir($dir);
        if ($remaining !== false && count($remaining) <= 2) {
            if (!@rmdir($dir)) {
                return false;
            }
            return $isFullyDeleted;
        }

        return false;
    }

    /**
     * Entry point: parse an uploaded manuscript and return its metadata as JSON.
     *
     * The JSON object always has `status` and `msg`; on success (`status` 1) it
     * also has `data` with the title, authors (`author`), corresponding author
     * names (`report`), affiliations (`company`), corresponding author details
     * (`corresponding`) and whatever extractFromWord() reports under `data`.
     *
     * Status codes: 2 = no path given, 3 = file missing, 4 = file unreadable,
     * 5 = document could not be loaded or no title was found.
     *
     * @param string $sFileUrl Path of the uploaded file.
     * @return string JSON-encoded result.
     */
    public static function uploadAndParse($sFileUrl){
        // Required-value check.
        if(empty($sFileUrl)){
            return json_encode(['status' => 2,'msg' => 'Please upload the submission file']);
        }

        // The file must exist and be readable.
        if (!file_exists($sFileUrl)) {
            return json_encode(['status' => 3, 'msg' => 'The uploaded file does not exist']);
        }
        if (!is_readable($sFileUrl)) {
            return json_encode(['status' => 4, 'msg' => 'The uploaded file is unreadable']);
        }

        // Load the document; a load failure is reported as JSON instead of
        // escaping as an uncaught exception.
        try {
            $oDealFile = new self($sFileUrl);
        } catch (\Exception $e) {
            return json_encode(['status' => 5, 'msg' => $e->getMessage()]);
        }

        // Title.
        $sTitle = $oDealFile->getTitle();
        if(empty($sTitle)){
            return json_encode(['status' => 5, 'msg' => 'Article title retrieval failed']);
        }

        // Authors; the raw title is passed on because later lookups search the
        // document for it.
        $aParam = ['title' => $sTitle];
        $aAuthor = $oDealFile->getAuthors($aParam);
        $aParam['author'] = empty($aAuthor['author']) ? [] : $aAuthor['author']; // all authors
        $aParam['report'] = empty($aAuthor['report']) ? [] : $aAuthor['report']; // corresponding author names

        // Affiliations.
        $aParam['company'] = $oDealFile->getCompany($aParam);
        // Corresponding author details.
        $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam);
        // Abstract and keywords.
        $aContent = $oDealFile->extractFromWord();

        // Normalise the title that is actually returned (the conversion used to
        // be applied to a local copy that was never read again).
        if(!mb_check_encoding($aParam['title'], 'UTF-8')){
            $aParam['title'] = mb_convert_encoding($aParam['title'], 'UTF-8', 'GBK');
        }
        $aParam['title'] = $oDealFile->fullDecode($aParam['title']);

        // Keys already present in $aParam win over keys from the extracted content.
        $aParam += empty($aContent['data']) ? [] : $aContent['data'];
        return json_encode(['status' => 1,'msg' => 'success','data' => $aParam]);
    }

    /**
     * Pick the article title.
     *
     * The title is taken to be the first top-level element, in document order,
     * whose trimmed text is longer than three characters.
     *
     * @return string The trimmed title text, or '' when no element qualifies.
     */
    private function getTitle(){
        if (empty($this->sections)) {
            return '';
        }

        foreach ($this->sections as $oSection) {
            foreach ($oSection->getElements() as $oElement) {
                $sCandidate = trim($this->getTextFromElement($oElement));
                // Very short fragments (numbering, stray marks) are not titles.
                if (mb_strlen($sCandidate) > 3) {
                    return $sCandidate;
                }
            }
        }

        return '';
    }

    /**
     * Split an author line into individual authors and decode their superscripts.
     *
     * The line is normalised (NBSP, known mojibake, full-width comma/digits,
     * "and" separators, spacing inside superscripts), then scanned character by
     * character: name characters build the current name, digits and marker
     * symbols build the current superscript, and ", " or the start of a new name
     * after a superscript closes the current author.
     *
     * Name characters are any Unicode letter or combining mark plus space, dot,
     * hyphen and apostrophe, so names such as "Müller" or "O'Brien" are kept
     * intact instead of losing the characters that are not plain ASCII.
     *
     * Each returned entry has:
     *  - name        string  trimmed author name
     *  - superscript string  raw superscript (digits, commas, marker symbols)
     *  - company_id  int[]   distinct numbers found in the superscript
     *  - is_super    int     1 when the superscript contains '#'
     *  - is_report   int     1 when the superscript contains '*' or '†'
     *
     * @param string $str Raw author line.
     * @return array List of author entries as described above.
     */
    private function parseAuthorsWithoutRegex($str = '') {
        if (empty($str)) {
            return [];
        }
        if(!mb_check_encoding($str, 'UTF-8')){
            $str = mb_convert_encoding($str, 'UTF-8', 'GBK');
        }
        $str = $this->fullDecode($str);
        // NBSP, known mojibake sequences and the full-width comma become spaces;
        // full-width digits become ASCII digits.
        $str = str_replace(["\xC2\xA0", 'ï¼', 'ï¿½', '，', '１', '２', '３', '４', '５', '６', '７', '８', '９', '０'],
                          [' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], $str);
        $str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str));

        // Pass 1: drop the space after a comma that sits between two digits
        // ("2, 3" -> "2,3") so it is not mistaken for an author separator.
        $len = mb_strlen($str);
        $processed = '';
        for ($i = 0; $i < $len; $i++) {
            $char = mb_substr($str, $i, 1);
            if ($char === ',' && $i - 1 >= 0 && $i + 2 < $len) {
                $prevChar = mb_substr($str, $i - 1, 1);
                $next1 = mb_substr($str, $i + 1, 1);
                $next2 = mb_substr($str, $i + 2, 1);
                if ((ctype_digit($prevChar) || is_numeric($prevChar)) && $next1 === ' ' && (ctype_digit($next2) || is_numeric($next2))) {
                    $processed .= $char;
                    $i += 1; // skip the space
                    continue;
                }
            }
            $processed .= $char;
        }
        $str = $processed;

        // Pass 2: drop the space between a digit and a marker symbol ("1 *" -> "1*").
        $len = mb_strlen($str);
        $processed = '';
        for ($i = 0; $i < $len; $i++) {
            $char = mb_substr($str, $i, 1);
            if ((ctype_digit($char) || is_numeric($char)) && $i + 2 < $len) {
                $next1 = mb_substr($str, $i + 1, 1);
                $next2 = mb_substr($str, $i + 2, 1);
                if ($next1 === ' ' && in_array($next2, ['#', '*', '†', '‡', '§'], true)) {
                    $processed .= $char;
                    $i += 2; // skip the space and the symbol, which is appended here
                    $processed .= $next2;
                    continue;
                }
            }
            $processed .= $char;
        }
        $str = $processed;

        // Pass 3: collapse runs of spaces into one.
        $len = mb_strlen($str);
        $processed = '';
        $prevSpace = false;
        for ($i = 0; $i < $len; $i++) {
            $char = mb_substr($str, $i, 1);
            if ($char === ' ') {
                if (!$prevSpace) {
                    $processed .= $char;
                    $prevSpace = true;
                }
            } else {
                $processed .= $char;
                $prevSpace = false;
            }
        }
        $str = trim($processed);

        // Pass 4: split into authors.
        $authors = [];
        $currentName = '';
        $currentSuperscript = '';
        $inName = true;
        $len = mb_strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $char = mb_substr($str, $i, 1);

            // ", " separates two authors.
            if ($char === ',' && $i + 1 < $len) {
                $nextChar = mb_substr($str, $i + 1, 1);
                if ($nextChar === ' ') {
                    if (!empty($currentName)) {
                        $currentSuperscript = rtrim($currentSuperscript, ',');
                        $authors[] = [
                            'name' => trim($currentName),
                            'superscript' => trim($currentSuperscript)
                        ];
                    }
                    $currentName = '';
                    $currentSuperscript = '';
                    $inName = true;
                    $i++; // skip the space after the comma
                    continue;
                }
            }

            // Name characters: any letter or combining mark (preg_match returns
            // false for a malformed character, which then counts as "not a
            // letter"), plus space, dot, hyphen and apostrophes.
            $isNameChar = ctype_alpha($char)
                || in_array($char, [' ', '.', '-', "'", '’'], true)
                || preg_match('/^[\p{L}\p{M}]$/u', $char) === 1;

            if ($isNameChar) {
                if ($inName) {
                    $currentName .= $char;
                } else {
                    // A name character right after a superscript starts the next author.
                    $currentSuperscript = rtrim($currentSuperscript, ',');
                    $authors[] = [
                        'name' => trim($currentName),
                        'superscript' => trim($currentSuperscript)
                    ];
                    $currentName = $char;
                    $currentSuperscript = '';
                    $inName = true;
                }
            }
            // Superscript characters: digits, commas and marker symbols.
            elseif ((ctype_digit($char) || is_numeric($char)) || in_array($char, ['#', '*', '†', ',', '‡', '§'], true)) {
                $inName = false;
                $currentSuperscript .= $char;
            }
            // Anything else is ignored.
            else {
                continue;
            }
        }

        // Flush the last author.
        if (!empty($currentName)) {
            $currentSuperscript = rtrim($currentSuperscript, ',');
            $authors[] = [
                'name' => trim($currentName),
                'superscript' => trim($currentSuperscript)
            ];
        }

        // Derive affiliation numbers and role flags from each superscript.
        foreach ($authors as $index => &$author) {
            $institutionIds = [];
            $superscript = $author['superscript'];
            $numStr = '';
            $superscriptLen = mb_strlen($superscript);
            for ($i = 0; $i < $superscriptLen; $i++) {
                $c = mb_substr($superscript, $i, 1);
                if (ctype_digit($c) || is_numeric($c)) {
                    $numStr .= $c;
                } elseif ($numStr !== '') {
                    // Compare against '' explicitly: empty('0') is true and would drop index 0.
                    $institutionIds[] = (int)$numStr;
                    $numStr = '';
                }
            }
            if ($numStr !== '') {
                $institutionIds[] = (int)$numStr;
            }
            $institutionIds = array_values(array_unique($institutionIds));
            $author['company_id'] = $institutionIds;

            // '#' flags the author as is_super; '*' or '†' flags a corresponding author.
            $author['is_super'] = strpos($superscript, '#') !== false ? 1 : 0;
            $author['is_report'] = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0;
        }
        unset($author); // break the reference left by the foreach
        return $authors;
    }
    /**
     * Extract the author list from the paragraph that follows the title.
     *
     * @param array $aParam Optional `title`; when absent the title is looked up with getTitle().
     * @return array `author` => list of author entries produced by
     *               parseAuthorsWithoutRegex(), `report` => list of distinct
     *               names of authors flagged as corresponding (`is_report` = 1).
     */
    private function getAuthors($aParam = []) {
        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = $this->getNextParagraphAfterText($title);
        if (empty($sAuthorContent)) {
            return ['author' => [], 'report' => []];
        }
        if(!mb_check_encoding($sAuthorContent, 'UTF-8')){
            $sAuthorContent = mb_convert_encoding($sAuthorContent, 'UTF-8', 'GBK');
        }
        $sAuthorContent = $this->fullDecode($sAuthorContent);

        // Strip control and zero-width characters. A /u pattern yields null on
        // malformed input; in that case keep the text rather than losing it.
        $sStripped = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent);
        if ($sStripped !== null) {
            $sAuthorContent = $sStripped;
        }

        // Repair known mojibake renderings of the dagger and of full-width punctuation.
        $symbolMap = [
            'â€ ' => '†', 'â  ' => '†', 'â' => '†', '?†' => '†',
            // The dagger read as Windows-1252, written as explicit bytes so the
            // trailing no-break space cannot be lost by an editor.
            "\xC3\xA2\xE2\x82\xAC\xC2\xA0" => '†',
            'ï¼š' => ':', 'ï¼Œ' => ',', 'â€”' => '-',
            '啊' => '' // stray character observed as fixed garbage in extracted author lines
        ];
        $sAuthorContent = strtr($sAuthorContent, $symbolMap);

        // Normalise separators, "and" and whitespace.
        $sAuthorContent = str_replace(['，', ';', '；', '、'], ',', $sAuthorContent);
        $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent);
        $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent);
        $sAuthorContent = trim($sAuthorContent);

        $aAuthor = $this->parseAuthorsWithoutRegex($sAuthorContent);
        if(empty($aAuthor)){
            return ['author' => [],'report' => []];
        }

        $aReport = $aAuthorData = [];
        foreach ($aAuthor as $value) {
            // Skip entries that carry neither a name nor a superscript.
            if(empty($value['name']) && empty($value['superscript'])){
                continue;
            }
            if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){
                $aReport[] = $value['name'];
            }
            $aAuthorData[] = $value;
        }

        // Re-index after de-duplication so the list is encoded as a JSON array
        // and not as an object with gaps in its keys.
        return ['author' => $aAuthorData,'report' => array_values(array_unique($aReport))];
    }

    /**
     * Extract the affiliations listed after the author paragraph.
     *
     * Every line returned by getContentAfterText() that starts with an index
     * (one ASCII letter or digit followed by any number of digits) becomes one
     * affiliation keyed by that index. Lines without a leading index are ignored.
     *
     * @param array $aParam Optional `title` and `authors` (the raw author line);
     *                      missing values are looked up in the document.
     * @return array Map of index => cleaned affiliation text.
     */
    private function getCompany($aParam = []){
        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        // Raw author line: the paragraph right below the title unless supplied.
        $sAuthorContent = empty($aParam['authors']) ? $this->getNextParagraphAfterText($title) : $aParam['authors'];
        // Everything after the author line, one entry per element.
        $allLines = $this->getContentAfterText($sAuthorContent,1);
        if(empty($allLines)){
            return [];
        }

        // Group lines by their leading index.
        $grouped = [];
        foreach ($allLines as $line) {
            $line = trim($line);
            if (empty($line)) {
                continue;
            }
            if(!mb_check_encoding($line, 'UTF-8')){
                $line = mb_convert_encoding($line, 'UTF-8', 'GBK');
            }
            $line = $this->fullDecode($line);

            // Read the index byte by byte; only ASCII characters can qualify, so
            // byte offsets are safe here.
            $number = '';
            $i = 0;
            $lineLen = strlen($line);
            $hasFirstChar = false;
            while ($i < $lineLen) {
                $currentChar = $line[$i];
                if (!$hasFirstChar) {
                    // First character: an ASCII letter or a digit.
                    if (ctype_digit($currentChar) || ctype_alpha($currentChar)) {
                        $number .= $currentChar;
                        $hasFirstChar = true;
                        $i++;
                    } else {
                        break;
                    }
                } else {
                    // Following characters: digits only.
                    if (ctype_digit($currentChar)) {
                        $number .= $currentChar;
                        $i++;
                    } else {
                        break;
                    }
                }
            }

            // Lines without an index are intentionally not merged into the
            // previous affiliation. ('' is tested explicitly so index "0" counts.)
            if ($number === '') {
                continue;
            }

            // Skip separator characters between the index and the text.
            while ($i < $lineLen && (in_array($line[$i], ['.', '*', ' '], true))) {
                $i++;
            }
            $grouped[$number] = trim(substr($line, $i));
        }

        $aCompany = [];
        foreach ($grouped as $number => $institution) {
            $institution = $this->fullDecode($institution);
            $institution = preg_replace('/\s+/', ' ', $institution); // collapse whitespace
            $institution = rtrim($institution, '.');                   // drop trailing full stops
            $institution = preg_replace('/^\d+\s+/', '', $institution); // drop a leading number
            $institution = trim($institution);

            // Cut at the first '*', '#', '†' or "Email": what follows belongs to
            // the corresponding-author note. The /u modifier makes '†' one
            // character; without it the class matched single bytes and cut the
            // name at characters such as "–" or "’". On malformed UTF-8 the match
            // fails and the text is kept as is.
            if (preg_match('/^(.*?)(?=\s*[\*#†]|(?i)\s*Email)/u', $institution, $matches)) {
                $institution = trim($matches[1]);
            }

            if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) {
                $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
            }
            $aCompany[$number] = $institution;
        }
        return $aCompany;
    }

    /**
     * Extract contact details (e-mail, postal address, telephone) of the
     * corresponding authors.
     *
     * The text following the affiliation paragraph is split at '*' and each
     * piece is matched against the known corresponding author names. If no
     * piece matches, a "Corresponding Author(s)" / "Correspondence (to)" note is
     * searched for "name, e-mail" pairs instead.
     *
     * @param array $aParam `report` (corresponding author names, required) and
     *                      optionally `title`.
     * @return array List of entries with keys name, email, postal_address, tel.
     */
    private function getCorrespondingAuthors($aParam = []){
        $aCorrespondingAuthor = empty($aParam['report']) ? [] : $aParam['report'];
        if(empty($aCorrespondingAuthor)){
            return [];
        }

        $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title'];
        $sAuthorContent = $this->getNextParagraphAfterText($title);
        // Raw text of the paragraph that holds the affiliations.
        $sCompany = $this->getNextParagraphAfterText($sAuthorContent);
        if (empty($sCompany)) {
            // Fallback: rebuild the anchor text from the parsed affiliations.
            $aCompany = $this->getCompany($aParam);
            $sCompany = implode(' ', array_values($aCompany));
        }

        // Everything after the affiliations.
        $corrText = $this->getContentAfterText($sCompany);
        if(!mb_check_encoding($corrText, 'UTF-8')){
            $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK');
        }
        $corrText = $this->fullDecode($corrText);

        // Normalise full-width punctuation and whitespace.
        $corrText = str_replace(['：', '＠'], [':', '@'], $corrText);
        $corrText = preg_replace('/\s+/', ' ', $corrText);

        // Primary path: one block per '*'.
        $corrBlocks = preg_split('/\s*\*\s*/', $corrText);
        $corrBlocks = array_filter(array_map('trim', $corrBlocks));

        $aCorresponding = [];
        foreach ($corrBlocks as $block) {
            $sName = $this->matchCorrespondingName($block, $aCorrespondingAuthor);
            if (empty($sName)) {
                continue;
            }
            preg_match('/(E[\s-]*mail|邮箱)[\s:]*([^\s]+@[^\s]+)/i', $block, $email);
            preg_match('/(Postal[\s-]*address|地址)[\s:]*([^,;]+)/i', $block, $address);
            preg_match('/(Tel|电话)[\s:]*([^\s]+)/i', $block, $tel);
            $aCorresponding[] = [
                'name' => $sName,
                'email' => isset($email[2]) ? trim($email[2]) : '',
                'postal_address' => isset($address[2]) ? trim($address[2]) : '',
                'tel' => isset($tel[2]) ? trim($tel[2]) : ''
            ];
        }
        if (!empty($aCorresponding)) {
            return $aCorresponding;
        }

        // Fallback path. The label alternatives are grouped so that the capture
        // applies to all of them; ungrouped, only the last alternative carried
        // the capture and the other labels never produced any content.
        $pattern = '/(?:Corresponding Authors?|Correspondence to|Correspondence)\s*:?\s*(.*?)(?=$|;)/is';
        $corrText = trim($corrText,'*');
        preg_match($pattern, $corrText, $match);
        if (empty($match[1])) {
            return $aCorresponding;
        }
        $corrContent = $match[1];

        // Try "Name, E-mail: address" first, then the looser "Name, address".
        $authorPatterns = [
            '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/',
            '/([A-Za-z0-9\s]+?),\s*([\w@\.\-]+)(?=\.?)/',
        ];
        foreach ($authorPatterns as $authorPattern) {
            preg_match_all($authorPattern, $corrContent, $authors);
            if (empty($authors[1])) {
                continue;
            }
            foreach ($authors[1] as $i => $sMatchedName) {
                // Same keys as the primary path so callers see one shape.
                $aCorresponding[] = [
                    'name' => empty($sMatchedName) ? '' : trim($sMatchedName),
                    'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i]),
                    'postal_address' => '',
                    'tel' => ''
                ];
            }
            break;
        }
        return $aCorresponding;
    }

    /**
     * Find which known corresponding author a text block refers to.
     *
     * The comparison is case-insensitive for the whole of Unicode (not only
     * ASCII) and also tries the name with its space-separated parts in reverse
     * order ("Zhang Wei" also matches "Wei Zhang").
     *
     * @param string $block     Text to search.
     * @param array  $corrNames Candidate author names.
     * @return string The first candidate found in the block, or '' when none is.
     */
    private function matchCorrespondingName($block, $corrNames)
    {
        $blockLower = mb_strtolower((string)$block, 'UTF-8');
        foreach ($corrNames as $name) {
            $nameLower = mb_strtolower((string)$name, 'UTF-8');
            // An empty needle would match everything (or raise a warning).
            if ($nameLower === '') {
                continue;
            }
            if (strpos($blockLower, $nameLower) !== false) {
                return $name;
            }
            $nameParts = explode(' ', $nameLower);
            if (count($nameParts) >= 2) {
                $reversedName = implode(' ', array_reverse($nameParts));
                if (strpos($blockLower, $reversedName) !== false) {
                    return $name;
                }
            }
        }
        return '';
    }

    /**
     * Return the text of the first non-empty element that follows the element
     * containing the target text (case-insensitive substring match).
     *
     * @param string $targetText Text to look for.
     * @return string Text of the following element, or '' when the target is
     *                empty, is not found, or is the last non-empty element.
     */
    private function getNextParagraphAfterText($targetText){
        // An empty needle cannot be searched for: depending on the PHP version
        // stripos() either warns or reports a match at offset 0.
        if (empty($this->sections) || $targetText === null || $targetText === '') {
            return '';
        }

        $found = false;
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $text = $this->getTextFromElement($element);
                if(empty($text)){
                    continue;
                }
                // The element after the match is the one being asked for.
                if ($found) {
                    return $text;
                }
                if (stripos($text, $targetText) !== false) {
                    $found = true;
                }
            }
        }
        return '';
    }

    /**
     * Collect the text of the elements that follow an anchor text.
     *
     * The anchor is the first non-empty element whose alphanumeric-only,
     * lower-cased text is similar enough to the target (similar_text() divided
     * by the target length above 0.5). Collection stops at the first element
     * containing a stop keyword or once 200 entries have been gathered.
     *
     * @param string $targetText  Anchor text to look for.
     * @param int    $return_type 1 returns the list of collected strings; any
     *                            other value returns them joined by "\n".
     * @return array|string Collected content in the requested form.
     */
    private function getContentAfterText($targetText,$return_type = 2){
        $aStopWords = ['关键词', 'Key words', '摘要', 'Abstract','ABSTRACT'];
        $iLimit = 200;

        $aCollected = [];
        $bAnchorSeen = false;
        $iElement = 0;
        // Fingerprint of the anchor, built on first use and then reused.
        $sNeedle = null;
        $iNeedleLen = 0;

        foreach ($this->sections as $oSection) {
            foreach ($oSection->getElements() as $oElement) {
                $iElement++;
                if (count($aCollected) >= $iLimit) {
                    break 2;
                }

                $sText = trim($this->getTextFromElement($oElement, $iElement));
                if (empty($sText)) {
                    continue;
                }

                if (!$bAnchorSeen) {
                    if ($sNeedle === null) {
                        $sNeedle = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($targetText));
                        $iNeedleLen = strlen($sNeedle);
                    }
                    $sHaystack = preg_replace('/[^a-zA-Z0-9]/', '', strtolower($sText));
                    // More than half of the anchor must be matched.
                    if ($iNeedleLen > 0 && similar_text($sHaystack, $sNeedle) / $iNeedleLen > 0.5) {
                        $bAnchorSeen = true;
                    }
                    // The anchor element itself is never collected.
                    continue;
                }

                // A stop keyword ends the whole scan.
                foreach ($aStopWords as $sStopWord) {
                    if (stripos($sText, $sStopWord) !== false) {
                        break 3;
                    }
                }

                $aCollected[] = $sText;
            }

            if (count($aCollected) >= $iLimit) {
                break;
            }
        }

        if ($return_type == 1) {
            return $aCollected;
        }

        $sJoined = implode("\n", $aCollected);
        if (!empty($sJoined) && !mb_check_encoding($sJoined, 'UTF-8')) {
            $sJoined = mb_convert_encoding($sJoined, 'UTF-8', 'GBK');
        }
        return $sJoined;
    }

    /**
     * Extract the plain text of a PHPWord element, recursing into containers.
     *
     * PreserveText, Table and Cell elements return their concatenated text
     * unmodified. For every other element the collected text is made valid
     * UTF-8 first and only then cleaned: control characters and runs of
     * whitespace become a single space. Cleaning is done per character (/u);
     * a per-byte class covering \x80-\x9F would hit UTF-8 continuation bytes
     * and destroy characters such as "†", "À" and many CJK characters.
     *
     * @param object $element    PHPWord element.
     * @param int    $lineNumber Accepted for call compatibility; not used.
     * @return string Extracted text.
     */
    private function getTextFromElement($element,$lineNumber = 0){
        $text = '';

        // PreserveText keeps its parts in a private property; read it via reflection.
        if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) {
            $reflection = new \ReflectionClass($element);
            $property = $reflection->getProperty('text');
            $property->setAccessible(true);
            $textParts = (array)$property->getValue($element);
            foreach ($textParts as $part) {
                if (strpos($part, 'HYPERLINK') !== false) {
                    // Field code: decode entities (&quot; -> ") and keep only a mailto address.
                    $decoded = html_entity_decode($part);
                    if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) {
                        $text .= $match[1] . ' ';
                    }
                } else {
                    $text .= $part;
                }
            }
            return $text;
        }

        // Tables and cells (e-mail addresses may sit in a table).
        if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
            foreach ($element->getRows() as $row) {
                foreach ($row->getCells() as $cell) {
                    $text .= $this->getTextFromElement($cell);
                }
            }
            return $text;
        }
        if ($element instanceof \PhpOffice\PhpWord\Element\Cell) {
            foreach ($element->getElements() as $child) {
                $text .= $this->getTextFromElement($child);
            }
            return $text;
        }

        // Any other container: concatenate the children.
        if (method_exists($element, 'getElements')) {
            foreach ($element->getElements() as $child) {
                $text .= $this->getTextFromElement($child);
            }
        }

        // Plain text run.
        if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
            $text .= $element->getText();
        }

        // Hyperlink: a mailto target is emitted before the visible text.
        if ($element instanceof \PhpOffice\PhpWord\Element\Link) {
            // NOTE(review): getTarget() is assumed to be a legacy alias of
            // getSource(); prefer getSource() when the installed PHPWord has it.
            $target = method_exists($element, 'getSource') ? $element->getSource() : $element->getTarget();
            $target = (string)$target;
            if (strpos($target, 'mailto:') === 0) {
                $text .= str_replace('mailto:', '', $target) . ' ';
            }
            $text .= $element->getText() . ' ';
        }

        // Fields and notes. NOTE(review): getContent() could not be confirmed
        // on PHPWord's Field class, so it is only called when present and
        // getText() is the fallback; an object result is extracted recursively.
        if ($element instanceof \PhpOffice\PhpWord\Element\Field
            || $element instanceof \PhpOffice\PhpWord\Element\Note) {
            $fieldValue = '';
            if (method_exists($element, 'getContent')) {
                $fieldValue = $element->getContent();
            } elseif (method_exists($element, 'getText')) {
                $fieldValue = $element->getText();
            }
            if (is_object($fieldValue)) {
                $fieldValue = $this->getTextFromElement($fieldValue);
            }
            $text .= (is_scalar($fieldValue) ? $fieldValue : '') . ' ';
        }

        if ($text === '') {
            return $text;
        }

        // Validate the encoding BEFORE cleaning, so the cleaning step can work
        // on characters rather than bytes.
        if (!mb_check_encoding($text, 'UTF-8')) {
            $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
        }

        // C0/C1 control characters and whitespace runs collapse into one space.
        $cleaned = preg_replace('/[\x{00}-\x{1F}\x{7F}-\x{9F}\s]+/u', ' ', $text);
        if ($cleaned === null) {
            // Still not decodable as UTF-8: restrict the cleanup to ASCII so no
            // multibyte sequence is touched.
            $cleaned = preg_replace('/[\x00-\x1F\x7F\s]+/', ' ', $text);
        }
        return $cleaned === null ? $text : $cleaned;
    }

    /**
     * Cut the abstract and the keyword list out of a flat text.
     *
     * Three markers are located in order: the abstract label and the keywords
     * label (both case-insensitive) and the end marker (case-sensitive). The
     * text between abstract and keywords, and between keywords and the end
     * marker, is returned with surrounding whitespace and any leading ':', ' ',
     * '-' or '—' removed.
     *
     * @param string $str     Text to search.
     * @param array  $markers Optional overrides for the keys `abstract`,
     *                        `keywords` and `end_span`; other keys are ignored.
     * @return array Keys: abstract_to_keywords (string), keywords_to_end
     *               (string), positions (byte offsets of the three markers, -1
     *               when not found), is_valid (bool), error (string).
     */
    function extractContentIntervals($str, $markers = []) {
        // 1. Resolve the markers. They are read explicitly instead of being
        //    extract()ed, so a key such as "str" or "result" in $markers cannot
        //    overwrite a local variable.
        $defaultMarkers = [
            'abstract' => 'abstract',
            'keywords' => 'keywords',
            'end_span' => '===========end-span'
        ];
        $markers = array_merge($defaultMarkers, $markers);
        $abstract = (string)$markers['abstract'];
        $keywords = (string)$markers['keywords'];
        $end_span = (string)$markers['end_span'];

        // 2. Result skeleton.
        $result = [
            'abstract_to_keywords' => '',
            'keywords_to_end' => '',
            'positions' => [ // -1 means "not found"
                'abstract' => -1,
                'keywords' => -1,
                'end_span' => -1
            ],
            'is_valid' => false,
            'error' => ''
        ];

        // Removes whitespace and leading label punctuation. The pattern works on
        // characters; a byte-wise ltrim() with "—" in its list would also strip
        // the first bytes of other multibyte characters such as "“".
        $stripLead = function ($text) {
            $text = trim($text);
            $clean = preg_replace('/^[: \-—]+/u', '', $text);
            if ($clean === null) {
                // Malformed UTF-8: fall back to the ASCII subset.
                $clean = ltrim($text, ': -');
            }
            return trim($clean);
        };

        // 3. Abstract label (case-insensitive). An empty label counts as missing.
        $absPos = $abstract === '' ? false : stripos($str, $abstract);
        if ($absPos === false) {
            $result['error'] = "未找到标记: {$abstract}";
            return $result;
        }
        $result['positions']['abstract'] = $absPos;
        $absEndPos = $absPos + strlen($abstract);

        // 4. Keywords label, searched after the abstract label (case-insensitive).
        $keyPos = $keywords === '' ? false : stripos($str, $keywords, $absEndPos);
        if ($keyPos === false) {
            $result['error'] = "未找到 {$keywords} 或在 {$abstract} 之前";
            return $result;
        }
        $result['positions']['keywords'] = $keyPos;
        $keyEndPos = $keyPos + strlen($keywords);

        // 5. End marker, searched after the keywords label (exact match).
        $endPos = $end_span === '' ? false : strpos($str, $end_span, $keyEndPos);
        if ($endPos === false) {
            $result['error'] = "未找到 {$end_span} 或在 {$keywords} 之前";
            return $result;
        }
        $result['positions']['end_span'] = $endPos;

        // 6. Cut out both intervals.
        $result['abstract_to_keywords'] = $stripLead(substr($str, $absEndPos, $keyPos - $absEndPos));
        $result['keywords_to_end'] = $stripLead(substr($str, $keyEndPos, $endPos - $keyEndPos));

        // 7. All three markers were found in order.
        $result['is_valid'] = true;
        return $result;
    }
    public function extractFromWord() {
        // Extracts the abstract, the keywords and the funding statement from the
        // loaded document.
        //
        // The text of every non-empty element is concatenated, each one followed by
        // an end marker, and extractContentIntervals() then cuts the abstract and
        // keywords out of that string. The funding statement is taken from the
        // first occurrence of a known funding phrase up to the end of the content
        // collected at that moment, truncated before "Peer review" when present.
        //
        // Always returns status 1; fields that could not be extracted are ''.
        // NOTE: the result key 'abstrart' is misspelled but is kept unchanged
        // because callers may read it under that name.
        $endSpan = '===========end-span';
        $sContent = '';
        $sFundContent = '';

        // Converts to UTF-8 only when the text is not already valid UTF-8
        // (the source encoding is then assumed to be GBK).
        $toUtf8 = static function ($text) {
            if (!empty($text) && !mb_check_encoding($text, 'UTF-8')) {
                return mb_convert_encoding($text, 'UTF-8', 'GBK');
            }
            return $text;
        };

        // $this->sections is still unset when the constructor could not load the
        // document; iterate over nothing instead of triggering a foreach warning.
        $sections = $this->sections ?: [];
        foreach ($sections as $section) {
            foreach ($section->getElements() as $element) {
                $textContent = $this->getTextFromElement($element);
                if (empty($textContent)) {
                    continue;
                }
                $textContent = $toUtf8($textContent);
                $sContent .= $textContent . $endSpan;

                // Scan AFTER appending the current element. Scanning before the
                // append meant the last non-empty element was never inspected, so a
                // funding statement located there was silently lost.
                if (empty($sFundContent)) {
                    $aFund = $this->getMatchedFundPhrases($sContent);
                    if (!empty($aFund[0])) {
                        $position = stripos($sContent, $aFund[0]);
                        $sFundContent = substr($sContent, $position);
                        $sFundContent = trim(str_ireplace($aFund[0], '', $sFundContent));
                        if (preg_match('/^(.*?)Peer review/', $sFundContent, $matches)) {
                            // Keep only what precedes "Peer review".
                            $sFundContent = $matches[1];
                        }
                    }
                }
            }
        }

        $sContent = $toUtf8($sContent);
        $result = $this->extractContentIntervals($sContent);

        $abstract = empty($result['abstract_to_keywords']) ? '' : $result['abstract_to_keywords'];
        $abstract = $toUtf8($abstract);
        $keywords = empty($result['keywords_to_end']) ? '' : $result['keywords_to_end'];
        $keywords = $toUtf8($keywords);
        $sFundContent = $toUtf8($sFundContent);

        return [
            'status' => 1,
            'msg' => '提取成功',
            'data' => [
                'abstrart' => empty($abstract) ? '' : $this->fullDecode(str_replace($endSpan, '', $abstract)),
                'keywords' => empty($keywords) ? '' : $this->fullDecode(str_replace($endSpan, '', $keywords)),
                'fund' => empty($sFundContent) ? '' : $this->fullDecode(str_replace($endSpan, '', $sFundContent))
            ]
        ];
    }
    private function fullDecode($str, $maxDepth = 5) {
        // Repeatedly decodes escaped/garbled text until it stops changing or
        // $maxDepth passes have run, then trims surrounding colons.
        //
        // Each pass: decodes \uXXXX escapes, decodes HTML entities, turns
        // non-breaking spaces into plain spaces, and applies a set of heuristics
        // that restore comparison signs lost during extraction (a "?" directly
        // followed by a number is treated as a garbled "≥").
        //
        // Empty input or a non-positive $maxDepth returns $str unchanged.
        if (empty($str) || $maxDepth <= 0) {
            return $str;
        }

        // Document-specific "≥" repairs.
        $regOb0 = '/0B\s*\?0/';
        $regDl18 = '/DL\s*\?.18/';
        // Generic "≥" repairs: "?" followed by digits or by ".digits".
        $regQMarkNum = '/\?(\d+)/';
        $regQMarkDotNum = '/\?(\.\d+)/';
        // Remove whitespace between ≠ / ≤ and the number that follows.
        $regNeNum = '/≠\s*(\d+)/';
        $regLeNum = '/≤\s*(\d+)/';
        // A garbled list of three signs separated by 、 or by commas.
        $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/';
        $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/';
        // "LE ?n" / "NE ?n" markers standing for ≤n / ≠n.
        $regLeMark = '/LE\s*\?(\d+)/';
        $regNeMark = '/NE\s*\?(\d+)/';
        // Literal \uXXXX escape.
        $regUnicode = '/\\\\u([0-9a-fA-F]{4})/';

        $htmlEntityMap = [
            '&le;' => '≤', '&#8804;' => '≤', '&#x2264;' => '≤',
            '&ge;' => '≥', '&#8805;' => '≥', '&#x2265;' => '≥',
            '&ne;' => '≠', '&#8800;' => '≠', '&#x2260;' => '≠',
        ];
        $unicodeCallback = function ($m) {
            $char = mb_chr(hexdec($m[1]), 'UTF-8');
            // Compare against false explicitly: the character "0" is falsy and
            // would otherwise be left undecoded.
            return $char === false ? $m[0] : $char;
        };

        for ($depth = 0; $depth < $maxDepth; $depth++) {
            $prevStr = $str;

            // 1. \uXXXX escapes present in the input.
            $str = $this->decodeUnicode($str);

            // 2. HTML entities: the explicit map first, then the generic decoder.
            $str = strtr($str, $htmlEntityMap);
            $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');

            // 3. \uXXXX escapes that only became visible after entity decoding.
            $str = preg_replace_callback($regUnicode, $unicodeCallback, $str);

            // 4. Non-breaking spaces. In valid UTF-8 only the two-byte sequence
            //    C2 A0 is a non-breaking space; a bare A0 byte is the tail of other
            //    characters (≠ is E2 89 A0, à is C3 A0, † is E2 80 A0) and replacing
            //    it would corrupt them. The bare byte is only replaced when the
            //    text is not valid UTF-8.
            if (mb_check_encoding($str, 'UTF-8')) {
                $str = str_replace("\xC2\xA0", ' ', $str);
            } else {
                $str = str_replace(["\xC2\xA0", "\xA0"], ' ', $str);
            }

            // 5. Sign repairs, most specific first. The generic "?digits" rule must
            //    run last: it consumes the "?" that the mixed-list and LE/NE rules
            //    need, so running it earlier made those rules unreachable.
            $str = preg_replace($regOb0, '0B≥30', $str);
            $str = preg_replace($regDl18, 'DL≥0.18', $str);
            $str = preg_replace($regNeNum, '≠$1', $str);
            $str = preg_replace($regLeNum, '≤$1', $str);
            $str = preg_replace($regMixCn, '≤、≥、≠$4', $str);
            $str = preg_replace($regMixEn, '≤、≥、≠$4', $str);
            $str = preg_replace($regLeMark, '≤$1', $str);
            $str = preg_replace($regNeMark, '≠$1', $str);
            $str = preg_replace($regQMarkNum, '≥$1', $str);
            $str = preg_replace($regQMarkDotNum, '≥0$1', $str);

            // NOTE(review): search and replacement look identical here, so this
            // appears to be a no-op; kept in case the literals differ in invisible
            // characters.
            $str = str_replace('d with ', 'd with ', $str);

            // Stop as soon as a pass leaves the text untouched. Comparing the text
            // (rather than counting regex matches) matters because idempotent
            // rules such as "≠5" => "≠5" match on every pass without changing
            // anything.
            if ($str === $prevStr) {
                break;
            }
        }

        return trim($str, ':');
    }
    // private function fullDecode($str, $maxDepth = 5) {
    //     if (empty($str) || $maxDepth <= 0) {
    //         return $str;
    //     }

    //     $original = $str;
    //     $depth = 0;

    //     // 循环解码，直到无变化或达到最大次数
    //     while (true) {
    //         $depth++;
    //         if ($depth > $maxDepth) {
    //             break; // 防止过度解码导致死循环
    //         }

    //         // 1. 解码 Unicode 转义（\uXXXX 格式）
    //         $str = $this->decodeUnicode($str);

    //         // 2. 解码 HTML 实体（&amp;、&#039;、&lt; 等）
    //         $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8');

    //         $str = preg_replace_callback('/\\\\u([0-9a-fA-F]{4})/', function ($m) {
    //             return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0];
    //         }, $str);
    //         $str = str_replace([chr(0xC2).chr(0xA0), chr(0xA0)], ' ', $str);

    //         // 2. 核心：强制匹配所有可能的乱码格式，还原≥
    //         // 匹配：0B?0、0B ?0、0B ?0（空格/制表符）→ 0B≥30
    //         $str = preg_replace('/0B\s*\?0/', '0B≥30', $str);
    //         // 匹配：DL?.18、DL ?.18、DL ?.18 → DL≥0.18
    //         $str = preg_replace('/DL\s*\?.18/', 'DL≥0.18', $str);
    //         // 通用匹配：数字前的?（如?30、?0.18）→ ≥30、≥0.18（防止其他变体）
    //         $str = preg_replace('/\?(\d+)/', '≥$1', $str);
    //         $str = preg_replace('/\?(\.\d+)/', '≥0$1', $str);

    //         // 3. 修复前缀的"d with "可能的乱码（若有）
    //         $str = str_replace('d with ', 'd with ', $str); // 若前缀也乱码，可同步替换

    //         // 若解码后无变化，退出循环
    //         if ($str === $original) {
    //             break;
    //         }

    //         $original = $str;
    //     }

    //     return trim($str,':');
    // }
    private function decodeUnicode($str) {
        // Replaces literal \uXXXX escape sequences in $str with the UTF-8
        // characters they encode and returns the resulting string.
        //
        // A high surrogate immediately followed by a low surrogate
        // (e.g. \uD83D\uDE00) is decoded as ONE character: code points above
        // U+FFFF are written as such pairs and cannot be decoded one unit at a
        // time. A surrogate that is not part of a pair is left as literal text.
        return preg_replace_callback(
            '/\\\\u([dD][89abAB][0-9a-fA-F]{2})\\\\u([dD][c-fC-F][0-9a-fA-F]{2})|\\\\u([0-9a-fA-F]{4})/',
            function ($matches) {
                if ($matches[1] !== '') {
                    // Surrogate pair: both 16-bit units form one UTF-16 character.
                    return mb_convert_encoding(
                        pack('H*', $matches[1] . $matches[2]),
                        'UTF-8',
                        'UTF-16BE'
                    );
                }
                $codeUnit = hexdec($matches[3]);
                if ($codeUnit >= 0xD800 && $codeUnit <= 0xDFFF) {
                    // Unpaired surrogate: not a character on its own.
                    return $matches[0];
                }
                // Single 16-bit unit in the Basic Multilingual Plane.
                return mb_convert_encoding(pack('H*', $matches[3]), 'UTF-8', 'UCS-2BE');
            },
            $str
        );
    }
    private function getMatchedFundPhrases($content = '') {
        // Returns the funding phrases that occur in $content, spelled as in the
        // list below, without duplicates and ordered by first occurrence.
        // Matching ignores case. Empty content yields an empty array.
        $knownPhrases = [
            'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by',
            'Funding was provided by', 'Funded in part by','FUNDING:'
        ];

        if (empty($content)) {
            return [];
        }

        // One alternation of all phrases, each quoted for the "#" delimiter. The
        // capturing group exposes which phrase was hit.
        $alternatives = [];
        foreach ($knownPhrases as $phrase) {
            $alternatives[] = preg_quote($phrase, '#');
        }
        $regex = '#(' . implode('|', $alternatives) . ')#i';

        preg_match_all($regex, $content, $hits);
        if (empty($hits[1])) {
            return [];
        }

        // Map every hit (which may differ in case) back to its listed spelling.
        // Using the phrase as array key removes duplicates while keeping the
        // order in which phrases were first seen.
        $seen = [];
        foreach ($hits[1] as $hit) {
            foreach ($knownPhrases as $canonical) {
                if (strcasecmp($hit, $canonical) === 0) {
                    $seen[$canonical] = true;
                    break;
                }
            }
        }

        return array_keys($seen);
    }
    // Log output hook. Currently a no-op: the only statement in the body is
    // commented out, so $msg is accepted and discarded.
    private function log($msg){
        // echo date('[Y-m-d H:i:s] ') . $msg . "\n";
    }
}