测试修改

2025-11-06 17:00:20 +08:00
parent 6d867d6088
commit 3c459af7fc
1 changed files with 191 additions and 10 deletions
--- a/application/common/ArticleParserService.php
+++ b/application/common/ArticleParserService.php
@@ -33,9 +33,162 @@ class ArticleParserService
            $this->phpWord = $reader->load($filePath);
            $this->sections = $this->phpWord->getSections();
        } catch (\Exception $e) {
+            // 预处理：移除 DOCX 中的 EMF 图片
+            $processedFilePath = $this->removeEmfFromDocx($filePath);
+            // 加载处理后的文档
+            $reader = IOFactory::createReader();
+            $reader->setReadDataOnly(false);
+            Settings::setCompatibility(false);
+            Settings::setOutputEscapingEnabled(true);
+
+            $this->phpWord = $reader->load($processedFilePath);
+            $this->sections = $this->phpWord->getSections();
+
+            // 可选：删除临时处理文件（避免冗余）
+            var_dump($processedFilePath);
+            unlink($processedFilePath);
            return json_encode(['status' => 5, 'msg' => $e->getMessage()]);
        }
    }
+    /**
+     * Build a copy of a DOCX archive with every EMF image entry removed.
+     * The archive is unpacked into a uniqid()-named directory under ROOT_PATH/runtime,
+     * *.emf files are deleted and the remaining tree is re-zipped next to it;
+     * the unpack directory is removed even when a step fails.
+     * NOTE(review): relationship parts still referencing the removed images
+     * are left untouched - confirm the reader tolerates the missing targets.
+     * @param string $docxPath path of the original DOCX file
+     * @return string path of the processed DOCX (not deleted by this method)
+     * @throws \Exception when the archive cannot be opened, unpacked or rebuilt
+     */
+    private function removeEmfFromDocx($docxPath){
+        $zip = new \ZipArchive(); // fully qualified: resolves even inside a namespace
+        if ($zip->open($docxPath) !== true) {
+            throw new \Exception("无法打开 DOCX 文件：{$docxPath}");
+        }
+        $tempDir = rtrim(ROOT_PATH,'/').'/runtime/'.uniqid('docx_temp_');
+        $processedPath = $tempDir . '_processed.docx';
+        try {
+            // Both steps are checked; the source archive is closed either way.
+            $extracted = mkdir($tempDir, 0700, true) && $zip->extractTo($tempDir);
+            $zip->close();
+            if (!$extracted) {
+                throw new \Exception("无法解压 DOCX 文件：{$docxPath}");
+            }
+            $iterator = new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($tempDir));
+            foreach ($iterator as $file) {
+                if ($file->isFile() && strtolower($file->getExtension()) === 'emf') {
+                    unlink($file->getPathname());
+                }
+            }
+            $newZip = new \ZipArchive();
+            if ($newZip->open($processedPath, \ZipArchive::CREATE | \ZipArchive::OVERWRITE) !== true) {
+                throw new \Exception("无法创建处理后的 DOCX 文件");
+            }
+            $this->addFilesToZip($tempDir, $newZip);
+            if ($newZip->close() !== true) { // close() performs the actual write
+                throw new \Exception("无法创建处理后的 DOCX 文件");
+            }
+        } finally {
+            $this->deleteDir($tempDir);
+        }
+        return $processedPath;
+    }
+
+    /**
+     * Recursively add every file below $dir to $zip.
+     * Entry names are relative to $baseDir (the archive root) at every depth.
+     * @param string $dir directory currently being scanned
+     * @param \ZipArchive $zip archive opened for writing
+     * @param string|null $baseDir archive root; defaults to $dir on the first call
+     */
+    private function addFilesToZip($dir, $zip, $baseDir = null)
+    {
+        $baseDir = $baseDir === null ? $dir : $baseDir;
+        foreach (scandir($dir) ?: [] as $file) {
+            if ($file === '.' || $file === '..') continue;
+            $filePath = $dir . '/' . $file;
+            if (is_dir($filePath)) {
+                $this->addFilesToZip($filePath, $zip, $baseDir);
+            } else {
+                // Cut only the leading base prefix so nested folders keep their full path.
+                $zip->addFile($filePath, ltrim(substr($filePath, strlen($baseDir)), '/'));
+            }
+        }
+    }
+
+    /**
+     * Recursively delete a temporary unpack directory.
+     *
+     * Only a directory whose basename starts with "docx_temp_" and whose
+     * resolved path lies under ROOT_PATH/runtime is accepted. These guards apply
+     * to the top-level directory only; its sub-directories are removed by
+     * purgeDirTree() whatever their names, because running the prefix check on
+     * every level would reject each sub-directory and leave the tree on disk.
+     *
+     * @param string $dir directory path
+     * @return bool true when the directory and all its contents were removed
+     */
+    private function deleteDir($dir){
+        // Reject empty input and anything that is not a directory.
+        if (trim($dir) === '' || !is_dir($dir)) {
+            return false;
+        }
+        // Normalise, then require the temp-directory name prefix.
+        $dir = rtrim($dir, DIRECTORY_SEPARATOR);
+        if (strpos(basename($dir), 'docx_temp_') !== 0) {
+            return false;
+        }
+        // Resolve both paths so the containment check sees real locations.
+        $realDir = realpath($dir);
+        $realRuntimeDir = realpath(rtrim(ROOT_PATH, '/') . '/runtime');
+        if ($realDir === false || $realRuntimeDir === false) {
+            return false;
+        }
+        // The trailing separator keeps e.g. "runtime_x" from passing as "runtime".
+        if (strpos($realDir, $realRuntimeDir . DIRECTORY_SEPARATOR) !== 0) {
+            return false;
+        }
+        return $this->purgeDirTree($realDir);
+    }
+
+    /**
+     * Delete an already validated directory together with everything below it.
+     *
+     * Entries whose resolved path is not strictly inside $realDir (for example
+     * a symbolic link leading elsewhere) are left alone and make the result false.
+     *
+     * @param string $realDir resolved (realpath) directory to remove
+     * @return bool true when $realDir and all its contents were removed
+     */
+    private function purgeDirTree($realDir){
+        $files = @scandir($realDir);
+        if ($files === false) {
+            return false;
+        }
+        $isFullyDeleted = true;
+        foreach ($files as $file) {
+            if ($file === '.' || $file === '..') {
+                continue;
+            }
+            $realFilePath = realpath($realDir . DIRECTORY_SEPARATOR . $file);
+            $isInside = $realFilePath !== false
+                && strpos($realFilePath, $realDir . DIRECTORY_SEPARATOR) === 0;
+            if (!$isInside) {
+                $isFullyDeleted = false;
+                continue;
+            }
+            // Best effort: keep going after a failure so as much as possible is removed.
+            $removed = is_dir($realFilePath)
+                ? $this->purgeDirTree($realFilePath)
+                : @unlink($realFilePath);
+            if (!$removed) {
+                $isFullyDeleted = false;
+            }
+        }
+        // rmdir() fails on a non-empty directory, so leftovers surface as false.
+        return @rmdir($realDir) && $isFullyDeleted;
+    }

    // 上传并解析文档的入口方法
    public static function uploadAndParse($sFileUrl){
@@ -79,9 +232,11 @@ class ArticleParserService

    // 提取文章标题
    private function getTitle(){
+        if(empty($this->sections)){
+            return '';
+        }
        $title = '';
        $maxLength = 0;
-
        foreach ($this->sections as $section) {
            foreach ($section->getElements() as $element) {
                $text = $this->getTextFromElement($element);
@@ -93,6 +248,9 @@ class ArticleParserService
                }
            }
        }
+        if(!empty($title) && !mb_check_encoding($title, 'UTF-8')){
+            $title = mb_convert_encoding($title, 'UTF-8', 'GBK');
+        }
        return $title;
    }
    // 提取作者
@@ -462,6 +620,9 @@ class ArticleParserService
            if(empty($value['name']) && empty($value['superscript'])){
                continue;
            }
+            if(!mb_check_encoding($value['name'], 'UTF-8')){
+                $value['name'] = mb_convert_encoding($value['name'], 'UTF-8', 'GBK');
+            }
            if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){
                $aReport[] = $value['name'];
            }
@@ -701,12 +862,12 @@ class ArticleParserService
            $institution = trim($institution); // 清理首尾空格
            preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);;
            $institution = trim($institutionmatches[1] ?? $institution);
-            if(!mb_check_encoding($institution, 'UTF-8')){
-                $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
-            }
            if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) {
                $institution = trim($matches[1]); // trim() 去除内容前后多余空格
            }
+            if(!empty($institution) && !mb_check_encoding($institution, 'UTF-8')){
+                $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
+            }
            $aCompany[$number] = $institution;
        }
        return $aCompany;
@@ -831,7 +992,7 @@ class ArticleParserService
    private function getContentAfterText($targetText,$return_type = 2){
        $found = false;
        $content = [];
-        $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract'];
+        $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract','ABSTRACT'];
        $maxLines = 200;
        $lineNumber = 0;
        foreach ($this->sections as $section) {
@@ -873,7 +1034,7 @@ class ArticleParserService
            return $content;
        }
        $content = implode("\n", $content);
-        if(!mb_check_encoding($content, 'UTF-8')){
+        if(!empty($content) && !mb_check_encoding($content, 'UTF-8')){
            $content = mb_convert_encoding($content, 'UTF-8', 'GBK');
        }
        return $content;
@@ -952,7 +1113,9 @@ class ArticleParserService
        $text = preg_replace('/[\x00-\x1F\x7F-\x9F]/', ' ', $text); // 移除控制字符
        $text = str_replace(["\t", "\r", "\n"], ' ', $text); // 统一空白字符
        $text = preg_replace('/\s+/', ' ', $text); // 合并多个空格
-
+        if(!empty($text) && !mb_check_encoding($text, 'UTF-8')){
+            $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
+        }
        return $text;
    }

@@ -993,7 +1156,8 @@ class ArticleParserService
                $sContent .= "\n";
            }
        }
-        if(!mb_check_encoding($sContent, 'UTF-8')){
+
+        if(!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')){
            $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
        }
        // 2. 基础文本清理（合并多余空格，保留有效换行）
@@ -1002,7 +1166,7 @@ class ArticleParserService

        // 3. 提取摘要
        $abstract = '';
-        $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords:|$)/i';
+        $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords|$)/i';
        if (preg_match($abstractPattern, $textContent, $abstractMatches)) {
            $abstract = trim($abstractMatches[1]);
            $abstract = preg_replace('/\n+/', ' ', $abstract);
@@ -1010,7 +1174,8 @@ class ArticleParserService
        // 4. 提取关键词（核心：仅保留两种强制匹配逻辑）
        $keywords = [];
        // $keywordPattern = '/Keywords:\s*([\s\S]*?)(?=\s*\d+\.|[;,]\s*[\r\n]+\s*[\r\n]+|(?i)\bintroduction|abbreviations\b|$)/i';
-        $keywordPattern = '/Keywords:\s*(.*?)\s*Keywords-End-Flag/s';
+        $keywordPattern = '/Keywords\s*(.*?)\s*Keywords-End-Flag/s';
+
        if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
            $keywordStr = trim($keywordMatches[1]);
            
@@ -1025,6 +1190,22 @@ class ArticleParserService
                return !empty($item) && !ctype_space($item);
            });
        }
+        if(empty($keywords)){
+            $keywordPattern = '/Keywords\s*([\s\S]*?)(?=Introduction|$)/i';
+            if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
+                $keywordStr = trim($keywordMatches[1]);
+                // 清理关键词列表格式（去除换行、末尾多余符号）
+                $keywordStr = preg_replace('/\n+/', ' ', $keywordStr);
+                $keywordStr = rtrim($keywordStr, ';,. '); // 去除末尾分号、逗号等
+                $keywordStr = trim($keywordStr);
+
+                // 分割并过滤有效关键词
+                $keywords = preg_split('/[,;]\s*/', $keywordStr);
+                $keywords = array_filter(array_map('trim', $keywords), function($item) {
+                    return !empty($item) && !ctype_space($item);
+                });
+            }
+        }
        return [
            'status' => 1,
            'msg' => '提取成功',