From 3c459af7fc119074ada85c2ed32fa095eb8422c4 Mon Sep 17 00:00:00 2001
From: chengxl
Date: Thu, 6 Nov 2025 17:00:20 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 application/common/ArticleParserService.php | 260 ++++++++++++++++++--
 1 file changed, 250 insertions(+), 10 deletions(-)

diff --git a/application/common/ArticleParserService.php b/application/common/ArticleParserService.php
index 8a4a978..3b8668c 100644
--- a/application/common/ArticleParserService.php
+++ b/application/common/ArticleParserService.php
@@ -33,9 +33,219 @@ class ArticleParserService
             $this->phpWord = $reader->load($filePath);
             $this->sections = $this->phpWord->getSections();
         } catch (\Exception $e) {
+            // Fallback: the first load attempt threw. Strip the EMF images from a
+            // temporary copy of the DOCX and try to load that copy instead.
+            $processedFilePath = null;
+            try {
+                $processedFilePath = $this->removeEmfFromDocx($filePath);
+
+                $reader = IOFactory::createReader();
+                $reader->setReadDataOnly(false);
+                Settings::setCompatibility(false);
+                Settings::setOutputEscapingEnabled(true);
+
+                $this->phpWord = $reader->load($processedFilePath);
+                $this->sections = $this->phpWord->getSections();
+
+                // The retry succeeded, so the error response below must not run.
+                // NOTE(review): assumes success is signalled by returning nothing - confirm.
+                return;
+            } catch (\Exception $retryException) {
+                // The retry failed as well; fall through and report the original error.
+            } finally {
+                // Always remove the temporary DOCX, whether or not it could be loaded.
+                if ($processedFilePath !== null && is_file($processedFilePath)) {
+                    @unlink($processedFilePath);
+                }
+            }
             return json_encode(['status' => 5, 'msg' => $e->getMessage()]);
         }
     }
+    /**
+     * Build a temporary copy of a DOCX file with every EMF image removed.
+     *
+     * The archive is unpacked into a unique directory under runtime/, all
+     * *.emf files are deleted and the remaining files are zipped again. The
+     * unpack directory is always removed; the caller is responsible for
+     * deleting the returned file.
+     *
+     * @param string $docxPath Path of the original DOCX file
+     * @return string Path of the processed temporary DOCX file
+     * @throws \Exception When the archive cannot be opened, unpacked or rebuilt
+     */
+    private function removeEmfFromDocx($docxPath){
+        // Global classes are fully qualified so they resolve from any namespace.
+        $zip = new \ZipArchive();
+        if ($zip->open($docxPath) !== true) {
+            throw new \Exception("无法打开 DOCX 文件:{$docxPath}");
+        }
+
+        // 1. Create a unique working directory for the unpacked archive.
+        $tempDir = rtrim(ROOT_PATH, '/\\') . '/runtime/' . uniqid('docx_temp_', true);
+        if (!mkdir($tempDir, 0700, true) && !is_dir($tempDir)) {
+            $zip->close();
+            throw new \Exception("无法创建临时目录:{$tempDir}");
+        }
+
+        $processedPath = $tempDir . '_processed.docx';
+        try {
+            // 2. Unpack the DOCX into the working directory.
+            $extracted = $zip->extractTo($tempDir);
+            $zip->close();
+            if ($extracted !== true) {
+                throw new \Exception("无法解压 DOCX 文件:{$docxPath}");
+            }
+
+            // 3. Delete every EMF file (extension compared case-insensitively).
+            $iterator = new \RecursiveIteratorIterator(
+                new \RecursiveDirectoryIterator($tempDir, \FilesystemIterator::SKIP_DOTS)
+            );
+            foreach ($iterator as $file) {
+                if ($file->isFile() && strtolower($file->getExtension()) === 'emf') {
+                    unlink($file->getPathname());
+                }
+            }
+
+            // 4. Zip the remaining files into the processed DOCX.
+            $newZip = new \ZipArchive();
+            if ($newZip->open($processedPath, \ZipArchive::CREATE | \ZipArchive::OVERWRITE) !== true) {
+                throw new \Exception("无法创建处理后的 DOCX 文件");
+            }
+            // Entry names are relative to $tempDir so the package parts sit at
+            // the archive root instead of under the docx_temp_* folder.
+            $this->addFilesToZip($tempDir, $newZip, $tempDir);
+            // Queued files are read when the archive is closed, so the working
+            // directory must still exist at this point.
+            if ($newZip->close() !== true) {
+                throw new \Exception("无法创建处理后的 DOCX 文件");
+            }
+        } catch (\Exception $e) {
+            // Do not leave a half-written archive behind.
+            if (is_file($processedPath)) {
+                @unlink($processedPath);
+            }
+            throw $e;
+        } finally {
+            // 5. Remove the working directory on success and on failure.
+            $this->deleteDir($tempDir);
+        }
+
+        return $processedPath;
+    }
+
+    /**
+     * Recursively add the files under a directory to a ZipArchive.
+     *
+     * @param string      $dir     Directory whose files are added
+     * @param \ZipArchive $zip     Archive being built
+     * @param string|null $baseDir Directory that entry names are relative to;
+     *                             defaults to $dir on the outermost call
+     * @throws \Exception When a directory cannot be read or a file cannot be added
+     */
+    private function addFilesToZip($dir, $zip, $baseDir = null)
+    {
+        $dir = rtrim($dir, '/\\');
+        $baseDir = $baseDir === null ? $dir : rtrim($baseDir, '/\\');
+
+        $files = scandir($dir);
+        if ($files === false) {
+            throw new \Exception("无法读取目录:{$dir}");
+        }
+
+        foreach ($files as $file) {
+            if ($file === '.' || $file === '..') {
+                continue;
+            }
+
+            $filePath = $dir . '/' . $file;
+            if (is_dir($filePath)) {
+                $this->addFilesToZip($filePath, $zip, $baseDir);
+                continue;
+            }
+
+            // Entry name = path relative to the base directory. Deriving it from
+            // the parent of the current directory would drop the upper folders of
+            // nested files (word/media/a.png would become media/a.png).
+            $relativePath = substr($filePath, strlen($baseDir) + 1);
+            if (!$zip->addFile($filePath, $relativePath)) {
+                throw new \Exception("无法添加文件到压缩包:{$relativePath}");
+            }
+        }
+    }
+
+    /**
+     * Recursively delete a temporary unpack directory.
+     *
+     * Only a directory named docx_temp_* that resolves to a location inside
+     * runtime/ is accepted; everything below an accepted directory is removed.
+     *
+     * @param string $dir Directory to delete
+     * @return bool True when the directory and all of its contents were removed
+     */
+    private function deleteDir($dir){
+        // 1. Basic validation: a non-empty path to an existing directory.
+        if (trim($dir) === '' || !is_dir($dir)) {
+            return false;
+        }
+
+        // 2. Name check, applied to the root only. Checking every level would
+        //    reject sub-directories such as "word" and leave the tree on disk.
+        $dir = rtrim($dir, '/\\');
+        if (strpos(basename($dir), 'docx_temp_') !== 0) {
+            return false;
+        }
+
+        // 3. The resolved path must be inside runtime/. The trailing separator
+        //    keeps a sibling such as "runtime_old" from passing the prefix test.
+        $realDir = realpath($dir);
+        $realRuntimeDir = realpath(rtrim(ROOT_PATH, '/\\') . '/runtime');
+        if ($realDir === false || $realRuntimeDir === false) {
+            return false;
+        }
+        if (strpos($realDir, $realRuntimeDir . DIRECTORY_SEPARATOR) !== 0) {
+            return false;
+        }
+
+        return $this->deleteDirTree($realDir);
+    }
+
+    /**
+     * Delete a directory tree that deleteDir() has already validated.
+     *
+     * @param string $dir Absolute path of the directory to remove
+     * @return bool True when $dir and everything inside it were removed
+     */
+    private function deleteDirTree($dir)
+    {
+        // Warnings are suppressed on purpose: cleanup is best-effort and
+        // failures are reported through the return value.
+        $files = @scandir($dir);
+        if ($files === false) {
+            return false;
+        }
+
+        $isFullyDeleted = true;
+        foreach ($files as $file) {
+            if ($file === '.' || $file === '..') {
+                continue;
+            }
+
+            $filePath = $dir . DIRECTORY_SEPARATOR . $file;
+            if (is_dir($filePath) && !is_link($filePath)) {
+                $removed = $this->deleteDirTree($filePath);
+            } else {
+                // Files and symbolic links: a link itself is removed and never
+                // followed, so nothing outside the tree is touched.
+                $removed = @unlink($filePath);
+            }
+            if (!$removed) {
+                $isFullyDeleted = false;
+            }
+        }
+
+        // rmdir() only succeeds once the directory is empty.
+        return @rmdir($dir) && $isFullyDeleted;
+    }
 
     // 上传并解析文档的入口方法
     public static function uploadAndParse($sFileUrl){
@@ -79,9 +289,11 @@ class ArticleParserService
 
     // 提取文章标题
     private function getTitle(){
+        if(empty($this->sections)){
+            return '';
+        }
         $title = '';
         $maxLength = 0;
-
         foreach ($this->sections as $section) {
             foreach ($section->getElements() as $element) {
                 $text = $this->getTextFromElement($element);
@@ -93,6 +305,9 @@ class ArticleParserService
                 }
             }
         }
+        if(!empty($title) && !mb_check_encoding($title, 'UTF-8')){
+            $title = mb_convert_encoding($title, 'UTF-8', 'GBK');
+        }
         return $title;
     }
     // 提取作者
@@ -462,6 +677,9 @@ class ArticleParserService
             if(empty($value['name']) && empty($value['superscript'])){
                 continue;
             }
+            if(!empty($value['name']) && !mb_check_encoding($value['name'], 'UTF-8')){
+                $value['name'] = mb_convert_encoding($value['name'], 'UTF-8', 'GBK');
+            }
             if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){
                 $aReport[] = $value['name'];
             }
@@ -701,12 +919,12 @@ class ArticleParserService
             $institution = trim($institution); // 清理首尾空格
             preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);;
             $institution = trim($institutionmatches[1] ?? $institution);
-            if(!mb_check_encoding($institution, 'UTF-8')){
-                $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
-            }
             if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) {
                 $institution = trim($matches[1]); // trim() 去除内容前后多余空格
             }
+            if(!empty($institution) && !mb_check_encoding($institution, 'UTF-8')){
+                $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
+            }
             $aCompany[$number] = $institution;
         }
         return $aCompany;
@@ -831,7 +1049,7 @@ class ArticleParserService
     private function getContentAfterText($targetText,$return_type = 2){
         $found = false;
         $content = [];
-        $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract'];
+        $stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract','ABSTRACT'];
         $maxLines = 200;
         $lineNumber = 0;
         foreach ($this->sections as $section) {
@@ -873,7 +1091,7 @@ class ArticleParserService
             return $content;
         }
         $content = implode("\n", $content);
-        if(!mb_check_encoding($content, 'UTF-8')){
+        if(!empty($content) && !mb_check_encoding($content, 'UTF-8')){
             $content = mb_convert_encoding($content, 'UTF-8', 'GBK');
         }
         return $content;
@@ -952,7 +1170,9 @@ class ArticleParserService
         $text = preg_replace('/[\x00-\x1F\x7F-\x9F]/', ' ', $text); // 移除控制字符
         $text = str_replace(["\t", "\r", "\n"], ' ', $text); // 统一空白字符
         $text = preg_replace('/\s+/', ' ', $text); // 合并多个空格
-
+        if(!empty($text) && !mb_check_encoding($text, 'UTF-8')){
+            $text = mb_convert_encoding($text, 'UTF-8', 'GBK');
+        }
         return $text;
     }
 
@@ -993,7 +1213,8 @@ class ArticleParserService
                 $sContent .= "\n";
             }
         }
-        if(!mb_check_encoding($sContent, 'UTF-8')){
+
+        if(!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')){
             $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
         }
         // 2. 基础文本清理(合并多余空格,保留有效换行)
@@ -1002,7 +1223,7 @@ class ArticleParserService
 
         // 3. 提取摘要
         $abstract = '';
-        $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords:|$)/i';
+        $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords|$)/i';
         if (preg_match($abstractPattern, $textContent, $abstractMatches)) {
             $abstract = trim($abstractMatches[1]);
             $abstract = preg_replace('/\n+/', ' ', $abstract);
@@ -1010,7 +1231,9 @@ class ArticleParserService
         // 4. 提取关键词(核心:仅保留两种强制匹配逻辑)
         $keywords = [];
         // $keywordPattern = '/Keywords:\s*([\s\S]*?)(?=\s*\d+\.|[;,]\s*[\r\n]+\s*[\r\n]+|(?i)\bintroduction|abbreviations\b|$)/i';
-        $keywordPattern = '/Keywords:\s*(.*?)\s*Keywords-End-Flag/s';
+        // An optional colon after "Keywords" is consumed so it never leaks into the first keyword.
+        $keywordPattern = '/Keywords\s*:?\s*(.*?)\s*Keywords-End-Flag/s';
+
         if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
             $keywordStr = trim($keywordMatches[1]);
 
@@ -1025,6 +1248,23 @@ class ArticleParserService
                 return !empty($item) && !ctype_space($item);
             });
         }
+        // Fallback when no keywords were found above: read up to "Introduction" or the end of the text.
+        if(empty($keywords)){
+            $keywordPattern = '/Keywords\s*:?\s*([\s\S]*?)(?=Introduction|$)/i';
+            if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
+                $keywordStr = trim($keywordMatches[1]);
+                // Normalise the list: collapse line breaks, drop trailing punctuation
+                $keywordStr = preg_replace('/\n+/', ' ', $keywordStr);
+                $keywordStr = rtrim($keywordStr, ';,. '); // strip trailing semicolons, commas, periods and spaces
+                $keywordStr = trim($keywordStr);
+
+                // Split on commas/semicolons and keep only non-empty keywords
+                $keywords = preg_split('/[,;]\s*/', $keywordStr);
+                $keywords = array_filter(array_map('trim', $keywords), function($item) {
+                    return !empty($item) && !ctype_space($item);
+                });
+            }
+        }
         return [
             'status' => 1,
             'msg' => '提取成功',