测试修改
This commit is contained in:
@@ -33,9 +33,162 @@ class ArticleParserService
|
||||
$this->phpWord = $reader->load($filePath);
|
||||
$this->sections = $this->phpWord->getSections();
|
||||
} catch (\Exception $e) {
|
||||
// 预处理:移除 DOCX 中的 EMF 图片
|
||||
$processedFilePath = $this->removeEmfFromDocx($filePath);
|
||||
// 加载处理后的文档
|
||||
$reader = IOFactory::createReader();
|
||||
$reader->setReadDataOnly(false);
|
||||
Settings::setCompatibility(false);
|
||||
Settings::setOutputEscapingEnabled(true);
|
||||
|
||||
$this->phpWord = $reader->load($processedFilePath);
|
||||
$this->sections = $this->phpWord->getSections();
|
||||
|
||||
// 可选:删除临时处理文件(避免冗余)
|
||||
var_dump($processedFilePath);
|
||||
unlink($processedFilePath);
|
||||
return json_encode(['status' => 5, 'msg' => $e->getMessage()]);
|
||||
}
|
||||
}
|
||||
/**
 * Build a copy of a DOCX archive with every EMF image file removed.
 *
 * The archive is unpacked into a uniquely named scratch directory under
 * ROOT_PATH/runtime, every file whose extension is "emf" (case-insensitive)
 * is deleted, and what remains is zipped into a sibling file named
 * "<scratch dir>_processed.docx". No other part of the package is edited.
 * The scratch directory is removed before returning, on success and on failure.
 *
 * @param string $docxPath Path of the original DOCX file.
 * @return string Path of the processed DOCX; the caller is expected to delete it when done.
 * @throws \Exception When the source cannot be opened or extracted, the scratch
 *                    directory cannot be created, or the new archive cannot be written.
 */
private function removeEmfFromDocx($docxPath){
    $zip = new ZipArchive();
    if ($zip->open($docxPath) !== true) {
        throw new \Exception("无法打开 DOCX 文件:{$docxPath}");
    }

    // 1. Create the scratch directory used for extraction.
    $tempDir = rtrim(ROOT_PATH,'/').'/runtime/'.uniqid('docx_temp_');
    if (!@mkdir($tempDir, 0700, true) && !is_dir($tempDir)) {
        $zip->close();
        throw new \Exception("无法创建临时目录:{$tempDir}");
    }

    try {
        // 2. Unpack the DOCX; release the source handle whether or not extraction worked.
        $extracted = $zip->extractTo($tempDir);
        $zip->close();
        if ($extracted !== true) {
            throw new \Exception("无法解压 DOCX 文件:{$docxPath}");
        }

        // 3. Collect the EMF files first, then delete them, so the directory
        //    is not modified while it is being iterated.
        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($tempDir, RecursiveDirectoryIterator::SKIP_DOTS)
        );
        $emfPaths = [];
        foreach ($iterator as $file) {
            if ($file->isFile() && strtolower($file->getExtension()) === 'emf') {
                $emfPaths[] = $file->getPathname();
            }
        }
        foreach ($emfPaths as $emfPath) {
            unlink($emfPath);
        }

        // 4. Repack the remaining files as a DOCX next to the scratch directory.
        $processedPath = $tempDir . '_processed.docx';
        $newZip = new ZipArchive();
        if ($newZip->open($processedPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
            throw new \Exception("无法创建处理后的 DOCX 文件");
        }

        $this->addFilesToZip($tempDir, $newZip);

        // ZipArchive only reads the queued files and writes the archive on close(),
        // so this is where a write failure surfaces.
        if ($newZip->close() !== true) {
            @unlink($processedPath);
            throw new \Exception("无法写入处理后的 DOCX 文件");
        }

        return $processedPath;
    } finally {
        // 5. Always remove the scratch directory; runs after close() so the
        //    queued files still exist when the archive is written.
        $this->deleteDir($tempDir);
    }
}
|
||||
|
||||
/**
 * Recursively add every regular file below a directory to a ZipArchive.
 *
 * Entry names are the file paths relative to $baseDir, always using "/" as
 * separator. The previous implementation computed names relative to the
 * parent of the directory currently being scanned, which prefixed top-level
 * files with the scratch directory name and dropped parent folders for
 * files nested two or more levels deep.
 *
 * @param string      $dir     Directory to scan.
 * @param ZipArchive  $zip     Open archive that receives the files.
 * @param string|null $baseDir Directory that maps to the archive root; defaults to $dir.
 *                             Passed down unchanged on recursive calls.
 * @throws \Exception When a directory cannot be read or a file cannot be queued.
 */
private function addFilesToZip($dir, $zip, $baseDir = null)
{
    if ($baseDir === null) {
        $baseDir = $dir;
    }
    // Length of "<baseDir>/" — everything after it is the name inside the archive.
    $prefixLength = strlen(rtrim($baseDir, '/')) + 1;

    $files = scandir($dir);
    if ($files === false) {
        throw new \Exception("无法读取目录:{$dir}");
    }

    foreach ($files as $file) {
        if ($file === '.' || $file === '..') {
            continue;
        }

        $filePath = $dir . '/' . $file;
        if (is_dir($filePath)) {
            $this->addFilesToZip($filePath, $zip, $baseDir);
            continue;
        }

        // ltrim guards against a doubled slash when $dir was given with a trailing "/".
        $relativePath = ltrim(substr($filePath, $prefixLength), '/');
        if ($zip->addFile($filePath, $relativePath) !== true) {
            throw new \Exception("无法添加文件到压缩包:{$relativePath}");
        }
    }
}
|
||||
|
||||
/**
 * Recursively delete a scratch directory created for DOCX processing.
 *
 * The directory is only touched when its own name starts with "docx_temp_"
 * and its resolved path lies strictly inside ROOT_PATH/runtime. These checks
 * apply to the directory passed in, not to its descendants: the previous
 * implementation re-applied the name check on every recursive call, which
 * rejected every subdirectory and left the tree on disk.
 *
 * @param string $dir Path of the scratch directory.
 * @return bool True when the directory and everything in it were removed,
 *              false when validation failed or anything could not be deleted.
 */
private function deleteDir($dir){
    // 1. Basic validation: non-empty path that is an existing directory.
    if (trim($dir) === '' || !is_dir($dir)) {
        return false;
    }

    // 2. Normalise: drop trailing separators of either flavour.
    $dir = rtrim($dir, '/\\');

    // 3. Only directories named docx_temp_* are eligible.
    if (strpos(basename($dir), 'docx_temp_') !== 0) {
        return false;
    }

    // 4. The resolved path must be inside the runtime directory. The trailing
    //    separator prevents siblings such as "runtime_backup" from matching.
    $realDir = realpath($dir);
    $realRuntimeDir = realpath(rtrim(ROOT_PATH, '/') . '/runtime');
    if ($realDir === false || $realRuntimeDir === false) {
        return false;
    }
    if (strpos($realDir, $realRuntimeDir . DIRECTORY_SEPARATOR) !== 0) {
        return false;
    }

    // 5. Remove the validated tree.
    return $this->deleteDirTree($realDir);
}

/**
 * Delete a directory and all of its contents without following symbolic links.
 *
 * Performs no location checks of its own; deleteDir() validates the root
 * before calling this.
 *
 * @param string $path Directory to remove.
 * @return bool True when the directory and all entries were removed.
 */
private function deleteDirTree($path)
{
    // Errors are suppressed on purpose: cleanup is best-effort and failures
    // are reported through the return value.
    $entries = @scandir($path);
    if ($entries === false) {
        return false;
    }

    $isFullyDeleted = true;
    foreach ($entries as $entry) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }

        $entryPath = $path . DIRECTORY_SEPARATOR . $entry;

        if (is_link($entryPath)) {
            // Remove the link itself and never its target; rmdir() is the
            // fallback for directory links on platforms where unlink() refuses them.
            if (!@unlink($entryPath) && !@rmdir($entryPath)) {
                $isFullyDeleted = false;
            }
            continue;
        }

        if (is_dir($entryPath)) {
            if (!$this->deleteDirTree($entryPath)) {
                $isFullyDeleted = false;
            }
        } elseif (!@unlink($entryPath)) {
            $isFullyDeleted = false;
        }
    }

    // rmdir() fails on a non-empty directory, so a leftover entry is reported as false.
    return @rmdir($path) && $isFullyDeleted;
}
|
||||
|
||||
// 上传并解析文档的入口方法
|
||||
public static function uploadAndParse($sFileUrl){
|
||||
@@ -79,9 +232,11 @@ class ArticleParserService
|
||||
|
||||
// 提取文章标题
|
||||
private function getTitle(){
|
||||
if(empty($this->sections)){
|
||||
return '';
|
||||
}
|
||||
$title = '';
|
||||
$maxLength = 0;
|
||||
|
||||
foreach ($this->sections as $section) {
|
||||
foreach ($section->getElements() as $element) {
|
||||
$text = $this->getTextFromElement($element);
|
||||
@@ -93,6 +248,9 @@ class ArticleParserService
|
||||
}
|
||||
}
|
||||
}
|
||||
if(!empty($title) && !mb_check_encoding($title, 'UTF-8')){
|
||||
$title = mb_convert_encoding($title, 'UTF-8', 'GBK');
|
||||
}
|
||||
return $title;
|
||||
}
|
||||
// 提取作者
|
||||
@@ -462,6 +620,9 @@ class ArticleParserService
|
||||
if(empty($value['name']) && empty($value['superscript'])){
|
||||
continue;
|
||||
}
|
||||
if(!mb_check_encoding($value['name'], 'UTF-8')){
|
||||
$value['name'] = mb_convert_encoding($value['name'], 'UTF-8', 'GBK');
|
||||
}
|
||||
if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){
|
||||
$aReport[] = $value['name'];
|
||||
}
|
||||
@@ -701,12 +862,12 @@ class ArticleParserService
|
||||
$institution = trim($institution); // 清理首尾空格
|
||||
preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);;
|
||||
$institution = trim($institutionmatches[1] ?? $institution);
|
||||
if(!mb_check_encoding($institution, 'UTF-8')){
|
||||
$institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
|
||||
}
|
||||
if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) {
|
||||
$institution = trim($matches[1]); // trim() 去除内容前后多余空格
|
||||
}
|
||||
if(!empty($institution) && !mb_check_encoding($institution, 'UTF-8')){
|
||||
$institution = mb_convert_encoding($institution, 'UTF-8', 'GBK');
|
||||
}
|
||||
$aCompany[$number] = $institution;
|
||||
}
|
||||
return $aCompany;
|
||||
@@ -831,7 +992,7 @@ class ArticleParserService
|
||||
private function getContentAfterText($targetText,$return_type = 2){
|
||||
$found = false;
|
||||
$content = [];
|
||||
$stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract'];
|
||||
$stopKeywords = ['关键词', 'Key words', '摘要', 'Abstract','ABSTRACT'];
|
||||
$maxLines = 200;
|
||||
$lineNumber = 0;
|
||||
foreach ($this->sections as $section) {
|
||||
@@ -873,7 +1034,7 @@ class ArticleParserService
|
||||
return $content;
|
||||
}
|
||||
$content = implode("\n", $content);
|
||||
if(!mb_check_encoding($content, 'UTF-8')){
|
||||
if(!empty($content) && !mb_check_encoding($content, 'UTF-8')){
|
||||
$content = mb_convert_encoding($content, 'UTF-8', 'GBK');
|
||||
}
|
||||
return $content;
|
||||
@@ -952,7 +1113,9 @@ class ArticleParserService
|
||||
$text = preg_replace('/[\x00-\x1F\x7F-\x9F]/', ' ', $text); // 移除控制字符
|
||||
$text = str_replace(["\t", "\r", "\n"], ' ', $text); // 统一空白字符
|
||||
$text = preg_replace('/\s+/', ' ', $text); // 合并多个空格
|
||||
|
||||
if(!empty($text) && !mb_check_encoding($text, 'UTF-8')){
|
||||
$text = mb_convert_encoding($text, 'UTF-8', 'GBK');
|
||||
}
|
||||
return $text;
|
||||
}
|
||||
|
||||
@@ -993,7 +1156,8 @@ class ArticleParserService
|
||||
$sContent .= "\n";
|
||||
}
|
||||
}
|
||||
if(!mb_check_encoding($sContent, 'UTF-8')){
|
||||
|
||||
if(!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')){
|
||||
$sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK');
|
||||
}
|
||||
// 2. 基础文本清理(合并多余空格,保留有效换行)
|
||||
@@ -1002,7 +1166,7 @@ class ArticleParserService
|
||||
|
||||
// 3. 提取摘要
|
||||
$abstract = '';
|
||||
$abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords:|$)/i';
|
||||
$abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords|$)/i';
|
||||
if (preg_match($abstractPattern, $textContent, $abstractMatches)) {
|
||||
$abstract = trim($abstractMatches[1]);
|
||||
$abstract = preg_replace('/\n+/', ' ', $abstract);
|
||||
@@ -1010,7 +1174,8 @@ class ArticleParserService
|
||||
// 4. 提取关键词(核心:仅保留两种强制匹配逻辑)
|
||||
$keywords = [];
|
||||
// $keywordPattern = '/Keywords:\s*([\s\S]*?)(?=\s*\d+\.|[;,]\s*[\r\n]+\s*[\r\n]+|(?i)\bintroduction|abbreviations\b|$)/i';
|
||||
$keywordPattern = '/Keywords:\s*(.*?)\s*Keywords-End-Flag/s';
|
||||
$keywordPattern = '/Keywords\s*(.*?)\s*Keywords-End-Flag/s';
|
||||
|
||||
if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
|
||||
$keywordStr = trim($keywordMatches[1]);
|
||||
|
||||
@@ -1025,6 +1190,22 @@ class ArticleParserService
|
||||
return !empty($item) && !ctype_space($item);
|
||||
});
|
||||
}
|
||||
if(empty($keywords)){
|
||||
$keywordPattern = '/Keywords\s*([\s\S]*?)(?=Introduction|$)/i';
|
||||
if (preg_match($keywordPattern, $textContent, $keywordMatches)) {
|
||||
$keywordStr = trim($keywordMatches[1]);
|
||||
// 清理关键词列表格式(去除换行、末尾多余符号)
|
||||
$keywordStr = preg_replace('/\n+/', ' ', $keywordStr);
|
||||
$keywordStr = rtrim($keywordStr, ';,. '); // 去除末尾分号、逗号等
|
||||
$keywordStr = trim($keywordStr);
|
||||
|
||||
// 分割并过滤有效关键词
|
||||
$keywords = preg_split('/[,;]\s*/', $keywordStr);
|
||||
$keywords = array_filter(array_map('trim', $keywords), function($item) {
|
||||
return !empty($item) && !ctype_space($item);
|
||||
});
|
||||
}
|
||||
}
|
||||
return [
|
||||
'status' => 1,
|
||||
'msg' => '提取成功',
|
||||
|
||||
Reference in New Issue
Block a user