first commit
This commit is contained in:
224
src/test/java/com/peanut/service/Test.java
Normal file
224
src/test/java/com/peanut/service/Test.java
Normal file
@@ -0,0 +1,224 @@
|
||||
package com.peanut.service;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Base64;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import com.aspose.words.Document;
|
||||
import com.aspose.words.SaveFormat;
|
||||
|
||||
/**
|
||||
* @author zcc 解析word获取对应章节下面的数据
|
||||
*/
|
||||
public class Test
|
||||
{
|
||||
|
||||
public static void main(String[] args)
|
||||
{
|
||||
// word文件的路径
|
||||
String filePath = "C:\\Users\\Administrator\\Documents\\WeChat Files\\yl328572838\\FileStorage\\File\\2022-10\\桃源探秘.docx";
|
||||
InputStream input = null;
|
||||
|
||||
try
|
||||
{
|
||||
// 先创建一个临时目录文件夹
|
||||
String tempPath = "D:\\chapter\\";
|
||||
File tempFile = new File(tempPath);
|
||||
if (!tempFile.exists())
|
||||
{
|
||||
tempFile.mkdirs();
|
||||
}
|
||||
|
||||
input = new FileInputStream(filePath);
|
||||
Document doc = new Document(input);
|
||||
// 将word文档全量转成html显示的html文件路径
|
||||
String htmlFilePath = tempPath + "test.html";
|
||||
doc.save(htmlFilePath, SaveFormat.HTML);
|
||||
String htmlStr = getHtmlStrFromFile(htmlFilePath);
|
||||
org.jsoup.nodes.Document htmlDoc = Jsoup.parse(htmlStr);
|
||||
changeImageSrc(htmlDoc, tempPath);// 转换图片格式
|
||||
Map<String, String> map = exactContentFromHtml(htmlDoc, 2);
|
||||
String string = map.get("h1_标题名称");
|
||||
System.out.println(string);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
}
|
||||
finally
|
||||
{
|
||||
IOUtils.closeQuietly(input);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 从html中抽取出预规内容,章节对应章节的内容。
|
||||
* @return key=h1_name/h2_name,value=html
|
||||
*/
|
||||
private static Map<String, String> exactContentFromHtml(org.jsoup.nodes.Document htmlDoc, int level)
|
||||
{
|
||||
Map<String, String> value = new LinkedHashMap<String, String>();
|
||||
try
|
||||
{
|
||||
Elements eleList = htmlDoc.getElementsByTag("h1");
|
||||
if (eleList == null || eleList.size() == 0)
|
||||
{
|
||||
throw new Exception("上传的文件中不存在一级标题,请检查!");
|
||||
}
|
||||
Element ele = eleList.get(0);
|
||||
String tempKey = "h1_" + ele.text();
|
||||
StringBuffer tempBuffer = new StringBuffer();
|
||||
while (true)
|
||||
{
|
||||
ele = ele.nextElementSibling();// 获取当前节点的下一个节点
|
||||
if (ele == null)
|
||||
{
|
||||
if (StringUtils.isNotEmpty(tempKey))
|
||||
{
|
||||
value.put(tempKey, tempBuffer.toString());
|
||||
}
|
||||
break;
|
||||
}
|
||||
String eleTagName = ele.tagName();// 标签名称
|
||||
if ("h1".equals(eleTagName))
|
||||
{
|
||||
if (StringUtils.isNotEmpty(tempKey))
|
||||
{
|
||||
value.put(tempKey, tempBuffer.toString());
|
||||
tempBuffer.setLength(0);
|
||||
}
|
||||
tempKey = "h1_" + removeNullChar(ele.text());
|
||||
continue;
|
||||
}
|
||||
if (level == 2)
|
||||
{
|
||||
if ("h2".equals(eleTagName))
|
||||
{
|
||||
if (StringUtils.isNotEmpty(tempKey))
|
||||
{
|
||||
value.put(tempKey, tempBuffer.toString());
|
||||
tempBuffer.setLength(0);
|
||||
}
|
||||
tempKey = "h2_" + removeNullChar(ele.text());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
tempBuffer.append(ele.outerHtml());
|
||||
}
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
/**
|
||||
* 移除空字符串和*字符
|
||||
* @param text
|
||||
* @return
|
||||
*/
|
||||
private static String removeNullChar(String text)
|
||||
{
|
||||
if (text == null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
return text.replaceAll(" ", "").trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* 修改图片的src,改为base64格式
|
||||
* @param htmlDoc org.jsoup.nodes.Document
|
||||
* @param file 文件路径
|
||||
* @throws IOException
|
||||
*/
|
||||
private static void changeImageSrc(org.jsoup.nodes.Document htmlDoc, String file) throws IOException
|
||||
{
|
||||
Elements images = htmlDoc.getElementsByTag("img"); // 获取所有的image,转码。
|
||||
for (int i = 0; i < images.size(); i++)
|
||||
{
|
||||
Element tempImage = images.get(i);
|
||||
String tempSrc = tempImage.attr("src");
|
||||
tempImage.attr("src", imageConvertBase64(file + File.separator + tempSrc));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 把图片转换成base64格式
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
private static String imageConvertBase64(String filePath) throws IOException
|
||||
{
|
||||
InputStream in = null;
|
||||
byte[] data = null;
|
||||
try
|
||||
{
|
||||
String fileExtension = filePath.substring(filePath.lastIndexOf(".") + 1); // 获取文件后缀
|
||||
in = new FileInputStream(new File(filePath));
|
||||
data = new byte[in.available()];
|
||||
in.read(data);
|
||||
in.close();
|
||||
String enCoderContent = new String(Base64.getEncoder().encode(data));
|
||||
Pattern p = Pattern.compile("\\s*|\t|\r|\n");
|
||||
Matcher m = p.matcher(enCoderContent);
|
||||
enCoderContent = m.replaceAll("");
|
||||
return "data:image/" + fileExtension + ";base64," + enCoderContent;
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (in != null)
|
||||
{
|
||||
in.close();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* 从html文件中读取文件信息string
|
||||
* @param filePath
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
private static String getHtmlStrFromFile(String filePath) throws IOException
|
||||
{
|
||||
FileInputStream in = null;
|
||||
File file = new File(filePath);
|
||||
Long filelength = file.length();
|
||||
byte[] filecontent = new byte[filelength.intValue()];
|
||||
try
|
||||
{
|
||||
in = new FileInputStream(file);
|
||||
in.read(filecontent);
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
}
|
||||
finally
|
||||
{
|
||||
IOUtils.closeQuietly(in);
|
||||
}
|
||||
return new String(filecontent, "UTF-8");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user