Files
nuttyreading-java/src/test/java/com/peanut/service/Test.java
cys841515238 2733a60b97 first commit
2023-03-02 16:13:28 +08:00

225 lines
7.0 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package com.peanut.service;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Base64;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.aspose.words.Document;
import com.aspose.words.SaveFormat;
/**
 * Command-line utility that converts a Word document to HTML with Aspose.Words, inlines the
 * exported images as base64 data URIs and splits the HTML into chapters keyed by heading text.
 *
 * @author zcc
 */
public class Test
{
    /** Word document that is converted when no path is supplied on the command line. */
    private static final String DEFAULT_WORD_PATH = "C:\\Users\\Administrator\\Documents\\WeChat Files\\yl328572838\\FileStorage\\File\\2022-10\\桃源探秘.docx";

    /** Working directory that is used when none is supplied on the command line. */
    private static final String DEFAULT_TEMP_DIR = "D:\\chapter\\";

    /** Name of the intermediate HTML file written into the working directory. */
    private static final String HTML_FILE_NAME = "test.html";

    /**
     * Converts the Word document to HTML, extracts the chapters and prints the chapter stored
     * under the key {@code h1_标题名称}.
     *
     * @param args optional; {@code args[0]} is the Word document path and {@code args[1]} the
     *             working directory. Missing or blank entries fall back to the built-in defaults.
     */
    public static void main(String[] args)
    {
        String filePath = argOrDefault(args, 0, DEFAULT_WORD_PATH);
        String tempPath = argOrDefault(args, 1, DEFAULT_TEMP_DIR);
        try
        {
            // Make sure the working directory exists before anything is written into it.
            File tempDir = new File(tempPath);
            if (!tempDir.isDirectory() && !tempDir.mkdirs())
            {
                throw new IOException("Cannot create working directory: " + tempDir);
            }
            // Export the whole Word document as a single HTML file inside the working directory.
            String htmlFilePath = new File(tempDir, HTML_FILE_NAME).getPath();
            try (InputStream input = new FileInputStream(filePath))
            {
                Document doc = new Document(input);
                doc.save(htmlFilePath, SaveFormat.HTML);
            }
            String htmlStr = getHtmlStrFromFile(htmlFilePath);
            org.jsoup.nodes.Document htmlDoc = Jsoup.parse(htmlStr);
            // Image src values are resolved against the working directory and inlined.
            changeImageSrc(htmlDoc, tempPath);
            Map<String, String> map = exactContentFromHtml(htmlDoc, 2);
            String string = map.get("h1_标题名称");
            System.out.println(string);
        }
        catch (Exception e)
        {
            // Top-level boundary of a command-line tool: report the failure and exit normally.
            e.printStackTrace();
        }
    }

    /**
     * Returns {@code args[index]} when it is present and not blank, otherwise {@code fallback}.
     */
    private static String argOrDefault(String[] args, int index, String fallback)
    {
        if (args == null || index >= args.length)
        {
            return fallback;
        }
        String candidate = args[index];
        if (candidate == null || candidate.trim().isEmpty())
        {
            return fallback;
        }
        return candidate;
    }

    /**
     * Splits the HTML into chapters, one per heading.
     * <p>
     * Starting at the first {@code h1}, its following siblings are visited in order. Every
     * {@code h1} (and, when {@code level} is 2, every {@code h2}) starts a new chapter; the outer
     * HTML of all other siblings is appended to the current chapter. Elements before the first
     * {@code h1}, and headings that are not siblings of it, are not treated as chapter starts.
     * A heading whose key repeats an earlier one replaces that earlier chapter's content.
     *
     * @param htmlDoc parsed HTML document
     * @param level   pass 2 to split on {@code h2} as well as {@code h1}; any other value splits
     *                on {@code h1} only
     * @return insertion-ordered map from {@code "h1_" + title} / {@code "h2_" + title} (titles
     *         cleaned by {@link #removeNullChar(String)}) to the chapter's HTML; empty when the
     *         document is {@code null} or has no {@code h1}
     */
    private static Map<String, String> exactContentFromHtml(org.jsoup.nodes.Document htmlDoc, int level)
    {
        Map<String, String> chapters = new LinkedHashMap<String, String>();
        Elements topHeadings = (htmlDoc == null) ? new Elements() : htmlDoc.getElementsByTag("h1");
        if (topHeadings.isEmpty())
        {
            // Nothing to split on: report it and hand back the empty map.
            System.err.println("上传的文件中不存在一级标题,请检查!");
            return chapters;
        }
        boolean splitOnH2 = (level == 2);
        Element firstHeading = topHeadings.get(0);
        // The first key is cleaned exactly like every later key so look-ups are consistent.
        String currentKey = "h1_" + removeNullChar(firstHeading.text());
        StringBuilder currentBody = new StringBuilder();
        for (Element node = firstHeading.nextElementSibling(); node != null; node = node.nextElementSibling())
        {
            String tagName = node.tagName();
            boolean isH1 = "h1".equals(tagName);
            boolean isH2 = splitOnH2 && "h2".equals(tagName);
            if (isH1 || isH2)
            {
                // A new heading closes the chapter collected so far.
                chapters.put(currentKey, currentBody.toString());
                currentBody.setLength(0);
                currentKey = (isH1 ? "h1_" : "h2_") + removeNullChar(node.text());
                continue;
            }
            currentBody.append(node.outerHtml());
        }
        // Flush the last chapter, which has no following heading to close it.
        chapters.put(currentKey, currentBody.toString());
        return chapters;
    }

    /**
     * Removes every blank character from a heading title.
     * <p>
     * Besides the ASCII space this also drops other Unicode whitespace and space separators such
     * as the no-break space (U+00A0) and the ideographic space (U+3000), so that titles differing
     * only in such characters map to the same key.
     *
     * @param text title text, may be {@code null}
     * @return the text without blank characters, or {@code null} when {@code text} is {@code null}
     */
    private static String removeNullChar(String text)
    {
        if (text == null)
        {
            return null;
        }
        StringBuilder kept = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++)
        {
            char c = text.charAt(i);
            if (!Character.isWhitespace(c) && !Character.isSpaceChar(c))
            {
                kept.append(c);
            }
        }
        // trim() additionally strips leading/trailing control characters.
        return kept.toString().trim();
    }

    /**
     * Replaces the {@code src} of every locally stored image with a base64 data URI.
     * <p>
     * Images whose {@code src} is empty, already a {@code data:} URI or an {@code http(s)} URL
     * are left untouched, as are images whose file cannot be read.
     *
     * @param htmlDoc parsed HTML document, modified in place
     * @param file    directory against which relative image paths are resolved
     * @throws IOException declared for callers; unreadable images are skipped, not thrown
     */
    private static void changeImageSrc(org.jsoup.nodes.Document htmlDoc, String file) throws IOException
    {
        for (Element image : htmlDoc.getElementsByTag("img"))
        {
            String src = image.attr("src");
            if (!isLocalImage(src))
            {
                continue;
            }
            String dataUri = imageConvertBase64(new File(file, src).getPath());
            if (dataUri != null)
            {
                // Only overwrite when conversion succeeded, so a failure keeps the original src.
                image.attr("src", dataUri);
            }
        }
    }

    /**
     * Tells whether an image {@code src} should be looked up as a file on disk.
     */
    private static boolean isLocalImage(String src)
    {
        if (src == null || src.trim().isEmpty())
        {
            return false;
        }
        String candidate = src.trim();
        return !(hasPrefixIgnoreCase(candidate, "data:")
                || hasPrefixIgnoreCase(candidate, "http://")
                || hasPrefixIgnoreCase(candidate, "https://"));
    }

    /**
     * Case-insensitive {@code startsWith} that does not depend on the default locale.
     */
    private static boolean hasPrefixIgnoreCase(String value, String prefix)
    {
        return value.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /**
     * Reads an image file and returns it as a {@code data:image/...;base64,...} URI.
     * <p>
     * The media subtype is taken from the file extension ({@code jpg} becomes {@code jpeg},
     * {@code svg} becomes {@code svg+xml}); it is empty when the file name has no extension.
     *
     * @param filePath path of the image file
     * @return the data URI, or {@code null} when the file cannot be read
     * @throws IOException declared for callers; read failures are reported and yield {@code null}
     */
    private static String imageConvertBase64(String filePath) throws IOException
    {
        byte[] data;
        try (InputStream in = new FileInputStream(filePath))
        {
            // Read until end of stream; available() plus a single read() may return fewer bytes.
            data = IOUtils.toByteArray(in);
        }
        catch (IOException e)
        {
            System.err.println("Cannot read image file: " + filePath);
            e.printStackTrace();
            return null;
        }
        // The basic encoder never inserts line breaks, so no whitespace clean-up is needed.
        String encoded = Base64.getEncoder().encodeToString(data);
        return "data:image/" + imageSubtype(filePath) + ";base64," + encoded;
    }

    /**
     * Derives the {@code image/*} media subtype from the extension of the file name.
     */
    private static String imageSubtype(String filePath)
    {
        // Look only at the file name so that dots in directory names are ignored.
        String fileName = new File(filePath).getName();
        int dot = fileName.lastIndexOf('.');
        String extension = (dot < 0) ? "" : fileName.substring(dot + 1);
        if ("jpg".equalsIgnoreCase(extension))
        {
            return "jpeg";
        }
        if ("svg".equalsIgnoreCase(extension))
        {
            return "svg+xml";
        }
        return extension;
    }

    /**
     * Reads a whole file and decodes it as UTF-8 text.
     *
     * @param filePath path of the HTML file
     * @return the file content
     * @throws IOException if the file cannot be opened or read
     */
    private static String getHtmlStrFromFile(String filePath) throws IOException
    {
        try (InputStream in = new FileInputStream(filePath))
        {
            return new String(IOUtils.toByteArray(in), "UTF-8");
        }
    }
}