225 lines
7.0 KiB
Java
225 lines
7.0 KiB
Java
package com.peanut.service;
|
||
|
||
import java.io.File;
|
||
import java.io.FileInputStream;
|
||
import java.io.IOException;
|
||
import java.io.InputStream;
|
||
import java.util.Base64;
|
||
import java.util.LinkedHashMap;
|
||
import java.util.Map;
|
||
import java.util.regex.Matcher;
|
||
import java.util.regex.Pattern;
|
||
|
||
import org.apache.commons.io.IOUtils;
|
||
import org.apache.commons.lang.StringUtils;
|
||
import org.jsoup.Jsoup;
|
||
import org.jsoup.nodes.Element;
|
||
import org.jsoup.select.Elements;
|
||
import com.aspose.words.Document;
|
||
import com.aspose.words.SaveFormat;
|
||
|
||
/**
|
||
* @author zcc 解析word获取对应章节下面的数据
|
||
*/
|
||
public class Test
|
||
{
|
||
|
||
public static void main(String[] args)
|
||
{
|
||
// word文件的路径
|
||
String filePath = "C:\\Users\\Administrator\\Documents\\WeChat Files\\yl328572838\\FileStorage\\File\\2022-10\\桃源探秘.docx";
|
||
InputStream input = null;
|
||
|
||
try
|
||
{
|
||
// 先创建一个临时目录文件夹
|
||
String tempPath = "D:\\chapter\\";
|
||
File tempFile = new File(tempPath);
|
||
if (!tempFile.exists())
|
||
{
|
||
tempFile.mkdirs();
|
||
}
|
||
|
||
input = new FileInputStream(filePath);
|
||
Document doc = new Document(input);
|
||
// 将word文档全量转成html显示的html文件路径
|
||
String htmlFilePath = tempPath + "test.html";
|
||
doc.save(htmlFilePath, SaveFormat.HTML);
|
||
String htmlStr = getHtmlStrFromFile(htmlFilePath);
|
||
org.jsoup.nodes.Document htmlDoc = Jsoup.parse(htmlStr);
|
||
changeImageSrc(htmlDoc, tempPath);// 转换图片格式
|
||
Map<String, String> map = exactContentFromHtml(htmlDoc, 2);
|
||
String string = map.get("h1_标题名称");
|
||
System.out.println(string);
|
||
}
|
||
catch (Exception e)
|
||
{
|
||
e.printStackTrace();
|
||
}
|
||
finally
|
||
{
|
||
IOUtils.closeQuietly(input);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 从html中抽取出预规内容,章节对应章节的内容。
|
||
* @return key=h1_name/h2_name,value=html
|
||
*/
|
||
private static Map<String, String> exactContentFromHtml(org.jsoup.nodes.Document htmlDoc, int level)
|
||
{
|
||
Map<String, String> value = new LinkedHashMap<String, String>();
|
||
try
|
||
{
|
||
Elements eleList = htmlDoc.getElementsByTag("h1");
|
||
if (eleList == null || eleList.size() == 0)
|
||
{
|
||
throw new Exception("上传的文件中不存在一级标题,请检查!");
|
||
}
|
||
Element ele = eleList.get(0);
|
||
String tempKey = "h1_" + ele.text();
|
||
StringBuffer tempBuffer = new StringBuffer();
|
||
while (true)
|
||
{
|
||
ele = ele.nextElementSibling();// 获取当前节点的下一个节点
|
||
if (ele == null)
|
||
{
|
||
if (StringUtils.isNotEmpty(tempKey))
|
||
{
|
||
value.put(tempKey, tempBuffer.toString());
|
||
}
|
||
break;
|
||
}
|
||
String eleTagName = ele.tagName();// 标签名称
|
||
if ("h1".equals(eleTagName))
|
||
{
|
||
if (StringUtils.isNotEmpty(tempKey))
|
||
{
|
||
value.put(tempKey, tempBuffer.toString());
|
||
tempBuffer.setLength(0);
|
||
}
|
||
tempKey = "h1_" + removeNullChar(ele.text());
|
||
continue;
|
||
}
|
||
if (level == 2)
|
||
{
|
||
if ("h2".equals(eleTagName))
|
||
{
|
||
if (StringUtils.isNotEmpty(tempKey))
|
||
{
|
||
value.put(tempKey, tempBuffer.toString());
|
||
tempBuffer.setLength(0);
|
||
}
|
||
tempKey = "h2_" + removeNullChar(ele.text());
|
||
continue;
|
||
}
|
||
}
|
||
|
||
tempBuffer.append(ele.outerHtml());
|
||
}
|
||
}
|
||
catch (Exception e)
|
||
{
|
||
e.printStackTrace();
|
||
}
|
||
return value;
|
||
}
|
||
|
||
/**
|
||
* 移除空字符串和*字符
|
||
* @param text
|
||
* @return
|
||
*/
|
||
private static String removeNullChar(String text)
|
||
{
|
||
if (text == null)
|
||
{
|
||
return null;
|
||
}
|
||
return text.replaceAll(" ", "").trim();
|
||
}
|
||
|
||
/**
|
||
* 修改图片的src,改为base64格式
|
||
* @param htmlDoc org.jsoup.nodes.Document
|
||
* @param file 文件路径
|
||
* @throws IOException
|
||
*/
|
||
private static void changeImageSrc(org.jsoup.nodes.Document htmlDoc, String file) throws IOException
|
||
{
|
||
Elements images = htmlDoc.getElementsByTag("img"); // 获取所有的image,转码。
|
||
for (int i = 0; i < images.size(); i++)
|
||
{
|
||
Element tempImage = images.get(i);
|
||
String tempSrc = tempImage.attr("src");
|
||
tempImage.attr("src", imageConvertBase64(file + File.separator + tempSrc));
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 把图片转换成base64格式
|
||
* @return
|
||
* @throws IOException
|
||
*/
|
||
private static String imageConvertBase64(String filePath) throws IOException
|
||
{
|
||
InputStream in = null;
|
||
byte[] data = null;
|
||
try
|
||
{
|
||
String fileExtension = filePath.substring(filePath.lastIndexOf(".") + 1); // 获取文件后缀
|
||
in = new FileInputStream(new File(filePath));
|
||
data = new byte[in.available()];
|
||
in.read(data);
|
||
in.close();
|
||
String enCoderContent = new String(Base64.getEncoder().encode(data));
|
||
Pattern p = Pattern.compile("\\s*|\t|\r|\n");
|
||
Matcher m = p.matcher(enCoderContent);
|
||
enCoderContent = m.replaceAll("");
|
||
return "data:image/" + fileExtension + ";base64," + enCoderContent;
|
||
}
|
||
catch (IOException e)
|
||
{
|
||
e.printStackTrace();
|
||
}
|
||
finally
|
||
{
|
||
if (in != null)
|
||
{
|
||
in.close();
|
||
}
|
||
}
|
||
return null;
|
||
}
|
||
|
||
/**
|
||
* 从html文件中读取文件信息string
|
||
* @param filePath
|
||
* @return
|
||
* @throws IOException
|
||
*/
|
||
private static String getHtmlStrFromFile(String filePath) throws IOException
|
||
{
|
||
FileInputStream in = null;
|
||
File file = new File(filePath);
|
||
Long filelength = file.length();
|
||
byte[] filecontent = new byte[filelength.intValue()];
|
||
try
|
||
{
|
||
in = new FileInputStream(file);
|
||
in.read(filecontent);
|
||
}
|
||
catch (IOException e)
|
||
{
|
||
e.printStackTrace();
|
||
}
|
||
finally
|
||
{
|
||
IOUtils.closeQuietly(in);
|
||
}
|
||
return new String(filecontent, "UTF-8");
|
||
}
|
||
}
|
||
|
||
|