package com.peanut.service;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.util.Base64;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.aspose.words.Document;
import com.aspose.words.SaveFormat;

/**
 * Parses a Word document and extracts the HTML content found under each chapter heading.
 *
 * <p>The document is converted to HTML with Aspose.Words, the exported images are inlined as
 * base64 data URIs, and the HTML is then split into sections keyed by heading.
 *
 * @author zcc
 */
public class Test {

    /** Word file processed when no path is given on the command line. */
    private static final String DEFAULT_WORD_FILE =
            "C:\\Users\\Administrator\\Documents\\WeChat Files\\yl328572838\\FileStorage\\File\\2022-10\\桃源探秘.docx";

    /** Working directory used when no directory is given on the command line. */
    private static final String DEFAULT_TEMP_DIR = "D:\\chapter\\";

    /**
     * Converts a Word file to HTML, splits it into sections and prints the section stored under
     * the key {@code h1_标题名称}.
     *
     * @param args optional overrides: {@code args[0]} is the Word file path, {@code args[1]} is
     *             the working directory; the built-in defaults are used for missing entries
     */
    public static void main(String[] args) {
        // Path of the Word file.
        String filePath = args != null && args.length > 0 ? args[0] : DEFAULT_WORD_FILE;
        String tempPath = args != null && args.length > 1 ? args[1] : DEFAULT_TEMP_DIR;
        try {
            // Create the working directory first; the HTML export is written there.
            File tempDir = new File(tempPath);
            if (!tempDir.isDirectory() && !tempDir.mkdirs()) {
                throw new IOException("Cannot create working directory: " + tempDir);
            }
            try (InputStream input = new FileInputStream(filePath)) {
                Document doc = new Document(input);
                // Path of the HTML file that receives the full conversion of the Word document.
                String htmlFilePath = new File(tempDir, "test.html").getPath();
                doc.save(htmlFilePath, SaveFormat.HTML);

                String htmlStr = getHtmlStrFromFile(htmlFilePath);
                org.jsoup.nodes.Document htmlDoc = Jsoup.parse(htmlStr);
                // Inline the exported images so each section's HTML is self-contained.
                changeImageSrc(htmlDoc, tempDir.getPath());

                Map<String, String> map = exactContentFromHtml(htmlDoc, 2);
                String string = map.get("h1_标题名称");
                System.out.println(string);
            }
        } catch (Exception e) {
            // Top-level boundary of a command-line tool: report the failure and exit normally.
            e.printStackTrace();
        }
    }

    /**
     * Splits the HTML into sections, one per heading.
     *
     * <p>Starting at the first {@code h1}, the following sibling elements are walked in order.
     * Every {@code h1} (and, when {@code level == 2}, every {@code h2}) closes the current
     * section and opens a new one. Only siblings of the first {@code h1} are visited; anything
     * before it is ignored. If two headings produce the same key, the later section replaces
     * the earlier one.
     *
     * @param htmlDoc parsed HTML document
     * @param level   pass {@code 2} to split on {@code h2} headings as well as {@code h1}
     * @return insertion-ordered map whose key is {@code h1_<title>} or {@code h2_<title>} (title
     *         with spaces removed) and whose value is the concatenated outer HTML of the
     *         section's elements; empty when the document has no {@code h1}
     */
    private static Map<String, String> exactContentFromHtml(org.jsoup.nodes.Document htmlDoc, int level) {
        Map<String, String> sections = new LinkedHashMap<>();
        Elements h1List = htmlDoc.getElementsByTag("h1");
        if (h1List.isEmpty()) {
            // No level-1 heading: report it and return the empty result.
            System.err.println("上传的文件中不存在一级标题,请检查!");
            return sections;
        }

        Element firstHeading = h1List.get(0);
        // Normalise the first title the same way as all later ones so keys are consistent.
        String currentKey = "h1_" + removeNullChar(firstHeading.text());
        StringBuilder currentHtml = new StringBuilder();

        for (Element node = firstHeading.nextElementSibling(); node != null; node = node.nextElementSibling()) {
            String tagName = node.tagName();
            boolean startsNewSection = "h1".equals(tagName) || (level == 2 && "h2".equals(tagName));
            if (startsNewSection) {
                sections.put(currentKey, currentHtml.toString());
                currentHtml.setLength(0);
                currentKey = tagName + "_" + removeNullChar(node.text());
            } else {
                currentHtml.append(node.outerHtml());
            }
        }
        // Flush the section that was still open when the siblings ran out.
        sections.put(currentKey, currentHtml.toString());
        return sections;
    }

    /**
     * Removes every space character from the text and trims the result.
     *
     * @param text text to clean, may be {@code null}
     * @return the cleaned text, or {@code null} when {@code text} is {@code null}
     */
    private static String removeNullChar(String text) {
        if (text == null) {
            return null;
        }
        return text.replace(" ", "").trim();
    }

    /**
     * Rewrites the {@code src} of every local image to a base64 data URI.
     *
     * <p>Images whose {@code src} is blank, already a data URI, or a remote URL are left alone.
     * An image that cannot be read is reported on standard error and keeps its original
     * {@code src}, so one broken image does not abort the whole conversion.
     *
     * @param htmlDoc parsed HTML document, modified in place
     * @param file    directory against which each image {@code src} is resolved
     * @throws IOException declared for compatibility with existing callers
     */
    private static void changeImageSrc(org.jsoup.nodes.Document htmlDoc, String file) throws IOException {
        for (Element image : htmlDoc.getElementsByTag("img")) {
            String src = image.attr("src");
            if (StringUtils.isBlank(src) || isInlineOrRemote(src)) {
                continue;
            }
            try {
                image.attr("src", imageConvertBase64(new File(file, src).getPath()));
            } catch (IOException e) {
                System.err.println("Could not inline image '" + src + "': " + e);
            }
        }
    }

    /** Returns whether the image source is already a data URI or points to a remote resource. */
    private static boolean isInlineOrRemote(String src) {
        String lower = src.trim().toLowerCase(Locale.ROOT);
        return lower.startsWith("data:")
                || lower.startsWith("http://")
                || lower.startsWith("https://")
                || lower.startsWith("//");
    }

    /**
     * Reads an image file and encodes it as a base64 data URI.
     *
     * @param filePath path of the image file
     * @return a string of the form {@code data:<mime type>;base64,<payload>}
     * @throws IOException if the path is invalid or the file cannot be read
     */
    private static String imageConvertBase64(String filePath) throws IOException {
        File imageFile = new File(filePath);
        byte[] data;
        try {
            // readAllBytes reads the complete file; a single InputStream.read call may not.
            data = Files.readAllBytes(imageFile.toPath());
        } catch (InvalidPathException e) {
            throw new IOException("Invalid image path: " + filePath, e);
        }
        // The basic encoder emits no line separators, so no whitespace clean-up is required.
        String payload = Base64.getEncoder().encodeToString(data);
        return "data:" + imageMimeType(imageFile.getName()) + ";base64," + payload;
    }

    /** Derives the MIME type from the file name's extension (not from the directory part). */
    private static String imageMimeType(String fileName) {
        int dot = fileName.lastIndexOf('.');
        if (dot < 0 || dot == fileName.length() - 1) {
            return "application/octet-stream";
        }
        String extension = fileName.substring(dot + 1).toLowerCase(Locale.ROOT);
        switch (extension) {
            case "jpg":
            case "jpe":
                return "image/jpeg";
            case "svg":
                return "image/svg+xml";
            default:
                return "image/" + extension;
        }
    }

    /**
     * Reads a whole HTML file into a string, decoding it as UTF-8.
     *
     * @param filePath path of the HTML file
     * @return the file content
     * @throws IOException if the file cannot be read
     */
    private static String getHtmlStrFromFile(String filePath) throws IOException {
        byte[] content = Files.readAllBytes(new File(filePath).toPath());
        return new String(content, StandardCharsets.UTF_8);
    }
}