nuttyreading-java/src/test/java/com/peanut/service/Test.java

package com.peanut.service;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Base64;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.aspose.words.Document;
import com.aspose.words.SaveFormat;

/**
 * Converts a Word document to HTML and extracts the HTML content that follows
 * each chapter heading.
 *
 * <p>The flow implemented by {@link #main(String[])} is: save the Word file as
 * HTML with Aspose.Words, parse that HTML with jsoup, inline every local image
 * as a base64 data URI, and finally split the document into a map keyed by
 * heading ({@code h1_<title>} / {@code h2_<title>}).
 *
 * @author zcc
 */
public class Test
{

    /** Word file parsed when no path is supplied on the command line. */
    private static final String DEFAULT_WORD_PATH = "C:\\Users\\Administrator\\Documents\\WeChat Files\\yl328572838\\FileStorage\\File\\2022-10\\桃源探秘.docx";

    /** Working directory used when none is supplied on the command line. */
    private static final String DEFAULT_TEMP_DIR = "D:\\chapter\\";

    /** Map key whose content is printed when none is supplied on the command line. */
    private static final String DEFAULT_CHAPTER_KEY = "h1_标题名称";

    /**
     * Runs the conversion and prints the content stored under one chapter key.
     *
     * @param args optional overrides: {@code args[0]} = path of the Word file,
     *            {@code args[1]} = working directory that receives the generated
     *            HTML, {@code args[2]} = chapter key to print. Missing entries
     *            fall back to the built-in defaults.
     */
    public static void main(String[] args)
    {
        String filePath = argOrDefault(args, 0, DEFAULT_WORD_PATH);
        String tempPath = argOrDefault(args, 1, DEFAULT_TEMP_DIR);
        String chapterKey = argOrDefault(args, 2, DEFAULT_CHAPTER_KEY);

        try
        {
            // Make sure the working directory exists before anything is written to it.
            File tempDir = new File(tempPath);
            if (!tempDir.isDirectory() && !tempDir.mkdirs())
            {
                throw new IOException("Unable to create working directory: " + tempDir);
            }

            // Save the whole Word document as a single HTML file in the working directory.
            String htmlFilePath = new File(tempDir, "test.html").getPath();
            try (InputStream input = new FileInputStream(filePath))
            {
                Document doc = new Document(input);
                doc.save(htmlFilePath, SaveFormat.HTML);
            }

            String htmlStr = getHtmlStrFromFile(htmlFilePath);
            org.jsoup.nodes.Document htmlDoc = Jsoup.parse(htmlStr);
            // Image src values are resolved against the working directory.
            changeImageSrc(htmlDoc, tempPath);
            Map<String, String> map = exactContentFromHtml(htmlDoc, 2);
            System.out.println(map.get(chapterKey));
        }
        catch (Exception e)
        {
            // Top-level boundary of a command-line tool: report the failure and exit normally.
            e.printStackTrace();
        }
    }

    /**
     * Returns {@code args[index]} when present, otherwise {@code fallback}.
     */
    private static String argOrDefault(String[] args, int index, String fallback)
    {
        return (args != null && args.length > index && args[index] != null) ? args[index] : fallback;
    }

    /**
     * Splits the HTML into chapters, starting at the first {@code h1} element
     * and walking its following siblings.
     *
     * <p>Every {@code h1} (and, when {@code level == 2}, every {@code h2})
     * starts a new chapter; the outer HTML of all other sibling elements is
     * appended to the current chapter. Chapters with the same key overwrite
     * each other because the result is a map.
     *
     * @param htmlDoc parsed HTML document
     * @param level   {@code 2} to treat {@code h2} elements as chapter
     *                boundaries as well; any other value splits on {@code h1} only
     * @return insertion-ordered map with key {@code h1_<title>} or
     *         {@code h2_<title>} and the chapter HTML as value; empty when the
     *         document contains no {@code h1}
     */
    private static Map<String, String> exactContentFromHtml(org.jsoup.nodes.Document htmlDoc, int level)
    {
        Map<String, String> value = new LinkedHashMap<String, String>();

        Elements headings = htmlDoc.getElementsByTag("h1");
        if (headings == null || headings.isEmpty())
        {
            // No first-level heading: report it and return the empty result.
            System.err.println("上传的文件中不存在一级标题，请检查！");
            return value;
        }

        Element ele = headings.get(0);
        // Normalise the first title the same way as every later title so keys are consistent.
        String currentKey = "h1_" + removeNullChar(ele.text());
        StringBuilder content = new StringBuilder();

        for (ele = ele.nextElementSibling(); ele != null; ele = ele.nextElementSibling())
        {
            String tagName = ele.tagName();
            boolean startsChapter = "h1".equals(tagName) || (level == 2 && "h2".equals(tagName));
            if (startsChapter)
            {
                // Close the chapter collected so far and open the next one.
                value.put(currentKey, content.toString());
                content.setLength(0);
                currentKey = tagName + "_" + removeNullChar(ele.text());
                continue;
            }
            content.append(ele.outerHtml());
        }

        // Flush the last chapter once the siblings are exhausted.
        value.put(currentKey, content.toString());
        return value;
    }

    /**
     * Removes the space characters matched by the pattern below from the text
     * and trims the result. No other characters (for example {@code *}) are
     * removed.
     *
     * @param text text to clean, may be {@code null}
     * @return cleaned text, or {@code null} when {@code text} is {@code null}
     */
    private static String removeNullChar(String text)
    {
        if (text == null)
        {
            return null;
        }
        return text.replaceAll(" ", "").trim();
    }

    /**
     * Replaces the {@code src} of every {@code img} element that points to an
     * existing local file with a base64 data URI of that file.
     *
     * <p>Images whose {@code src} is empty, is already a {@code data:} URI, or
     * does not resolve to a file under {@code file} are left unchanged.
     *
     * @param htmlDoc parsed HTML document, modified in place
     * @param file    directory against which the image {@code src} values are resolved
     * @throws IOException if an existing image file cannot be read
     */
    private static void changeImageSrc(org.jsoup.nodes.Document htmlDoc, String file) throws IOException
    {
        for (Element image : htmlDoc.getElementsByTag("img"))
        {
            String src = image.attr("src");
            if (src.isEmpty() || src.startsWith("data:"))
            {
                continue;
            }
            File imageFile = new File(file, src);
            if (!imageFile.isFile())
            {
                // Not a local file (e.g. a remote URL): keep the original reference.
                System.err.println("Image file not found, src left unchanged: " + imageFile);
                continue;
            }
            image.attr("src", imageConvertBase64(imageFile.getPath()));
        }
    }

    /**
     * Reads an image file and returns it as a base64 data URI.
     *
     * <p>The media type is derived from the file-name extension (lower-cased,
     * {@code jpg} mapped to {@code jpeg}); a file name without an extension
     * yields {@code application/octet-stream}.
     *
     * @param filePath path of the image file
     * @return data URI of the form {@code data:<media type>;base64,<payload>}
     * @throws IOException if the file cannot be opened or read
     */
    private static String imageConvertBase64(String filePath) throws IOException
    {
        // Look at the file name only, so dots in directory names are not mistaken for an extension.
        String fileName = new File(filePath).getName();
        int dot = fileName.lastIndexOf('.');
        String extension = dot >= 0 ? fileName.substring(dot + 1).toLowerCase(java.util.Locale.ROOT) : "";
        if ("jpg".equals(extension))
        {
            extension = "jpeg"; // the registered media type is image/jpeg
        }
        String mediaType = extension.isEmpty() ? "application/octet-stream" : "image/" + extension;

        byte[] data;
        try (InputStream in = new FileInputStream(filePath))
        {
            // Reads until end of stream; a single read() call may return fewer bytes.
            data = IOUtils.toByteArray(in);
        }
        // The basic encoder emits no line separators, so no whitespace clean-up is needed.
        return "data:" + mediaType + ";base64," + Base64.getEncoder().encodeToString(data);
    }

    /**
     * Reads the whole file as a UTF-8 string.
     *
     * @param filePath path of the file to read
     * @return file content decoded as UTF-8
     * @throws IOException if the file cannot be opened or read
     */
    private static String getHtmlStrFromFile(String filePath) throws IOException
    {
        try (InputStream in = new FileInputStream(filePath))
        {
            return IOUtils.toString(in, "UTF-8");
        }
    }
}