first commit

2023-03-02 16:13:28 +08:00
commit 2733a60b97
741 changed files with 76931 additions and 0 deletions
--- a/src/test/java/com/peanut/service/Test.java
+++ b/src/test/java/com/peanut/service/Test.java
@@ -0,0 +1,224 @@
+package com.peanut.service;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Base64;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import com.aspose.words.Document;
+import com.aspose.words.SaveFormat;
+
+/**
+ * Converts a Word document to HTML with Aspose.Words, inlines the exported
+ * images as Base64 data URIs and splits the HTML into the content that
+ * follows each chapter heading.
+ *
+ * @author zcc
+ */
+public class Test
+{
+    /** Word document that is converted when no path is given on the command line. */
+    private static final String DEFAULT_WORD_FILE = "C:\\Users\\Administrator\\Documents\\WeChat Files\\yl328572838\\FileStorage\\File\\2022-10\\桃源探秘.docx";
+
+    /** Working directory that is used when none is given on the command line. */
+    private static final String DEFAULT_TEMP_DIR = "D:\\chapter\\";
+
+    /** Key of the section whose HTML is printed by {@link #main(String[])}. */
+    private static final String SECTION_KEY = "h1_标题名称";
+
+    /**
+     * Converts the Word document to HTML and prints the content stored under
+     * {@link #SECTION_KEY}.
+     *
+     * @param args optional overrides: {@code args[0]} is the Word file path,
+     *        {@code args[1]} is the working directory; the built-in defaults
+     *        are used for anything that is missing
+     */
+    public static void main(String[] args)
+    {
+        String filePath = argOrDefault(args, 0, DEFAULT_WORD_FILE);
+        String tempPath = argOrDefault(args, 1, DEFAULT_TEMP_DIR);
+
+        try
+        {
+            // Make sure the working directory exists before Aspose writes into it.
+            File tempDir = new File(tempPath);
+            if (!tempDir.isDirectory() && !tempDir.mkdirs())
+            {
+                throw new IOException("Cannot create working directory: " + tempDir);
+            }
+
+            // Export the whole Word document as a single HTML file.
+            String htmlFilePath = new File(tempDir, "test.html").getPath();
+            try (InputStream input = new FileInputStream(filePath))
+            {
+                Document doc = new Document(input);
+                doc.save(htmlFilePath, SaveFormat.HTML);
+            }
+
+            org.jsoup.nodes.Document htmlDoc = Jsoup.parse(getHtmlStrFromFile(htmlFilePath));
+            changeImageSrc(htmlDoc, tempPath);// inline the exported images
+            Map<String, String> sections = exactContentFromHtml(htmlDoc, 2);
+            System.out.println(sections.get(SECTION_KEY));
+        }
+        catch (Exception e)
+        {
+            // Top-level boundary of this command-line tool: report and exit.
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Returns {@code args[index]} when it is present, otherwise the fallback.
+     */
+    private static String argOrDefault(String[] args, int index, String fallback)
+    {
+        return args != null && args.length > index ? args[index] : fallback;
+    }
+
+    /**
+     * Splits the HTML into sections keyed by heading.
+     * <p>
+     * Starting at the first {@code h1} element, the following sibling elements
+     * are walked in order. Every {@code h1} (and, when {@code level} is 2,
+     * every {@code h2}) starts a new section; the outer HTML of all other
+     * siblings is appended to the current section. If two headings produce the
+     * same key, the later section replaces the earlier one.
+     *
+     * @param htmlDoc parsed HTML document
+     * @param level pass 2 to split on {@code h2} headings as well as {@code h1}
+     * @return insertion-ordered map with key {@code h1_<title>} or
+     *         {@code h2_<title>} and the section HTML as value; empty when the
+     *         document has no {@code h1} element
+     */
+    private static Map<String, String> exactContentFromHtml(org.jsoup.nodes.Document htmlDoc, int level)
+    {
+        Map<String, String> value = new LinkedHashMap<String, String>();
+
+        Elements headings = htmlDoc.getElementsByTag("h1");
+        if (headings.isEmpty())
+        {
+            // Same outcome as before (diagnostic on stderr, empty result)
+            // without throwing an exception only to catch it again.
+            System.err.println("上传的文件中不存在一级标题，请检查！");
+            return value;
+        }
+
+        Element first = headings.get(0);
+        // Normalise the first title exactly like every later one so that all
+        // keys follow the same rule.
+        String key = "h1_" + removeNullChar(first.text());
+        StringBuilder content = new StringBuilder();
+
+        for (Element node = first.nextElementSibling(); node != null; node = node.nextElementSibling())
+        {
+            String tagName = node.tagName();
+            boolean startsSection = "h1".equals(tagName) || (level == 2 && "h2".equals(tagName));
+            if (startsSection)
+            {
+                // Close the running section and open the next one.
+                value.put(key, content.toString());
+                content.setLength(0);
+                key = tagName + "_" + removeNullChar(node.text());
+                continue;
+            }
+            content.append(node.outerHtml());
+        }
+
+        // Store the section that was still open when the siblings ran out.
+        value.put(key, content.toString());
+        return value;
+    }
+
+    /**
+     * Removes every occurrence of the space character used in the pattern
+     * below and trims the remaining leading and trailing whitespace.
+     * <p>
+     * NOTE(review): the previous description also mentioned removing
+     * {@code *} characters; the code has never done that.
+     *
+     * @param text heading text, may be {@code null}
+     * @return the cleaned text, or {@code null} when {@code text} is {@code null}
+     */
+    private static String removeNullChar(String text)
+    {
+        if (text == null)
+        {
+            return null;
+        }
+        return text.replaceAll(" ", "").trim();
+    }
+
+    /**
+     * Replaces the {@code src} of every {@code img} element that points to a
+     * file below the given directory with a Base64 data URI of that file.
+     * <p>
+     * Images without a {@code src}, images that already carry a {@code data:}
+     * URI and images whose {@code src} does not resolve to an existing file
+     * (for example remote URLs) are left unchanged.
+     *
+     * @param htmlDoc org.jsoup.nodes.Document to modify in place
+     * @param file directory the image paths are relative to
+     * @throws IOException if an existing image file cannot be read
+     */
+    private static void changeImageSrc(org.jsoup.nodes.Document htmlDoc, String file) throws IOException
+    {
+        for (Element image : htmlDoc.getElementsByTag("img"))
+        {
+            String src = image.attr("src");
+            if (StringUtils.isEmpty(src) || src.startsWith("data:"))
+            {
+                continue;
+            }
+            // File(parent, child) joins the two parts with exactly one separator.
+            File imageFile = new File(file, src);
+            if (!imageFile.isFile())
+            {
+                continue;
+            }
+            image.attr("src", imageConvertBase64(imageFile.getPath()));
+        }
+    }
+
+    /**
+     * Reads an image file and returns it as a Base64 data URI.
+     *
+     * @param filePath path of the image file
+     * @return {@code data:<media type>;base64,<content>}
+     * @throws IOException if the file cannot be opened or read
+     */
+    private static String imageConvertBase64(String filePath) throws IOException
+    {
+        byte[] data;
+        try (InputStream in = new FileInputStream(new File(filePath)))
+        {
+            // Read until end of stream; available() plus a single read() is
+            // not guaranteed to deliver the whole file.
+            data = IOUtils.toByteArray(in);
+        }
+        // The basic encoder never inserts line separators, so the result
+        // needs no whitespace clean-up.
+        return "data:" + imageMediaType(filePath) + ";base64," + Base64.getEncoder().encodeToString(data);
+    }
+
+    /**
+     * Derives the media type for a data URI from the file extension.
+     */
+    private static String imageMediaType(String filePath)
+    {
+        // Look at the file name only so that a dot in a directory name is
+        // not mistaken for the start of the extension.
+        String fileName = new File(filePath).getName();
+        int dot = fileName.lastIndexOf('.');
+        if (dot < 0 || dot == fileName.length() - 1)
+        {
+            // No extension: fall back to the generic binary type.
+            return "application/octet-stream";
+        }
+        String extension = fileName.substring(dot + 1);
+        if ("jpg".equalsIgnoreCase(extension))
+        {
+            return "image/jpeg";
+        }
+        if ("svg".equalsIgnoreCase(extension))
+        {
+            return "image/svg+xml";
+        }
+        return "image/" + extension;
+    }
+
+    /**
+     * Reads a whole file as UTF-8 text.
+     *
+     * @param filePath path of the HTML file
+     * @return the file content
+     * @throws IOException if the file cannot be opened or read
+     */
+    private static String getHtmlStrFromFile(String filePath) throws IOException
+    {
+        try (InputStream in = new FileInputStream(new File(filePath)))
+        {
+            return IOUtils.toString(in, "UTF-8");
+        }
+    }
+}
+
+