POI之Word转化为Markdown-yellowcong

寿丰
2023-12-01

Markdown最近特别火,同时我也想把自己以前做的笔记(用doc写的)分享出来,所以我想先解析DOC,生成Markdown文件,然后通过代码直接将图片上传到七牛云,将文本数据传到野狗云CSND。为啥我喜欢用七牛和野狗呢?因为他们都有免费的份额。PS(没有写完)

环境搭建

<!-- Excel: POI core artifact -->
    <dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.17</version>
</dependency>

<!-- Word: POI scratchpad artifact -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.17</version>
</dependency>

<!-- xlsx: POI OOXML artifact -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.17</version>
</dependency>
<!-- The xlsx support above depends on this schemas artifact -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.17</version>
</dependency>

获取标题等级

Doc中的标题是分等级的,Markdown中也有对应的标题等级(例如 #、##、###),所以需要先判断每个段落的标题等级

// Heading detection table: a paragraph is treated as a heading of level N when the
// font name and font size of its first character run match the level-N pair below.
// Sizes are the raw values returned by CharacterRun.getFontSize().
// NOTE(review): HWPF appears to report sizes in half-points (48 -> 24pt) - confirm.
// Primitive ints are used so that comparisons never involve boxed Integer identity.

// ---- Level-1 heading: begin ----
private static final String STYLE_TITLE_FONT_FONTNAME_1 = "宋体";
private static final int STYLE_TITLE_FONT_SIZE_1 = 48;
private static final int TITLE_LV_1 = 1;
// ---- Level-1 heading: end ----

// ---- Level-2 heading: begin ----
private static final String STYLE_TITLE_FONT_FONTNAME_2 = "Arial";
private static final int STYLE_TITLE_FONT_SIZE_2 = 32;
private static final int TITLE_LV_2 = 2;
// ---- Level-2 heading: end ----

// ---- Level-3 heading: begin ----
private static final String STYLE_TITLE_FONT_FONTNAME_3 = "Times New Roman";
private static final int STYLE_TITLE_FONT_SIZE_3 = 32;
private static final int TITLE_LV_3 = 3;
// ---- Level-3 heading: end ----

/** Level reported for ordinary body text that is not a heading. */
private static final int TITLE_LV_0 = 0;

/**
 * Determines the heading level of a paragraph from the style of its first character run.
 *
 * <p>The paragraph is a heading only when both the font name and the font size of that
 * first run match one of the configured heading styles.
 *
 * @param paragraph the paragraph to classify
 * @return 1, 2 or 3 for a heading of that level; 0 for plain text, for a paragraph
 *         without character runs, or when no font name is available
 */
private static Integer getTitleLvl(Paragraph paragraph) {
    // Nothing to inspect when the paragraph has no character runs.
    if (paragraph.numCharacterRuns() == 0) {
        return TITLE_LV_0;
    }

    CharacterRun firstRun = paragraph.getCharacterRun(0);
    if (firstRun == null) {
        return TITLE_LV_0;
    }

    String fontName = firstRun.getFontName();
    int fontSize = firstRun.getFontSize();

    // Constant-first equals(): getFontName() may return null, and such a run must be
    // classified as plain text instead of raising a NullPointerException.
    if (STYLE_TITLE_FONT_FONTNAME_1.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_1) {
        return TITLE_LV_1;
    }
    if (STYLE_TITLE_FONT_FONTNAME_2.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_2) {
        return TITLE_LV_2;
    }
    if (STYLE_TITLE_FONT_FONTNAME_3.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_3) {
        return TITLE_LV_3;
    }
    return TITLE_LV_0;
}

获取每行文字的每段落字符的样式

Markdown中的文字都是带有样式的,同一行里的不同文字也可能有不同的样式,所以需要通过Range逐段获取CharacterRun(这个类中包含了字体信息)

// Number of paragraphs in the range; in Word every carriage return ends a paragraph.
int paraNum = range.numParagraphs();
System.out.println(paraNum);
for (int i = 0; i < paraNum; i++) {
    Paragraph paragraph = range.getParagraph(i);

    // A paragraph without character runs has nothing to print; skip it and keep going
    // (returning here would silently drop every following paragraph).
    int charCnts = paragraph.numCharacterRuns();
    if (charCnts == 0) {
        continue;
    }

    // Heading level of this paragraph: 1, 2 or 3, or 0 for plain text.
    // Not consumed yet; kept for the Markdown heading output that is still to be written.
    int titleLev = getTitleLvl(paragraph);

    // Walk the runs of THIS paragraph only. A CharacterRun is a stretch of text that
    // shares one font/style, so a paragraph is split into runs wherever the style changes.
    for (int c = 0; c < charCnts; c++) {
        CharacterRun characterRun = paragraph.getCharacterRun(c);
        if (characterRun == null) {
            continue;
        }
        String text = characterRun.text();
        String fontName = characterRun.getFontName();
        int fontSize = characterRun.getFontSize();
        int fontColor = characterRun.getColor();
        boolean bold = characterRun.isBold();
        boolean italic = characterRun.isItalic();

        // One format specifier per argument: text, font name, size, colour, bold, italic.
        System.out.printf("当前文字:%s,字体名称:%s,字体大小%d,字体颜色%d,加粗%b,斜体%b\r\n",
                text, fontName, fontSize, fontColor, bold, italic);
    }
    // TODO: image handling

    // TODO: table handling

}

完整代码

这部分代码还没有完全写完

package com.yellowcong.test;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.UUID;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Bookmark;
import org.apache.poi.hwpf.usermodel.Bookmarks;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;

import com.sun.xml.internal.messaging.saaj.util.ByteInputStream;
import com.yellowcong.utils.DocUtils;

/**
 * Experimental reader for Word {@code .doc} files built on Apache POI HWPF.
 *
 * <p>It walks the paragraphs and character runs of a document, classifies paragraphs as
 * headings by font name and size, and prints the style of every run. Image and table
 * conversion are not implemented yet.
 */
public class DocTest {

    /** Directory that holds the sample document; extracted image files are written here too. */
    private static final String BASE_PATH = "D:\\笔记\\服务器学习\\bae\\";

    // Heading detection table: a paragraph is a heading of level N when the font name and
    // font size of its first character run match the level-N pair below. Sizes are the raw
    // values returned by CharacterRun.getFontSize().
    // NOTE(review): HWPF appears to report sizes in half-points (48 -> 24pt) - confirm.

    // ---- Level-1 heading: begin ----
    private static final String STYLE_TITLE_FONT_FONTNAME_1 = "宋体";
    private static final int STYLE_TITLE_FONT_SIZE_1 = 48;
    private static final int TITLE_LV_1 = 1;
    // ---- Level-1 heading: end ----

    // ---- Level-2 heading: begin ----
    private static final String STYLE_TITLE_FONT_FONTNAME_2 = "Arial";
    private static final int STYLE_TITLE_FONT_SIZE_2 = 32;
    private static final int TITLE_LV_2 = 2;
    // ---- Level-2 heading: end ----

    // ---- Level-3 heading: begin ----
    private static final String STYLE_TITLE_FONT_FONTNAME_3 = "Times New Roman";
    private static final int STYLE_TITLE_FONT_SIZE_3 = 32;
    private static final int TITLE_LV_3 = 3;
    // ---- Level-3 heading: end ----

    /** Level reported for ordinary body text that is not a heading. */
    private static final int TITLE_LV_0 = 0;

    /**
     * Opens the sample document, passes the file to {@code DocUtils.copyDocToHtml}
     * (project helper, not shown here) and prints the style of every character run.
     *
     * @param args unused
     * @throws Exception if the document cannot be opened, read or closed
     */
    public static void main(String[] args) throws Exception {
        File file = new File(BASE_PATH + "BAE 服务器文件目录问题.doc");

        // try-with-resources: both the stream and the document are closed even on failure.
        try (InputStream in = new FileInputStream(file);
                HWPFDocument doc = new HWPFDocument(in)) {
            DocUtils.copyDocToHtml(file);
            printInfo(doc.getRange());
        }
    }

    /**
     * Appends the text {@code "Hello"} after the given range. The change is applied to the
     * in-memory document only.
     *
     * @param range the range to append to
     */
    private static void insertInfo(Range range) {
        range.insertAfter("Hello");
    }

    /**
     * Prints the paragraph count of the range, then the style of every character run,
     * paragraph by paragraph. In Word every carriage return ends a paragraph.
     *
     * @param range the range to print
     */
    private static void printInfo(Range range) {
        int paraNum = range.numParagraphs();
        System.out.println(paraNum);
        for (int i = 0; i < paraNum; i++) {
            Paragraph paragraph = range.getParagraph(i);

            // A paragraph without character runs has nothing to print; skip it and keep
            // going (returning here would silently drop every following paragraph).
            int charCnts = paragraph.numCharacterRuns();
            if (charCnts == 0) {
                continue;
            }

            // Heading level of this paragraph: 1, 2 or 3, or 0 for plain text.
            // Not consumed yet; kept for the Markdown heading output still to be written.
            int titleLev = getTitleLvl(paragraph);

            // Walk the runs of THIS paragraph only. A CharacterRun is a stretch of text
            // sharing one font/style, so a paragraph splits wherever the style changes.
            for (int c = 0; c < charCnts; c++) {
                CharacterRun characterRun = paragraph.getCharacterRun(c);
                if (characterRun == null) {
                    continue;
                }
                String text = characterRun.text();
                String fontName = characterRun.getFontName();
                int fontSize = characterRun.getFontSize();
                int fontColor = characterRun.getColor();
                boolean bold = characterRun.isBold();
                boolean italic = characterRun.isItalic();

                // One format specifier per argument: text, font name, size, colour,
                // bold, italic.
                System.out.printf("当前文字:%s,字体名称:%s,字体大小%d,字体颜色%d,加粗%b,斜体%b\r\n",
                        text, fontName, fontSize, fontColor, bold, italic);
            }
            // TODO: image handling

            // TODO: table handling
        }
    }

    /**
     * Holder for the style information of one piece of text.
     *
     * <p>Declared static so it can be instantiated from the static methods of this class
     * without an enclosing {@code DocTest} instance.
     *
     * @author yellowcong
     */
    static class FontStyle {
        private String fontColor;
        private String fontName;
        private Integer fontSize;
        /** Whether the text is bold. */
        private boolean bold;
        /** Whether the text is italic. */
        private boolean italic;

        public String getFontColor() {
            return fontColor;
        }

        public void setFontColor(String fontColor) {
            this.fontColor = fontColor;
        }

        public String getFontName() {
            return fontName;
        }

        public void setFontName(String fontName) {
            this.fontName = fontName;
        }

        public Integer getFontSize() {
            return fontSize;
        }

        public void setFontSize(Integer fontSize) {
            this.fontSize = fontSize;
        }

        public boolean isBold() {
            return bold;
        }

        public void setBold(boolean bold) {
            this.bold = bold;
        }

        public boolean isItalic() {
            return italic;
        }

        public void setItalic(boolean italic) {
            this.italic = italic;
        }
    }

    /**
     * Determines the heading level of a paragraph from the style of its first character run.
     *
     * <p>The paragraph is a heading only when both the font name and the font size of that
     * first run match one of the configured heading styles.
     *
     * @param paragraph the paragraph to classify
     * @return 1, 2 or 3 for a heading of that level; 0 for plain text, for a paragraph
     *         without character runs, or when no font name is available
     */
    private static Integer getTitleLvl(Paragraph paragraph) {
        // Nothing to inspect when the paragraph has no character runs.
        if (paragraph.numCharacterRuns() == 0) {
            return TITLE_LV_0;
        }

        CharacterRun firstRun = paragraph.getCharacterRun(0);
        if (firstRun == null) {
            return TITLE_LV_0;
        }

        String fontName = firstRun.getFontName();
        int fontSize = firstRun.getFontSize();

        // Constant-first equals(): getFontName() may return null, and such a run must be
        // classified as plain text instead of raising a NullPointerException.
        if (STYLE_TITLE_FONT_FONTNAME_1.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_1) {
            return TITLE_LV_1;
        }
        if (STYLE_TITLE_FONT_FONTNAME_2.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_2) {
            return TITLE_LV_2;
        }
        if (STYLE_TITLE_FONT_FONTNAME_3.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_3) {
            return TITLE_LV_3;
        }
        return TITLE_LV_0;
    }

    /** Placeholder that is not implemented yet; it currently does nothing. */
    private static void getFontLive() {

    }

    /**
     * Writes the given bytes to a new {@code .jpg} file under {@link #BASE_PATH}. The file
     * name is the first six characters of a random UUID.
     *
     * @param imgByte the raw image bytes to store
     * @throws Exception if the file cannot be created or written
     */
    public static void copyByteToFile(byte[] imgByte) throws Exception {
        String fileName = UUID.randomUUID().toString().substring(0, 6);
        File target = new File(BASE_PATH + fileName + ".jpg");

        // The bytes are already in memory, so they are written directly; the stream is
        // closed (and thereby flushed) even when the write fails.
        try (OutputStream out = new FileOutputStream(target)) {
            out.write(imgByte);
        }
    }

    /**
     * Prints the number of bookmarks followed by the name, start offset and end offset
     * of each bookmark.
     *
     * @param bookmarks the bookmarks to print
     */
    private void printInfo(Bookmarks bookmarks) {
        int count = bookmarks.getBookmarksCount();
        System.out.println("书签数量:" + count);
        for (int i = 0; i < count; i++) {
            Bookmark bookmark = bookmarks.getBookmark(i);
            System.out.println("书签" + (i + 1) + "的名称是:" + bookmark.getName());
            System.out.println("开始位置:" + bookmark.getStart());
            System.out.println("结束位置:" + bookmark.getEnd());
        }
    }

    /**
     * Prints the trimmed text of every cell of every table found in the range, row by row.
     *
     * @param range the range to scan for tables
     */
    private static void readTable(Range range) {
        TableIterator tableIter = new TableIterator(range);
        while (tableIter.hasNext()) {
            Table table = tableIter.next();
            int rowNum = table.numRows();
            for (int j = 0; j < rowNum; j++) {
                TableRow row = table.getRow(j);
                int cellNum = row.numCells();
                for (int k = 0; k < cellNum; k++) {
                    TableCell cell = row.getCell(k);
                    System.out.println(cell.text().trim());
                }
            }
        }
    }

    /**
     * Prints the text of every paragraph in the range that belongs to a list.
     *
     * @param range the range to scan
     */
    private static void readList(Range range) {
        int num = range.numParagraphs();
        for (int i = 0; i < num; i++) {
            Paragraph para = range.getParagraph(i);
            if (para.isInList()) {
                System.out.println("list: " + para.text());
            }
        }
    }

}
 类似资料: