Markdown最近特别火,同时我也想把自己以前做的笔记(用doc写的)分享出来,所以我想解析DOC文件并生成Markdown文件,然后通过代码直接将图片上传到七牛云,将文本数据传到野狗云和CSDN。为啥我喜欢用七牛和野狗呢?因为它们都有免费的份额。PS:这篇文章还没有写完。
<!-- Excel: Apache POI core artifact -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.17</version>
</dependency>
<!-- Word: poi-scratchpad (presumably the artifact that supplies the org.apache.poi.hwpf classes used for .doc parsing; confirm) -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.17</version>
</dependency>
<!-- xlsx: poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.17</version>
</dependency>
<!-- The xlsx support (poi-ooxml) depends on this artifact -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.17</version>
</dependency>
Doc中的标题有等级,Markdown中也有对应的标题等级。
/* ---- Level-1 heading: begin ---- */
/** Font name that marks a level-1 heading. */
private static final String STYLE_TITLE_FONT_FONTNAME_1 ="宋体";
/** Font size that marks a level-1 heading, in the unit reported by CharacterRun.getFontSize(). */
private static final Integer STYLE_TITLE_FONT_SIZE_1 =48;
/** Level value reported for a level-1 heading. */
private static final Integer TITLE_LV_1 = 1;
/* ---- Level-1 heading: end ---- */
/* ---- Level-2 heading: begin ---- */
/** Font name that marks a level-2 heading. */
private static final String STYLE_TITLE_FONT_FONTNAME_2 ="Arial";
/** Font size that marks a level-2 heading, in the unit reported by CharacterRun.getFontSize(). */
private static final Integer STYLE_TITLE_FONT_SIZE_2 =32;
/** Level value reported for a level-2 heading. */
private static final Integer TITLE_LV_2 = 2;
/* ---- Level-2 heading: end ---- */
/* ---- Level-3 heading: begin ---- */
/** Font name that marks a level-3 heading. */
private static final String STYLE_TITLE_FONT_FONTNAME_3 ="Times New Roman";
/** Font size that marks a level-3 heading, in the unit reported by CharacterRun.getFontSize(). */
private static final Integer STYLE_TITLE_FONT_SIZE_3 =32;
/** Level value reported for a level-3 heading. */
private static final Integer TITLE_LV_3 = 3;
/* ---- Level-3 heading: end ---- */
/** Level value reported for ordinary text that is not a heading. */
private static final Integer TITLE_LV_0 = 0;
/**
 * Determines the heading level of a paragraph from the font of its first character run.
 *
 * <p>The font name and size of the first run are compared with the three
 * STYLE_TITLE_FONT_* name/size pairs; the first pair that matches decides the level.
 *
 * @param paragraph the paragraph to classify
 * @return 1, 2 or 3 for a heading of that level; 0 when the paragraph has no character
 *         runs, its first run is unavailable, or the font matches no heading style
 */
private static Integer getTitleLvl(Paragraph paragraph) {
    // A paragraph without character runs carries no font information to inspect.
    if (paragraph.numCharacterRuns() == 0) {
        return TITLE_LV_0;
    }
    CharacterRun firstRun = paragraph.getCharacterRun(0);
    if (firstRun == null) {
        return TITLE_LV_0;
    }
    String fontName = firstRun.getFontName();
    int fontSize = firstRun.getFontSize();
    // The constant is the receiver of equals() so that a null font name is treated as
    // "no match" instead of raising a NullPointerException.
    if (STYLE_TITLE_FONT_FONTNAME_1.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_1) {
        return TITLE_LV_1;
    }
    if (STYLE_TITLE_FONT_FONTNAME_2.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_2) {
        return TITLE_LV_2;
    }
    if (STYLE_TITLE_FONT_FONTNAME_3.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_3) {
        return TITLE_LV_3;
    }
    return TITLE_LV_0;
}
Markdown中的文字都是带样式的,所以每一行都可能有不同的样式。在Doc中,可以通过 `Range` 获取 `CharacterRun`(这个类中包含了字体信息)。
// Number of paragraphs; in Word every carriage return ends a paragraph.
int paraNum = range.numParagraphs();
System.out.println(paraNum);
for (int i = 0; i < paraNum; i++) {
    Paragraph paragraph = range.getParagraph(i);
    // Number of character runs in this paragraph.
    int runCount = paragraph.numCharacterRuns();
    if (runCount == 0) {
        // Skip only this paragraph; returning here would end the scan at the first empty one.
        continue;
    }
    // TODO: use getTitleLvl(paragraph) (1, 2, 3 = heading level, 0 = ordinary text) once
    // Markdown heading markers are emitted.
    // Walk the runs of THIS paragraph; iterating the runs of the whole range here would
    // print every run of the document again for each paragraph.
    // The "startOffset >= skipUntil" guard was dropped: skipUntil was a constant -1 that was
    // never reassigned, so it is not expected to filter out any run.
    for (int c = 0; c < runCount; c++) {
        // A CharacterRun is a stretch of text that shares the same font and style.
        CharacterRun characterRun = paragraph.getCharacterRun(c);
        if (characterRun == null) {
            continue;
        }
        String text = characterRun.text();
        String fontName = characterRun.getFontName();
        int fontSize = characterRun.getFontSize();
        int fontColor = characterRun.getColor();
        boolean bold = characterRun.isBold();
        boolean italic = characterRun.isItalic();
        // One specifier per argument: previously the text consumed the only %s and the
        // font name was passed to %d, which throws IllegalFormatConversionException.
        System.out.printf("当前文字:%s,字体名称:%s,字体大小%d,字体颜色%d,加粗%b,斜体%b\r\n",
                text, fontName, fontSize, fontColor, bold, italic);
    }
    // TODO: picture handling
    // TODO: table handling
}
这部分代码还没有完全写完
package com.yellowcong.test;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.UUID;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Bookmark;
import org.apache.poi.hwpf.usermodel.Bookmarks;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;
import com.sun.xml.internal.messaging.saaj.util.ByteInputStream;
import com.yellowcong.utils.DocUtils;
public class DocTest {
/** Base directory: the sample document is read from here and extracted image files are written here. */
private static final String BASE_PATH = "D:\\笔记\\服务器学习\\bae\\";
/* ---- Level-1 heading: begin ---- */
/** Font name that marks a level-1 heading. */
private static final String STYLE_TITLE_FONT_FONTNAME_1 ="宋体";
/** Font size that marks a level-1 heading, in the unit reported by CharacterRun.getFontSize(). */
private static final Integer STYLE_TITLE_FONT_SIZE_1 =48;
/** Level value reported for a level-1 heading. */
private static final Integer TITLE_LV_1 = 1;
/* ---- Level-1 heading: end ---- */
/* ---- Level-2 heading: begin ---- */
/** Font name that marks a level-2 heading. */
private static final String STYLE_TITLE_FONT_FONTNAME_2 ="Arial";
/** Font size that marks a level-2 heading, in the unit reported by CharacterRun.getFontSize(). */
private static final Integer STYLE_TITLE_FONT_SIZE_2 =32;
/** Level value reported for a level-2 heading. */
private static final Integer TITLE_LV_2 = 2;
/* ---- Level-2 heading: end ---- */
/* ---- Level-3 heading: begin ---- */
/** Font name that marks a level-3 heading. */
private static final String STYLE_TITLE_FONT_FONTNAME_3 ="Times New Roman";
/** Font size that marks a level-3 heading, in the unit reported by CharacterRun.getFontSize(). */
private static final Integer STYLE_TITLE_FONT_SIZE_3 =32;
/** Level value reported for a level-3 heading. */
private static final Integer TITLE_LV_3 = 3;
/* ---- Level-3 heading: end ---- */
/** Level value reported for ordinary text that is not a heading. */
private static final Integer TITLE_LV_0 = 0;
/**
 * Entry point: opens the sample Word document under BASE_PATH, passes the file to
 * DocUtils.copyDocToHtml and prints the character-run information of the document.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the document cannot be opened, read or converted
 */
public static void main(String[] args) throws Exception {
    File file = new File(BASE_PATH + "BAE 服务器文件目录问题.doc");
    // try-with-resources: the stream and the document are closed even when a later step throws.
    try (FileInputStream in = new FileInputStream(file);
            HWPFDocument doc = new HWPFDocument(in)) {
        DocUtils.copyDocToHtml(file);
        Range range = doc.getRange();
        printInfo(range);
    }
}
/**
 * Appends a fixed greeting after the given range. Only the Range object is modified;
 * this method does not save anything.
 *
 * @param range the range after which the text is inserted
 */
private static void insertInfo(Range range) {
    final String appended = "Hello";
    range.insertAfter(appended);
}
/**
 * Prints the paragraph count of a range, then one line per character run with the run's
 * text, font name, font size, colour, bold flag and italic flag.
 *
 * <p>In Word every carriage return ends a paragraph. Paragraphs without character runs
 * are skipped.
 *
 * @param range the document range to dump
 */
private static void printInfo(Range range) {
    int paraNum = range.numParagraphs();
    System.out.println(paraNum);
    for (int i = 0; i < paraNum; i++) {
        Paragraph paragraph = range.getParagraph(i);
        int runCount = paragraph.numCharacterRuns();
        if (runCount == 0) {
            // Skip only this paragraph; returning here would end the scan at the first empty one.
            continue;
        }
        // TODO: use getTitleLvl(paragraph) (1, 2, 3 = heading level, 0 = ordinary text) once
        // Markdown heading markers are emitted.
        // Walk the runs of THIS paragraph; iterating the runs of the whole range here would
        // print every run of the document again for each paragraph.
        // The "startOffset >= skipUntil" guard was dropped: skipUntil was a constant -1 that
        // was never reassigned, so it is not expected to filter out any run.
        for (int c = 0; c < runCount; c++) {
            // A CharacterRun is a stretch of text that shares the same font and style.
            CharacterRun characterRun = paragraph.getCharacterRun(c);
            if (characterRun == null) {
                continue;
            }
            String text = characterRun.text();
            String fontName = characterRun.getFontName();
            int fontSize = characterRun.getFontSize();
            int fontColor = characterRun.getColor();
            boolean bold = characterRun.isBold();
            boolean italic = characterRun.isItalic();
            // One specifier per argument: previously the text consumed the only %s and the
            // font name was passed to %d, which throws IllegalFormatConversionException.
            System.out.printf("当前文字:%s,字体名称:%s,字体大小%d,字体颜色%d,加粗%b,斜体%b\r\n",
                    text, fontName, fontSize, fontColor, bold, italic);
        }
        // TODO: picture handling
        // TODO: table handling
    }
}
/**
 * Plain holder for the style attributes of one piece of text: colour, font name,
 * font size, bold and italic.
 *
 * @author yellowcong
 * @date 2017-07-15
 */
class FontStyle{
    /** Font name. */
    private String fontName;
    /** Font size. */
    private Integer fontSize;
    /** Font colour. */
    private String fontColor;
    /** Whether the text is bold. */
    private boolean bold;
    /** Whether the text is italic. */
    private boolean italic;

    /** @return the font name, or null if none was set */
    public String getFontName() {
        return this.fontName;
    }

    /** @param fontName the font name to store */
    public void setFontName(String fontName) {
        this.fontName = fontName;
    }

    /** @return the font size, or null if none was set */
    public Integer getFontSize() {
        return this.fontSize;
    }

    /** @param fontSize the font size to store */
    public void setFontSize(Integer fontSize) {
        this.fontSize = fontSize;
    }

    /** @return the font colour, or null if none was set */
    public String getFontColor() {
        return this.fontColor;
    }

    /** @param fontColor the font colour to store */
    public void setFontColor(String fontColor) {
        this.fontColor = fontColor;
    }

    /** @return true if the text is bold */
    public boolean isBold() {
        return this.bold;
    }

    /** @param bold whether the text is bold */
    public void setBold(boolean bold) {
        this.bold = bold;
    }

    /** @return true if the text is italic */
    public boolean isItalic() {
        return this.italic;
    }

    /** @param italic whether the text is italic */
    public void setItalic(boolean italic) {
        this.italic = italic;
    }
}
/**
 * Determines the heading level of a paragraph from the font of its first character run.
 *
 * <p>The font name and size of the first run are compared with the three
 * STYLE_TITLE_FONT_* name/size pairs; the first pair that matches decides the level.
 *
 * @param paragraph the paragraph to classify
 * @return 1, 2 or 3 for a heading of that level; 0 when the paragraph has no character
 *         runs, its first run is unavailable, or the font matches no heading style
 */
private static Integer getTitleLvl(Paragraph paragraph) {
    // A paragraph without character runs carries no font information to inspect.
    if (paragraph.numCharacterRuns() == 0) {
        return TITLE_LV_0;
    }
    CharacterRun firstRun = paragraph.getCharacterRun(0);
    if (firstRun == null) {
        return TITLE_LV_0;
    }
    String fontName = firstRun.getFontName();
    int fontSize = firstRun.getFontSize();
    // The constant is the receiver of equals() so that a null font name is treated as
    // "no match" instead of raising a NullPointerException.
    if (STYLE_TITLE_FONT_FONTNAME_1.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_1) {
        return TITLE_LV_1;
    }
    if (STYLE_TITLE_FONT_FONTNAME_2.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_2) {
        return TITLE_LV_2;
    }
    if (STYLE_TITLE_FONT_FONTNAME_3.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_3) {
        return TITLE_LV_3;
    }
    return TITLE_LV_0;
}
/** Placeholder with an empty body; it currently does nothing. */
private static void getFontLive(){
}
/**
 * Writes the given bytes to a new file under BASE_PATH. The file name is the first six
 * characters of a random UUID followed by ".jpg".
 *
 * @param imgByte the bytes to write; must not be null
 * @throws NullPointerException if imgByte is null
 * @throws Exception if the file cannot be created or written
 */
public static void copyByteToFile(byte[] imgByte) throws Exception {
    // Fail before any file is created, as the original did when wrapping a null array.
    if (imgByte == null) {
        throw new NullPointerException("imgByte must not be null");
    }
    String fileName = UUID.randomUUID().toString().substring(0, 6);
    File target = new File(BASE_PATH + fileName + ".jpg");
    // The array is already in memory, so it is written directly; no intermediate input
    // stream (and no JDK-internal class) is needed. try-with-resources closes the file
    // even when the write fails.
    try (OutputStream out = new FileOutputStream(target)) {
        out.write(imgByte);
    }
}
/**
 * Prints the number of bookmarks, then the name, start position and end position of
 * each bookmark in order.
 *
 * @param bookmarks the bookmark collection to print
 */
private void printInfo(Bookmarks bookmarks) {
    final int total = bookmarks.getBookmarksCount();
    System.out.println("书签数量:" + total);
    int ordinal = 1;
    while (ordinal <= total) {
        // Bookmarks are indexed from 0 but numbered from 1 in the output.
        Bookmark current = bookmarks.getBookmark(ordinal - 1);
        System.out.println("书签" + ordinal + "的名称是:" + current.getName());
        System.out.println("开始位置:" + current.getStart());
        System.out.println("结束位置:" + current.getEnd());
        ordinal++;
    }
}
/**
 * Prints the trimmed text of every cell of every table found in the range, one cell per
 * line, in table, row and cell order.
 *
 * <p>Author's note: every carriage return is a paragraph, so each table cell holds at
 * least one paragraph and each row ends with one.
 *
 * @param range the range whose tables are read
 */
private static void readTable(Range range) {
    for (TableIterator tables = new TableIterator(range); tables.hasNext();) {
        Table currentTable = tables.next();
        for (int r = 0, rows = currentTable.numRows(); r < rows; r++) {
            TableRow currentRow = currentTable.getRow(r);
            for (int col = 0, cells = currentRow.numCells(); col < cells; col++) {
                String cellText = currentRow.getCell(col).text().trim();
                System.out.println(cellText);
            }
        }
    }
}
/**
 * Prints the text of every paragraph in the range that belongs to a list, prefixed
 * with "list: ".
 *
 * @param range the range whose paragraphs are inspected
 */
private static void readList(Range range) {
    final int total = range.numParagraphs();
    for (int idx = 0; idx < total; idx++) {
        Paragraph candidate = range.getParagraph(idx);
        if (!candidate.isInList()) {
            continue;
        }
        System.out.println("list: " + candidate.text());
    }
}
}