代码实现主要依赖两个第三方jar包,一个是apache-poi,一个是aspose-pdf。apache-poi包完全开源免费;aspose-pdf免费版生成的文件带有水印,且生成数量有限制。如果单纯用apache-poi实现pdf转word,实现非常复杂,而且转换后的样式与原文件保持一致的比例很低。所以,我先用aspose-pdf生成带水印的docx文件,再用apache-poi去除docx文件中由aspose-pdf生成的水印,最终得到一个无水印的word文件。
项目远程仓库
aspose-pdf 这个需要配置单独的仓库地址才能下载,不会配置的可以去官网直接下载jar引入项目代码中。
<repositories>
<repository>
<id>AsposeJavaAPI</id>
<name>Aspose Java API</name>
<url>https://repository.aspose.com/repo/</url>
</repository>
</repositories>
Maven项目pom文件依赖
<!-- https://mvnrepository.com/artifact/com.aspose/aspose-pdf -->
<dependency>
<groupId>com.aspose</groupId>
<artifactId>aspose-pdf</artifactId>
<version>21.8</version>
</dependency>
<!-- poi-ooxml 是 Apache POI 中用于处理 OOXML 格式(docx、xlsx、pptx)的模块 -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
核心代码实现
import com.aspose.pdf.Document;
import com.aspose.pdf.SaveFormat;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import java.io.*;
import java.util.List;
public class PDFHelper3 {
/**
 * Demo entry point: converts the sample file {@code test.pdf} to a Word document.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) throws IOException {
    final String samplePdf = "test.pdf";
    pdf2doc(samplePdf);
}
/**
 * Removes the Aspose evaluation text watermark from a .docx file, rewriting the file in place.
 *
 * <p>Only body paragraphs whose complete text equals the evaluation banner are modified:
 * the first text segment of each run in such a paragraph is replaced by an empty string.
 *
 * @param file the .docx file to clean; it is overwritten with the cleaned document
 * @return {@code true} if the file was read and rewritten successfully,
 *         {@code false} if an I/O error occurred (the stack trace is printed)
 */
public static boolean removeWatermark(File file) {
    final String watermark =
            "Evaluation Only. Created with Aspose.PDF. Copyright 2002-2021 Aspose Pty Ltd.";
    try {
        // Parse the document and release the input stream before the same file is reopened
        // for writing.
        XWPFDocument loaded;
        try (FileInputStream in = new FileInputStream(file)) {
            loaded = new XWPFDocument(in);
        }
        try (XWPFDocument doc = loaded) {
            // Body paragraphs only.
            for (XWPFParagraph paragraph : doc.getParagraphs()) {
                if (watermark.equals(paragraph.getText())) {
                    for (XWPFRun run : paragraph.getRuns()) {
                        run.setText("", 0);
                    }
                }
            }
            try (FileOutputStream outStream = new FileOutputStream(file)) {
                doc.write(outStream);
            }
        }
        return true;
    } catch (IOException e) {
        e.printStackTrace();
        // Report the failure instead of claiming success.
        return false;
    }
}
/**
 * Converts a PDF file to a .docx file next to it (same path, extension replaced) and then
 * strips the Aspose evaluation watermark from the result.
 *
 * <p>Original author's note: at most 21 pages are currently supported (limit not verified here).
 * Failures are reported on standard output and the stack trace is printed; nothing is thrown.
 *
 * @param pdfPath path of the source PDF; must contain a '.' so the extension can be replaced
 */
public static void pdf2doc(String pdfPath) {
    long old = System.currentTimeMillis();
    try {
        // Target Word file: source path with its extension swapped for ".docx".
        String wordPath = pdfPath.substring(0, pdfPath.lastIndexOf(".")) + ".docx";
        // Load the PDF before creating the output file, so a failed load leaves no empty .docx.
        Document doc = new Document(pdfPath);
        try (FileOutputStream os = new FileOutputStream(wordPath)) {
            doc.save(os, SaveFormat.DocX);
        } finally {
            doc.close();
        }
        // Remove the evaluation watermark from the generated file.
        removeWatermark(new File(wordPath));
        // Elapsed time.
        long now = System.currentTimeMillis();
        System.out.println("Pdf 转 Word 共耗时:" + ((now - old) / 1000.0) + "秒");
    } catch (Exception e) {
        System.out.println("Pdf 转 Word 失败...");
        e.printStackTrace();
    }
}
/**
 * Converts a PDF file to a .pptx file next to it (same path, extension replaced).
 *
 * <p>Failures are reported on standard output and the stack trace is printed; nothing is thrown.
 *
 * @param pdfPath path of the source PDF; must contain a '.' so the extension can be replaced
 */
public static void pdf2ppt(String pdfPath) {
    long old = System.currentTimeMillis();
    try {
        // Target presentation: source path with its extension swapped for ".pptx".
        String pptPath = pdfPath.substring(0, pdfPath.lastIndexOf(".")) + ".pptx";
        // Load the PDF before creating the output file, so a failed load leaves no empty .pptx.
        Document doc = new Document(pdfPath);
        try (FileOutputStream os = new FileOutputStream(pptPath)) {
            doc.save(os, SaveFormat.Pptx);
        } finally {
            doc.close();
        }
        // Elapsed time.
        long now = System.currentTimeMillis();
        System.out.println("Pdf 转 PPT 共耗时:" + ((now - old) / 1000.0) + "秒");
    } catch (Exception e) {
        System.out.println("Pdf 转 PPT 失败...");
        e.printStackTrace();
    }
}
/**
 * Converts a PDF file to an .xlsx file next to it (same path, extension replaced).
 *
 * <p>Failures are reported on standard output and the stack trace is printed; nothing is thrown.
 *
 * @param pdfPath path of the source PDF; must contain a '.' so the extension can be replaced
 */
public static void pdf2excel(String pdfPath) {
    long old = System.currentTimeMillis();
    try {
        // Target workbook: source path with its extension swapped for ".xlsx".
        String excelPath = pdfPath.substring(0, pdfPath.lastIndexOf(".")) + ".xlsx";
        // Load the PDF before creating the output file, so a failed load leaves no empty .xlsx.
        Document doc = new Document(pdfPath);
        try (FileOutputStream os = new FileOutputStream(excelPath)) {
            doc.save(os, SaveFormat.Excel);
        } finally {
            doc.close();
        }
        long now = System.currentTimeMillis();
        System.out.println("Pdf 转 EXCEL 共耗时:" + ((now - old) / 1000.0) + "秒");
    } catch (Exception e) {
        System.out.println("Pdf 转 EXCEL 失败...");
        e.printStackTrace();
    }
}
/**
 * Converts a PDF file to an .html file next to it (same path, extension replaced).
 *
 * <p>Failures are reported on standard output and the stack trace is printed; nothing is thrown.
 *
 * @param pdfPath path of the source PDF; must contain a '.' so the extension can be replaced
 */
public static void pdf2Html(String pdfPath) {
    long old = System.currentTimeMillis();
    try {
        String htmlPath = pdfPath.substring(0, pdfPath.lastIndexOf(".")) + ".html";
        Document doc = new Document(pdfPath);
        try {
            doc.save(htmlPath, SaveFormat.Html);
        } finally {
            // Release the document even when saving fails.
            doc.close();
        }
        long now = System.currentTimeMillis();
        System.out.println("Pdf 转 HTML 共耗时:" + ((now - old) / 1000.0) + "秒");
    } catch (Exception e) {
        System.out.println("Pdf 转 HTML 失败...");
        e.printStackTrace();
    }
}
/**
 * Renders every page of a PDF to a PNG image at a resolution of 300.
 *
 * <p>Images are written to a directory named after the PDF with an {@code _images} suffix
 * (created if missing) as {@code 1.png}, {@code 2.png}, ... in page order. Failures are
 * reported on standard output and the stack trace is printed; nothing is thrown.
 *
 * @param pdfPath path of the source PDF; must contain a '.' so the extension can be stripped
 */
public static void pdf2image(String pdfPath) {
    long old = System.currentTimeMillis();
    try {
        // Fully qualified because the file's import list does not cover the devices package.
        com.aspose.pdf.devices.Resolution resolution = new com.aspose.pdf.devices.Resolution(300);
        String dataDir = pdfPath.substring(0, pdfPath.lastIndexOf("."));
        File imageDir = new File(dataDir + "_images");
        // Fail early with a clear error if the output directory cannot be created.
        if (!imageDir.isDirectory() && !imageDir.mkdirs()) {
            throw new IOException("Cannot create image directory: " + imageDir);
        }
        Document doc = new Document(pdfPath);
        try {
            com.aspose.pdf.devices.PngDevice pngDevice =
                    new com.aspose.pdf.devices.PngDevice(resolution);
            int pageTotal = doc.getPages().size();
            // The loop starts at 1 and includes size(): page indexes are treated as 1-based.
            for (int pageCount = 1; pageCount <= pageTotal; pageCount++) {
                File target = new File(imageDir, pageCount + ".png");
                try (OutputStream imageStream = new FileOutputStream(target)) {
                    pngDevice.process(doc.getPages().get_Item(pageCount), imageStream);
                }
            }
        } finally {
            doc.close();
        }
        long now = System.currentTimeMillis();
        System.out.println("Pdf 转 PNG 共耗时:" + ((now - old) / 1000.0) + "秒");
    } catch (Exception e) {
        System.out.println("Pdf 转 PNG 失败...");
        e.printStackTrace();
    }
}
/**
 * Extracts the text of a PDF into a .txt file next to it (same path, extension replaced).
 *
 * <p>The text file is written as UTF-8 rather than the platform default charset, so the
 * output does not depend on the machine the conversion runs on. I/O failures while writing
 * are reported on standard output; errors raised while loading or reading the PDF propagate
 * to the caller.
 *
 * @param pdfPath path of the source PDF; must contain a '.' so the extension can be replaced
 */
public static void pdf2txt(String pdfPath) {
    long old = System.currentTimeMillis();
    Document pdfDocument = new Document(pdfPath);
    try {
        // Fully qualified because the file's import list does not include TextAbsorber.
        com.aspose.pdf.TextAbsorber ta = new com.aspose.pdf.TextAbsorber();
        ta.visit(pdfDocument);
        String txtPath = pdfPath.substring(0, pdfPath.lastIndexOf(".")) + ".txt";
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(txtPath), java.nio.charset.StandardCharsets.UTF_8))) {
            writer.write(ta.getText());
        }
        long now = System.currentTimeMillis();
        System.out.println("Pdf 转 TXT 共耗时:" + ((now - old) / 1000.0) + "秒");
    } catch (IOException e) {
        System.out.println("Pdf 转 TXT 失败...");
        e.printStackTrace();
    } finally {
        // Release the document on every path.
        pdfDocument.close();
    }
}
}
import java.io.File;
import com.spire.doc.Document;
import com.spire.pdf.FileFormat;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.widget.PdfPageCollection;
public class PdfToWord {
/**
 * Demo entry point: converts the sample file {@code test.pdf} to Word.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final String samplePdf = "test.pdf";
    pdftoword(samplePdf);
}
/**
 * Converts a PDF to a .docx file placed next to it (same path, ".pdf" replaced by ".docx").
 *
 * <p>Documents with at most 10 pages are converted directly. Larger documents are split into
 * one PDF per page, each page is converted separately, and the resulting .docx files are
 * merged. Temporary directories {@code <base>_temp_split} and {@code <base>_temp_doc} are
 * used for the intermediate files and are removed before returning.
 *
 * @param srcPath path of the source file; must end in ".pdf"
 * @return the path of the generated .docx, or {@code null} if the input is not a PDF or the
 *         conversion failed (the stack trace is printed)
 */
public static String pdftoword(String srcPath) {
    // Validate first: the path arithmetic below assumes a 4-character ".pdf" suffix.
    if (!isPDFFile(srcPath)) {
        return null;
    }
    String baseDir = srcPath.substring(0, srcPath.length() - 4);
    String splitPath = baseDir + "_temp_split" + File.separator;
    String docPath = baseDir + "_temp_doc" + File.separator;
    String desPath = baseDir + ".docx";

    boolean tempDirsCreated = false;
    PdfDocument pdf = null;
    try {
        // Working directories for the split pages and their converted counterparts.
        tempDirsCreated = create(splitPath, docPath);
        if (!tempDirsCreated) {
            return null;
        }

        pdf = new PdfDocument();
        pdf.loadFromFile(srcPath);
        PdfPageCollection pages = pdf.getPages();

        // Small documents (<= 10 pages) are converted in one go.
        if (pages.getCount() <= 10) {
            pdf.saveToFile(desPath, FileFormat.DOCX);
            return desPath;
        }

        // Larger documents: split into single-page PDFs named test0.pdf, test1.pdf, ...
        pdf.split(splitPath + "test{0}.pdf", 0);
        File[] fs = getSplitFiles(splitPath);
        if (fs == null) {
            return null;
        }
        // Convert each page on its own.
        for (File part : fs) {
            String partName = part.getName();
            PdfDocument sonpdf = new PdfDocument();
            try {
                sonpdf.loadFromFile(part.getAbsolutePath());
                sonpdf.saveToFile(
                        docPath + partName.substring(0, partName.length() - 4) + ".docx",
                        FileFormat.DOCX);
            } finally {
                sonpdf.close();
            }
        }
        // Merge the per-page documents into the final Word file.
        return merge(docPath, desPath) ? desPath : null;
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    } finally {
        if (pdf != null) {
            pdf.close();
        }
        // Always remove the intermediate directories, on success and on failure.
        if (tempDirsCreated) {
            clearFiles(splitPath);
            clearFiles(docPath);
        }
    }
}
/**
 * Ensures that both working directories exist, creating them (and missing parents) if needed.
 *
 * @param splitPath directory for the split single-page PDFs
 * @param docPath   directory for the converted per-page documents
 * @return {@code true} only if both directories exist when the method returns
 */
private static boolean create(String splitPath, String docPath) {
    File splitDir = new File(splitPath);
    File docDir = new File(docPath);
    // mkdirs() returns false for an already existing directory, so check isDirectory() first.
    boolean splitReady = splitDir.isDirectory() || splitDir.mkdirs();
    boolean docReady = docDir.isDirectory() || docDir.mkdirs();
    return splitReady && docReady;
}
/**
 * Tells whether a path names a PDF file, judged only by its ".pdf" extension.
 * The comparison ignores case, so "REPORT.PDF" is accepted as well.
 *
 * @param srcPath2 path to inspect
 * @return {@code true} if the file name ends with ".pdf" in any letter case
 */
private static boolean isPDFFile(String srcPath2) {
    String filename = new File(srcPath2).getName();
    // regionMatches returns false for a negative offset, which covers names shorter than 4 chars.
    return filename.regionMatches(true, filename.length() - 4, ".pdf", 0, 4);
}
/**
 * Lists the entries directly contained in a directory.
 *
 * @param path directory to list
 * @return the directory entries, or an empty array if the path is not a directory or cannot
 *         be read (never {@code null}, so callers can use {@code length} safely)
 */
private static File[] getSplitFiles(String path) {
    File[] entries = new File(path).listFiles();
    return entries != null ? entries : new File[0];
}
/**
 * Deletes the file or directory tree at the given path; does nothing if the path does not exist.
 *
 * @param workspaceRootPath path of the file or directory to remove
 */
private static void clearFiles(String workspaceRootPath){
    File target = new File(workspaceRootPath);
    if (!target.exists()) {
        return;
    }
    deleteFile(target);
}
/**
 * Recursively deletes a file, or a directory together with everything below it.
 *
 * @param file file or directory to delete
 */
private static void deleteFile(File file){
    if (file.isDirectory()) {
        File[] children = file.listFiles();
        // listFiles() returns null when the directory cannot be read; skip the descent then.
        if (children != null) {
            for (File child : children) {
                deleteFile(child);
            }
        }
    }
    file.delete();
}
/**
 * Merges the per-page documents {@code test0.docx}, {@code test1.docx}, ... found in
 * {@code docPath} into a single Word file.
 *
 * <p>The number of parts is taken from the number of entries in {@code docPath}, so the
 * directory is expected to contain nothing but the per-page documents.
 *
 * @param docPath directory (with trailing separator) holding the per-page .docx files
 * @param desPath path of the merged output document
 * @return {@code true} if the merged document was saved, {@code false} if there was nothing
 *         to merge
 */
private static boolean merge(String docPath, String desPath) {
    File[] fs = getSplitFiles(docPath);
    // Nothing to merge: avoid dereferencing a null listing or opening a missing test0.docx.
    if (fs == null || fs.length == 0) {
        return false;
    }
    Document document = new Document(docPath + "test0.docx");
    try {
        for (int i = 1; i < fs.length; i++) {
            document.insertTextFromFile(docPath + "test" + i + ".docx", com.spire.doc.FileFormat.Docx_2013);
        }
        // Save the merged document.
        document.saveToFile(desPath);
    } finally {
        // Release the resources held by the document.
        document.dispose();
    }
    return true;
}
}
/**
 * Converts the sample document {@code test.docx} to {@code test.pdf}.
 * Kept for backward compatibility; delegates to {@link #wordToPdf(String, String)}.
 */
public static void wordToPdf() {
    wordToPdf("test.docx", "test.pdf");
}

/**
 * Converts a Word document to PDF.
 *
 * @param wordPath path of the Word document to load
 * @param pdfPath  path of the PDF file to write
 */
public static void wordToPdf(String wordPath, String pdfPath) {
    // Create the document object.
    Document doc = new Document();
    try {
        // Load the Word file.
        doc.loadFromFile(wordPath);
        // Save it in PDF format.
        doc.saveToFile(pdfPath, com.spire.doc.FileFormat.PDF);
    } finally {
        // Release the resources held by the document.
        doc.dispose();
    }
}
这个只能转3页
Spire Doc.
Free version converting word documents to PDF files, you can only get the first 3 page of PDF file.
Upgrade to Commercial Edition of Spire.Doc <https://www.e-iceblue.com/Introduce/doc-for-java.html>.
先word 转 html 再把 html 转 pdf
word 转 html
/**
 * Converts a Word 97-2003 document to HTML.
 *
 * <p>Pictures embedded in the document are written to the directory returned by
 * {@code getImageSavePath(fileName)}, and the generated HTML references them via
 * {@code getImageUrl(fileName)}. The HTML is serialized and returned as UTF-8 text.
 *
 * @param fileName path of the Word file
 * @return the HTML markup, or {@code null} if the extension check fails, the file cannot be
 *         opened or parsed, a picture cannot be written, or serialization fails
 */
public static String wordToHtml03(String fileName) {
    if (!(checkFile(fileName, "doc") || checkFile(fileName, "docx"))) {
        return null;
    }
    HWPFDocument wordDoc = null;
    WordToHtmlConverter wthc = null;
    // The input stream is only needed while the document is parsed; close it right after.
    try (FileInputStream in = new FileInputStream(fileName)) {
        wordDoc = new HWPFDocument(in);
        wthc = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
    } catch (Exception e) {
        return null;
    }
    final String fn = fileName;
    // URL prefix used by the generated HTML to reference pictures.
    wthc.setPicturesManager(new PicturesManager() {
        @Override
        public String savePicture(byte[] bytes, PictureType pt, String str, float f, float f1) {
            return getImageUrl(fn) + str;
        }
    });
    wthc.processDocument(wordDoc);
    List<Picture> pics = wordDoc.getPicturesTable().getAllPictures();
    if (null != pics && pics.size() > 0) {
        fileExists(getImageSavePath(fileName));
        for (Picture pic : pics) {
            // Write each picture to disk, closing the stream even when the write fails.
            try (FileOutputStream imageOut =
                    new FileOutputStream(getImageSavePath(fileName) + pic.suggestFullFileName())) {
                pic.writeImageContent(imageOut);
            } catch (IOException e) {
                return null;
            }
        }
    }
    org.w3c.dom.Document htmlDocument = wthc.getDocument();
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try {
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));
    } catch (TransformerException e) {
        return null;
    }
    // Decode with the same charset the serializer was told to use, not the platform default.
    return new String(out.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
}
/**
 * Converts a Word 2007+ document to HTML.
 *
 * <p>Pictures are extracted to the directory returned by {@code getImageSavePath(fileName)},
 * and the generated HTML references them via {@code getImageUrl(fileName)}.
 *
 * @param fileName path of the Word file
 * @return the HTML markup decoded as UTF-8, or {@code null} if the extension check fails or
 *         the file cannot be read or converted
 */
@SuppressWarnings("deprecation")
public static String wordToHtml07(String fileName) {
    if (!checkFile(fileName, "docx") && !checkFile(fileName, "doc")) {
        return null;
    }
    // Picture URL prefix used when the HTML page is loaded.
    XHTMLOptions options = XHTMLOptions.create().URIResolver(new BasicURIResolver(getImageUrl(fileName)));
    // Directory the pictures are saved to.
    fileExists(getImageSavePath(fileName));
    options.setExtractor(new FileImageExtractor(new File(getImageSavePath(fileName))));
    // Read and convert the document; every resource is closed on all paths.
    try (InputStream in = new FileInputStream(fileName);
         XWPFDocument document = new XWPFDocument(in);
         ByteArrayOutputStream out = new ByteArrayOutputStream()) {
        XHTMLConverter.getInstance().convert(document, out, options);
        return out.toString("utf-8");
    } catch (Exception e) {
        return null;
    }
}
html 转 pdf
<!-- itext7html转pdf -->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>html2pdf</artifactId>
<version>3.0.2</version>
</dependency>
<!-- 中文字体支持 -->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>font-asian</artifactId>
<version>7.1.13</version>
</dependency>
import com.itextpdf.html2pdf.ConverterProperties;
import com.itextpdf.html2pdf.HtmlConverter;
import com.itextpdf.kernel.events.PdfDocumentEvent;
import com.itextpdf.kernel.font.PdfFont;
import com.itextpdf.kernel.font.PdfFontFactory;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.layout.font.FontProvider;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
public class HtmlToPdfUtils {
/**
 * Converts HTML to an A4 PDF, stamping a watermark on every page.
 *
 * <p>The built-in Chinese font {@code STSongStd-Light} is always registered; an additional
 * custom font is registered when {@code fontPath} is not blank. The output stream is closed
 * when the method returns, whether or not the conversion succeeded.
 *
 * @param inputStream  HTML input
 * @param waterMark    watermark text passed to {@code WaterMarkEventHandler}
 * @param fontPath     path of an extra font, may be blank; fonts with the {@code .ttc}
 *                     suffix need <b>,0</b> appended to the path
 * @param outputStream destination of the generated PDF
 * @throws IOException if a font cannot be loaded or the PDF cannot be written
 */
public static void convertToPdf(InputStream inputStream, String waterMark, String fontPath, OutputStream outputStream) throws IOException {
    // Resources are closed in reverse order: the document first (it flushes its final content
    // into the writer), then the writer. Closing the writer first would be the wrong order.
    try (PdfWriter pdfWriter = new PdfWriter(outputStream);
         PdfDocument pdfDocument = new PdfDocument(pdfWriter)) {
        // Use A4 pages.
        pdfDocument.setDefaultPageSize(PageSize.A4);
        // Add the watermark at the end of every page.
        pdfDocument.addEventHandler(PdfDocumentEvent.END_PAGE, new WaterMarkEventHandler(waterMark));
        // Chinese font support.
        ConverterProperties properties = new ConverterProperties();
        FontProvider fontProvider = new FontProvider();
        PdfFont sysFont = PdfFontFactory.createFont("STSongStd-Light", "UniGB-UCS2-H", false);
        fontProvider.addFont(sysFont.getFontProgram(), "UniGB-UCS2-H");
        // Optional custom font, e.g. Microsoft YaHei.
        if (StrUtil.isNotBlank(fontPath)) {
            PdfFont microsoft = PdfFontFactory.createFont(fontPath, PdfEncodings.IDENTITY_H, false);
            fontProvider.addFont(microsoft.getFontProgram(), PdfEncodings.IDENTITY_H);
        }
        properties.setFontProvider(fontProvider);
        HtmlConverter.convertToPdf(inputStream, pdfDocument, properties);
    }
}
/**
 * Converts an HTML file to a PDF file, making the fonts under {@code C:/Windows/Fonts}
 * available to the converter.
 *
 * <p>Failures are propagated to the caller instead of being printed and swallowed, so a
 * caller can tell whether the PDF was actually produced.
 *
 * @param htmlFile path of the HTML file to read
 * @param pdfFile  path of the PDF file to write
 * @throws Exception if the input cannot be read, the output cannot be written, or the
 *                   conversion fails
 */
public static void html2pdf(String htmlFile, String pdfFile) throws Exception {
    ConverterProperties converterProperties = new ConverterProperties();
    DefaultFontProvider dfp = new DefaultFontProvider();
    // Register the font directory. NOTE(review): this is a Windows-specific location;
    // on other systems it is presumably absent - confirm the intended deployment target.
    dfp.addDirectory("C:/Windows/Fonts");
    converterProperties.setFontProvider(dfp);
    try (InputStream in = new FileInputStream(new File(htmlFile));
         OutputStream out = new FileOutputStream(new File(pdfFile))) {
        HtmlConverter.convertToPdf(in, out, converterProperties);
    }
}
}