最近遇到一个解析 PDF 的需求:解析出 PDF 文件中的标题、内容、图片、表格等信息,并分类存储。查遍某度也没有找到方法,咨询了 https://www.e-iceblue.cn/ 的技术支持,对方也表示做不到。无奈之下只能按页提取文本和图片,而且提取结果的顺序也是错乱的。代码如下,希望能对有类似需求的小伙伴有所帮助。不过虽然 PDF 没搞定,但是搞定了 Word(doc、docx)的解析,后续会更新出来与大家分享。当然,如果有小伙伴解决了 PDF 的解析,还请多多指教。需要的依赖如下:
<dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>2.0.24</version> </dependency>
<repository> <id>com.e-iceblue</id> <name>e-iceblue</name> <url>http://repo.e-iceblue.cn/repository/maven-public/</url> </repository>
测试代码如下:
package com.aiwrite.file.utils.pdf;

import com.aiwrite.common.core.constant.FileConstants;
import com.aiwrite.file.utils.FileUploadUtils;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import com.spire.pdf.exporting.PdfImageInfo;
import com.spire.pdf.widget.PdfPageCollection;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import javax.imageio.ImageIO;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Manual test harness that demonstrates three ways of pulling content out of a PDF file:
 * page-by-page text and image positions printed to stdout ({@link #printPdfInfo}),
 * text and images exported to files ({@link #testPdf}), and plain text extraction with
 * PDFBox ({@link #readPDF}).
 *
 * @author zhousc
 * @since 2022-06-12 17:31
 * @version 1.0
 */
public class ParsePdfUtilTest {

    /** Banner text that is stripped from every page's extracted text before it is printed. */
    private static final String EVALUATION_WARNING =
            "Evaluation Warning : The document was created with Spire.PDF for Java.";

    /**
     * Runs {@link #printPdfInfo} against a hard-coded sample file and reports the elapsed time.
     * Call {@code testPdf(pdfPath, basePath)} or {@code readPDF(pdfPath)} here instead to try
     * the other extraction approaches.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String basePath = "E:\\123\\";
        String pdfPath = basePath + "认证授权开发手册.pdf";

        System.out.println(" ==================== pdf parse begin !!! ==================== ");
        long startMillis = System.currentTimeMillis();
        try {
            printPdfInfo(pdfPath);
            // Report success only when parsing actually completed without an exception.
            long elapsedMillis = System.currentTimeMillis() - startMillis;
            System.out.println(" ==================== pdf parse success !!! 共耗时 "
                    + elapsedMillis + " ms ==================== ");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints, for every page of the PDF, the extracted text followed by the index and
     * top-left coordinates of each image found on that page.
     *
     * @param filePath path of the PDF file to load
     * @throws Exception if loading the document or extracting its content fails
     */
    public static void printPdfInfo(String filePath) throws Exception {
        PdfDocument pdf = new PdfDocument();
        pdf.loadFromFile(filePath);
        try {
            PdfPageCollection pages = pdf.getPages();
            int pageCount = pages.getCount();
            for (int i = 0; i < pageCount; i++) {
                PdfPageBase page = pages.get(i);
                System.out.println(" >>>>> >>>>> >>>>> >>>>> >>>>> 第 " + (i + 1) + " 页 <<<<< <<<<< <<<<< <<<<< <<<<< ");
                System.out.println(" ***** ***** ***** ***** ***** 内容如下 ***** ***** ***** ***** ***** ");

                // Extract the page text and drop the evaluation banner before printing.
                String text = page.extractText(true);
                System.out.println(text.replace(EVALUATION_WARNING, ""));

                // Only position data is printed; the picture itself is available through
                // pdfImageInfo.getImage() when it is needed.
                PdfImageInfo[] imagesInfo = page.getImagesInfo();
                if (ObjectUtils.isNotEmpty(imagesInfo)) {
                    for (PdfImageInfo pdfImageInfo : imagesInfo) {
                        Rectangle2D bounds = pdfImageInfo.getBounds();
                        int index = pdfImageInfo.getIndex();
                        System.out.println(" ===== ===== ===== ===== ===== 图片坐标信息如下 ===== ===== ===== ===== ===== ");
                        System.out.println(" index: " + index + "; x: " + bounds.getX() + "; y: " + bounds.getY());
                    }
                }
            }
        } finally {
            // Close the document even when extraction fails part-way through.
            pdf.close();
        }
    }

    /**
     * Writes the text of all pages to {@code ExtractText.txt} and every embedded image to
     * {@code Image_<n>.png}, both placed directly under {@code outPath}.
     *
     * @param filePath path of the PDF file to load
     * @param outPath  output location prefix; file names are appended to it verbatim, so it
     *                 should end with a path separator
     * @throws IOException if the text file or any image file cannot be written
     */
    public static void testPdf(String filePath, String outPath) throws IOException {
        PdfDocument pdf = new PdfDocument(filePath);
        // The writer is opened once and closed automatically; each page's text is streamed
        // to it instead of rewriting the whole file for every page.
        try (FileWriter writer = new FileWriter(outPath + "ExtractText.txt")) {
            PdfPageCollection pages = pdf.getPages();
            int pageCount = pages.getCount();
            int imageIndex = 0;
            for (int i = 0; i < pageCount; i++) {
                PdfPageBase page = pages.get(i);

                writer.write(page.extractText(true));
                writer.flush();

                // Extract the images once per page and reuse the result.
                BufferedImage[] images = page.extractImages();
                if (ObjectUtils.isNotEmpty(images)) {
                    for (BufferedImage image : images) {
                        // Plain concatenation: outPath must not be interpreted as a format string.
                        File output = new File(outPath + "Image_" + imageIndex + ".png");
                        imageIndex++;
                        ImageIO.write(image, FileConstants.PICTURE_PNG, output);
                    }
                }
            }
        } finally {
            pdf.close();
        }
    }

    /**
     * Reads a PDF file with the open-source PDFBox library and prints its text to stdout.
     * Failures are reported on stdout together with a stack trace and are not rethrown.
     *
     * @param fileName path of the PDF file to read
     */
    public static void readPDF(String fileName) {
        File file = new File(fileName);
        // Read-only access is sufficient for parsing; both the file handle and the document
        // are closed automatically.
        try (RandomAccessFile source = new RandomAccessFile(file, "r")) {
            PDFParser parser = new PDFParser(source);
            parser.parse();
            try (PDDocument pdfDocument = parser.getPDDocument()) {
                PDFTextStripper stripper = new PDFTextStripper();
                // Sort text by position on the page (the stripper's default is false).
                stripper.setSortByPosition(true);
                String result = stripper.getText(pdfDocument);
                System.out.println("PDF文件的文本内容如下:");
                System.out.println(result);
            }
        } catch (Exception e) {
            System.out.println("读取PDF文件" + file.getAbsolutePath() + "生失败!" + e);
            e.printStackTrace();
        }
    }
}