xpdf4.0的使用（详细）

左丘楷

2023-12-01

1.下载xpdf tools 和Chinese simplified语言包（刚开始下载了源码，但是没有找到pdftotext.exe，结果发现它在tools里）。

2.新建xpdf文件夹，解压tools到xpdf里，将bin64里的所有文件复制到xpdf下，再将doc里的sample-xpdfrc复制到xpdf下，并重命名为xpdfrc，将xpdf-chinese-simplified复制到xpdf下。

3.修改xpdfrc文件，

（1）修改文件 xpdfrc 第73行，将 textEncoding UTF-8 注释打开，指定编码为UTF-8，

（2）并在下面增加 textPageBreaks no 参数，意思是在pdf文档的两页间不加入分页符。

（3）在最后加入这些代码：其中的路径换成自己的路径

#----- begin Chinese Simplified support package (2011-sep-02)

cidToUnicode Adobe-GB1 E:/xpdf/xpdf-chinese-simplified/Adobe-GB1.cidToUnicode

unicodeMap ISO-2022-CN E:/xpdf/xpdf-chinese-simplified/ISO-2022-CN.unicodeMap

unicodeMap EUC-CN E:/xpdf/xpdf-chinese-simplified/EUC-CN.unicodeMap

unicodeMap GBK E:/xpdf/xpdf-chinese-simplified/GBK.unicodeMap

cMapDir Adobe-GB1 E:/xpdf/xpdf-chinese-simplified/CMap

toUnicodeDir E:/xpdf/xpdf-chinese-simplified/CMap

#displayCIDFontTT Adobe-GB1 E:/xpdf/xpdf-chinese-simplified/CMap/gkai00mp.ttf

#fontFileCC Adobe-GB1 /usr/..../gkai00mp.ttf

#----- end Chinese Simplified support package

4.接下来贴代码：

import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Extracts text from PDF files by running the xpdf {@code pdftotext}
 * command-line tool as an external process.
 *
 * @author Angela
 */
public class Pdf4Text {

    /** Location of the xpdf pdftotext executable. */
    private static final String PATH_TO_XPDF = "E:\\xpdf\\pdftotext.exe";

    /** Character set requested from pdftotext via -enc and used to decode its output. */
    private static final String CHARSET = "UTF-8";

    private static final Logger LOGGER = Logger.getLogger(Pdf4Text.class.getName());

    /**
     * Reads the text of a PDF and prints it to standard output.
     *
     * @param file path of the PDF file
     * @param isLayout whether to keep the original page layout
     */
    public static void extractTXT(String file, boolean isLayout) {
        String sourcePath = new File(file).getAbsolutePath();
        // "-" as the target tells pdftotext to write the text to its stdout
        // instead of saving it to a file.
        String[] cmd = buildCommand(sourcePath, "-", isLayout);

        Process process = null;
        try {
            // stderr is intentionally not merged into stdout here, because that
            // would mix diagnostics into the extracted text; -q is passed so
            // pdftotext is not expected to write messages there.
            process = new ProcessBuilder(cmd).start();
            StringBuilder text = new StringBuilder();
            InputStreamReader reader = new InputStreamReader(
                    new BufferedInputStream(process.getInputStream()), CHARSET);
            try {
                char[] buf = new char[10000];
                int len;
                while ((len = reader.read(buf)) != -1) {
                    // Append only the characters read in this pass; appending the
                    // whole buffer would repeat stale data from earlier reads.
                    text.append(buf, 0, len);
                }
            } finally {
                reader.close();
            }
            System.out.println(text.toString());
            awaitExit(process);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        } finally {
            if (process != null) {
                process.destroy();
            }
        }
    }

    /**
     * Saves the text of a PDF to a file. Returns only after pdftotext has
     * finished, so the output file is complete when this method returns.
     *
     * @param file path of the PDF file
     * @param savePath path of the text file to write
     * @param isLayout whether to keep the original page layout
     */
    public static void extractTXT(String file, String savePath, boolean isLayout) {
        String sourcePath = new File(file).getAbsolutePath();
        String targetPath = new File(savePath).getAbsolutePath();
        String[] cmd = buildCommand(sourcePath, targetPath, isLayout);

        Process process = null;
        try {
            ProcessBuilder builder = new ProcessBuilder(cmd);
            // Merge stderr into stdout so a single drain loop covers both pipes.
            builder.redirectErrorStream(true);
            process = builder.start();

            // Drain and discard whatever the tool prints so it can never block
            // on a full pipe while we wait for it to exit.
            BufferedInputStream output = new BufferedInputStream(process.getInputStream());
            try {
                byte[] scratch = new byte[4096];
                while (output.read(scratch) != -1) {
                    // discard
                }
            } finally {
                output.close();
            }
            awaitExit(process);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        } finally {
            if (process != null) {
                process.destroy();
            }
        }
    }

    /**
     * Builds the pdftotext argument vector.
     *
     * Options used: -layout keeps the original layout, -enc selects the output
     * character set, -q suppresses messages and errors, -nopgbrk omits page
     * breaks between pages. When isLayout is false the -layout flag is left out
     * entirely rather than replaced by an empty argument.
     */
    private static String[] buildCommand(String sourcePath, String targetPath, boolean isLayout) {
        String[] withoutLayout = {
            PATH_TO_XPDF, "-enc", CHARSET, "-q", "-nopgbrk", sourcePath, targetPath
        };
        if (!isLayout) {
            return withoutLayout;
        }
        String[] withLayout = new String[withoutLayout.length + 1];
        withLayout[0] = PATH_TO_XPDF;
        withLayout[1] = "-layout";
        System.arraycopy(withoutLayout, 1, withLayout, 2, withoutLayout.length - 1);
        return withLayout;
    }

    /** Waits for the process to finish and logs a warning on a non-zero exit code. */
    private static void awaitExit(Process process) {
        try {
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                LOGGER.log(Level.WARNING, "pdftotext exited with code {0}", exitCode);
            }
        } catch (InterruptedException ex) {
            // Restore the interrupt flag so callers can still observe it.
            Thread.currentThread().interrupt();
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /** Converts a sample PDF to text and prints the elapsed time. */
    public static void main(String[] args) {
        String file = "E:\\three body.pdf";
        String savePath = "E:\\three body.txt";
        long startTime = System.currentTimeMillis();
        extractTXT(file, savePath, true);
        long endTime = System.currentTimeMillis();
        System.out.println("读写所用时间为：" + (endTime - startTime) + "ms");
    }

}

当然你还可以利用pdftohtml.exe,pdftopng.exe,pdftoppm.exe,pdftops.exe等EXE转化成其他格式的文件。

有什么问题可以在下面评论，我看到了就会及时地回复。

xpdf4.0的使用（详细）

相关阅读

相关文章

相关问答

相关文档