有人能给我一个如何用PDFBox提取“单词”坐标的例子吗?
我正在参考此链接提取单个字符的位置:https://www.tutorialkart.com/pdfbox/how-to-extract-coordinates-or-position-of-characters-in-pdf/
我正在使用此链接提取单词:https://www.tutorialkart.com/pdfbox/extract-words-from-pdf-document/
我被困在获取整个单词的坐标上。
您可以创建一个继承自 PDFTextStripper 的 CustomPDFTextStripper,
并重写其受保护的方法 protected void writeString(String text, List<TextPosition> textPositions)。
下面的完整示例还包含生成的边界框的绘图。
package com.example;
import lombok.Value;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.junit.Ignore;
import org.junit.Test;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
public class PdfBoxTest {
private static final String BASE_DIR_PATH = "C:\\Users\\Milan\\50330484";
private static final String INPUT_FILE_PATH = "input.pdf";
private static final String OUTPUT_IMAGE_PATH = "output.jpg";
private static final String OUTPUT_BBOX_IMAGE_PATH = "output-bbox.jpg";
private static final float FROM_72_TO_300_DPI = 300.0f / 72.0f;
@Test
public void run() throws Exception {
pdfToImage();
drawBoundingBoxes();
}
@Ignore
@Test
public void pdfToImage() throws IOException {
PDDocument document = PDDocument.load(new File(BASE_DIR_PATH, INPUT_FILE_PATH));
PDFRenderer renderer = new PDFRenderer(document);
BufferedImage image = renderer.renderImageWithDPI(0, 300);
ImageIO.write(image, "JPEG", new File(BASE_DIR_PATH, OUTPUT_IMAGE_PATH));
}
@Ignore
@Test
public void drawBoundingBoxes() throws IOException {
PDDocument document = PDDocument.load(new File(BASE_DIR_PATH, INPUT_FILE_PATH));
List<WordWithBBox> words = getWords(document);
draw(words);
}
private List<WordWithBBox> getWords(PDDocument document) throws IOException {
CustomPDFTextStripper customPDFTextStripper = new CustomPDFTextStripper();
customPDFTextStripper.setSortByPosition(true);
customPDFTextStripper.setStartPage(0);
customPDFTextStripper.setEndPage(1);
Writer writer = new OutputStreamWriter(new ByteArrayOutputStream());
customPDFTextStripper.writeText(document, writer);
List<WordWithBBox> words = customPDFTextStripper.getWords();
return words;
}
private void draw(List<WordWithBBox> words) throws IOException {
BufferedImage bufferedImage = ImageIO.read(new File(BASE_DIR_PATH, OUTPUT_IMAGE_PATH));
Graphics2D graphics = bufferedImage.createGraphics();
graphics.setColor(Color.GREEN);
List<Rectangle> rectangles = words.stream()
.map(word -> new Rectangle(word.getX(), word.getY(), word.getWidth(), word.getHeight()))
.collect(Collectors.toList());
rectangles.forEach(graphics::draw);
graphics.dispose();
ImageIO.write(bufferedImage, "JPEG", new File(BASE_DIR_PATH, OUTPUT_BBOX_IMAGE_PATH));
}
private class CustomPDFTextStripper extends PDFTextStripper {
private final List<WordWithBBox> words;
public CustomPDFTextStripper() throws IOException {
this.words = new ArrayList<>();
}
public List<WordWithBBox> getWords() {
return new ArrayList<>(words);
}
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
String wordSeparator = getWordSeparator();
List<TextPosition> wordTextPositions = new ArrayList<>();
for (TextPosition textPosition : textPositions) {
String str = textPosition.getUnicode();
if (wordSeparator.equals(str)) {
if (!wordTextPositions.isEmpty()) {
this.words.add(createWord(wordTextPositions));
wordTextPositions.clear();
}
} else {
wordTextPositions.add(textPosition);
}
}
super.writeString(text, textPositions);
}
private WordWithBBox createWord(List<TextPosition> wordTextPositions) {
String word = wordTextPositions.stream()
.map(TextPosition::getUnicode)
.collect(Collectors.joining());
int minX = Integer.MAX_VALUE;
int minY = Integer.MAX_VALUE;
int maxX = Integer.MIN_VALUE;
int maxY = Integer.MIN_VALUE;
for (TextPosition wordTextPosition : wordTextPositions) {
minX = Math.min(minX, from72To300Dpi(wordTextPosition.getXDirAdj()));
minY = Math.min(minY, from72To300Dpi(wordTextPosition.getYDirAdj() - wordTextPosition.getHeightDir()));
maxX = Math.max(maxX, from72To300Dpi(wordTextPosition.getXDirAdj() + wordTextPosition.getWidthDirAdj()));
maxY = Math.max(maxY, from72To300Dpi(wordTextPosition.getYDirAdj()));
}
return new WordWithBBox(word, minX, minY, maxX - minX, maxY - minY);
}
}
private int from72To300Dpi(float f) {
return Math.round(f * FROM_72_TO_300_DPI);
}
@Value
private class WordWithBBox {
private final String word;
private final int x;
private final int y;
private final int width;
private final int height;
}
}
注:
如果您对其他选项感兴趣,也可以查看 Poppler
PDF转图像
pdftoppm -r 300 -jpeg input.pdf output
生成一个XHTML文件,其中包含文件中每个单词的边界框信息。
pdftotext -r 300 -bbox input.pdf
通过收集所有构建单词的TextPosition
对象并组合它们的边界框,可以提取单词的坐标。
按照您参考的两个教程的思路实现这一点,您可以像这样扩展PDFTextStripper
:
/**
 * Text stripper that prints every word of the processed text together with the location and
 * size of the rectangle enclosing the word's glyphs, instead of writing the text itself.
 */
public class GetWordLocationAndSize extends PDFTextStripper {
    public GetWordLocationAndSize() throws IOException {
    }

    /**
     * Splits the glyph positions at the stripper's word separator and prints each word.
     * Glyphs whose Unicode text is {@code null} or empty are skipped.
     *
     * @param string        the text chunk (not used; nothing is written to the output)
     * @param textPositions the glyph positions that make up the chunk
     * @throws IOException declared by the overridden method
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        final String separator = getWordSeparator();
        final List<TextPosition> pending = new ArrayList<>();

        for (TextPosition glyph : textPositions) {
            final String unicode = glyph.getUnicode();
            if (unicode == null || unicode.isEmpty()) {
                continue;
            }
            if (unicode.equals(separator)) {
                emitPending(pending);
            } else {
                pending.add(glyph);
            }
        }

        // The word after the last separator still has to be reported.
        emitPending(pending);
    }

    /** Prints the buffered glyphs as one word and empties the buffer; no-op when empty. */
    private void emitPending(List<TextPosition> pending) {
        if (pending.isEmpty()) {
            return;
        }
        printWord(pending);
        pending.clear();
    }

    /**
     * Prints the word's text followed by the X/Y origin, height and width of the union of
     * its glyph rectangles.
     *
     * @param word the glyph positions forming one word; expected to be non-empty
     */
    void printWord(List<TextPosition> word) {
        StringBuilder letters = new StringBuilder();
        Rectangle2D union = null;

        for (TextPosition glyph : word) {
            Rectangle2D glyphBox = new Rectangle2D.Float(
                    glyph.getXDirAdj(), glyph.getYDirAdj(), glyph.getWidthDirAdj(), glyph.getHeightDir());
            if (union != null) {
                union.add(glyphBox);
            } else {
                // The first glyph's rectangle seeds the union and is grown in place afterwards.
                union = glyphBox;
            }
            letters.append(glyph.getUnicode());
        }

        System.out.println(letters.toString() + " [(X=" + union.getX() + ",Y=" + union.getY()
                + ") height=" + union.getHeight() + " width=" + union.getWidth() + "]");
    }
}
(内部类)
然后像这样运行:
// Load the PDF; `resource` is defined outside this snippet.
// NOTE(review): the document is never closed within this snippet — confirm the caller closes it.
PDDocument document = PDDocument.load(resource);
// Use the word-printing stripper in place of the stock PDFTextStripper.
PDFTextStripper stripper = new GetWordLocationAndSize();
stripper.setSortByPosition( true );
// NOTE(review): PDFBox documents page numbers as 1-based; 0 presumably still starts at the
// first page — confirm.
stripper.setStartPage( 0 );
stripper.setEndPage( document.getNumberOfPages() );
// Throw-away writer: the stripper prints its results to System.out, so the text output is unused.
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
(ExtractWordCoordinates 测试 testExtractWordsForGoodJuJu)
应用于教程所使用的示例文件 apache.pdf,得到的输出如下:
2017-8-6 [(X=26.004425048828125,Y=22.00372314453125) height=5.833024024963379 width=36.31868362426758]
Welcome [(X=226.44479370117188,Y=22.00372314453125) height=5.833024024963379 width=36.5999755859375]
to [(X=265.5881652832031,Y=22.00372314453125) height=5.833024024963379 width=8.032623291015625]
The [(X=276.1641845703125,Y=22.00372314453125) height=5.833024024963379 width=14.881439208984375]
Apache [(X=293.5890197753906,Y=22.00372314453125) height=5.833024024963379 width=29.848846435546875]
Software [(X=325.98126220703125,Y=22.00372314453125) height=5.833024024963379 width=35.271636962890625]
Foundation! [(X=363.7962951660156,Y=22.00372314453125) height=5.833024024963379 width=47.871429443359375]
Custom [(X=334.0334777832031,Y=157.6195068359375) height=4.546705722808838 width=25.03936767578125]
Search [(X=360.8929138183594,Y=157.6195068359375) height=4.546705722808838 width=22.702728271484375]
嗨,这个问题是指之前的帖子: 有人能给我举个例子,说明如何使用PDFBox提取“单词”的坐标吗 我使用的是PDFBOX 2.0.10 我已成功编译了组合代码,但在尝试运行示例时出现异常错误。 提供的解决方案没有标准的主方法,这是我感到困惑的地方。 有人能告诉我怎样才能成功运行组合代码吗。 堆栈跟踪 可以在这里找到坐标https://github.com/mkl-public/testarea-pd
本文向大家介绍举例说明如何使用WebSQL?相关面试题,主要包含被问及举例说明如何使用WebSQL?时的应答技巧和注意事项,需要的朋友参考一下
我正在使用PDFbox来提取PDF文档中单词/字符串的坐标,并且到目前为止已经成功地确定了单个字符的位置。 这将生成一系列包含每个字符位置的行,包括空格,如下所示: 其中“P”是字符。我还没有在PDFbox中找到查找单词的函数,而且我对Java还不够熟悉,无法将这些字符准确地连接回单词中进行搜索,即使空格也包括在内。有没有其他人遇到过类似的情况,如果有,你是如何处理的?我真的只需要单词中第一个字符
本文向大家介绍举例说明clear取值有哪些?相关面试题,主要包含被问及举例说明clear取值有哪些?时的应答技巧和注意事项,需要的朋友参考一下 none 默认值。允许浮动元素出现在两侧。 left 在左侧不允许浮动元素。 right 在右侧不允许浮动元素。 both 在左右两侧均不允许浮动元素。 inherit 从父元素继承 clear 属性的值。
本文向大家介绍举例说明常用的cursor取值有哪些?相关面试题,主要包含被问及举例说明常用的cursor取值有哪些?时的应答技巧和注意事项,需要的朋友参考一下 https://www.w3school.com.cn/cssref/pr_class_cursor.asp 把这个背熟,需要的时候才可以逢场作戏,题目让你说说你常用的,其实你在保证正确的情况下多回答几个是没错的。
本文向大家介绍举例说明如何使用纯html怎么实现下拉提示的功能?相关面试题,主要包含被问及举例说明如何使用纯html怎么实现下拉提示的功能?时的应答技巧和注意事项,需要的朋友参考一下 datalist标签