import pdfplumber
# Path of the PDF whose first page is split into words.
pdf_path = 'hung2019.pdf'

# The context manager closes the PDF once the words have been printed.
with pdfplumber.open(pdf_path) as pdf:
    # Only the first page is inspected in this example.
    words = pdf.pages[0].extract_words(x_tolerance=1, keep_blank_chars=True)
    # Each extracted word is a mapping; print just its text, one per line.
    for word in words:
        print(word['text'])
Malware
detection
based
on
directed
multi-edge
dataflow
graph
representation
and
convolutional
neural
network
即使设置了 `keep_blank_chars=True`,仍不能很好地提取出每一行的内容。不过,还有一些超参数可以调节,例如 `x_tolerance` 和 `y_tolerance` 等等。我试了很多种组合,效果都不理想。
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import *
from pdfminer.converter import PDFPageAggregator
# Print every text line of the PDF together with its bounding box, using
# pdfminer's low-level parser / interpreter / aggregator pipeline.
pdf_path = 'hung2019.pdf'

# Open the file in a context manager so the handle is released even when
# parsing or layout analysis raises.
with open(pdf_path, 'rb') as f:
    # Create a PDF parser bound to the open file.
    parser = PDFParser(f)
    # Create the document object that stores the document structure.
    document = PDFDocument(parser)
    # Stop early when the document does not permit text extraction.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed(
            f'Text extraction is not allowed: {pdf_path}')

    # Resource manager that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Layout-analysis parameters (library defaults).
    laparams = LAParams()
    # Device that aggregates each processed page into a layout object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Interpreter that feeds page contents to the device.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process every page of the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Fetch the layout (LTPage) produced for the page just processed.
        layout = device.get_result()
        # Fourth bbox component of the page layout is used as its height.
        page_height = layout.bbox[3]
        for x in layout:
            if not isinstance(x, LTTextBox):
                continue
            for v in x:
                if not isinstance(v, LTTextLine):
                    continue
                text = v.get_text()
                x0, y0, x1, y1 = v.bbox
                # The bbox y values must be subtracted from the page height
                # to obtain conventional (top-down) coordinates. Each value
                # is flipped independently, so y0 ends up the larger of the
                # two.
                y0 = page_height - y0
                y1 = page_height - y1
                print(f'{text}\t({x0}, {y0}, {x1}, {y1})')
Malware detection based on directed multi-edge
(69.517, 77.95079429999998, 542.4866443, 54.04049429999998)
dataflow graph representation and convolutional
(71.142, 105.84579429999997, 540.8598435, 81.93549429999996)
neural network
(232.883, 133.74179430000004, 379.11839480000003, 109.83149430000003)
Nguyen Viet Hung
(105.702, 161.48445089999996, 190.63347499999998, 150.52555089999998)
Le Quy Don Techincal University
(81.264, 173.5719019999999, 218.55859060000006, 163.60930199999996)
Faculty of Information Technology
(76.013, 185.899902, 216.83435100000005, 175.93730200000005)
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTPage, LTTextBoxHorizontal, LTTextLineHorizontal
# Collect the text lines of the first page, with their bounding boxes, via
# pdfminer's high-level extract_pages() helper.
pdf_path = 'hung2019.pdf'
pages = list(extract_pages(pdf_path))

# Example: take the first page only.
page = pages[0]

# Parallel lists: boxes[i] holds the four corner points of texts[i].
boxes, texts = [], []

if isinstance(page, LTPage):
    # Lazily walk every horizontal text line inside every horizontal
    # text box on the page; other layout elements are skipped.
    horizontal_lines = (
        line
        for box in page
        if isinstance(box, LTTextBoxHorizontal)
        for line in box
        if isinstance(line, LTTextLineHorizontal)
    )
    for line in horizontal_lines:
        x0, y0, x1, y1 = line.bbox
        # Flip both y values against the page height; each is flipped
        # independently, so y0 ends up the larger of the two.
        y0, y1 = page.height - y0, page.height - y1
        text = line.get_text()
        # Corner order: (x0, y0), (x1, y0), (x1, y1), (x0, y1).
        corners = [[x0, y0], [x1, y0], [x1, y1], [x0, y1]]
        boxes.append(corners)
        texts.append(text)
        print(f'{text}\t({x0}, {y0}, {x1}, {y1})')
Malware detection based on directed multi-edge
(69.517, 77.95079429999998, 542.4866443, 54.04049429999998)
dataflow graph representation and convolutional
(71.142, 105.84579429999997, 540.8598435, 81.93549429999996)
neural network
(232.883, 133.74179430000004, 379.11839480000003, 109.83149430000003)
Nguyen Viet Hung
(105.702, 161.48445089999996, 190.63347499999998, 150.52555089999998)
Le Quy Don Techincal University
(81.264, 173.5719019999999, 218.55859060000006, 163.60930199999996)
Faculty of Information Technology
(76.013, 185.899902, 216.83435100000005, 175.93730200000005)