def _collect_table_rows(table_xml):
    """Return one dict per ``<w:tr>`` found in the given table XML.

    Each dict has two keys:
      * ``"content"`` - the ``.text`` of every ``<w:t>`` element in the row
      * ``"docx"``    - the ``r:id`` attribute of every ``<o:OLEObject>``
        element in the row (empty list when the row embeds nothing)

    Rows are matched with ``.//``, so rows of nested tables are included.
    """
    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    row_path = ".//" + w_ns + "tr"
    text_path = ".//" + w_ns + "t"
    ole_path = ".//{urn:schemas-microsoft-com:office:office}OLEObject"
    rid_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id'

    rows = []
    for row in ET.fromstring(table_xml).findall(row_path):
        rows.append({
            "content": [text.text for text in row.findall(text_path)],
            "docx": [ole.attrib[rid_attr] for ole in row.findall(ole_path)],
        })
    return rows


def _read_ole_payload(blob):
    """Return the raw bytes of the first known payload stream in an OLE blob.

    The streams ``CONTENTS``, ``\\x01Ole10Native`` and ``Contents`` are tried
    in that order; ``b""`` is returned when none of them is present.
    """
    ole = olefile.OleFileIO(blob)
    try:
        streams = ole.listdir()
        for stream_name in ('CONTENTS', '\x01Ole10Native', 'Contents'):
            if [stream_name] in streams:
                return ole.openstream(stream_name).read()
        return b""
    finally:
        # Release the OLE container whether or not a stream was found.
        ole.close()


def _save_and_upload(filename, data, mime_type, url, authorization):
    """Write ``data`` to ``./run_document/<filename>``, POST it, return the reply text.

    The file that is uploaded is re-opened from the exact path it was written
    to, and the handle is closed once the request has completed.
    """
    path = './run_document/' + filename
    with open(path, "wb") as output_file:
        output_file.write(data)

    # NOTE(review): certificate verification is disabled (verify=False) and
    # the resulting urllib3 warning is silenced - confirm this is intended.
    requests.packages.urllib3.disable_warnings()
    payload = {'width': '0',
               'height': '0'}
    headers = {
        'Authorization': authorization
    }
    with open(path, 'rb') as upload_file:
        files = [
            ('files', (filename, upload_file, mime_type))
        ]
        response = requests.request("POST", url, headers=headers, data=payload,
                                    files=files, verify=False)
    return response.text


def _single_id(response_text):
    """Return the only run of digits in ``response_text``, or ``None``.

    ``None`` is returned when the text is empty, contains no digits, or
    contains more than one separate run of digits.
    """
    found = re.findall(r'\d+', response_text)
    if len(found) == 1:
        return found[0]
    return None


def extract_docx_document(document, tables, akts_df):
    """Save and upload the files embedded in one table of ``document``.

    The table to scan is selected by ``loc_table(tables)``, used as an index
    into ``document.tables``. Every OLE object found in a row of that table is
    resolved through ``document.part._rels`` and handled according to its
    relationship target:

      * ``embeddings/Microsoft_Word___*``          - saved as ``.docx``
      * ``embeddings/Microsoft_Word_97_-_2003___*`` - saved as ``.doc``
      * ``embeddings/oleObject*``                  - unpacked with ``olefile``
        and saved as ``.pdf`` / ``.docx`` / ``.doc`` depending on its content

    Each saved file is written to ``./run_document/`` and POSTed; when the
    response text contains exactly one run of digits it is recorded.

    Parameters:
        document: object exposing ``tables`` and ``part._rels``
            (presumably a python-docx ``Document`` - confirm with callers).
        tables: passed unchanged to ``loc_table`` to locate the table.
        akts_df: unused; kept so existing callers keep working.

    Returns:
        list of dicts with the keys ``"id"`` (digits from the upload reply),
        ``"sample_name"`` and ``"row"`` (index of the table row).

    Relies on the names ``sample_name`` and ``conclusion`` being defined
    outside this function.
    """
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    pdf_mime = 'application/pdf'

    table_xmls = [table._element.xml for table in document.tables]
    # Position of the table being extracted, i.e. the n-th table of the document.
    table_index = loc_table(tables)
    rows = _collect_table_rows(table_xmls[table_index])

    rels = document.part._rels
    docx_id_all = []

    # enumerate() yields the real row position; the former list.index() call
    # returned the first *equal* row and rescanned the list on every use.
    for row_index, row in enumerate(rows):
        for rid in row["docx"]:
            if rid not in rels:
                continue
            rel = rels[rid]
            target = rel.target_ref

            # NOTE(review): sample_name is concatenated like a string in the
            # first two branches but indexed per row in the OLE branches;
            # that mixed usage is kept as found - confirm its real type.
            if re.search(r"embeddings/Microsoft_Word___\d*", target):
                # Embedded .docx
                filename = sample_name + ".docx"
                reply = _save_and_upload(filename, rel.target_part.blob,
                                         docx_mime, "xxxx", 'Bearer ')
                upload_id = _single_id(reply)
                if upload_id is not None and sample_name != []:
                    docx_id_all.append({'id': upload_id,
                                        "sample_name": sample_name,
                                        "row": row_index})

            elif re.search(r"embeddings/Microsoft_Word_97_-_2003___\d*", target):
                # Embedded legacy .doc (uploaded with the .docx MIME type, as before)
                filename = sample_name + ".doc"
                reply = _save_and_upload(filename, rel.target_part.blob,
                                         docx_mime, "", 'Bearer ')
                upload_id = _single_id(reply)
                if upload_id is not None:
                    docx_id_all.append({'id': upload_id,
                                        "sample_name": sample_name,
                                        "row": row_index})

            elif re.search(r"embeddings/oleObject\d*", target):
                # Generic OLE package: the payload type is guessed from its bytes.
                ole_data = _read_ole_payload(rel.target_part.blob)
                # The substring checks run on str(bytes), i.e. the bytes repr,
                # exactly as before.
                ole_text = str(ole_data)

                if ole_data[0:5] == b'%PDF-' or 'pdf' in ole_text:
                    # Has a %PDF- header or mentions "pdf"
                    filename = sample_name[row_index] + ".pdf"
                    reply = _save_and_upload(filename, ole_data,
                                             pdf_mime, "", 'Bearer')
                    upload_id = _single_id(reply)
                    if upload_id is not None:
                        docx_id_all.append({'id': upload_id,
                                            "sample_name": sample_name,
                                            "row": row_index})

                elif 'docx' in ole_text:
                    # PDF-icon package that actually carries a .docx
                    filename = sample_name[row_index] + ".docx"
                    reply = _save_and_upload(filename, ole_data,
                                             pdf_mime, "", 'Bearer ')
                    upload_id = _single_id(reply)
                    if upload_id is not None:
                        if sample_name != []:
                            label = sample_name[row_index]
                        else:
                            label = conclusion[row_index]
                        docx_id_all.append({'id': upload_id,
                                            "sample_name": label,
                                            "row": row_index})

                elif 'doc' in ole_text:
                    # PDF-icon package that actually carries a .doc
                    # NOTE(review): the file name used to be the bare
                    # sample_name; it now follows the sibling branches
                    # (per-row name plus extension) - confirm.
                    filename = sample_name[row_index] + ".doc"
                    reply = _save_and_upload(filename, ole_data,
                                             pdf_mime, "", 'Bearer ')
                    upload_id = _single_id(reply)
                    if upload_id is not None:
                        if sample_name != []:
                            label = sample_name[row_index]
                        else:
                            label = conclusion[row_index]
                        docx_id_all.append({'id': upload_id,
                                            "sample_name": label,
                                            "row": row_index})

    return docx_id_all