使用 python-docx 提取 Word 文档中内嵌的 docx/doc/pdf 附件

段干飞翔
2023-12-01
# Upload endpoint and credential. Both were blanked out in the published
# snippet (the URL appeared as "xxxx" or "", the bearer token was empty), so
# they must be filled in before this code can talk to a real service.
_UPLOAD_URL = "xxxx"
_UPLOAD_AUTHORIZATION = 'Bearer '

# Directory every extracted attachment is written to before it is uploaded.
_SAVE_DIR = './run_document/'

_WORD_ROW_TAG = ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}tr"
_OLE_OBJECT_TAG = ".//{urn:schemas-microsoft-com:office:office}OLEObject"
_REL_ID_ATTR = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id'

_DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
_PDF_MIME = 'application/pdf'


def _attachment_label(row_index):
    """Return the name used for the attachment found in table row *row_index*.

    NOTE(review): ``sample_name`` and ``conclusion`` are not defined in this
    block; they are looked up as module-level names exactly as the original
    code did. The original treated ``sample_name`` both as a plain string
    (``sample_name + ".docx"``) and as a per-row list
    (``sample_name[row]``), so both shapes are accepted here. Confirm which
    one the caller actually provides.
    """
    names = sample_name
    if isinstance(names, str):
        return names
    if len(names) > 0:
        return names[row_index]
    # Fallback the original used when ``sample_name`` was an empty list.
    return conclusion[row_index]


def _save_and_upload(data, filename, mime_type):
    """Write *data* under ``_SAVE_DIR`` and POST it; return the response text."""
    import os

    import requests

    os.makedirs(_SAVE_DIR, exist_ok=True)
    path = os.path.join(_SAVE_DIR, filename)
    with open(path, "wb") as output_file:
        output_file.write(data)

    # NOTE(review): certificate verification is disabled (verify=False) and
    # the resulting warning is silenced, as in the original. Acceptable only
    # for a trusted internal endpoint.
    requests.packages.urllib3.disable_warnings()
    payload = {'width': '0',
               'height': '0'}
    headers = {'Authorization': _UPLOAD_AUTHORIZATION}

    # Upload the very file that was just written (the original read some
    # branches back from a different directory) and close the handle after.
    with open(path, 'rb') as upload_file:
        files = [('files', (filename, upload_file, mime_type))]
        response = requests.request("POST", _UPLOAD_URL, headers=headers,
                                    data=payload, files=files, verify=False)
    return response.text


def _parse_upload_id(response_text):
    """Return the single integer found in *response_text*, or ``None``.

    A response is only accepted when it contains exactly one run of digits;
    empty responses and responses with several numbers are ignored.
    """
    import re

    numbers = re.findall(r'\d+', response_text)
    if len(numbers) == 1:
        return numbers[0]
    return None


def _read_ole_payload(blob):
    """Return the raw bytes of the first known content stream in an OLE blob.

    The streams are tried in the order ``CONTENTS``, ``\\x01Ole10Native``,
    ``Contents``; ``b""`` is returned when none of them is present.
    """
    import olefile

    ole = olefile.OleFileIO(blob)
    try:
        streams = ole.listdir()
        for stream_name in ('CONTENTS', '\x01Ole10Native', 'Contents'):
            if [stream_name] in streams:
                # NOTE(review): the stream is returned as-is; for
                # Ole10Native this presumably still includes the packager
                # header in front of the file bytes. Verify downstream
                # consumers tolerate that.
                return ole.openstream(stream_name).read()
        return b""
    finally:
        ole.close()


def _classify_ole_payload(payload):
    """Map an OLE payload to ``(extension, mime_type)``, or ``None`` if unknown.

    Detection is a plain substring search on the raw bytes, checked in the
    order pdf, docx, doc.
    """
    if payload[0:5] == b'%PDF-' or b'pdf' in payload:
        return ".pdf", _PDF_MIME
    # NOTE(review): the original uploaded the docx/doc variants with the
    # 'application/pdf' MIME type as well; kept unchanged, confirm intent.
    if b'docx' in payload:
        return ".docx", _PDF_MIME
    if b'doc' in payload:
        # The extension was lost in the published snippet; ".doc" is inferred
        # from the parallel branches.
        return ".doc", _PDF_MIME
    return None


def extract_docx_document(document,tables,akts_df):
    """Extract attachments embedded in one table of a Word document and upload them.

    The table to inspect is ``document.tables[loc_table(tables)]``. Every
    ``<o:OLEObject>`` found in a table row is resolved through the document
    part's relationships and handled according to its target name:

    * ``embeddings/Microsoft_Word___N``          -> saved as ``.docx``
    * ``embeddings/Microsoft_Word_97_-_2003___N`` -> saved as ``.doc``
    * ``embeddings/oleObjectN``                   -> the OLE container is
      opened and its payload saved as ``.pdf``, ``.docx`` or ``.doc``
      depending on what the bytes contain; unrecognised payloads are skipped.

    Each file is written to ``./run_document/`` and POSTed to the upload
    endpoint.

    Args:
        document: python-docx document object (its ``tables`` and
            ``part._rels`` are read).
        tables: passed to ``loc_table`` to find the index of the table.
        akts_df: unused; kept so existing callers keep working.

    Returns:
        A list of dicts ``{'id', 'sample_name', 'row'}``, one per upload whose
        response contained exactly one integer. ``row`` is the index of the
        ``<w:tr>`` element the attachment was found in.
    """
    import re
    import xml.etree.ElementTree as ET

    table_index = loc_table(tables)  # position of the wanted table in the document
    root = ET.fromstring(document.tables[table_index]._element.xml)

    # Relationship ids of the OLE objects, grouped by table row. ".//" also
    # matches rows of nested tables, as the original search did.
    rel_ids_by_row = [
        [ole.attrib[_REL_ID_ATTR] for ole in row.findall(_OLE_OBJECT_TAG)]
        for row in root.findall(_WORD_ROW_TAG)
    ]

    rels = document.part._rels
    docx_id_all = []
    for row_index, rel_ids in enumerate(rel_ids_by_row):
        for rel_id in rel_ids:
            if rel_id not in rels:
                continue
            rel = rels[rel_id]
            target = rel.target_ref

            if re.search(r"embeddings/Microsoft_Word___\d*", target):
                data = rel.target_part.blob
                extension, mime_type = ".docx", _DOCX_MIME
            elif re.search(r"embeddings/Microsoft_Word_97_-_2003___\d*", target):
                data = rel.target_part.blob
                # NOTE(review): legacy .doc files are uploaded with the docx
                # MIME type, as in the original.
                extension, mime_type = ".doc", _DOCX_MIME
            elif re.search(r"embeddings/oleObject\d*", target):
                data = _read_ole_payload(rel.target_part.blob)
                detected = _classify_ole_payload(data)
                if detected is None:
                    continue
                extension, mime_type = detected
            else:
                continue

            label = _attachment_label(row_index)
            response_text = _save_and_upload(data, label + extension, mime_type)
            upload_id = _parse_upload_id(response_text)
            if upload_id is not None:
                docx_id_all.append({
                    'id': upload_id,
                    "sample_name": label,
                    "row": row_index,
                })

    return docx_id_all
使用 python-docx 提取 Word 文档中内嵌的 docx/doc/pdf 附件

相关阅读

相关文章

相关问答

相关文档