from xml.etree.ElementTree import ElementTree as ET
import pandas as pd
def read_xml(file):
    """Parse an XML document into an ElementTree.

    Keyword arguments:
    file -- A file name or file object, passed on to the tree's
            ``parse`` method.
    """
    document = ET()
    document.parse(file)
    return document
def find_node(tree, path):
    """Return the result of ``tree.findall(path)``.

    Keyword arguments:
    tree -- An object exposing a ``findall`` method.
    path -- The path expression handed to ``findall`` unchanged.
    """
    matches = tree.findall(path)
    return matches
def get_generator_length(generator):
    """Count the items an iterable yields.

    The iterable is walked to the end, so a one-shot generator is
    exhausted once this function returns.

    Keyword arguments:
    generator -- Any iterable; its items themselves are ignored.
    """
    count = 0
    for _ in generator:
        count += 1
    return count
def keymap_replace(
        string: str,
        mappings: dict,
) -> str:
    """Look up a lower-cased string in a dictionary of replacements.

    The input is lower-cased and then compared against the keys of
    ``mappings`` as a whole; substrings are never replaced. For
    example, with the mappings
    {"sequence_1": "geneA", "sequence_12": "geneB"}, the string
    "sequence_12" returns "geneB" (the shorter key "sequence_1" does
    not touch it), and the unmatched string "Other" returns "other".

    Keyword arguments:
    string -- The string to look up (lower-cased before matching).
    mappings -- A dictionary mapping whole strings to replacements.

    Returns the replacement when the lower-cased string is a key of
    ``mappings``, otherwise the lower-cased string itself.
    """
    lowered = string.lower()
    # One exact lookup instead of scanning every key: the result is never
    # fed back through the mappings, so a replacement that happens to equal
    # another key is not replaced a second time.
    return mappings.get(lowered, lowered)
# Parse the MEME XML report.
tree = read_xml("meme.xml")
root = tree.getroot()

# Map each training-set sequence id to its name.
id2name_dict = {
    sequence.attrib["id"]: sequence.attrib["name"]
    for sequence in tree.iterfind("training_set/sequence")
}

# Build one output column per motif found in the document: the column name
# is the motif id, and the values are the names of the sequences that
# contribute a site to that motif. Iterating over the motifs that are
# actually present (instead of the fixed ids motif_1..motif_10) keeps every
# column paired with its own motif and handles any number of motifs.
columns = []
value_list = []
for motif in tree.iterfind("motifs/motif"):
    columns.append(motif.attrib["id"])
    value_list.append([
        # keymap_replace lower-cases the id and swaps it for the sequence
        # name when the id is known; unknown ids are kept (lower-cased).
        keymap_replace(site.attrib["sequence_id"], id2name_dict)
        for site in motif.iterfind("contributing_sites/contributing_site")
    ])

data = dict(zip(columns, value_list))

# Motifs have different numbers of contributing sites. Wrapping each list in
# a Series lets pandas align the columns on the index and pad the shorter
# ones with missing values instead of rejecting unequal lengths.
df = pd.DataFrame({column: pd.Series(names) for column, names in data.items()})
df.to_csv('meme.csv', sep=',', index=False)