当前位置: 首页 > 工具软件 > Motif > 使用案例 >

从meme的motif查询结果中获取含有某个motif的基因名称列表(python解析xml)

颜镜
2023-12-01
from xml.etree.ElementTree import ElementTree as ET
import pandas as pd

def read_xml(file):
    """Parse an XML document and return it as an ElementTree.

    Keyword arguments:
    file         -- Whatever ``ElementTree.parse`` accepts as its source
                    (it is passed through unchanged).

    """
    document = ET()
    document.parse(file)
    return document

def find_node(tree,path):
    """Return the result of ``tree.findall(path)``.

    Keyword arguments:
    tree         -- Any object exposing a ``findall`` method.
    path         -- The path expression handed to ``findall``.

    """
    matches = tree.findall(path)
    return matches

def get_generator_length(generator):
    """Count the items produced by an iterable.

    The iterable is walked to the end, so a one-shot generator is
    exhausted after this call.

    Keyword arguments:
    generator    -- The iterable whose items are counted.

    """
    count = 0
    for _ in generator:
        count += 1
    return count

def keymap_replace(
        string: str, 
        mappings: dict,
    ) -> str:
    """Translate a whole string through a dictionary.

    The string is lower-cased and then looked up in the mappings as a
    whole: if the lower-cased string is a key, the mapped value is
    returned, otherwise the lower-cased string itself is returned.
    For example, with the mappings {"sequence_1": "geneA"} the string
    "sequence_1" becomes "geneA", while "sequence_10" is returned
    unchanged because only exact matches are replaced.

    Keyword arguments:
    string       -- The string to translate.
    mappings     -- A dictionary of whole-string replacements. Keys are
                    compared against the lower-cased string.

    """
    lowered = string.lower()
    # A single lookup (rather than a loop over every mapping) guarantees
    # at most one substitution: a replacement value that happens to equal
    # another key is not translated a second time.
    return mappings.get(lowered, lowered)

# Read the MEME XML report.
tree = read_xml("meme.xml")
root = tree.getroot()

# Build the sequence id -> sequence name lookup from the training set.
id2name_dict = {}
for seq_elem in tree.iterfind("training_set/sequence"):
    seq_attrib = seq_elem.attrib
    id2name_dict[seq_attrib["id"]] = seq_attrib["name"]

# Walk the <motif> elements once, in document order. Each motif yields a
# column name (its id) together with the names of the sequences listed as
# its contributing sites, so columns and values cannot get out of step and
# the number of motifs is not limited to a fixed count.
columns = []
value_list = []
for motif in tree.iterfind("motifs/motif"):
    columns.append(motif.attrib['id'])
    value_list.append([
        keymap_replace(site.attrib["sequence_id"], id2name_dict)
        for site in find_node(motif, "contributing_sites/contributing_site")
    ])

data = dict(zip(columns, value_list))
# The columns generally have different lengths; wrapping each list in a
# Series lets pandas align them and pad the shorter ones with NaN.
df = pd.DataFrame({k: pd.Series(v) for k, v in data.items()})

df.to_csv('meme.csv', sep=',', index=False)

 类似资料: