GMAP gff3格式转换与数据统计

鞠乐

2023-12-01

##gff-version   3
# Generated by GMAP version 2016-06-09 using call:  gmapl.sse42 -D /export/data/ -d NRGenome --trim-end-exons=10 -t 32 --canonical-mode=2 --allow-close-indels=2 -B 4 -f 4 -n 0 ./unigene_seq_5.fasta
chr6B	NRGenome	EST_match	558820383	558820604	99	.	.	ID=UN227692.path1;Name=UN227692;Target=UN227692 1 222 +;Gap=M222;coverage=100.0;identity=99.1;matches=216;mismatches=2;indels=0;unknowns=4
###
chr7A	NRGenome	EST_match	683635472	683635624	100	.	.	ID=UN113387.path1;Name=UN113387;Target=UN113387 1 153 -;Gap=M153;coverage=100.0;identity=100.0;matches=153;mismatches=0;indels=0;unknowns=0
###
chr7D	NRGenome	EST_match	27592786	27593326	100	.	.	ID=UN128584.path1;Name=UN128584;Target=UN128584 1 541 -;Gap=M541;coverage=100.0;identity=100.0;matches=541;mismatches=0;indels=0;unknowns=0
###
chr4B	NRGenome	EST_match	505369881	505370146	99	.	.	ID=UN170802.path1;Name=UN170802;Target=UN170802 4 269 +;Gap=M266;coverage=98.9;identity=99.6;matches=265;mismatches=1;indels=0;unknowns=0
###
chr3A	NRGenome	EST_match	106517703	106518022	100	.	.	ID=UN181903.path1;Name=UN181903;Target=UN181903 1 320 -;Gap=M320;coverage=100.0;identity=100.0;matches=320;mismatches=0;indels=0;unknowns=0
###

结果文件

Query Target Start End Coverage Identity Matches Mismatches Indels Unknowns
UN227692 chr6B 558820383 558820604 100.0 99.1 216 2 0 4
UN113387 chr7A 683635472 683635624 100.0 100.0 153 0 0 0
UN128584 chr7D 27592786 27593326 100.0 100.0 541 0 0 0
UN170802 chr4B 505369881 505370146 98.9 99.6 265 1 0 0
UN181903 chr3A 106517703 106518022 100.0 100.0 320 0 0 0
UN076932 chr2B 452598011 452598795 99.1 99.2 781 4 2 0
UN067930 chr3D 23548729 23549162 100.0 100.0 434 0 0 0

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Flatten GMAP GFF3 alignments into a table and bin them by quality.

For every alignment record in the GMAP ``-f 4`` (GFF3 EST_match) output the
script prints one space-separated row to stdout:

    Query Target Start End Coverage Identity Matches Mismatches Indels Unknowns

It also writes two files:

* ``cal_file2.txt`` -- one query name per line for every alignment with
  coverage >= 90 and identity >= 90.
* ``cal_file.txt``  -- a tab-separated coverage x identity count table.

The code runs unchanged on Python 2.7 and Python 3.
"""

GFF_PATH = 'unigene_seq_5.fasta_gmapl_NRGenome.out.gff'
NAMES_PATH = 'cal_file2.txt'
TABLE_PATH = 'cal_file.txt'

COLUMNS = ('Query', 'Target', 'Start', 'End', 'Coverage', 'Identity',
           'Matches', 'Mismatches', 'Indels', 'Unknowns')

# Attribute keys copied verbatim into the printed row, in output order.
STAT_KEYS = ('coverage', 'identity', 'matches', 'mismatches', 'indels',
             'unknowns')

# Inclusive lower bounds of the identity bins, best first.  Identities below
# the last bound fall into one extra trailing "<90" bin.
IDENTITY_FLOORS = (98.0, 97.0, 96.0, 95.0, 94.0, 90.0)
BIN_COUNT = len(IDENTITY_FLOORS) + 1


def identity_bin(identity):
    """Return the index of the identity bin that *identity* belongs to.

    Index 0 is ``>= 98``; the last index (``len(IDENTITY_FLOORS)``) is
    ``< 90``.
    """
    for index, floor in enumerate(IDENTITY_FLOORS):
        if identity >= floor:
            return index
    return len(IDENTITY_FLOORS)


def parse_attributes(column):
    """Split a GFF3 attribute column (``key=value;key=value``) into a dict.

    Values are taken as everything after the first ``=``.  The previous
    ``value.lstrip('Name=')`` approach stripped a *set of characters*, so it
    also ate leading ``N``/``a``/``m``/``e`` characters of the query name.
    """
    attributes = {}
    for item in column.split(';'):
        key, separator, value = item.partition('=')
        if separator:
            attributes[key] = value
    return attributes


def summarize(lines, emit_row, emit_name):
    """Tabulate GFF3 alignment lines and count them per quality class.

    Args:
        lines: iterable of raw GFF3 lines (e.g. an open file).
        emit_row: callable receiving the space-separated row text of every
            alignment record.
        emit_name: callable receiving the query name of every record with
            coverage >= 90 and identity >= 90.

    Returns:
        ``(high, low, very_low)`` where ``high`` and ``low`` are lists of
        ``BIN_COUNT`` counts (coverage >= 90 and 50 <= coverage < 90
        respectively, indexed by :func:`identity_bin`) and ``very_low`` is
        the number of records with coverage < 50.

    Raises:
        ValueError: if a record has fewer than nine columns, lacks a
            required attribute, or has a non-numeric coverage/identity.
    """
    high = [0] * BIN_COUNT
    low = [0] * BIN_COUNT
    very_low = 0

    for number, raw in enumerate(lines, 1):
        text = raw.strip()
        # Skip directives/comments ("##gff-version", "###") and blank lines.
        if not text or text.startswith('#'):
            continue

        columns = text.split('\t')
        if len(columns) < 9:
            raise ValueError(
                'line %d: expected 9 tab-separated columns, got %d'
                % (number, len(columns)))

        attributes = parse_attributes(columns[8])
        try:
            name = attributes['Name']
            stats = [attributes[key] for key in STAT_KEYS]
        except KeyError as missing:
            raise ValueError(
                'line %d: missing attribute %s' % (number, missing))

        emit_row(' '.join([name, columns[0], columns[3], columns[4]] + stats))

        # Parse the two numbers once instead of once per comparison.
        coverage = float(attributes['coverage'])
        identity = float(attributes['identity'])

        if coverage >= 90.0:
            high[identity_bin(identity)] += 1
            # Only well-covered hits with identity >= 90 are listed by name.
            if identity >= 90.0:
                emit_name(name)
        elif coverage >= 50.0:
            low[identity_bin(identity)] += 1
        else:
            very_low += 1

    return high, low, very_low


def format_counts(counts):
    """Render counts as tab-terminated fields.

    The trailing tab is kept on purpose so ``cal_file.txt`` stays
    byte-compatible with earlier runs of this script.
    """
    return ''.join('%d\t' % count for count in counts)


def main(gff_path=GFF_PATH, names_path=NAMES_PATH, table_path=TABLE_PATH):
    """Run the conversion: print the table, write the two summary files."""

    def print_row(text):
        # Single-argument print() behaves the same on Python 2 and 3.
        print(text)

    # ``with`` guarantees both outputs are flushed and closed even when a
    # malformed record aborts the run.
    with open(gff_path, 'r') as gff, open(names_path, 'w') as names_file:
        print_row(' '.join(COLUMNS))
        high, low, very_low = summarize(
            gff, print_row, lambda name: names_file.write(name + '\n'))

    with open(table_path, 'w') as table:
        table.write('Coverage/Identity\t100%-98%\t98%-97%\t97%-96%\t96%-95%\t95%-94%\t93%-90%\t<90%\n')
        table.write('100%-90%\t' + format_counts(high) + '\n')
        table.write('89%-50%\t' + format_counts(low) + '\n')
        table.write('<50%\t' + str(very_low))


if __name__ == '__main__':
    main()

GMAP gff3格式转换与数据统计

相关阅读

相关文章

相关问答

相关文档