GMAP gff3格式转换与数据统计

来源:互联网 发布:cnc编程入门教学视频 编辑:程序博客网 时间:2024/03/29 22:11
##gff-version   3
# Generated by GMAP version 2016-06-09 using call:  gmapl.sse42 -D /export/data/ -d NRGenome --trim-end-exons=10 -t 32 --canonical-mode=2 --allow-close-indels=2 -B 4 -f 4 -n 0 ./unigene_seq_5.fasta
chr6B	NRGenome	EST_match	558820383	558820604	99	.	.	ID=UN227692.path1;Name=UN227692;Target=UN227692 1 222 +;Gap=M222;coverage=100.0;identity=99.1;matches=216;mismatches=2;indels=0;unknowns=4
###
chr7A	NRGenome	EST_match	683635472	683635624	100	.	.	ID=UN113387.path1;Name=UN113387;Target=UN113387 1 153 -;Gap=M153;coverage=100.0;identity=100.0;matches=153;mismatches=0;indels=0;unknowns=0
###
chr7D	NRGenome	EST_match	27592786	27593326	100	.	.	ID=UN128584.path1;Name=UN128584;Target=UN128584 1 541 -;Gap=M541;coverage=100.0;identity=100.0;matches=541;mismatches=0;indels=0;unknowns=0
###
chr4B	NRGenome	EST_match	505369881	505370146	99	.	.	ID=UN170802.path1;Name=UN170802;Target=UN170802 4 269 +;Gap=M266;coverage=98.9;identity=99.6;matches=265;mismatches=1;indels=0;unknowns=0
###
chr3A	NRGenome	EST_match	106517703	106518022	100	.	.	ID=UN181903.path1;Name=UN181903;Target=UN181903 1 320 -;Gap=M320;coverage=100.0;identity=100.0;matches=320;mismatches=0;indels=0;unknowns=0
###

结果文件

Query Target Start End Coverage Identity Matches Mismatches Indels Unknowns
UN227692 chr6B 558820383 558820604 100.0 99.1 216 2 0 4
UN113387 chr7A 683635472 683635624 100.0 100.0 153 0 0 0
UN128584 chr7D 27592786 27593326 100.0 100.0 541 0 0 0
UN170802 chr4B 505369881 505370146 98.9 99.6 265 1 0 0
UN181903 chr3A 106517703 106518022 100.0 100.0 320 0 0 0
UN076932 chr2B 452598011 452598795 99.1 99.2 781 4 2 0
UN067930 chr3D 23548729 23549162 100.0 100.0 434 0 0 0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Flatten GMAP gff3 (EST_match) output into a table and bin the alignments.

For every record of the input GFF3 file the script prints one
space-separated row (query name, target sequence, start, end, coverage,
identity, matches, mismatches, indels, unknowns) to stdout.  It also
counts the records per coverage tier / identity bin, writes that matrix to
``cal_file.txt`` and writes the query names of the high-coverage records
with identity >= 90 to ``cal_file2.txt``.

The code deliberately sticks to syntax accepted by both Python 2.7 and
Python 3 (the script was originally written with Python 2 ``print``
statements).
"""

# Default input/output locations used when the file is run as a script.
GFF_PATH = 'unigene_seq_5.fasta_gmapl_NRGenome.out.gff'
SUMMARY_PATH = 'cal_file.txt'
NAMES_PATH = 'cal_file2.txt'

TABLE_HEADER = ('Query', 'Target', 'Start', 'End', 'Coverage', 'Identity',
                'Matches', 'Mismatches', 'Indels', 'Unknowns')

# Attribute keys emitted after the name and the three positional columns.
STAT_KEYS = ('coverage', 'identity', 'matches', 'mismatches', 'indels',
             'unknowns')

# Inclusive lower bounds of the identity bins, best bin first.  Anything
# below the last bound falls into one extra "<90%" bin.
IDENTITY_FLOORS = (98.0, 97.0, 96.0, 95.0, 94.0, 90.0)
BIN_COUNT = len(IDENTITY_FLOORS) + 1

HIGH_COVERAGE_FLOOR = 90.0
LOW_COVERAGE_FLOOR = 50.0

SUMMARY_HEADER = ('Coverage/Identity\t100%-98%\t98%-97%\t97%-96%\t96%-95%'
                  '\t95%-94%\t93%-90%\t<90%\n')


def parse_attributes(column):
    """Return the ``key=value`` pairs of a GFF3 attribute column as a dict.

    Values are split on the first ``=`` only, so a value such as
    ``Target=UN227692 1 222 +`` is kept whole.  Looking values up by key
    replaces ``str.lstrip('Name=')``, which strips a *set of characters*
    rather than a prefix and therefore truncated names beginning with
    ``N``, ``a``, ``m`` or ``e``.
    """
    attributes = {}
    for item in column.split(';'):
        key, separator, value = item.partition('=')
        if separator:
            attributes[key.strip()] = value
    return attributes


def parse_record(line):
    """Return the ten table fields of one GFF3 line as a list of strings.

    Returns ``None`` for comment lines (starting with ``#``) and for blank
    lines.  Raises ``ValueError`` when the line is not a tab-separated
    record or lacks one of the required attributes.
    """
    stripped = line.strip()
    if not stripped or stripped.startswith('#'):
        return None
    columns = stripped.split('\t')
    if len(columns) < 5:
        raise ValueError('not a tab-separated GFF3 record: %r' % (line,))
    # The attribute column is the last one of the record.
    attributes = parse_attributes(columns[-1])
    try:
        name = attributes['Name']
        stats = [attributes[key] for key in STAT_KEYS]
    except KeyError as error:
        raise ValueError('missing attribute %s in record: %r' % (error, line))
    return [name, columns[0], columns[3], columns[4]] + stats


def identity_bin(identity):
    """Return the index of the identity bin (0 = ">=98", last = "<90")."""
    for index, floor in enumerate(IDENTITY_FLOORS):
        if identity >= floor:
            return index
    return len(IDENTITY_FLOORS)


def coverage_tier(coverage):
    """Return ``'high'`` (>=90), ``'low'`` (>=50 and <90) or ``None`` (<50)."""
    if coverage >= HIGH_COVERAGE_FLOOR:
        return 'high'
    if coverage >= LOW_COVERAGE_FLOOR:
        return 'low'
    return None


def format_summary(high_counts, low_counts, very_low):
    """Return the text written to the summary file.

    Each count is followed by a tab (including the last one of a row) and
    the final ``<50%`` line has no trailing newline, matching the layout
    the script has always produced.
    """
    def row(label, counts):
        return label + '\t' + ''.join('%d\t' % count for count in counts) + '\n'

    return (SUMMARY_HEADER
            + row('100%-90%', high_counts)
            + row('89%-50%', low_counts)
            + '<50%\t' + str(very_low))


def main(gff_path=GFF_PATH, summary_path=SUMMARY_PATH, names_path=NAMES_PATH):
    """Print the flat table for ``gff_path`` and write both report files."""
    high_counts = [0] * BIN_COUNT
    low_counts = [0] * BIN_COUNT
    very_low = 0

    # ``with`` guarantees both handles are flushed and closed, even when a
    # malformed record aborts the run.
    with open(names_path, 'w') as names_out, open(gff_path, 'r') as gff:
        # A single parenthesised string prints identically on Python 2 and 3.
        print(' '.join(TABLE_HEADER))
        for line in gff:
            fields = parse_record(line)
            if fields is None:
                continue
            print(' '.join(fields))

            tier = coverage_tier(float(fields[4]))
            if tier is None:
                very_low += 1
                continue

            bin_index = identity_bin(float(fields[5]))
            if tier == 'high':
                high_counts[bin_index] += 1
                # Names are recorded only for identity >= 90; records in
                # the "<90%" bin are counted but not listed.
                if bin_index < len(IDENTITY_FLOORS):
                    names_out.write(fields[0] + '\n')
            else:
                low_counts[bin_index] += 1

    with open(summary_path, 'w') as summary_out:
        summary_out.write(format_summary(high_counts, low_counts, very_low))


if __name__ == '__main__':
    main()
0 0