取转录本fasta最长的当作基因fasta

来源:互联网 发布:java经典程序 源代码 编辑:程序博客网 时间:2024/06/04 17:46
#!/usr/bin/env perluse warnings;use strict;use Bio::SeqIO;die "perl $0 <fasta> > <outfile>\n" if(@ARGV != 1);my @len = `fastalength $ARGV[0]`;my %xsh = map { chomp; my($length,$name)=split /\s+/; $name=>$length } @len;my %hash;open FA, $ARGV[0] or die $!;while(<FA>){chomp;next if($_ !~ /^>/);if(/gene:([^ ]+) transcript:([^ ]+)/){if(exists $xsh{$2}){push @{$hash{$1}}, "$2 $xsh{$2}";}}}my %trans;open OUT, ">longest_transcript.list" or die $!;foreach my $g(keys %hash){my $tmp = 0;my $lt;foreach my $t(@{$hash{$g}}){my @x = split / /, $t;($x[1] > $tmp) ? $lt = $x[0] : next;}$trans{$lt} = 0;print OUT "$lt\n";}$/ = "\n>";open FB, $ARGV[0] or die $!;while(<FB>){chomp;s/^>//;my @tmp = split /\n/;my @t = split /\s+/, $tmp[0];if(exists $trans{$t[0]}){print ">$_\n";}}&tt;sub tt{my $t = localtime;print STDERR "$t\n";}=cutmy ($seq_hash,$gene_hash) = &read_fasta(shift @ARGV);foreach my$gene(sort keys %$gene_hash){my @names = @{$gene_hash->{$gene}};my $name;my $len = 0;foreach (@names){if ($seq_hash->{$_}->{len} > $len){$name = $_;$len = $seq_hash->{$_}->{len};}}print "$name\n$seq_hash->{$name}{seq}\n";}sub read_fasta{my $file = shift;my $inseq = Bio::SeqIO->new(-file=>$file,-format=>"fasta");my %hash;my %gene;while(my$seq=$inseq->next_seq){my $name = $seq->id;my $gene = $1 if ($seq->desc =~ /gene:(\S+)/);$hash{$name}{seq} = $seq->seq;$hash{$name}{len} = $seq->length;push @{$gene{$gene}} , $name;}return (\%hash,\%gene);}

0 0
原创粉丝点击