test
来源:互联网 发布:linux 开机挂载 编辑:程序博客网 时间:2024/06/10 20:44
#!/usr/bin/perl -wuse strict;use warnings;use lib '/home/yanch/perl5/lib/perl5';use lib '/home/yanch/perl5/lib/share/perl5';use Getopt::Long;use Bio::SeqIO;use Cwd;use File::Basename;use subs::parallel;my %hash;my ($raw_reads,$raw_bases);my ($trim_reads,$trim_bases);my $name = shift;my $path = shift;my $lib = uc(shift);my $qual = shift;my $len = shift;my $head = shift;my $raw_path = getcwd;my $trim_path = "$raw_path/trim";my $fq_path = "$raw_path/fq"; unless (-e $trim_path && -e $fq_path){ system("mkdir -p $trim_path") ; system("mkdir -p $fq_path") ; }raw_to_fq($name,$path,$lib);sub raw_to_fq{ my ($name,$path,$lib) = @_; system ("/mnt/lustre/users/qiangang/pipeline/utils/fq_zcat.py -i $path -o $fq_path -s $name -L $lib ") == 0 or die "Error occur in rawdata transfer to fastq $?\n"; }open FILE, ">> $trim_path/cleanFq.list" or die $!;if( lc($lib) eq "pe"){ my $rd1 = "$fq_path/$name\.1.fq"; my $rd2 = "$fq_path/$name\.2.fq"; pe_pipeline($name,$rd1,$rd2,$qual); print FILE "$name\tPE\t$trim_path/$name/$name\_trim1.fq\t$trim_path/$name/$name\_trim2.fq\t$trim_path/$name/$name\_trim_unpaired.fq\n"; }elsif( lc($lib) eq "se"){ my $rd1 = "$fq_path/$name\.1.fq"; se_pipeline($name,$rd1,$qual); print FILE "$name\tSE\t$trim_path/$name/$name\_trim_adpter.fq.trimmed.single\n";} sub pe_pipeline { my ($name,$rd1,$rd2,$qual) = @_;print "head:$head","\n"; open my $out ,">>",$trim_path."/raw_and_trimed.stat" or die "Can't open file to output\n";if($head == 1){ print $out "Sample_ID\tRawReads_Num\tRawBase_Num\tRaw_Q20\(%\)\tGC(%)\tN(%)\tTrimReads_Num\tTrimBase_Num\tTrim_Q20(%)\tClean(%)\tAdpter(%)\n";}############raw reads bases count #########################parallelize_sub('bases_count_pe');my ($raw_reads,$raw_bases) = bases_count_pe($rd1,$rd2);print "raw_reads:",$raw_reads,"\t",$raw_bases,"\n";##################### Q20 or Q30 ####################################parallelize_sub('calcu_quality_pe'); my ($q_raw) = calcu_quality_pe($rd1,$rd2,$qual); print "q_raw:$q_raw";######################## Duplication ################################parallelize_sub('duplicate');duplicate ($rd1,$rd2);######################GC contain #################################parallelize_sub('calcu_GC_pe');my ($GC_pert,$Ns_pert) = calcu_GC_pe($rd1,$rd2);plot_stat($rd1,$rd2);###################trim fastq and caculte ###########################my ($flag,$trim1,$trim2,$adpter_num) = trim_fq($name,$rd1,$rd2);while($flag){ ($trim_reads,$trim_bases)= bases_count_pe($trim1,$trim2); my ($q_trim) = calcu_quality_pe($trim1,$trim2,$qual); my $clean = sprintf("%.2f",$trim_reads/$raw_reads*100); my $adpter= sprintf("%.2f",2*$adpter_num/$raw_reads)*100; print $out "$name\t$raw_reads\t$raw_bases\t$q_raw\t$GC_pert\t$Ns_pert\t$trim_reads\t$trim_bases\t$q_trim\t$clean\t$adpter\n"; $flag = 0;}}sub se_pipeline { my ($name,$rd1,$qual) = @_; open my $out ,">>","raw_and_trimed_se.stat" or die "Can't open file to output\n";if($head == 1){ print $out "Sample_ID\tRawReads_Num\tRawBase_Num\tRaw_Q20(%)\tGC(%)\tN(%)\tTrimReads_Num\tTrimBase_Num\tTrim_Q20(%)\tClean(%)\tAdpter(%)\n";}############raw reads bases count ########################($raw_reads,$raw_bases) = bases_count_se($rd1);##################### Q20 or Q30 ####################################my ($q_raw) = calcu_quality_se($rd1,$qual);######################## Duplication #################################duplicate ($rd1,$rd2);######################GC contain ################################my ($GC_pert,$Ns_pert) = calcu_GC_se($rd1);#plot_stat($rd1,$rd2);###################trim fastq and caculte ###########################my ($flag,$trim1,$adpter_num) = trim_fq($name,$rd1);if($flag){ ($trim_reads,$trim_bases)= bases_count_se($trim1); my ($q_trim) = calcu_quality_se($trim1,$qual); my $clean = sprintf("%.2f",$trim_reads/$raw_reads*100); my $adpter= sprintf("%.2f",($adpter_num/$raw_reads*100)); print $out "$name\t$raw_reads\t$raw_bases\t$q_raw\t$GC_pert\t$Ns_pert\t$trim_reads\t$trim_bases\t$q_trim\t$clean\t$adpter\n"; } }#########################subs###########################sub bases_count_pe{ my ($rd1,$rd2) = @_; my $i; my $bases;foreach my $file ($rd1,$rd2){ my $in = Bio::SeqIO -> new( -file => $file, -format => "Fastq", ); while( my $seq = $in -> next_seq){ $i++; my $s = $seq -> seq; $bases += length $s; } } return ($i,$bases);}sub bases_count_se{ my ($rd1) = @_; my $i; my $bases; # print "rd1\t$rd1","\n"; # print "rd2\t$rd2","\n"; my $in = Bio::SeqIO -> new( -file => $rd1, -format => "Fastq", ); while( my $seq = $in -> next_seq()){ $i ++; $bases += length $seq; } return ($i,$bases);}sub calcu_quality_pe{ my $result1; my $result2; my ($q_raw1,$q_raw2); my $pert; my ($rd1,$rd2,$score) = @_; $result1 = `FastqTotalHighQualityBase.jar -i $rd1 -q $score ` ; $result2 = `FastqTotalHighQualityBase.jar -i $rd2 -q $score ` ; chomp $result1; chomp $result2; ($q_raw1) = (split"\t",$result1)[3]; ($q_raw2) = (split"\t",$result2)[3]; print $q_raw1,"\t",$q_raw2,"\n"; $pert = ($q_raw1 + $q_raw2)/2 ; return $pert;}sub calcu_quality_se { my $result1; my $result2; my ($q_raw1,$q_raw2); my $pert; my ($rd1,$score) = @_; $result1 = `FastqTotalHighQualityBase.jar -i $rd1 -q $score ` ; chomp $result1; ($q_raw1) = (split"\t",$result1)[3]; return $q_raw1;}sub calcu_GC_pe{ my ($rd1,$rd2) = @_; my ($line,$g,$c,$n,$Ns,$gc,$total,$gc_pert,$Ns_pert); my ($file1,$subpath) = fileparse($rd1); my ($file2) = fileparse($rd2); my $out1 = "$fq_path/$file1\.stat"; my $out2 = "$fq_path/$file2\.stat"; system "fastx_quality_stats -Q 33 -i $rd1 -o $out1"; system "fastx_quality_stats -Q 33 -i $rd2 -o $out2"; foreach my $key ($out1,$out2){ open IN ,"<", $key or die "Test Can't open $key $!"; while(<IN>){ chomp; unless(/column/){ ($line,$g,$c,$n) = (split"\t",$_)[1,13,14,16]; $total += $line; $gc += $g+$c; $Ns += $n; }} ($gc_pert) = sprintf("%.2f",$gc/$total*100); ($Ns_pert) = sprintf("%.2f",$Ns/$total*100); return ($gc_pert,$Ns_pert);}}sub calcu_GC_se{ my ($rd1) = @_; my ($line,$g,$c,$n,$Ns,$gc,$total,$gc_pert,$Ns_pert); my ($file1,$subpath) = fileparse($rd1); my $out1 = "$fq_path/$file1\.stat"; system "fastx_quality_stats -Q 33 -i $rd1 -o $out1"; foreach my $key ($out1){ open IN ,"<", $key or die "Test Can't open $key $!"; while(<IN>){ chomp; unless(/column/){ ($line,$g,$c,$n) = (split"\t",$_)[1,13,14,16]; $total += $line; $gc += $g+$c; $Ns += $n; }} ($gc_pert) = sprintf("%.2f",$gc/$total*100); ($Ns_pert) = sprintf("%.2f",$Ns/$total*100);} return ($gc_pert,$Ns_pert);}sub plot_stat { my ($rd1,$rd2) = @_; my ($file1,$subpath) = fileparse($rd1); my ($file2) = fileparse($rd2); chdir "$fq_path"; open FA, ">> draw.sh" or die $!; if($rd1 eq $rd2) { my $out1 = "$file1.stat"; print FA "/mnt/lustre/users/sunyong/bin/fqcheck/fqcheck_distribute.pl $out1 -size $len\n"; }else{ my $out1 = "$file1.stat"; my $out2 = "$file2.stat"; print FA "/mnt/lustre/users/sunyong/bin/fqcheck/fqcheck_distribute.pl $out1 $out2 -size $len\n"; } chdir "$raw_path";}sub duplicate { my (@file) = @_; foreach my $file (@file){ my %hash; my $dup = 0; my $total; open my $in,"<",$file or die "Can't open $file : $!\n"; open my $out,">>","$fq_path/$name\_dup_stat.txt" or die "Can't open file to output\n"; while(<$in>) { my $line = <$in>; chomp($line); $hash{$line} ++; <$in>; <$in>; $total ++; } foreach my $k(keys %hash) { if($hash{$k} > 1) { $dup += $hash{$k} - 1; } } my $pert = sprintf("%.2f%%",$dup/$total*100); my ($id,$dir) = fileparse($file); print $out "$id\t$pert\n";}} sub trim_fq{ my $flag = 0; my ($i,$rd1,$rd2) = @_; chdir "$trim_path"; `mkdir $i`; chdir "$i" ; if($rd2){ `SeqPrep -f $rd1 -r $rd2 -1 $i\_clip_1.fq -2 $i\_clip_2.fq -3 $i\_discard_1.fq -4 $i\_discard_2.fq -q 20 -L 20 -B AGATCGGAAGAGCGTCGTGT -A AGATCGGAAGAGCACACGTC 2> $i\_trim_adpter.txt `; `sickle pe -f $i\_clip_1.fq -r $i\_clip_2.fq -t sanger -l 25 -n -o $i\_trim1.fq -p $i\_trim2.fq -s $i\_trim_unpaired.fq >$i\_trim_sickle.log ` ; $flag = 1; my $adpter_num; open ADPTER,"<","$i\_trim_adpter.txt" or die "can't open $!"; while(<ADPTER>){ chomp; if(/Pairs With Adapters:\s+(\d+)/){ $adpter_num = $1; print $adpter_num,"\n"; } } return ($flag,"$i\_trim1.fq","$i\_trim2.fq",$adpter_num); # `echo -e "$PWD/$i\_trim1.fq\t$PWD/$i\_trim2.fq\t$PWD/$i\_trim_unpaired.fq" >> trim_fq.list; }else{ my $adpter_num; `fastx_clipper -a AGATCGGAAGAGCACACGTC -Q 33 -l 25 -v -i $rd1 -o $i\_trim_adpter.fq >$i\_trim_adpter.txt`; `DynamicTrim.pl $i\_trim_adpter.fq -h 20 -bwa -sanger`; `LengthSort.pl $i\_trim_adpter.fq.trimmed -l 25`; # `sickle se -f $i\_clip_1.fq -t sanger -l 25 -n -s $i\_trim_unpaired.fq >$i\_trim_sickle.log `; open ADPTER,"<","$i\_trim_adpter.txt" or die "can't open $!"; while(<ADPTER>){ chomp; if(/(\d+)\s+adpter-only/){ $adpter_num = $1; } } $flag = 1; return ($flag,"$trim_path/$i/$i\_trim_adpter.fq.trimmed.single",$adpter_num); } chdir "$raw_path"; }
0 0
- test
- test
- test
- test
- test
- test
- test
- Test
- test
- Test
- test
- test
- test
- test
- test
- test
- test
- Test
- 程序员最艰巨的十大任务
- viso 形状搜索 无法使用
- Windows下获取Dump文件以及进程下各线程调用栈的方法总结
- 如何实现掩码位图的透明显示
- ZOJ-1405
- test
- 利用录音键改装MP3接驳器
- CvArr、Mat、CvMat、IplImage、BYTE转换
- 如何在 windowsXP 和 node.js 环境下,安装 sqlite3
- ZOJ-1410
- OpenJDK和JDK区别
- Struts1线程问题
- 更改Itunes备份路径-最简便方法(win7)
- 如何搭建一个 Data Guard 环境