test

来源:互联网 发布:linux 开机挂载 编辑:程序博客网 时间:2024/06/10 20:44
#!/usr/bin/perl -wuse strict;use warnings;use lib '/home/yanch/perl5/lib/perl5';use lib '/home/yanch/perl5/lib/share/perl5';use Getopt::Long;use Bio::SeqIO;use Cwd;use File::Basename;use subs::parallel;my %hash;my ($raw_reads,$raw_bases);my ($trim_reads,$trim_bases);my $name = shift;my $path = shift;my $lib = uc(shift);my $qual = shift;my $len = shift;my $head = shift;my $raw_path = getcwd;my $trim_path = "$raw_path/trim";my $fq_path = "$raw_path/fq";  unless (-e $trim_path &&  -e $fq_path){             system("mkdir -p $trim_path") ;             system("mkdir -p $fq_path")  ;          }raw_to_fq($name,$path,$lib);sub raw_to_fq{    my ($name,$path,$lib) = @_;     system ("/mnt/lustre/users/qiangang/pipeline/utils/fq_zcat.py -i $path -o $fq_path  -s $name -L $lib ") == 0 or die "Error occur in rawdata transfer to fastq $?\n"; }open FILE, ">> $trim_path/cleanFq.list" or die $!;if( lc($lib) eq "pe"){       my $rd1 = "$fq_path/$name\.1.fq";       my $rd2 = "$fq_path/$name\.2.fq";              pe_pipeline($name,$rd1,$rd2,$qual);       print FILE "$name\tPE\t$trim_path/$name/$name\_trim1.fq\t$trim_path/$name/$name\_trim2.fq\t$trim_path/$name/$name\_trim_unpaired.fq\n";  }elsif( lc($lib) eq "se"){       my $rd1 = "$fq_path/$name\.1.fq";              se_pipeline($name,$rd1,$qual);       print FILE "$name\tSE\t$trim_path/$name/$name\_trim_adpter.fq.trimmed.single\n";}     sub pe_pipeline {        my ($name,$rd1,$rd2,$qual) = @_;print "head:$head","\n";        open my $out ,">>",$trim_path."/raw_and_trimed.stat" or die "Can't open file to output\n";if($head == 1){        print $out "Sample_ID\tRawReads_Num\tRawBase_Num\tRaw_Q20\(%\)\tGC(%)\tN(%)\tTrimReads_Num\tTrimBase_Num\tTrim_Q20(%)\tClean(%)\tAdpter(%)\n";}############raw reads bases count #########################parallelize_sub('bases_count_pe');my ($raw_reads,$raw_bases) = bases_count_pe($rd1,$rd2);print "raw_reads:",$raw_reads,"\t",$raw_bases,"\n";##################### Q20 or Q30  ####################################parallelize_sub('calcu_quality_pe'); my ($q_raw) = calcu_quality_pe($rd1,$rd2,$qual);    print "q_raw:$q_raw";######################## Duplication ################################parallelize_sub('duplicate');duplicate ($rd1,$rd2);######################GC contain #################################parallelize_sub('calcu_GC_pe');my ($GC_pert,$Ns_pert) = calcu_GC_pe($rd1,$rd2);plot_stat($rd1,$rd2);###################trim fastq and caculte ###########################my ($flag,$trim1,$trim2,$adpter_num) = trim_fq($name,$rd1,$rd2);while($flag){      ($trim_reads,$trim_bases)= bases_count_pe($trim1,$trim2);      my ($q_trim) = calcu_quality_pe($trim1,$trim2,$qual);      my $clean = sprintf("%.2f",$trim_reads/$raw_reads*100);      my $adpter= sprintf("%.2f",2*$adpter_num/$raw_reads)*100;      print $out "$name\t$raw_reads\t$raw_bases\t$q_raw\t$GC_pert\t$Ns_pert\t$trim_reads\t$trim_bases\t$q_trim\t$clean\t$adpter\n";         $flag = 0;}}sub se_pipeline {        my ($name,$rd1,$qual) = @_;        open my $out ,">>","raw_and_trimed_se.stat" or die "Can't open file to output\n";if($head == 1){        print $out "Sample_ID\tRawReads_Num\tRawBase_Num\tRaw_Q20(%)\tGC(%)\tN(%)\tTrimReads_Num\tTrimBase_Num\tTrim_Q20(%)\tClean(%)\tAdpter(%)\n";}############raw reads bases count ########################($raw_reads,$raw_bases) = bases_count_se($rd1);##################### Q20 or Q30  ####################################my ($q_raw) = calcu_quality_se($rd1,$qual);######################## Duplication #################################duplicate ($rd1,$rd2);######################GC contain ################################my ($GC_pert,$Ns_pert) = calcu_GC_se($rd1);#plot_stat($rd1,$rd2);###################trim fastq and caculte ###########################my ($flag,$trim1,$adpter_num) = trim_fq($name,$rd1);if($flag){            ($trim_reads,$trim_bases)= bases_count_se($trim1);      my ($q_trim) = calcu_quality_se($trim1,$qual);      my $clean = sprintf("%.2f",$trim_reads/$raw_reads*100);      my $adpter= sprintf("%.2f",($adpter_num/$raw_reads*100));      print $out "$name\t$raw_reads\t$raw_bases\t$q_raw\t$GC_pert\t$Ns_pert\t$trim_reads\t$trim_bases\t$q_trim\t$clean\t$adpter\n";   }      }#########################subs###########################sub bases_count_pe{    my ($rd1,$rd2) = @_;    my $i;    my $bases;foreach my $file ($rd1,$rd2){    my $in = Bio::SeqIO -> new( -file => $file,                                -format => "Fastq",                              );    while( my $seq = $in -> next_seq){              $i++;              my $s = $seq -> seq;              $bases += length $s;                             }    }            return ($i,$bases);}sub bases_count_se{    my ($rd1) = @_;    my $i;    my $bases;  #   print "rd1\t$rd1","\n";  #   print "rd2\t$rd2","\n";    my $in = Bio::SeqIO -> new( -file => $rd1,                                -format => "Fastq",                              );    while( my $seq = $in -> next_seq()){              $i ++;              $bases += length $seq;              }            return ($i,$bases);}sub calcu_quality_pe{       my $result1;       my $result2;       my ($q_raw1,$q_raw2);       my $pert;       my ($rd1,$rd2,$score) = @_;       $result1 = `FastqTotalHighQualityBase.jar -i $rd1 -q $score ` ;       $result2 = `FastqTotalHighQualityBase.jar -i $rd2 -q $score ` ;       chomp $result1;       chomp $result2;       ($q_raw1) = (split"\t",$result1)[3];       ($q_raw2) = (split"\t",$result2)[3];       print $q_raw1,"\t",$q_raw2,"\n";        $pert = ($q_raw1 + $q_raw2)/2 ;       return $pert;}sub calcu_quality_se {       my $result1;       my $result2;       my ($q_raw1,$q_raw2);       my $pert;       my ($rd1,$score) = @_;       $result1 = `FastqTotalHighQualityBase.jar -i $rd1 -q $score ` ;       chomp $result1;       ($q_raw1) = (split"\t",$result1)[3];       return $q_raw1;}sub calcu_GC_pe{          my ($rd1,$rd2) = @_;          my ($line,$g,$c,$n,$Ns,$gc,$total,$gc_pert,$Ns_pert);          my ($file1,$subpath) = fileparse($rd1);          my ($file2) = fileparse($rd2);          my $out1 = "$fq_path/$file1\.stat";          my $out2 = "$fq_path/$file2\.stat";          system "fastx_quality_stats -Q 33 -i $rd1 -o $out1";                    system "fastx_quality_stats -Q 33 -i $rd2 -o $out2";                foreach my $key ($out1,$out2){          open IN ,"<", $key or die "Test Can't open $key $!";           while(<IN>){            chomp;             unless(/column/){             ($line,$g,$c,$n) = (split"\t",$_)[1,13,14,16];             $total += $line;               $gc += $g+$c;               $Ns  += $n;       }}            ($gc_pert) = sprintf("%.2f",$gc/$total*100);            ($Ns_pert) = sprintf("%.2f",$Ns/$total*100);           return ($gc_pert,$Ns_pert);}}sub calcu_GC_se{          my ($rd1) = @_;          my ($line,$g,$c,$n,$Ns,$gc,$total,$gc_pert,$Ns_pert);          my ($file1,$subpath) = fileparse($rd1);          my $out1 = "$fq_path/$file1\.stat";          system "fastx_quality_stats -Q 33 -i $rd1 -o $out1";                foreach my $key ($out1){          open IN ,"<", $key or die "Test Can't open $key $!";           while(<IN>){            chomp;             unless(/column/){             ($line,$g,$c,$n) = (split"\t",$_)[1,13,14,16];             $total += $line;               $gc += $g+$c;               $Ns  += $n;       }}            ($gc_pert) = sprintf("%.2f",$gc/$total*100);            ($Ns_pert) = sprintf("%.2f",$Ns/$total*100);}           return ($gc_pert,$Ns_pert);}sub plot_stat {        my ($rd1,$rd2) = @_;                my ($file1,$subpath) = fileparse($rd1);                my ($file2) = fileparse($rd2);                chdir "$fq_path";                open FA, ">> draw.sh" or die $!;                if($rd1 eq $rd2)                {                        my $out1 = "$file1.stat";                        print FA "/mnt/lustre/users/sunyong/bin/fqcheck/fqcheck_distribute.pl $out1 -size $len\n";                }else{                        my $out1 = "$file1.stat";                        my $out2 = "$file2.stat";                        print FA "/mnt/lustre/users/sunyong/bin/fqcheck/fqcheck_distribute.pl $out1 $out2 -size $len\n";                }                chdir "$raw_path";}sub duplicate {      my (@file) = @_;      foreach my $file (@file){         my %hash;         my $dup = 0;         my $total;         open my $in,"<",$file or die "Can't open $file : $!\n";         open my $out,">>","$fq_path/$name\_dup_stat.txt" or die "Can't open file to output\n";         while(<$in>)                 {                        my $line = <$in>;                        chomp($line);                        $hash{$line} ++;                        <$in>;                        <$in>;                        $total ++;                 }                 foreach my $k(keys %hash)                 {                        if($hash{$k} > 1)                        {                                $dup += $hash{$k} - 1;                        }                 }         my $pert = sprintf("%.2f%%",$dup/$total*100);         my ($id,$dir) = fileparse($file);         print $out "$id\t$pert\n";}}                                sub trim_fq{           my $flag = 0;           my ($i,$rd1,$rd2) = @_;                      chdir "$trim_path";           `mkdir $i`;            chdir "$i" ;          if($rd2){           `SeqPrep -f $rd1 -r $rd2 -1 $i\_clip_1.fq -2 $i\_clip_2.fq -3 $i\_discard_1.fq -4 $i\_discard_2.fq -q 20 -L 20 -B AGATCGGAAGAGCGTCGTGT -A AGATCGGAAGAGCACACGTC 2> $i\_trim_adpter.txt `;          `sickle pe -f $i\_clip_1.fq -r $i\_clip_2.fq -t sanger -l 25 -n -o $i\_trim1.fq -p $i\_trim2.fq -s $i\_trim_unpaired.fq >$i\_trim_sickle.log ` ;           $flag = 1;           my $adpter_num;           open ADPTER,"<","$i\_trim_adpter.txt" or die "can't open $!";           while(<ADPTER>){                   chomp;                   if(/Pairs With Adapters:\s+(\d+)/){                          $adpter_num = $1;                          print $adpter_num,"\n";                       }               }                               return ($flag,"$i\_trim1.fq","$i\_trim2.fq",$adpter_num);          # `echo -e "$PWD/$i\_trim1.fq\t$PWD/$i\_trim2.fq\t$PWD/$i\_trim_unpaired.fq" >> trim_fq.list;                      }else{            my $adpter_num;            `fastx_clipper -a AGATCGGAAGAGCACACGTC -Q 33 -l 25 -v -i $rd1 -o $i\_trim_adpter.fq >$i\_trim_adpter.txt`;            `DynamicTrim.pl $i\_trim_adpter.fq -h 20 -bwa -sanger`;            `LengthSort.pl $i\_trim_adpter.fq.trimmed -l 25`;           # `sickle se -f $i\_clip_1.fq -t sanger -l 25 -n -s $i\_trim_unpaired.fq >$i\_trim_sickle.log `;                         open ADPTER,"<","$i\_trim_adpter.txt" or die "can't open $!";            while(<ADPTER>){                     chomp;                     if(/(\d+)\s+adpter-only/){                         $adpter_num = $1;                         }                  }                          $flag = 1;            return ($flag,"$trim_path/$i/$i\_trim_adpter.fq.trimmed.single",$adpter_num);                 }                             chdir "$raw_path";                      }               


0 0