Beginning Perl for Bioinformatics 总结提升

来源：互联网发布：网络写手怎样才有收入编辑：程序博客网时间：2024/06/06 18:00

1 Biology and Computer Science2 Getting Started with Perl3 The Art of Programming4 Sequences and Strings

5 Motifs and LoopsExample 53. Searching for motifs #!/usr/bin/perl w # Searching for motifs # Ask the user for the filename of the file containing # the protein sequence data, and collect it from the keyboard print "Please type the filename of the protein sequence data: ";$proteinfilename = <STDIN>; # Remove the newline from the protein filename chomp $proteinfilename; # open the file, or exit unless ( open(PROTEINFILE, $proteinfilename) ) { print "Cannot open file \"$proteinfilename\"\n\n"; exit; } # Read the protein sequence data from the file, and store it # into the array variable @protein @protein = <PROTEINFILE>; # Close the file  we've read all the data into @protein now. close PROTEINFILE; # Put the protein sequence data into a single string, as it's easier # to search for a motif in a string than in an array of # lines (what if the motif occurs over a line break?) $protein = join( '', @protein); # Remove whitespace $protein =~ s/\s//g; # In a loop, ask the user for a motif, search for the motif, # and report if it was found. # Exit if no motif is entered. do { print "Enter a motif to search for: "; $motif = <STDIN>; # Remove the newline at the end of $motif chomp $motif; # Look for the motif if ( $protein =~ /$motif/ ) { print "I found it!\n\n"; } else { print "I couldn\'t find it.\n\n"; }# exit on an empty user input } until ( $motif =~ /^\s*$/ ); # exit the program exit; #!/usr/bin/perl w # Determining frequency of nucleotides, take 2 # Get the DNA sequence data print "Please type the filename of the DNA sequence data: "; $dna_filename = <STDIN>; chomp $dna_filename; # Does the file exist? unless ( e $dna_filename) { print "File \"$dna_filename\" doesn\'t seem to exist!!\n"; exit; } # Can we open the file? unless ( open(DNAFILE, $dna_filename) ) { print "Cannot open file \"$dna_filename\"\n\n"; exit; } @DNA = <DNAFILE>; close DNAFILE; $DNA = join( '', @DNA); # Remove whitespace $DNA =~ s/\s//g; # Initialize the counts. # Notice that we can use scalar variables to hold numbers. $count_of_A = 0; $count_of_C = 0; $count_of_G = 0; $count_of_T = 0; $errors     = 0; # In a loop, look at each base in turn, determine which of the # four types of nucleotides it is, and increment the # appropriate count. for ( $position = 0 ; $position < length $DNA ; ++$position ) { $base = substr($DNA, $position, 1);if     ( $base eq 'A' ) { ++$count_of_A; } elsif ( $base eq 'C' ) { ++$count_of_C; } elsif ( $base eq 'G' ) { ++$count_of_G; } elsif ( $base eq 'T' ) { ++$count_of_T; } else { print "!!!!!!!! Error  I don\'t recognize this base: $base\n"; ++$errors; } } # print the results print "A = $count_of_A\n"; print "C = $count_of_C\n"; print "G = $count_of_G\n"; print "T = $count_of_T\n"; print "errors = $errors\n"; # exit the program exit; Here's the output of Example 56: Please type the filename of the DNA sequence data: small.dna !!!!!!!! Error  I don't recognize this vase: V A = 40 C = 27 G = 24 T = 17 errors = 1 6 Subroutines and Bugs7 Mutations and Randomization8 The Genetic Code9 Restriction Maps and Regular Expressions10 GenBankExtract annotation and sequence from GenBank file #!/usr/bin/perl # Extract annotation and sequence from GenBank file use strict; use warnings; use BeginPerlBioinfo;     # see Chapter 6 about this module # declare and initialize variables my @annotation = (  ); my $sequence = ''; my $filename = 'record.gb'; parse1(\@annotation, \$sequence, $filename); # Print the annotation, and then #   print the DNA in new format just to check if we got it okay. print @annotation; print_sequence($sequence, 50); exit; #######################sub parse1 { my($annotation, $dna, $filename) = @_; # $annotationreference to array # $dna reference to scalar # $filename  scalar # declare and initialize variables my $in_sequence = 0; my @GenBankFile = (  ); # Get the GenBank data into an array from a file @GenBankFile = get_file_data($filename); # Extract all the sequence lines foreach my $line (@GenBankFile) { if( $line =~ /^\/\/\n/ ) { # If $line is endofrecord line //\n, last; #break out of the foreach loop. } elsif( $in_sequence) { # If we know we're in a sequence, $$dna .= $line; # add the current line to $$dna. } elsif ( $line =~ /^ORIGIN/ ) { # If $line begins a sequence, $in_sequence = 1; # set the $in_sequence flag. } else{ # Otherwise push( @$annotation, $line); # add the current line to @annotation. } } # remove whitespace and line numbers from DNA sequence $$dna =~ s/[\s09]//g; } Extract annotation and sequence from Genbank record #!/usr/bin/perl # Extract the annotation and sequence sections from the first #   record of a GenBank library use strict; use warnings; use BeginPerlBioinfo;# Declare and initialize variables my $annotation = ''; my $dna = ''; my $record = ''; my $filename = 'record.gb'; my $save_input_separator = $/; # Open GenBank library file unless (open(GBFILE, $filename)) { print "Cannot open GenBank file \"$filename\"\n\n"; exit; } # Set input separator to "//\n" and read in a record to a scalar $/ = "//\n"; $record = <GBFILE>; # reset input separator $/ = $save_input_separator; # Now separate the annotation from the sequence data ($annotation, $dna) = ($record =~ /^(LOCUS.*ORIGIN\s*\n)(.*)\/\/\n/s); # Print the two pieces, which should give us the same as the #  original GenBank file, minus the // at the end print $annotation, $dna; exit;Parsing GenBank annotations using arrays #!/usr/bin/perl # Parsing GenBank annotations using arrays use strict; use warnings; use BeginPerlBioinfo;     # see Chapter 6 about this module # Declare and initialize variables my @genbank = (  ); my $locus = ''; my $accession = ''; my $organism = ''; # Get GenBank file data @genbank = get_file_data('record.gb');# Let's start with something simple.  Let's get some of the identifying # information, let's say the locus and accession number (here the same # thing) and the definition and the organism. for my $line (@genbank) { if($line =~ /^LOCUS/) { $line =~ s/^LOCUS\s*//; $locus = $line; }elsif($line =~ /^ACCESSION/) { $line =~ s/^ACCESSION\s*//; $accession = $line; }elsif($line =~ /^  ORGANISM/) { $line =~ s/^\s*ORGANISM\s*//; $organism = $line; } } print "*** LOCUS ***\n"; print $locus; print "*** ACCESSION ***\n"; print $accession; print "*** ORGANISM ***\n"; print $organism; exit; Listing the contents of a folder (or directory) #!/usr/bin/perl #   Demonstrating how to open a folder and list its contents use strict; use warnings; use BeginPerlBioinfo;     # see Chapter 6 about this module my @files = (  ); my $folder = 'pdb'; # open the folder unless(opendir(FOLDER, $folder)) { print "Cannot open folder $folder!\n"; exit; } # read the contents of the folder (i.e. the files and subfolders) @files = readdir(FOLDER); # close the folder closedir(FOLDER); # print them out, one per line print join( "\n", @files), "\n"; exit; Extract sequence chains from PDB file #!/usr/bin/perl #  Extract sequence chains from PDB file use strict; use warnings; use BeginPerlBioinfo;# Read in PDB file:  Warning  some files are very large! my @file = get_file_data('pdb/c1/pdb1c1f.ent'); # Parse the record types of the PDB file my %recordtypes = parsePDBrecordtypes(@file); # Extract the amino acid sequences of all chains in the protein my @chains = extractSEQRES( $recordtypes{'SEQRES'} ); # Translate the 3character codes to 1character codes, and print foreach my $chain (@chains) { print "****chain $chain **** \n";print "$chain\n"; print iub3to1($chain), "\n"; } exit;# parsePDBrecordtypes # #given an array of a PDB file, return a hash with #    keys   = record type names #    values = scalar containing lines for that record type sub parsePDBrecordtypes { my @file = @_; use strict; use warnings; my %recordtypes = (  ); foreach my $line (@file) { # Get the record type name which begins at the # start of the line and ends at the first space my($recordtype) = ($line =~ /^(\S+)/); # .= fails if a key is undefined, so we have to # test for definition and use either .= or = depending if(defined $recordtypes{$recordtype} ) { $recordtypes{$recordtype} .= $line; }else{ $recordtypes{$recordtype} = $line; } } return %recordtypes; } # extractSEQRES # #given an scalar containing SEQRES lines, #    return an array containing the chains of the sequencesub extractSEQRES { use strict; use warnings; my($seqres) = @_; my $lastchain = ''; my $sequence = ''; my @results = (  ); # make array of lines my @record = split ( /\n/, $seqres); foreach my $line (@record) { # Chain is in column 12, residues start in column 20 my ($thischain) = substr($line, 11, 1); my($residues)  = substr($line, 19, 52); # add space at end # Check if a new chain, or continuation of previous chain if("$lastchain" eq "") { $sequence = $residues; }elsif("$thischain" eq "$lastchain") { $sequence .= $residues; # Finish gathering previous chain (unless first record) }elsif ( $sequence ) { push(@results, $sequence); $sequence = $residues; } $lastchain = $thischain; } # save last chain push(@results, $sequence); return @results; } # iub3to1 # #change string of 3character IUB amino acid codes (whitespace separated) #    into a string of 1character amino acid codes sub iub3to1 { my($input) = @_;my %three2one = ( 'ALA' => 'A', 'VAL' => 'V', 'LEU' => 'L', 'ILE' => 'I', 'PRO' => 'P', 'TRP' => 'W', 'PHE' => 'F', 'MET' => 'M', 'GLY' => 'G', 'SER' => 'S', 'THR' => 'T', 'TYR' => 'Y', 'CYS' => 'C', 'ASN' => 'N', 'GLN' => 'Q', 'LYS' => 'K', 'ARG' => 'R', 'HIS' => 'H', 'ASP' => 'D', 'GLU' => 'E', ); # clean up the input $input =~ s/\n/ /g; my $seq = ''; # This use of split separates on any contiguous whitespace my @code3 = split(' ', $input); foreach my $code (@code3) { # A little error checking if(not defined $three2one{$code}) { print "Code $code not defined\n"; next; } $seq .= $three2one{$code}; } return $seq; } 12 BLASTExtract annotation and alignments from BLAST output file #!/usr/bin/perl # Extract annotation and alignments from BLAST output file use strict; use warnings; use BeginPerlBioinfo;     # declare and initialize variablesmy $beginning_annotation = ''; my $ending_annotation = ''; my %alignments = (  ); my $filename = 'blast.txt'; parse_blast(\$beginning_annotation, \$ending_annotation, \%alignments, $filename); # Print the annotation, and then #   print the DNA in new format just to check if we got it okay. print $beginning_annotation; foreach my $key (keys %alignments) { print "$key\nXXXXXXXXXXXX\n", $alignments{$key}, "\nXXXXXXXXXXX\n"; } print $ending_annotation; exit; # parse_blast # # parse beginning and ending annotation, and alignments, #     from BLAST output file sub parse_blast { my($beginning_annotation, $ending_annotation, $alignments, $filename) = @_; # $beginning_annotationreference to scalar # $ending_annotation  reference to scalar # $alignments reference to hash # $filename scalar # declare and initialize variables my $blast_output_file = ''; my $alignment_section = ''; # Get the BLAST program output into an array from a file $blast_output_file = join( '', get_file_data($filename));# Extract the beginning annotation, alignments, and ending annotation ($$beginning_annotation, $alignment_section, $$ending_annotation) = ($blast_output_file =~ /(.*^ALIGNMENTS\n)(.*)(^ Database:.*)/ms); # Populate %alignments hash # key = ID of hit # value = alignment section %$alignments = parse_blast_alignment($alignment_section); } # parse_blast_alignment # # parse the alignments from a BLAST output file, #       return hash with #       key = ID #       value = text of alignment sub parse_blast_alignment { my($alignment_section) = @_; # declare and initialize variables my(%alignment_hash) = (  ); # loop through the scalar containing the BLAST alignments, # extracting the ID and the alignment and storing in a hash # # The regular expression matches a line beginning with >, # and containing the ID between the first pair of | characters; # followed by any number of lines that don't begin with > while($alignment_section =~ /^>.*\n(^(?!>).*\n)+/gm) { my($value) = $&; my($key) = (split(/\|/, $value)) [1]; $alignment_hash{$key} = $value; } return %alignment_hash; }13 Further Topics

0 0