Mercurial > repos > galaxyp > nbic_fasta
view ProteinDigestor.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w ############################################################### # ProteinDigestor.pl # # ===================================================== # $Id: ProteinDigestor.pl 76 2011-01-04 13:16:56Z pieter.neerincx@gmail.com $ # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ProteinDigestor.pl $ # $LastChangedDate: 2011-01-04 07:16:56 -0600 (Tue, 04 Jan 2011) $ # $LastChangedRevision: 76 $ # $LastChangedBy: pieter.neerincx@gmail.com $ # ===================================================== # # (c) Dr. Ir. B. van Breukelen et al. # https://bioinformatics.chem.uu.nl # http://www.netherlandsproteomicscentre.eu/ # b.vanbreukelen@pharm.uu.nl # # Software to create peptide databases from a fasta # file of proteins. You can cut with several enzymes # select on size, PI, etc etc etc. # # TODO: put filters, enzymes, in seperate config file ############################################################### ############################# # Initialise environment ############################# use strict; use Getopt::Std; use Log::Log4perl qw(:easy); # # Log levels for Log4perl. # my %log_levels = ( 'ALL' => $ALL, 'TRACE' => $TRACE, 'DEBUG' => $DEBUG, 'INFO' => $INFO, 'WARN' => $WARN, 'ERROR' => $ERROR, 'FATAL' => $FATAL, 'OFF' => $OFF, ); # # pK values array # my %cPk = ( 'A'=>[3.55, 7.59, 0 , 0 , 0], 'B'=>[3.55, 7.50, 0 , 0 , 0], 'C'=>[3.55, 7.50, 9.00 , 9.00 , 9.00], 'D'=>[4.55, 7.50, 4.05 , 4.05 , 4.05], 'E'=>[4.75, 7.70, 4.45 , 4.45 , 4.45], 'F'=>[3.55, 7.50, 0 , 0. , 0], 'G'=>[3.55, 7.50, 0 , 0. , 0], 'H'=>[3.55, 7.50, 5.98 , 5.98 , 5.98], 'I'=>[3.55, 7.50, 0 , 0. , 0.], 'J'=>[0.00, 0.00, 0 , 0. , 0.], 'K'=>[3.55, 7.50, 10.00, 10.00, 10.00], 'L'=>[3.55, 7.50, 0 , 0. , 0.], 'M'=>[3.55, 7.00, 0 , 0. , 0.], 'N'=>[3.55, 7.50, 0 , 0. , 0.], 'O'=>[0.00, 0.00, 0 , 0. , 0.], 'P'=>[3.55, 8.36, 0 , 0. , 0.], 'Q'=>[3.55, 7.50, 0 , 0. , 0.], 'R'=>[3.55, 7.50, 12.0 , 12.0 , 12.0], 'S'=>[3.55, 6.93, 0 , 0. , 0.], 'T'=>[3.55, 6.82, 0 , 0. , 0.], 'U'=>[0.00, 0.00, 0 , 0. , 0.], 'V'=>[3.55, 7.44, 0 , 0. , 0.], 'W'=>[3.55, 7.50, 0 , 0. , 0.], 'X'=>[3.55, 7.50, 0 , 0. , 0.], 'Y'=>[3.55, 7.50, 10.00, 10.00, 10.00], 'Z'=>[3.55, 7.50, 0 , 0. , 0.] ); # # MW values array # my %cMWs = ( 'A'=>89.09, 'B'=>132.65, 'C'=>121.15, 'D'=>133.1, 'E'=>147.13, 'F'=>165.19, 'G'=>75.07, 'H'=>155.16, 'I'=>131.18, 'J'=>0.00, 'K'=>146.19, 'L'=>131.18, 'M'=>149.22, 'N'=>132.12, 'O'=>0.00, 'P'=>115.13, 'Q'=>146.15, 'R'=>174.21, 'S'=>105.09, 'T'=>119.12, 'U'=>168.05, 'V'=>117.15, 'W'=>204.22, 'X'=>129.26, 'Y'=>181.19, 'Z'=>146.73 ); # # Digestion enzymes. # # For example Trypsin cuts c-terminal of K and R not followed by a P at position p1 = c:K,R:P=1 # my %enzymes = ( 'Trypsin' => 'c:K,R:P=1', 'Trypsin/P' => 'c:K,R', 'Chymotrypsin' => 'c:F,L,W,Y:P=1', 'Chymotrypsin/P' => 'c:F,L,W,Y', 'V8_E' => 'c:E,Z:P=1', 'V8_DE' => 'c:E,D,B,Z:P=1', 'LysC' => 'c:K:P=1', 'LysC/P' => 'c:K', 'LysN/P' => 'n:K' ); my @filters = ( # "[R|K][R|K][R|K]..[S|T]." # "[R|K][R|K][R|K].[S|T]." , # "[R|K][R|K]..[S|T].", # "[R|K]..[S|T].", # "[R|K][R|K].[S|T].", # "[R|K].[S|T]." # "[S|T].[R|K]" ); my %charges = ('A'=>0, 'B'=>0, 'C'=>0, 'D'=>-1, 'E'=>-1, 'F'=>0,'G'=>0, 'H'=>1, 'I'=>0, 'J'=>0, 'K'=>1, 'L'=>0, 'M'=>0, 'N'=>0, 'O'=>0,'P'=>0, 'Q'=>0, 'R'=>1, 'S'=>0, 'T'=>0, 'U'=>0, 'V'=>0, 'W'=>0, 'X'=>0,'Y'=>0, 'Z'=>0); my %pept = (); #hashed array of peptides (HASH) and fasta headers (VALUE) my $H20 = 18.015; #Mol. weight water my $PH_MIN = 0.0; #min pH value my $PH_MAX = 14.0; #max pH value my $MAXLOOP = 2000; # max number iterations my $EPSI = 0.0001; # desired presision # # Get options. # my %opts; Getopt::Std::getopts('i:o:e:r:p:m:n:c:l:sa', \%opts); my $input = $opts{'i'}; my $output = $opts{'o'}; my $enzyme = $opts{'e'}; my $enzyme_cleavage_site_rules = $opts{'r'}; my $partial_cleavage = $opts{'s'}; my $partial_cleavage_chance = $opts{'p'}; my $min_peptide_weight = $opts{'m'}; my $max_peptide_weight = $opts{'n'}; my $max_charge = $opts{'c'}; my $add_sequence_context = $opts{'a'}; my $log_level = $opts{'l'}; # # Configure logging. # # Provides default if user did not specify log level: $log_level = (defined($log_level) ? $log_level : 'INFO'); # Reset log level to default if user specified illegal log level. $log_level = ( defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'}); #Log::Log4perl->init('log4perl.properties'); Log::Log4perl->easy_init( #{ level => $log_level, # file => ">>ProteinDigestor.log", # layout => '%F{1}-%L-%M: %m%n' }, { level => $log_level, file => "STDOUT", layout => '%d L:%L %p> %m%n' }, ); my $logger = Log::Log4perl::get_logger(); # # Check user input. # unless (defined($input) && defined($output)) { _Usage(); exit; } if ($input =~ /^$/ || $output =~ /^$/) { _Usage(); exit; } if ($input eq $output) { $logger->fatal('Output file is the same as the input file. Select a different file for the output.'); exit; } # # Check input file. # unless (-e $input && -f $input && -r $input) { $logger->fatal('Cannot read from input file ' . $input . ': ' . $!); exit; } # # Set additional defaults. # $partial_cleavage = (defined($partial_cleavage) ? $partial_cleavage : 0); # Disabled by default $partial_cleavage_chance = (defined($partial_cleavage_chance) ? $partial_cleavage_chance : 0.1); # 10% change by default $add_sequence_context = (defined($add_sequence_context) ? $add_sequence_context : 0); # Disabled by default ######################################################## # MAIN PROGRAM ######################################################## $logger->info('Starting...'); # # Get protease cleavage site specs. # unless (defined($enzyme_cleavage_site_rules)) { # # Set default to Trypsin if the user did not specify an enzyme name nor a string with enzyme cleavage site rules. # $enzyme = (defined($enzyme) ? $enzyme : 'Trypsin'); # # Check for illegal enzymes. # unless (defined($enzymes{$enzyme})) { $logger->fatal('Unkown enzyme ' . $enzyme . '.'); exit; } # # Fetch enzyme cleavage site rules. # $enzyme_cleavage_site_rules = $enzymes{$enzyme}; } $logger->info('Using enzyme cleavage site rules: ' . $enzyme_cleavage_site_rules); my @split_params = split(/:/, $enzyme_cleavage_site_rules); my $cut_term = $split_params[0]; my @cut_aa = split(/,/, $split_params[1]); my @inhibition_aa; if (defined($split_params[2])) { @inhibition_aa = split(/,/, $split_params[2]); } my $annotation = {}; my $sequence = ''; # # Create filehandles. # open(my $input_fh, "$input"); open(my $output_fh, ">$output"); # # Write header to result file. # print($output_fh "Protein ID\tPeptide\tMW\tpI\tCharge\tnumber S\tnumber T\tnumber Y\n"); # # Digest fasta files and store peptides in an array (hashed) # foreach my $line (<$input_fh>){ if ($line =~ /^>/){ # # We found a new FASTA header. # # # Process the previously found protein if we already found one. # unless ($sequence eq '') { _ProcessProtein(); } # # Parse new FASTA header. # $line =~ s/[\r\n\f]+//g; $annotation = _ParseHeader($line); $sequence = ''; } else { # # Found a sequence line. # $line =~ s/[\t\s\r\n\f0-9\*\-]+//g; $sequence .= $line; } } # # Don't foget to process the last sequence! # _ProcessProtein(); close($input_fh); close($output_fh); $logger->info('Finished!'); ######################################################### # SUBS ######################################################### sub _ProcessProtein { my $id = ${$annotation}{'ID'}; $logger->debug('Accession/ID: ' . $id); $logger->debug('Sequence: ' . $sequence . "\n"); my $total_length_sequence = length($sequence); my $total_peptide_length = 0; my ($peptides) = _Digest($sequence, $annotation, \@cut_aa, $cut_term, \@inhibition_aa); # # Process peptides. # foreach my $peptide_with_context (sort(keys(%{$peptides}))) { # # Get peptide properties. # $logger->debug('Peptide with context: ' . $peptide_with_context); my ($bare_peptide) = _ParsePeptide($peptide_with_context); my $peplength = length($bare_peptide); my ($pepweight) = _MwCalc($bare_peptide); # # Apply filters # # Filter 1 is on weight. # if (defined($min_peptide_weight)) { $logger->debug('Minimum weight filter:'); $logger->debug("\t" . 'Peptide weight: ' . $pepweight); if ($pepweight >= $min_peptide_weight) { $logger->debug("\t" . 'PASSED - Weight is >= ' . $min_peptide_weight); } else { # Skip this one. $logger->debug("\t" . 'REJECTED - Weight is < ' . $min_peptide_weight); next; } } if (defined($max_peptide_weight)) { $logger->debug('Maximum weight filter:'); $logger->debug("\t" . 'Peptide weight: ' . $pepweight); if ($pepweight <= $max_peptide_weight) { $logger->debug("\t" . 'PASSED - Weight is <= ' . $max_peptide_weight); } else { # Skip this one. $logger->debug("\t" . 'REJECTED - Weight is > ' . $max_peptide_weight); next; } } # # Filter 2 is on sequence motifs. # if (scalar(@filters) > 0) { $logger->debug('Motif filter:'); my ($motif) = _ContainsMotif($bare_peptide, \@filters); if ($motif > 0) { $logger->debug("\t" . 'PASSED - Peptide contains motif ' . $motif); } else { # Skip this one. $logger->debug("\t" . 'REJECTED - Peptide does not contain any motif.'); next; } } my ($pepPI) = _PiCalc($bare_peptide); my ($pepCharge) = _ChargeCalc($bare_peptide); # # Filter 3 is on the amount of charges. # if (defined($max_charge)) { $logger->debug('Charge filter:'); $logger->debug("\t" . 'Peptide charge: ' . $pepCharge); if (defined($max_charge) && $pepCharge <= $max_charge) { $logger->debug("\t" . 'PASSED - Charge <= ' . $max_charge); } else { # Skip this one. $logger->debug("\t" . 'REJECTED - Charge > ' . $max_charge); next; } } # # If there is no partial digestion we can calculated the % coverage. # unless ($partial_cleavage) { $total_peptide_length += length($bare_peptide); } my ($residue_count) = _CountResiduesSTY($bare_peptide); # # Store peptide. # my $pep_sequence; if ($add_sequence_context) { $pep_sequence = $peptide_with_context; } else { $pep_sequence = $bare_peptide; } print($output_fh $id . "\t" . $pep_sequence . "\t" . $pepweight . "\t" . $pepPI. "\t" . $pepCharge. "\t" . $residue_count . "\n"); } # # If partial digestion was not enabled, we can calculated the % coverage. # unless ($partial_cleavage) { my $coverage = ($total_peptide_length / $total_length_sequence)*100; $logger->debug("Sequence residues: $total_length_sequence | #peptide residues: $total_peptide_length | %Coverage: $coverage"); } } sub _ParseHeader { my ($header) = @_; my %annotation; my $ids; my $id; my $description; $header=~ s/^>//; if ($header =~ m/([^\s]+)\s+([^\s]+.*)/) { $ids = $1; $description = $2; } else { $ids = $header; } # # Redimentory support for detecting multiple IDs: check for pipe sperated IDs. # if ($ids =~ m/([^\|])\|/) { $id = $1; } else { $id = $ids; } $annotation{'ID'} = $id; $annotation{'DE'} = $description; return (\%annotation); } sub _ContainsMotif{ my ($peptide, $filters) = @_; my $cnt = 0; foreach my $filter (@{$filters}){ $cnt++; if ($peptide =~ /$filter/){ return($cnt); } } return(0); } sub _ParsePeptide{ my ($peptide_with_flanking_context) = @_; my @pep = split(/\./, $peptide_with_flanking_context); my $bare_peptide = uc($pep[1]); return($bare_peptide); } sub _Digest{ my ($sequence, $annotation, $enz_cutting, $cut_term, $enz_restrict) = @_; $sequence = uc($sequence); my %pept = (); # we start by scanning the sequence.. to find one of the cutting rulez not followed by a restict rule # what is the sequence length my @array = split(//, $sequence); my $length = scalar(@array); my $start_offset = 0; my $offset_adjust_for_cleavage_terminus = 0; if ($cut_term eq 'n') { $offset_adjust_for_cleavage_terminus = 1; } my $peptide = ''; my $counter = 1; if (!($partial_cleavage)) { $partial_cleavage_chance = 0; } else { $counter = 100; } #this is for incomplete cleavage... we repeat digest 100 times with a probability that the enzyme will not cut for (my $cnt = 0; $cnt < $counter; $cnt++) { AA:for (my $aa_offset = 0 + $offset_adjust_for_cleavage_terminus; $aa_offset < ($length - 1 + $offset_adjust_for_cleavage_terminus); $aa_offset++) { my $this_aa = substr($sequence, $aa_offset, 1); # # Check if protease recognizes this amino acid as cleavage site. # foreach my $cut_aa (@{$enz_cutting}) { if ($this_aa eq $cut_aa) { # # Check if AA is not preceeded or followed by an AA that inhibits the protease... # foreach my $inhibit (@{$enz_restrict}) { my @inhibit_conditions = split(/=/, $inhibit); my $inhibit_aa = $inhibit_conditions[0]; my $position_to_check = $inhibit_conditions[1]; my $neighborhood_aa = substr($sequence, $aa_offset + $position_to_check, 1); if ($neighborhood_aa eq $inhibit_aa){ $logger->debug('No cleavage due to inhibition by ' . $inhibit_aa . ' at position ' . $position_to_check); next AA; } } # # Consider the possibility of not cutting when doing incomplete digestions! # my $random = 1; if ($partial_cleavage){ $random = rand(1); } if ($random <= $partial_cleavage_chance){ $logger->debug('No cleavage due to incomplete digestion.'); next AA; } # # If no inhibition spoiled the cleavage, lets cut! # my $cut_offset; if ($cut_term eq 'c') { $cut_offset = $aa_offset + 1; } elsif ($cut_term eq 'n') { $cut_offset = $aa_offset; } # nice formatting of peptide if ($start_offset == 0){ $peptide .="terminus."; $peptide .= substr($sequence, $start_offset, $cut_offset - $start_offset)."."; }else{ $peptide .= substr($sequence, $start_offset - 3, 3)."."; $peptide .= substr($sequence, $start_offset, $cut_offset - $start_offset)."."; } $peptide .= substr($sequence, $cut_offset, 3); $start_offset = $cut_offset; # check if peptide already exists # if so, concatenate this fasta header #if (exists($pept{$peptide})){ # $fasta_header .= $pept{$peptide}; #} $pept{$peptide} = ${$annotation}{'ID'}; $peptide = ''; } } } # # The last peptide sequence ! # $peptide = substr($sequence, $start_offset - 3, 3)."."; $peptide .= substr($sequence, $start_offset, $length); $peptide .= ".terminus"; $pept{$peptide} = ${$annotation}{'ID'}; } #COUNT return(\%pept); } sub _PiCalc{ my ($sequence) = @_; my $nTermResidue; my $cTermResidue; my $phMin;my $phMid ;my $phMax; my $charge; my $cter; my $nter; my $carg; my $chis; my $clys;my $casp;my $cglu;my $ccys;my $ctyr; my $composition = _GetComposition($sequence); my $strlength = length($sequence); $nTermResidue = substr($sequence, 0, 1); for ($nTermResidue){s/[\s\n\r\t]//g;}; $cTermResidue = substr($sequence, $strlength - 1, 1); for ($cTermResidue){s/[\s\n\r\t]//g;}; $phMin = $PH_MIN; $phMax = $PH_MAX; my $i; for ($i=0, $charge = 1.0; $i<$MAXLOOP && ($phMax-$phMin)>$EPSI; $i++){ $phMid = $phMin + ($phMax - $phMin) / 2; $cter = 10**(-$cPk{$cTermResidue}->[0]) / (10**(-$cPk{$cTermResidue}->[0]) + 10**(-$phMid)); $nter = 10**(-$phMid)/(10**(-$cPk{$nTermResidue}->[1])+10**(-$phMid)); $carg = ${$composition}{R} * 10**(-$phMid) / (10**(-$cPk{'R'}->[2])+10**(-$phMid)); $chis = ${$composition}{H} * 10**(-$phMid) / (10**(-$cPk{'H'}->[2])+10**(-$phMid)); $clys = ${$composition}{K} * 10**(-$phMid) / (10**(-$cPk{'K'}->[2])+10**(-$phMid)); $casp = ${$composition}{D} * 10**(-$cPk{'D'}->[2]) / (10**(-$cPk{'D'}->[2])+10**(-$phMid)); $cglu = ${$composition}{E} * 10**(-$cPk{'E'}->[2]) / (10**(-$cPk{'E'}->[2])+10**(-$phMid)); $ccys = ${$composition}{C} * 10**(-$cPk{'C'}->[2]) / (10**(-$cPk{'C'}->[2])+10**(-$phMid)); $ctyr = ${$composition}{Y} * 10**(-$cPk{'Y'}->[2]) / (10**(-$cPk{'Y'}->[2])+10**(-$phMid)); $charge = $carg + $clys + $chis + $nter - ($casp + $cglu + $ctyr + $ccys + $cter); if ($charge > 0.0){ $phMin = $phMid; }else{ $phMax = $phMid; } } return($phMid); } sub _MwCalc{ my ($sequence) = @_; my $mw; my $strlength = length($sequence); my $composition = _GetComposition($sequence); # subtract N-1 water molecules $mw = - ($strlength -1) * $H20; foreach my $aa (sort keys %{$composition}){ $logger->trace('AA:' . $aa . ' | frequency: ' . ${$composition}{$aa} . ' | AA MW: ' . $cMWs{$aa}); $mw += ${$composition}{$aa} * $cMWs{$aa}; } return($mw); } sub _ChargeCalc{ my ($sequence) = @_; my $charge; my $composition = _GetComposition($sequence); foreach my $aa (sort(keys(%{$composition}))){ $charge += ${$composition}{$aa} * $charges{$aa}; } return($charge); } sub _CountResiduesSTY{ my ($sequence) = @_; my $composition = _GetComposition($sequence); my $sty_residues = ${$composition}{'S'} . "\t" . ${$composition}{'T'} . "\t" . ${$composition}{'Y'}; return($sty_residues); } sub _GetComposition { my ($sequence) = @_; my $i; my $c; my %composition = ('A'=>0, 'B'=>0,'C'=>0, 'D'=>0, 'E'=>0, 'F'=>0,'G'=>0, 'H'=>0, 'I'=>0, 'J'=>0, 'K'=>0,'L'=>0, 'M'=>0, 'N'=>0, 'O'=>0,'P'=>0, 'Q'=>0, 'R'=>0, 'S'=>0, 'T'=>0,'U'=>0, 'V'=>0, 'W'=>0, 'X'=>0,'Y'=>0, 'Z'=>0); # # Compute aminoacid composition. # my $strlength = length($sequence); for ($i=0; $i<$strlength; $i++){ $c = substr($sequence, $i, 1); $composition{$c}++; } return(\%composition); } sub _Usage { my $longest_enzyme_name_length = 0; foreach my $protease (keys(%enzymes)) { if (length($protease) > $longest_enzyme_name_length) { $longest_enzyme_name_length = length($protease); } } my $field_formatting = '%-' . ($longest_enzyme_name_length + 3) . 's'; print "\n"; print "ProteinDigestor.pl - Digests protein sequences from a FASTA file\n"; print "\n"; print "Usage:\n"; print "\n"; print " ProteinDigestor.pl options\n"; print "\n"; print "Available options are:\n"; print "\n"; print " -i [file] Input FASTA file.\n"; print " -o [file] Output stats file.\n"; print "\n"; print " You can either specify a protease this tool knows with -e\n"; print " or you can specify the cleavage rule for a protease this does not know (yet) with -r\n"; print "\n"; print " -e [enzyme_name] Digestion enzyme. Known enzymes and their cleavage rules:\n"; foreach my $protease (sort(keys(%enzymes))) { print ' ' . sprintf("$field_formatting", $protease) . $enzymes{$protease} . "\n"; } print " -r [enzyme_rule] Protease cleavage site rule. This rule contains sperated by colons:\n"; print " * The terminus that indicates on which side of a recognized amino acid the protease will cleave: c or n.\n"; print " * The amino acids recognized by the protease.\n"; print " Multiple amino acids separated by a comma.\n"; print " * The amino acids that inhibit cleavage and their position relative to amino acids recognized by the protease.\n"; print ' Amino acid and its relative position separated by an equals sign (=).' . "\n"; print " Multiple amino acids separated by a comma.\n"; print ' For example \'c:K,R:P=1\' for Trypsin with proline inhibition.' . "\n"; print " -s Semi for partial digestion.\n"; print " -p [chance] Number between 0 and 1 indicating the chance that a site will not be cleaved.\n"; print " -m [weight] Minimum weight for a peptide.\n"; print " -n [weight] Maximum weight for a peptide.\n"; print " -c [int] Maximum charge a peptide is allowed to have.\n"; print " -a Add sequence context.\n"; print " This will add max 3 amino acids or the word terminus on each side of the peptides.\n"; print " Contexts and peptides are separated with a dot.\n"; print " Examples of peptides with a context:\n"; print " terminus.MSVSLSAK.ASH\n"; print " SAK.ASHLLCSSTR.VSL\n"; print " STR.VSLSPAVTSSSSSPVVRPALSSSTS.terminus\n"; print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"; print "\n"; exit; }