Mercurial > repos > galaxyp > nbic_fasta
view ExtractPeptideSequenceContext.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w
#
# This script extracts subsequences from UniProtKB *.dat files or from *.fasta files
# based on a list of accession numbers / IDs and a sequence fragment (peptide).
#
# =====================================================
# $Id: ExtractPeptideSequenceContext.pl 112 2011-03-01 19:50:23Z pieter.neerincx@gmail.com $
# $HeadURL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractPeptideSequenceContext.pl $
# $LastChangedDate: 2011-03-01 13:50:23 -0600 (Tue, 01 Mar 2011) $
# $LastChangedRevision: 112 $
# $LastChangedBy: pieter.neerincx@gmail.com $
# =====================================================
#

#
# Initialise environment.
#
use strict;
use Getopt::Long;
use SWISS::Entry; # For parsing UniProtKB *.dat files.
use Log::Log4perl qw(:easy);

my $ps = '/'; # Path Separator.
my %log_levels = (
    'ALL'   => $ALL,
    'TRACE' => $TRACE,
    'DEBUG' => $DEBUG,
    'INFO'  => $INFO,
    'WARN'  => $WARN,
    'ERROR' => $ERROR,
    'FATAL' => $FATAL,
    'OFF'   => $OFF,
);

#
# List of databases, which may contain versioned accession numbers,
# and the separation character used to join the accession number and
# the version number.
#
my %databases_with_optionally_versioned_ids = (
    'IPI'                  => '.',
    'ipi'                  => '.',
    'UniProtKB/Swiss-Prot' => '.',
    'sp'                   => '.',
    'SP'                   => '.',
    'UniProtKB/TrEMBL'     => '.',
    'TR'                   => '.',
    'tr'                   => '.',
    'Tr'                   => '.'
);

my $db;
my $db_alt;
my $db_format;
my $index_id;               # Key used to lookup sequences. Can be ID or Accession number (default).
my $fragments;
my $id_column;
my $peptide_column;
my $strip_lc;               # For the sequence fragments in the fragments file. With strip_lc enabled we
                            # assume amino acids are in upper case and lower case characters represent modifications,
                            # which will be stripped off before searching for the fragments in the complete sequences.
my $cleavage_output;        # Sequence contexts for cleavage sites.
my $miscleavage_output;     # Sequence contexts for miscleavage sites.
my $modification_output;    # Sequence contexts for modification sites.
my $peptide_output;         # Sequence contexts for complete peptides.
my $cleavage_aa;            # Assume the sequence fragments were derived from cutting at this amino acid.
my $cleavage_terminus;      # Assume the sequence fragments were derived from cutting at this side of $cleavage_aa.
my $modified_aa;
my $n_term_context_length;
my $c_term_context_length;
my $miscleavage_distance;   # Minimal distance between a putative miscleavage site and the peptide termini.
                            # Uncleaved AAs closer to the peptide termini can be ignored with this parameter:
                            # A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.
                            # B. To remove false positive miscleavage sites that cannot be cleaved,
                            #    because one of the resulting fragments is too short. Sometimes proteases may need
                            #    a minimal length surrounding the cleavage site to bind and cleave a peptide.
my $padding_character;      # Used for padding if the protein sequence is too short for a long enough context.
my $log_level;

Getopt::Long::GetOptions (
    'db=s'   => \$db,
    'dba:s'  => \$db_alt,
    'dbf:s'  => \$db_format,
    'k:s'    => \$index_id,             # Key used to lookup sequences. Can be ID or Accession number (default).
    'f=s'    => \$fragments,
    'icol:i' => \$id_column,
    'pcol:i' => \$peptide_column,
    's'      => \$strip_lc,             # See the comment on $strip_lc above.
    'cleo:s' => \$cleavage_output,
    'miso:s' => \$miscleavage_output,
    'modo:s' => \$modification_output,
    'pepo:s' => \$peptide_output,
    'ca:s'   => \$cleavage_aa,          # Assume the sequence fragments were derived from cutting at this amino acid.
    'ct:s'   => \$cleavage_terminus,    # Assume the sequence fragments were derived from cutting at this side of --ca.
    'ma:s'   => \$modified_aa,
    'n:i'    => \$n_term_context_length,
    'c:i'    => \$c_term_context_length,
    'mcd:i'  => \$miscleavage_distance, # See the comment on $miscleavage_distance above.
    'pc:s'   => \$padding_character,
    'll:s'   => \$log_level
);

#
# TODO: 1. Handle --mcd param. Maybe should be extended to exclude also modification sites too close together...
#

#
# Configure logging.
#
# Provides default if user did not specify log level:
$log_level = (defined($log_level) ? $log_level : 'INFO');

# Reset log level to default if user specified illegal log level.
$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'});

#Log::Log4perl->init('log4perl.properties');
Log::Log4perl->easy_init(
    #{ level  => $log_level,
    #  file   => ">>ExtractPeptideSequenceContext.log",
    #  layout => '%F{1}-%L-%M: %m%n' },
    { level  => $log_level,
      file   => "STDOUT",
      layout => '%d L:%L %p> %m%n' },
);
my $logger = Log::Log4perl::get_logger();

#
# Check user input.
#
# Options declared with ':s' may be given without a value and then hold an empty string.
# Treat those as "not specified". The loop variable aliases the real variable, so assigning
# to it resets the option itself. $peptide_output is included so that an empty --pepo is
# handled the same way as the other output options.
#
foreach my $input ($db, $db_alt, $fragments, $cleavage_output, $miscleavage_output, $modification_output, $peptide_output) {
    if (defined($input) && length($input) <= 0) {
        $input = undef;
    }
}

unless (defined($db) &&
        defined($fragments) &&
        (defined($cleavage_output) || defined($miscleavage_output) || defined($modification_output) || defined($peptide_output))
) {
    $logger->fatal('You need to specify at least a DB & fragments file as inputs and at least one output file.');
    _Usage();
    exit;
}

if (defined($db_format)) {
    unless ($db_format eq 'UniProtKB DAT' or $db_format eq 'FASTA') {
        $logger->fatal('Database format in unsupported format.');
        $logger->fatal("\t" . 'Must be \'UniProtKB DAT\' or \'FASTA\'.');
        $logger->fatal("\t" . 'But --dbf was ' . $db_format . '.');
        exit;
    }
}

# Invalid values are reported and reset to undef; whether that is fatal is decided below,
# depending on which outputs were requested.
if (defined($cleavage_aa)) {
    unless ($cleavage_aa =~ m/^[*A-Z]$/) {
        $logger->fatal('Cleavage amino acid in unsupported format.');
        $cleavage_aa = undef;
    }
}
if (defined($cleavage_terminus)) {
    unless ($cleavage_terminus =~ m/^[*CN]$/) {
        $logger->fatal('Cleavage terminus in unsupported format.');
        $cleavage_terminus = undef;
    }
}
if (defined($modified_aa)) {
    unless ($modified_aa =~ m/^[*A-Z][a-z]+$/ || $modified_aa =~ m/^[a-z]+[*A-Z]$/) {
        $logger->fatal('Modified amino acid in unsupported format.');
        $modified_aa = undef;
    }
}

#
# We need info to extract:
#  * cleavage site sequence contexts or
#  * modification site sequence contexts or
#  * peptide sequence contexts
#
if (defined($cleavage_output) || defined($miscleavage_output)) {
    unless (defined($cleavage_aa) && defined($cleavage_terminus)) {
        $logger->fatal('If you want to retrieve (mis)cleavage site sequence contexts, you must provide valid values for --ca and --ct.');
        # At least one of the two values is undefined here: substitute a placeholder
        # instead of concatenating undef.
        $logger->debug('You specified --ca ' . (defined($cleavage_aa) ? $cleavage_aa : '(missing or invalid)')
                     . ' --ct ' . (defined($cleavage_terminus) ? $cleavage_terminus : '(missing or invalid)') . '.');
        _Usage();
        exit;
    }
}
if (defined($modification_output)) {
    unless (defined($modified_aa)) {
        $logger->fatal('If you want to retrieve modifications site sequence contexts you must provide a valid value for --ma.');
        # $modified_aa is always undefined in this branch.
        $logger->debug('You specified --ma (missing or invalid).');
        _Usage();
        exit;
    }
}
if (defined($peptide_output)) {
    #
    # We can work with defaults for all params.
    #
}

# Make sure we don't overwrite the input.
foreach my $output ($cleavage_output, $miscleavage_output, $modification_output, $peptide_output) {
    foreach my $input ($db, $db_alt, $fragments) {
        if (defined($input) && defined($output) && $output eq $input) {
            $logger->fatal('Output file ' . $output . ' is the same as input file ' . $input . '.');
            $logger->fatal("\t" . 'Please choose a different file for the output.');
            exit;
        }
    }
}

# Define defaults for optional arguments.
$index_id              = (defined($index_id) && length($index_id) > 0 ? $index_id : 'AC');
$id_column             = (defined($id_column) && length($id_column) > 0 ? $id_column : 1);
$peptide_column        = (defined($peptide_column) && length($peptide_column) > 0 ? $peptide_column : 2);
$n_term_context_length = (defined($n_term_context_length) && length($n_term_context_length) > 0 ? $n_term_context_length : 5);
$c_term_context_length = (defined($c_term_context_length) && length($c_term_context_length) > 0 ? $c_term_context_length : 5);
$padding_character     = (defined($padding_character) ? $padding_character : '-');
# Stripping of lowercase characters is disabled by default unless we search for
# modification site contexts in which case it must be enabled!
$strip_lc = (defined($strip_lc) ? $strip_lc : 0);
$strip_lc = (defined($modified_aa) ? 1 : $strip_lc);
# Default miscleavage distance: the longer of the two context lengths.
unless (defined($miscleavage_distance) && $miscleavage_distance > 0) {
    if ($n_term_context_length >= $c_term_context_length) {
        $miscleavage_distance = $n_term_context_length;
    } else {
        $miscleavage_distance = $c_term_context_length;
    }
}

#
# Check db files and fragments file.
#
unless (-e $db && -f $db && -r $db) {
    $logger->fatal('Cannot read from database input file ' . $db . ': ' . $!);
    exit;
}
if (defined($db_alt)) {
    unless (-e $db_alt && -f $db_alt && -r $db_alt) {
        $logger->fatal('Cannot read from alternative database input file ' . $db_alt . ': ' . $!);
        exit;
    }
}
unless (-e $fragments && -f $fragments && -r $fragments) {
    $logger->fatal('Cannot read from fragments input file ' . $fragments . ': ' . $!);
    exit;
}

#
# Create lookup table(s) of the entire database input file(s).
#
my @db_lookup_table_refs;
(my $db_seqs, $index_id) = _CreateLookupHash($db, $index_id, $db_format);
my $seq_count = scalar(keys(%{$db_seqs}));
$logger->info('Number of sequences in database lookup table: ' . $seq_count);
push(@db_lookup_table_refs, $db_seqs);

# Optional backup DB file.
if (defined($db_alt)) {
    (my $db_seqs_alt, $index_id) = _CreateLookupHash($db_alt, $index_id, $db_format);
    my $seq_count_alt = scalar(keys(%{$db_seqs_alt}));
    $logger->info('Number of sequences in backup database lookup table: ' . $seq_count_alt);
    push(@db_lookup_table_refs, $db_seqs_alt);
}

#
# Debugging loop
#
#foreach my $keyyy (keys(%{$db_seqs})) {
#    $logger->fatal('Key: ' . $keyyy);
#    $logger->fatal("\t" . 'AC: ' . ${$db_seqs}{$keyyy}{'AC'});
#    $logger->fatal("\t" . 'ID: ' . ${$db_seqs}{$keyyy}{'ID'});
#    $logger->fatal("\t" . 'SQ: ' . ${$db_seqs}{$keyyy}{'SQ'});
#}

#
# Lookup sequence fragment contexts and store results.
#
my ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) = _ExtractSeqs(
    \@db_lookup_table_refs, $fragments, $strip_lc, $id_column, $peptide_column,
    $cleavage_output, $miscleavage_output, $modification_output, $peptide_output,
    $cleavage_aa, $cleavage_terminus, $miscleavage_distance,
    $n_term_context_length, $c_term_context_length, $padding_character);

#
# Report result stats.
#
# All counters are right-aligned to the width of the widest individual counter.
#
my $max_length = 0;
foreach my $counter ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) {
    $max_length = (length($counter) > $max_length) ? length($counter) : $max_length;
}
my @report_lines = (
    ['Extracted cleavage site sequence contexts: ',     $counter_cleavages],
    ['Extracted miscleavage site sequence contexts: ',  $counter_miscleavages],
    ['Extracted modification site sequence contexts: ', $counter_modifications],
    ['Extracted peptide sequence contexts: ',           $counter_peptides],
    ['Total extracted sequence contexts: ',             $counter_cleavages + $counter_miscleavages + $counter_modifications + $counter_peptides],
);
foreach my $report_line (@report_lines) {
    my ($label, $count) = @{$report_line};
    my $prefixed_counter = sprintf('%*s', $max_length, $count);
    $logger->info($label . $prefixed_counter . '.');
}
$logger->info('Finished!');

#
##
### Internal subs.
##
#

#
# _CreateLookupHash($file_path, $index_id, $file_type)
#
# Reads a complete sequence database into memory.
#
#  $file_path : database file to read.
#  $index_id  : 'ID' or 'AC'; selects which identifier of a UniProtKB DAT record is used as hash key.
#  $file_type : 'UniProtKB DAT' or 'FASTA'. When undefined the type is guessed from the file
#               name extension (*.dat -> UniProtKB DAT, *.fa / *.fasta -> FASTA, anything else
#               is assumed to be FASTA) and in the FASTA cases $index_id is reset to 'AC'.
#
# Returns a list of two items: a reference to a hash mapping each key to a hash with the
# fields 'AC', 'ID' and 'SQ', and the (possibly reset) $index_id.
# Logs a fatal message and exits when the file cannot be opened or $index_id is unknown.
#
sub _CreateLookupHash {

    my ($file_path, $index_id, $file_type) = @_;

    my $file_path_fh;
    my %seqs;

    if (defined($file_type)) {
        # NOTE(review): with an explicitly specified FASTA type $index_id is not reset to 'AC',
        # unlike in the extension based branches below. Confirm whether that is intended.
        $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
    } elsif ($file_path =~ m/($ps?)([^\.]+)\.dat$/i) {
        $file_type = 'UniProtKB DAT';
        $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
    } elsif ($file_path =~ m/($ps?)([^\.]+)\.fa(sta)?$/i) {
        $file_type = 'FASTA';
        # For FASTA files we don't know what to expect:
        # IDs or ACcession numbers or a mix of both.
        # (Re)set index_id type to AC to prevent adding redundant IDs to the results.
        # Hence treat all identifiers stable and unstable as ACs.
        # (Normally we default to ACcession numbers and if we would have found an AC
        # and the input fragments file was using IDs, we would append the AC to the results.)
        $index_id = 'AC';
        $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
    } else {
        $file_type = 'FASTA';
        $index_id  = 'AC';
        $logger->warn('Could not determine database file type based on the file name extension of ' . $file_path);
        $logger->warn('Will assume it is a FASTA file and hope that is correct...');
        $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
    }

    if ($file_type eq 'UniProtKB DAT') {

        #
        # Read an entire record delimited by // at a time.
        #
        local $/ = "\/\/\n";

        # open() reports failure through its return value and $!, it does not die,
        # so the result must be checked directly.
        unless (open($file_path_fh, '<', $file_path)) {
            $logger->fatal('Cannot read from database input file ' . $file_path . ': ' . $!);
            exit;
        }

        while (my $record = <$file_path_fh>) {

            $logger->trace('Record:' . "\n$record\n");

            my $entry = SWISS::Entry->fromText($record);
            my $id    = $entry->ID;
            my $acc   = $entry->AC;
            my $seq   = $entry->SQ;
            my $index_key;

            if ($index_id eq 'ID') {
                $index_key = $id;
            } elsif ($index_id eq 'AC') {
                $index_key = $acc;
            } else {
                $logger->fatal('Unknown type of identifier used for indexing: ' . $index_id . '. Must be ID or AC.');
                exit;
            }

            $logger->debug('Found accession ' . $acc);
            $logger->trace('Found accession ' . $acc . ' with ID ' . $id . ' and seq ' . $seq);

            $seqs{$index_key}{'AC'} = $acc;
            $seqs{$index_key}{'ID'} = $id;
            $seqs{$index_key}{'SQ'} = $seq;

        }

    } elsif ($file_type eq 'FASTA') {

        my @header_ids = ();
        my $seq        = '';

        unless (open($file_path_fh, '<', $file_path)) {
            $logger->fatal('Cannot read from database input file ' . $file_path . ': ' . $!);
            exit;
        }

        LINE: while (my $line = <$file_path_fh>) {

            if ($line =~ m/^[\s\n\r]+$/) {

                #
                # Skip blank line
                #
                $logger->debug('Found empty line in FASTA file.');
                next LINE;

            } elsif ($line =~ m/^[a-zA-Z]+/) {

                #
                # It's a sequence line
                # Remove all white space and new line characters.
                #
                $logger->debug('Found FASTA sequence line: ' . $line);
                $line =~ s/[\s\n\r]+//g;
                $seq .= $line;

            } elsif ($line =~ m/^>/) {

                $logger->debug('Found new FASTA header line: ' . $line);

                if ((length($seq) > 0) && defined($header_ids[0])) {

                    #
                    # Store the previously found sequence in the lookup table.
                    #
                    foreach my $header_id (@header_ids) {
                        #
                        # We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID
                        # -> Fill both the AC and ID fields with the same ID value.
                        #
                        $seqs{$header_id}{'AC'} = $header_id;
                        $seqs{$header_id}{'ID'} = $header_id;
                        $seqs{$header_id}{'SQ'} = $seq;
                        $logger->debug('Stored ' . $header_id . ':');
                        $logger->debug("\t" . 'SQ ' . $seq);
                    }

                    #
                    # Reset vars.
                    #
                    $seq        = '';
                    @header_ids = ();

                }

                #
                # Check for the presence of some frequently occurring naming schemes:
                #
                # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig).
                # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73
                # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial;
                # >jgi|Triad1|1|gw1.3.1.1
                #
                # The FASTA header line can basically have any format.
                # Therefore we try to see if the IDs we are looking for are present anywhere
                # in the header line up until the first white space or new line.
                # This should prevent matching annotation from other proteins in the description
                # like for example a 'best' BLAST hit that contains one of the IDs we
                # are looking for. Hence, in such a case this sequence is *not* the one of
                # the IDs we are looking for, but similar to that one at best.
                #
                my @unfiltered_ids;

                if ($line =~ m/^>((([^\s;|]+)[;|])*([^\s;|]+))[|;]?/i) {
                    #
                    # Got multiple IDs
                    #
                    my $multiple_ids = $1;
                    @unfiltered_ids = split(/[\s;|]+/, $multiple_ids);
                } elsif ($line =~ m/^>([^\s]+)/i) {
                    #
                    # Got a single ID
                    #
                    my $single_id = $1;
                    push(@unfiltered_ids, $single_id);
                }

                KEY: foreach my $unfiltered_id (@unfiltered_ids) {

                    my $this_id = $unfiltered_id;

                    #
                    # Check for RockerBox formatted IDs.
                    #
                    # Example of RockerBox ID format, which may contain
                    #  * multiple IDs separated by semi-colons and
                    #  * suffixed with the position of the peptide in the protein:
                    # YGR192C[123-142]; YJR009C[123-142]
                    if ($this_id =~ m/^([^\[\]]+)\[\d+-\d+\]$/i) {
                        $logger->trace('Protein ID in RockerBox format: ' . $this_id);
                        $this_id = $1;
                        $logger->trace("\t" . 'Using ID : ' . $this_id);
                    }

                    #
                    # Check for DB namespace prefixes.
                    #
                    if ($this_id =~ m/([^:]+):([^:]+)/i) {

                        #
                        # Namespace prefixed ID.
                        #
                        $logger->trace('Namespace prefixed ID: ' . $this_id);
                        my $prefix = $1;
                        $this_id = $2;
                        $logger->trace("\t" . 'Using ID: ' . $this_id);

                        #
                        # For a selected list of "known" database namespace prefixes,
                        # check for versioned accession numbers and remove the version number.
                        #
                        if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) {
                            $logger->trace('Found prefix: ' . $prefix);
                            my $separator = $databases_with_optionally_versioned_ids{$prefix};
                            # Escape the separator so it is matched literally inside the regex.
                            $separator = '\\' . $separator;
                            if ($this_id =~ m/([^$separator]+)$separator\d+$/) {
                                $this_id = $1;
                            }
                        }
                    }

                    $logger->debug('Found ID: ' . $this_id);
                    push(@header_ids, $this_id);

                }

                # (An older, commented-out version of this header parsing, which matched
                # prefixed and unprefixed ID schemes with separate regexes, was removed here.)

            }
        }

        #
        # Store the last found sequence in the lookup table.
        #
        foreach my $header_id (@header_ids) {
            #
            # We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID
            # -> Fill both the AC and ID fields with the same ID value.
            #
            $seqs{$header_id}{'AC'} = $header_id;
            $seqs{$header_id}{'ID'} = $header_id;
            $seqs{$header_id}{'SQ'} = $seq;
            $logger->debug('Stored ' . $header_id . ':');
            $logger->debug("\t" . 'SQ ' . $seq);
        }

        #
        # Reset vars.
        #
        $seq        = '';
        @header_ids = ();

    }

    close($file_path_fh);
    $logger->info('Created lookup list for database sequences.');

    return (\%seqs, $index_id);

}

#
# _ExtractSeqs(\@db_lookup_table_refs, $path_from, $strip_lc, $id_column, $peptide_column,
#              $cleavage_path_to, $miscleavage_path_to, $modification_path_to, $peptide_path_to,
#              $cleavage_aa, $cleavage_terminus, $miscleavage_distance,
#              $n_term_context_length, $c_term_context_length, $padding_character)
#
# Reads the tab delimited fragments file $path_from line by line, looks up the protein(s)
# named in column $id_column, searches the peptide from column $peptide_column in the protein
# sequence and collects the requested kinds of sequence contexts. For every line and every
# kind of context _SaveSequenceContexts() is called with the line, the accession, the contexts,
# the output file handle (undefined when that output was not requested) and the running counter.
#
# Returns the four counters (cleavage, miscleavage, modification, peptide) as returned by
# the last _SaveSequenceContexts() calls.
# Logs a fatal message and exits when the input or an output file cannot be opened.
#
sub _ExtractSeqs {

    my ($db_lookup_table_refs, $path_from, $strip_lc, $id_column, $peptide_column,
        $cleavage_path_to, $miscleavage_path_to, $modification_path_to, $peptide_path_to,
        $cleavage_aa, $cleavage_terminus, $miscleavage_distance,
        $n_term_context_length, $c_term_context_length, $padding_character) = @_;

    $logger->debug('Parsing ' . $path_from . '...');

    my $extracted_seq_contexts_cle = 0;
    my $extracted_seq_contexts_mis = 0;
    my $extracted_seq_contexts_mod = 0;
    my $extracted_seq_contexts_pep = 0;
    my $path_from_fh;
    my $cleavage_path_to_fh;
    my $miscleavage_path_to_fh;
    my $modification_path_to_fh;
    my $peptide_path_to_fh;

    # open() signals failure via its return value and $!; it never dies by itself.
    unless (open($path_from_fh, '<', $path_from)) {
        $logger->fatal('Cannot read from fragments input file ' . $path_from . ': ' . $!);
        exit;
    }
    if (defined($cleavage_path_to)) {
        unless (open($cleavage_path_to_fh, '>', $cleavage_path_to)) {
            $logger->fatal('Cannot write to output file ' . $cleavage_path_to . ': ' . $!);
            exit;
        }
    }
    if (defined($miscleavage_path_to)) {
        unless (open($miscleavage_path_to_fh, '>', $miscleavage_path_to)) {
            $logger->fatal('Cannot write to output file ' . $miscleavage_path_to . ': ' . $!);
            exit;
        }
    }
    if (defined($modification_path_to)) {
        unless (open($modification_path_to_fh, '>', $modification_path_to)) {
            $logger->fatal('Cannot write to output file ' . $modification_path_to . ': ' . $!);
            exit;
        }
    }
    if (defined($peptide_path_to)) {
        unless (open($peptide_path_to_fh, '>', $peptide_path_to)) {
            $logger->fatal('Cannot write to output file ' . $peptide_path_to . ': ' . $!);
            exit;
        }
    }

    LINE: while (my $line = <$path_from_fh>) {

        #
        # Remove (trailing) line end characters!
        #
        $line =~ s/[\n\r\f]+//g;
        $logger->trace($line);

        my @column_values = split(/\t/, $line);
        # For debugging only.
        #foreach my $val (@column_values) {
        #    $logger->debug('Col val: ' . $val);
        #}

        my $id_column_index      = $id_column - 1;
        my $peptide_column_index = $peptide_column - 1;
        my $unfiltered_key       = $column_values[$id_column_index];
        my $key;
        my @keys;
        my $peptide = $column_values[$peptide_column_index];
        my $peptide_nonmod;
        my $protein;
        my $acc;
        my $id;
        my $index = -1; # An index < 0 indicates the peptide was not found in a protein sequence (yet).
        my $length;
        my $cleavage_peptide_contexts     = [];
        my $miscleavage_peptide_contexts  = [];
        my $modification_peptide_contexts = [];
        my $peptide_contexts              = [];

        #
        # Check if $key and $peptide were present in this line.
        # If not this may be a leading/trailing empty or meta-data line,
        # that does not contain the correct amount of columns.
        #
        unless (defined($unfiltered_key) && defined($peptide)) {
            $logger->warn('Skipping line: ' . $line);
            $logger->warn("\t" . '(Peptide sequence and/or protein ID missing or incorrect amount of columns.)');
            next LINE;
        }

        #
        # Sanity check for peptide sequences.
        #
        unless ($peptide =~ m/([a-zA-Z]+)/) {
            $logger->warn('Fragment file line in unexpected format. Cannot find peptide in: ' . $line);
            $logger->debug('$peptide contains: ' . $peptide);
            next LINE;
        }

        if ($strip_lc) {
            # Remove lower case characters representing modifications.
            $peptide_nonmod = $peptide;
            $peptide_nonmod =~ s/[a-z]+//g;
        } else {
            # Convert everything to uppercase.
            $peptide_nonmod = uc($peptide);
        }

        #
        # Figure out in what format the Protein ID(s) were supplied.
        #
        #  * Strip off optional DB namespace prefixes.
        #  * Handle optional accession number version suffixes.
        #
        if ($unfiltered_key =~ m/^((([^\s;|]+)[\s;|]+)*([^\s;|]+))[\s|;]?/i) {
            #
            # Got multiple IDs
            #
            my $multiple_ids = $1;
            @keys = split(/[\s;|]+/, $multiple_ids);
        } else {
            #
            # Got a single ID
            #
            push(@keys, $unfiltered_key);
        }

        # Note: foreach localises $unfiltered_key; after the loop it holds the complete,
        # unsplit column value again, which is used in the error message further down.
        KEY: foreach $unfiltered_key (@keys) {

            $key = $unfiltered_key;

            #
            # Check for RockerBox formatted IDs.
            #
            # Example of RockerBox ID format, which may contain
            #  * multiple IDs separated by semi-colons and
            #  * suffixed with the position of the peptide in the protein:
            # YGR192C[123-142]; YJR009C[123-142]
            if ($key =~ m/^([^\[]+)\[\d+-\d+\]$/i) {
                $logger->trace('Protein ID in RockerBox format: ' . $key);
                $key = $1;
                $logger->trace("\t" . 'Using ID : ' . $key);
            }

            #
            # Check for DB namespace prefixes.
            #
            if ($key =~ m/([^:]+):([^:]+)/i) {

                #
                # Namespace prefixed ID.
                #
                $logger->trace('Namespace prefixed ID: ' . $key);
                my $prefix = $1;
                $key = $2;
                $logger->trace("\t" . 'Using ID: ' . $key);

                #
                # For a selected list of "known" database namespace prefixes,
                # check for versioned accession numbers and remove the version number.
                #
                if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) {
                    $logger->trace('Found prefix: ' . $prefix);
                    my $separator = $databases_with_optionally_versioned_ids{$prefix};
                    # Escape the separator so it is matched literally inside the regex.
                    $separator = '\\' . $separator;
                    if ($key =~ m/([^$separator]+)$separator\d+$/) {
                        $key = $1;
                    }
                }
            }

            $logger->debug('Found ID: ' . $key);

            #
            # Sanity check for Protein ID.
            #
            unless ($key =~ m/^([A-Z0-9\._-]+)$/i) {
                $logger->warn('Fragment file line in unexpected format. Cannot find sequence ID in:');
                $logger->warn("\t" . $line);
                $logger->debug('$key contains: ' . $key);
                next LINE;
            }

            #
            # Lookup info for the protein this peptide was derived from.
            #
            ($protein, $acc, $id, $index) = _LookupProteinInfo($key, $peptide_nonmod, $db_lookup_table_refs);

            #
            # Check if the peptide (fragment) was found in this protein sequence.
            #
            if (defined($protein)) {
                if ($index > -1) {
                    $logger->trace('Found peptide sequence in protein ' . $key);
                    last KEY;
                } else {
                    $logger->warn('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key);
                    $logger->warn("\t" . 'Will try other protein IDs for this peptide if available.');
                }
            } else {
                $logger->warn('Cannot find protein identifier ' . $key . ' in any of the supplied databases.');
                $logger->warn("\t" . 'Will try other protein IDs for this peptide if available.');
            }
        }

        #
        # Check if the peptide (fragment) was found in any of the associated protein sequences.
        #
        if (defined($protein)) {
            $logger->debug('Found protein ' . $key);
            if ($index > -1) {
                $logger->debug('Found peptide (' . $peptide_nonmod . ') in protein ' . $key);
                if (defined($cleavage_path_to)) {
                    ($cleavage_peptide_contexts) = _GetCleavageSequenceContexts($peptide_nonmod, $protein, $key, $index, $cleavage_aa, $cleavage_terminus);
                }
                if (defined($miscleavage_path_to)) {
                    ($miscleavage_peptide_contexts) = _GetMiscleavageSequenceContexts($peptide_nonmod, $protein, $key, $index, $cleavage_aa, $cleavage_terminus, $miscleavage_distance);
                }
                if (defined($modification_path_to)) {
                    # $modified_aa is the file level variable set from --ma.
                    ($modification_peptide_contexts) = _GetModificationSequenceContexts($peptide_nonmod, $protein, $key, $index, $modified_aa, $peptide);
                }
                if (defined($peptide_path_to)) {
                    ($peptide_contexts) = _GetPeptideSequenceContexts($peptide_nonmod, $protein, $key, $index);
                }
            } else {
                $logger->error('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key);
                $acc = 'NA';
            }
        } else {
            $logger->error('Cannot find protein "' . $unfiltered_key . '" in any of the supplied databases.');
            $acc = 'NA';
        }

        #
        # Append the peptide contexts to the existing lines of the fragment file and save that new line to the output file.
        #
        $extracted_seq_contexts_cle = _SaveSequenceContexts($line, $acc, $cleavage_peptide_contexts, $cleavage_path_to_fh, $extracted_seq_contexts_cle);
        $extracted_seq_contexts_mis = _SaveSequenceContexts($line, $acc, $miscleavage_peptide_contexts, $miscleavage_path_to_fh, $extracted_seq_contexts_mis);
        $extracted_seq_contexts_mod = _SaveSequenceContexts($line, $acc, $modification_peptide_contexts, $modification_path_to_fh, $extracted_seq_contexts_mod);
        $extracted_seq_contexts_pep = _SaveSequenceContexts($line, $acc, $peptide_contexts, $peptide_path_to_fh, $extracted_seq_contexts_pep);

    }

    #
    # Close file handles.
    # Buffered write errors only surface at close, so a failure is reported.
    #
    foreach my $fh ($path_from_fh, $cleavage_path_to_fh, $miscleavage_path_to_fh, $modification_path_to_fh, $peptide_path_to_fh) {
        if (defined($fh)) {
            unless (close($fh)) {
                $logger->error('Failed to close file handle: ' . $!);
            }
        }
    }

    return($extracted_seq_contexts_cle, $extracted_seq_contexts_mis, $extracted_seq_contexts_mod, $extracted_seq_contexts_pep);

}

#
# _LookupProteinInfo($key, $peptide, \@db_lookup_table_refs)
#
# Searches the lookup tables in the given order for protein $key and tries to locate
# $peptide in its sequence: first literally and, if that fails, with all I replaced by L
# in both protein and peptide.
#
# Returns ($protein, $acc, $id, $index):
#  * peptide found                        : sequence, AC and ID of the protein and the
#                                           zero-based position of the peptide.
#  * protein found, peptide not found     : (undef, undef, undef, -1).
#  * protein not found in any lookup table: (undef, undef, undef, -1).
#
sub _LookupProteinInfo {

    my ($key, $peptide, $db_lookup_table_refs) = @_;

    my $protein_found_in_db = 0;
    my $protein;
    my $acc;
    my $id;
    my $index = -1; # Always return a number: < 0 means "not found".

    DB_LOOKUP: foreach my $db_lookup_table_ref (@{$db_lookup_table_refs}) {

        if (defined(${$db_lookup_table_ref}{$key})) {

            $protein_found_in_db = 1;
            $logger->debug('Found protein ' . $key . ' in this sequence DB.');
            $protein = ${$db_lookup_table_ref}{$key}{'SQ'};
            $acc     = ${$db_lookup_table_ref}{$key}{'AC'};
            $id      = ${$db_lookup_table_ref}{$key}{'ID'};

            #
            # Find peptide (fragment) in DB protein sequence.
            #
            $index = index($protein, $peptide);
            if ($index > -1) {
                #
                # Found peptide!
                #
                $logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key);
                last DB_LOOKUP;
            }

            #
            # If the input was based on mass spectrometry data we cannot see
            # the difference between I (Isoleucine) and L (Leucine) as they both
            # weigh the same. Let's try an I agnostic search by substituting all
            # Is for Ls.
            #
            my $protein_l_only = $protein;
            my $peptide_l_only = $peptide;
            $protein_l_only =~ s/I/L/g;
            $peptide_l_only =~ s/I/L/g;
            $index = index($protein_l_only, $peptide_l_only);
            if ($index > -1) {
                #
                # Found peptide!
                #
                $logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key);
                $logger->debug("\t" . 'Had to treat I and L as the same amino acid in the DB search to get a match.');
                last DB_LOOKUP;
            }

            #
            # Cannot find peptide :(
            #
            $protein = undef;
            $acc     = undef;
            $id      = undef;
            $logger->debug('Cannot find peptide (' . $peptide . ') in protein ' . $key);
            $logger->debug("\t" . 'This may be the result of an updated protein sequence in the database.');
            $logger->debug("\t" . 'Will try to find the peptide in other databases (if available).');
            # $strip_lc is the file level setting (--s or implied by --ma).
            if ($strip_lc) {
                $logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was enabled.)');
            } else {
                $logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was *not* enabled.)');
            }

        } else {
            $logger->debug('Protein ' . $key . ' missing from this sequence DB. Will try other databases (if available).');
        }
    }

    #
    # Check if the peptide (fragment) was found in a DB protein sequence.
    #
    if ($protein_found_in_db == 1) {
        unless ($index > -1) {
            $logger->warn('Cannot find peptide (' . $peptide . ') in protein ' . $key);
            $logger->warn("\t" . 'Will try to find this peptide (' . $peptide . ') using other associated protein identifiers (if available).');
        }
    } else {
        $logger->warn('Cannot find protein ' . $key . ' in any of the supplied databases.');
    }

    return($protein, $acc, $id, $index);

}

#
# _GetCleavageSequenceContexts($peptide, $protein, $key, $index, $cleavage_aa, $cleavage_terminus)
#
# Extracts the sequence context around both termini of $peptide, located at zero-based
# position $index in $protein. A context starts $n_term_context_length residues before the
# terminus and is at most $n_term_context_length + $c_term_context_length long (both are
# file level settings, as is $padding_character).
#
#  * A terminus that coincides with a protein terminus gets an empty context.
#  * Unless $cleavage_aa is '*', a context is emptied when the residue before (terminus 'C'),
#    after (terminus 'N') or on either side ('*') of the cleavage position is not $cleavage_aa.
#
# Returns a reference to an array holding the N-terminal and the C-terminal context, in that order.
# Logs a fatal message and exits on an unknown $cleavage_terminus.
#
sub _GetCleavageSequenceContexts {

    my ($peptide, $protein, $key, $index, $cleavage_aa, $cleavage_terminus) = @_;

    my %peptide_contexts;
    $peptide_contexts{'n_term'}{'context'} = '';
    $peptide_contexts{'c_term'}{'context'} = '';
    my $max_context_length = $n_term_context_length + $c_term_context_length;

    # Fixed order (instead of hash key order) keeps the log output reproducible.
    foreach my $peptide_term ('n_term', 'c_term') {

        #
        # Get the offset for the sequence contexts to retrieve from this peptide terminus
        # or drop the context for this peptide terminus if it is also the protein terminus.
        #
        if ($peptide_term eq 'n_term') {
            if ($index > 0) {
                # This is an N-terminal peptide context.
                my $offset = $index - $n_term_context_length;
                $peptide_contexts{$peptide_term}{'offset'} = $offset;
                $logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.');
            } else {
                # Peptide N-terminus == Protein N-terminus.
                $logger->warn('Peptide N-terminus = protein N-terminus. Dropping N-terminal sequence context for peptide ' . $peptide . ' in protein ' . $key . '.');
                next;
            }
        } elsif ($peptide_term eq 'c_term') {
            if (($index + length($peptide)) < length($protein)) {
                # This is a C-terminal peptide context.
                # The context window also starts $n_term_context_length residues
                # before the cleavage position at the end of the peptide.
                my $offset = ($index + length($peptide)) - $n_term_context_length;
                $peptide_contexts{$peptide_term}{'offset'} = $offset;
                $logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.');
            } else {
                # Peptide C-terminus == Protein C-terminus.
                $logger->warn('Peptide C-terminus = protein C-terminus. Dropping C-terminal sequence context for peptide ' . $peptide . ' in protein ' . $key . '.');
                next;
            }
        }

        #
        # Check if DB protein sequence is long enough on n-terminal side
        # to retrieve full length peptide contexts.
        # (c-term length is checked later...)
        #
        if ($peptide_contexts{$peptide_term}{'offset'} < 0) {

            #
            # Protein too short for full length context: Calculate offset adjustment.
            #
            my $offset            = $peptide_contexts{$peptide_term}{'offset'};
            my $offset_adjustment = -($offset);
            $peptide_contexts{$peptide_term}{'offset_adjustment'} = $offset_adjustment;
            $logger->debug('Offset ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . ' less than zero');

            if (length($padding_character) == 1) {
                #
                # Add N-terminal padding characters for contexts that are too short on the N-terminal side.
                #
                my $n_term_padding = "$padding_character" x $offset_adjustment;
                $peptide_contexts{$peptide_term}{'context'} = $n_term_padding;
                $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
            } else {
                # NOTE(review): without padding the context is shorter on the N-terminal side,
                # while the specificity check below still uses fixed positions - confirm.
                $logger->debug('Padding was disabled. Will not add n-term padding.');
            }

            $peptide_contexts{$peptide_term}{'offset'} = 0;

        } else {
            $peptide_contexts{$peptide_term}{'offset_adjustment'} = 0;
        }

        #
        # Extract sequence context from DB protein sequence.
        #
        my $peptide_context = substr($protein, $peptide_contexts{$peptide_term}{'offset'}, $max_context_length - $peptide_contexts{$peptide_term}{'offset_adjustment'});
        # append the context extracted from the protein sequence,
        # because there may be already some N-terminal padding...
        $peptide_contexts{$peptide_term}{'context'} .= $peptide_context;

        #
        # Quality Control: Check enzyme specificity for enzymes with known (= user supplied) specs.
        #
        unless ($cleavage_aa eq '*') {

            $peptide_context = $peptide_contexts{$peptide_term}{'context'};
            # Residues directly before (P1) and after (P1') the cleavage position.
            my $p1       = substr($peptide_context, ($n_term_context_length - 1), 1);
            my $p1_prime = substr($peptide_context, $n_term_context_length, 1);

            if ($cleavage_terminus eq 'C') {
                if ($p1 eq $cleavage_aa) {
                    # Peptide fragment contains the AA recognised by the protease.
                    $logger->debug('Specific protease activity -> Retaining peptide context.');
                } else {
                    # Must be non-specific cleavage -> drop peptide context.
                    $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . ' for peptide ' . $peptide . ' in protein ' . $key);
                    $peptide_contexts{$peptide_term}{'context'} = '';
                }
            } elsif ($cleavage_terminus eq 'N') {
                if ($p1_prime eq $cleavage_aa) {
                    # Peptide fragment contains the AA recognised by the protease.
                    $logger->debug('Specific protease activity -> Retaining peptide context.');
                } else {
                    # Must be non-specific cleavage -> drop peptide context.
                    $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . ' for peptide ' . $peptide . ' in protein ' . $key);
                    $peptide_contexts{$peptide_term}{'context'} = '';
                }
            } elsif ($cleavage_terminus eq '*') {
                if (($p1 eq $cleavage_aa) || ($p1_prime eq $cleavage_aa)) {
                    # Peptide fragment contains the AA recognised by the protease.
                    $logger->debug('Specific protease activity -> Retaining peptide context.');
                } else {
                    # Must be non-specific cleavage -> drop peptide context.
                    $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . ' for peptide ' . $peptide . ' in protein ' . $key);
                    $peptide_contexts{$peptide_term}{'context'} = '';
                }
            } else {
                $logger->fatal('Unknown cleavage terminus ' . $cleavage_terminus . '. Must be C, N or *.');
                exit;
            }
        }
    }

    push(my @contexts, $peptide_contexts{'n_term'}{'context'}, $peptide_contexts{'c_term'}{'context'});

    if (length($padding_character) == 1) {
        _CheckSequenceContextLength(\@contexts, $peptide, $key, $max_context_length);
    } else {
        $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
    }

    return(\@contexts);

}

sub _GetMiscleavageSequenceContexts {

    my ($peptide, $protein, $key, $petide_index, $cleavage_aa, $cleavage_terminus, $miscleavage_distance) = @_;

    my @peptide_contexts;
    my $max_context_length = $n_term_context_length + $c_term_context_length;

    #
    # Find miscleavage sites in peptide.
    #
    # TODO: handle --mcd
    #
    my @amino_acids = split('', $peptide);
    my $aa_index = 0;

    #
    # Drop first and last amino acids as these are from cleavage sites or from the protein termini.
# In either case they cannot represent a miscleavage site. # shift(@amino_acids); pop(@amino_acids); foreach my $aa (@amino_acids) { $aa_index++; if ($aa eq $cleavage_aa) { # # We found a miscleavage site. # $logger->debug('Found miscleavage site in peptide ' . $peptide . ' of protein ' . $key . '.'); # # Get the offset for the sequence context. # my $offset = $petide_index + $aa_index - $n_term_context_length; $logger->debug("\t" . 'Offset = ' . $offset . '.'); my $n_term_padding = ''; my $offset_adjustment = 0; if ($cleavage_terminus eq 'C') { # # Must increment the offset with 1 if the protease cuts C-terminal of a recognition site. # $offset++; } # # Check if DB protein sequence is long enough on n-terminal side # to retrieve full length peptide contexts. # (c-term length is checked elsewhere.) # if ($offset < 0) { # # Protein too short for full length context: Calculate offset adjustment. # $offset_adjustment = -($offset); $logger->debug('Offset ' . $offset . ' for miscleavage context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); if (length($padding_character) == 1) { # # Add N-terminal padding characters for contexts that are too short on the N-terminal side. # $n_term_padding = "$padding_character" x $offset_adjustment; $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); } else { $logger->debug('Padding was disabled. Will not add n-term padding.'); } # # Reset offset for this context to protein start. # $offset = 0; } # # Extract sequence context from DB protein sequence. # my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment); $peptide_context = $n_term_padding . $peptide_context; $logger->debug('Found miscleavage context: ' . $peptide_context . '.'); push(@peptide_contexts, $peptide_context); } } if (length($padding_character) == 1) { _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length); } else { $logger->debug('Padding was disabled. 
Will not add c-term padding nor check sequence context length.'); } return(\@peptide_contexts); } sub _GetModificationSequenceContexts { my ($peptide, $protein, $key, $petide_index, $modified_aa, $modified_peptide) = @_; my @peptide_contexts; my $modified_aa_index_adjustment; my @modified_aa_in_peptide_indices; my $modified_aa_in_protein_index; my $max_context_length = $n_term_context_length + 1 + $c_term_context_length; #my $offset = 0; my $modified_aa_in_modified_peptide_index; my $modified_peptide_remainder; my $processed_peptide_length = 0; # # Find amino acid in the string identifying both the modified amino acid and the modification. # if ($modified_aa =~ m/^([a-z]+)[A-Z*]$/) { $modified_aa_index_adjustment = length($1); } elsif ($modified_aa =~ m/^[A-Z*][a-z]$/) { $modified_aa_index_adjustment = 0; } else { $logger->fatal('Modified amino acid was specified in an unsupported format: ' . $modified_aa . '.'); exit; } # # Find modified sites in peptide. # my $modified_aa_for_regex = $modified_aa; $modified_aa_for_regex =~ s/\*/[A-Z]/; $logger->debug('modified_aa for search: ' . $modified_aa_for_regex); if ($modified_peptide =~ m/$modified_aa_for_regex/) { $logger->debug("\t" . 'Peptide contains mods.'); $modified_aa_in_modified_peptide_index = $-[0]; $modified_peptide_remainder = $'; my $modified_peptide_prefix = $`; $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); $processed_peptide_length += length($modified_peptide_prefix); $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); $processed_peptide_length += length($modified_aa); $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); $logger->trace("\t\t" . 'Index for mod :' . 
$modified_aa_in_modified_peptide_index); } else { $logger->debug('No modified amino acids found in this peptide.'); } #my $modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa); while (defined($modified_aa_in_modified_peptide_index)) { $modified_aa_in_modified_peptide_index += $modified_aa_index_adjustment; $logger->debug('Found modified amino acid at position ' . ($modified_aa_in_modified_peptide_index + 1) . ' in modified peptide ' . $modified_peptide . ' of protein ' . $key . '.'); # # Count all modifications of any type preceeding the found modified amino acid. # my $modified_n_term = substr($modified_peptide, 0, $modified_aa_in_modified_peptide_index); my $n_term = $modified_n_term; $n_term =~ s/[a-z]+//g; my $mod_count = length($modified_n_term) - length($n_term); if ($mod_count >= 0) { $logger->debug(' counted modifications preceeding amino acid = ' . $mod_count); } else { $logger->fatal(' counted modifications preceeding amino acid is negative: ' . $mod_count); exit; } # # We need to substract the amount of mods preceeding the identified amino acid # from the index to get the position of the amino acid in the unmodified peptide. # my $modified_aa_in_peptide_index = $modified_aa_in_modified_peptide_index - $mod_count; $logger->debug(' and at position ' . ($modified_aa_in_peptide_index + 1) . ' in peptide ' . $peptide . ' of protein ' . $key . '.'); push(@modified_aa_in_peptide_indices, $modified_aa_in_peptide_index); # # Search for additional mods in the modified peptide. # #$offset = $modified_aa_in_modified_peptide_index + 1; #$modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa, $offset); if ($modified_peptide_remainder =~ m/$modified_aa_for_regex/) { $logger->debug("\t" . 'Peptide contains more mods.'); $modified_peptide_remainder = $'; my $modified_peptide_prefix = $`; $logger->trace("\t\t" . 'Processed pep length = ' . 
$processed_peptide_length); $modified_aa_in_modified_peptide_index = $-[0] + $processed_peptide_length; $logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index); $processed_peptide_length += length($modified_peptide_prefix); $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); $processed_peptide_length += length($modified_aa); $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); } else { $modified_aa_in_modified_peptide_index = undef(); $logger->debug('No more modified amino acids found in this peptide.'); } } # # Retrieve sequence contexts for modified stites from protein sequence # foreach my $modified_aa_in_peptide_index (@modified_aa_in_peptide_indices) { # # Get the offset for the sequence context. # my $offset = $petide_index + $modified_aa_in_peptide_index - $n_term_context_length; $logger->debug("\t" . 'Offset = ' . $offset . '.'); my $n_term_padding = ''; my $offset_adjustment = 0; # # Check if DB protein sequence is long enough on n-terminal side # to retrieve full length peptide contexts. # (c-term length is checked elsewhere.) # if ($offset < 0) { # # Protein too short for full length context: Calculate offset adjustment. # $offset_adjustment = -($offset); $logger->debug('Offset ' . $offset . ' for modification context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); if (length($padding_character) == 1) { # # Add N-terminal padding characters for contexts that are too short on the N-terminal side. # $n_term_padding = "$padding_character" x $offset_adjustment; $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); } else { $logger->debug('Padding was disabled. Will not add n-term padding.'); } # # Reset offset for this context to protein start. # $offset = 0; } # # Extract sequence context from DB protein sequence. 
# my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment); $peptide_context = $n_term_padding . $peptide_context; $logger->debug('Found modification context: ' . $peptide_context . '.'); push(@peptide_contexts, $peptide_context); } if (length($padding_character) == 1) { _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length); } else { $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); } return(\@peptide_contexts); } # # TODO: pep contexts. # sub _GetPeptideSequenceContexts { my ($peptide, $protein, $key, $peptide_index) = @_; my @peptide_contexts; my $max_context_length = $n_term_context_length + length($peptide) + $c_term_context_length; # # Get the offset for the peptide sequence context. # my $offset = $peptide_index - $n_term_context_length; $logger->debug("\t" . 'Offset = ' . $offset . '.'); my $n_term_padding = ''; my $offset_adjustment = 0; # # Check if DB protein sequence is long enough on n-terminal side # to retrieve full length peptide contexts. # (c-term length is checked elsewhere.) # if ($offset < 0) { # # Protein too short for full length context: Calculate offset adjustment. # $offset_adjustment = -($offset); $logger->debug('Offset ' . $offset . ' for peptide context of peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); if (length($padding_character) == 1) { # # Add N-terminal padding characters for contexts that are too short on the N-terminal side. # $n_term_padding = "$padding_character" x $offset_adjustment; $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); } else { $logger->debug('Padding was disabled. Will not add n-term padding.'); } # # Reset offset for this context to protein start. # $offset = 0; } # # Extract sequence context from DB protein sequence. # my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment); $peptide_context = $n_term_padding . 
$peptide_context; $logger->debug('Found modification context: ' . $peptide_context . '.'); push(@peptide_contexts, $peptide_context); if (length($padding_character) == 1) { _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length); } else { $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); } return(\@peptide_contexts); } sub _CheckSequenceContextLength { my ($contexts, $peptide, $key, $max_context_length) = @_; # # If padding was enabled for sequence contexts < max sequence context length. # foreach my $peptide_context (@{$contexts}) { my $peptide_context_length = length($peptide_context); if ($peptide_context_length < $max_context_length) { # # sequence context < max sequence context length -> append c-term padding. # (n-term padding was already appended if necessary during sequence context lookup with _GetSequenceContext().) # my $c_term_padding = "$padding_character" x ($max_context_length - $peptide_context_length); $logger->debug('Length of peptide context (' . $peptide_context_length . ') < max context length.'); $logger->debug("\t" . 'Will add C-terminal padding: ' . $c_term_padding); $peptide_context = $peptide_context . $c_term_padding; } # # Sanity check for padded sequence contexts. # # (Checks if the combined effect of n-term and c-term padding worked well.) # $peptide_context_length = length($peptide_context); unless ($peptide_context_length == $max_context_length) { $logger->fatal('Length of peptide context too short or too long for peptide ' . $peptide . ' in protein ' . $key); $logger->fatal("\t" . 'Context length is ' . $peptide_context_length . ', but should be ' . $max_context_length . '.'); exit; } } # # No need to return the modified (c-terminal-padded) contexts as we worked with a arrayref. 
# } sub _SaveSequenceContexts { my ($line, $acc, $contexts, $path_to_fh, $extracted_seq_contexts) = @_; # # Append the peptide contexts to the existing line of the fragment file and # save that new line to the output file. # By appending we keep all info from the original fragment file intact. # foreach my $context (@{$contexts}) { # # Skip "empty"/"missing" contexts. # May be the result of: # 1. The peptide terminus being also the protein terminus. # 2. Proteins missing from the database. # 3. ... # if ($padding_character ne '') { if ($context =~ m/^($padding_character)+$/) { next; } } else { if ($context eq '') { next; } } my $new_line = $line; # 1. Append accession numbers if the fragment file used IDs. if ($index_id eq 'ID') { $new_line .= "\t" . $acc; } # 2. Append sequence context. $new_line .= "\t" . $context; # 3. Append new line character. $new_line .= "\n"; # Save result to disk. print($path_to_fh $new_line); $extracted_seq_contexts++; } return($extracted_seq_contexts); } sub _Usage { print STDOUT "\n" . 'ExtractPeptideSequenceContext.pl: Extract sequence contexts of cleavage, miscleavage or modification sites ' . "\n" . ' based on sequence fragments (like peptides experimentally identified with MSMS) ' . "\n" . ' and protein sequences from a database in FASTA or UniProtKB file format.' . "\n" . "\n" . 'Usage:' . "\n" . "\n" . ' ExtractPeptideSequenceContext.pl options' . "\n" . "\n" . 'Available options are:' . "\n" . "\n" . ' --db [file] Input file containing all database sequences in either UniProtKB *.dat or *.fasta format.' . "\n" . ' --dba [file] Optional alternative / backup input file containing database sequences in either UniProtKB *.dat or *.fasta format.' . "\n" . ' This may be a slightly older or newer version of the database, which will be searched in case a ' . "\n" . ' sequence ID was not found in the primary database file that was specified with --db [file].' . "\n" . 
' --dbf [format] Optional argument to specify the database sequence file format. Valid format specifications are:' . "\n" . ' * \'UniProtKB DAT\' for UniProtKB DAT files.' . "\n" . ' * \'FASTA\' for FASTA files.' . "\n" . ' Only required if the database files do *not* have the standard file extensions: ' . "\n" . ' *.dat for UniProtKB DAT files or ' . "\n" . ' *.fa or *.fasta for FASTA files.' . "\n" . ' --k [ID|AC] Unique identifier type used as key to index an UniProtKB *.dat database file.' . "\n" . ' Must be either AC for a UniProt accession number (default) or ID for a UniProt identifier.' . "\n" . ' Not required for *.fasta databases: This tool will look for any type of ID in the first part of FASTA ' . "\n" . ' sequence headers up until the first white space.' . "\n" . ' Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;).' . "\n" . ' Optionally IDs may be prefixed with a database namespace and a colon (:).' . "\n" . ' For example the accession number P32234 as well as the ID 128UP_DROME would be recognised in both ' . "\n" . ' this sequence header:' . "\n" . ' >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" . ' and in this one:' . "\n" . ' >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" . ' --f [file] Fragment file containing accession numbers or IDs and sequence fragments (peptides), ' . "\n" . ' whose sequence context should be extracted from the database. This file' . "\n" . ' * Must be tab delimited with one accession/ID plus one sequence fragment per line' . "\n" . ' * Must contain accession numbers / IDs in the same format as the database file used.' . "\n" . ' Exception: Optionally IDs may be suffixed with the peptide\'s position in its protein between brackets.' . "\n" . ' For example: CLH1_HUMAN[1612-1620].' . "\n" . ' * May contain other columns, which will be preserved in the output.' . "\n" . 
' * Is the basis for the output: if a sequence context was found, ' . "\n" . ' it will be appended in another column to the right of the existing columns.' . "\n" . ' --icol [int] Column in the tab delimited fragment file containing the protein identifiers / accession numbers.' . "\n" . ' Default = 1.' . "\n" . ' --pcol [int] Column in the tab delimited fragment file containing the peptide sequences.' . "\n" . ' Default = 2.' . "\n" . ' --s Strip lowercase characters from the sequence fragments in the fragment file before doing a database lookup.' . "\n" . ' Amino acids are expected to be in uppercase. If not, the sequences are converted to uppercase ' . "\n" . ' before searching for the fragment in the database. When lowercase characters represent ' . "\n" . ' modifications instead of amino acids these need to be removed before searching the database.' . "\n" . ' Default = 0 (disabled) except when a modified amino acid is specified with --ma, which will automatically enable --s.' . "\n" . ' --cleo [file] Optional output file where the retrieved cleavage site sequence contexts will be saved.' . "\n" . ' --miso [file] Optional output file where the retrieved miscleavage site sequence contexts will be saved.' . "\n" . ' --modo [file] Optional output file where the retrieved modification site sequence contexts will be saved.' . "\n" . ' --pepo [file] Optional output file where the retrieved peptide sequence contexts will be saved.' . "\n" . ' (At least one output file must be specified with --cleo, --miso, --modo or --pepo).' . "\n" . ' --ca [A-Z] Cleavage amino acid. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" . ' that recognizes this amino acid as the position to cut.' . "\n" . ' When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" . ' you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" . 
' but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" . ' that may be the result of other processes than protease activity.' . "\n" . ' --ct [C|N] Cleavage terminus. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" . ' that cuts at the C or N terminus of the amino acid specified with the --ca [A-Z] parameter.' . "\n" . ' When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" . ' you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" . ' but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" . ' that may be the result of other processes than protease activity.' . "\n" . ' --ma [A-Za-z] Modified amino acid.' . "\n" . ' The amino acid must be specified in uppercase and the modification in lower case.' . "\n" . ' The order is not important.' . "\n" . ' Hence a phophorylated serine in the fragments file can be indicated with either pS or Sp, ' . "\n" . ' but you cannot mix both pS and Sp in a single fragments file.' . "\n" . ' You may provide an asterisk (*) instead of an upper case amino acid to retrieve sequence contexts ' . "\n" . ' for the specified modification no matter what amino acid it was located on.' . "\n" . ' A modification may be specified with more than one lower case character, ' . "\n" . ' so for example phosphoS or Sphospho can also be used for a phosphorylated serine.' . "\n" . ' --n [int] Integer specifying the length of the N-terminal sequence context to retrieve starting from the ' . "\n" . ' cleavage, miscleavage or modification site. Default = 5.' . "\n" . ' --c [int] Integer specifying the length of the C-terminal sequence context to retrieve starting from the ' . "\n" . ' cleavage, miscleavage or modification site. Default = 5.' . "\n" . ' Please note that cleavage and miscleavage sites have zero width, ' . "\n" . 
' while modified amino acids will increment the sequence context lenght with 1.' . "\n" . ' When defaults are used for both the N-terminal and C-terminal sequence context lengths,' . "\n" . ' the total sequence context length for a' . "\n" . ' * cleavage or miscleavage site will be:' . "\n" . ' (N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10.' . "\n" . ' * modification site will be:' . "\n" . ' (N-terminal sequence context) + modified AA + (C-terminal sequence context) = 5 + 1 + 5 = 11.' . "\n" . # ' --mcd [int] Minimal distance between a putative miscleavage site and the peptide termini.' . "\n" . # ' Uncleaved amino acids closer to the peptide termini can be ignored with this parameter:' . "\n" . # ' A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.' . "\n" . # ' B. To prevent false positive miscleavage sites due to sites that cannot be cleaved, ' . "\n" . # ' because one of the resulting fragments is too short. (Sometimes proteases may need ' . "\n" . # ' a minimal length surrounding the cleavage site to bind and cleave a peptide.)' . "\n" . # ' Default = same as the longest *-terminal context length. Hence [--n [int]|--c [int]] (see above).' . "\n" . ' --pc [char] Optional padding character to fill N-terminal or C-terminal positions in the sequence context, ' . "\n" . ' when the protein was too short to get a complete sequence context.' . "\n" . ' Default = - (a.k.a. dash or the alignment gap character.)' . "\n" . ' To disable padding specify an empty string like this --pc \'\'.' . "\n" . ' --ll [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n" . "\n"; exit; }