Mercurial > repos > galaxyp > nbic_fasta
diff GenerateDegenerateFasta.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GenerateDegenerateFasta.pl Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,435 @@ +#!/usr/bin/perl -w +# +# This script generates a multi-sequence FASTA file +# with all possible combinations of sequences based +# on degenerate input sequences. +# +# ===================================================== +# $Id: GenerateDegenerateFasta.pl 67 2010-11-09 15:20:01Z pieter.neerincx@gmail.com $ +# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/GenerateDegenerateFasta.pl $ +# $LastChangedDate: 2010-11-09 09:20:01 -0600 (Tue, 09 Nov 2010) $ +# $LastChangedRevision: 67 $ +# $LastChangedBy: pieter.neerincx@gmail.com $ +# ===================================================== + +# +# initialise environment +# +use strict; +use Getopt::Std; +use Log::Log4perl qw(:easy); + +my %log_levels = ( + 'ALL' => $ALL, + 'TRACE' => $TRACE, + 'DEBUG' => $DEBUG, + 'INFO' => $INFO, + 'WARN' => $WARN, + 'ERROR' => $ERROR, + 'FATAL' => $FATAL, + 'OFF' => $OFF, +); + +my %amino_acids = ( + 'A' => ['A'], + 'B' => ['D','N'], + 'C' => ['C'], + 'D' => ['D'], + 'E' => ['E'], + 'F' => ['F'], + 'G' => ['G'], + 'H' => ['H'], + 'I' => ['I'], + 'J' => ['I','L'], + 'K' => ['K'], + 'L' => ['L'], + 'M' => ['M'], + 'N' => ['N'], + 'O' => ['O'], + 'P' => ['P'], + 'Q' => ['Q'], + 'R' => ['R'], + 'S' => ['S'], + 'T' => ['T'], + 'U' => ['U'], + 'V' => ['V'], + 'W' => ['W'], + 'X' => ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'], # 20 regular Amino Acids. + 'X22' => ['A','C','D','E','F','G','H','I','K','L','M','N','O','P','Q','R','S','T','U','V','W','Y'], # 22 Amino Acids including the rare Selenocysteine and Pyrrolysine. + 'Y' => ['Y'], + 'Z' => ['E','Q'], +); + +my %dna = ( + 'A' => ['A'], + 'B' => ['C','G','T'], + 'C' => ['C'], + 'D' => ['A','G','T'], + 'G' => ['G'], + 'H' => ['A','C','T'], + 'K' => ['G','T'], + 'M' => ['A','C'], + 'N' => ['A','C','G','T'], + 'R' => ['A','G'], + 'S' => ['C','G'], + 'T' => ['T'], + 'V' => ['A','C','G'], + 'W' => ['A','T'], + 'Y' => ['C','T'], +); + +my %rna = ( + 'A' => ['A'], + 'B' => ['C','G','U'], + 'C' => ['C'], + 'D' => ['A','G','U'], + 'G' => ['G'], + 'H' => ['A','C','U'], + 'K' => ['G','U'], + 'M' => ['A','C'], + 'N' => ['A','C','G','U'], + 'R' => ['A','G'], + 'S' => ['C','G'], + 'U' => ['U'], + 'V' => ['A','C','G'], + 'W' => ['A','U'], + 'Y' => ['C','U'], +); + +# +# Get options. +# +my %opts; +Getopt::Std::getopts('i:p:s:t:o:x:l:', \%opts); + +my $input = $opts{'i'}; +my $prefix_column = $opts{'p'}; +my $sequence_column = $opts{'s'}; +my $acid_type = $opts{'t'}; +my $output = $opts{'o'}; +my $aa_x = $opts{'x'}; +my $log_level = $opts{'l'}; + +# +# Configure logging. +# +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'INFO'); + +# Reset log level to default if user specified illegal log level. +$log_level = ( + defined($log_levels{$log_level}) + ? $log_levels{$log_level} + : $log_levels{'INFO'}); + +#Log::Log4perl->init('log4perl.properties'); +Log::Log4perl->easy_init( + + #{ level => $log_level, + # file => ">>GenerateDegenrateFasta.log", + # layout => '%F{1}-%L-%M: %m%n' }, + { + level => $log_level, + file => "STDOUT", + layout => '%d L:%L %p> %m%n' + }, +); +my $logger = Log::Log4perl::get_logger(); + +# +# Check user input. +# +unless (defined($input) && defined($output)) { + _Usage(); + exit; +} +if ($input =~ /^$/ || $output =~ /^$/) { + _Usage(); + exit; +} +if ($input eq $output) { + $logger->fatal('Output file is the same as the input file. Select a different file for the output.'); + exit; +} + +# +# Check input file. +# +unless (-e $input && -f $input && -r $input) { + $logger->fatal('Cannot read from input file ' . $input . ': ' . $!); + exit; +} + +$prefix_column = (defined($prefix_column) ? $prefix_column : 1); +unless ($prefix_column =~ m/^\d+$/ && $prefix_column > 0) { + $logger->fatal('Prefix column must be a positive integer.'); + exit; +} else { + $logger->debug('Prefix column = ' . $prefix_column); +} + +$sequence_column = (defined($sequence_column) ? $sequence_column : 2); +unless ($sequence_column =~ m/^\d+$/ && $sequence_column > 0) { + $logger->fatal('Sequence column must be a positive integer.'); + exit; +} else { + $logger->debug('Sequence column = ' . $sequence_column); +} + +$acid_type = (defined($acid_type) ? $acid_type : 'aa'); +unless ($acid_type eq 'aa' || $acid_type eq 'dna' || $acid_type eq 'rna') { + $logger->fatal('Illegal value for option -t. Value for acid type must be \'aa\' for amino acids, \'dna\' for deoxyribonucleic acids or \'rna\' for ribonucleic acids.'); + exit; +} + +my %acids; +if ($acid_type eq 'aa') { + $aa_x = (defined($aa_x) ? $aa_x : 20); + unless ($aa_x == 20 || $aa_x == 22) { + $logger->fatal('Illegal value for option -x. Value for amino acid \'X\' expansion must be 20 or 22.'); + exit; + } + %acids = %amino_acids; +} elsif ($acid_type eq 'dna') { + %acids = %dna; +} elsif ($acid_type eq 'rna') { + %acids = %rna; +} + +# +# Read degenerate sequences and their IDs from a tab delimited file +# +my $degenerate_sequences = _ParseInput($input, $prefix_column, $sequence_column); +my $degenerate_sequence_count = scalar(keys(%{$degenerate_sequences})); +$logger->info('Number of degenerate sequences: ' . $degenerate_sequence_count); + +# +# Generate FASTA DB with all possible permutations. +# +my ($counter) = _GenerateSeqs($degenerate_sequences, $output, $acid_type); + +$logger->info('Generated FASTA DB with ' . $counter . ' sequences.'); +$logger->info('Finished!'); + +# +## +### Internal subs. +## +# + +sub _ParseInput { + + my ($file_path, $prefix_column, $sequence_column) = @_; + + $logger->info('Parsing ' . $file_path . '...'); + + my %degenerate_sequences; + my $file_path_fh; + my $prefix_column_offset = $prefix_column - 1; + my $sequence_column_offset = $sequence_column - 1; + my $line_counter = 0; + + eval { + open($file_path_fh, "<$file_path"); + }; + if ($@) { + $logger->fatal('Cannot read input file: ' . $@); + exit; + } + + LINE: while (my $line = <$file_path_fh>) { + + $line =~ s/[\r\n]+//g; + $line_counter++; + + my @values = split("\t", $line); + + unless (defined($values[$prefix_column_offset])) { + $logger->fatal('Prefix missing on line ' . $line_counter . ' of the input file.'); + exit; + } + + unless (defined($values[$sequence_column_offset])) { + $logger->fatal('Sequence missing on line ' . $line_counter . ' of the input file.'); + exit; + } + + my $prefix = $values[$prefix_column_offset]; + my $degenerate_sequence = $values[$sequence_column_offset]; + + + unless ($prefix =~ m/[a-zA-Z0-9_:\.\-]/i) { + $logger->fatal('Prefix countains illegal characters on line ' . $line_counter . '. Prefix should contain only a-z A-Z 0-9 _ : . or -.'); + exit; + } + + unless ($degenerate_sequence =~ m/[a-zA-Z0-9_:\.\-]/i) { + $logger->fatal('Degenerate sequence countains illegal characters on line ' . $line_counter . '. Sequence should contain only a-z A-Z.'); + exit; + } + + $degenerate_sequences{$prefix} = $degenerate_sequence; + $logger->debug('Found degenerate sequence ' . $degenerate_sequence . ' with prefix ' . $prefix); + + } + + close($file_path_fh); + $logger->info('Parsed input.'); + + return (\%degenerate_sequences); + +} + +sub _GenerateSeqs { + + my ($degenerate_sequences, $path_to, $acid_type) = @_; + + $logger->info('Generating sequences...'); + + my $generated_seqs = 0; + my $path_to_fh; + + eval { + open($path_to_fh, ">$path_to"); + }; + if ($@) { + $logger->fatal('Cannot write FASTA file: ' . $@); + exit; + } + + DS:foreach my $prefix(sort(keys(%{$degenerate_sequences}))) { + + my $degenerate_sequence = ${$degenerate_sequences}{$prefix}; + $degenerate_sequence = uc($degenerate_sequence); + + $logger->debug("\t" . 'For ' . $prefix); + + my $suffix = 1; + my @degenerate_sequence_acids = split('', $degenerate_sequence); + my $new_sequences = []; + my $next_acid_count = 0; + + foreach my $next_acid (@degenerate_sequence_acids) { + + $next_acid_count++; + + unless (defined($acids{$next_acid})) { + $logger->error('Unknown (degenerate) acid ' . $next_acid . ' in input sequence ' . $prefix . ' (' . $degenerate_sequence . ').'); + $logger->warn("\t" . 'Skipping sequence ' . $prefix . '.'); + next DS; + } + + if ($acid_type eq 'aa') { + + if ($next_acid eq 'X') { + + # + # By default X will expand to the 20 most common amino acids, + # but if -x 22 was specified X will expand to all 22 amino acids + # including the very rare ones. + # + if ($aa_x == 22) { + + $next_acid = 'X22'; + + } + } + } + + $new_sequences = _GrowSequence ($new_sequences, $next_acid); + + my $sequence_count = scalar(@{$new_sequences}); + $logger->trace("\t\t" . $next_acid_count . ' Acid ' . $next_acid . ': ' . $sequence_count . ' sequences.'); + + } + + foreach my $new_sequence (@{$new_sequences}) { + + $generated_seqs++; + my $id = $prefix . '_' . $suffix; + + $logger->trace("\t\t\t" . $id . ' :: ' . $new_sequence); + + print($path_to_fh '>' . $id . "\n"); + # TODO: wrap long sequences over multiple lines. + print($path_to_fh $new_sequence . "\n"); + + $suffix++; + + } + } + + close($path_to_fh); + + return ($generated_seqs); + +} + +# +# Usage +# + +sub _GrowSequence { + + my ($sequences, $next_acid) = @_; + + my @larger_sequences; + + if (scalar(@{$sequences}) < 1) { + + foreach my $acid (@{$acids{$next_acid}}) { + + my $larger_sequence = $acid; + + push(@larger_sequences, $larger_sequence); + + } + + } else { + + foreach my $sequence (@{$sequences}) { + + foreach my $acid (@{$acids{$next_acid}}) { + + my $larger_sequence = $sequence . $acid; + + push(@larger_sequences, $larger_sequence); + + } + } + } + + return (\@larger_sequences); + +} + +sub _Usage { + + print STDERR "\n" + . 'GenerateDegenerateFasta.pl:' . "\n\n" + . ' Generates a multi-sequence FASTA file with all possible combinations of sequences ' . "\n" + . ' based on degenerate input sequences.' . "\n\n" + . 'Usage:' . "\n\n" + . ' GenerateDegenerateFasta.pl options' . "\n\n" + . 'Available options are:' . "\n\n" + . ' -i [file] Input file in tab delimited format.' . "\n" + . ' -p [number] Prefix column.' . "\n" + . ' Column from the input file containg strings that will be used as prefixes ' . "\n" + . ' to generate unique IDs for the FASTA sequences.' . "\n" + . ' -s [number] Sequence column.' . "\n" + . ' Column from the input file containg degenerate amino or nucleic acid sequences ' . "\n" + . ' that will be used to generate the FASTA sequences.' . "\n" + . ' For example RXX can be used to generate the amino acids sequences RAA, RAC, RAD ... RYY.' . "\n" + . ' -t [type] Acid type of the degenerate input sequences.' . "\n" + . ' Must be one of:' . "\n" + . ' aa - for amino acids' . "\n" + . ' dna - for deoxyribonucleic acids' . "\n" + . ' rna - for ribonucleic acids' . "\n" + . ' -o [file] Output file in FASTA format.' . "\n" + . ' -x [20|22] Indicates whether the degenerate amino acid X represents only the 20 most common amino acids (default).' . "\n" + . ' or all 22 amino acids including the rare Selenocysteine and Pyrrolysine.' . "\n" + . ' -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n" + . "\n"; + exit; + +}