Mercurial > repos > galaxyp > nbic_fasta
view GenerateDegenerateFasta.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w # # This script generates a multi-sequence FASTA file # with all possible combinations of sequences based # on degenerate input sequences. # # ===================================================== # $Id: GenerateDegenerateFasta.pl 67 2010-11-09 15:20:01Z pieter.neerincx@gmail.com $ # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/GenerateDegenerateFasta.pl $ # $LastChangedDate: 2010-11-09 09:20:01 -0600 (Tue, 09 Nov 2010) $ # $LastChangedRevision: 67 $ # $LastChangedBy: pieter.neerincx@gmail.com $ # ===================================================== # # initialise environment # use strict; use Getopt::Std; use Log::Log4perl qw(:easy); my %log_levels = ( 'ALL' => $ALL, 'TRACE' => $TRACE, 'DEBUG' => $DEBUG, 'INFO' => $INFO, 'WARN' => $WARN, 'ERROR' => $ERROR, 'FATAL' => $FATAL, 'OFF' => $OFF, ); my %amino_acids = ( 'A' => ['A'], 'B' => ['D','N'], 'C' => ['C'], 'D' => ['D'], 'E' => ['E'], 'F' => ['F'], 'G' => ['G'], 'H' => ['H'], 'I' => ['I'], 'J' => ['I','L'], 'K' => ['K'], 'L' => ['L'], 'M' => ['M'], 'N' => ['N'], 'O' => ['O'], 'P' => ['P'], 'Q' => ['Q'], 'R' => ['R'], 'S' => ['S'], 'T' => ['T'], 'U' => ['U'], 'V' => ['V'], 'W' => ['W'], 'X' => ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'], # 20 regular Amino Acids. 'X22' => ['A','C','D','E','F','G','H','I','K','L','M','N','O','P','Q','R','S','T','U','V','W','Y'], # 22 Amino Acids including the rare Selenocysteine and Pyrrolysine. 'Y' => ['Y'], 'Z' => ['E','Q'], ); my %dna = ( 'A' => ['A'], 'B' => ['C','G','T'], 'C' => ['C'], 'D' => ['A','G','T'], 'G' => ['G'], 'H' => ['A','C','T'], 'K' => ['G','T'], 'M' => ['A','C'], 'N' => ['A','C','G','T'], 'R' => ['A','G'], 'S' => ['C','G'], 'T' => ['T'], 'V' => ['A','C','G'], 'W' => ['A','T'], 'Y' => ['C','T'], ); my %rna = ( 'A' => ['A'], 'B' => ['C','G','U'], 'C' => ['C'], 'D' => ['A','G','U'], 'G' => ['G'], 'H' => ['A','C','U'], 'K' => ['G','U'], 'M' => ['A','C'], 'N' => ['A','C','G','U'], 'R' => ['A','G'], 'S' => ['C','G'], 'U' => ['U'], 'V' => ['A','C','G'], 'W' => ['A','U'], 'Y' => ['C','U'], ); # # Get options. # my %opts; Getopt::Std::getopts('i:p:s:t:o:x:l:', \%opts); my $input = $opts{'i'}; my $prefix_column = $opts{'p'}; my $sequence_column = $opts{'s'}; my $acid_type = $opts{'t'}; my $output = $opts{'o'}; my $aa_x = $opts{'x'}; my $log_level = $opts{'l'}; # # Configure logging. # # Provides default if user did not specify log level: $log_level = (defined($log_level) ? $log_level : 'INFO'); # Reset log level to default if user specified illegal log level. $log_level = ( defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'}); #Log::Log4perl->init('log4perl.properties'); Log::Log4perl->easy_init( #{ level => $log_level, # file => ">>GenerateDegenrateFasta.log", # layout => '%F{1}-%L-%M: %m%n' }, { level => $log_level, file => "STDOUT", layout => '%d L:%L %p> %m%n' }, ); my $logger = Log::Log4perl::get_logger(); # # Check user input. # unless (defined($input) && defined($output)) { _Usage(); exit; } if ($input =~ /^$/ || $output =~ /^$/) { _Usage(); exit; } if ($input eq $output) { $logger->fatal('Output file is the same as the input file. Select a different file for the output.'); exit; } # # Check input file. # unless (-e $input && -f $input && -r $input) { $logger->fatal('Cannot read from input file ' . $input . ': ' . $!); exit; } $prefix_column = (defined($prefix_column) ? $prefix_column : 1); unless ($prefix_column =~ m/^\d+$/ && $prefix_column > 0) { $logger->fatal('Prefix column must be a positive integer.'); exit; } else { $logger->debug('Prefix column = ' . $prefix_column); } $sequence_column = (defined($sequence_column) ? $sequence_column : 2); unless ($sequence_column =~ m/^\d+$/ && $sequence_column > 0) { $logger->fatal('Sequence column must be a positive integer.'); exit; } else { $logger->debug('Sequence column = ' . $sequence_column); } $acid_type = (defined($acid_type) ? $acid_type : 'aa'); unless ($acid_type eq 'aa' || $acid_type eq 'dna' || $acid_type eq 'rna') { $logger->fatal('Illegal value for option -t. Value for acid type must be \'aa\' for amino acids, \'dna\' for deoxyribonucleic acids or \'rna\' for ribonucleic acids.'); exit; } my %acids; if ($acid_type eq 'aa') { $aa_x = (defined($aa_x) ? $aa_x : 20); unless ($aa_x == 20 || $aa_x == 22) { $logger->fatal('Illegal value for option -x. Value for amino acid \'X\' expansion must be 20 or 22.'); exit; } %acids = %amino_acids; } elsif ($acid_type eq 'dna') { %acids = %dna; } elsif ($acid_type eq 'rna') { %acids = %rna; } # # Read degenerate sequences and their IDs from a tab delimited file # my $degenerate_sequences = _ParseInput($input, $prefix_column, $sequence_column); my $degenerate_sequence_count = scalar(keys(%{$degenerate_sequences})); $logger->info('Number of degenerate sequences: ' . $degenerate_sequence_count); # # Generate FASTA DB with all possible permutations. # my ($counter) = _GenerateSeqs($degenerate_sequences, $output, $acid_type); $logger->info('Generated FASTA DB with ' . $counter . ' sequences.'); $logger->info('Finished!'); # ## ### Internal subs. ## # sub _ParseInput { my ($file_path, $prefix_column, $sequence_column) = @_; $logger->info('Parsing ' . $file_path . '...'); my %degenerate_sequences; my $file_path_fh; my $prefix_column_offset = $prefix_column - 1; my $sequence_column_offset = $sequence_column - 1; my $line_counter = 0; eval { open($file_path_fh, "<$file_path"); }; if ($@) { $logger->fatal('Cannot read input file: ' . $@); exit; } LINE: while (my $line = <$file_path_fh>) { $line =~ s/[\r\n]+//g; $line_counter++; my @values = split("\t", $line); unless (defined($values[$prefix_column_offset])) { $logger->fatal('Prefix missing on line ' . $line_counter . ' of the input file.'); exit; } unless (defined($values[$sequence_column_offset])) { $logger->fatal('Sequence missing on line ' . $line_counter . ' of the input file.'); exit; } my $prefix = $values[$prefix_column_offset]; my $degenerate_sequence = $values[$sequence_column_offset]; unless ($prefix =~ m/[a-zA-Z0-9_:\.\-]/i) { $logger->fatal('Prefix countains illegal characters on line ' . $line_counter . '. Prefix should contain only a-z A-Z 0-9 _ : . or -.'); exit; } unless ($degenerate_sequence =~ m/[a-zA-Z0-9_:\.\-]/i) { $logger->fatal('Degenerate sequence countains illegal characters on line ' . $line_counter . '. Sequence should contain only a-z A-Z.'); exit; } $degenerate_sequences{$prefix} = $degenerate_sequence; $logger->debug('Found degenerate sequence ' . $degenerate_sequence . ' with prefix ' . $prefix); } close($file_path_fh); $logger->info('Parsed input.'); return (\%degenerate_sequences); } sub _GenerateSeqs { my ($degenerate_sequences, $path_to, $acid_type) = @_; $logger->info('Generating sequences...'); my $generated_seqs = 0; my $path_to_fh; eval { open($path_to_fh, ">$path_to"); }; if ($@) { $logger->fatal('Cannot write FASTA file: ' . $@); exit; } DS:foreach my $prefix(sort(keys(%{$degenerate_sequences}))) { my $degenerate_sequence = ${$degenerate_sequences}{$prefix}; $degenerate_sequence = uc($degenerate_sequence); $logger->debug("\t" . 'For ' . $prefix); my $suffix = 1; my @degenerate_sequence_acids = split('', $degenerate_sequence); my $new_sequences = []; my $next_acid_count = 0; foreach my $next_acid (@degenerate_sequence_acids) { $next_acid_count++; unless (defined($acids{$next_acid})) { $logger->error('Unknown (degenerate) acid ' . $next_acid . ' in input sequence ' . $prefix . ' (' . $degenerate_sequence . ').'); $logger->warn("\t" . 'Skipping sequence ' . $prefix . '.'); next DS; } if ($acid_type eq 'aa') { if ($next_acid eq 'X') { # # By default X will expand to the 20 most common amino acids, # but if -x 22 was specified X will expand to all 22 amino acids # including the very rare ones. # if ($aa_x == 22) { $next_acid = 'X22'; } } } $new_sequences = _GrowSequence ($new_sequences, $next_acid); my $sequence_count = scalar(@{$new_sequences}); $logger->trace("\t\t" . $next_acid_count . ' Acid ' . $next_acid . ': ' . $sequence_count . ' sequences.'); } foreach my $new_sequence (@{$new_sequences}) { $generated_seqs++; my $id = $prefix . '_' . $suffix; $logger->trace("\t\t\t" . $id . ' :: ' . $new_sequence); print($path_to_fh '>' . $id . "\n"); # TODO: wrap long sequences over multiple lines. print($path_to_fh $new_sequence . "\n"); $suffix++; } } close($path_to_fh); return ($generated_seqs); } # # Usage # sub _GrowSequence { my ($sequences, $next_acid) = @_; my @larger_sequences; if (scalar(@{$sequences}) < 1) { foreach my $acid (@{$acids{$next_acid}}) { my $larger_sequence = $acid; push(@larger_sequences, $larger_sequence); } } else { foreach my $sequence (@{$sequences}) { foreach my $acid (@{$acids{$next_acid}}) { my $larger_sequence = $sequence . $acid; push(@larger_sequences, $larger_sequence); } } } return (\@larger_sequences); } sub _Usage { print STDERR "\n" . 'GenerateDegenerateFasta.pl:' . "\n\n" . ' Generates a multi-sequence FASTA file with all possible combinations of sequences ' . "\n" . ' based on degenerate input sequences.' . "\n\n" . 'Usage:' . "\n\n" . ' GenerateDegenerateFasta.pl options' . "\n\n" . 'Available options are:' . "\n\n" . ' -i [file] Input file in tab delimited format.' . "\n" . ' -p [number] Prefix column.' . "\n" . ' Column from the input file containg strings that will be used as prefixes ' . "\n" . ' to generate unique IDs for the FASTA sequences.' . "\n" . ' -s [number] Sequence column.' . "\n" . ' Column from the input file containg degenerate amino or nucleic acid sequences ' . "\n" . ' that will be used to generate the FASTA sequences.' . "\n" . ' For example RXX can be used to generate the amino acids sequences RAA, RAC, RAD ... RYY.' . "\n" . ' -t [type] Acid type of the degenerate input sequences.' . "\n" . ' Must be one of:' . "\n" . ' aa - for amino acids' . "\n" . ' dna - for deoxyribonucleic acids' . "\n" . ' rna - for ribonucleic acids' . "\n" . ' -o [file] Output file in FASTA format.' . "\n" . ' -x [20|22] Indicates whether the degenerate amino acid X represents only the 20 most common amino acids (default).' . "\n" . ' or all 22 amino acids including the rare Selenocysteine and Pyrrolysine.' . "\n" . ' -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n" . "\n"; exit; }