Mercurial > repos > galaxyp > nbic_fasta

diff GenerateDegenerateFasta.pl @ 0:163892325845 draft default tip
Initial commit.
author: galaxyp
date: Fri, 10 May 2013 17:15:08 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GenerateDegenerateFasta.pl	Fri May 10 17:15:08 2013 -0400
@@ -0,0 +1,435 @@
+#!/usr/bin/perl -w
+#
+# This script generates a multi-sequence FASTA file
+# with all possible combinations of sequences based 
+# on degenerate input sequences.
+#
+# =====================================================
+# $Id: GenerateDegenerateFasta.pl 67 2010-11-09 15:20:01Z pieter.neerincx@gmail.com $
+# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/GenerateDegenerateFasta.pl $
+# $LastChangedDate: 2010-11-09 09:20:01 -0600 (Tue, 09 Nov 2010) $
+# $LastChangedRevision: 67 $
+# $LastChangedBy: pieter.neerincx@gmail.com $
+# =====================================================
+
+#
+# initialise environment
+#
+use strict;
+use Getopt::Std;
+use Log::Log4perl qw(:easy);
+
+my %log_levels = (
+	'ALL'   => $ALL,
+	'TRACE' => $TRACE,
+	'DEBUG' => $DEBUG,
+	'INFO'  => $INFO,
+	'WARN'  => $WARN,
+	'ERROR' => $ERROR,
+	'FATAL' => $FATAL,
+	'OFF'   => $OFF,
+);
+
+my %amino_acids = (
+	'A'		=> ['A'],
+	'B'		=> ['D','N'],
+	'C'		=> ['C'],
+	'D'		=> ['D'],
+	'E'		=> ['E'],
+	'F'		=> ['F'],
+	'G'		=> ['G'],
+	'H'		=> ['H'],
+	'I'		=> ['I'],
+	'J'		=> ['I','L'],
+	'K'		=> ['K'],
+	'L'		=> ['L'],
+	'M'		=> ['M'],
+	'N'		=> ['N'],
+	'O'		=> ['O'],
+	'P'		=> ['P'],
+	'Q'		=> ['Q'],
+	'R'		=> ['R'],
+	'S'		=> ['S'],
+	'T'		=> ['T'],
+	'U'		=> ['U'],
+	'V'		=> ['V'],
+	'W'		=> ['W'],
+	'X'		=> ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'],	# 20 regular Amino Acids.
+	'X22'	=> ['A','C','D','E','F','G','H','I','K','L','M','N','O','P','Q','R','S','T','U','V','W','Y'],	# 22 Amino Acids including the rare Selenocysteine and Pyrrolysine.
+	'Y'		=> ['Y'],
+	'Z'		=> ['E','Q'],
+);
+
+my %dna = (
+	'A'		=> ['A'],
+	'B'		=> ['C','G','T'],
+	'C'		=> ['C'],
+	'D'		=> ['A','G','T'],
+	'G'		=> ['G'],
+	'H'		=> ['A','C','T'],
+	'K'		=> ['G','T'],
+	'M'		=> ['A','C'],
+	'N'		=> ['A','C','G','T'],
+	'R'		=> ['A','G'],
+	'S'		=> ['C','G'],
+	'T'		=> ['T'],
+	'V'		=> ['A','C','G'],
+	'W'		=> ['A','T'],
+	'Y'		=> ['C','T'],
+);
+
+my %rna = (
+	'A'		=> ['A'],
+	'B'		=> ['C','G','U'],
+	'C'		=> ['C'],
+	'D'		=> ['A','G','U'],
+	'G'		=> ['G'],
+	'H'		=> ['A','C','U'],
+	'K'		=> ['G','U'],
+	'M'		=> ['A','C'],
+	'N'		=> ['A','C','G','U'],
+	'R'		=> ['A','G'],
+	'S'		=> ['C','G'],
+	'U'		=> ['U'],
+	'V'		=> ['A','C','G'],
+	'W'		=> ['A','U'],
+	'Y'		=> ['C','U'],
+);
+
+#
+# Get options.
+#
+my %opts;
+Getopt::Std::getopts('i:p:s:t:o:x:l:', \%opts);
+
+my $input						= $opts{'i'};
+my $prefix_column				= $opts{'p'};
+my $sequence_column				= $opts{'s'};
+my $acid_type					= $opts{'t'};
+my $output						= $opts{'o'};
+my $aa_x						= $opts{'x'};
+my $log_level					= $opts{'l'};
+
+#
+# Configure logging.
+#
+# Provides default if user did not specify log level:
+$log_level = (defined($log_level) ? $log_level : 'INFO');
+
+# Reset log level to default if user specified illegal log level.
+$log_level = (
+	defined($log_levels{$log_level})
+	? $log_levels{$log_level}
+	: $log_levels{'INFO'});
+
+#Log::Log4perl->init('log4perl.properties');
+Log::Log4perl->easy_init(
+
+	#{ level    => $log_level,
+	#  file     => ">>GenerateDegenrateFasta.log",
+	#  layout   => '%F{1}-%L-%M: %m%n' },
+	{
+		level  => $log_level,
+		file   => "STDOUT",
+		layout => '%d L:%L %p> %m%n'
+	},
+);
+my $logger = Log::Log4perl::get_logger();
+
+#
+# Check user input.
+#
+unless (defined($input) && defined($output)) {
+	_Usage();
+	exit;
+}
+if ($input =~ /^$/ || $output =~ /^$/) {
+	_Usage();
+	exit;
+}
+if ($input eq $output) {
+	$logger->fatal('Output file is the same as the input file. Select a different file for the output.');
+	exit;
+}
+
+#
+# Check input file.
+#
+unless (-e $input && -f $input && -r $input) {
+	$logger->fatal('Cannot read from input file ' . $input . ': ' . $!);
+	exit;
+}
+
+$prefix_column = (defined($prefix_column) ? $prefix_column : 1);
+unless ($prefix_column =~ m/^\d+$/ && $prefix_column > 0) {
+	$logger->fatal('Prefix column must be a positive integer.');
+	exit;
+} else {
+	$logger->debug('Prefix column = ' . $prefix_column);
+}
+
+$sequence_column = (defined($sequence_column) ? $sequence_column : 2);
+unless ($sequence_column =~ m/^\d+$/ && $sequence_column > 0) {
+	$logger->fatal('Sequence column must be a positive integer.');
+	exit;
+} else {
+	$logger->debug('Sequence column = ' . $sequence_column);
+}
+
+$acid_type = (defined($acid_type) ? $acid_type : 'aa');
+unless ($acid_type eq 'aa' || $acid_type eq 'dna' || $acid_type eq 'rna') {
+	$logger->fatal('Illegal value for option -t. Value for acid type must be \'aa\' for amino acids, \'dna\' for deoxyribonucleic acids or \'rna\' for ribonucleic acids.');
+	exit;
+}
+
+my %acids;
+if ($acid_type eq 'aa') {
+	$aa_x = (defined($aa_x) ? $aa_x : 20);
+	unless ($aa_x == 20 || $aa_x == 22) {
+		$logger->fatal('Illegal value for option -x. Value for amino acid \'X\' expansion must be 20 or 22.');
+		exit;
+	}
+	%acids = %amino_acids;
+} elsif ($acid_type eq 'dna') {
+	%acids = %dna;
+} elsif ($acid_type eq 'rna') {
+	%acids = %rna;
+}
+
+#
+# Read degenerate sequences and their IDs from a tab delimited file 
+#
+my $degenerate_sequences = _ParseInput($input, $prefix_column, $sequence_column);
+my $degenerate_sequence_count = scalar(keys(%{$degenerate_sequences}));
+$logger->info('Number of degenerate sequences: ' . $degenerate_sequence_count);
+
+#
+# Generate FASTA DB with all possible permutations.
+#
+my ($counter) = _GenerateSeqs($degenerate_sequences, $output, $acid_type);
+
+$logger->info('Generated FASTA DB with ' . $counter . ' sequences.');
+$logger->info('Finished!');
+
+#
+##
+### Internal subs.
+##
+#
+
+sub _ParseInput {
+
+	my ($file_path, $prefix_column, $sequence_column) = @_;
+
+	$logger->info('Parsing ' . $file_path . '...');
+
+	my %degenerate_sequences;
+	my $file_path_fh;
+	my $prefix_column_offset	= $prefix_column - 1;
+	my $sequence_column_offset	= $sequence_column - 1;
+	my $line_counter			= 0;
+	
+	eval {
+		open($file_path_fh, "<$file_path");
+	};
+	if ($@) {
+		$logger->fatal('Cannot read input file: ' . $@);
+		exit;
+	}
+	
+	LINE: while (my $line = <$file_path_fh>) {
+
+		$line =~ s/[\r\n]+//g;
+		$line_counter++;
+		
+		my @values = split("\t", $line);
+		
+		unless (defined($values[$prefix_column_offset])) {
+			$logger->fatal('Prefix missing on line ' . $line_counter . ' of the input file.');
+			exit;
+		}
+		
+		unless (defined($values[$sequence_column_offset])) {
+			$logger->fatal('Sequence missing on line ' . $line_counter . ' of the input file.');
+			exit;
+		}
+		
+		my $prefix = $values[$prefix_column_offset];
+		my $degenerate_sequence = $values[$sequence_column_offset];
+		
+		
+		unless ($prefix =~ m/[a-zA-Z0-9_:\.\-]/i) {
+			$logger->fatal('Prefix countains illegal characters on line ' . $line_counter . '. Prefix should contain only a-z A-Z 0-9 _ : . or -.');
+			exit;
+		}
+
+		unless ($degenerate_sequence =~ m/[a-zA-Z0-9_:\.\-]/i) {
+			$logger->fatal('Degenerate sequence countains illegal characters on line ' . $line_counter . '. Sequence should contain only a-z A-Z.');
+			exit;
+		}
+		
+		$degenerate_sequences{$prefix} = $degenerate_sequence;
+		$logger->debug('Found degenerate sequence ' . $degenerate_sequence . ' with prefix ' . $prefix);
+		
+	}
+
+	close($file_path_fh);
+	$logger->info('Parsed input.');
+
+	return (\%degenerate_sequences);
+
+}
+
+sub _GenerateSeqs {
+
+	my ($degenerate_sequences, $path_to, $acid_type) = @_;
+
+	$logger->info('Generating sequences...');
+
+	my $generated_seqs = 0;
+	my $path_to_fh;
+
+	eval {
+		open($path_to_fh,	">$path_to");
+	};
+	if ($@) {
+		$logger->fatal('Cannot write FASTA file: ' . $@);
+		exit;
+	}
+	
+	DS:foreach my $prefix(sort(keys(%{$degenerate_sequences}))) {
+		
+		my $degenerate_sequence = ${$degenerate_sequences}{$prefix};
+		$degenerate_sequence = uc($degenerate_sequence);
+		
+		$logger->debug("\t" . 'For ' . $prefix);
+		
+		my $suffix = 1;
+		my @degenerate_sequence_acids = split('', $degenerate_sequence);
+		my $new_sequences = [];
+		my $next_acid_count = 0;
+		
+		foreach my $next_acid (@degenerate_sequence_acids) {
+			
+			$next_acid_count++;
+			
+			unless (defined($acids{$next_acid})) {
+				$logger->error('Unknown (degenerate) acid ' . $next_acid . ' in input sequence ' . $prefix . ' (' . $degenerate_sequence . ').');
+				$logger->warn("\t" . 'Skipping sequence ' . $prefix . '.');
+				next DS;
+			}
+			
+			if ($acid_type eq 'aa') {
+			
+				if ($next_acid eq 'X') {
+				
+					#
+					# By default X will expand to the 20 most common amino acids,
+					# but if -x 22 was specified X will expand to all 22 amino acids 
+					# including the very rare ones.
+					#
+					if ($aa_x == 22) {	
+					
+						$next_acid = 'X22';
+					
+					}
+				}
+			}
+			
+			$new_sequences = _GrowSequence ($new_sequences, $next_acid);
+			
+			my $sequence_count = scalar(@{$new_sequences});
+			$logger->trace("\t\t" . $next_acid_count . ' Acid ' . $next_acid . ': ' . $sequence_count . ' sequences.');
+			
+		}
+		
+		foreach my $new_sequence (@{$new_sequences}) {
+			
+			$generated_seqs++;
+			my $id = $prefix . '_' . $suffix;
+			
+			$logger->trace("\t\t\t" . $id . ' :: ' . $new_sequence);
+			
+			print($path_to_fh '>' . $id . "\n");
+			# TODO: wrap long sequences over multiple lines.
+			print($path_to_fh $new_sequence . "\n");		
+			
+			$suffix++;
+			
+		}	
+	}
+	
+	close($path_to_fh);
+
+	return ($generated_seqs);
+
+}
+
+#
+# Usage
+#
+
+sub _GrowSequence {
+
+	my ($sequences, $next_acid) = @_;
+	
+	my @larger_sequences;
+	
+	if (scalar(@{$sequences}) < 1) {
+		
+		foreach my $acid (@{$acids{$next_acid}}) {
+		
+			my $larger_sequence = $acid;
+			
+			push(@larger_sequences, $larger_sequence);
+			
+		}
+	
+	} else {
+		
+		foreach my $sequence (@{$sequences}) {
+					
+			foreach my $acid (@{$acids{$next_acid}}) {
+			
+				my $larger_sequence = $sequence . $acid;
+				
+				push(@larger_sequences, $larger_sequence);
+				
+			}
+		}
+	}
+	
+	return (\@larger_sequences);
+
+}
+
+sub _Usage {
+
+	print STDERR "\n"
+	  . 'GenerateDegenerateFasta.pl:' . "\n\n"
+	  . '   Generates a multi-sequence FASTA file with all possible combinations of sequences ' . "\n"
+	  . '   based on degenerate input sequences.' . "\n\n"
+	  . 'Usage:' . "\n\n"
+	  . '   GenerateDegenerateFasta.pl options' . "\n\n"
+	  . 'Available options are:' . "\n\n"
+	  . '   -i [file]   Input file in tab delimited format.' . "\n"
+	  . '   -p [number] Prefix column.' . "\n"
+	  . '               Column from the input file containg strings that will be used as prefixes ' . "\n"
+	  . '               to generate unique IDs for the FASTA sequences.' . "\n"
+	  . '   -s [number] Sequence column.' . "\n"
+	  . '               Column from the input file containg degenerate amino or nucleic acid sequences ' . "\n"
+	  . '               that will be used to generate the FASTA sequences.' . "\n"
+	  . '               For example RXX can be used to generate the amino acids sequences RAA, RAC, RAD ... RYY.' . "\n"
+	  . '   -t [type]   Acid type of the degenerate input sequences.' . "\n"
+	  . '               Must be one of:' . "\n"
+	  . '                  aa  - for amino acids' . "\n"
+	  . '                  dna - for deoxyribonucleic acids' . "\n"
+	  . '                  rna - for ribonucleic acids' . "\n"
+	  . '   -o [file]   Output file in FASTA format.' . "\n"
+	  . '   -x [20|22]  Indicates whether the degenerate amino acid X represents only the 20 most common amino acids (default).' . "\n"
+	  . '               or all 22 amino acids including the rare Selenocysteine and Pyrrolysine.' . "\n"
+	  . '   -l [LEVEL]  Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n"
+	  . "\n";
+	exit;
+
+}
author	galaxyp
date	Fri, 10 May 2013 17:15:08 -0400
parents
children