Mercurial > repos > galaxyp > nbic_fasta

#!/usr/bin/perl -w
#
# This script generates a multi-sequence FASTA file
# with all possible combinations of sequences based
# on degenerate input sequences.
#
# =====================================================
# $Id: GenerateDegenerateFasta.pl 67 2010-11-09 15:20:01Z pieter.neerincx@gmail.com $
# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/GenerateDegenerateFasta.pl $
# $LastChangedDate: 2010-11-09 09:20:01 -0600 (Tue, 09 Nov 2010) $
# $LastChangedRevision: 67 $
# $LastChangedBy: pieter.neerincx@gmail.com $
# =====================================================

#
# initialise environment
#
use strict;
use Getopt::Std;
use Log::Log4perl qw(:easy);

my %log_levels = (
	'ALL'   => $ALL,
	'TRACE' => $TRACE,
	'DEBUG' => $DEBUG,
	'INFO'  => $INFO,
	'WARN'  => $WARN,
	'ERROR' => $ERROR,
	'FATAL' => $FATAL,
	'OFF'   => $OFF,
);

my %amino_acids = (
	'A'		=> ['A'],
	'B'		=> ['D','N'],
	'C'		=> ['C'],
	'D'		=> ['D'],
	'E'		=> ['E'],
	'F'		=> ['F'],
	'G'		=> ['G'],
	'H'		=> ['H'],
	'I'		=> ['I'],
	'J'		=> ['I','L'],
	'K'		=> ['K'],
	'L'		=> ['L'],
	'M'		=> ['M'],
	'N'		=> ['N'],
	'O'		=> ['O'],
	'P'		=> ['P'],
	'Q'		=> ['Q'],
	'R'		=> ['R'],
	'S'		=> ['S'],
	'T'		=> ['T'],
	'U'		=> ['U'],
	'V'		=> ['V'],
	'W'		=> ['W'],
	'X'		=> ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'],	# 20 regular Amino Acids.
	'X22'	=> ['A','C','D','E','F','G','H','I','K','L','M','N','O','P','Q','R','S','T','U','V','W','Y'],	# 22 Amino Acids including the rare Selenocysteine and Pyrrolysine.
	'Y'		=> ['Y'],
	'Z'		=> ['E','Q'],
);

my %dna = (
	'A'		=> ['A'],
	'B'		=> ['C','G','T'],
	'C'		=> ['C'],
	'D'		=> ['A','G','T'],
	'G'		=> ['G'],
	'H'		=> ['A','C','T'],
	'K'		=> ['G','T'],
	'M'		=> ['A','C'],
	'N'		=> ['A','C','G','T'],
	'R'		=> ['A','G'],
	'S'		=> ['C','G'],
	'T'		=> ['T'],
	'V'		=> ['A','C','G'],
	'W'		=> ['A','T'],
	'Y'		=> ['C','T'],
);

my %rna = (
	'A'		=> ['A'],
	'B'		=> ['C','G','U'],
	'C'		=> ['C'],
	'D'		=> ['A','G','U'],
	'G'		=> ['G'],
	'H'		=> ['A','C','U'],
	'K'		=> ['G','U'],
	'M'		=> ['A','C'],
	'N'		=> ['A','C','G','U'],
	'R'		=> ['A','G'],
	'S'		=> ['C','G'],
	'U'		=> ['U'],
	'V'		=> ['A','C','G'],
	'W'		=> ['A','U'],
	'Y'		=> ['C','U'],
);

#
# Get options.
#
my %opts;
Getopt::Std::getopts('i:p:s:t:o:x:l:', \%opts);

my $input						= $opts{'i'};
my $prefix_column				= $opts{'p'};
my $sequence_column				= $opts{'s'};
my $acid_type					= $opts{'t'};
my $output						= $opts{'o'};
my $aa_x						= $opts{'x'};
my $log_level					= $opts{'l'};

#
# Configure logging.
#
# Provides default if user did not specify log level:
$log_level = (defined($log_level) ? $log_level : 'INFO');

# Reset log level to default if user specified illegal log level.
$log_level = (
	defined($log_levels{$log_level})
	? $log_levels{$log_level}
	: $log_levels{'INFO'});

#Log::Log4perl->init('log4perl.properties');
Log::Log4perl->easy_init(

	#{ level    => $log_level,
	#  file     => ">>GenerateDegenrateFasta.log",
	#  layout   => '%F{1}-%L-%M: %m%n' },
	{
		level  => $log_level,
		file   => "STDOUT",
		layout => '%d L:%L %p> %m%n'
	},
);
my $logger = Log::Log4perl::get_logger();

#
# Check user input.
#
unless (defined($input) && defined($output)) {
	_Usage();
	exit;
}
if ($input =~ /^$/ || $output =~ /^$/) {
	_Usage();
	exit;
}
if ($input eq $output) {
	$logger->fatal('Output file is the same as the input file. Select a different file for the output.');
	exit;
}

#
# Check input file.
#
unless (-e $input && -f $input && -r $input) {
	$logger->fatal('Cannot read from input file ' . $input . ': ' . $!);
	exit;
}

$prefix_column = (defined($prefix_column) ? $prefix_column : 1);
unless ($prefix_column =~ m/^\d+$/ && $prefix_column > 0) {
	$logger->fatal('Prefix column must be a positive integer.');
	exit;
} else {
	$logger->debug('Prefix column = ' . $prefix_column);
}

$sequence_column = (defined($sequence_column) ? $sequence_column : 2);
unless ($sequence_column =~ m/^\d+$/ && $sequence_column > 0) {
	$logger->fatal('Sequence column must be a positive integer.');
	exit;
} else {
	$logger->debug('Sequence column = ' . $sequence_column);
}

$acid_type = (defined($acid_type) ? $acid_type : 'aa');
unless ($acid_type eq 'aa' || $acid_type eq 'dna' || $acid_type eq 'rna') {
	$logger->fatal('Illegal value for option -t. Value for acid type must be \'aa\' for amino acids, \'dna\' for deoxyribonucleic acids or \'rna\' for ribonucleic acids.');
	exit;
}

my %acids;
if ($acid_type eq 'aa') {
	$aa_x = (defined($aa_x) ? $aa_x : 20);
	unless ($aa_x == 20 || $aa_x == 22) {
		$logger->fatal('Illegal value for option -x. Value for amino acid \'X\' expansion must be 20 or 22.');
		exit;
	}
	%acids = %amino_acids;
} elsif ($acid_type eq 'dna') {
	%acids = %dna;
} elsif ($acid_type eq 'rna') {
	%acids = %rna;
}

#
# Read degenerate sequences and their IDs from a tab delimited file
#
my $degenerate_sequences = _ParseInput($input, $prefix_column, $sequence_column);
my $degenerate_sequence_count = scalar(keys(%{$degenerate_sequences}));
$logger->info('Number of degenerate sequences: ' . $degenerate_sequence_count);

#
# Generate FASTA DB with all possible permutations.
#
my ($counter) = _GenerateSeqs($degenerate_sequences, $output, $acid_type);

$logger->info('Generated FASTA DB with ' . $counter . ' sequences.');
$logger->info('Finished!');

#
##
### Internal subs.
##
#

sub _ParseInput {

	my ($file_path, $prefix_column, $sequence_column) = @_;

	$logger->info('Parsing ' . $file_path . '...');

	my %degenerate_sequences;
	my $file_path_fh;
	my $prefix_column_offset	= $prefix_column - 1;
	my $sequence_column_offset	= $sequence_column - 1;
	my $line_counter			= 0;

	eval {
		open($file_path_fh, "<$file_path");
	};
	if ($@) {
		$logger->fatal('Cannot read input file: ' . $@);
		exit;
	}

	LINE: while (my $line = <$file_path_fh>) {

		$line =~ s/[\r\n]+//g;
		$line_counter++;

		my @values = split("\t", $line);

		unless (defined($values[$prefix_column_offset])) {
			$logger->fatal('Prefix missing on line ' . $line_counter . ' of the input file.');
			exit;
		}

		unless (defined($values[$sequence_column_offset])) {
			$logger->fatal('Sequence missing on line ' . $line_counter . ' of the input file.');
			exit;
		}

		my $prefix = $values[$prefix_column_offset];
		my $degenerate_sequence = $values[$sequence_column_offset];


		unless ($prefix =~ m/[a-zA-Z0-9_:\.\-]/i) {
			$logger->fatal('Prefix countains illegal characters on line ' . $line_counter . '. Prefix should contain only a-z A-Z 0-9 _ : . or -.');
			exit;
		}

		unless ($degenerate_sequence =~ m/[a-zA-Z0-9_:\.\-]/i) {
			$logger->fatal('Degenerate sequence countains illegal characters on line ' . $line_counter . '. Sequence should contain only a-z A-Z.');
			exit;
		}

		$degenerate_sequences{$prefix} = $degenerate_sequence;
		$logger->debug('Found degenerate sequence ' . $degenerate_sequence . ' with prefix ' . $prefix);

	}

	close($file_path_fh);
	$logger->info('Parsed input.');

	return (\%degenerate_sequences);

}

sub _GenerateSeqs {

	my ($degenerate_sequences, $path_to, $acid_type) = @_;

	$logger->info('Generating sequences...');

	my $generated_seqs = 0;
	my $path_to_fh;

	eval {
		open($path_to_fh,	">$path_to");
	};
	if ($@) {
		$logger->fatal('Cannot write FASTA file: ' . $@);
		exit;
	}

	DS:foreach my $prefix(sort(keys(%{$degenerate_sequences}))) {

		my $degenerate_sequence = ${$degenerate_sequences}{$prefix};
		$degenerate_sequence = uc($degenerate_sequence);

		$logger->debug("\t" . 'For ' . $prefix);

		my $suffix = 1;
		my @degenerate_sequence_acids = split('', $degenerate_sequence);
		my $new_sequences = [];
		my $next_acid_count = 0;

		foreach my $next_acid (@degenerate_sequence_acids) {

			$next_acid_count++;

			unless (defined($acids{$next_acid})) {
				$logger->error('Unknown (degenerate) acid ' . $next_acid . ' in input sequence ' . $prefix . ' (' . $degenerate_sequence . ').');
				$logger->warn("\t" . 'Skipping sequence ' . $prefix . '.');
				next DS;
			}

			if ($acid_type eq 'aa') {

				if ($next_acid eq 'X') {

					#
					# By default X will expand to the 20 most common amino acids,
					# but if -x 22 was specified X will expand to all 22 amino acids
					# including the very rare ones.
					#
					if ($aa_x == 22) {

						$next_acid = 'X22';

					}
				}
			}

			$new_sequences = _GrowSequence ($new_sequences, $next_acid);

			my $sequence_count = scalar(@{$new_sequences});
			$logger->trace("\t\t" . $next_acid_count . ' Acid ' . $next_acid . ': ' . $sequence_count . ' sequences.');

		}

		foreach my $new_sequence (@{$new_sequences}) {

			$generated_seqs++;
			my $id = $prefix . '_' . $suffix;

			$logger->trace("\t\t\t" . $id . ' :: ' . $new_sequence);

			print($path_to_fh '>' . $id . "\n");
			# TODO: wrap long sequences over multiple lines.
			print($path_to_fh $new_sequence . "\n");

			$suffix++;

		}
	}

	close($path_to_fh);

	return ($generated_seqs);

}

#
# Usage
#

sub _GrowSequence {

	my ($sequences, $next_acid) = @_;

	my @larger_sequences;

	if (scalar(@{$sequences}) < 1) {

		foreach my $acid (@{$acids{$next_acid}}) {

			my $larger_sequence = $acid;

			push(@larger_sequences, $larger_sequence);

		}

	} else {

		foreach my $sequence (@{$sequences}) {

			foreach my $acid (@{$acids{$next_acid}}) {

				my $larger_sequence = $sequence . $acid;

				push(@larger_sequences, $larger_sequence);

			}
		}
	}

	return (\@larger_sequences);

}

sub _Usage {

	print STDERR "\n"
	  . 'GenerateDegenerateFasta.pl:' . "\n\n"
	  . '   Generates a multi-sequence FASTA file with all possible combinations of sequences ' . "\n"
	  . '   based on degenerate input sequences.' . "\n\n"
	  . 'Usage:' . "\n\n"
	  . '   GenerateDegenerateFasta.pl options' . "\n\n"
	  . 'Available options are:' . "\n\n"
	  . '   -i [file]   Input file in tab delimited format.' . "\n"
	  . '   -p [number] Prefix column.' . "\n"
	  . '               Column from the input file containg strings that will be used as prefixes ' . "\n"
	  . '               to generate unique IDs for the FASTA sequences.' . "\n"
	  . '   -s [number] Sequence column.' . "\n"
	  . '               Column from the input file containg degenerate amino or nucleic acid sequences ' . "\n"
	  . '               that will be used to generate the FASTA sequences.' . "\n"
	  . '               For example RXX can be used to generate the amino acids sequences RAA, RAC, RAD ... RYY.' . "\n"
	  . '   -t [type]   Acid type of the degenerate input sequences.' . "\n"
	  . '               Must be one of:' . "\n"
	  . '                  aa  - for amino acids' . "\n"
	  . '                  dna - for deoxyribonucleic acids' . "\n"
	  . '                  rna - for ribonucleic acids' . "\n"
	  . '   -o [file]   Output file in FASTA format.' . "\n"
	  . '   -x [20|22]  Indicates whether the degenerate amino acid X represents only the 20 most common amino acids (default).' . "\n"
	  . '               or all 22 amino acids including the rare Selenocysteine and Pyrrolysine.' . "\n"
	  . '   -l [LEVEL]  Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n"
	  . "\n";
	exit;

}
author	galaxyp
date	Fri, 10 May 2013 17:15:08 -0400
parents
children