diff FastaStats.pl @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/FastaStats.pl	Fri May 10 17:15:08 2013 -0400
@@ -0,0 +1,220 @@
+#!/usr/bin/perl
+
+#
+# FastaStats.pl
+#
+# =====================================================
+# $Id: FastaStats.pl 6 2010-05-27 15:52:50Z pieter $
+# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/FastaStats.pl $
+# $LastChangedDate: 2010-05-27 10:52:50 -0500 (Thu, 27 May 2010) $ 
+# $LastChangedRevision: 6 $
+# $LastChangedBy: pieter $
+# =====================================================
+# 
+# Counts the amount of sequences in a FASTA file 
+# as well as the amount of nucleotides / amino acids 
+# and the sueqence composition.
+#
+
+#
+# Initialize evironment
+#
+use strict;
+use Getopt::Std;
+use Log::Log4perl qw(:easy);
+
+my %log_levels = (
+	'ALL'	=> $ALL,
+	'TRACE'	=> $TRACE,
+	'DEBUG'	=> $DEBUG,
+	'INFO'	=> $INFO,
+	'WARN'	=> $WARN,
+	'ERROR'	=> $ERROR,
+	'FATAL'	=> $FATAL,
+	'OFF'	=> $OFF,
+);
+
+#
+# Get options.
+#
+my %opts;
+Getopt::Std::getopts('pi:o:l:', \%opts);
+
+my $fasta_file					= $opts{'i'};
+my $stats_file  				= $opts{'o'};
+my $log_level					= $opts{'l'};
+my $get_positional_composition	= $opts{'p'};
+
+#
+# Configure logging.
+#
+# Provides default if user did not specify log level:
+$log_level = (defined($log_level) ? $log_level : 'INFO');
+# Reset log level to default if user specified illegal log level.
+$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'});
+#Log::Log4perl->init('log4perl.properties');
+Log::Log4perl->easy_init(
+	#{ level    => $log_level,
+	#  file     => ">>FastaStats.log",
+	#  layout   => '%F{1}-%L-%M: %m%n' },
+	{ level    => $log_level,
+	  file     => "STDOUT",
+	  layout   => '%d L:%L %p> %m%n' },
+);
+my $logger = Log::Log4perl::get_logger();
+
+#
+# Check options.
+#
+if ($fasta_file =~ /^$/) {
+	_Usage();
+	exit;
+}
+unless (-f $fasta_file && -r $fasta_file) {
+	_Usage();
+	exit;
+}
+
+#
+# Start job.
+#
+$logger->info('Starting...');
+$logger->info('Using FASTA file: '. $fasta_file);
+
+my $sequence_count	= 0;
+my %acid_composition_total;
+my %acid_composition_positional;
+my $position_offset;
+
+#
+# Create filehandles.
+#
+my $fasta_fh;
+my $stats_fh;
+
+eval {
+	open($fasta_fh, "<$fasta_file");
+	open($stats_fh, ">$stats_file");
+};
+if ($@) {
+	$logger->fatal('Cannot create filehandle: ' . $@);
+	exit;
+}
+
+#
+# Parse FASTA file.
+#
+while (my $line = <$fasta_fh>) {
+
+	if ($line =~ /^>/i) {
+		
+		$sequence_count++;
+		$position_offset = 0; # reset position. 
+		
+	} else {
+		
+		#
+		# Ignore:
+		#	* white space
+		#	* end of line (EOL) characters 
+		#	* lower- and/or uppercase (repeat) masking.
+		#
+		my $seq_line = $line;
+		$seq_line =~ s/[\s\n\r]+//g;
+		next if ($seq_line eq '');
+		$seq_line = uc($seq_line);
+		
+		if ($seq_line =~ m/^([a-zA-Z*-]+)$/) {
+
+			my $seq = $1;
+			my @acids = split(//, $seq);
+
+			foreach my $acid(@acids) {
+				
+				$acid_composition_total{$acid}++;
+				
+				if ($get_positional_composition) {
+					
+					$acid_composition_positional{$acid}[$position_offset]++;
+					$position_offset++;
+										
+				}
+			}
+
+		} else {
+
+			$logger->warn('Weird line in FASTA file: ' . $line);
+			exit;
+
+		}
+	}
+}
+
+#
+# Save stats.
+#
+print($stats_fh 'Sequences'	. "\t" . $sequence_count	. "\n");
+print($stats_fh 'Total acid composition:' . "\n");
+my $total_acid_count = 0;
+foreach my $acid (sort(keys(%acid_composition_total))) {
+	my $acid_count = $acid_composition_total{$acid};
+	print($stats_fh 'Acid ' . $acid		. "\t" . $acid_count	. "\n");
+	$total_acid_count += $acid_count;
+}
+if ($get_positional_composition) {
+print($stats_fh 'Positional acid composition:' . "\n");
+	foreach my $acid (sort(keys(%acid_composition_positional))) { 
+		print($stats_fh 'Acid ' . $acid);
+		for my $inx (1 .. scalar(@{$acid_composition_positional{$acid}})) {
+			my $acid_count;
+			if (defined($acid_composition_positional{$acid}[$inx-1])) {
+				$acid_count = $acid_composition_positional{$acid}[$inx-1];
+			} else {
+				$acid_count = 0;
+			}
+			print($stats_fh "\t" . $acid_count);
+		}
+		print($stats_fh "\n");
+	}
+}
+
+print($stats_fh 'Total acids'		. "\t" . $total_acid_count	. "\n");
+
+#
+# Close filehandles.
+#
+close($fasta_fh);
+close($stats_fh);
+
+$logger->info('Found ' . $total_acid_count . ' nucleotide/amino acids in ' . $sequence_count . ' sequences.');
+$logger->info('Finished!');
+
+#
+##
+### Subs.
+##
+#
+
+sub _Usage {
+
+	print "\n";
+	print "FastaStats.pl - Reports statistics on a FASTA file like \n";
+	print "                  * The number of sequences\n";
+	print "                  * The total number of (nucleotide|amino) acids\n";
+	print "                  * Sequence composition per (nucleotide|amino) acid\n";
+	print "\n";
+	print "Usage:\n";
+	print "\n";
+	print "   FastaStats.pl options\n";
+	print "\n";
+	print "Available options are:\n";
+	print "\n";
+	print "   -i [file]          Input FASTA file.\n";
+	print "   -o [file]          Output stats file.\n";
+	print "   -p                 Add positional stats to the output.\n";
+	print "                      This will count the occurrance of each AA on each position of all sequences.\n";
+	print "   -l [LEVEL]         Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n";
+	print "\n";
+	exit;
+
+}