Mercurial > repos > galaxyp > nbic_fasta
diff FastaStats.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/FastaStats.pl Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,220 @@ +#!/usr/bin/perl + +# +# FastaStats.pl +# +# ===================================================== +# $Id: FastaStats.pl 6 2010-05-27 15:52:50Z pieter $ +# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/FastaStats.pl $ +# $LastChangedDate: 2010-05-27 10:52:50 -0500 (Thu, 27 May 2010) $ +# $LastChangedRevision: 6 $ +# $LastChangedBy: pieter $ +# ===================================================== +# +# Counts the amount of sequences in a FASTA file +# as well as the amount of nucleotides / amino acids +# and the sueqence composition. +# + +# +# Initialize evironment +# +use strict; +use Getopt::Std; +use Log::Log4perl qw(:easy); + +my %log_levels = ( + 'ALL' => $ALL, + 'TRACE' => $TRACE, + 'DEBUG' => $DEBUG, + 'INFO' => $INFO, + 'WARN' => $WARN, + 'ERROR' => $ERROR, + 'FATAL' => $FATAL, + 'OFF' => $OFF, +); + +# +# Get options. +# +my %opts; +Getopt::Std::getopts('pi:o:l:', \%opts); + +my $fasta_file = $opts{'i'}; +my $stats_file = $opts{'o'}; +my $log_level = $opts{'l'}; +my $get_positional_composition = $opts{'p'}; + +# +# Configure logging. +# +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'INFO'); +# Reset log level to default if user specified illegal log level. +$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'}); +#Log::Log4perl->init('log4perl.properties'); +Log::Log4perl->easy_init( + #{ level => $log_level, + # file => ">>FastaStats.log", + # layout => '%F{1}-%L-%M: %m%n' }, + { level => $log_level, + file => "STDOUT", + layout => '%d L:%L %p> %m%n' }, +); +my $logger = Log::Log4perl::get_logger(); + +# +# Check options. +# +if ($fasta_file =~ /^$/) { + _Usage(); + exit; +} +unless (-f $fasta_file && -r $fasta_file) { + _Usage(); + exit; +} + +# +# Start job. +# +$logger->info('Starting...'); +$logger->info('Using FASTA file: '. $fasta_file); + +my $sequence_count = 0; +my %acid_composition_total; +my %acid_composition_positional; +my $position_offset; + +# +# Create filehandles. +# +my $fasta_fh; +my $stats_fh; + +eval { + open($fasta_fh, "<$fasta_file"); + open($stats_fh, ">$stats_file"); +}; +if ($@) { + $logger->fatal('Cannot create filehandle: ' . $@); + exit; +} + +# +# Parse FASTA file. +# +while (my $line = <$fasta_fh>) { + + if ($line =~ /^>/i) { + + $sequence_count++; + $position_offset = 0; # reset position. + + } else { + + # + # Ignore: + # * white space + # * end of line (EOL) characters + # * lower- and/or uppercase (repeat) masking. + # + my $seq_line = $line; + $seq_line =~ s/[\s\n\r]+//g; + next if ($seq_line eq ''); + $seq_line = uc($seq_line); + + if ($seq_line =~ m/^([a-zA-Z*-]+)$/) { + + my $seq = $1; + my @acids = split(//, $seq); + + foreach my $acid(@acids) { + + $acid_composition_total{$acid}++; + + if ($get_positional_composition) { + + $acid_composition_positional{$acid}[$position_offset]++; + $position_offset++; + + } + } + + } else { + + $logger->warn('Weird line in FASTA file: ' . $line); + exit; + + } + } +} + +# +# Save stats. +# +print($stats_fh 'Sequences' . "\t" . $sequence_count . "\n"); +print($stats_fh 'Total acid composition:' . "\n"); +my $total_acid_count = 0; +foreach my $acid (sort(keys(%acid_composition_total))) { + my $acid_count = $acid_composition_total{$acid}; + print($stats_fh 'Acid ' . $acid . "\t" . $acid_count . "\n"); + $total_acid_count += $acid_count; +} +if ($get_positional_composition) { +print($stats_fh 'Positional acid composition:' . "\n"); + foreach my $acid (sort(keys(%acid_composition_positional))) { + print($stats_fh 'Acid ' . $acid); + for my $inx (1 .. scalar(@{$acid_composition_positional{$acid}})) { + my $acid_count; + if (defined($acid_composition_positional{$acid}[$inx-1])) { + $acid_count = $acid_composition_positional{$acid}[$inx-1]; + } else { + $acid_count = 0; + } + print($stats_fh "\t" . $acid_count); + } + print($stats_fh "\n"); + } +} + +print($stats_fh 'Total acids' . "\t" . $total_acid_count . "\n"); + +# +# Close filehandles. +# +close($fasta_fh); +close($stats_fh); + +$logger->info('Found ' . $total_acid_count . ' nucleotide/amino acids in ' . $sequence_count . ' sequences.'); +$logger->info('Finished!'); + +# +## +### Subs. +## +# + +sub _Usage { + + print "\n"; + print "FastaStats.pl - Reports statistics on a FASTA file like \n"; + print " * The number of sequences\n"; + print " * The total number of (nucleotide|amino) acids\n"; + print " * Sequence composition per (nucleotide|amino) acid\n"; + print "\n"; + print "Usage:\n"; + print "\n"; + print " FastaStats.pl options\n"; + print "\n"; + print "Available options are:\n"; + print "\n"; + print " -i [file] Input FASTA file.\n"; + print " -o [file] Output stats file.\n"; + print " -p Add positional stats to the output.\n"; + print " This will count the occurrance of each AA on each position of all sequences.\n"; + print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"; + print "\n"; + exit; + +}