Mercurial > repos > galaxyp > nbic_fasta
view FastaStats.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl # # FastaStats.pl # # ===================================================== # $Id: FastaStats.pl 6 2010-05-27 15:52:50Z pieter $ # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/FastaStats.pl $ # $LastChangedDate: 2010-05-27 10:52:50 -0500 (Thu, 27 May 2010) $ # $LastChangedRevision: 6 $ # $LastChangedBy: pieter $ # ===================================================== # # Counts the amount of sequences in a FASTA file # as well as the amount of nucleotides / amino acids # and the sueqence composition. # # # Initialize evironment # use strict; use Getopt::Std; use Log::Log4perl qw(:easy); my %log_levels = ( 'ALL' => $ALL, 'TRACE' => $TRACE, 'DEBUG' => $DEBUG, 'INFO' => $INFO, 'WARN' => $WARN, 'ERROR' => $ERROR, 'FATAL' => $FATAL, 'OFF' => $OFF, ); # # Get options. # my %opts; Getopt::Std::getopts('pi:o:l:', \%opts); my $fasta_file = $opts{'i'}; my $stats_file = $opts{'o'}; my $log_level = $opts{'l'}; my $get_positional_composition = $opts{'p'}; # # Configure logging. # # Provides default if user did not specify log level: $log_level = (defined($log_level) ? $log_level : 'INFO'); # Reset log level to default if user specified illegal log level. $log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'}); #Log::Log4perl->init('log4perl.properties'); Log::Log4perl->easy_init( #{ level => $log_level, # file => ">>FastaStats.log", # layout => '%F{1}-%L-%M: %m%n' }, { level => $log_level, file => "STDOUT", layout => '%d L:%L %p> %m%n' }, ); my $logger = Log::Log4perl::get_logger(); # # Check options. # if ($fasta_file =~ /^$/) { _Usage(); exit; } unless (-f $fasta_file && -r $fasta_file) { _Usage(); exit; } # # Start job. # $logger->info('Starting...'); $logger->info('Using FASTA file: '. $fasta_file); my $sequence_count = 0; my %acid_composition_total; my %acid_composition_positional; my $position_offset; # # Create filehandles. # my $fasta_fh; my $stats_fh; eval { open($fasta_fh, "<$fasta_file"); open($stats_fh, ">$stats_file"); }; if ($@) { $logger->fatal('Cannot create filehandle: ' . $@); exit; } # # Parse FASTA file. # while (my $line = <$fasta_fh>) { if ($line =~ /^>/i) { $sequence_count++; $position_offset = 0; # reset position. } else { # # Ignore: # * white space # * end of line (EOL) characters # * lower- and/or uppercase (repeat) masking. # my $seq_line = $line; $seq_line =~ s/[\s\n\r]+//g; next if ($seq_line eq ''); $seq_line = uc($seq_line); if ($seq_line =~ m/^([a-zA-Z*-]+)$/) { my $seq = $1; my @acids = split(//, $seq); foreach my $acid(@acids) { $acid_composition_total{$acid}++; if ($get_positional_composition) { $acid_composition_positional{$acid}[$position_offset]++; $position_offset++; } } } else { $logger->warn('Weird line in FASTA file: ' . $line); exit; } } } # # Save stats. # print($stats_fh 'Sequences' . "\t" . $sequence_count . "\n"); print($stats_fh 'Total acid composition:' . "\n"); my $total_acid_count = 0; foreach my $acid (sort(keys(%acid_composition_total))) { my $acid_count = $acid_composition_total{$acid}; print($stats_fh 'Acid ' . $acid . "\t" . $acid_count . "\n"); $total_acid_count += $acid_count; } if ($get_positional_composition) { print($stats_fh 'Positional acid composition:' . "\n"); foreach my $acid (sort(keys(%acid_composition_positional))) { print($stats_fh 'Acid ' . $acid); for my $inx (1 .. scalar(@{$acid_composition_positional{$acid}})) { my $acid_count; if (defined($acid_composition_positional{$acid}[$inx-1])) { $acid_count = $acid_composition_positional{$acid}[$inx-1]; } else { $acid_count = 0; } print($stats_fh "\t" . $acid_count); } print($stats_fh "\n"); } } print($stats_fh 'Total acids' . "\t" . $total_acid_count . "\n"); # # Close filehandles. # close($fasta_fh); close($stats_fh); $logger->info('Found ' . $total_acid_count . ' nucleotide/amino acids in ' . $sequence_count . ' sequences.'); $logger->info('Finished!'); # ## ### Subs. ## # sub _Usage { print "\n"; print "FastaStats.pl - Reports statistics on a FASTA file like \n"; print " * The number of sequences\n"; print " * The total number of (nucleotide|amino) acids\n"; print " * Sequence composition per (nucleotide|amino) acid\n"; print "\n"; print "Usage:\n"; print "\n"; print " FastaStats.pl options\n"; print "\n"; print "Available options are:\n"; print "\n"; print " -i [file] Input FASTA file.\n"; print " -o [file] Output stats file.\n"; print " -p Add positional stats to the output.\n"; print " This will count the occurrance of each AA on each position of all sequences.\n"; print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"; print "\n"; exit; }