0
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #
|
|
4 # FastaStats.pl
|
|
5 #
|
|
6 # =====================================================
|
|
7 # $Id: FastaStats.pl 6 2010-05-27 15:52:50Z pieter $
|
|
8 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/FastaStats.pl $
|
|
9 # $LastChangedDate: 2010-05-27 10:52:50 -0500 (Thu, 27 May 2010) $
|
|
10 # $LastChangedRevision: 6 $
|
|
11 # $LastChangedBy: pieter $
|
|
12 # =====================================================
|
|
13 #
|
|
14 # Counts the amount of sequences in a FASTA file
|
|
15 # as well as the amount of nucleotides / amino acids
|
|
16 # and the sueqence composition.
|
|
17 #
|
|
18
|
|
19 #
|
|
20 # Initialize evironment
|
|
21 #
|
|
22 use strict;
|
|
23 use Getopt::Std;
|
|
24 use Log::Log4perl qw(:easy);
|
|
25
|
|
26 my %log_levels = (
|
|
27 'ALL' => $ALL,
|
|
28 'TRACE' => $TRACE,
|
|
29 'DEBUG' => $DEBUG,
|
|
30 'INFO' => $INFO,
|
|
31 'WARN' => $WARN,
|
|
32 'ERROR' => $ERROR,
|
|
33 'FATAL' => $FATAL,
|
|
34 'OFF' => $OFF,
|
|
35 );
|
|
36
|
|
37 #
|
|
38 # Get options.
|
|
39 #
|
|
40 my %opts;
|
|
41 Getopt::Std::getopts('pi:o:l:', \%opts);
|
|
42
|
|
43 my $fasta_file = $opts{'i'};
|
|
44 my $stats_file = $opts{'o'};
|
|
45 my $log_level = $opts{'l'};
|
|
46 my $get_positional_composition = $opts{'p'};
|
|
47
|
|
48 #
|
|
49 # Configure logging.
|
|
50 #
|
|
51 # Provides default if user did not specify log level:
|
|
52 $log_level = (defined($log_level) ? $log_level : 'INFO');
|
|
53 # Reset log level to default if user specified illegal log level.
|
|
54 $log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'});
|
|
55 #Log::Log4perl->init('log4perl.properties');
|
|
56 Log::Log4perl->easy_init(
|
|
57 #{ level => $log_level,
|
|
58 # file => ">>FastaStats.log",
|
|
59 # layout => '%F{1}-%L-%M: %m%n' },
|
|
60 { level => $log_level,
|
|
61 file => "STDOUT",
|
|
62 layout => '%d L:%L %p> %m%n' },
|
|
63 );
|
|
64 my $logger = Log::Log4perl::get_logger();
|
|
65
|
|
66 #
|
|
67 # Check options.
|
|
68 #
|
|
69 if ($fasta_file =~ /^$/) {
|
|
70 _Usage();
|
|
71 exit;
|
|
72 }
|
|
73 unless (-f $fasta_file && -r $fasta_file) {
|
|
74 _Usage();
|
|
75 exit;
|
|
76 }
|
|
77
|
|
78 #
|
|
79 # Start job.
|
|
80 #
|
|
81 $logger->info('Starting...');
|
|
82 $logger->info('Using FASTA file: '. $fasta_file);
|
|
83
|
|
84 my $sequence_count = 0;
|
|
85 my %acid_composition_total;
|
|
86 my %acid_composition_positional;
|
|
87 my $position_offset;
|
|
88
|
|
89 #
|
|
90 # Create filehandles.
|
|
91 #
|
|
92 my $fasta_fh;
|
|
93 my $stats_fh;
|
|
94
|
|
95 eval {
|
|
96 open($fasta_fh, "<$fasta_file");
|
|
97 open($stats_fh, ">$stats_file");
|
|
98 };
|
|
99 if ($@) {
|
|
100 $logger->fatal('Cannot create filehandle: ' . $@);
|
|
101 exit;
|
|
102 }
|
|
103
|
|
104 #
|
|
105 # Parse FASTA file.
|
|
106 #
|
|
107 while (my $line = <$fasta_fh>) {
|
|
108
|
|
109 if ($line =~ /^>/i) {
|
|
110
|
|
111 $sequence_count++;
|
|
112 $position_offset = 0; # reset position.
|
|
113
|
|
114 } else {
|
|
115
|
|
116 #
|
|
117 # Ignore:
|
|
118 # * white space
|
|
119 # * end of line (EOL) characters
|
|
120 # * lower- and/or uppercase (repeat) masking.
|
|
121 #
|
|
122 my $seq_line = $line;
|
|
123 $seq_line =~ s/[\s\n\r]+//g;
|
|
124 next if ($seq_line eq '');
|
|
125 $seq_line = uc($seq_line);
|
|
126
|
|
127 if ($seq_line =~ m/^([a-zA-Z*-]+)$/) {
|
|
128
|
|
129 my $seq = $1;
|
|
130 my @acids = split(//, $seq);
|
|
131
|
|
132 foreach my $acid(@acids) {
|
|
133
|
|
134 $acid_composition_total{$acid}++;
|
|
135
|
|
136 if ($get_positional_composition) {
|
|
137
|
|
138 $acid_composition_positional{$acid}[$position_offset]++;
|
|
139 $position_offset++;
|
|
140
|
|
141 }
|
|
142 }
|
|
143
|
|
144 } else {
|
|
145
|
|
146 $logger->warn('Weird line in FASTA file: ' . $line);
|
|
147 exit;
|
|
148
|
|
149 }
|
|
150 }
|
|
151 }
|
|
152
|
|
153 #
|
|
154 # Save stats.
|
|
155 #
|
|
156 print($stats_fh 'Sequences' . "\t" . $sequence_count . "\n");
|
|
157 print($stats_fh 'Total acid composition:' . "\n");
|
|
158 my $total_acid_count = 0;
|
|
159 foreach my $acid (sort(keys(%acid_composition_total))) {
|
|
160 my $acid_count = $acid_composition_total{$acid};
|
|
161 print($stats_fh 'Acid ' . $acid . "\t" . $acid_count . "\n");
|
|
162 $total_acid_count += $acid_count;
|
|
163 }
|
|
164 if ($get_positional_composition) {
|
|
165 print($stats_fh 'Positional acid composition:' . "\n");
|
|
166 foreach my $acid (sort(keys(%acid_composition_positional))) {
|
|
167 print($stats_fh 'Acid ' . $acid);
|
|
168 for my $inx (1 .. scalar(@{$acid_composition_positional{$acid}})) {
|
|
169 my $acid_count;
|
|
170 if (defined($acid_composition_positional{$acid}[$inx-1])) {
|
|
171 $acid_count = $acid_composition_positional{$acid}[$inx-1];
|
|
172 } else {
|
|
173 $acid_count = 0;
|
|
174 }
|
|
175 print($stats_fh "\t" . $acid_count);
|
|
176 }
|
|
177 print($stats_fh "\n");
|
|
178 }
|
|
179 }
|
|
180
|
|
181 print($stats_fh 'Total acids' . "\t" . $total_acid_count . "\n");
|
|
182
|
|
183 #
|
|
184 # Close filehandles.
|
|
185 #
|
|
186 close($fasta_fh);
|
|
187 close($stats_fh);
|
|
188
|
|
189 $logger->info('Found ' . $total_acid_count . ' nucleotide/amino acids in ' . $sequence_count . ' sequences.');
|
|
190 $logger->info('Finished!');
|
|
191
|
|
192 #
|
|
193 ##
|
|
194 ### Subs.
|
|
195 ##
|
|
196 #
|
|
197
|
|
198 sub _Usage {
|
|
199
|
|
200 print "\n";
|
|
201 print "FastaStats.pl - Reports statistics on a FASTA file like \n";
|
|
202 print " * The number of sequences\n";
|
|
203 print " * The total number of (nucleotide|amino) acids\n";
|
|
204 print " * Sequence composition per (nucleotide|amino) acid\n";
|
|
205 print "\n";
|
|
206 print "Usage:\n";
|
|
207 print "\n";
|
|
208 print " FastaStats.pl options\n";
|
|
209 print "\n";
|
|
210 print "Available options are:\n";
|
|
211 print "\n";
|
|
212 print " -i [file] Input FASTA file.\n";
|
|
213 print " -o [file] Output stats file.\n";
|
|
214 print " -p Add positional stats to the output.\n";
|
|
215 print " This will count the occurrance of each AA on each position of all sequences.\n";
|
|
216 print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n";
|
|
217 print "\n";
|
|
218 exit;
|
|
219
|
|
220 }
|