Mercurial > repos > galaxyp > nbic_fasta
comparison FastaStats.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:163892325845 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 # | |
4 # FastaStats.pl | |
5 # | |
6 # ===================================================== | |
7 # $Id: FastaStats.pl 6 2010-05-27 15:52:50Z pieter $ | |
8 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/FastaStats.pl $ | |
9 # $LastChangedDate: 2010-05-27 10:52:50 -0500 (Thu, 27 May 2010) $ | |
10 # $LastChangedRevision: 6 $ | |
11 # $LastChangedBy: pieter $ | |
12 # ===================================================== | |
13 # | |
14 # Counts the amount of sequences in a FASTA file | |
15 # as well as the amount of nucleotides / amino acids | |
16 # and the sueqence composition. | |
17 # | |
18 | |
19 # | |
20 # Initialize evironment | |
21 # | |
22 use strict; | |
23 use Getopt::Std; | |
24 use Log::Log4perl qw(:easy); | |
25 | |
26 my %log_levels = ( | |
27 'ALL' => $ALL, | |
28 'TRACE' => $TRACE, | |
29 'DEBUG' => $DEBUG, | |
30 'INFO' => $INFO, | |
31 'WARN' => $WARN, | |
32 'ERROR' => $ERROR, | |
33 'FATAL' => $FATAL, | |
34 'OFF' => $OFF, | |
35 ); | |
36 | |
37 # | |
38 # Get options. | |
39 # | |
40 my %opts; | |
41 Getopt::Std::getopts('pi:o:l:', \%opts); | |
42 | |
43 my $fasta_file = $opts{'i'}; | |
44 my $stats_file = $opts{'o'}; | |
45 my $log_level = $opts{'l'}; | |
46 my $get_positional_composition = $opts{'p'}; | |
47 | |
48 # | |
49 # Configure logging. | |
50 # | |
51 # Provides default if user did not specify log level: | |
52 $log_level = (defined($log_level) ? $log_level : 'INFO'); | |
53 # Reset log level to default if user specified illegal log level. | |
54 $log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'}); | |
55 #Log::Log4perl->init('log4perl.properties'); | |
56 Log::Log4perl->easy_init( | |
57 #{ level => $log_level, | |
58 # file => ">>FastaStats.log", | |
59 # layout => '%F{1}-%L-%M: %m%n' }, | |
60 { level => $log_level, | |
61 file => "STDOUT", | |
62 layout => '%d L:%L %p> %m%n' }, | |
63 ); | |
64 my $logger = Log::Log4perl::get_logger(); | |
65 | |
66 # | |
67 # Check options. | |
68 # | |
69 if ($fasta_file =~ /^$/) { | |
70 _Usage(); | |
71 exit; | |
72 } | |
73 unless (-f $fasta_file && -r $fasta_file) { | |
74 _Usage(); | |
75 exit; | |
76 } | |
77 | |
78 # | |
79 # Start job. | |
80 # | |
81 $logger->info('Starting...'); | |
82 $logger->info('Using FASTA file: '. $fasta_file); | |
83 | |
84 my $sequence_count = 0; | |
85 my %acid_composition_total; | |
86 my %acid_composition_positional; | |
87 my $position_offset; | |
88 | |
89 # | |
90 # Create filehandles. | |
91 # | |
92 my $fasta_fh; | |
93 my $stats_fh; | |
94 | |
95 eval { | |
96 open($fasta_fh, "<$fasta_file"); | |
97 open($stats_fh, ">$stats_file"); | |
98 }; | |
99 if ($@) { | |
100 $logger->fatal('Cannot create filehandle: ' . $@); | |
101 exit; | |
102 } | |
103 | |
104 # | |
105 # Parse FASTA file. | |
106 # | |
107 while (my $line = <$fasta_fh>) { | |
108 | |
109 if ($line =~ /^>/i) { | |
110 | |
111 $sequence_count++; | |
112 $position_offset = 0; # reset position. | |
113 | |
114 } else { | |
115 | |
116 # | |
117 # Ignore: | |
118 # * white space | |
119 # * end of line (EOL) characters | |
120 # * lower- and/or uppercase (repeat) masking. | |
121 # | |
122 my $seq_line = $line; | |
123 $seq_line =~ s/[\s\n\r]+//g; | |
124 next if ($seq_line eq ''); | |
125 $seq_line = uc($seq_line); | |
126 | |
127 if ($seq_line =~ m/^([a-zA-Z*-]+)$/) { | |
128 | |
129 my $seq = $1; | |
130 my @acids = split(//, $seq); | |
131 | |
132 foreach my $acid(@acids) { | |
133 | |
134 $acid_composition_total{$acid}++; | |
135 | |
136 if ($get_positional_composition) { | |
137 | |
138 $acid_composition_positional{$acid}[$position_offset]++; | |
139 $position_offset++; | |
140 | |
141 } | |
142 } | |
143 | |
144 } else { | |
145 | |
146 $logger->warn('Weird line in FASTA file: ' . $line); | |
147 exit; | |
148 | |
149 } | |
150 } | |
151 } | |
152 | |
153 # | |
154 # Save stats. | |
155 # | |
156 print($stats_fh 'Sequences' . "\t" . $sequence_count . "\n"); | |
157 print($stats_fh 'Total acid composition:' . "\n"); | |
158 my $total_acid_count = 0; | |
159 foreach my $acid (sort(keys(%acid_composition_total))) { | |
160 my $acid_count = $acid_composition_total{$acid}; | |
161 print($stats_fh 'Acid ' . $acid . "\t" . $acid_count . "\n"); | |
162 $total_acid_count += $acid_count; | |
163 } | |
164 if ($get_positional_composition) { | |
165 print($stats_fh 'Positional acid composition:' . "\n"); | |
166 foreach my $acid (sort(keys(%acid_composition_positional))) { | |
167 print($stats_fh 'Acid ' . $acid); | |
168 for my $inx (1 .. scalar(@{$acid_composition_positional{$acid}})) { | |
169 my $acid_count; | |
170 if (defined($acid_composition_positional{$acid}[$inx-1])) { | |
171 $acid_count = $acid_composition_positional{$acid}[$inx-1]; | |
172 } else { | |
173 $acid_count = 0; | |
174 } | |
175 print($stats_fh "\t" . $acid_count); | |
176 } | |
177 print($stats_fh "\n"); | |
178 } | |
179 } | |
180 | |
181 print($stats_fh 'Total acids' . "\t" . $total_acid_count . "\n"); | |
182 | |
183 # | |
184 # Close filehandles. | |
185 # | |
186 close($fasta_fh); | |
187 close($stats_fh); | |
188 | |
189 $logger->info('Found ' . $total_acid_count . ' nucleotide/amino acids in ' . $sequence_count . ' sequences.'); | |
190 $logger->info('Finished!'); | |
191 | |
192 # | |
193 ## | |
194 ### Subs. | |
195 ## | |
196 # | |
197 | |
198 sub _Usage { | |
199 | |
200 print "\n"; | |
201 print "FastaStats.pl - Reports statistics on a FASTA file like \n"; | |
202 print " * The number of sequences\n"; | |
203 print " * The total number of (nucleotide|amino) acids\n"; | |
204 print " * Sequence composition per (nucleotide|amino) acid\n"; | |
205 print "\n"; | |
206 print "Usage:\n"; | |
207 print "\n"; | |
208 print " FastaStats.pl options\n"; | |
209 print "\n"; | |
210 print "Available options are:\n"; | |
211 print "\n"; | |
212 print " -i [file] Input FASTA file.\n"; | |
213 print " -o [file] Output stats file.\n"; | |
214 print " -p Add positional stats to the output.\n"; | |
215 print " This will count the occurrance of each AA on each position of all sequences.\n"; | |
216 print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"; | |
217 print "\n"; | |
218 exit; | |
219 | |
220 } |