Mercurial > repos > galaxyp > nbic_fasta
comparison GenerateDegenerateFasta.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:163892325845 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # This script generates a multi-sequence FASTA file | |
4 # with all possible combinations of sequences based | |
5 # on degenerate input sequences. | |
6 # | |
7 # ===================================================== | |
8 # $Id: GenerateDegenerateFasta.pl 67 2010-11-09 15:20:01Z pieter.neerincx@gmail.com $ | |
9 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/GenerateDegenerateFasta.pl $ | |
10 # $LastChangedDate: 2010-11-09 09:20:01 -0600 (Tue, 09 Nov 2010) $ | |
11 # $LastChangedRevision: 67 $ | |
12 # $LastChangedBy: pieter.neerincx@gmail.com $ | |
13 # ===================================================== | |
14 | |
15 # | |
16 # initialise environment | |
17 # | |
18 use strict; | |
19 use Getopt::Std; | |
20 use Log::Log4perl qw(:easy); | |
21 | |
22 my %log_levels = ( | |
23 'ALL' => $ALL, | |
24 'TRACE' => $TRACE, | |
25 'DEBUG' => $DEBUG, | |
26 'INFO' => $INFO, | |
27 'WARN' => $WARN, | |
28 'ERROR' => $ERROR, | |
29 'FATAL' => $FATAL, | |
30 'OFF' => $OFF, | |
31 ); | |
32 | |
33 my %amino_acids = ( | |
34 'A' => ['A'], | |
35 'B' => ['D','N'], | |
36 'C' => ['C'], | |
37 'D' => ['D'], | |
38 'E' => ['E'], | |
39 'F' => ['F'], | |
40 'G' => ['G'], | |
41 'H' => ['H'], | |
42 'I' => ['I'], | |
43 'J' => ['I','L'], | |
44 'K' => ['K'], | |
45 'L' => ['L'], | |
46 'M' => ['M'], | |
47 'N' => ['N'], | |
48 'O' => ['O'], | |
49 'P' => ['P'], | |
50 'Q' => ['Q'], | |
51 'R' => ['R'], | |
52 'S' => ['S'], | |
53 'T' => ['T'], | |
54 'U' => ['U'], | |
55 'V' => ['V'], | |
56 'W' => ['W'], | |
57 'X' => ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'], # 20 regular Amino Acids. | |
58 'X22' => ['A','C','D','E','F','G','H','I','K','L','M','N','O','P','Q','R','S','T','U','V','W','Y'], # 22 Amino Acids including the rare Selenocysteine and Pyrrolysine. | |
59 'Y' => ['Y'], | |
60 'Z' => ['E','Q'], | |
61 ); | |
62 | |
63 my %dna = ( | |
64 'A' => ['A'], | |
65 'B' => ['C','G','T'], | |
66 'C' => ['C'], | |
67 'D' => ['A','G','T'], | |
68 'G' => ['G'], | |
69 'H' => ['A','C','T'], | |
70 'K' => ['G','T'], | |
71 'M' => ['A','C'], | |
72 'N' => ['A','C','G','T'], | |
73 'R' => ['A','G'], | |
74 'S' => ['C','G'], | |
75 'T' => ['T'], | |
76 'V' => ['A','C','G'], | |
77 'W' => ['A','T'], | |
78 'Y' => ['C','T'], | |
79 ); | |
80 | |
81 my %rna = ( | |
82 'A' => ['A'], | |
83 'B' => ['C','G','U'], | |
84 'C' => ['C'], | |
85 'D' => ['A','G','U'], | |
86 'G' => ['G'], | |
87 'H' => ['A','C','U'], | |
88 'K' => ['G','U'], | |
89 'M' => ['A','C'], | |
90 'N' => ['A','C','G','U'], | |
91 'R' => ['A','G'], | |
92 'S' => ['C','G'], | |
93 'U' => ['U'], | |
94 'V' => ['A','C','G'], | |
95 'W' => ['A','U'], | |
96 'Y' => ['C','U'], | |
97 ); | |
98 | |
99 # | |
100 # Get options. | |
101 # | |
102 my %opts; | |
103 Getopt::Std::getopts('i:p:s:t:o:x:l:', \%opts); | |
104 | |
105 my $input = $opts{'i'}; | |
106 my $prefix_column = $opts{'p'}; | |
107 my $sequence_column = $opts{'s'}; | |
108 my $acid_type = $opts{'t'}; | |
109 my $output = $opts{'o'}; | |
110 my $aa_x = $opts{'x'}; | |
111 my $log_level = $opts{'l'}; | |
112 | |
113 # | |
114 # Configure logging. | |
115 # | |
116 # Provides default if user did not specify log level: | |
117 $log_level = (defined($log_level) ? $log_level : 'INFO'); | |
118 | |
119 # Reset log level to default if user specified illegal log level. | |
120 $log_level = ( | |
121 defined($log_levels{$log_level}) | |
122 ? $log_levels{$log_level} | |
123 : $log_levels{'INFO'}); | |
124 | |
125 #Log::Log4perl->init('log4perl.properties'); | |
126 Log::Log4perl->easy_init( | |
127 | |
128 #{ level => $log_level, | |
129 # file => ">>GenerateDegenrateFasta.log", | |
130 # layout => '%F{1}-%L-%M: %m%n' }, | |
131 { | |
132 level => $log_level, | |
133 file => "STDOUT", | |
134 layout => '%d L:%L %p> %m%n' | |
135 }, | |
136 ); | |
137 my $logger = Log::Log4perl::get_logger(); | |
138 | |
139 # | |
140 # Check user input. | |
141 # | |
142 unless (defined($input) && defined($output)) { | |
143 _Usage(); | |
144 exit; | |
145 } | |
146 if ($input =~ /^$/ || $output =~ /^$/) { | |
147 _Usage(); | |
148 exit; | |
149 } | |
150 if ($input eq $output) { | |
151 $logger->fatal('Output file is the same as the input file. Select a different file for the output.'); | |
152 exit; | |
153 } | |
154 | |
155 # | |
156 # Check input file. | |
157 # | |
158 unless (-e $input && -f $input && -r $input) { | |
159 $logger->fatal('Cannot read from input file ' . $input . ': ' . $!); | |
160 exit; | |
161 } | |
162 | |
163 $prefix_column = (defined($prefix_column) ? $prefix_column : 1); | |
164 unless ($prefix_column =~ m/^\d+$/ && $prefix_column > 0) { | |
165 $logger->fatal('Prefix column must be a positive integer.'); | |
166 exit; | |
167 } else { | |
168 $logger->debug('Prefix column = ' . $prefix_column); | |
169 } | |
170 | |
171 $sequence_column = (defined($sequence_column) ? $sequence_column : 2); | |
172 unless ($sequence_column =~ m/^\d+$/ && $sequence_column > 0) { | |
173 $logger->fatal('Sequence column must be a positive integer.'); | |
174 exit; | |
175 } else { | |
176 $logger->debug('Sequence column = ' . $sequence_column); | |
177 } | |
178 | |
179 $acid_type = (defined($acid_type) ? $acid_type : 'aa'); | |
180 unless ($acid_type eq 'aa' || $acid_type eq 'dna' || $acid_type eq 'rna') { | |
181 $logger->fatal('Illegal value for option -t. Value for acid type must be \'aa\' for amino acids, \'dna\' for deoxyribonucleic acids or \'rna\' for ribonucleic acids.'); | |
182 exit; | |
183 } | |
184 | |
185 my %acids; | |
186 if ($acid_type eq 'aa') { | |
187 $aa_x = (defined($aa_x) ? $aa_x : 20); | |
188 unless ($aa_x == 20 || $aa_x == 22) { | |
189 $logger->fatal('Illegal value for option -x. Value for amino acid \'X\' expansion must be 20 or 22.'); | |
190 exit; | |
191 } | |
192 %acids = %amino_acids; | |
193 } elsif ($acid_type eq 'dna') { | |
194 %acids = %dna; | |
195 } elsif ($acid_type eq 'rna') { | |
196 %acids = %rna; | |
197 } | |
198 | |
199 # | |
200 # Read degenerate sequences and their IDs from a tab delimited file | |
201 # | |
202 my $degenerate_sequences = _ParseInput($input, $prefix_column, $sequence_column); | |
203 my $degenerate_sequence_count = scalar(keys(%{$degenerate_sequences})); | |
204 $logger->info('Number of degenerate sequences: ' . $degenerate_sequence_count); | |
205 | |
206 # | |
207 # Generate FASTA DB with all possible permutations. | |
208 # | |
209 my ($counter) = _GenerateSeqs($degenerate_sequences, $output, $acid_type); | |
210 | |
211 $logger->info('Generated FASTA DB with ' . $counter . ' sequences.'); | |
212 $logger->info('Finished!'); | |
213 | |
214 # | |
215 ## | |
216 ### Internal subs. | |
217 ## | |
218 # | |
219 | |
220 sub _ParseInput { | |
221 | |
222 my ($file_path, $prefix_column, $sequence_column) = @_; | |
223 | |
224 $logger->info('Parsing ' . $file_path . '...'); | |
225 | |
226 my %degenerate_sequences; | |
227 my $file_path_fh; | |
228 my $prefix_column_offset = $prefix_column - 1; | |
229 my $sequence_column_offset = $sequence_column - 1; | |
230 my $line_counter = 0; | |
231 | |
232 eval { | |
233 open($file_path_fh, "<$file_path"); | |
234 }; | |
235 if ($@) { | |
236 $logger->fatal('Cannot read input file: ' . $@); | |
237 exit; | |
238 } | |
239 | |
240 LINE: while (my $line = <$file_path_fh>) { | |
241 | |
242 $line =~ s/[\r\n]+//g; | |
243 $line_counter++; | |
244 | |
245 my @values = split("\t", $line); | |
246 | |
247 unless (defined($values[$prefix_column_offset])) { | |
248 $logger->fatal('Prefix missing on line ' . $line_counter . ' of the input file.'); | |
249 exit; | |
250 } | |
251 | |
252 unless (defined($values[$sequence_column_offset])) { | |
253 $logger->fatal('Sequence missing on line ' . $line_counter . ' of the input file.'); | |
254 exit; | |
255 } | |
256 | |
257 my $prefix = $values[$prefix_column_offset]; | |
258 my $degenerate_sequence = $values[$sequence_column_offset]; | |
259 | |
260 | |
261 unless ($prefix =~ m/[a-zA-Z0-9_:\.\-]/i) { | |
262 $logger->fatal('Prefix countains illegal characters on line ' . $line_counter . '. Prefix should contain only a-z A-Z 0-9 _ : . or -.'); | |
263 exit; | |
264 } | |
265 | |
266 unless ($degenerate_sequence =~ m/[a-zA-Z0-9_:\.\-]/i) { | |
267 $logger->fatal('Degenerate sequence countains illegal characters on line ' . $line_counter . '. Sequence should contain only a-z A-Z.'); | |
268 exit; | |
269 } | |
270 | |
271 $degenerate_sequences{$prefix} = $degenerate_sequence; | |
272 $logger->debug('Found degenerate sequence ' . $degenerate_sequence . ' with prefix ' . $prefix); | |
273 | |
274 } | |
275 | |
276 close($file_path_fh); | |
277 $logger->info('Parsed input.'); | |
278 | |
279 return (\%degenerate_sequences); | |
280 | |
281 } | |
282 | |
283 sub _GenerateSeqs { | |
284 | |
285 my ($degenerate_sequences, $path_to, $acid_type) = @_; | |
286 | |
287 $logger->info('Generating sequences...'); | |
288 | |
289 my $generated_seqs = 0; | |
290 my $path_to_fh; | |
291 | |
292 eval { | |
293 open($path_to_fh, ">$path_to"); | |
294 }; | |
295 if ($@) { | |
296 $logger->fatal('Cannot write FASTA file: ' . $@); | |
297 exit; | |
298 } | |
299 | |
300 DS:foreach my $prefix(sort(keys(%{$degenerate_sequences}))) { | |
301 | |
302 my $degenerate_sequence = ${$degenerate_sequences}{$prefix}; | |
303 $degenerate_sequence = uc($degenerate_sequence); | |
304 | |
305 $logger->debug("\t" . 'For ' . $prefix); | |
306 | |
307 my $suffix = 1; | |
308 my @degenerate_sequence_acids = split('', $degenerate_sequence); | |
309 my $new_sequences = []; | |
310 my $next_acid_count = 0; | |
311 | |
312 foreach my $next_acid (@degenerate_sequence_acids) { | |
313 | |
314 $next_acid_count++; | |
315 | |
316 unless (defined($acids{$next_acid})) { | |
317 $logger->error('Unknown (degenerate) acid ' . $next_acid . ' in input sequence ' . $prefix . ' (' . $degenerate_sequence . ').'); | |
318 $logger->warn("\t" . 'Skipping sequence ' . $prefix . '.'); | |
319 next DS; | |
320 } | |
321 | |
322 if ($acid_type eq 'aa') { | |
323 | |
324 if ($next_acid eq 'X') { | |
325 | |
326 # | |
327 # By default X will expand to the 20 most common amino acids, | |
328 # but if -x 22 was specified X will expand to all 22 amino acids | |
329 # including the very rare ones. | |
330 # | |
331 if ($aa_x == 22) { | |
332 | |
333 $next_acid = 'X22'; | |
334 | |
335 } | |
336 } | |
337 } | |
338 | |
339 $new_sequences = _GrowSequence ($new_sequences, $next_acid); | |
340 | |
341 my $sequence_count = scalar(@{$new_sequences}); | |
342 $logger->trace("\t\t" . $next_acid_count . ' Acid ' . $next_acid . ': ' . $sequence_count . ' sequences.'); | |
343 | |
344 } | |
345 | |
346 foreach my $new_sequence (@{$new_sequences}) { | |
347 | |
348 $generated_seqs++; | |
349 my $id = $prefix . '_' . $suffix; | |
350 | |
351 $logger->trace("\t\t\t" . $id . ' :: ' . $new_sequence); | |
352 | |
353 print($path_to_fh '>' . $id . "\n"); | |
354 # TODO: wrap long sequences over multiple lines. | |
355 print($path_to_fh $new_sequence . "\n"); | |
356 | |
357 $suffix++; | |
358 | |
359 } | |
360 } | |
361 | |
362 close($path_to_fh); | |
363 | |
364 return ($generated_seqs); | |
365 | |
366 } | |
367 | |
368 # | |
369 # Usage | |
370 # | |
371 | |
372 sub _GrowSequence { | |
373 | |
374 my ($sequences, $next_acid) = @_; | |
375 | |
376 my @larger_sequences; | |
377 | |
378 if (scalar(@{$sequences}) < 1) { | |
379 | |
380 foreach my $acid (@{$acids{$next_acid}}) { | |
381 | |
382 my $larger_sequence = $acid; | |
383 | |
384 push(@larger_sequences, $larger_sequence); | |
385 | |
386 } | |
387 | |
388 } else { | |
389 | |
390 foreach my $sequence (@{$sequences}) { | |
391 | |
392 foreach my $acid (@{$acids{$next_acid}}) { | |
393 | |
394 my $larger_sequence = $sequence . $acid; | |
395 | |
396 push(@larger_sequences, $larger_sequence); | |
397 | |
398 } | |
399 } | |
400 } | |
401 | |
402 return (\@larger_sequences); | |
403 | |
404 } | |
405 | |
406 sub _Usage { | |
407 | |
408 print STDERR "\n" | |
409 . 'GenerateDegenerateFasta.pl:' . "\n\n" | |
410 . ' Generates a multi-sequence FASTA file with all possible combinations of sequences ' . "\n" | |
411 . ' based on degenerate input sequences.' . "\n\n" | |
412 . 'Usage:' . "\n\n" | |
413 . ' GenerateDegenerateFasta.pl options' . "\n\n" | |
414 . 'Available options are:' . "\n\n" | |
415 . ' -i [file] Input file in tab delimited format.' . "\n" | |
416 . ' -p [number] Prefix column.' . "\n" | |
417 . ' Column from the input file containg strings that will be used as prefixes ' . "\n" | |
418 . ' to generate unique IDs for the FASTA sequences.' . "\n" | |
419 . ' -s [number] Sequence column.' . "\n" | |
420 . ' Column from the input file containg degenerate amino or nucleic acid sequences ' . "\n" | |
421 . ' that will be used to generate the FASTA sequences.' . "\n" | |
422 . ' For example RXX can be used to generate the amino acids sequences RAA, RAC, RAD ... RYY.' . "\n" | |
423 . ' -t [type] Acid type of the degenerate input sequences.' . "\n" | |
424 . ' Must be one of:' . "\n" | |
425 . ' aa - for amino acids' . "\n" | |
426 . ' dna - for deoxyribonucleic acids' . "\n" | |
427 . ' rna - for ribonucleic acids' . "\n" | |
428 . ' -o [file] Output file in FASTA format.' . "\n" | |
429 . ' -x [20|22] Indicates whether the degenerate amino acid X represents only the 20 most common amino acids (default).' . "\n" | |
430 . ' or all 22 amino acids including the rare Selenocysteine and Pyrrolysine.' . "\n" | |
431 . ' -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n" | |
432 . "\n"; | |
433 exit; | |
434 | |
435 } |