0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # This script generates a multi-sequence FASTA file
|
|
4 # with all possible combinations of sequences based
|
|
5 # on degenerate input sequences.
|
|
6 #
|
|
7 # =====================================================
|
|
8 # $Id: GenerateDegenerateFasta.pl 67 2010-11-09 15:20:01Z pieter.neerincx@gmail.com $
|
|
9 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/GenerateDegenerateFasta.pl $
|
|
10 # $LastChangedDate: 2010-11-09 09:20:01 -0600 (Tue, 09 Nov 2010) $
|
|
11 # $LastChangedRevision: 67 $
|
|
12 # $LastChangedBy: pieter.neerincx@gmail.com $
|
|
13 # =====================================================
|
|
14
|
|
15 #
|
|
16 # initialise environment
|
|
17 #
|
|
18 use strict;
|
|
19 use Getopt::Std;
|
|
20 use Log::Log4perl qw(:easy);
|
|
21
|
|
22 my %log_levels = (
|
|
23 'ALL' => $ALL,
|
|
24 'TRACE' => $TRACE,
|
|
25 'DEBUG' => $DEBUG,
|
|
26 'INFO' => $INFO,
|
|
27 'WARN' => $WARN,
|
|
28 'ERROR' => $ERROR,
|
|
29 'FATAL' => $FATAL,
|
|
30 'OFF' => $OFF,
|
|
31 );
|
|
32
|
|
33 my %amino_acids = (
|
|
34 'A' => ['A'],
|
|
35 'B' => ['D','N'],
|
|
36 'C' => ['C'],
|
|
37 'D' => ['D'],
|
|
38 'E' => ['E'],
|
|
39 'F' => ['F'],
|
|
40 'G' => ['G'],
|
|
41 'H' => ['H'],
|
|
42 'I' => ['I'],
|
|
43 'J' => ['I','L'],
|
|
44 'K' => ['K'],
|
|
45 'L' => ['L'],
|
|
46 'M' => ['M'],
|
|
47 'N' => ['N'],
|
|
48 'O' => ['O'],
|
|
49 'P' => ['P'],
|
|
50 'Q' => ['Q'],
|
|
51 'R' => ['R'],
|
|
52 'S' => ['S'],
|
|
53 'T' => ['T'],
|
|
54 'U' => ['U'],
|
|
55 'V' => ['V'],
|
|
56 'W' => ['W'],
|
|
57 'X' => ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'], # 20 regular Amino Acids.
|
|
58 'X22' => ['A','C','D','E','F','G','H','I','K','L','M','N','O','P','Q','R','S','T','U','V','W','Y'], # 22 Amino Acids including the rare Selenocysteine and Pyrrolysine.
|
|
59 'Y' => ['Y'],
|
|
60 'Z' => ['E','Q'],
|
|
61 );
|
|
62
|
|
63 my %dna = (
|
|
64 'A' => ['A'],
|
|
65 'B' => ['C','G','T'],
|
|
66 'C' => ['C'],
|
|
67 'D' => ['A','G','T'],
|
|
68 'G' => ['G'],
|
|
69 'H' => ['A','C','T'],
|
|
70 'K' => ['G','T'],
|
|
71 'M' => ['A','C'],
|
|
72 'N' => ['A','C','G','T'],
|
|
73 'R' => ['A','G'],
|
|
74 'S' => ['C','G'],
|
|
75 'T' => ['T'],
|
|
76 'V' => ['A','C','G'],
|
|
77 'W' => ['A','T'],
|
|
78 'Y' => ['C','T'],
|
|
79 );
|
|
80
|
|
81 my %rna = (
|
|
82 'A' => ['A'],
|
|
83 'B' => ['C','G','U'],
|
|
84 'C' => ['C'],
|
|
85 'D' => ['A','G','U'],
|
|
86 'G' => ['G'],
|
|
87 'H' => ['A','C','U'],
|
|
88 'K' => ['G','U'],
|
|
89 'M' => ['A','C'],
|
|
90 'N' => ['A','C','G','U'],
|
|
91 'R' => ['A','G'],
|
|
92 'S' => ['C','G'],
|
|
93 'U' => ['U'],
|
|
94 'V' => ['A','C','G'],
|
|
95 'W' => ['A','U'],
|
|
96 'Y' => ['C','U'],
|
|
97 );
|
|
98
|
|
99 #
|
|
100 # Get options.
|
|
101 #
|
|
102 my %opts;
|
|
103 Getopt::Std::getopts('i:p:s:t:o:x:l:', \%opts);
|
|
104
|
|
105 my $input = $opts{'i'};
|
|
106 my $prefix_column = $opts{'p'};
|
|
107 my $sequence_column = $opts{'s'};
|
|
108 my $acid_type = $opts{'t'};
|
|
109 my $output = $opts{'o'};
|
|
110 my $aa_x = $opts{'x'};
|
|
111 my $log_level = $opts{'l'};
|
|
112
|
|
113 #
|
|
114 # Configure logging.
|
|
115 #
|
|
116 # Provides default if user did not specify log level:
|
|
117 $log_level = (defined($log_level) ? $log_level : 'INFO');
|
|
118
|
|
119 # Reset log level to default if user specified illegal log level.
|
|
120 $log_level = (
|
|
121 defined($log_levels{$log_level})
|
|
122 ? $log_levels{$log_level}
|
|
123 : $log_levels{'INFO'});
|
|
124
|
|
125 #Log::Log4perl->init('log4perl.properties');
|
|
126 Log::Log4perl->easy_init(
|
|
127
|
|
128 #{ level => $log_level,
|
|
129 # file => ">>GenerateDegenrateFasta.log",
|
|
130 # layout => '%F{1}-%L-%M: %m%n' },
|
|
131 {
|
|
132 level => $log_level,
|
|
133 file => "STDOUT",
|
|
134 layout => '%d L:%L %p> %m%n'
|
|
135 },
|
|
136 );
|
|
137 my $logger = Log::Log4perl::get_logger();
|
|
138
|
|
139 #
|
|
140 # Check user input.
|
|
141 #
|
|
142 unless (defined($input) && defined($output)) {
|
|
143 _Usage();
|
|
144 exit;
|
|
145 }
|
|
146 if ($input =~ /^$/ || $output =~ /^$/) {
|
|
147 _Usage();
|
|
148 exit;
|
|
149 }
|
|
150 if ($input eq $output) {
|
|
151 $logger->fatal('Output file is the same as the input file. Select a different file for the output.');
|
|
152 exit;
|
|
153 }
|
|
154
|
|
155 #
|
|
156 # Check input file.
|
|
157 #
|
|
158 unless (-e $input && -f $input && -r $input) {
|
|
159 $logger->fatal('Cannot read from input file ' . $input . ': ' . $!);
|
|
160 exit;
|
|
161 }
|
|
162
|
|
163 $prefix_column = (defined($prefix_column) ? $prefix_column : 1);
|
|
164 unless ($prefix_column =~ m/^\d+$/ && $prefix_column > 0) {
|
|
165 $logger->fatal('Prefix column must be a positive integer.');
|
|
166 exit;
|
|
167 } else {
|
|
168 $logger->debug('Prefix column = ' . $prefix_column);
|
|
169 }
|
|
170
|
|
171 $sequence_column = (defined($sequence_column) ? $sequence_column : 2);
|
|
172 unless ($sequence_column =~ m/^\d+$/ && $sequence_column > 0) {
|
|
173 $logger->fatal('Sequence column must be a positive integer.');
|
|
174 exit;
|
|
175 } else {
|
|
176 $logger->debug('Sequence column = ' . $sequence_column);
|
|
177 }
|
|
178
|
|
179 $acid_type = (defined($acid_type) ? $acid_type : 'aa');
|
|
180 unless ($acid_type eq 'aa' || $acid_type eq 'dna' || $acid_type eq 'rna') {
|
|
181 $logger->fatal('Illegal value for option -t. Value for acid type must be \'aa\' for amino acids, \'dna\' for deoxyribonucleic acids or \'rna\' for ribonucleic acids.');
|
|
182 exit;
|
|
183 }
|
|
184
|
|
185 my %acids;
|
|
186 if ($acid_type eq 'aa') {
|
|
187 $aa_x = (defined($aa_x) ? $aa_x : 20);
|
|
188 unless ($aa_x == 20 || $aa_x == 22) {
|
|
189 $logger->fatal('Illegal value for option -x. Value for amino acid \'X\' expansion must be 20 or 22.');
|
|
190 exit;
|
|
191 }
|
|
192 %acids = %amino_acids;
|
|
193 } elsif ($acid_type eq 'dna') {
|
|
194 %acids = %dna;
|
|
195 } elsif ($acid_type eq 'rna') {
|
|
196 %acids = %rna;
|
|
197 }
|
|
198
|
|
199 #
|
|
200 # Read degenerate sequences and their IDs from a tab delimited file
|
|
201 #
|
|
202 my $degenerate_sequences = _ParseInput($input, $prefix_column, $sequence_column);
|
|
203 my $degenerate_sequence_count = scalar(keys(%{$degenerate_sequences}));
|
|
204 $logger->info('Number of degenerate sequences: ' . $degenerate_sequence_count);
|
|
205
|
|
206 #
|
|
207 # Generate FASTA DB with all possible permutations.
|
|
208 #
|
|
209 my ($counter) = _GenerateSeqs($degenerate_sequences, $output, $acid_type);
|
|
210
|
|
211 $logger->info('Generated FASTA DB with ' . $counter . ' sequences.');
|
|
212 $logger->info('Finished!');
|
|
213
|
|
214 #
|
|
215 ##
|
|
216 ### Internal subs.
|
|
217 ##
|
|
218 #
|
|
219
|
|
220 sub _ParseInput {
|
|
221
|
|
222 my ($file_path, $prefix_column, $sequence_column) = @_;
|
|
223
|
|
224 $logger->info('Parsing ' . $file_path . '...');
|
|
225
|
|
226 my %degenerate_sequences;
|
|
227 my $file_path_fh;
|
|
228 my $prefix_column_offset = $prefix_column - 1;
|
|
229 my $sequence_column_offset = $sequence_column - 1;
|
|
230 my $line_counter = 0;
|
|
231
|
|
232 eval {
|
|
233 open($file_path_fh, "<$file_path");
|
|
234 };
|
|
235 if ($@) {
|
|
236 $logger->fatal('Cannot read input file: ' . $@);
|
|
237 exit;
|
|
238 }
|
|
239
|
|
240 LINE: while (my $line = <$file_path_fh>) {
|
|
241
|
|
242 $line =~ s/[\r\n]+//g;
|
|
243 $line_counter++;
|
|
244
|
|
245 my @values = split("\t", $line);
|
|
246
|
|
247 unless (defined($values[$prefix_column_offset])) {
|
|
248 $logger->fatal('Prefix missing on line ' . $line_counter . ' of the input file.');
|
|
249 exit;
|
|
250 }
|
|
251
|
|
252 unless (defined($values[$sequence_column_offset])) {
|
|
253 $logger->fatal('Sequence missing on line ' . $line_counter . ' of the input file.');
|
|
254 exit;
|
|
255 }
|
|
256
|
|
257 my $prefix = $values[$prefix_column_offset];
|
|
258 my $degenerate_sequence = $values[$sequence_column_offset];
|
|
259
|
|
260
|
|
261 unless ($prefix =~ m/[a-zA-Z0-9_:\.\-]/i) {
|
|
262 $logger->fatal('Prefix countains illegal characters on line ' . $line_counter . '. Prefix should contain only a-z A-Z 0-9 _ : . or -.');
|
|
263 exit;
|
|
264 }
|
|
265
|
|
266 unless ($degenerate_sequence =~ m/[a-zA-Z0-9_:\.\-]/i) {
|
|
267 $logger->fatal('Degenerate sequence countains illegal characters on line ' . $line_counter . '. Sequence should contain only a-z A-Z.');
|
|
268 exit;
|
|
269 }
|
|
270
|
|
271 $degenerate_sequences{$prefix} = $degenerate_sequence;
|
|
272 $logger->debug('Found degenerate sequence ' . $degenerate_sequence . ' with prefix ' . $prefix);
|
|
273
|
|
274 }
|
|
275
|
|
276 close($file_path_fh);
|
|
277 $logger->info('Parsed input.');
|
|
278
|
|
279 return (\%degenerate_sequences);
|
|
280
|
|
281 }
|
|
282
|
|
283 sub _GenerateSeqs {
|
|
284
|
|
285 my ($degenerate_sequences, $path_to, $acid_type) = @_;
|
|
286
|
|
287 $logger->info('Generating sequences...');
|
|
288
|
|
289 my $generated_seqs = 0;
|
|
290 my $path_to_fh;
|
|
291
|
|
292 eval {
|
|
293 open($path_to_fh, ">$path_to");
|
|
294 };
|
|
295 if ($@) {
|
|
296 $logger->fatal('Cannot write FASTA file: ' . $@);
|
|
297 exit;
|
|
298 }
|
|
299
|
|
300 DS:foreach my $prefix(sort(keys(%{$degenerate_sequences}))) {
|
|
301
|
|
302 my $degenerate_sequence = ${$degenerate_sequences}{$prefix};
|
|
303 $degenerate_sequence = uc($degenerate_sequence);
|
|
304
|
|
305 $logger->debug("\t" . 'For ' . $prefix);
|
|
306
|
|
307 my $suffix = 1;
|
|
308 my @degenerate_sequence_acids = split('', $degenerate_sequence);
|
|
309 my $new_sequences = [];
|
|
310 my $next_acid_count = 0;
|
|
311
|
|
312 foreach my $next_acid (@degenerate_sequence_acids) {
|
|
313
|
|
314 $next_acid_count++;
|
|
315
|
|
316 unless (defined($acids{$next_acid})) {
|
|
317 $logger->error('Unknown (degenerate) acid ' . $next_acid . ' in input sequence ' . $prefix . ' (' . $degenerate_sequence . ').');
|
|
318 $logger->warn("\t" . 'Skipping sequence ' . $prefix . '.');
|
|
319 next DS;
|
|
320 }
|
|
321
|
|
322 if ($acid_type eq 'aa') {
|
|
323
|
|
324 if ($next_acid eq 'X') {
|
|
325
|
|
326 #
|
|
327 # By default X will expand to the 20 most common amino acids,
|
|
328 # but if -x 22 was specified X will expand to all 22 amino acids
|
|
329 # including the very rare ones.
|
|
330 #
|
|
331 if ($aa_x == 22) {
|
|
332
|
|
333 $next_acid = 'X22';
|
|
334
|
|
335 }
|
|
336 }
|
|
337 }
|
|
338
|
|
339 $new_sequences = _GrowSequence ($new_sequences, $next_acid);
|
|
340
|
|
341 my $sequence_count = scalar(@{$new_sequences});
|
|
342 $logger->trace("\t\t" . $next_acid_count . ' Acid ' . $next_acid . ': ' . $sequence_count . ' sequences.');
|
|
343
|
|
344 }
|
|
345
|
|
346 foreach my $new_sequence (@{$new_sequences}) {
|
|
347
|
|
348 $generated_seqs++;
|
|
349 my $id = $prefix . '_' . $suffix;
|
|
350
|
|
351 $logger->trace("\t\t\t" . $id . ' :: ' . $new_sequence);
|
|
352
|
|
353 print($path_to_fh '>' . $id . "\n");
|
|
354 # TODO: wrap long sequences over multiple lines.
|
|
355 print($path_to_fh $new_sequence . "\n");
|
|
356
|
|
357 $suffix++;
|
|
358
|
|
359 }
|
|
360 }
|
|
361
|
|
362 close($path_to_fh);
|
|
363
|
|
364 return ($generated_seqs);
|
|
365
|
|
366 }
|
|
367
|
|
368 #
|
|
369 # Usage
|
|
370 #
|
|
371
|
|
372 sub _GrowSequence {
|
|
373
|
|
374 my ($sequences, $next_acid) = @_;
|
|
375
|
|
376 my @larger_sequences;
|
|
377
|
|
378 if (scalar(@{$sequences}) < 1) {
|
|
379
|
|
380 foreach my $acid (@{$acids{$next_acid}}) {
|
|
381
|
|
382 my $larger_sequence = $acid;
|
|
383
|
|
384 push(@larger_sequences, $larger_sequence);
|
|
385
|
|
386 }
|
|
387
|
|
388 } else {
|
|
389
|
|
390 foreach my $sequence (@{$sequences}) {
|
|
391
|
|
392 foreach my $acid (@{$acids{$next_acid}}) {
|
|
393
|
|
394 my $larger_sequence = $sequence . $acid;
|
|
395
|
|
396 push(@larger_sequences, $larger_sequence);
|
|
397
|
|
398 }
|
|
399 }
|
|
400 }
|
|
401
|
|
402 return (\@larger_sequences);
|
|
403
|
|
404 }
|
|
405
|
|
406 sub _Usage {
|
|
407
|
|
408 print STDERR "\n"
|
|
409 . 'GenerateDegenerateFasta.pl:' . "\n\n"
|
|
410 . ' Generates a multi-sequence FASTA file with all possible combinations of sequences ' . "\n"
|
|
411 . ' based on degenerate input sequences.' . "\n\n"
|
|
412 . 'Usage:' . "\n\n"
|
|
413 . ' GenerateDegenerateFasta.pl options' . "\n\n"
|
|
414 . 'Available options are:' . "\n\n"
|
|
415 . ' -i [file] Input file in tab delimited format.' . "\n"
|
|
416 . ' -p [number] Prefix column.' . "\n"
|
|
417 . ' Column from the input file containg strings that will be used as prefixes ' . "\n"
|
|
418 . ' to generate unique IDs for the FASTA sequences.' . "\n"
|
|
419 . ' -s [number] Sequence column.' . "\n"
|
|
420 . ' Column from the input file containg degenerate amino or nucleic acid sequences ' . "\n"
|
|
421 . ' that will be used to generate the FASTA sequences.' . "\n"
|
|
422 . ' For example RXX can be used to generate the amino acids sequences RAA, RAC, RAD ... RYY.' . "\n"
|
|
423 . ' -t [type] Acid type of the degenerate input sequences.' . "\n"
|
|
424 . ' Must be one of:' . "\n"
|
|
425 . ' aa - for amino acids' . "\n"
|
|
426 . ' dna - for deoxyribonucleic acids' . "\n"
|
|
427 . ' rna - for ribonucleic acids' . "\n"
|
|
428 . ' -o [file] Output file in FASTA format.' . "\n"
|
|
429 . ' -x [20|22] Indicates whether the degenerate amino acid X represents only the 20 most common amino acids (default).' . "\n"
|
|
430 . ' or all 22 amino acids including the rare Selenocysteine and Pyrrolysine.' . "\n"
|
|
431 . ' -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n"
|
|
432 . "\n";
|
|
433 exit;
|
|
434
|
|
435 }
|