annotate GenerateDegenerateFasta.pl @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
163892325845 Initial commit.
galaxyp
parents:
diff changeset
1 #!/usr/bin/perl -w
163892325845 Initial commit.
galaxyp
parents:
diff changeset
2 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
3 # This script generates a multi-sequence FASTA file
163892325845 Initial commit.
galaxyp
parents:
diff changeset
4 # with all possible combinations of sequences based
163892325845 Initial commit.
galaxyp
parents:
diff changeset
5 # on degenerate input sequences.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
6 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
7 # =====================================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
8 # $Id: GenerateDegenerateFasta.pl 67 2010-11-09 15:20:01Z pieter.neerincx@gmail.com $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
9 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/GenerateDegenerateFasta.pl $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
10 # $LastChangedDate: 2010-11-09 09:20:01 -0600 (Tue, 09 Nov 2010) $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
11 # $LastChangedRevision: 67 $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
12 # $LastChangedBy: pieter.neerincx@gmail.com $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
13 # =====================================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
14
163892325845 Initial commit.
galaxyp
parents:
diff changeset
15 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
16 # initialise environment
163892325845 Initial commit.
galaxyp
parents:
diff changeset
17 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
18 use strict;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
19 use Getopt::Std;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
20 use Log::Log4perl qw(:easy);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
21
163892325845 Initial commit.
galaxyp
parents:
diff changeset
22 my %log_levels = (
163892325845 Initial commit.
galaxyp
parents:
diff changeset
23 'ALL' => $ALL,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
24 'TRACE' => $TRACE,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
25 'DEBUG' => $DEBUG,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
26 'INFO' => $INFO,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
27 'WARN' => $WARN,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
28 'ERROR' => $ERROR,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
29 'FATAL' => $FATAL,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
30 'OFF' => $OFF,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
31 );
163892325845 Initial commit.
galaxyp
parents:
diff changeset
32
163892325845 Initial commit.
galaxyp
parents:
diff changeset
33 my %amino_acids = (
163892325845 Initial commit.
galaxyp
parents:
diff changeset
34 'A' => ['A'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
35 'B' => ['D','N'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
36 'C' => ['C'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
37 'D' => ['D'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
38 'E' => ['E'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
39 'F' => ['F'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
40 'G' => ['G'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
41 'H' => ['H'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
42 'I' => ['I'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
43 'J' => ['I','L'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
44 'K' => ['K'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
45 'L' => ['L'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
46 'M' => ['M'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
47 'N' => ['N'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
48 'O' => ['O'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
49 'P' => ['P'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
50 'Q' => ['Q'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
51 'R' => ['R'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
52 'S' => ['S'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
53 'T' => ['T'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
54 'U' => ['U'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
55 'V' => ['V'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
56 'W' => ['W'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
57 'X' => ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'], # 20 regular Amino Acids.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
58 'X22' => ['A','C','D','E','F','G','H','I','K','L','M','N','O','P','Q','R','S','T','U','V','W','Y'], # 22 Amino Acids including the rare Selenocysteine and Pyrrolysine.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
59 'Y' => ['Y'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
60 'Z' => ['E','Q'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
61 );
163892325845 Initial commit.
galaxyp
parents:
diff changeset
62
163892325845 Initial commit.
galaxyp
parents:
diff changeset
63 my %dna = (
163892325845 Initial commit.
galaxyp
parents:
diff changeset
64 'A' => ['A'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
65 'B' => ['C','G','T'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
66 'C' => ['C'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
67 'D' => ['A','G','T'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
68 'G' => ['G'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
69 'H' => ['A','C','T'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
70 'K' => ['G','T'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
71 'M' => ['A','C'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
72 'N' => ['A','C','G','T'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
73 'R' => ['A','G'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
74 'S' => ['C','G'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
75 'T' => ['T'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
76 'V' => ['A','C','G'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
77 'W' => ['A','T'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
78 'Y' => ['C','T'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
79 );
163892325845 Initial commit.
galaxyp
parents:
diff changeset
80
163892325845 Initial commit.
galaxyp
parents:
diff changeset
81 my %rna = (
163892325845 Initial commit.
galaxyp
parents:
diff changeset
82 'A' => ['A'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
83 'B' => ['C','G','U'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
84 'C' => ['C'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
85 'D' => ['A','G','U'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
86 'G' => ['G'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
87 'H' => ['A','C','U'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
88 'K' => ['G','U'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
89 'M' => ['A','C'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
90 'N' => ['A','C','G','U'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
91 'R' => ['A','G'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
92 'S' => ['C','G'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
93 'U' => ['U'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
94 'V' => ['A','C','G'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
95 'W' => ['A','U'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
96 'Y' => ['C','U'],
163892325845 Initial commit.
galaxyp
parents:
diff changeset
97 );
163892325845 Initial commit.
galaxyp
parents:
diff changeset
98
163892325845 Initial commit.
galaxyp
parents:
diff changeset
99 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
100 # Get options.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
101 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
102 my %opts;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
103 Getopt::Std::getopts('i:p:s:t:o:x:l:', \%opts);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
104
163892325845 Initial commit.
galaxyp
parents:
diff changeset
105 my $input = $opts{'i'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
106 my $prefix_column = $opts{'p'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
107 my $sequence_column = $opts{'s'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
108 my $acid_type = $opts{'t'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
109 my $output = $opts{'o'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
110 my $aa_x = $opts{'x'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
111 my $log_level = $opts{'l'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
112
163892325845 Initial commit.
galaxyp
parents:
diff changeset
113 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
114 # Configure logging.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
115 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
116 # Provides default if user did not specify log level:
163892325845 Initial commit.
galaxyp
parents:
diff changeset
117 $log_level = (defined($log_level) ? $log_level : 'INFO');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
118
163892325845 Initial commit.
galaxyp
parents:
diff changeset
119 # Reset log level to default if user specified illegal log level.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
120 $log_level = (
163892325845 Initial commit.
galaxyp
parents:
diff changeset
121 defined($log_levels{$log_level})
163892325845 Initial commit.
galaxyp
parents:
diff changeset
122 ? $log_levels{$log_level}
163892325845 Initial commit.
galaxyp
parents:
diff changeset
123 : $log_levels{'INFO'});
163892325845 Initial commit.
galaxyp
parents:
diff changeset
124
163892325845 Initial commit.
galaxyp
parents:
diff changeset
125 #Log::Log4perl->init('log4perl.properties');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
126 Log::Log4perl->easy_init(
163892325845 Initial commit.
galaxyp
parents:
diff changeset
127
163892325845 Initial commit.
galaxyp
parents:
diff changeset
128 #{ level => $log_level,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
129 # file => ">>GenerateDegenrateFasta.log",
163892325845 Initial commit.
galaxyp
parents:
diff changeset
130 # layout => '%F{1}-%L-%M: %m%n' },
163892325845 Initial commit.
galaxyp
parents:
diff changeset
131 {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
132 level => $log_level,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
133 file => "STDOUT",
163892325845 Initial commit.
galaxyp
parents:
diff changeset
134 layout => '%d L:%L %p> %m%n'
163892325845 Initial commit.
galaxyp
parents:
diff changeset
135 },
163892325845 Initial commit.
galaxyp
parents:
diff changeset
136 );
163892325845 Initial commit.
galaxyp
parents:
diff changeset
137 my $logger = Log::Log4perl::get_logger();
163892325845 Initial commit.
galaxyp
parents:
diff changeset
138
163892325845 Initial commit.
galaxyp
parents:
diff changeset
139 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
140 # Check user input.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
141 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
142 unless (defined($input) && defined($output)) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
143 _Usage();
163892325845 Initial commit.
galaxyp
parents:
diff changeset
144 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
145 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
146 if ($input =~ /^$/ || $output =~ /^$/) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
147 _Usage();
163892325845 Initial commit.
galaxyp
parents:
diff changeset
148 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
149 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
150 if ($input eq $output) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
151 $logger->fatal('Output file is the same as the input file. Select a different file for the output.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
152 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
153 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
154
163892325845 Initial commit.
galaxyp
parents:
diff changeset
155 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
156 # Check input file.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
157 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
158 unless (-e $input && -f $input && -r $input) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
159 $logger->fatal('Cannot read from input file ' . $input . ': ' . $!);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
160 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
161 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
162
163892325845 Initial commit.
galaxyp
parents:
diff changeset
163 $prefix_column = (defined($prefix_column) ? $prefix_column : 1);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
164 unless ($prefix_column =~ m/^\d+$/ && $prefix_column > 0) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
165 $logger->fatal('Prefix column must be a positive integer.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
166 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
167 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
168 $logger->debug('Prefix column = ' . $prefix_column);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
169 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
170
163892325845 Initial commit.
galaxyp
parents:
diff changeset
171 $sequence_column = (defined($sequence_column) ? $sequence_column : 2);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
172 unless ($sequence_column =~ m/^\d+$/ && $sequence_column > 0) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
173 $logger->fatal('Sequence column must be a positive integer.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
174 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
175 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
176 $logger->debug('Sequence column = ' . $sequence_column);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
177 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
178
163892325845 Initial commit.
galaxyp
parents:
diff changeset
179 $acid_type = (defined($acid_type) ? $acid_type : 'aa');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
180 unless ($acid_type eq 'aa' || $acid_type eq 'dna' || $acid_type eq 'rna') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
181 $logger->fatal('Illegal value for option -t. Value for acid type must be \'aa\' for amino acids, \'dna\' for deoxyribonucleic acids or \'rna\' for ribonucleic acids.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
182 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
183 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
184
163892325845 Initial commit.
galaxyp
parents:
diff changeset
185 my %acids;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
186 if ($acid_type eq 'aa') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
187 $aa_x = (defined($aa_x) ? $aa_x : 20);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
188 unless ($aa_x == 20 || $aa_x == 22) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
189 $logger->fatal('Illegal value for option -x. Value for amino acid \'X\' expansion must be 20 or 22.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
190 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
191 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
192 %acids = %amino_acids;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
193 } elsif ($acid_type eq 'dna') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
194 %acids = %dna;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
195 } elsif ($acid_type eq 'rna') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
196 %acids = %rna;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
197 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
198
163892325845 Initial commit.
galaxyp
parents:
diff changeset
199 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
200 # Read degenerate sequences and their IDs from a tab delimited file
163892325845 Initial commit.
galaxyp
parents:
diff changeset
201 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
202 my $degenerate_sequences = _ParseInput($input, $prefix_column, $sequence_column);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
203 my $degenerate_sequence_count = scalar(keys(%{$degenerate_sequences}));
163892325845 Initial commit.
galaxyp
parents:
diff changeset
204 $logger->info('Number of degenerate sequences: ' . $degenerate_sequence_count);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
205
163892325845 Initial commit.
galaxyp
parents:
diff changeset
206 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
207 # Generate FASTA DB with all possible permutations.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
208 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
209 my ($counter) = _GenerateSeqs($degenerate_sequences, $output, $acid_type);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
210
163892325845 Initial commit.
galaxyp
parents:
diff changeset
211 $logger->info('Generated FASTA DB with ' . $counter . ' sequences.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
212 $logger->info('Finished!');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
213
163892325845 Initial commit.
galaxyp
parents:
diff changeset
214 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
215 ##
163892325845 Initial commit.
galaxyp
parents:
diff changeset
216 ### Internal subs.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
217 ##
163892325845 Initial commit.
galaxyp
parents:
diff changeset
218 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
219
163892325845 Initial commit.
galaxyp
parents:
diff changeset
220 sub _ParseInput {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
221
163892325845 Initial commit.
galaxyp
parents:
diff changeset
222 my ($file_path, $prefix_column, $sequence_column) = @_;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
223
163892325845 Initial commit.
galaxyp
parents:
diff changeset
224 $logger->info('Parsing ' . $file_path . '...');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
225
163892325845 Initial commit.
galaxyp
parents:
diff changeset
226 my %degenerate_sequences;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
227 my $file_path_fh;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
228 my $prefix_column_offset = $prefix_column - 1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
229 my $sequence_column_offset = $sequence_column - 1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
230 my $line_counter = 0;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
231
163892325845 Initial commit.
galaxyp
parents:
diff changeset
232 eval {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
233 open($file_path_fh, "<$file_path");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
234 };
163892325845 Initial commit.
galaxyp
parents:
diff changeset
235 if ($@) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
236 $logger->fatal('Cannot read input file: ' . $@);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
237 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
238 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
239
163892325845 Initial commit.
galaxyp
parents:
diff changeset
240 LINE: while (my $line = <$file_path_fh>) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
241
163892325845 Initial commit.
galaxyp
parents:
diff changeset
242 $line =~ s/[\r\n]+//g;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
243 $line_counter++;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
244
163892325845 Initial commit.
galaxyp
parents:
diff changeset
245 my @values = split("\t", $line);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
246
163892325845 Initial commit.
galaxyp
parents:
diff changeset
247 unless (defined($values[$prefix_column_offset])) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
248 $logger->fatal('Prefix missing on line ' . $line_counter . ' of the input file.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
249 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
250 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
251
163892325845 Initial commit.
galaxyp
parents:
diff changeset
252 unless (defined($values[$sequence_column_offset])) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
253 $logger->fatal('Sequence missing on line ' . $line_counter . ' of the input file.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
254 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
255 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
256
163892325845 Initial commit.
galaxyp
parents:
diff changeset
257 my $prefix = $values[$prefix_column_offset];
163892325845 Initial commit.
galaxyp
parents:
diff changeset
258 my $degenerate_sequence = $values[$sequence_column_offset];
163892325845 Initial commit.
galaxyp
parents:
diff changeset
259
163892325845 Initial commit.
galaxyp
parents:
diff changeset
260
163892325845 Initial commit.
galaxyp
parents:
diff changeset
261 unless ($prefix =~ m/[a-zA-Z0-9_:\.\-]/i) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
262 $logger->fatal('Prefix countains illegal characters on line ' . $line_counter . '. Prefix should contain only a-z A-Z 0-9 _ : . or -.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
263 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
264 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
265
163892325845 Initial commit.
galaxyp
parents:
diff changeset
266 unless ($degenerate_sequence =~ m/[a-zA-Z0-9_:\.\-]/i) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
267 $logger->fatal('Degenerate sequence countains illegal characters on line ' . $line_counter . '. Sequence should contain only a-z A-Z.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
268 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
269 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
270
163892325845 Initial commit.
galaxyp
parents:
diff changeset
271 $degenerate_sequences{$prefix} = $degenerate_sequence;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
272 $logger->debug('Found degenerate sequence ' . $degenerate_sequence . ' with prefix ' . $prefix);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
273
163892325845 Initial commit.
galaxyp
parents:
diff changeset
274 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
275
163892325845 Initial commit.
galaxyp
parents:
diff changeset
276 close($file_path_fh);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
277 $logger->info('Parsed input.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
278
163892325845 Initial commit.
galaxyp
parents:
diff changeset
279 return (\%degenerate_sequences);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
280
163892325845 Initial commit.
galaxyp
parents:
diff changeset
281 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
282
163892325845 Initial commit.
galaxyp
parents:
diff changeset
283 sub _GenerateSeqs {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
284
163892325845 Initial commit.
galaxyp
parents:
diff changeset
285 my ($degenerate_sequences, $path_to, $acid_type) = @_;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
286
163892325845 Initial commit.
galaxyp
parents:
diff changeset
287 $logger->info('Generating sequences...');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
288
163892325845 Initial commit.
galaxyp
parents:
diff changeset
289 my $generated_seqs = 0;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
290 my $path_to_fh;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
291
163892325845 Initial commit.
galaxyp
parents:
diff changeset
292 eval {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
293 open($path_to_fh, ">$path_to");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
294 };
163892325845 Initial commit.
galaxyp
parents:
diff changeset
295 if ($@) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
296 $logger->fatal('Cannot write FASTA file: ' . $@);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
297 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
298 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
299
163892325845 Initial commit.
galaxyp
parents:
diff changeset
300 DS:foreach my $prefix(sort(keys(%{$degenerate_sequences}))) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
301
163892325845 Initial commit.
galaxyp
parents:
diff changeset
302 my $degenerate_sequence = ${$degenerate_sequences}{$prefix};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
303 $degenerate_sequence = uc($degenerate_sequence);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
304
163892325845 Initial commit.
galaxyp
parents:
diff changeset
305 $logger->debug("\t" . 'For ' . $prefix);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
306
163892325845 Initial commit.
galaxyp
parents:
diff changeset
307 my $suffix = 1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
308 my @degenerate_sequence_acids = split('', $degenerate_sequence);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
309 my $new_sequences = [];
163892325845 Initial commit.
galaxyp
parents:
diff changeset
310 my $next_acid_count = 0;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
311
163892325845 Initial commit.
galaxyp
parents:
diff changeset
312 foreach my $next_acid (@degenerate_sequence_acids) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
313
163892325845 Initial commit.
galaxyp
parents:
diff changeset
314 $next_acid_count++;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
315
163892325845 Initial commit.
galaxyp
parents:
diff changeset
316 unless (defined($acids{$next_acid})) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
317 $logger->error('Unknown (degenerate) acid ' . $next_acid . ' in input sequence ' . $prefix . ' (' . $degenerate_sequence . ').');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
318 $logger->warn("\t" . 'Skipping sequence ' . $prefix . '.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
319 next DS;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
320 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
321
163892325845 Initial commit.
galaxyp
parents:
diff changeset
322 if ($acid_type eq 'aa') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
323
163892325845 Initial commit.
galaxyp
parents:
diff changeset
324 if ($next_acid eq 'X') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
325
163892325845 Initial commit.
galaxyp
parents:
diff changeset
326 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
327 # By default X will expand to the 20 most common amino acids,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
328 # but if -x 22 was specified X will expand to all 22 amino acids
163892325845 Initial commit.
galaxyp
parents:
diff changeset
329 # including the very rare ones.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
330 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
331 if ($aa_x == 22) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
332
163892325845 Initial commit.
galaxyp
parents:
diff changeset
333 $next_acid = 'X22';
163892325845 Initial commit.
galaxyp
parents:
diff changeset
334
163892325845 Initial commit.
galaxyp
parents:
diff changeset
335 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
336 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
337 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
338
163892325845 Initial commit.
galaxyp
parents:
diff changeset
339 $new_sequences = _GrowSequence ($new_sequences, $next_acid);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
340
163892325845 Initial commit.
galaxyp
parents:
diff changeset
341 my $sequence_count = scalar(@{$new_sequences});
163892325845 Initial commit.
galaxyp
parents:
diff changeset
342 $logger->trace("\t\t" . $next_acid_count . ' Acid ' . $next_acid . ': ' . $sequence_count . ' sequences.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
343
163892325845 Initial commit.
galaxyp
parents:
diff changeset
344 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
345
163892325845 Initial commit.
galaxyp
parents:
diff changeset
346 foreach my $new_sequence (@{$new_sequences}) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
347
163892325845 Initial commit.
galaxyp
parents:
diff changeset
348 $generated_seqs++;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
349 my $id = $prefix . '_' . $suffix;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
350
163892325845 Initial commit.
galaxyp
parents:
diff changeset
351 $logger->trace("\t\t\t" . $id . ' :: ' . $new_sequence);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
352
163892325845 Initial commit.
galaxyp
parents:
diff changeset
353 print($path_to_fh '>' . $id . "\n");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
354 # TODO: wrap long sequences over multiple lines.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
355 print($path_to_fh $new_sequence . "\n");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
356
163892325845 Initial commit.
galaxyp
parents:
diff changeset
357 $suffix++;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
358
163892325845 Initial commit.
galaxyp
parents:
diff changeset
359 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
360 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
361
163892325845 Initial commit.
galaxyp
parents:
diff changeset
362 close($path_to_fh);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
363
163892325845 Initial commit.
galaxyp
parents:
diff changeset
364 return ($generated_seqs);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
365
163892325845 Initial commit.
galaxyp
parents:
diff changeset
366 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
367
163892325845 Initial commit.
galaxyp
parents:
diff changeset
368 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
369 # Usage
163892325845 Initial commit.
galaxyp
parents:
diff changeset
370 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
371
163892325845 Initial commit.
galaxyp
parents:
diff changeset
372 sub _GrowSequence {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
373
163892325845 Initial commit.
galaxyp
parents:
diff changeset
374 my ($sequences, $next_acid) = @_;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
375
163892325845 Initial commit.
galaxyp
parents:
diff changeset
376 my @larger_sequences;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
377
163892325845 Initial commit.
galaxyp
parents:
diff changeset
378 if (scalar(@{$sequences}) < 1) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
379
163892325845 Initial commit.
galaxyp
parents:
diff changeset
380 foreach my $acid (@{$acids{$next_acid}}) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
381
163892325845 Initial commit.
galaxyp
parents:
diff changeset
382 my $larger_sequence = $acid;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
383
163892325845 Initial commit.
galaxyp
parents:
diff changeset
384 push(@larger_sequences, $larger_sequence);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
385
163892325845 Initial commit.
galaxyp
parents:
diff changeset
386 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
387
163892325845 Initial commit.
galaxyp
parents:
diff changeset
388 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
389
163892325845 Initial commit.
galaxyp
parents:
diff changeset
390 foreach my $sequence (@{$sequences}) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
391
163892325845 Initial commit.
galaxyp
parents:
diff changeset
392 foreach my $acid (@{$acids{$next_acid}}) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
393
163892325845 Initial commit.
galaxyp
parents:
diff changeset
394 my $larger_sequence = $sequence . $acid;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
395
163892325845 Initial commit.
galaxyp
parents:
diff changeset
396 push(@larger_sequences, $larger_sequence);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
397
163892325845 Initial commit.
galaxyp
parents:
diff changeset
398 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
399 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
400 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
401
163892325845 Initial commit.
galaxyp
parents:
diff changeset
402 return (\@larger_sequences);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
403
163892325845 Initial commit.
galaxyp
parents:
diff changeset
404 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
405
163892325845 Initial commit.
galaxyp
parents:
diff changeset
406 sub _Usage {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
407
163892325845 Initial commit.
galaxyp
parents:
diff changeset
408 print STDERR "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
409 . 'GenerateDegenerateFasta.pl:' . "\n\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
410 . ' Generates a multi-sequence FASTA file with all possible combinations of sequences ' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
411 . ' based on degenerate input sequences.' . "\n\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
412 . 'Usage:' . "\n\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
413 . ' GenerateDegenerateFasta.pl options' . "\n\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
414 . 'Available options are:' . "\n\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
415 . ' -i [file] Input file in tab delimited format.' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
416 . ' -p [number] Prefix column.' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
417 . ' Column from the input file containg strings that will be used as prefixes ' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
418 . ' to generate unique IDs for the FASTA sequences.' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
419 . ' -s [number] Sequence column.' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
420 . ' Column from the input file containg degenerate amino or nucleic acid sequences ' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
421 . ' that will be used to generate the FASTA sequences.' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
422 . ' For example RXX can be used to generate the amino acids sequences RAA, RAC, RAD ... RYY.' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
423 . ' -t [type] Acid type of the degenerate input sequences.' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
424 . ' Must be one of:' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
425 . ' aa - for amino acids' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
426 . ' dna - for deoxyribonucleic acids' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
427 . ' rna - for ribonucleic acids' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
428 . ' -o [file] Output file in FASTA format.' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
429 . ' -x [20|22] Indicates whether the degenerate amino acid X represents only the 20 most common amino acids (default).' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
430 . ' or all 22 amino acids including the rare Selenocysteine and Pyrrolysine.' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
431 . ' -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
432 . "\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
433 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
434
163892325845 Initial commit.
galaxyp
parents:
diff changeset
435 }