Mercurial > repos > galaxyp > nbic_fasta
comparison ExtractSeqsFromFasta.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:163892325845 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # This script extracts sequences from a multi-sequence FASTA file | |
4 # based on a list of accession numbers / IDs | |
5 # | |
6 # ===================================================== | |
7 # $Id: ExtractSeqsFromFasta.pl 15 2010-07-08 18:07:59Z pieter $ | |
8 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractSeqsFromFasta.pl $ | |
9 # $LastChangedDate: 2010-07-08 13:07:59 -0500 (Thu, 08 Jul 2010) $ | |
10 # $LastChangedRevision: 15 $ | |
11 # $LastChangedBy: pieter $ | |
12 # ===================================================== | |
13 | |
14 # | |
15 # initialise environment | |
16 # | |
17 use strict; | |
18 use Getopt::Std; | |
19 use Log::Log4perl qw(:easy); | |
20 | |
21 my %log_levels = ( | |
22 'ALL' => $ALL, | |
23 'TRACE' => $TRACE, | |
24 'DEBUG' => $DEBUG, | |
25 'INFO' => $INFO, | |
26 'WARN' => $WARN, | |
27 'ERROR' => $ERROR, | |
28 'FATAL' => $FATAL, | |
29 'OFF' => $OFF, | |
30 ); | |
31 my %match_logic_types = ( | |
32 'literal' => 1, | |
33 'regex' => 1, | |
34 ); | |
35 | |
36 # | |
37 # Get options. | |
38 # | |
39 my %opts; | |
40 Getopt::Std::getopts('ul:i:o:f:m:', \%opts); | |
41 | |
42 my $input = $opts{'i'}; | |
43 my $output = $opts{'o'}; | |
44 my $filter = $opts{'f'}; | |
45 my $log_level = $opts{'l'}; | |
46 my $match_logic = $opts{'m'}; | |
47 my $ignore_accession_version = $opts{'u'}; | |
48 | |
49 # | |
50 # Configure logging. | |
51 # | |
52 # Provides default if user did not specify log level: | |
53 $log_level = (defined($log_level) ? $log_level : 'INFO'); | |
54 | |
55 # Reset log level to default if user specified illegal log level. | |
56 $log_level = ( | |
57 defined($log_levels{$log_level}) | |
58 ? $log_levels{$log_level} | |
59 : $log_levels{'INFO'}); | |
60 | |
61 #Log::Log4perl->init('log4perl.properties'); | |
62 Log::Log4perl->easy_init( | |
63 | |
64 #{ level => $log_level, | |
65 # file => ">>ExtractSeqsFromFasta.log", | |
66 # layout => '%F{1}-%L-%M: %m%n' }, | |
67 { | |
68 level => $log_level, | |
69 file => "STDOUT", | |
70 layout => '%d L:%L %p> %m%n' | |
71 }, | |
72 ); | |
73 my $logger = Log::Log4perl::get_logger(); | |
74 | |
75 # | |
76 # Check user input. | |
77 # | |
78 unless (defined($input) && defined($output) && defined($filter)) { | |
79 _Usage(); | |
80 exit; | |
81 } | |
82 if ($input =~ /^$/ || $output =~ /^$/ || $filter =~ /^$/) { | |
83 _Usage(); | |
84 exit; | |
85 } | |
86 if ($input eq $output) { | |
87 $logger->fatal('Output file is the same as the input file. Select a different file for the output.'); | |
88 exit; | |
89 } | |
90 | |
91 # | |
92 # Check input and filter file. | |
93 # | |
94 unless (-e $input && -f $input && -r $input) { | |
95 $logger->fatal('Cannot read from input file ' . $input . ': ' . $!); | |
96 exit; | |
97 } | |
98 unless (-e $filter && -f $filter && -r $filter) { | |
99 $logger->fatal('Cannot read from filter file ' . $filter . ': ' . $!); | |
100 exit; | |
101 } | |
102 # | |
103 # Check match logic. | |
104 # | |
105 $match_logic = (defined($match_logic) ? $match_logic : 'literal'); | |
106 unless (defined($match_logic_types{$match_logic})) { | |
107 $logger->fatal('Unkown logic ' . $match_logic . ' specified for filtering of the input sequences.'); | |
108 exit; | |
109 } | |
110 | |
111 # | |
112 # Read accession numbers to search the FASTA file(s) for | |
113 # | |
114 my $wanted = _CreateLookupHash($filter, $match_logic); | |
115 my $seqs_to_extract = scalar(keys(%{$wanted})); | |
116 $logger->info('Number of sequences to search for: ' . $seqs_to_extract); | |
117 | |
118 # | |
119 # Extract seqs | |
120 # | |
121 my ($counter) = _ExtractSeqs($input, $output, $wanted, $match_logic); | |
122 | |
123 $logger->info('Extracted ' . $counter . ' sequences.'); | |
124 $logger->info('Finished!'); | |
125 | |
126 # | |
127 ## | |
128 ### Internal subs. | |
129 ## | |
130 # | |
131 | |
132 sub _CreateLookupHash { | |
133 | |
134 my ($file_path, $match_logic) = @_; | |
135 | |
136 $logger->info('Parsing ' . $file_path . '...'); | |
137 | |
138 my %wanted; | |
139 my $file_path_fh; | |
140 | |
141 eval { | |
142 open($file_path_fh, "<$file_path"); | |
143 }; | |
144 if ($@) { | |
145 $logger->fatal('Cannot read ID file: ' . $@); | |
146 exit; | |
147 } | |
148 | |
149 LINE: while (my $line = <$file_path_fh>) { | |
150 | |
151 $line =~ s/[\r\n]+//g; | |
152 | |
153 if ($match_logic eq 'literal') { | |
154 | |
155 if ($line =~ m/([a-z0-9_\-\.]+)/i) { | |
156 | |
157 my $id = $1; | |
158 | |
159 if ($ignore_accession_version) { | |
160 | |
161 # | |
162 # Remove version from accession number if it was versioned. | |
163 # | |
164 if ($id =~ m/([^\.]+)\.(\d+)/) { | |
165 | |
166 $id = $1; | |
167 | |
168 } | |
169 } | |
170 | |
171 $logger->debug('Found accession/ID ' . $id); | |
172 $wanted{$id} = 1; | |
173 | |
174 } else { | |
175 $logger->warn('Accession/ID in unsupported format: ' . $line); | |
176 next; | |
177 } | |
178 | |
179 } elsif ($match_logic eq 'regex') { | |
180 | |
181 if ($line =~ m/([a-z0-9_\-\.\[\]:?^\$]+)/i) { | |
182 my $regex = $1; | |
183 $logger->debug('Found regex ' . $regex); | |
184 $wanted{$regex} = 1; | |
185 } else { | |
186 $logger->warn('Regex in unsupported format: ' . $line); | |
187 next; | |
188 } | |
189 | |
190 } | |
191 } | |
192 | |
193 close($file_path_fh); | |
194 $logger->info('Created ID lookup list.'); | |
195 | |
196 return (\%wanted); | |
197 | |
198 } | |
199 | |
200 sub _ExtractSeqs { | |
201 | |
202 my ($path_from, $path_to, $wanted, $match_logic) = @_; | |
203 | |
204 $logger->info('Parsing ' . $path_from . '...'); | |
205 | |
206 my $extracted_seqs = 0; | |
207 my $found = 0; | |
208 my $path_from_fh; | |
209 my $path_to_fh; | |
210 | |
211 eval { | |
212 open($path_from_fh, "<$path_from"); | |
213 }; | |
214 if ($@) { | |
215 $logger->fatal('Cannot read FASTA file: ' . $@); | |
216 exit; | |
217 } | |
218 eval { | |
219 open($path_to_fh, ">$path_to"); | |
220 }; | |
221 if ($@) { | |
222 $logger->fatal('Cannot write FASTA file: ' . $@); | |
223 exit; | |
224 } | |
225 | |
226 LINE: while (my $line = <$path_from_fh>) { | |
227 | |
228 if ($line =~ m/^>/) { | |
229 $logger->debug('Found header line: ' . $line); | |
230 $found = 0; | |
231 my @header_ids; | |
232 | |
233 # | |
234 # Check for the presence of some frequently occurring naming schemes: | |
235 # | |
236 # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig). | |
237 # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73 | |
238 # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial; | |
239 # >jgi|Triad1|1|gw1.3.1.1 | |
240 # | |
241 if ($line =~ m/^>((([^\s\n\r:;|]+)[:]([^\s\n\r:|]+)[|;])*([^\s\n\r:;|]+)[:]([^\s\n\r:|]+))[|;]?(\s+(.+))?/i) { | |
242 | |
243 # | |
244 # One or more namespace prefixed IDs. | |
245 # | |
246 my $concatenated_namespace_prefixed_ids = $1; | |
247 $logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids); | |
248 | |
249 # database_namespace = $3 && $5; | |
250 # id = $4 && $6; | |
251 # description = $8; | |
252 my @namespace_prefixed_ids = split(/[|;]/, $concatenated_namespace_prefixed_ids); | |
253 | |
254 foreach my $prefixed_id (@namespace_prefixed_ids) { | |
255 | |
256 if ($prefixed_id =~ m/([^\s\n\r:;|]+:)?([^\s\n\r:|]+)/i) { | |
257 | |
258 my $this_id = $2; | |
259 | |
260 $logger->debug('Found ID: ' . $this_id); | |
261 push(@header_ids, $this_id); | |
262 | |
263 } else { | |
264 | |
265 $logger->warn( | |
266 'This should have been an optionally prefixed ID, ' | |
267 . 'but failed to match the corresponding regex: ' | |
268 . $prefixed_id); | |
269 | |
270 } | |
271 } | |
272 | |
273 } elsif ($line =~ m/^>((([^\s\n\r:;|]+)[|;])*([^\s\n\r:;|]+))[|;]?(\s+(.+))?/i) { | |
274 | |
275 # | |
276 # One or more unprefixed IDs. | |
277 # | |
278 my $concatenated_ids = $1; | |
279 $logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids); | |
280 | |
281 # id = $3 && $4; | |
282 # description = $7; | |
283 my @unprefixed_ids = split(/[|;]/, $concatenated_ids); | |
284 | |
285 foreach my $unprefixed_id (@unprefixed_ids) { | |
286 | |
287 $logger->debug('Found ID: ' . $unprefixed_id); | |
288 push(@header_ids, $unprefixed_id); | |
289 | |
290 } | |
291 | |
292 } else { | |
293 | |
294 # | |
295 # Something else. | |
296 # | |
297 # The FASTA header line can basically have any format, | |
298 # so this is probably one of the less frequent occurring annotation schemes. | |
299 # Therefore we try to see if the IDs we are looking for are present anywhere | |
300 # in the header line up until the first white space or new line. | |
301 # This may be tricky as we might match other annotation from other proteins | |
302 # like a description from a 'best' BLAST hit that contains one of the IDs we | |
303 # are looking for. Hence, in such a case this sequence is *not* the one of | |
304 # the IDs we looking for, but similar to that one at best. | |
305 # | |
306 if ($line =~ m/>([^\n\r\s]+)/) { | |
307 | |
308 my $putative_id = $1; | |
309 $logger->debug('Found putative ID in header: ' . $putative_id); | |
310 push(@header_ids, $putative_id); | |
311 | |
312 } else { | |
313 $logger->warn('Cannot identify IDs in this header: ' . $line); | |
314 } | |
315 } | |
316 | |
317 | |
318 if ($ignore_accession_version) { | |
319 | |
320 for my $inx (0 .. $#header_ids) { | |
321 | |
322 if ($header_ids[$inx] =~ m/([^\.]+)\.(\d+)/) { | |
323 | |
324 my $this_unversioned_id = $1; | |
325 my $version_number = $2; | |
326 $logger->debug('Dropping version number (' . $version_number . ') for versioned ID: ' . $this_unversioned_id . '.'); | |
327 $header_ids[$inx] = $this_unversioned_id; | |
328 | |
329 } | |
330 } | |
331 } | |
332 | |
333 foreach my $id (@header_ids) { | |
334 $logger->debug('Checking if ID ' . $id . ' is in the list of sequences to extract.'); | |
335 | |
336 if ($match_logic eq 'literal') { | |
337 | |
338 if (${$wanted}{$id}) { | |
339 $logger->info('Literal bingo, preserving sequence with ID ' . $id); | |
340 $found = 1; | |
341 $extracted_seqs++; | |
342 last; | |
343 } | |
344 | |
345 } elsif ($match_logic eq 'regex') { | |
346 | |
347 foreach my $regex (keys(%{$wanted})) { | |
348 | |
349 if ($id =~ m/$regex/) { | |
350 $logger->info('Regex bingo, preserving sequence with ID ' . $id); | |
351 $found = 1; | |
352 $extracted_seqs++; | |
353 last; | |
354 } | |
355 } | |
356 } | |
357 } | |
358 } | |
359 if ($found) { | |
360 print($path_to_fh $line); | |
361 } | |
362 | |
363 } | |
364 | |
365 close($path_from_fh); | |
366 close($path_to_fh); | |
367 | |
368 return ($extracted_seqs); | |
369 | |
370 } | |
371 | |
372 sub _Usage { | |
373 | |
374 print STDERR "\n" | |
375 . "extractSeqsFromFasta.pl:\n" | |
376 . " Extracts sequences from a multi-sequence FASTA file.\n" . "\n" | |
377 . "Usage:\n" . "\n" | |
378 . " extractSeqsFromFasta.pl options\n" . "\n" | |
379 . "Available options are:\n" . "\n" | |
380 . " -i [file] Input file in FASTA format.\n" | |
381 . " -o [file] Output file in FASTA format where the extracted files will be saved.\n" | |
382 . " -f [file] Filter file containing accession numbers or IDs that shoud be extracted from the input.\n" | |
383 . " (One accession/ID per line)\n" | |
384 . " -m [string] Match logic that defines how the accession numbers or IDs from the filter file will be.\n" | |
385 . " matched to those in the FASTA file. Supported logic types:\n" | |
386 . " literal for exact matching.\n" | |
387 . " regex for fuzzy matching using simple regular expressions (in Perl regex syntax).\n" | |
388 . " -u Use unversioned accession numbers for matching (only with -m literal).\n" | |
389 . " If the FASTA input file contains a versioned accession number like this IPI00189968.5,\n" | |
390 . " running this tool without -u (default) IPI00189968 or IPI00189968.2 will not match IPI00189968.5, \n" | |
391 . " but with -u IPI00189968 or IPI00189968.2 will match IPI00189968.5 and the sequence will be extracted.\n" | |
392 . " Hence this allows for less strict matching, but is less fuzzy than matching with regexes.\n" | |
393 . " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n" | |
394 . "\n"; | |
395 exit; | |
396 | |
397 } |