Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison getdata/get_seqs.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5b9a38ec4a39 |
---|---|
1 #!/jgi/tools/bin/perl -w | |
2 | |
3 # | |
4 # This script creates a Fasta/Qual/Fastq file of selected sequences, with optional filters. | |
5 # | |
6 # 02/24/10 : created by Ed Kirton | |
7 # 12/07/10 : fixed Fastq bug | |
8 # | |
9 | |
10 use strict; | |
11 use warnings; | |
12 use Getopt::Long; | |
13 use IO::File; | |
14 #use PerlIO::gzip; | |
15 use FindBin; | |
16 use lib $FindBin::Bin; | |
17 use FastaDb; | |
18 use FastqDb; | |
19 | |
20 my $usage = <<'ENDHERE'; | |
21 NAME: | |
22 get_seqs.pl | |
23 PURPOSE: | |
24 To extract a subset of sequences by ID. | |
25 INPUT: | |
26 --db <*.fasta|fastq> : file containing sequences in Fasta or Fastq format | |
27 --table <*.tsv> : file containing sequence IDs (optional; default=stdin) | |
28 --col <int> : column of table containing sequence IDs (optional; default=1=first column) | |
29 OUTPUT: | |
30 --selected <*.fasta|fastq> : file containing named sequences | |
31 --unselected <*.fasta|fastq> : file containing unselected sequences | |
32 OPTIONS: | |
33 --cosorted : uses faster algorithm if IDs appear in both files in the same order | |
34 --paired : filter complete read-pair when one read is selected (requires Illumina-style read IDs; i.e. */1, */2) | |
35 --ignore case : ignore case differences between IDs | |
36 --gzip : compress all outfiles | |
37 OPTIONAL FILTERS: | |
38 Optional filters, each in the form of column:condition:value. | |
39 Where column is the column in the table (containing IDs) | |
40 Condition is one of the following: | |
41 String operators: | |
42 s_eq | |
43 s_ne | |
44 s_contains | |
45 s_notcontains | |
46 s_startswith | |
47 s_notstartswith | |
48 s_endswith | |
49 s_notendswith | |
50 Numerical operators: | |
51 n_eq | |
52 n_ne | |
53 n_gt | |
54 n_lt | |
55 Where value is a string or number as appropriate. | |
56 AUTHOR/SUPPORT: | |
57 Edward Kirton (ESKirton@LBL.gov) | |
58 ENDHERE | |
59 | |
60 # | |
61 # VALIDATE INPUT | |
62 # | |
63 my ($help, $dbfile, $tablefile, $id_col, $ignorecase, $cosorted, $selected, $unselected, $gzip, $paired); | |
64 GetOptions( | |
65 'd|db=s' => \$dbfile, | |
66 't|table=s' => \$tablefile, | |
67 'c|col=i' => \$id_col, | |
68 'ignorecase' => \$ignorecase, | |
69 'cosorted' => \$cosorted, | |
70 's|selected=s' => \$selected, | |
71 'u|unselected=s' => \$unselected, | |
72 'g|gzip' => \$gzip, | |
73 'p|paired' => \$paired, | |
74 'h|help' => \$help | |
75 ); | |
76 if ($help) { print $usage; exit; } | |
77 die("DB required\n") unless $dbfile; | |
78 die("DB file not found: $dbfile\n") unless -f $dbfile; | |
79 die("Table required\n") unless $tablefile; | |
80 die("Table file not found: $tablefile\n") unless -f $tablefile; | |
81 $selected = '' if !defined($selected) or $selected eq 'None'; | |
82 $unselected = '' if !defined($unselected) or $unselected eq 'None'; | |
83 $id_col=1 unless $id_col; | |
84 die("Invalid id column, $id_col\n") unless $id_col > 0; | |
85 | |
86 my $filters = []; | |
87 while (my $filter = shift @ARGV) { | |
88 next unless $filter; | |
89 my @a_filter = split(/:/, $filter); | |
90 die("Invalid number of filter options: @a_filter") unless @a_filter == 3; | |
91 push @$filters, \@a_filter; | |
92 } | |
93 | |
94 # | |
95 # MAIN | |
96 # | |
97 my ($n_selected,$n_unselected); | |
98 if ($cosorted) { | |
99 # SEARCH IS FAST AND EASY IF INPUTS SIMILARLY SORTED! | |
100 ($n_selected,$n_unselected) = search_cosorted($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters); | |
101 } else { | |
102 # INPUT NOT CO-SORTED SO KEEP ALL IDS IN RAM | |
103 ($n_selected,$n_unselected) = search($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters); | |
104 } | |
105 print "Selected = $n_selected; Unselected = $n_unselected\n"; | |
106 exit; | |
107 | |
108 # | |
109 # RETURNS TRUE ONLY IF RECORD MATCHES (OPTIONAL) SEARCH CRITERIA | |
110 # | |
111 sub match | |
112 { | |
113 my ($filters, $row) = @_; | |
114 foreach my $filterA (@$filters) { | |
115 my ($condition, $col, $value) = @$filterA; | |
116 my $x = $row->[ $col - 1 ]; | |
117 if ($condition eq 's_eq') { return 0 unless $x eq $value } | |
118 elsif ($condition eq 's_ne') { return 0 unless $x ne $value } | |
119 elsif ($condition eq 's_contains') { return 0 unless $x =~ /$value/ } | |
120 elsif ($condition eq 's_notcontains') { return 0 unless $x !~ /$value/ } | |
121 elsif ($condition eq 's_startswith') { return 0 unless $x =~ /^$value/ } | |
122 elsif ($condition eq 's_notstartswith') { return 0 unless $x !~ /^$value/ } | |
123 elsif ($condition eq 's_endswith') { return 0 unless $x =~ /$value$/ } | |
124 elsif ($condition eq 's_notendswith') { return 0 unless $x !~ /$value$/ } | |
125 elsif ($condition eq 'n_eq') { return 0 unless $x == $value } | |
126 elsif ($condition eq 'n_ne') { return 0 unless $x != $value } | |
127 elsif ($condition eq 'n_gt') { return 0 unless $x > $value } | |
128 elsif ($condition eq 'n_lt') { return 0 unless $x < $value } | |
129 } | |
130 return 1; | |
131 } | |
132 | |
133 # | |
134 # SIMULTANEOUSLY PARSE TWO STREAMS | |
135 # | |
136 sub search_cosorted | |
137 { | |
138 my ($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters) = @_; | |
139 my $sfh = new IO::File; | |
140 my $ufh = new IO::File; | |
141 my $table = new IO::File; | |
142 my $n_selected = 0; | |
143 my $n_unselected = 0; | |
144 | |
145 # OPEN FILES | |
146 if ($tablefile) { | |
147 open($table, "<$tablefile") or die("Unable to open file, $tablefile: $!\n"); | |
148 } else { | |
149 $table=*STDIN; | |
150 } | |
151 if ($selected) { | |
152 if ($gzip) { | |
153 # open($sfh, '>:gzip', $selected) or die("Unable to open file, $selected: $!\n"); | |
154 } else { | |
155 open($sfh, ">$selected") or die("Unable to open file, $selected: $!\n"); | |
156 } | |
157 } else { | |
158 open($sfh, ">/dev/null"); | |
159 } | |
160 if ($unselected) { | |
161 if ($gzip) { | |
162 # open($ufh, '>:gzip', $unselected) or die("Unable to open file, $unselected: $!\n"); | |
163 } else { | |
164 open($ufh, ">$unselected") or die("Unable to open file, $unselected: $!\n"); | |
165 } | |
166 } else { | |
167 open($ufh, ">/dev/null"); | |
168 } | |
169 | |
170 # GET FIRST MATCHING TARGET ID | |
171 my $prev_target_id = ''; | |
172 my $target_id = ''; | |
173 get_next_matching_target_id($table,$id_col,$ignorecase,$filters,\$target_id,\$prev_target_id,$paired); | |
174 unless ($target_id) { | |
175 # no records match search criteria | |
176 close $table; | |
177 close $sfh if $selected; | |
178 if ($unselected) { | |
179 open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); | |
180 while (<DB>) { | |
181 print $ufh $_; | |
182 ++$n_unselected; | |
183 } | |
184 close DB; | |
185 } | |
186 close $ufh; | |
187 return 0; | |
188 } | |
189 | |
190 # DETERMINE FILETYPE | |
191 open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); | |
192 my $format; | |
193 while (<DB>) { | |
194 chomp; | |
195 if (/^#/ or ! $_) { next } | |
196 elsif (/^>/) { $format='fasta' } | |
197 elsif (/^@/) { $format='fastq' } | |
198 else { die "Invalid DB file format" } | |
199 last; | |
200 } | |
201 close DB; | |
202 | |
203 # PARSE | |
204 my $db = $format eq 'fasta' ? FastaDb->new($dbfile) : FastqDb->new($dbfile); | |
205 while (my $rec=$db->next_seq ) { | |
206 unless ($target_id) { | |
207 last unless $unselected; # done if no more seqs to get | |
208 # otherwise dump rest of seqs in unselected file | |
209 print $ufh $rec->output; | |
210 ++$n_unselected; | |
211 while ($rec=$db->next_seq ) { | |
212 print $ufh $rec->output; | |
213 ++$n_unselected; | |
214 } | |
215 last; | |
216 } | |
217 my $id=$ignorecase ? uc($rec->id):$rec->id; | |
218 if ($id eq $prev_target_id or $id eq $target_id) { | |
219 # selected seq | |
220 print $sfh $rec->output; | |
221 ++$n_selected; | |
222 get_next_matching_target_id($table,$id_col,$ignorecase,$filters,\$target_id,\$prev_target_id,$paired); | |
223 } else { | |
224 # unselected seq | |
225 print $ufh $rec->output; | |
226 ++$n_unselected; | |
227 } | |
228 } | |
229 close $table; | |
230 close $sfh; | |
231 close $ufh; | |
232 | |
233 # If some target seqs not found, it's likely the files were not cosorted, so try unsorted search function. | |
234 if ($target_id) { | |
235 print "Files don't appear to be cosorted, trying unsorted search\n"; | |
236 return search($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $filters); | |
237 } | |
238 return ($n_selected,$n_unselected); | |
239 | |
240 sub get_next_matching_target_id { | |
241 my ($table,$id_col,$ignorecase,$filters,$target_idR,$prev_target_idR,$paired)=@_; | |
242 $$prev_target_idR = $$target_idR; | |
243 $$target_idR = ''; | |
244 while (<$table>) { | |
245 chomp; | |
246 my @row = split(/\t/); | |
247 die("Bad input table") unless @row >= $id_col; | |
248 next unless match($filters, \@row); | |
249 my $new_target_id = $ignorecase ? uc($row[ $id_col - 1 ]) : $row[ $id_col - 1 ]; | |
250 $new_target_id=$1 if $new_target_id =~ /^(\S+)/; # use first word only | |
251 $new_target_id=$1 if $paired and $new_target_id =~ /^(\S+)\/[12]$/; | |
252 next if $new_target_id eq $$prev_target_idR; | |
253 $$target_idR=$new_target_id; | |
254 last; # return to parsing db file | |
255 } | |
256 } | |
257 } | |
258 | |
259 # | |
260 # LOAD IDS IN RAM THEN PARSE DB. | |
261 # | |
262 sub search | |
263 { | |
264 my ($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters) = @_; | |
265 my $sfh = new IO::File; # selected seqs | |
266 my $ufh = new IO::File; # unselected seqs | |
267 my $table=new IO::File; | |
268 my $n_selected=0; | |
269 my $n_unselected=0; | |
270 my %ids = (); | |
271 open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); | |
272 if ($tablefile) { | |
273 open($table, "<$tablefile") or die("Unable to open file, $tablefile: $!\n"); | |
274 } else { | |
275 $table=*STDIN; | |
276 } | |
277 if ($selected) { | |
278 if ($gzip) { | |
279 # open($sfh, '>:gzip', $selected) or die("Unable to open file, $selected: $!\n"); | |
280 } else { | |
281 open($sfh, ">$selected") or die("Unable to open file, $selected: $!\n"); | |
282 } | |
283 } else { | |
284 open($sfh, ">/dev/null"); | |
285 } | |
286 if ($unselected) { | |
287 if ($gzip) { | |
288 # open($ufh, '>:gzip', $unselected) or die("Unable to open file, $unselected: $!\n"); | |
289 } else { | |
290 open($ufh, ">$unselected") or die("Unable to open file, $unselected: $!\n"); | |
291 } | |
292 } else { | |
293 open($ufh, ">/dev/null"); | |
294 } | |
295 | |
296 # LOAD IDS OF MATCHING ROWS | |
297 my $num_targets=0; | |
298 while (<$table>) { | |
299 next if /^#/; | |
300 chomp; | |
301 my @row = split(/\t/); | |
302 my $id = $ignorecase ? uc($row[ $id_col - 1 ]) : $row[ $id_col - 1 ]; | |
303 $id=$1 if $id =~ /^(\S+)/; | |
304 $id=$1 if $paired and $id =~ /^(\S+)\/[12]$/; | |
305 if (match($filters, \@row)) { | |
306 # remember this ID | |
307 $ids{$id} = 0; # number of reads with this ID found (counter for paired option) | |
308 ++$num_targets; | |
309 } | |
310 } | |
311 unless ($num_targets) { | |
312 # no records match search criteria | |
313 close $table; | |
314 close $sfh if $selected; | |
315 if ($unselected) { | |
316 open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); | |
317 while (<DB>) { | |
318 print $ufh $_; | |
319 ++$n_unselected; | |
320 } | |
321 close DB; | |
322 } | |
323 close $ufh; | |
324 return 0; | |
325 } | |
326 | |
327 | |
328 # DETERMINE FILETYPE | |
329 open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); | |
330 my $format; | |
331 while (<DB>) { | |
332 chomp; | |
333 if (/^#/ or /^$/) { next } | |
334 elsif (/^>/) { $format='fasta' } | |
335 elsif (/^@/) { $format='fastq' } | |
336 else { die "Invalid DB file format" } | |
337 last; | |
338 } | |
339 close DB; | |
340 | |
341 # GET SEQS | |
342 my $db = $format eq 'fasta' ? FastaDb->new($dbfile) : FastqDb->new($dbfile); | |
343 while (my $rec=$db->next_seq ) { | |
344 my $id = $ignorecase ? uc($rec->id) : $rec->id; | |
345 $id = $1 if $paired and $id =~ /^(\S+)\/[12]$/; | |
346 if (exists($ids{$id})) { | |
347 # selected | |
348 print $sfh $rec->output; | |
349 ++$n_selected; | |
350 if (!$paired) { | |
351 delete $ids{$id}; | |
352 } else { | |
353 $ids{$id} += 1; | |
354 delete $ids{$id} if $ids{$id} == 2; | |
355 } | |
356 } else { | |
357 # unselected | |
358 print $ufh $rec->output; | |
359 ++$n_unselected; | |
360 } | |
361 } | |
362 close $table; | |
363 close $sfh; | |
364 close $ufh; | |
365 | |
366 # MAKE SURE ALL TARGETS WERE FOUND | |
367 foreach my $id (keys %ids) { | |
368 if ($ids{$id}) { | |
369 delete($ids{$id}); # SOMETIMES INFILES CONTAIN ONLY ONE READ OF PAIR | |
370 }elsif($id eq 'EMPTY') { #ADDEDBY THO TO ALLOW EMPTY blast results | |
371 #in workflow using checkempty.pl | |
372 exit; | |
373 } else { | |
374 warn("Seq not found: $id\n"); | |
375 } | |
376 } | |
377 return ($n_selected,$n_unselected); | |
378 } | |
379 | |
380 __END__ |