comparison getdata/get_seqs.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #!/jgi/tools/bin/perl -w
2
3 #
4 # This script creates a Fasta/Qual/Fastq file of selected sequences, with optional filters.
5 #
6 # 02/24/10 : created by Ed Kirton
7 # 12/07/10 : fixed Fastq bug
8 #
9
10 use strict;
11 use warnings;
12 use Getopt::Long;
13 use IO::File;
14 #use PerlIO::gzip;
15 use FindBin;
16 use lib $FindBin::Bin;
17 use FastaDb;
18 use FastqDb;
19
20 my $usage = <<'ENDHERE';
21 NAME:
22 get_seqs.pl
23 PURPOSE:
24 To extract a subset of sequences by ID.
25 INPUT:
26 --db <*.fasta|fastq> : file containing sequences in Fasta or Fastq format
27 --table <*.tsv> : file containing sequence IDs (optional; default=stdin)
28 --col <int> : column of table containing sequence IDs (optional; default=1=first column)
29 OUTPUT:
30 --selected <*.fasta|fastq> : file containing named sequences
31 --unselected <*.fasta|fastq> : file containing unselected sequences
32 OPTIONS:
33 --cosorted : uses faster algorithm if IDs appear in both files in the same order
34 --paired : filter complete read-pair when one read is selected (requires Illumina-style read IDs; i.e. */1, */2)
35 --ignore case : ignore case differences between IDs
36 --gzip : compress all outfiles
37 OPTIONAL FILTERS:
38 Optional filters, each in the form of column:condition:value.
39 Where column is the column in the table (containing IDs)
40 Condition is one of the following:
41 String operators:
42 s_eq
43 s_ne
44 s_contains
45 s_notcontains
46 s_startswith
47 s_notstartswith
48 s_endswith
49 s_notendswith
50 Numerical operators:
51 n_eq
52 n_ne
53 n_gt
54 n_lt
55 Where value is a string or number as appropriate.
56 AUTHOR/SUPPORT:
57 Edward Kirton (ESKirton@LBL.gov)
58 ENDHERE
59
60 #
61 # VALIDATE INPUT
62 #
63 my ($help, $dbfile, $tablefile, $id_col, $ignorecase, $cosorted, $selected, $unselected, $gzip, $paired);
64 GetOptions(
65 'd|db=s' => \$dbfile,
66 't|table=s' => \$tablefile,
67 'c|col=i' => \$id_col,
68 'ignorecase' => \$ignorecase,
69 'cosorted' => \$cosorted,
70 's|selected=s' => \$selected,
71 'u|unselected=s' => \$unselected,
72 'g|gzip' => \$gzip,
73 'p|paired' => \$paired,
74 'h|help' => \$help
75 );
76 if ($help) { print $usage; exit; }
77 die("DB required\n") unless $dbfile;
78 die("DB file not found: $dbfile\n") unless -f $dbfile;
79 die("Table required\n") unless $tablefile;
80 die("Table file not found: $tablefile\n") unless -f $tablefile;
81 $selected = '' if !defined($selected) or $selected eq 'None';
82 $unselected = '' if !defined($unselected) or $unselected eq 'None';
83 $id_col=1 unless $id_col;
84 die("Invalid id column, $id_col\n") unless $id_col > 0;
85
86 my $filters = [];
87 while (my $filter = shift @ARGV) {
88 next unless $filter;
89 my @a_filter = split(/:/, $filter);
90 die("Invalid number of filter options: @a_filter") unless @a_filter == 3;
91 push @$filters, \@a_filter;
92 }
93
94 #
95 # MAIN
96 #
97 my ($n_selected,$n_unselected);
98 if ($cosorted) {
99 # SEARCH IS FAST AND EASY IF INPUTS SIMILARLY SORTED!
100 ($n_selected,$n_unselected) = search_cosorted($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters);
101 } else {
102 # INPUT NOT CO-SORTED SO KEEP ALL IDS IN RAM
103 ($n_selected,$n_unselected) = search($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters);
104 }
105 print "Selected = $n_selected; Unselected = $n_unselected\n";
106 exit;
107
108 #
109 # RETURNS TRUE ONLY IF RECORD MATCHES (OPTIONAL) SEARCH CRITERIA
110 #
111 sub match
112 {
113 my ($filters, $row) = @_;
114 foreach my $filterA (@$filters) {
115 my ($condition, $col, $value) = @$filterA;
116 my $x = $row->[ $col - 1 ];
117 if ($condition eq 's_eq') { return 0 unless $x eq $value }
118 elsif ($condition eq 's_ne') { return 0 unless $x ne $value }
119 elsif ($condition eq 's_contains') { return 0 unless $x =~ /$value/ }
120 elsif ($condition eq 's_notcontains') { return 0 unless $x !~ /$value/ }
121 elsif ($condition eq 's_startswith') { return 0 unless $x =~ /^$value/ }
122 elsif ($condition eq 's_notstartswith') { return 0 unless $x !~ /^$value/ }
123 elsif ($condition eq 's_endswith') { return 0 unless $x =~ /$value$/ }
124 elsif ($condition eq 's_notendswith') { return 0 unless $x !~ /$value$/ }
125 elsif ($condition eq 'n_eq') { return 0 unless $x == $value }
126 elsif ($condition eq 'n_ne') { return 0 unless $x != $value }
127 elsif ($condition eq 'n_gt') { return 0 unless $x > $value }
128 elsif ($condition eq 'n_lt') { return 0 unless $x < $value }
129 }
130 return 1;
131 }
132
133 #
134 # SIMULTANEOUSLY PARSE TWO STREAMS
135 #
136 sub search_cosorted
137 {
138 my ($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters) = @_;
139 my $sfh = new IO::File;
140 my $ufh = new IO::File;
141 my $table = new IO::File;
142 my $n_selected = 0;
143 my $n_unselected = 0;
144
145 # OPEN FILES
146 if ($tablefile) {
147 open($table, "<$tablefile") or die("Unable to open file, $tablefile: $!\n");
148 } else {
149 $table=*STDIN;
150 }
151 if ($selected) {
152 if ($gzip) {
153 # open($sfh, '>:gzip', $selected) or die("Unable to open file, $selected: $!\n");
154 } else {
155 open($sfh, ">$selected") or die("Unable to open file, $selected: $!\n");
156 }
157 } else {
158 open($sfh, ">/dev/null");
159 }
160 if ($unselected) {
161 if ($gzip) {
162 # open($ufh, '>:gzip', $unselected) or die("Unable to open file, $unselected: $!\n");
163 } else {
164 open($ufh, ">$unselected") or die("Unable to open file, $unselected: $!\n");
165 }
166 } else {
167 open($ufh, ">/dev/null");
168 }
169
170 # GET FIRST MATCHING TARGET ID
171 my $prev_target_id = '';
172 my $target_id = '';
173 get_next_matching_target_id($table,$id_col,$ignorecase,$filters,\$target_id,\$prev_target_id,$paired);
174 unless ($target_id) {
175 # no records match search criteria
176 close $table;
177 close $sfh if $selected;
178 if ($unselected) {
179 open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n");
180 while (<DB>) {
181 print $ufh $_;
182 ++$n_unselected;
183 }
184 close DB;
185 }
186 close $ufh;
187 return 0;
188 }
189
190 # DETERMINE FILETYPE
191 open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n");
192 my $format;
193 while (<DB>) {
194 chomp;
195 if (/^#/ or ! $_) { next }
196 elsif (/^>/) { $format='fasta' }
197 elsif (/^@/) { $format='fastq' }
198 else { die "Invalid DB file format" }
199 last;
200 }
201 close DB;
202
203 # PARSE
204 my $db = $format eq 'fasta' ? FastaDb->new($dbfile) : FastqDb->new($dbfile);
205 while (my $rec=$db->next_seq ) {
206 unless ($target_id) {
207 last unless $unselected; # done if no more seqs to get
208 # otherwise dump rest of seqs in unselected file
209 print $ufh $rec->output;
210 ++$n_unselected;
211 while ($rec=$db->next_seq ) {
212 print $ufh $rec->output;
213 ++$n_unselected;
214 }
215 last;
216 }
217 my $id=$ignorecase ? uc($rec->id):$rec->id;
218 if ($id eq $prev_target_id or $id eq $target_id) {
219 # selected seq
220 print $sfh $rec->output;
221 ++$n_selected;
222 get_next_matching_target_id($table,$id_col,$ignorecase,$filters,\$target_id,\$prev_target_id,$paired);
223 } else {
224 # unselected seq
225 print $ufh $rec->output;
226 ++$n_unselected;
227 }
228 }
229 close $table;
230 close $sfh;
231 close $ufh;
232
233 # If some target seqs not found, it's likely the files were not cosorted, so try unsorted search function.
234 if ($target_id) {
235 print "Files don't appear to be cosorted, trying unsorted search\n";
236 return search($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $filters);
237 }
238 return ($n_selected,$n_unselected);
239
240 sub get_next_matching_target_id {
241 my ($table,$id_col,$ignorecase,$filters,$target_idR,$prev_target_idR,$paired)=@_;
242 $$prev_target_idR = $$target_idR;
243 $$target_idR = '';
244 while (<$table>) {
245 chomp;
246 my @row = split(/\t/);
247 die("Bad input table") unless @row >= $id_col;
248 next unless match($filters, \@row);
249 my $new_target_id = $ignorecase ? uc($row[ $id_col - 1 ]) : $row[ $id_col - 1 ];
250 $new_target_id=$1 if $new_target_id =~ /^(\S+)/; # use first word only
251 $new_target_id=$1 if $paired and $new_target_id =~ /^(\S+)\/[12]$/;
252 next if $new_target_id eq $$prev_target_idR;
253 $$target_idR=$new_target_id;
254 last; # return to parsing db file
255 }
256 }
257 }
258
259 #
260 # LOAD IDS IN RAM THEN PARSE DB.
261 #
262 sub search
263 {
264 my ($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters) = @_;
265 my $sfh = new IO::File; # selected seqs
266 my $ufh = new IO::File; # unselected seqs
267 my $table=new IO::File;
268 my $n_selected=0;
269 my $n_unselected=0;
270 my %ids = ();
271 open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n");
272 if ($tablefile) {
273 open($table, "<$tablefile") or die("Unable to open file, $tablefile: $!\n");
274 } else {
275 $table=*STDIN;
276 }
277 if ($selected) {
278 if ($gzip) {
279 # open($sfh, '>:gzip', $selected) or die("Unable to open file, $selected: $!\n");
280 } else {
281 open($sfh, ">$selected") or die("Unable to open file, $selected: $!\n");
282 }
283 } else {
284 open($sfh, ">/dev/null");
285 }
286 if ($unselected) {
287 if ($gzip) {
288 # open($ufh, '>:gzip', $unselected) or die("Unable to open file, $unselected: $!\n");
289 } else {
290 open($ufh, ">$unselected") or die("Unable to open file, $unselected: $!\n");
291 }
292 } else {
293 open($ufh, ">/dev/null");
294 }
295
296 # LOAD IDS OF MATCHING ROWS
297 my $num_targets=0;
298 while (<$table>) {
299 next if /^#/;
300 chomp;
301 my @row = split(/\t/);
302 my $id = $ignorecase ? uc($row[ $id_col - 1 ]) : $row[ $id_col - 1 ];
303 $id=$1 if $id =~ /^(\S+)/;
304 $id=$1 if $paired and $id =~ /^(\S+)\/[12]$/;
305 if (match($filters, \@row)) {
306 # remember this ID
307 $ids{$id} = 0; # number of reads with this ID found (counter for paired option)
308 ++$num_targets;
309 }
310 }
311 unless ($num_targets) {
312 # no records match search criteria
313 close $table;
314 close $sfh if $selected;
315 if ($unselected) {
316 open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n");
317 while (<DB>) {
318 print $ufh $_;
319 ++$n_unselected;
320 }
321 close DB;
322 }
323 close $ufh;
324 return 0;
325 }
326
327
328 # DETERMINE FILETYPE
329 open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n");
330 my $format;
331 while (<DB>) {
332 chomp;
333 if (/^#/ or /^$/) { next }
334 elsif (/^>/) { $format='fasta' }
335 elsif (/^@/) { $format='fastq' }
336 else { die "Invalid DB file format" }
337 last;
338 }
339 close DB;
340
341 # GET SEQS
342 my $db = $format eq 'fasta' ? FastaDb->new($dbfile) : FastqDb->new($dbfile);
343 while (my $rec=$db->next_seq ) {
344 my $id = $ignorecase ? uc($rec->id) : $rec->id;
345 $id = $1 if $paired and $id =~ /^(\S+)\/[12]$/;
346 if (exists($ids{$id})) {
347 # selected
348 print $sfh $rec->output;
349 ++$n_selected;
350 if (!$paired) {
351 delete $ids{$id};
352 } else {
353 $ids{$id} += 1;
354 delete $ids{$id} if $ids{$id} == 2;
355 }
356 } else {
357 # unselected
358 print $ufh $rec->output;
359 ++$n_unselected;
360 }
361 }
362 close $table;
363 close $sfh;
364 close $ufh;
365
366 # MAKE SURE ALL TARGETS WERE FOUND
367 foreach my $id (keys %ids) {
368 if ($ids{$id}) {
369 delete($ids{$id}); # SOMETIMES INFILES CONTAIN ONLY ONE READ OF PAIR
370 }elsif($id eq 'EMPTY') { #ADDEDBY THO TO ALLOW EMPTY blast results
371 #in workflow using checkempty.pl
372 exit;
373 } else {
374 warn("Seq not found: $id\n");
375 }
376 }
377 return ($n_selected,$n_unselected);
378 }
379
380 __END__