Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
diff getdata/get_seqs.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/get_seqs.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,380 @@ +#!/jgi/tools/bin/perl -w + +# +# This script creates a Fasta/Qual/Fastq file of selected sequences, with optional filters. +# +# 02/24/10 : created by Ed Kirton +# 12/07/10 : fixed Fastq bug +# + +use strict; +use warnings; +use Getopt::Long; +use IO::File; +#use PerlIO::gzip; +use FindBin; +use lib $FindBin::Bin; +use FastaDb; +use FastqDb; + +my $usage = <<'ENDHERE'; +NAME: + get_seqs.pl +PURPOSE: + To extract a subset of sequences by ID. +INPUT: + --db <*.fasta|fastq> : file containing sequences in Fasta or Fastq format + --table <*.tsv> : file containing sequence IDs (optional; default=stdin) + --col <int> : column of table containing sequence IDs (optional; default=1=first column) +OUTPUT: + --selected <*.fasta|fastq> : file containing named sequences + --unselected <*.fasta|fastq> : file containing unselected sequences +OPTIONS: + --cosorted : uses faster algorithm if IDs appear in both files in the same order + --paired : filter complete read-pair when one read is selected (requires Illumina-style read IDs; i.e. */1, */2) + --ignore case : ignore case differences between IDs + --gzip : compress all outfiles +OPTIONAL FILTERS: + Optional filters, each in the form of column:condition:value. + Where column is the column in the table (containing IDs) + Condition is one of the following: + String operators: + s_eq + s_ne + s_contains + s_notcontains + s_startswith + s_notstartswith + s_endswith + s_notendswith + Numerical operators: + n_eq + n_ne + n_gt + n_lt + Where value is a string or number as appropriate. +AUTHOR/SUPPORT: + Edward Kirton (ESKirton@LBL.gov) +ENDHERE + +# +# VALIDATE INPUT +# +my ($help, $dbfile, $tablefile, $id_col, $ignorecase, $cosorted, $selected, $unselected, $gzip, $paired); +GetOptions( + 'd|db=s' => \$dbfile, + 't|table=s' => \$tablefile, + 'c|col=i' => \$id_col, + 'ignorecase' => \$ignorecase, + 'cosorted' => \$cosorted, + 's|selected=s' => \$selected, + 'u|unselected=s' => \$unselected, + 'g|gzip' => \$gzip, + 'p|paired' => \$paired, + 'h|help' => \$help +); +if ($help) { print $usage; exit; } +die("DB required\n") unless $dbfile; +die("DB file not found: $dbfile\n") unless -f $dbfile; +die("Table required\n") unless $tablefile; +die("Table file not found: $tablefile\n") unless -f $tablefile; +$selected = '' if !defined($selected) or $selected eq 'None'; +$unselected = '' if !defined($unselected) or $unselected eq 'None'; +$id_col=1 unless $id_col; +die("Invalid id column, $id_col\n") unless $id_col > 0; + +my $filters = []; +while (my $filter = shift @ARGV) { + next unless $filter; + my @a_filter = split(/:/, $filter); + die("Invalid number of filter options: @a_filter") unless @a_filter == 3; + push @$filters, \@a_filter; +} + +# +# MAIN +# +my ($n_selected,$n_unselected); +if ($cosorted) { + # SEARCH IS FAST AND EASY IF INPUTS SIMILARLY SORTED! + ($n_selected,$n_unselected) = search_cosorted($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters); +} else { + # INPUT NOT CO-SORTED SO KEEP ALL IDS IN RAM + ($n_selected,$n_unselected) = search($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters); +} +print "Selected = $n_selected; Unselected = $n_unselected\n"; +exit; + +# +# RETURNS TRUE ONLY IF RECORD MATCHES (OPTIONAL) SEARCH CRITERIA +# +sub match +{ + my ($filters, $row) = @_; + foreach my $filterA (@$filters) { + my ($condition, $col, $value) = @$filterA; + my $x = $row->[ $col - 1 ]; + if ($condition eq 's_eq') { return 0 unless $x eq $value } + elsif ($condition eq 's_ne') { return 0 unless $x ne $value } + elsif ($condition eq 's_contains') { return 0 unless $x =~ /$value/ } + elsif ($condition eq 's_notcontains') { return 0 unless $x !~ /$value/ } + elsif ($condition eq 's_startswith') { return 0 unless $x =~ /^$value/ } + elsif ($condition eq 's_notstartswith') { return 0 unless $x !~ /^$value/ } + elsif ($condition eq 's_endswith') { return 0 unless $x =~ /$value$/ } + elsif ($condition eq 's_notendswith') { return 0 unless $x !~ /$value$/ } + elsif ($condition eq 'n_eq') { return 0 unless $x == $value } + elsif ($condition eq 'n_ne') { return 0 unless $x != $value } + elsif ($condition eq 'n_gt') { return 0 unless $x > $value } + elsif ($condition eq 'n_lt') { return 0 unless $x < $value } + } + return 1; +} + +# +# SIMULTANEOUSLY PARSE TWO STREAMS +# +sub search_cosorted +{ + my ($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters) = @_; + my $sfh = new IO::File; + my $ufh = new IO::File; + my $table = new IO::File; + my $n_selected = 0; + my $n_unselected = 0; + + # OPEN FILES + if ($tablefile) { + open($table, "<$tablefile") or die("Unable to open file, $tablefile: $!\n"); + } else { + $table=*STDIN; + } + if ($selected) { + if ($gzip) { +# open($sfh, '>:gzip', $selected) or die("Unable to open file, $selected: $!\n"); + } else { + open($sfh, ">$selected") or die("Unable to open file, $selected: $!\n"); + } + } else { + open($sfh, ">/dev/null"); + } + if ($unselected) { + if ($gzip) { +# open($ufh, '>:gzip', $unselected) or die("Unable to open file, $unselected: $!\n"); + } else { + open($ufh, ">$unselected") or die("Unable to open file, $unselected: $!\n"); + } + } else { + open($ufh, ">/dev/null"); + } + + # GET FIRST MATCHING TARGET ID + my $prev_target_id = ''; + my $target_id = ''; + get_next_matching_target_id($table,$id_col,$ignorecase,$filters,\$target_id,\$prev_target_id,$paired); + unless ($target_id) { + # no records match search criteria + close $table; + close $sfh if $selected; + if ($unselected) { + open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); + while (<DB>) { + print $ufh $_; + ++$n_unselected; + } + close DB; + } + close $ufh; + return 0; + } + + # DETERMINE FILETYPE + open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); + my $format; + while (<DB>) { + chomp; + if (/^#/ or ! $_) { next } + elsif (/^>/) { $format='fasta' } + elsif (/^@/) { $format='fastq' } + else { die "Invalid DB file format" } + last; + } + close DB; + + # PARSE + my $db = $format eq 'fasta' ? FastaDb->new($dbfile) : FastqDb->new($dbfile); + while (my $rec=$db->next_seq ) { + unless ($target_id) { + last unless $unselected; # done if no more seqs to get + # otherwise dump rest of seqs in unselected file + print $ufh $rec->output; + ++$n_unselected; + while ($rec=$db->next_seq ) { + print $ufh $rec->output; + ++$n_unselected; + } + last; + } + my $id=$ignorecase ? uc($rec->id):$rec->id; + if ($id eq $prev_target_id or $id eq $target_id) { + # selected seq + print $sfh $rec->output; + ++$n_selected; + get_next_matching_target_id($table,$id_col,$ignorecase,$filters,\$target_id,\$prev_target_id,$paired); + } else { + # unselected seq + print $ufh $rec->output; + ++$n_unselected; + } + } + close $table; + close $sfh; + close $ufh; + + # If some target seqs not found, it's likely the files were not cosorted, so try unsorted search function. + if ($target_id) { + print "Files don't appear to be cosorted, trying unsorted search\n"; + return search($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $filters); + } + return ($n_selected,$n_unselected); + + sub get_next_matching_target_id { + my ($table,$id_col,$ignorecase,$filters,$target_idR,$prev_target_idR,$paired)=@_; + $$prev_target_idR = $$target_idR; + $$target_idR = ''; + while (<$table>) { + chomp; + my @row = split(/\t/); + die("Bad input table") unless @row >= $id_col; + next unless match($filters, \@row); + my $new_target_id = $ignorecase ? uc($row[ $id_col - 1 ]) : $row[ $id_col - 1 ]; + $new_target_id=$1 if $new_target_id =~ /^(\S+)/; # use first word only + $new_target_id=$1 if $paired and $new_target_id =~ /^(\S+)\/[12]$/; + next if $new_target_id eq $$prev_target_idR; + $$target_idR=$new_target_id; + last; # return to parsing db file + } + } +} + +# +# LOAD IDS IN RAM THEN PARSE DB. +# +sub search +{ + my ($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters) = @_; + my $sfh = new IO::File; # selected seqs + my $ufh = new IO::File; # unselected seqs + my $table=new IO::File; + my $n_selected=0; + my $n_unselected=0; + my %ids = (); + open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); + if ($tablefile) { + open($table, "<$tablefile") or die("Unable to open file, $tablefile: $!\n"); + } else { + $table=*STDIN; + } + if ($selected) { + if ($gzip) { +# open($sfh, '>:gzip', $selected) or die("Unable to open file, $selected: $!\n"); + } else { + open($sfh, ">$selected") or die("Unable to open file, $selected: $!\n"); + } + } else { + open($sfh, ">/dev/null"); + } + if ($unselected) { + if ($gzip) { +# open($ufh, '>:gzip', $unselected) or die("Unable to open file, $unselected: $!\n"); + } else { + open($ufh, ">$unselected") or die("Unable to open file, $unselected: $!\n"); + } + } else { + open($ufh, ">/dev/null"); + } + + # LOAD IDS OF MATCHING ROWS + my $num_targets=0; + while (<$table>) { + next if /^#/; + chomp; + my @row = split(/\t/); + my $id = $ignorecase ? uc($row[ $id_col - 1 ]) : $row[ $id_col - 1 ]; + $id=$1 if $id =~ /^(\S+)/; + $id=$1 if $paired and $id =~ /^(\S+)\/[12]$/; + if (match($filters, \@row)) { + # remember this ID + $ids{$id} = 0; # number of reads with this ID found (counter for paired option) + ++$num_targets; + } + } + unless ($num_targets) { + # no records match search criteria + close $table; + close $sfh if $selected; + if ($unselected) { + open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); + while (<DB>) { + print $ufh $_; + ++$n_unselected; + } + close DB; + } + close $ufh; + return 0; + } + + + # DETERMINE FILETYPE + open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); + my $format; + while (<DB>) { + chomp; + if (/^#/ or /^$/) { next } + elsif (/^>/) { $format='fasta' } + elsif (/^@/) { $format='fastq' } + else { die "Invalid DB file format" } + last; + } + close DB; + + # GET SEQS + my $db = $format eq 'fasta' ? FastaDb->new($dbfile) : FastqDb->new($dbfile); + while (my $rec=$db->next_seq ) { + my $id = $ignorecase ? uc($rec->id) : $rec->id; + $id = $1 if $paired and $id =~ /^(\S+)\/[12]$/; + if (exists($ids{$id})) { + # selected + print $sfh $rec->output; + ++$n_selected; + if (!$paired) { + delete $ids{$id}; + } else { + $ids{$id} += 1; + delete $ids{$id} if $ids{$id} == 2; + } + } else { + # unselected + print $ufh $rec->output; + ++$n_unselected; + } + } + close $table; + close $sfh; + close $ufh; + + # MAKE SURE ALL TARGETS WERE FOUND + foreach my $id (keys %ids) { + if ($ids{$id}) { + delete($ids{$id}); # SOMETIMES INFILES CONTAIN ONLY ONE READ OF PAIR + }elsif($id eq 'EMPTY') { #ADDEDBY THO TO ALLOW EMPTY blast results + #in workflow using checkempty.pl + exit; + } else { + warn("Seq not found: $id\n"); + } + } + return ($n_selected,$n_unselected); +} + +__END__