Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
view getdata/get_seqs.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
line wrap: on
line source
#!/jgi/tools/bin/perl -w # # This script creates a Fasta/Qual/Fastq file of selected sequences, with optional filters. # # 02/24/10 : created by Ed Kirton # 12/07/10 : fixed Fastq bug # use strict; use warnings; use Getopt::Long; use IO::File; #use PerlIO::gzip; use FindBin; use lib $FindBin::Bin; use FastaDb; use FastqDb; my $usage = <<'ENDHERE'; NAME: get_seqs.pl PURPOSE: To extract a subset of sequences by ID. INPUT: --db <*.fasta|fastq> : file containing sequences in Fasta or Fastq format --table <*.tsv> : file containing sequence IDs (optional; default=stdin) --col <int> : column of table containing sequence IDs (optional; default=1=first column) OUTPUT: --selected <*.fasta|fastq> : file containing named sequences --unselected <*.fasta|fastq> : file containing unselected sequences OPTIONS: --cosorted : uses faster algorithm if IDs appear in both files in the same order --paired : filter complete read-pair when one read is selected (requires Illumina-style read IDs; i.e. */1, */2) --ignore case : ignore case differences between IDs --gzip : compress all outfiles OPTIONAL FILTERS: Optional filters, each in the form of column:condition:value. Where column is the column in the table (containing IDs) Condition is one of the following: String operators: s_eq s_ne s_contains s_notcontains s_startswith s_notstartswith s_endswith s_notendswith Numerical operators: n_eq n_ne n_gt n_lt Where value is a string or number as appropriate. AUTHOR/SUPPORT: Edward Kirton (ESKirton@LBL.gov) ENDHERE # # VALIDATE INPUT # my ($help, $dbfile, $tablefile, $id_col, $ignorecase, $cosorted, $selected, $unselected, $gzip, $paired); GetOptions( 'd|db=s' => \$dbfile, 't|table=s' => \$tablefile, 'c|col=i' => \$id_col, 'ignorecase' => \$ignorecase, 'cosorted' => \$cosorted, 's|selected=s' => \$selected, 'u|unselected=s' => \$unselected, 'g|gzip' => \$gzip, 'p|paired' => \$paired, 'h|help' => \$help ); if ($help) { print $usage; exit; } die("DB required\n") unless $dbfile; die("DB file not found: $dbfile\n") unless -f $dbfile; die("Table required\n") unless $tablefile; die("Table file not found: $tablefile\n") unless -f $tablefile; $selected = '' if !defined($selected) or $selected eq 'None'; $unselected = '' if !defined($unselected) or $unselected eq 'None'; $id_col=1 unless $id_col; die("Invalid id column, $id_col\n") unless $id_col > 0; my $filters = []; while (my $filter = shift @ARGV) { next unless $filter; my @a_filter = split(/:/, $filter); die("Invalid number of filter options: @a_filter") unless @a_filter == 3; push @$filters, \@a_filter; } # # MAIN # my ($n_selected,$n_unselected); if ($cosorted) { # SEARCH IS FAST AND EASY IF INPUTS SIMILARLY SORTED! ($n_selected,$n_unselected) = search_cosorted($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters); } else { # INPUT NOT CO-SORTED SO KEEP ALL IDS IN RAM ($n_selected,$n_unselected) = search($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters); } print "Selected = $n_selected; Unselected = $n_unselected\n"; exit; # # RETURNS TRUE ONLY IF RECORD MATCHES (OPTIONAL) SEARCH CRITERIA # sub match { my ($filters, $row) = @_; foreach my $filterA (@$filters) { my ($condition, $col, $value) = @$filterA; my $x = $row->[ $col - 1 ]; if ($condition eq 's_eq') { return 0 unless $x eq $value } elsif ($condition eq 's_ne') { return 0 unless $x ne $value } elsif ($condition eq 's_contains') { return 0 unless $x =~ /$value/ } elsif ($condition eq 's_notcontains') { return 0 unless $x !~ /$value/ } elsif ($condition eq 's_startswith') { return 0 unless $x =~ /^$value/ } elsif ($condition eq 's_notstartswith') { return 0 unless $x !~ /^$value/ } elsif ($condition eq 's_endswith') { return 0 unless $x =~ /$value$/ } elsif ($condition eq 's_notendswith') { return 0 unless $x !~ /$value$/ } elsif ($condition eq 'n_eq') { return 0 unless $x == $value } elsif ($condition eq 'n_ne') { return 0 unless $x != $value } elsif ($condition eq 'n_gt') { return 0 unless $x > $value } elsif ($condition eq 'n_lt') { return 0 unless $x < $value } } return 1; } # # SIMULTANEOUSLY PARSE TWO STREAMS # sub search_cosorted { my ($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters) = @_; my $sfh = new IO::File; my $ufh = new IO::File; my $table = new IO::File; my $n_selected = 0; my $n_unselected = 0; # OPEN FILES if ($tablefile) { open($table, "<$tablefile") or die("Unable to open file, $tablefile: $!\n"); } else { $table=*STDIN; } if ($selected) { if ($gzip) { # open($sfh, '>:gzip', $selected) or die("Unable to open file, $selected: $!\n"); } else { open($sfh, ">$selected") or die("Unable to open file, $selected: $!\n"); } } else { open($sfh, ">/dev/null"); } if ($unselected) { if ($gzip) { # open($ufh, '>:gzip', $unselected) or die("Unable to open file, $unselected: $!\n"); } else { open($ufh, ">$unselected") or die("Unable to open file, $unselected: $!\n"); } } else { open($ufh, ">/dev/null"); } # GET FIRST MATCHING TARGET ID my $prev_target_id = ''; my $target_id = ''; get_next_matching_target_id($table,$id_col,$ignorecase,$filters,\$target_id,\$prev_target_id,$paired); unless ($target_id) { # no records match search criteria close $table; close $sfh if $selected; if ($unselected) { open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); while (<DB>) { print $ufh $_; ++$n_unselected; } close DB; } close $ufh; return 0; } # DETERMINE FILETYPE open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); my $format; while (<DB>) { chomp; if (/^#/ or ! $_) { next } elsif (/^>/) { $format='fasta' } elsif (/^@/) { $format='fastq' } else { die "Invalid DB file format" } last; } close DB; # PARSE my $db = $format eq 'fasta' ? FastaDb->new($dbfile) : FastqDb->new($dbfile); while (my $rec=$db->next_seq ) { unless ($target_id) { last unless $unselected; # done if no more seqs to get # otherwise dump rest of seqs in unselected file print $ufh $rec->output; ++$n_unselected; while ($rec=$db->next_seq ) { print $ufh $rec->output; ++$n_unselected; } last; } my $id=$ignorecase ? uc($rec->id):$rec->id; if ($id eq $prev_target_id or $id eq $target_id) { # selected seq print $sfh $rec->output; ++$n_selected; get_next_matching_target_id($table,$id_col,$ignorecase,$filters,\$target_id,\$prev_target_id,$paired); } else { # unselected seq print $ufh $rec->output; ++$n_unselected; } } close $table; close $sfh; close $ufh; # If some target seqs not found, it's likely the files were not cosorted, so try unsorted search function. if ($target_id) { print "Files don't appear to be cosorted, trying unsorted search\n"; return search($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $filters); } return ($n_selected,$n_unselected); sub get_next_matching_target_id { my ($table,$id_col,$ignorecase,$filters,$target_idR,$prev_target_idR,$paired)=@_; $$prev_target_idR = $$target_idR; $$target_idR = ''; while (<$table>) { chomp; my @row = split(/\t/); die("Bad input table") unless @row >= $id_col; next unless match($filters, \@row); my $new_target_id = $ignorecase ? uc($row[ $id_col - 1 ]) : $row[ $id_col - 1 ]; $new_target_id=$1 if $new_target_id =~ /^(\S+)/; # use first word only $new_target_id=$1 if $paired and $new_target_id =~ /^(\S+)\/[12]$/; next if $new_target_id eq $$prev_target_idR; $$target_idR=$new_target_id; last; # return to parsing db file } } } # # LOAD IDS IN RAM THEN PARSE DB. # sub search { my ($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters) = @_; my $sfh = new IO::File; # selected seqs my $ufh = new IO::File; # unselected seqs my $table=new IO::File; my $n_selected=0; my $n_unselected=0; my %ids = (); open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); if ($tablefile) { open($table, "<$tablefile") or die("Unable to open file, $tablefile: $!\n"); } else { $table=*STDIN; } if ($selected) { if ($gzip) { # open($sfh, '>:gzip', $selected) or die("Unable to open file, $selected: $!\n"); } else { open($sfh, ">$selected") or die("Unable to open file, $selected: $!\n"); } } else { open($sfh, ">/dev/null"); } if ($unselected) { if ($gzip) { # open($ufh, '>:gzip', $unselected) or die("Unable to open file, $unselected: $!\n"); } else { open($ufh, ">$unselected") or die("Unable to open file, $unselected: $!\n"); } } else { open($ufh, ">/dev/null"); } # LOAD IDS OF MATCHING ROWS my $num_targets=0; while (<$table>) { next if /^#/; chomp; my @row = split(/\t/); my $id = $ignorecase ? uc($row[ $id_col - 1 ]) : $row[ $id_col - 1 ]; $id=$1 if $id =~ /^(\S+)/; $id=$1 if $paired and $id =~ /^(\S+)\/[12]$/; if (match($filters, \@row)) { # remember this ID $ids{$id} = 0; # number of reads with this ID found (counter for paired option) ++$num_targets; } } unless ($num_targets) { # no records match search criteria close $table; close $sfh if $selected; if ($unselected) { open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); while (<DB>) { print $ufh $_; ++$n_unselected; } close DB; } close $ufh; return 0; } # DETERMINE FILETYPE open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); my $format; while (<DB>) { chomp; if (/^#/ or /^$/) { next } elsif (/^>/) { $format='fasta' } elsif (/^@/) { $format='fastq' } else { die "Invalid DB file format" } last; } close DB; # GET SEQS my $db = $format eq 'fasta' ? FastaDb->new($dbfile) : FastqDb->new($dbfile); while (my $rec=$db->next_seq ) { my $id = $ignorecase ? uc($rec->id) : $rec->id; $id = $1 if $paired and $id =~ /^(\S+)\/[12]$/; if (exists($ids{$id})) { # selected print $sfh $rec->output; ++$n_selected; if (!$paired) { delete $ids{$id}; } else { $ids{$id} += 1; delete $ids{$id} if $ids{$id} == 2; } } else { # unselected print $ufh $rec->output; ++$n_unselected; } } close $table; close $sfh; close $ufh; # MAKE SURE ALL TARGETS WERE FOUND foreach my $id (keys %ids) { if ($ids{$id}) { delete($ids{$id}); # SOMETIMES INFILES CONTAIN ONLY ONE READ OF PAIR }elsif($id eq 'EMPTY') { #ADDEDBY THO TO ALLOW EMPTY blast results #in workflow using checkempty.pl exit; } else { warn("Seq not found: $id\n"); } } return ($n_selected,$n_unselected); } __END__