Mercurial > repos > yusuf > filter_table_by_names

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/FilterTableByNamesList.xml	Wed Mar 25 13:35:07 2015 -0600
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+
+<!-- Galaxy tool wrapper around the filter_by_list Perl script -->
+<tool id="filter_by_list_1" name="Filter a tabular file">
+  <description>against a list of desired column values</description>
+  <version_string>echo 1.0.0</version_string>
+  <command interpreter="perl">filter_by_list $case_sensitive $input_table $file_of_names $filtered_output_table $num_header_lines</command>
+  <inputs>
+    <param format="tabular" name="input_table" type="data" label="Text table to filter"/>
+    <!-- "txt" is the Galaxy datatype extension for plain text; "text" is not a registered datatype -->
+    <param format="txt" name="file_of_names" type="data" label="Text file with target names" help="Input lines with any of these names in any column will be retained"/>
+    <!-- A boolean's default is set with "checked"; the command line receives the truevalue or falsevalue string -->
+    <param name="case_sensitive" type="boolean" checked="true" truevalue="True" falsevalue="False" label="Case sensitive?"/>
+    <param name="num_header_lines" type="integer" value="1" min="-1" max="100" label="Number of header lines" help="These will be copied to the output verbatim, without checking for names match. Specifying -1 will copy all lines starting with a pound sign (#)"/>
+  </inputs>
+  <outputs>
+    <data name="filtered_output_table" format="tabular" label="Subset of table matching specific names"/>
+  </outputs>
+
+  <tests/>
+
+  <help>
+This tool retains lines of an input tabular file that have a column value matching any of the values in the "names" file. This is useful for example to
+report only a subset of an HGVS or BED file corresponding to a set of genes of interest. The names file should have one name per line. If you are looking to
+only match one name, it might be just as easy to use the generic Galaxy tool "Select lines that match an expression".
+ </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_list	Wed Mar 25 13:35:07 2015 -0600
@@ -0,0 +1,89 @@
+#!/usr/bin/env perl
+
+# Report lines of a tab-delimited file that have as one of the column values a
+# value from the pattern file.
+#
+# Positional arguments:
+#   1. True|False           match case sensitively when this starts with t or 1
+#   2. input.tab            table to filter
+#   3. file of patterns     literal names, one per line; blank lines are ignored
+#   4. matching output.tab  receives the header lines and the retained lines
+#   5. num header lines     leading lines copied verbatim without matching; -1
+#                           instead retains every line that starts with "#"
+#   6. column #             optional 0-based index of the column to test; when
+#                           omitted, a name in any column retains the line
+#   7. nonmatching output   optional; receives the header lines and the lines
+#                           that were not retained
+#
+# A cell may list several values separated by "; ". Names are compared with
+# whole values only, never with part of a value.
+use strict;
+use warnings;
+
+@ARGV >= 5 and @ARGV <= 7 or die "Usage: $0 <True|False (case sensitive)> <input.tab> <file of patterns> <matching output.tab> <num header line to retain> [column #] [nonmatching output.tab]\n";
+my ($case_sensitive, $table_file, $names_file, $match_file, $num_header_lines, $column, $nonmatch_file) = @ARGV;
+$num_header_lines =~ /\A-?\d+\z/
+  or die "Number of header lines must be an integer, not '$num_header_lines'\n";
+!defined($column) or $column =~ /\A-?\d+\z/
+  or die "Column # must be an integer, not '$column'\n";
+
+# Turn the names into regex-escaped alternatives.
+open(my $names_fh, '<', $names_file)
+  or die "Cannot open $names_file for reading: $!\n";
+my @alts;
+while(my $name = <$names_fh>){
+  $name =~ s/\r?\n\z//; # like chomp, but also drops the \r of DOS line endings
+  next if $name eq '';  # an empty alternative would match every empty cell
+  push @alts, quotemeta($name);
+}
+close($names_fh);
+
+# A name only counts when delimited on both sides by a column boundary, a "; "
+# value separator or the start/end of the text. (?!) never matches, so an empty
+# names list retains nothing. Compiled once, as it is the same for every line.
+my $alternation = @alts ? join("|", @alts) : "(?!)";
+my $pattern = "(?:\\A|\\t|; )(?:$alternation)(?:; |\\t|\\r?\\Z)";
+my $regex = $case_sensitive =~ /^[t1]/i ? qr/$pattern/ : qr/$pattern/i;
+
+open(my $out_fh, '>', $match_file)
+  or die "Cannot open $match_file for writing: $!\n";
+open(my $tab_fh, '<', $table_file)
+  or die "Cannot open $table_file for reading: $!\n";
+my $nonmatch_fh;
+if(defined $nonmatch_file){
+  open($nonmatch_fh, '>', $nonmatch_file)
+    or die "Cannot open $nonmatch_file for writing: $!\n";
+}
+
+# Copy the header lines to every output. A range is used instead of counting
+# $num_header_lines down because its value is tested again in the loop below.
+for(1 .. $num_header_lines){
+  my $header_line = <$tab_fh>;
+  last unless defined $header_line; # the table has fewer lines than requested
+  print $out_fh $header_line;
+  print $nonmatch_fh $header_line if $nonmatch_fh;
+}
+
+my $keep_comment_lines = $num_header_lines == -1;
+while(my $line = <$tab_fh>){
+  # Test the requested column of this line, or the whole line when no column
+  # was given. A column the line does not have is treated as empty.
+  my $subject = $line;
+  if(defined $column){
+    my @F = split /\t/, $line;
+    $subject = defined $F[$column] ? $F[$column] : '';
+  }
+  if($subject =~ $regex or ($keep_comment_lines and $line =~ /^#/)){
+    print $out_fh $line;
+  }
+  elsif($nonmatch_fh){
+    print $nonmatch_fh $line;
+  }
+}
+close($tab_fh);
+close($out_fh)
+  or die "Cannot close $match_file: $!\n";
+if($nonmatch_fh){
+  close($nonmatch_fh)
+    or die "Cannot close $nonmatch_file: $!\n";
+}