Mercurial > repos > yusuf > filter_table_by_names
changeset 0:f92e6aff30b7 default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:35:07 -0600 |
parents | |
children | |
files | FilterTableByNamesList.xml filter_by_list |
diffstat | 2 files changed, 82 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
<?xml version="1.0"?>
<!-- FilterTableByNamesList.xml: Galaxy tool wrapper around the filter_by_list Perl script. -->

<tool id="filter_by_list_1" name="Filter a tabular file">
  <description>against a list of desired column values</description>
  <version_string>echo 1.0.0</version_string>
  <!-- Positional arguments, in order: case-sensitivity flag, input table, names file,
       output table, number of header lines. Keep this list in sync with the argument
       check at the top of filter_by_list. -->
  <command interpreter="perl">filter_by_list $case_sensitive $input_table $file_of_names $filtered_output_table $num_header_lines</command>
  <inputs>
    <param format="tabular" name="input_table" type="data" label="Text table to filter"/>
    <!-- "txt" is Galaxy's plain-text datatype extension ("text" is not a registered one). -->
    <param format="txt" name="file_of_names" type="data" label="Text file with target names" help="Input lines with any of these names in any column will be retained"/>
    <!-- A boolean's default comes from "checked"; the explicit true/false values are what
         the script receives as its first argument (it treats a leading "t" or "1" as true). -->
    <param name="case_sensitive" type="boolean" checked="true" truevalue="True" falsevalue="False" label="Case sensitive?"/>
    <param name="num_header_lines" type="integer" value="1" min="-1" max="100" label="Number of header lines" help="These will be copied to the output verbatim, without checking for names match. Specifying -1 will copy all lines starting with a pound sign (#)"/>
  </inputs>
  <outputs>
    <data name="filtered_output_table" format="tabular" type="data" label="Subset of table matching specific names"/>
  </outputs>

  <tests/>

  <help>
This tool retains lines of an input tabular file that have a column value matching any of the values in the "names" file. This is useful for example to
report only a subset of an HGVS or BED file corresponding to a set of genes of interest. The names file should have one name per line. If you are looking to
only match one name, it might be just as easy to use the generic Galaxy tool "Select lines that match an expression".
  </help>

</tool>
#!/usr/bin/env perl
# filter_by_list
#
# Report lines of a tab-delimited file that have, as one of their column values,
# a value from the patterns file.
#
# Positional arguments:
#   1. True|False            Case-sensitive matching when the value starts with
#                            "t" or "1" (any case); case-insensitive otherwise.
#   2. input.tab             Table to filter.
#   3. file of patterns      One literal name per line; blank lines are ignored.
#   4. matching output.tab   Receives the header lines and every matching line.
#   5. num header lines      N > 0 copies the first N lines verbatim; -1 copies
#                            every line starting with "#"; 0 copies nothing extra.
#   6. column # (optional)   Used directly as a Perl list subscript, so 0 is the
#                            first column and negative values count from the end.
#                            When omitted, the whole line is tested, i.e. a name
#                            in any column retains the line.
#   7. nonmatching output.tab (optional)
#                            Receives the header lines and every line that was
#                            not written to the matching output.
#
# A name matches when it is a complete tab-delimited value, or a complete item of
# a "; "-separated list inside a value.

use strict;
use warnings;

# Build the usage message shown when the argument count is wrong.
sub usage {
    return "Usage: $0 <True|False (case sensitive)> <input.tab> <file of patterns> "
         . "<matching output.tab> <num header line to retain> "
         . "[column # [nonmatching output.tab]]\n";
}

# Remove one trailing line terminator (LF or CRLF) from a string and return the copy.
sub strip_eol {
    my ($text) = @_;
    $text =~ s/\r?\n\z//;
    return $text;
}

# Read the names file and return the list of non-empty names, without line endings.
# Dies if the file cannot be opened.
sub read_names {
    my ($path) = @_;
    open(my $fh, '<', $path)
        or die "Cannot open $path for reading: $!\n";
    my @names;
    while (my $line = <$fh>) {
        my $name = strip_eol($line);
        # An empty name would become an empty regex alternative that matches
        # every empty field, so blank lines are skipped.
        next if $name eq '';
        push @names, $name;
    }
    close($fh);
    return @names;
}

# Compile the matcher for a list of literal names.
#   $names          - array reference of names (taken literally, never as regex)
#   $case_sensitive - true for exact-case matching
# Returns a compiled regex; with no names it is a pattern that never matches.
sub build_name_regex {
    my ($names, $case_sensitive) = @_;
    return qr/(?!)/ unless @{$names};
    my $alternation = join '|', map { quotemeta } @{$names};
    # A name must be bounded by start/end of the tested text, a tab, or "; ".
    my $pattern = "(?:\\A|\\t|; )(?:$alternation)(?:; |\\t|\\Z)";
    return $case_sensitive ? qr/$pattern/ : qr/$pattern/i;
}

# Run the filter with the given command-line arguments (see the file header).
# Returns 0 on success; dies with a message on bad arguments or I/O failure.
sub main {
    my @args = @_;
    (@args >= 5 && @args <= 7) or die usage();

    my ($case_flag, $table_path, $names_path, $match_path,
        $header_count, $column, $nonmatch_path) = @args;

    $header_count =~ /\A-?\d+\z/
        or die "Number of header lines must be an integer, got '$header_count'\n";
    if (defined $column) {
        $column =~ /\A-?\d+\z/
            or die "Column must be an integer, got '$column'\n";
    }

    my $case_sensitive = ($case_flag =~ /^[t1]/i) ? 1 : 0;
    my @names = read_names($names_path);
    my $regex = build_name_regex(\@names, $case_sensitive);

    open(my $match_out, '>', $match_path)
        or die "Cannot open $match_path for writing: $!\n";
    open(my $table, '<', $table_path)
        or die "Cannot open $table_path for reading: $!\n";
    my $nonmatch_out;
    if (defined $nonmatch_path) {
        open($nonmatch_out, '>', $nonmatch_path)
            or die "Cannot open $nonmatch_path for writing: $!\n";
    }

    # Decide this once, from the untouched argument, so that copying N header
    # lines can never switch the "#" pass-through on by accident.
    my $copy_comment_lines = ($header_count == -1);

    # The range is empty when $header_count is 0 or negative.
    for (1 .. $header_count) {
        my $header_line = <$table>;
        last unless defined $header_line;    # table shorter than the header count
        print {$match_out} $header_line;
        print {$nonmatch_out} $header_line if $nonmatch_out;
    }

    while (my $line = <$table>) {
        my $keep = 0;
        if ($copy_comment_lines && $line =~ /^#/) {
            $keep = 1;
        }
        else {
            # Test a copy without the line ending; the line itself is written
            # out unchanged.
            my $text = strip_eol($line);
            my $target = defined $column
                ? (split /\t/, $text, -1)[$column]
                : $text;
            # $target is undef when the line has no such column.
            $keep = 1 if defined $target && $target =~ $regex;
        }

        if ($keep) {
            print {$match_out} $line;
        }
        elsif ($nonmatch_out) {
            print {$nonmatch_out} $line;
        }
    }

    close($table);
    # Buffered write errors only surface at close, so check the output handles.
    close($match_out)
        or die "Cannot close $match_path: $!\n";
    if ($nonmatch_out) {
        close($nonmatch_out)
            or die "Cannot close $nonmatch_path: $!\n";
    }
    return 0;
}

# Run only when executed as a script, so the subs above can be loaded on their own.
exit(main(@ARGV)) unless caller;