view filter_by_list @ 0:f92e6aff30b7 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:35:07 -0600
parents
children
line wrap: on
line source

#!/usr/bin/env perl

# Report lines of a file that have as one of the column values a value from the pattern file
@ARGV == 6 or @ARGV == 7 or die "Usage: $0 <True|False (case sensitive)> <input.tab> <file of patterns> <matching output.tab> <num header line to retain> <column #> [nonmatching output.tab]\n";

open(PATTERNS, $ARGV[2])
  or die "Cannot open $ARGV[1] for reading: $!\n";
my @alts;
while(<PATTERNS>){
  chomp;
  push @alts, quotemeta($_);
}
close(PATTERNS);

my $regex = "(?:\\A|\\t|; )(?:".join("|", @alts).")(?:; |\\t|\\Z)";
#print STDERR "Regex is $regex\n";
open(OUT, ">$ARGV[3]")
  or die "Cannot open $ARGV[3] for writing: $!\n";
open(TAB, $ARGV[1])
  or die "Cannot open $ARGV[1] for reading: $!\n";
if(@ARGV == 7){
  open(NONMATCH, ">$ARGV[6]")
    or die "Cannot open $ARGV[6] for writing: $!\n";
}
my $num_header_lines = $ARGV[4];
if($num_header_lines > 0){
  while($num_header_lines--){
    my $header_line = <TAB>;
    print OUT $header_line;
    print NONMATCH $header_line if @ARGV == 6;
  }
}
if($ARGV[0] =~ /^[t1]/i){
  my @F = split /\t/, $_;
  while(<TAB>){
    if($F[$ARGV[5]] =~ /$regex/o or $num_header_lines == -1 and /^#/){
      print OUT $_;
    }
    elsif(@ARGV == 7){
      print NONMATCH $_;
    }
  }
}
else{ # case insensitive
  while(<TAB>){
    my @F = split /\t/, $_;
    if($F[$ARGV[5]] =~ /$regex/io or $num_header_lines == -1 and /^#/){
#      print STDERR $F[$ARGV[5]], "\n";
      print OUT $_;
    }
    elsif(@ARGV == 7){
      print NONMATCH $_;
    }
  }
}
close(TAB);
close(OUT);