diff split_hgvs_by_confidence @ 0:1a23ea467feb default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Thu, 26 Mar 2015 09:36:17 -0600
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/split_hgvs_by_confidence	Thu Mar 26 09:36:17 2015 -0600
@@ -0,0 +1,71 @@
+#!/usr/bin/env perl
+#
+# Split a tab-delimited combined HGVS table into a "confident" and a
+# "marginal" table.
+#
+# The header row must contain a column named "Sources", whose cells hold a
+# "; "-separated list of sources. A data row is written to the confident file
+# when it lists at least <min num sources> sources, or when the optional
+# [alt min regex] matches its Sources cell; every other row is written to the
+# marginal file. The header row is copied to both output files.
+
+use strict;
+use warnings;
+
+@ARGV > 3 or die "Usage: $0 <input.combined.hgvs.txt> <confident_out.hgvs.txt> <marginal_out.hgvs.txt> <min num sources> [alt min regex]\n";
+
+my $infile = shift @ARGV;
+my $confident_outfile = shift @ARGV;
+my $marginal_outfile = shift @ARGV;
+my $min_sources = shift @ARGV;
+my $alt_regex = @ARGV ? shift @ARGV : undef;
+
+# A non-numeric threshold would compare as 0 and send every row to the confident file.
+$min_sources =~ /\A\d+\z/
+  or die "Minimum number of sources must be a non-negative integer, got '$min_sources'\n";
+
+# Compile the optional pattern once, so a malformed one is reported before any file is opened.
+my $alt_re;
+if(defined $alt_regex){
+  $alt_re = eval { qr/$alt_regex/ };
+  die "Invalid alt min regex '$alt_regex': $@" if not defined $alt_re;
+}
+
+# Three-argument open with lexical handles: file names are never parsed for mode characters.
+open(my $in, '<', $infile)
+  or die "Cannot open $infile for reading: $!\n";
+open(my $confident, '>', $confident_outfile)
+  or die "Cannot open $confident_outfile for writing: $!\n";
+open(my $marginal, '>', $marginal_outfile)
+  or die "Cannot open $marginal_outfile for writing: $!\n";
+
+my $header = <$in>;
+die "Input file $infile is empty, aborting.\n" if not defined $header;
+print $confident $header;
+print $marginal $header;
+$header =~ s/\r?\n\z//; # strip LF or CRLF so the last column name compares cleanly
+my @headers = split /\t/, $header;
+my $srcs_column;
+for my $i (0 .. $#headers){
+  $srcs_column = $i if $headers[$i] eq "Sources";
+}
+die "Cannot find Sources column in header of $infile, aborting.\n" if not defined $srcs_column;
+
+while(my $line = <$in>){
+  # Parse a copy without the line ending; the original line is echoed verbatim below.
+  (my $record = $line) =~ s/\r?\n\z//;
+  my @F = split /\t/, $record;
+  # Rows too short to reach the Sources column are treated as listing no sources.
+  my $srcs = defined $F[$srcs_column] ? $F[$srcs_column] : "";
+  my @sources = split /; /, $srcs;
+  if(@sources >= $min_sources or defined $alt_re and $srcs =~ $alt_re){
+    print $confident $line;
+  }
+  else{
+    print $marginal $line;
+  }
+}
+close($in);
+# Buffered write errors only surface at close, so check the output handles.
+close($confident) or die "Error closing $confident_outfile: $!\n";
+close($marginal) or die "Error closing $marginal_outfile: $!\n";