| 
0
 | 
     1 #!/usr/bin/env perl
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 use strict;
 | 
| 
 | 
     4 use warnings;
 | 
| 
 | 
     5 
 | 
| 
 | 
     6 if(@ARGV == 1 and $ARGV[0] eq "-v"){
 | 
| 
 | 
     7   print "Version 1.0\n";
 | 
| 
 | 
     8   exit;
 | 
| 
 | 
     9 }
 | 
| 
 | 
    10 
 | 
| 
 | 
    11 my $quiet = 0;
 | 
| 
 | 
    12 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
 | 
| 
 | 
    13   $quiet = 1;
 | 
| 
 | 
    14   shift @ARGV;
 | 
| 
 | 
    15 }
 | 
| 
 | 
    16 
 | 
| 
 | 
    17 @ARGV >= 6 and not @ARGV%2 or die "Usage: $0 [-q(uiet)] <true|false> <output file> <method_name1> <hgvs_file_1.txt> <method_name2> <hgvs_file_2.txt> ...\n",
 | 
| 
 | 
    18                                   "Where true or false determines if the genotypes calls should all be reported (if false, they are collapsed to the unique set of values observed)\n";
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 my %zygosity;
 | 
| 
 | 
    21 my %quality;
 | 
| 
 | 
    22 my %var_count;
 | 
| 
 | 
    23 my %tot_count;
 | 
| 
 | 
    24 my %methods;
 | 
| 
 | 
    25 my %data;
 | 
| 
 | 
    26 my $chr_prefix_exists = 0; # do any of the tools report chr1, if so we need to ensure 1 get reported as chr1 later for consistency across tools
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 my $collapse = $ARGV[0] =~ /^true$/i;
 | 
| 
 | 
    29 
 | 
| 
 | 
    30 open(OUT, ">$ARGV[1]")
 | 
| 
 | 
    31   or die "Cannot open $ARGV[1] for writing: $!\n";
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 my @headers;
 | 
| 
 | 
    34 my ($chr_column, $zygosity_column, $pvalue_column, $alt_cnt_column, $tot_cnt_column, $transcript_column, $cdna_hgvs_column, $pos_column, $to_column, $srcs_column);
 | 
| 
 | 
    35 for(my $i=2; $i<$#ARGV; $i+=2){
 | 
| 
 | 
    36   my $method_name = $ARGV[$i];
 | 
| 
 | 
    37   my $infile = $ARGV[$i+1];
 | 
| 
 | 
    38   print STDERR "Processing $infile...\n" unless $quiet;
 | 
| 
 | 
    39   open(IN, $infile)
 | 
| 
 | 
    40     or die "Cannot open $infile for reading: $!\n";
 | 
| 
 | 
    41   my $header = <IN>; # header
 | 
| 
 | 
    42   chomp $header;
 | 
| 
 | 
    43   if($i == 2){
 | 
| 
 | 
    44     @headers = split /\t/, $header;
 | 
| 
 | 
    45     for(my $j = 0; $j <= $#headers; $j++){
 | 
| 
 | 
    46       if($headers[$j] eq "Chr"){
 | 
| 
 | 
    47         $chr_column = $j;
 | 
| 
 | 
    48       }
 | 
| 
 | 
    49       elsif($headers[$j] eq "Zygosity"){
 | 
| 
 | 
    50         $zygosity_column = $j;
 | 
| 
 | 
    51       }
 | 
| 
 | 
    52       elsif($headers[$j] eq "P-value"){
 | 
| 
 | 
    53         $pvalue_column = $j;
 | 
| 
 | 
    54       }
 | 
| 
 | 
    55       elsif($headers[$j] eq "Variant Reads"){
 | 
| 
 | 
    56         $alt_cnt_column = $j;
 | 
| 
 | 
    57       }
 | 
| 
 | 
    58       elsif($headers[$j] eq "Total Reads"){
 | 
| 
 | 
    59         $tot_cnt_column = $j;
 | 
| 
 | 
    60       }
 | 
| 
 | 
    61       elsif($headers[$j] eq "Selected transcript"){
 | 
| 
 | 
    62         $transcript_column = $j;
 | 
| 
 | 
    63       }
 | 
| 
 | 
    64       elsif($headers[$j] eq "Transcript HGVS"){
 | 
| 
 | 
    65         $cdna_hgvs_column = $j;
 | 
| 
 | 
    66       }
 | 
| 
 | 
    67       elsif($headers[$j] eq "DNA From"){
 | 
| 
 | 
    68         $pos_column = $j;
 | 
| 
 | 
    69       }
 | 
| 
 | 
    70       elsif($headers[$j] eq "DNA To"){
 | 
| 
 | 
    71         $to_column = $j;
 | 
| 
 | 
    72       }
 | 
| 
 | 
    73       elsif($headers[$j] eq "Sources"){
 | 
| 
 | 
    74         $srcs_column = $j;
 | 
| 
 | 
    75       }
 | 
| 
 | 
    76     }
 | 
| 
 | 
    77     if(defined $srcs_column){
 | 
| 
 | 
    78       # print header as-is
 | 
| 
 | 
    79       print OUT "$header\n";
 | 
| 
 | 
    80     }
 | 
| 
 | 
    81     else{
 | 
| 
 | 
    82       # print header from the first file, with extra column for methods at the end
 | 
| 
 | 
    83       push @headers, "Sources";
 | 
| 
 | 
    84       $srcs_column = $#headers;
 | 
| 
 | 
    85       print OUT join("\t", @headers), "\n";
 | 
| 
 | 
    86     }
 | 
| 
 | 
    87   }
 | 
| 
 | 
    88   else{
 | 
| 
 | 
    89     # Make sure the columns are the same in the various files
 | 
| 
 | 
    90     my @latest_headers = split /\t/, $header;
 | 
| 
 | 
    91     for(my $j = 0; $j <= $#latest_headers && $j <= $#headers; $j++){
 | 
| 
 | 
    92       if($latest_headers[$j] ne $headers[$j]){
 | 
| 
 | 
    93         die "Header column ", $j+1, "($latest_headers[$j]) in $ARGV[$i] is different from the equivalent column label ($headers[$j]) in $ARGV[2]. Aborting, cannot merge the files.\n";
 | 
| 
 | 
    94       }
 | 
| 
 | 
    95     }
 | 
| 
 | 
    96   }
 | 
| 
 | 
    97   while(<IN>){
 | 
| 
 | 
    98     chomp;
 | 
| 
 | 
    99     my @F = split /\t/, $_, -1; # -1 to keep trailing blank fields
 | 
| 
 | 
   100     if(not $chr_prefix_exists and $F[$chr_column] =~ /^chr/){
 | 
| 
 | 
   101        $chr_prefix_exists = 1;
 | 
| 
 | 
   102     }
 | 
| 
 | 
   103     $F[$chr_column] =~ s/^chr//; # for consistency
 | 
| 
 | 
   104     my $key = "$F[$transcript_column]:$F[$cdna_hgvs_column]"; # transcript_id:cdna_hgvs is shared id for common variants amongst files
 | 
| 
 | 
   105     # record disagreement on zygosity if any
 | 
| 
 | 
   106     if(not defined $zygosity{$key}){
 | 
| 
 | 
   107       $zygosity{$key} = [];
 | 
| 
 | 
   108       $quality{$key} = [];
 | 
| 
 | 
   109       $var_count{$key} = [];
 | 
| 
 | 
   110       $tot_count{$key} = [];
 | 
| 
 | 
   111       $data{$key} = \@F;
 | 
| 
 | 
   112       #print STDERR "Missing fields (only have ", scalar(@F), " for input '$_'\n" if $#F < $#headers;
 | 
| 
 | 
   113     }
 | 
| 
 | 
   114     push @{$zygosity{$key}}, split(/; /,$F[$zygosity_column]);
 | 
| 
 | 
   115     push @{$quality{$key}}, $F[$pvalue_column];
 | 
| 
 | 
   116     push @{$var_count{$key}}, $F[$alt_cnt_column];
 | 
| 
 | 
   117     push @{$tot_count{$key}}, $F[$tot_cnt_column];
 | 
| 
 | 
   118     push @{$methods{$key}}, $method_name;
 | 
| 
 | 
   119   }
 | 
| 
 | 
   120   close(IN);
 | 
| 
 | 
   121 }
 | 
| 
 | 
   122 
 | 
| 
 | 
   123 if($chr_prefix_exists){
 | 
| 
 | 
   124   for my $key (keys %data){
 | 
| 
 | 
   125     # assuming there is a mix of chr1 and 1, always add the chr for consistency
 | 
| 
 | 
   126     $data{$key}->[$chr_column] = "chr".$data{$key}->[$chr_column] unless $data{$key}->[$chr_column] =~ /^chr/;
 | 
| 
 | 
   127   }
 | 
| 
 | 
   128 }
 | 
| 
 | 
   129 
 | 
| 
 | 
   130 # Sort by chr, then pos, then transcript_id
 | 
| 
 | 
   131 for my $key (sort {$data{$a}->[$chr_column] cmp $data{$b}->[$chr_column] or 
 | 
| 
 | 
   132                    $data{$a}->[$pos_column] <=> $data{$b}->[$pos_column] or 
 | 
| 
 | 
   133                    $data{$a}->[$to_column] <=> $data{$b}->[$to_column] or 
 | 
| 
 | 
   134                    $data{$a}->[$transcript_column] cmp $data{$b}->[$transcript_column]} keys %data){
 | 
| 
 | 
   135   # Before printing a line, merge the zygosity, variant quality, read count data if requested
 | 
| 
 | 
   136   my %seen;
 | 
| 
 | 
   137   if($collapse){
 | 
| 
 | 
   138     my @zygosities = grep {$_ ne "NA" and not $seen{$_}++} @{$zygosity{$key}};
 | 
| 
 | 
   139     if(@zygosities == 0){
 | 
| 
 | 
   140       # do nothing, existing NA in 7th column will be a fine answer as they are all like tha
 | 
| 
 | 
   141     }
 | 
| 
 | 
   142     elsif(@zygosities == 1){
 | 
| 
 | 
   143       # agreement by all methods, just print the one not NA
 | 
| 
 | 
   144       $data{$key}->[$zygosity_column] = $zygosities[0] if $data{$key}->[$zygosity_column] eq "NA";
 | 
| 
 | 
   145     }
 | 
| 
 | 
   146     else{
 | 
| 
 | 
   147       $data{$key}->[$zygosity_column] = join("/", sort keys %seen); # list all unique values that occur
 | 
| 
 | 
   148     }
 | 
| 
 | 
   149   }
 | 
| 
 | 
   150   else{
 | 
| 
 | 
   151     $data{$key}->[$zygosity_column] = join("; ", @{$zygosity{$key}});
 | 
| 
 | 
   152   }
 | 
| 
 | 
   153 
 | 
| 
 | 
   154   if($collapse){
 | 
| 
 | 
   155     undef %seen;
 | 
| 
 | 
   156     my @qualities = grep {$_ ne "NA" and not $seen{$_}++} @{$quality{$key}};
 | 
| 
 | 
   157     if(@qualities == 0){
 | 
| 
 | 
   158       # do nothing, existing NA in 8th column will be a fine answer as they are all like that
 | 
| 
 | 
   159     }
 | 
| 
 | 
   160     elsif(@qualities == 1){
 | 
| 
 | 
   161       # agreement by all methods, just print the one not NA
 | 
| 
 | 
   162       $data{$key}->[$pvalue_column] = $qualities[0] if $data{$key}->[8] eq "NA";
 | 
| 
 | 
   163     }
 | 
| 
 | 
   164     else{
 | 
| 
 | 
   165       # calculate the median for the collapsed value
 | 
| 
 | 
   166       my @sorted_qualities = sort {$a <=> $b} grep({$_ ne "NA"} @{$quality{$key}});
 | 
| 
 | 
   167       my $median_quality = $sorted_qualities[int($#sorted_qualities/2)]; # rounds down (actually better score as this is a p-value)
 | 
| 
 | 
   168       $data{$key}->[$pvalue_column] = $median_quality;
 | 
| 
 | 
   169     }
 | 
| 
 | 
   170   }
 | 
| 
 | 
   171   else{
 | 
| 
 | 
   172     $data{$key}->[$pvalue_column] = join("; ", @{$quality{$key}});
 | 
| 
 | 
   173   }
 | 
| 
 | 
   174 
 | 
| 
 | 
   175   if($collapse){
 | 
| 
 | 
   176     undef %seen;
 | 
| 
 | 
   177     my @var_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$var_count{$key}};
 | 
| 
 | 
   178     undef %seen;
 | 
| 
 | 
   179     my @tot_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$tot_count{$key}};
 | 
| 
 | 
   180     if(@var_counts == 0 and @tot_counts == 0){
 | 
| 
 | 
   181       # do nothing, existing NAs okay
 | 
| 
 | 
   182     }
 | 
| 
 | 
   183     elsif(@var_counts == 1 and @tot_counts == 1){
 | 
| 
 | 
   184       # agreement by all methods, just print the one in %data unless it's NA
 | 
| 
 | 
   185       $data{$key}->[$alt_cnt_column] = $var_counts[0] if $data{$key}->[$alt_cnt_column] eq "NA";
 | 
| 
 | 
   186       $data{$key}->[$tot_cnt_column] = $tot_counts[0] if $data{$key}->[$tot_cnt_column] eq "NA";
 | 
| 
 | 
   187     }
 | 
| 
 | 
   188     else{
 | 
| 
 | 
   189       # calculate the median for the collapsed value
 | 
| 
 | 
   190       my @sorted_var_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$var_count{$key}});
 | 
| 
 | 
   191       my $pivot = $#sorted_var_counts/2;
 | 
| 
 | 
   192       my $median_var_count = $pivot == int($pivot) ? $sorted_var_counts[$pivot] : # arithmetic mean
 | 
| 
 | 
   193                                                      int(($sorted_var_counts[int($pivot)]+$sorted_var_counts[int($pivot)+1])/2);
 | 
| 
 | 
   194       $data{$key}->[$alt_cnt_column] = $median_var_count;
 | 
| 
 | 
   195       my @sorted_tot_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$tot_count{$key}});
 | 
| 
 | 
   196       $pivot = $#sorted_tot_counts/2;
 | 
| 
 | 
   197       my $median_tot_count = $pivot == int($pivot) ? $sorted_tot_counts[$pivot] : # arithmetic mean
 | 
| 
 | 
   198                                                      int(($sorted_tot_counts[int($pivot)]+$sorted_tot_counts[int($pivot)+1])/2);
 | 
| 
 | 
   199       $data{$key}->[$tot_cnt_column] = $median_tot_count;
 | 
| 
 | 
   200     }
 | 
| 
 | 
   201   }
 | 
| 
 | 
   202   else{
 | 
| 
 | 
   203     # list the raw numbers
 | 
| 
 | 
   204     $data{$key}->[$alt_cnt_column] = join("; ", @{$var_count{$key}});
 | 
| 
 | 
   205     $data{$key}->[$tot_cnt_column] = join("; ", @{$tot_count{$key}});
 | 
| 
 | 
   206   }
 | 
| 
 | 
   207 
 | 
| 
 | 
   208   # to facilitate multiple rounds of combining, slash the extra column from the last round if it exists
 | 
| 
 | 
   209   $data{$key}->[$srcs_column] = join("; ", @{$methods{$key}});
 | 
| 
 | 
   210   for(my $i = 0; $i <= $#{$data{$key}}; $i++){
 | 
| 
 | 
   211     $data{$key}->[$i] = "" if not defined $data{$key}->[$i];
 | 
| 
 | 
   212   }
 | 
| 
 | 
   213   print OUT join("\t", @{$data{$key}}), "\n";
 | 
| 
 | 
   214 }
 | 
| 
 | 
   215 close(OUT);
 |