| 0 | 1 #!/usr/bin/env perl | 
|  | 2 | 
|  | 3 use strict; | 
|  | 4 use warnings; | 
|  | 5 | 
|  | 6 if(@ARGV == 1 and $ARGV[0] eq "-v"){ | 
|  | 7   print "Version 1.0\n"; | 
|  | 8   exit; | 
|  | 9 } | 
|  | 10 | 
|  | 11 my $quiet = 0; | 
|  | 12 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){ | 
|  | 13   $quiet = 1; | 
|  | 14   shift @ARGV; | 
|  | 15 } | 
|  | 16 | 
|  | 17 @ARGV >= 6 and not @ARGV%2 or die "Usage: $0 [-q(uiet)] <true|false> <output file> <method_name1> <hgvs_file_1.txt> <method_name2> <hgvs_file_2.txt> ...\n", | 
|  | 18                                   "Where true or false determines if the genotypes calls should all be reported (if false, they are collapsed to the unique set of values observed)\n"; | 
|  | 19 | 
|  | 20 my %zygosity; | 
|  | 21 my %quality; | 
|  | 22 my %var_count; | 
|  | 23 my %tot_count; | 
|  | 24 my %methods; | 
|  | 25 my %data; | 
|  | 26 my $chr_prefix_exists = 0; # do any of the tools report chr1, if so we need to ensure 1 get reported as chr1 later for consistency across tools | 
|  | 27 | 
|  | 28 my $collapse = $ARGV[0] =~ /^true$/i; | 
|  | 29 | 
|  | 30 open(OUT, ">$ARGV[1]") | 
|  | 31   or die "Cannot open $ARGV[1] for writing: $!\n"; | 
|  | 32 | 
|  | 33 my @headers; | 
|  | 34 my ($chr_column, $zygosity_column, $pvalue_column, $alt_cnt_column, $tot_cnt_column, $transcript_column, $cdna_hgvs_column, $pos_column, $to_column, $srcs_column); | 
|  | 35 for(my $i=2; $i<$#ARGV; $i+=2){ | 
|  | 36   my $method_name = $ARGV[$i]; | 
|  | 37   my $infile = $ARGV[$i+1]; | 
|  | 38   print STDERR "Processing $infile...\n" unless $quiet; | 
|  | 39   open(IN, $infile) | 
|  | 40     or die "Cannot open $infile for reading: $!\n"; | 
|  | 41   my $header = <IN>; # header | 
|  | 42   chomp $header; | 
|  | 43   if($i == 2){ | 
|  | 44     @headers = split /\t/, $header; | 
|  | 45     for(my $j = 0; $j <= $#headers; $j++){ | 
|  | 46       if($headers[$j] eq "Chr"){ | 
|  | 47         $chr_column = $j; | 
|  | 48       } | 
|  | 49       elsif($headers[$j] eq "Zygosity"){ | 
|  | 50         $zygosity_column = $j; | 
|  | 51       } | 
|  | 52       elsif($headers[$j] eq "P-value"){ | 
|  | 53         $pvalue_column = $j; | 
|  | 54       } | 
|  | 55       elsif($headers[$j] eq "Variant Reads"){ | 
|  | 56         $alt_cnt_column = $j; | 
|  | 57       } | 
|  | 58       elsif($headers[$j] eq "Total Reads"){ | 
|  | 59         $tot_cnt_column = $j; | 
|  | 60       } | 
|  | 61       elsif($headers[$j] eq "Selected transcript"){ | 
|  | 62         $transcript_column = $j; | 
|  | 63       } | 
|  | 64       elsif($headers[$j] eq "Transcript HGVS"){ | 
|  | 65         $cdna_hgvs_column = $j; | 
|  | 66       } | 
|  | 67       elsif($headers[$j] eq "DNA From"){ | 
|  | 68         $pos_column = $j; | 
|  | 69       } | 
|  | 70       elsif($headers[$j] eq "DNA To"){ | 
|  | 71         $to_column = $j; | 
|  | 72       } | 
|  | 73       elsif($headers[$j] eq "Sources"){ | 
|  | 74         $srcs_column = $j; | 
|  | 75       } | 
|  | 76     } | 
|  | 77     if(defined $srcs_column){ | 
|  | 78       # print header as-is | 
|  | 79       print OUT "$header\n"; | 
|  | 80     } | 
|  | 81     else{ | 
|  | 82       # print header from the first file, with extra column for methods at the end | 
|  | 83       push @headers, "Sources"; | 
|  | 84       $srcs_column = $#headers; | 
|  | 85       print OUT join("\t", @headers), "\n"; | 
|  | 86     } | 
|  | 87   } | 
|  | 88   else{ | 
|  | 89     # Make sure the columns are the same in the various files | 
|  | 90     my @latest_headers = split /\t/, $header; | 
|  | 91     for(my $j = 0; $j <= $#latest_headers && $j <= $#headers; $j++){ | 
|  | 92       if($latest_headers[$j] ne $headers[$j]){ | 
|  | 93         die "Header column ", $j+1, "($latest_headers[$j]) in $ARGV[$i] is different from the equivalent column label ($headers[$j]) in $ARGV[2]. Aborting, cannot merge the files.\n"; | 
|  | 94       } | 
|  | 95     } | 
|  | 96   } | 
|  | 97   while(<IN>){ | 
|  | 98     chomp; | 
|  | 99     my @F = split /\t/, $_, -1; # -1 to keep trailing blank fields | 
|  | 100     if(not $chr_prefix_exists and $F[$chr_column] =~ /^chr/){ | 
|  | 101        $chr_prefix_exists = 1; | 
|  | 102     } | 
|  | 103     $F[$chr_column] =~ s/^chr//; # for consistency | 
|  | 104     my $key = "$F[$transcript_column]:$F[$cdna_hgvs_column]"; # transcript_id:cdna_hgvs is shared id for common variants amongst files | 
|  | 105     # record disagreement on zygosity if any | 
|  | 106     if(not defined $zygosity{$key}){ | 
|  | 107       $zygosity{$key} = []; | 
|  | 108       $quality{$key} = []; | 
|  | 109       $var_count{$key} = []; | 
|  | 110       $tot_count{$key} = []; | 
|  | 111       $data{$key} = \@F; | 
|  | 112       #print STDERR "Missing fields (only have ", scalar(@F), " for input '$_'\n" if $#F < $#headers; | 
|  | 113     } | 
|  | 114     push @{$zygosity{$key}}, split(/; /,$F[$zygosity_column]); | 
|  | 115     push @{$quality{$key}}, $F[$pvalue_column]; | 
|  | 116     push @{$var_count{$key}}, $F[$alt_cnt_column]; | 
|  | 117     push @{$tot_count{$key}}, $F[$tot_cnt_column]; | 
|  | 118     push @{$methods{$key}}, $method_name; | 
|  | 119   } | 
|  | 120   close(IN); | 
|  | 121 } | 
|  | 122 | 
|  | 123 if($chr_prefix_exists){ | 
|  | 124   for my $key (keys %data){ | 
|  | 125     # assuming there is a mix of chr1 and 1, always add the chr for consistency | 
|  | 126     $data{$key}->[$chr_column] = "chr".$data{$key}->[$chr_column] unless $data{$key}->[$chr_column] =~ /^chr/; | 
|  | 127   } | 
|  | 128 } | 
|  | 129 | 
|  | 130 # Sort by chr, then pos, then transcript_id | 
|  | 131 for my $key (sort {$data{$a}->[$chr_column] cmp $data{$b}->[$chr_column] or | 
|  | 132                    $data{$a}->[$pos_column] <=> $data{$b}->[$pos_column] or | 
|  | 133                    $data{$a}->[$to_column] <=> $data{$b}->[$to_column] or | 
|  | 134                    $data{$a}->[$transcript_column] cmp $data{$b}->[$transcript_column]} keys %data){ | 
|  | 135   # Before printing a line, merge the zygosity, variant quality, read count data if requested | 
|  | 136   my %seen; | 
|  | 137   if($collapse){ | 
|  | 138     my @zygosities = grep {$_ ne "NA" and not $seen{$_}++} @{$zygosity{$key}}; | 
|  | 139     if(@zygosities == 0){ | 
|  | 140       # do nothing, existing NA in 7th column will be a fine answer as they are all like tha | 
|  | 141     } | 
|  | 142     elsif(@zygosities == 1){ | 
|  | 143       # agreement by all methods, just print the one not NA | 
|  | 144       $data{$key}->[$zygosity_column] = $zygosities[0] if $data{$key}->[$zygosity_column] eq "NA"; | 
|  | 145     } | 
|  | 146     else{ | 
|  | 147       $data{$key}->[$zygosity_column] = join("/", sort keys %seen); # list all unique values that occur | 
|  | 148     } | 
|  | 149   } | 
|  | 150   else{ | 
|  | 151     $data{$key}->[$zygosity_column] = join("; ", @{$zygosity{$key}}); | 
|  | 152   } | 
|  | 153 | 
|  | 154   if($collapse){ | 
|  | 155     undef %seen; | 
|  | 156     my @qualities = grep {$_ ne "NA" and not $seen{$_}++} @{$quality{$key}}; | 
|  | 157     if(@qualities == 0){ | 
|  | 158       # do nothing, existing NA in 8th column will be a fine answer as they are all like that | 
|  | 159     } | 
|  | 160     elsif(@qualities == 1){ | 
|  | 161       # agreement by all methods, just print the one not NA | 
|  | 162       $data{$key}->[$pvalue_column] = $qualities[0] if $data{$key}->[8] eq "NA"; | 
|  | 163     } | 
|  | 164     else{ | 
|  | 165       # calculate the median for the collapsed value | 
|  | 166       my @sorted_qualities = sort {$a <=> $b} grep({$_ ne "NA"} @{$quality{$key}}); | 
|  | 167       my $median_quality = $sorted_qualities[int($#sorted_qualities/2)]; # rounds down (actually better score as this is a p-value) | 
|  | 168       $data{$key}->[$pvalue_column] = $median_quality; | 
|  | 169     } | 
|  | 170   } | 
|  | 171   else{ | 
|  | 172     $data{$key}->[$pvalue_column] = join("; ", @{$quality{$key}}); | 
|  | 173   } | 
|  | 174 | 
|  | 175   if($collapse){ | 
|  | 176     undef %seen; | 
|  | 177     my @var_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$var_count{$key}}; | 
|  | 178     undef %seen; | 
|  | 179     my @tot_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$tot_count{$key}}; | 
|  | 180     if(@var_counts == 0 and @tot_counts == 0){ | 
|  | 181       # do nothing, existing NAs okay | 
|  | 182     } | 
|  | 183     elsif(@var_counts == 1 and @tot_counts == 1){ | 
|  | 184       # agreement by all methods, just print the one in %data unless it's NA | 
|  | 185       $data{$key}->[$alt_cnt_column] = $var_counts[0] if $data{$key}->[$alt_cnt_column] eq "NA"; | 
|  | 186       $data{$key}->[$tot_cnt_column] = $tot_counts[0] if $data{$key}->[$tot_cnt_column] eq "NA"; | 
|  | 187     } | 
|  | 188     else{ | 
|  | 189       # calculate the median for the collapsed value | 
|  | 190       my @sorted_var_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$var_count{$key}}); | 
|  | 191       my $pivot = $#sorted_var_counts/2; | 
|  | 192       my $median_var_count = $pivot == int($pivot) ? $sorted_var_counts[$pivot] : # arithmetic mean | 
|  | 193                                                      int(($sorted_var_counts[int($pivot)]+$sorted_var_counts[int($pivot)+1])/2); | 
|  | 194       $data{$key}->[$alt_cnt_column] = $median_var_count; | 
|  | 195       my @sorted_tot_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$tot_count{$key}}); | 
|  | 196       $pivot = $#sorted_tot_counts/2; | 
|  | 197       my $median_tot_count = $pivot == int($pivot) ? $sorted_tot_counts[$pivot] : # arithmetic mean | 
|  | 198                                                      int(($sorted_tot_counts[int($pivot)]+$sorted_tot_counts[int($pivot)+1])/2); | 
|  | 199       $data{$key}->[$tot_cnt_column] = $median_tot_count; | 
|  | 200     } | 
|  | 201   } | 
|  | 202   else{ | 
|  | 203     # list the raw numbers | 
|  | 204     $data{$key}->[$alt_cnt_column] = join("; ", @{$var_count{$key}}); | 
|  | 205     $data{$key}->[$tot_cnt_column] = join("; ", @{$tot_count{$key}}); | 
|  | 206   } | 
|  | 207 | 
|  | 208   # to facilitate multiple rounds of combining, slash the extra column from the last round if it exists | 
|  | 209   $data{$key}->[$srcs_column] = join("; ", @{$methods{$key}}); | 
|  | 210   for(my $i = 0; $i <= $#{$data{$key}}; $i++){ | 
|  | 211     $data{$key}->[$i] = "" if not defined $data{$key}->[$i]; | 
|  | 212   } | 
|  | 213   print OUT join("\t", @{$data{$key}}), "\n"; | 
|  | 214 } | 
|  | 215 close(OUT); |