0
+ − 1 #!/usr/bin/env perl
+ − 2
+ − 3 use strict;
+ − 4 use warnings;
+ − 5
+ − 6 if(@ARGV == 1 and $ARGV[0] eq "-v"){
+ − 7 print "Version 1.0\n";
+ − 8 exit;
+ − 9 }
+ − 10
+ − 11 my $quiet = 0;
+ − 12 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
+ − 13 $quiet = 1;
+ − 14 shift @ARGV;
+ − 15 }
+ − 16
+ − 17 @ARGV >= 6 and not @ARGV%2 or die "Usage: $0 [-q(uiet)] <true|false> <output file> <method_name1> <hgvs_file_1.txt> <method_name2> <hgvs_file_2.txt> ...\n",
+ − 18 "Where true or false determines if the genotypes calls should all be reported (if false, they are collapsed to the unique set of values observed)\n";
+ − 19
+ − 20 my %zygosity;
+ − 21 my %quality;
+ − 22 my %var_count;
+ − 23 my %tot_count;
+ − 24 my %methods;
+ − 25 my %data;
+ − 26 my $chr_prefix_exists = 0; # do any of the tools report chr1, if so we need to ensure 1 get reported as chr1 later for consistency across tools
+ − 27
+ − 28 my $collapse = $ARGV[0] =~ /^true$/i;
+ − 29
+ − 30 open(OUT, ">$ARGV[1]")
+ − 31 or die "Cannot open $ARGV[1] for writing: $!\n";
+ − 32
+ − 33 my @headers;
+ − 34 my ($chr_column, $zygosity_column, $pvalue_column, $alt_cnt_column, $tot_cnt_column, $transcript_column, $cdna_hgvs_column, $pos_column, $to_column, $srcs_column);
+ − 35 for(my $i=2; $i<$#ARGV; $i+=2){
+ − 36 my $method_name = $ARGV[$i];
+ − 37 my $infile = $ARGV[$i+1];
+ − 38 print STDERR "Processing $infile...\n" unless $quiet;
+ − 39 open(IN, $infile)
+ − 40 or die "Cannot open $infile for reading: $!\n";
+ − 41 my $header = <IN>; # header
+ − 42 chomp $header;
+ − 43 if($i == 2){
+ − 44 @headers = split /\t/, $header;
+ − 45 for(my $j = 0; $j <= $#headers; $j++){
+ − 46 if($headers[$j] eq "Chr"){
+ − 47 $chr_column = $j;
+ − 48 }
+ − 49 elsif($headers[$j] eq "Zygosity"){
+ − 50 $zygosity_column = $j;
+ − 51 }
+ − 52 elsif($headers[$j] eq "P-value"){
+ − 53 $pvalue_column = $j;
+ − 54 }
+ − 55 elsif($headers[$j] eq "Variant Reads"){
+ − 56 $alt_cnt_column = $j;
+ − 57 }
+ − 58 elsif($headers[$j] eq "Total Reads"){
+ − 59 $tot_cnt_column = $j;
+ − 60 }
+ − 61 elsif($headers[$j] eq "Selected transcript"){
+ − 62 $transcript_column = $j;
+ − 63 }
+ − 64 elsif($headers[$j] eq "Transcript HGVS"){
+ − 65 $cdna_hgvs_column = $j;
+ − 66 }
+ − 67 elsif($headers[$j] eq "DNA From"){
+ − 68 $pos_column = $j;
+ − 69 }
+ − 70 elsif($headers[$j] eq "DNA To"){
+ − 71 $to_column = $j;
+ − 72 }
+ − 73 elsif($headers[$j] eq "Sources"){
+ − 74 $srcs_column = $j;
+ − 75 }
+ − 76 }
+ − 77 if(defined $srcs_column){
+ − 78 # print header as-is
+ − 79 print OUT "$header\n";
+ − 80 }
+ − 81 else{
+ − 82 # print header from the first file, with extra column for methods at the end
+ − 83 push @headers, "Sources";
+ − 84 $srcs_column = $#headers;
+ − 85 print OUT join("\t", @headers), "\n";
+ − 86 }
+ − 87 }
+ − 88 else{
+ − 89 # Make sure the columns are the same in the various files
+ − 90 my @latest_headers = split /\t/, $header;
+ − 91 for(my $j = 0; $j <= $#latest_headers && $j <= $#headers; $j++){
+ − 92 if($latest_headers[$j] ne $headers[$j]){
+ − 93 die "Header column ", $j+1, "($latest_headers[$j]) in $ARGV[$i] is different from the equivalent column label ($headers[$j]) in $ARGV[2]. Aborting, cannot merge the files.\n";
+ − 94 }
+ − 95 }
+ − 96 }
+ − 97 while(<IN>){
+ − 98 chomp;
+ − 99 my @F = split /\t/, $_, -1; # -1 to keep trailing blank fields
+ − 100 if(not $chr_prefix_exists and $F[$chr_column] =~ /^chr/){
+ − 101 $chr_prefix_exists = 1;
+ − 102 }
+ − 103 $F[$chr_column] =~ s/^chr//; # for consistency
+ − 104 my $key = "$F[$transcript_column]:$F[$cdna_hgvs_column]"; # transcript_id:cdna_hgvs is shared id for common variants amongst files
+ − 105 # record disagreement on zygosity if any
+ − 106 if(not defined $zygosity{$key}){
+ − 107 $zygosity{$key} = [];
+ − 108 $quality{$key} = [];
+ − 109 $var_count{$key} = [];
+ − 110 $tot_count{$key} = [];
+ − 111 $data{$key} = \@F;
+ − 112 #print STDERR "Missing fields (only have ", scalar(@F), " for input '$_'\n" if $#F < $#headers;
+ − 113 }
+ − 114 push @{$zygosity{$key}}, split(/; /,$F[$zygosity_column]);
+ − 115 push @{$quality{$key}}, $F[$pvalue_column];
+ − 116 push @{$var_count{$key}}, $F[$alt_cnt_column];
+ − 117 push @{$tot_count{$key}}, $F[$tot_cnt_column];
+ − 118 push @{$methods{$key}}, $method_name;
+ − 119 }
+ − 120 close(IN);
+ − 121 }
+ − 122
+ − 123 if($chr_prefix_exists){
+ − 124 for my $key (keys %data){
+ − 125 # assuming there is a mix of chr1 and 1, always add the chr for consistency
+ − 126 $data{$key}->[$chr_column] = "chr".$data{$key}->[$chr_column] unless $data{$key}->[$chr_column] =~ /^chr/;
+ − 127 }
+ − 128 }
+ − 129
+ − 130 # Sort by chr, then pos, then transcript_id
+ − 131 for my $key (sort {$data{$a}->[$chr_column] cmp $data{$b}->[$chr_column] or
+ − 132 $data{$a}->[$pos_column] <=> $data{$b}->[$pos_column] or
+ − 133 $data{$a}->[$to_column] <=> $data{$b}->[$to_column] or
+ − 134 $data{$a}->[$transcript_column] cmp $data{$b}->[$transcript_column]} keys %data){
+ − 135 # Before printing a line, merge the zygosity, variant quality, read count data if requested
+ − 136 my %seen;
+ − 137 if($collapse){
+ − 138 my @zygosities = grep {$_ ne "NA" and not $seen{$_}++} @{$zygosity{$key}};
+ − 139 if(@zygosities == 0){
+ − 140 # do nothing, existing NA in 7th column will be a fine answer as they are all like tha
+ − 141 }
+ − 142 elsif(@zygosities == 1){
+ − 143 # agreement by all methods, just print the one not NA
+ − 144 $data{$key}->[$zygosity_column] = $zygosities[0] if $data{$key}->[$zygosity_column] eq "NA";
+ − 145 }
+ − 146 else{
+ − 147 $data{$key}->[$zygosity_column] = join("/", sort keys %seen); # list all unique values that occur
+ − 148 }
+ − 149 }
+ − 150 else{
+ − 151 $data{$key}->[$zygosity_column] = join("; ", @{$zygosity{$key}});
+ − 152 }
+ − 153
+ − 154 if($collapse){
+ − 155 undef %seen;
+ − 156 my @qualities = grep {$_ ne "NA" and not $seen{$_}++} @{$quality{$key}};
+ − 157 if(@qualities == 0){
+ − 158 # do nothing, existing NA in 8th column will be a fine answer as they are all like that
+ − 159 }
+ − 160 elsif(@qualities == 1){
+ − 161 # agreement by all methods, just print the one not NA
+ − 162 $data{$key}->[$pvalue_column] = $qualities[0] if $data{$key}->[8] eq "NA";
+ − 163 }
+ − 164 else{
+ − 165 # calculate the median for the collapsed value
+ − 166 my @sorted_qualities = sort {$a <=> $b} grep({$_ ne "NA"} @{$quality{$key}});
+ − 167 my $median_quality = $sorted_qualities[int($#sorted_qualities/2)]; # rounds down (actually better score as this is a p-value)
+ − 168 $data{$key}->[$pvalue_column] = $median_quality;
+ − 169 }
+ − 170 }
+ − 171 else{
+ − 172 $data{$key}->[$pvalue_column] = join("; ", @{$quality{$key}});
+ − 173 }
+ − 174
+ − 175 if($collapse){
+ − 176 undef %seen;
+ − 177 my @var_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$var_count{$key}};
+ − 178 undef %seen;
+ − 179 my @tot_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$tot_count{$key}};
+ − 180 if(@var_counts == 0 and @tot_counts == 0){
+ − 181 # do nothing, existing NAs okay
+ − 182 }
+ − 183 elsif(@var_counts == 1 and @tot_counts == 1){
+ − 184 # agreement by all methods, just print the one in %data unless it's NA
+ − 185 $data{$key}->[$alt_cnt_column] = $var_counts[0] if $data{$key}->[$alt_cnt_column] eq "NA";
+ − 186 $data{$key}->[$tot_cnt_column] = $tot_counts[0] if $data{$key}->[$tot_cnt_column] eq "NA";
+ − 187 }
+ − 188 else{
+ − 189 # calculate the median for the collapsed value
+ − 190 my @sorted_var_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$var_count{$key}});
+ − 191 my $pivot = $#sorted_var_counts/2;
+ − 192 my $median_var_count = $pivot == int($pivot) ? $sorted_var_counts[$pivot] : # arithmetic mean
+ − 193 int(($sorted_var_counts[int($pivot)]+$sorted_var_counts[int($pivot)+1])/2);
+ − 194 $data{$key}->[$alt_cnt_column] = $median_var_count;
+ − 195 my @sorted_tot_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$tot_count{$key}});
+ − 196 $pivot = $#sorted_tot_counts/2;
+ − 197 my $median_tot_count = $pivot == int($pivot) ? $sorted_tot_counts[$pivot] : # arithmetic mean
+ − 198 int(($sorted_tot_counts[int($pivot)]+$sorted_tot_counts[int($pivot)+1])/2);
+ − 199 $data{$key}->[$tot_cnt_column] = $median_tot_count;
+ − 200 }
+ − 201 }
+ − 202 else{
+ − 203 # list the raw numbers
+ − 204 $data{$key}->[$alt_cnt_column] = join("; ", @{$var_count{$key}});
+ − 205 $data{$key}->[$tot_cnt_column] = join("; ", @{$tot_count{$key}});
+ − 206 }
+ − 207
+ − 208 # to facilitate multiple rounds of combining, slash the extra column from the last round if it exists
+ − 209 $data{$key}->[$srcs_column] = join("; ", @{$methods{$key}});
+ − 210 for(my $i = 0; $i <= $#{$data{$key}}; $i++){
+ − 211 $data{$key}->[$i] = "" if not defined $data{$key}->[$i];
+ − 212 }
+ − 213 print OUT join("\t", @{$data{$key}}), "\n";
+ − 214 }
+ − 215 close(OUT);