annotate combine_hgvs_tables @ 0:baf1543e8ae1 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:27:49 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use strict;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use warnings;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6 if(@ARGV == 1 and $ARGV[0] eq "-v"){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 print "Version 1.0\n";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 exit;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 my $quiet = 0;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 $quiet = 1;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 shift @ARGV;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 @ARGV >= 6 and not @ARGV%2 or die "Usage: $0 [-q(uiet)] <true|false> <output file> <method_name1> <hgvs_file_1.txt> <method_name2> <hgvs_file_2.txt> ...\n",
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 "Where true or false determines if the genotypes calls should all be reported (if false, they are collapsed to the unique set of values observed)\n";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 my %zygosity;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 my %quality;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 my %var_count;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 my %tot_count;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 my %methods;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 my %data;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 my $chr_prefix_exists = 0; # do any of the tools report chr1, if so we need to ensure 1 get reported as chr1 later for consistency across tools
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 my $collapse = $ARGV[0] =~ /^true$/i;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 open(OUT, ">$ARGV[1]")
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 or die "Cannot open $ARGV[1] for writing: $!\n";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 my @headers;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 my ($chr_column, $zygosity_column, $pvalue_column, $alt_cnt_column, $tot_cnt_column, $transcript_column, $cdna_hgvs_column, $pos_column, $to_column, $srcs_column);
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 for(my $i=2; $i<$#ARGV; $i+=2){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 my $method_name = $ARGV[$i];
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 my $infile = $ARGV[$i+1];
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 print STDERR "Processing $infile...\n" unless $quiet;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 open(IN, $infile)
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 or die "Cannot open $infile for reading: $!\n";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 my $header = <IN>; # header
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 chomp $header;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 if($i == 2){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 @headers = split /\t/, $header;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 for(my $j = 0; $j <= $#headers; $j++){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 if($headers[$j] eq "Chr"){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 $chr_column = $j;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 elsif($headers[$j] eq "Zygosity"){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 $zygosity_column = $j;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 elsif($headers[$j] eq "P-value"){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 $pvalue_column = $j;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 elsif($headers[$j] eq "Variant Reads"){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 $alt_cnt_column = $j;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 elsif($headers[$j] eq "Total Reads"){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 $tot_cnt_column = $j;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 elsif($headers[$j] eq "Selected transcript"){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 $transcript_column = $j;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 elsif($headers[$j] eq "Transcript HGVS"){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 $cdna_hgvs_column = $j;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 elsif($headers[$j] eq "DNA From"){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 $pos_column = $j;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 elsif($headers[$j] eq "DNA To"){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 $to_column = $j;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 elsif($headers[$j] eq "Sources"){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 $srcs_column = $j;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 if(defined $srcs_column){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 # print header as-is
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 print OUT "$header\n";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 else{
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82 # print header from the first file, with extra column for methods at the end
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 push @headers, "Sources";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 $srcs_column = $#headers;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 print OUT join("\t", @headers), "\n";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 else{
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 # Make sure the columns are the same in the various files
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 my @latest_headers = split /\t/, $header;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 for(my $j = 0; $j <= $#latest_headers && $j <= $#headers; $j++){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 if($latest_headers[$j] ne $headers[$j]){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 die "Header column ", $j+1, "($latest_headers[$j]) in $ARGV[$i] is different from the equivalent column label ($headers[$j]) in $ARGV[2]. Aborting, cannot merge the files.\n";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 while(<IN>){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 chomp;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 my @F = split /\t/, $_, -1; # -1 to keep trailing blank fields
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 if(not $chr_prefix_exists and $F[$chr_column] =~ /^chr/){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 $chr_prefix_exists = 1;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 $F[$chr_column] =~ s/^chr//; # for consistency
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104 my $key = "$F[$transcript_column]:$F[$cdna_hgvs_column]"; # transcript_id:cdna_hgvs is shared id for common variants amongst files
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 # record disagreement on zygosity if any
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 if(not defined $zygosity{$key}){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 $zygosity{$key} = [];
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 $quality{$key} = [];
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 $var_count{$key} = [];
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 $tot_count{$key} = [];
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 $data{$key} = \@F;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 #print STDERR "Missing fields (only have ", scalar(@F), " for input '$_'\n" if $#F < $#headers;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114 push @{$zygosity{$key}}, split(/; /,$F[$zygosity_column]);
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 push @{$quality{$key}}, $F[$pvalue_column];
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116 push @{$var_count{$key}}, $F[$alt_cnt_column];
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117 push @{$tot_count{$key}}, $F[$tot_cnt_column];
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118 push @{$methods{$key}}, $method_name;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120 close(IN);
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123 if($chr_prefix_exists){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124 for my $key (keys %data){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125 # assuming there is a mix of chr1 and 1, always add the chr for consistency
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 $data{$key}->[$chr_column] = "chr".$data{$key}->[$chr_column] unless $data{$key}->[$chr_column] =~ /^chr/;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130 # Sort by chr, then pos, then transcript_id
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131 for my $key (sort {$data{$a}->[$chr_column] cmp $data{$b}->[$chr_column] or
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132 $data{$a}->[$pos_column] <=> $data{$b}->[$pos_column] or
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
133 $data{$a}->[$to_column] <=> $data{$b}->[$to_column] or
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
134 $data{$a}->[$transcript_column] cmp $data{$b}->[$transcript_column]} keys %data){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
135 # Before printing a line, merge the zygosity, variant quality, read count data if requested
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
136 my %seen;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
137 if($collapse){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
138 my @zygosities = grep {$_ ne "NA" and not $seen{$_}++} @{$zygosity{$key}};
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
139 if(@zygosities == 0){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
140 # do nothing, existing NA in 7th column will be a fine answer as they are all like tha
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
141 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
142 elsif(@zygosities == 1){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
143 # agreement by all methods, just print the one not NA
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
144 $data{$key}->[$zygosity_column] = $zygosities[0] if $data{$key}->[$zygosity_column] eq "NA";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
145 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
146 else{
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
147 $data{$key}->[$zygosity_column] = join("/", sort keys %seen); # list all unique values that occur
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
148 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
149 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
150 else{
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
151 $data{$key}->[$zygosity_column] = join("; ", @{$zygosity{$key}});
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
152 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
153
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
154 if($collapse){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
155 undef %seen;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
156 my @qualities = grep {$_ ne "NA" and not $seen{$_}++} @{$quality{$key}};
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
157 if(@qualities == 0){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
158 # do nothing, existing NA in 8th column will be a fine answer as they are all like that
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
159 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
160 elsif(@qualities == 1){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
161 # agreement by all methods, just print the one not NA
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
162 $data{$key}->[$pvalue_column] = $qualities[0] if $data{$key}->[8] eq "NA";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
163 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
164 else{
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
165 # calculate the median for the collapsed value
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
166 my @sorted_qualities = sort {$a <=> $b} grep({$_ ne "NA"} @{$quality{$key}});
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
167 my $median_quality = $sorted_qualities[int($#sorted_qualities/2)]; # rounds down (actually better score as this is a p-value)
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
168 $data{$key}->[$pvalue_column] = $median_quality;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
169 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
170 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
171 else{
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
172 $data{$key}->[$pvalue_column] = join("; ", @{$quality{$key}});
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
173 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
174
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
175 if($collapse){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
176 undef %seen;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
177 my @var_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$var_count{$key}};
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
178 undef %seen;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
179 my @tot_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$tot_count{$key}};
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
180 if(@var_counts == 0 and @tot_counts == 0){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
181 # do nothing, existing NAs okay
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
182 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
183 elsif(@var_counts == 1 and @tot_counts == 1){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
184 # agreement by all methods, just print the one in %data unless it's NA
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
185 $data{$key}->[$alt_cnt_column] = $var_counts[0] if $data{$key}->[$alt_cnt_column] eq "NA";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
186 $data{$key}->[$tot_cnt_column] = $tot_counts[0] if $data{$key}->[$tot_cnt_column] eq "NA";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
187 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
188 else{
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
189 # calculate the median for the collapsed value
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
190 my @sorted_var_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$var_count{$key}});
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
191 my $pivot = $#sorted_var_counts/2;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
192 my $median_var_count = $pivot == int($pivot) ? $sorted_var_counts[$pivot] : # arithmetic mean
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
193 int(($sorted_var_counts[int($pivot)]+$sorted_var_counts[int($pivot)+1])/2);
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
194 $data{$key}->[$alt_cnt_column] = $median_var_count;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
195 my @sorted_tot_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$tot_count{$key}});
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
196 $pivot = $#sorted_tot_counts/2;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
197 my $median_tot_count = $pivot == int($pivot) ? $sorted_tot_counts[$pivot] : # arithmetic mean
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
198 int(($sorted_tot_counts[int($pivot)]+$sorted_tot_counts[int($pivot)+1])/2);
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
199 $data{$key}->[$tot_cnt_column] = $median_tot_count;
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
200 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
201 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
202 else{
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
203 # list the raw numbers
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
204 $data{$key}->[$alt_cnt_column] = join("; ", @{$var_count{$key}});
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
205 $data{$key}->[$tot_cnt_column] = join("; ", @{$tot_count{$key}});
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
206 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
207
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
208 # to facilitate multiple rounds of combining, slash the extra column from the last round if it exists
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
209 $data{$key}->[$srcs_column] = join("; ", @{$methods{$key}});
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
210 for(my $i = 0; $i <= $#{$data{$key}}; $i++){
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
211 $data{$key}->[$i] = "" if not defined $data{$key}->[$i];
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
212 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
213 print OUT join("\t", @{$data{$key}}), "\n";
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
214 }
baf1543e8ae1 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
215 close(OUT);