Mercurial > repos > yusuf > combine_hgvs_annotations
diff combine_hgvs_tables @ 0:baf1543e8ae1 default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:27:49 -0600 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/combine_hgvs_tables Wed Mar 25 13:27:49 2015 -0600 @@ -0,0 +1,215 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +if(@ARGV == 1 and $ARGV[0] eq "-v"){ + print "Version 1.0\n"; + exit; +} + +my $quiet = 0; +if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){ + $quiet = 1; + shift @ARGV; +} + +@ARGV >= 6 and not @ARGV%2 or die "Usage: $0 [-q(uiet)] <true|false> <output file> <method_name1> <hgvs_file_1.txt> <method_name2> <hgvs_file_2.txt> ...\n", + "Where true or false determines if the genotypes calls should all be reported (if false, they are collapsed to the unique set of values observed)\n"; + +my %zygosity; +my %quality; +my %var_count; +my %tot_count; +my %methods; +my %data; +my $chr_prefix_exists = 0; # do any of the tools report chr1, if so we need to ensure 1 get reported as chr1 later for consistency across tools + +my $collapse = $ARGV[0] =~ /^true$/i; + +open(OUT, ">$ARGV[1]") + or die "Cannot open $ARGV[1] for writing: $!\n"; + +my @headers; +my ($chr_column, $zygosity_column, $pvalue_column, $alt_cnt_column, $tot_cnt_column, $transcript_column, $cdna_hgvs_column, $pos_column, $to_column, $srcs_column); +for(my $i=2; $i<$#ARGV; $i+=2){ + my $method_name = $ARGV[$i]; + my $infile = $ARGV[$i+1]; + print STDERR "Processing $infile...\n" unless $quiet; + open(IN, $infile) + or die "Cannot open $infile for reading: $!\n"; + my $header = <IN>; # header + chomp $header; + if($i == 2){ + @headers = split /\t/, $header; + for(my $j = 0; $j <= $#headers; $j++){ + if($headers[$j] eq "Chr"){ + $chr_column = $j; + } + elsif($headers[$j] eq "Zygosity"){ + $zygosity_column = $j; + } + elsif($headers[$j] eq "P-value"){ + $pvalue_column = $j; + } + elsif($headers[$j] eq "Variant Reads"){ + $alt_cnt_column = $j; + } + elsif($headers[$j] eq "Total Reads"){ + $tot_cnt_column = $j; + } + elsif($headers[$j] eq "Selected transcript"){ + $transcript_column = $j; + } + elsif($headers[$j] eq "Transcript HGVS"){ + $cdna_hgvs_column = $j; + } + elsif($headers[$j] eq "DNA From"){ + $pos_column = $j; + } + elsif($headers[$j] eq "DNA To"){ + $to_column = $j; + } + elsif($headers[$j] eq "Sources"){ + $srcs_column = $j; + } + } + if(defined $srcs_column){ + # print header as-is + print OUT "$header\n"; + } + else{ + # print header from the first file, with extra column for methods at the end + push @headers, "Sources"; + $srcs_column = $#headers; + print OUT join("\t", @headers), "\n"; + } + } + else{ + # Make sure the columns are the same in the various files + my @latest_headers = split /\t/, $header; + for(my $j = 0; $j <= $#latest_headers && $j <= $#headers; $j++){ + if($latest_headers[$j] ne $headers[$j]){ + die "Header column ", $j+1, "($latest_headers[$j]) in $ARGV[$i] is different from the equivalent column label ($headers[$j]) in $ARGV[2]. Aborting, cannot merge the files.\n"; + } + } + } + while(<IN>){ + chomp; + my @F = split /\t/, $_, -1; # -1 to keep trailing blank fields + if(not $chr_prefix_exists and $F[$chr_column] =~ /^chr/){ + $chr_prefix_exists = 1; + } + $F[$chr_column] =~ s/^chr//; # for consistency + my $key = "$F[$transcript_column]:$F[$cdna_hgvs_column]"; # transcript_id:cdna_hgvs is shared id for common variants amongst files + # record disagreement on zygosity if any + if(not defined $zygosity{$key}){ + $zygosity{$key} = []; + $quality{$key} = []; + $var_count{$key} = []; + $tot_count{$key} = []; + $data{$key} = \@F; + #print STDERR "Missing fields (only have ", scalar(@F), " for input '$_'\n" if $#F < $#headers; + } + push @{$zygosity{$key}}, split(/; /,$F[$zygosity_column]); + push @{$quality{$key}}, $F[$pvalue_column]; + push @{$var_count{$key}}, $F[$alt_cnt_column]; + push @{$tot_count{$key}}, $F[$tot_cnt_column]; + push @{$methods{$key}}, $method_name; + } + close(IN); +} + +if($chr_prefix_exists){ + for my $key (keys %data){ + # assuming there is a mix of chr1 and 1, always add the chr for consistency + $data{$key}->[$chr_column] = "chr".$data{$key}->[$chr_column] unless $data{$key}->[$chr_column] =~ /^chr/; + } +} + +# Sort by chr, then pos, then transcript_id +for my $key (sort {$data{$a}->[$chr_column] cmp $data{$b}->[$chr_column] or + $data{$a}->[$pos_column] <=> $data{$b}->[$pos_column] or + $data{$a}->[$to_column] <=> $data{$b}->[$to_column] or + $data{$a}->[$transcript_column] cmp $data{$b}->[$transcript_column]} keys %data){ + # Before printing a line, merge the zygosity, variant quality, read count data if requested + my %seen; + if($collapse){ + my @zygosities = grep {$_ ne "NA" and not $seen{$_}++} @{$zygosity{$key}}; + if(@zygosities == 0){ + # do nothing, existing NA in 7th column will be a fine answer as they are all like tha + } + elsif(@zygosities == 1){ + # agreement by all methods, just print the one not NA + $data{$key}->[$zygosity_column] = $zygosities[0] if $data{$key}->[$zygosity_column] eq "NA"; + } + else{ + $data{$key}->[$zygosity_column] = join("/", sort keys %seen); # list all unique values that occur + } + } + else{ + $data{$key}->[$zygosity_column] = join("; ", @{$zygosity{$key}}); + } + + if($collapse){ + undef %seen; + my @qualities = grep {$_ ne "NA" and not $seen{$_}++} @{$quality{$key}}; + if(@qualities == 0){ + # do nothing, existing NA in 8th column will be a fine answer as they are all like that + } + elsif(@qualities == 1){ + # agreement by all methods, just print the one not NA + $data{$key}->[$pvalue_column] = $qualities[0] if $data{$key}->[8] eq "NA"; + } + else{ + # calculate the median for the collapsed value + my @sorted_qualities = sort {$a <=> $b} grep({$_ ne "NA"} @{$quality{$key}}); + my $median_quality = $sorted_qualities[int($#sorted_qualities/2)]; # rounds down (actually better score as this is a p-value) + $data{$key}->[$pvalue_column] = $median_quality; + } + } + else{ + $data{$key}->[$pvalue_column] = join("; ", @{$quality{$key}}); + } + + if($collapse){ + undef %seen; + my @var_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$var_count{$key}}; + undef %seen; + my @tot_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$tot_count{$key}}; + if(@var_counts == 0 and @tot_counts == 0){ + # do nothing, existing NAs okay + } + elsif(@var_counts == 1 and @tot_counts == 1){ + # agreement by all methods, just print the one in %data unless it's NA + $data{$key}->[$alt_cnt_column] = $var_counts[0] if $data{$key}->[$alt_cnt_column] eq "NA"; + $data{$key}->[$tot_cnt_column] = $tot_counts[0] if $data{$key}->[$tot_cnt_column] eq "NA"; + } + else{ + # calculate the median for the collapsed value + my @sorted_var_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$var_count{$key}}); + my $pivot = $#sorted_var_counts/2; + my $median_var_count = $pivot == int($pivot) ? $sorted_var_counts[$pivot] : # arithmetic mean + int(($sorted_var_counts[int($pivot)]+$sorted_var_counts[int($pivot)+1])/2); + $data{$key}->[$alt_cnt_column] = $median_var_count; + my @sorted_tot_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$tot_count{$key}}); + $pivot = $#sorted_tot_counts/2; + my $median_tot_count = $pivot == int($pivot) ? $sorted_tot_counts[$pivot] : # arithmetic mean + int(($sorted_tot_counts[int($pivot)]+$sorted_tot_counts[int($pivot)+1])/2); + $data{$key}->[$tot_cnt_column] = $median_tot_count; + } + } + else{ + # list the raw numbers + $data{$key}->[$alt_cnt_column] = join("; ", @{$var_count{$key}}); + $data{$key}->[$tot_cnt_column] = join("; ", @{$tot_count{$key}}); + } + + # to facilitate multiple rounds of combining, slash the extra column from the last round if it exists + $data{$key}->[$srcs_column] = join("; ", @{$methods{$key}}); + for(my $i = 0; $i <= $#{$data{$key}}; $i++){ + $data{$key}->[$i] = "" if not defined $data{$key}->[$i]; + } + print OUT join("\t", @{$data{$key}}), "\n"; +} +close(OUT);