annotate hgvs_to_vcf @ 0:138d81f259c8 default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:38:09 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use strict;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use warnings;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6 # Converts a HGVS tab delimited table into a VCF file for processing by tools that need that format.
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 @ARGV == 4 or die "Usage: $0 <input hgvs.txt> <output.vcf> <sample id> <reference genome>\n";
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 open(HGVS, $ARGV[0])
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 or die "Cannot open $ARGV[0] for reading: $!\n";
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $dbid_column, $cdna_hgvs_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 $aa_hgvs_column, $transcript_column, $zygosity_column, $caveats_column, $phase_column, $pvalue_column, $genename_column);
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 my $headers = <HGVS>;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 chomp $headers;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 my @headers = split /\t/, $headers;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 my %req_columns = (
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 "Chr" => \$chr_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 "DNA From" => \$pos_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 "Gene Name" => \$genename_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 "Ref base" => \$ref_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 "Obs base" => \$alt_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 "Variant Reads" => \$alt_cnt_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 "Total Reads" => \$tot_cnt_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 "Variant DB ID" => \$dbid_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 "Transcript HGVS" => \$cdna_hgvs_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 "Protein HGVS" => \$aa_hgvs_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 "Selected transcript" => \$transcript_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 "Zygosity" => \$zygosity_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 "Caveats" => \$caveats_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 "Phase" => \$phase_column,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 "P-value" => \$pvalue_column);
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 &load_header_columns(\%req_columns, \@headers);
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 my $phasing = "none";
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 while(<HGVS>){
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 my @F = split /\t/, $_;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 if($F[$phase_column] ne ""){
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 $phasing = "partial";
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 last;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 }
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 }
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 close(HGVS);
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 open(HGVS, $ARGV[0])
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 or die "Cannot open $ARGV[0] for reading: $!\n";
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 open(VCFOUT, ">$ARGV[1]")
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 or die "Cannot open $ARGV[1] for writing: $!\n";
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 my ($sec, $min, $hr, $day, $month, $year) = localtime();
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 $year += 1900;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 $month = "0$month" if $month < 10;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 $day = "0$day" if $day < 10;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 $, = " ";
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 print VCFOUT <<END;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 ##fileformat=VCFv4.1
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 ##fileDate=$year$month$day
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 ##source=hgvs_to_vcf
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 ##reference=$ARGV[3]
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 ##phasing=$phasing
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 ##commandline="@ARGV"
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 ##FILTER=<ID=CAVEATS,Description="The variant call is possibly a false positive due to non-unique mapping, homopolymer stretch, etc.">
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 ##INFO=<ID=GENE,Number=A,Type=String,Description="List of genes that have transcripts overlapping the variant region">
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 ##INFO=<ID=HGVS,Number=A,Type=String,Description="Effect in HGVS syntax (protein version if available, otherwise cDNA or genomic)">
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 ##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 ##FORMAT=<ID=RA,Number=1,Type=Integer,Description="Reference allele observation count">
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 ##FORMAT=<ID=AA,Number=1,Type=Integer,Description="Alternate allele observation count">
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT $ARGV[2]
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 END
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 my $log10 = log(10);
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 <HGVS>; # get rid of header line
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 while(<HGVS>){
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 chomp;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 my @F = split /\t/, $_;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 my $filter = $F[$caveats_column] ne "" ? "CAVEATS" : "PASS";
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 my $ref = $F[$ref_column]; # always on + strand
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 my $alt = $F[$alt_column];
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 my $qual = $F[$pvalue_column]; # to be coverted to MAQ
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 if($qual == 0){
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 $qual = 100;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 }
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82 else{
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 $qual = -10*log($qual)/$log10; # log10 because log is actually ln in Perl
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 }
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 my $genotype = $F[$zygosity_column] =~ /homo/ ? "1/1" : "0/1"; # todo: handle compound het SNP
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 my $transcript = $F[$transcript_column];
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 $transcript =~ s/;.*//;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 my $genenames = $F[$genename_column];
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 $genenames =~ s/;\s*/,/;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 print VCFOUT join("\t", $F[$chr_column], $F[$pos_column], ($F[$dbid_column] =~ /rs/ ? $F[$dbid_column] : "."), $ref, $alt, $qual, $filter,
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 ($F[$genename_column] ne "" ? "GENE:".$F[$genename_column].";" : "")."HGVS:$transcript,".($F[$aa_hgvs_column] ne "NA" ? $F[$aa_hgvs_column]:$F[$cdna_hgvs_column]),
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 "GT:GQ:RA:AA", "$genotype:$qual:".($F[$tot_cnt_column]-$F[$alt_cnt_column]).":".$F[$alt_cnt_column]),"\n";
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 }
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 sub load_header_columns{
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 my ($reqs_hash_ref, $headers_array_ref) = @_;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 my %unfulfilled;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 for my $field_name (keys %$reqs_hash_ref){
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 $unfulfilled{$field_name} = 1;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 }
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 for my $req_header_name (keys %$reqs_hash_ref){
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104 if($req_header_name eq $headers_array_ref->[$i]){
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 ${$reqs_hash_ref->{$req_header_name}} = $i;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 delete $unfulfilled{$req_header_name};
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 last;
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 }
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 }
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 }
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 if(keys %unfulfilled){
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n";
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 }
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114 }
138d81f259c8 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115