Mercurial > repos > yusuf > hgvs_vcf
view hgvs_to_vcf @ 0:138d81f259c8 default tip
intial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:38:09 -0600 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use strict; use warnings; # Converts a HGVS tab delimited table into a VCF file for processing by tools that need that format. @ARGV == 4 or die "Usage: $0 <input hgvs.txt> <output.vcf> <sample id> <reference genome>\n"; open(HGVS, $ARGV[0]) or die "Cannot open $ARGV[0] for reading: $!\n"; my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $dbid_column, $cdna_hgvs_column, $aa_hgvs_column, $transcript_column, $zygosity_column, $caveats_column, $phase_column, $pvalue_column, $genename_column); my $headers = <HGVS>; chomp $headers; my @headers = split /\t/, $headers; my %req_columns = ( "Chr" => \$chr_column, "DNA From" => \$pos_column, "Gene Name" => \$genename_column, "Ref base" => \$ref_column, "Obs base" => \$alt_column, "Variant Reads" => \$alt_cnt_column, "Total Reads" => \$tot_cnt_column, "Variant DB ID" => \$dbid_column, "Transcript HGVS" => \$cdna_hgvs_column, "Protein HGVS" => \$aa_hgvs_column, "Selected transcript" => \$transcript_column, "Zygosity" => \$zygosity_column, "Caveats" => \$caveats_column, "Phase" => \$phase_column, "P-value" => \$pvalue_column); &load_header_columns(\%req_columns, \@headers); my $phasing = "none"; while(<HGVS>){ my @F = split /\t/, $_; if($F[$phase_column] ne ""){ $phasing = "partial"; last; } } close(HGVS); open(HGVS, $ARGV[0]) or die "Cannot open $ARGV[0] for reading: $!\n"; open(VCFOUT, ">$ARGV[1]") or die "Cannot open $ARGV[1] for writing: $!\n"; my ($sec, $min, $hr, $day, $month, $year) = localtime(); $year += 1900; $month = "0$month" if $month < 10; $day = "0$day" if $day < 10; $, = " "; print VCFOUT <<END; ##fileformat=VCFv4.1 ##fileDate=$year$month$day ##source=hgvs_to_vcf ##reference=$ARGV[3] ##phasing=$phasing ##commandline="@ARGV" ##FILTER=<ID=CAVEATS,Description="The variant call is possibly a false positive due to non-unique mapping, homopolymer stretch, etc."> ##INFO=<ID=GENE,Number=A,Type=String,Description="List of genes that have transcripts overlapping the variant region"> ##INFO=<ID=HGVS,Number=A,Type=String,Description="Effect in HGVS syntax (protein version if available, otherwise cDNA or genomic)"> ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> ##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype"> ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> ##FORMAT=<ID=RA,Number=1,Type=Integer,Description="Reference allele observation count"> ##FORMAT=<ID=AA,Number=1,Type=Integer,Description="Alternate allele observation count"> #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT $ARGV[2] END my $log10 = log(10); <HGVS>; # get rid of header line while(<HGVS>){ chomp; my @F = split /\t/, $_; my $filter = $F[$caveats_column] ne "" ? "CAVEATS" : "PASS"; my $ref = $F[$ref_column]; # always on + strand my $alt = $F[$alt_column]; my $qual = $F[$pvalue_column]; # to be coverted to MAQ if($qual == 0){ $qual = 100; } else{ $qual = -10*log($qual)/$log10; # log10 because log is actually ln in Perl } my $genotype = $F[$zygosity_column] =~ /homo/ ? "1/1" : "0/1"; # todo: handle compound het SNP my $transcript = $F[$transcript_column]; $transcript =~ s/;.*//; my $genenames = $F[$genename_column]; $genenames =~ s/;\s*/,/; print VCFOUT join("\t", $F[$chr_column], $F[$pos_column], ($F[$dbid_column] =~ /rs/ ? $F[$dbid_column] : "."), $ref, $alt, $qual, $filter, ($F[$genename_column] ne "" ? "GENE:".$F[$genename_column].";" : "")."HGVS:$transcript,".($F[$aa_hgvs_column] ne "NA" ? $F[$aa_hgvs_column]:$F[$cdna_hgvs_column]), "GT:GQ:RA:AA", "$genotype:$qual:".($F[$tot_cnt_column]-$F[$alt_cnt_column]).":".$F[$alt_cnt_column]),"\n"; } sub load_header_columns{ my ($reqs_hash_ref, $headers_array_ref) = @_; my %unfulfilled; for my $field_name (keys %$reqs_hash_ref){ $unfulfilled{$field_name} = 1; } for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){ for my $req_header_name (keys %$reqs_hash_ref){ if($req_header_name eq $headers_array_ref->[$i]){ ${$reqs_hash_ref->{$req_header_name}} = $i; delete $unfulfilled{$req_header_name}; last; } } } if(keys %unfulfilled){ die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n"; } }