diff tools/cgatools17/testvariants2VCF-v2.pl @ 1:3a2e0f376f26 draft

Minor change to tv2vcf.xml to allow for workflow automation
author dgdekoning
date Wed, 21 Oct 2015 10:09:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/cgatools17/testvariants2VCF-v2.pl	Wed Oct 21 10:09:15 2015 -0400
@@ -0,0 +1,221 @@
+#!/usr/bin/perl -w
+use strict;
+
+#Converts a cgatools testvariant output to a multi-sample VCF file (VCF spec 4.1)
+#Requires cgatools to access reference genome encoded in .crr format (hg18.crr or hg19.crr)
+#make sure cgatools is in the $PATH
+#
+#Example testvariant file:
+#variantId       chromosome      begin   end     varType reference       alleleSeq       xRef    GS19238 GS19239	GS19240
+#6874944 chr5    20584031        20584032        snp     C       T       dbsnp.119:rs10037487	00	01		11
+#6874945 chr5    20584031        20584032        sub     C       TA              		01      01      	00
+#6874946 chr5    20584031        20584032        sub     C       TG              		00      01      	00
+#After converting to VCF:
+#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT	GS19238 GS19239        GS19240
+#chr5    20584031        6874944;6874945;6874946 AC      AT,ATA,ATG      .       PASS    NS=51;RSID=dbsnp.119:rs10037487,.,.;AF=0.57,0.02,0.00;DB    GT      0/2	./.	1/1
+#
+#Variants that share the same location (chr,begin,end) will be merged into one locus and their flags (0,1,N) will be converted into genotype calls
+#Samples that are positive for more than two alleles within the same locus will be flagged and their genotype calls set to unknown (./.)
+#Look at the sample GS19239 as such an example
+#
+#For non-SNP locus, VCF requires an extra reference base immediately upstream of the variant locus be included in the REF and ALT columns
+#
+
+
+die "Usage: $0 testvarOutput.txt hg19.crr > vcf.txt 2> runlog.txt\n\t" .
+	"testvarOutput.txt = output file from cgatools testvariants\n\t" .
+	"hg19.crr = reference genome in .crr format (must be the same as used in testvariants)\n\t" .
+	"vcf.txt = converted file in vcf format\n\t" .
+	"runlog.txt = log file\n" unless ($#ARGV == 1);
+die "Fail to find input file " . $ARGV[0] . "\n" unless (-e $ARGV[0]);
+die "Fail to find .crr file " . $ARGV[1] . "\n" unless (-e $ARGV[1]);
+die "Fail to execute cgatools: $!\n" if (system ('cgatools > /dev/null 2>&1') != 0);
+
+my (undef, undef, undef, $mday, $mon, $year) = localtime;
+my($timestamp)=sprintf ('%s%02d%02d', $year+1900, $mon+1, $mday);
+print <<EOF;
+##fileformat=VCFv4.1
+##fileDate=$timestamp
+##source=CGA Tools v1.7.1 listvariants/testvariants
+##reference=$ARGV[1]
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples with Fully Called Data">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
+##INFO=<ID=RSID,Number=A,Type=String,Description="dbSNP rs ids">
+##FILTER=<ID=s50,Description="Less than 50% of samples are fully called">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+EOF
+
+&testvar2VCF (@ARGV);
+
+sub testvar2VCF {
+#variantId=0       chromosome=1      begin=2   end=3     varType=4 reference=5       alleleSeq=6       xRef=7    GS12885-1100-37-ASM=8
+	my ($inFile, $crrFile) = @_;
+	my (@samples, %seen);
+	open (IN, "$inFile") or die "Fail to open input file $inFile\n";
+	while (<IN>) {
+		chomp;
+		my (@fs) = split (/\t/);
+		if (/^variantId/) {
+			@samples = @fs[8..$#fs];
+			print join ("\t", ('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', @samples)), "\n";
+			next;
+		}
+		next unless (/^\d/);
+		my ($key) = $fs[1] . '-' . $fs[2] . '-' . $fs[3];
+		if (!%seen || exists $seen{$key}) {
+			push (@{$seen{$key}}, [@fs]);
+			next;
+		}
+		#has a new locus, need to process saved locus
+		&doSavedLocus (\%seen, \@samples, $crrFile);
+		#reinitialize with the new locus
+		%seen = ();
+		push (@{$seen{$key}}, [@fs]);
+	}
+	close IN;
+	&doSavedLocus (\%seen, \@samples, $crrFile); #do the last locus
+}
+
+sub doSavedLocus {
+	my ($seen, $samples, $crrFile) = @_;
+	my ($lines) = values %$seen; #get all the lines that belong to the same locus
+	my ($chr, $begin, $end)  = ($lines->[0]->[1], $lines->[0]->[2], $lines->[0]->[3]);
+	my ($info_NS) = scalar @$samples; #num. of samples fully called
+	die "no_samples,$chr-$begin-$end\n" unless ($info_NS > 0);
+	my ($info_DB) = ''; #dbSNP memebership
+	my (@info_RSid) = (); #dbSNP rs ids
+	my (%ref, @alt, @id, %alleleCalls);
+	my ($format) = 'GT'; #GT (genotype) is the only value currently populated for each sample
+	my ($qual, $filter, $i) = ('.', 'PASS', 0); #no quality scores provided in the testvar output
+	my (@info_AF);
+	#initialize every allele count to 0
+	for ($i=0; $i<=scalar @$lines; $i++) {$info_AF[$i] = 0;}
+	my ($pos) = $begin;
+	#check the vartypes
+	my (%varTypes) = map {$_->[4] => 1} @$lines; #column 4 in the testvar output is varType
+	my (@varTypeKeys) = keys %varTypes;
+	#check if this locus is SNP only
+	my ($snpOnly) = 0;
+	$snpOnly = 1 if ($#varTypeKeys == 0 && $varTypeKeys[0] eq 'snp');
+	foreach my $line (@$lines) {
+		my ($refBase) = $line->[5];
+		my ($altBase) = $line->[6];
+		if (!$snpOnly) { #not a SNP only locus
+			my ($decodecrrCmd) = join ('', ("cgatools decodecrr --reference $crrFile --range $chr:", $begin - 1 , "-$begin 2> /dev/null"));
+			my ($re) = `$decodecrrCmd`;
+			die "cgatools decodecrr failed $decodecrrCmd ", join ("\t", @$line), "\n" unless (defined $re);
+			chomp ($re);
+			$refBase = $re . $refBase;
+			$altBase = $re . $altBase;
+			$pos = $begin;
+		}
+		$ref{$refBase} = 1;
+		push (@alt, $altBase);
+		push (@id, $line->[0]);
+		if ($line->[7] =~ /dbsnp/) {
+			$info_DB = 'DB';
+			$line->[7] =~ s/;/-/g; #testvar use ; as delimiter which is reserved by the INFO column
+			push (@info_RSid, $line->[7]);
+		} else {
+			push (@info_RSid, '.');
+		}
+		#collect testvar's allele flags (00, 01, 11, 0N, NN, ...), which starts at column 8.
+		for ($i=0; $i<scalar @$samples; $i++) {
+			push (@{$alleleCalls{$samples->[$i]}}, $line->[$i+8]);
+		}
+	}
+	$pos = $begin + 1 if ($snpOnly);
+	my (@refBases) = keys %ref;
+	die "mismatched_ref_base, " . join (",", (@id, @refBases)) . "\n" if ($#refBases != 0);
+	my ($idstr) = join (";", @id);
+	my ($altstr) = join (",", @alt);
+	print join ("\t", ($chr, $pos, $idstr, $refBases[0], $altstr,$qual)), "\t";
+	my (@allGTcalls) = ();
+	#convert allele flags to genotype calls
+	foreach my $sample  (@$samples) {
+		my ($sampleAlleleCalls) = $alleleCalls{$sample};
+		my ($sampleGTcall);
+		if ($sampleAlleleCalls->[0] =~ /\S\S/) { #diploid site
+			$sampleGTcall = &mkDiploidGTcall ($sampleAlleleCalls);
+		} else {
+			$sampleGTcall = &mkHaploidGTcall ($sampleAlleleCalls);
+		}
+		if ($sampleGTcall =~ /warning/) {
+			print STDERR "$sampleGTcall,$chr-$begin-$end,$idstr,$altstr,$sample,", join ("-", @$sampleAlleleCalls), "\n";
+			$sampleGTcall = '.';
+			$sampleGTcall = './.' if ($sampleAlleleCalls->[0] =~ /\S\S/);
+		}
+		$info_NS-- if ($sampleGTcall =~ /\./);
+		push (@allGTcalls, $sampleGTcall);
+		$info_AF[$1]++ if ($sampleGTcall =~ /^(\d+)/);
+		$info_AF[$1]++ if ($sampleGTcall =~ /^\d+\/(\d+)$/);
+	}
+	$filter = 's50' if ($info_NS/scalar @$samples < 0.5);
+	print "$filter\t";
+	my ($numAlleles) = 0;
+	foreach (@info_AF) {$numAlleles += $_;}
+	$numAlleles = 1 if ($numAlleles == 0); #In some loci, all samples are no-called
+	my (@alleleFreq) = map (sprintf ('%.2f', $_/$numAlleles), @info_AF);
+	shift @alleleFreq; #don't need to print the reference allele frequency
+	print "NS=$info_NS;RSID=", join (",", @info_RSid), ";AF=", join (",", @alleleFreq), ";$info_DB\t$format\t", join ("\t", @allGTcalls), "\n";
+}
+
+#Convert a list of allele calls to diploid VCF genotype calls
+#[11] becomes 1/1
+#[01] becomes 0/1
+#[00] becomes 0/0
+#[11, 00] becomes 1/1
+#[01, 01] becomes 1/2
+#[00, 01, 1N] becomes 2/3
+#[01, 01, 01] becomes ./. (unknown)
+#[1N, 00, 00, 00] becomes 1/.
+sub mkDiploidGTcall {
+	my ($sampleAlleleCalls) = @_;
+	#[11, 00, NN, 00], [1N, NN, 01, 00], [1N, NN, 1N, 00], [01, NN, 01, 00], [1N, 00, 00, 00]
+	my ($numOfOnes) = 0;
+	my ($sampleGTcall) = './.'; #diploid no-call in VCF
+	my (%merged);
+	for (my $i=0; $i<scalar @$sampleAlleleCalls; $i++) {
+		push (@{$merged{$sampleAlleleCalls->[$i]}}, $i+1);
+		my ($ones) = $sampleAlleleCalls->[$i] =~ tr/1/1/;
+		$numOfOnes += $ones;
+	}
+	return 'warning-more_than_two_1s' if ($numOfOnes > 2);
+	return $merged{'11'}->[0] . '/' . $merged{'11'}->[0] if (exists $merged{'11'});
+	if (exists $merged{'01'}) {
+		return $merged{'01'}->[0] . '/' . $merged{'01'}->[1] if (defined $merged{'01'}->[1]);
+		return $merged{'01'}->[0] . '/' . $merged{'1N'}->[0] if (exists $merged{'1N'});
+		return '0/' . $merged{'01'}->[0] if (!exists $merged{'NN'} && !exists $merged{'0N'});
+		return $merged{'01'}->[0] . '/.';
+	}
+	if (exists $merged{'1N'}) {
+		return $merged{'1N'}->[0] . '/' . $merged{'1N'}->[1] if (defined $merged{'1N'}->[1]);
+		return $merged{'1N'}->[0] . '/.';
+	}
+	return '0/0' if (!exists $merged{'NN'} && !exists $merged{'0N'});
+	return '0/.' if (exists $merged{'0N'} && scalar @{$merged{'0N'}} == 1);
+	return $sampleGTcall;
+}
+
+#Convert a list of allele calls to haploid VCF genotype calls
+#[1] becomes 1
+#[N] becomes .
+#[0] becomes 0
+#[1, 0] becomes 1
+#[0, 1, N] becomes 2
+#[1, 1] becomes .
+sub mkHaploidGTcall {
+	my ($sampleAlleleCalls) = @_;
+	my ($sampleGTcall) = '.'; #haploid no-call in VCF
+	my (%merged);
+	for (my $i=0; $i<scalar @$sampleAlleleCalls; $i++) {
+		push (@{$merged{$sampleAlleleCalls->[$i]}}, $i+1);
+	}
+	if (exists $merged{'1'}) {
+		return "warning-more_than_one_1_haploid" if (defined $merged{'1'}->[1]);
+		return $merged{'1'}->[0];
+	}
+	return '0' if (!exists $merged{'N'});
+	return $sampleGTcall;
+}