annotate tools/cgatools17/testvariants2VCF-v2.pl @ 15:b5c879e950f5 draft default tip

planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 402ebee914f2286aa9d98223f501f06c1e4b9c22-dirty
author yhoogstrate
date Fri, 20 Nov 2015 03:50:36 -0500
parents 3a2e0f376f26
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
1 #!/usr/bin/perl -w
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
2 use strict;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
3
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
4 #Converts a cgatools testvariant output to a multi-sample VCF file (VCF spec 4.1)
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
5 #Requires cgatools to access reference genome encoded in .crr format (hg18.crr or hg19.crr)
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
6 #make sure cgatools is in the $PATH
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
7 #
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
8 #Example testvariant file:
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
9 #variantId chromosome begin end varType reference alleleSeq xRef GS19238 GS19239 GS19240
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
10 #6874944 chr5 20584031 20584032 snp C T dbsnp.119:rs10037487 00 01 11
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
11 #6874945 chr5 20584031 20584032 sub C TA 01 01 00
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
12 #6874946 chr5 20584031 20584032 sub C TG 00 01 00
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
13 #After converting to VCF:
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
14 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT GS19238 GS19239 GS19240
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
15 #chr5 20584031 6874944;6874945;6874946 AC AT,ATA,ATG . PASS NS=51;RSID=dbsnp.119:rs10037487,.,.;AF=0.57,0.02,0.00;DB GT 0/2 ./. 1/1
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
16 #
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
17 #Variants that share the same location (chr,begin,end) will be merged into one locus and their flags (0,1,N) will be converted into genotype calls
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
18 #Samples that are positive for more than two alleles within the same locus will be flagged and their genotype calls set to unknown (./.)
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
19 #Look at the sample GS19239 as such an example
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
20 #
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
21 #For non-SNP locus, VCF requires an extra reference base immediately upstream of the variant locus be included in the REF and ALT columns
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
22 #
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
23
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
24
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
25 die "Usage: $0 testvarOutput.txt hg19.crr > vcf.txt 2> runlog.txt\n\t" .
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
26 "testvarOutput.txt = output file from cgatools testvariants\n\t" .
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
27 "hg19.crr = reference genome in .crr format (must be the same as used in testvariants)\n\t" .
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
28 "vcf.txt = converted file in vcf format\n\t" .
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
29 "runlog.txt = log file\n" unless ($#ARGV == 1);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
30 die "Fail to find input file " . $ARGV[0] . "\n" unless (-e $ARGV[0]);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
31 die "Fail to find .crr file " . $ARGV[1] . "\n" unless (-e $ARGV[1]);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
32 die "Fail to execute cgatools: $!\n" if (system ('cgatools > /dev/null 2>&1') != 0);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
33
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
34 my (undef, undef, undef, $mday, $mon, $year) = localtime;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
35 my($timestamp)=sprintf ('%s%02d%02d', $year+1900, $mon+1, $mday);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
36 print <<EOF;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
37 ##fileformat=VCFv4.1
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
38 ##fileDate=$timestamp
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
39 ##source=CGA Tools v1.7.1 listvariants/testvariants
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
40 ##reference=$ARGV[1]
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
41 ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples with Fully Called Data">
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
42 ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
43 ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
44 ##INFO=<ID=RSID,Number=A,Type=String,Description="dbSNP rs ids">
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
45 ##FILTER=<ID=s50,Description="Less than 50% of samples are fully called">
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
46 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
47 EOF
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
48
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
49 &testvar2VCF (@ARGV);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
50
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
51 sub testvar2VCF {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
52 #variantId=0 chromosome=1 begin=2 end=3 varType=4 reference=5 alleleSeq=6 xRef=7 GS12885-1100-37-ASM=8
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
53 my ($inFile, $crrFile) = @_;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
54 my (@samples, %seen);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
55 open (IN, "$inFile") or die "Fail to open input file $inFile\n";
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
56 while (<IN>) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
57 chomp;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
58 my (@fs) = split (/\t/);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
59 if (/^variantId/) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
60 @samples = @fs[8..$#fs];
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
61 print join ("\t", ('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', @samples)), "\n";
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
62 next;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
63 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
64 next unless (/^\d/);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
65 my ($key) = $fs[1] . '-' . $fs[2] . '-' . $fs[3];
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
66 if (!%seen || exists $seen{$key}) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
67 push (@{$seen{$key}}, [@fs]);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
68 next;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
69 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
70 #has a new locus, need to process saved locus
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
71 &doSavedLocus (\%seen, \@samples, $crrFile);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
72 #reinitialize with the new locus
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
73 %seen = ();
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
74 push (@{$seen{$key}}, [@fs]);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
75 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
76 close IN;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
77 &doSavedLocus (\%seen, \@samples, $crrFile); #do the last locus
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
78 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
79
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
80 sub doSavedLocus {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
81 my ($seen, $samples, $crrFile) = @_;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
82 my ($lines) = values %$seen; #get all the lines that belong to the same locus
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
83 my ($chr, $begin, $end) = ($lines->[0]->[1], $lines->[0]->[2], $lines->[0]->[3]);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
84 my ($info_NS) = scalar @$samples; #num. of samples fully called
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
85 die "no_samples,$chr-$begin-$end\n" unless ($info_NS > 0);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
86 my ($info_DB) = ''; #dbSNP memebership
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
87 my (@info_RSid) = (); #dbSNP rs ids
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
88 my (%ref, @alt, @id, %alleleCalls);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
89 my ($format) = 'GT'; #GT (genotype) is the only value currently populated for each sample
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
90 my ($qual, $filter, $i) = ('.', 'PASS', 0); #no quality scores provided in the testvar output
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
91 my (@info_AF);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
92 #initialize every allele count to 0
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
93 for ($i=0; $i<=scalar @$lines; $i++) {$info_AF[$i] = 0;}
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
94 my ($pos) = $begin;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
95 #check the vartypes
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
96 my (%varTypes) = map {$_->[4] => 1} @$lines; #column 4 in the testvar output is varType
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
97 my (@varTypeKeys) = keys %varTypes;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
98 #check if this locus is SNP only
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
99 my ($snpOnly) = 0;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
100 $snpOnly = 1 if ($#varTypeKeys == 0 && $varTypeKeys[0] eq 'snp');
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
101 foreach my $line (@$lines) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
102 my ($refBase) = $line->[5];
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
103 my ($altBase) = $line->[6];
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
104 if (!$snpOnly) { #not a SNP only locus
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
105 my ($decodecrrCmd) = join ('', ("cgatools decodecrr --reference $crrFile --range $chr:", $begin - 1 , "-$begin 2> /dev/null"));
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
106 my ($re) = `$decodecrrCmd`;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
107 die "cgatools decodecrr failed $decodecrrCmd ", join ("\t", @$line), "\n" unless (defined $re);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
108 chomp ($re);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
109 $refBase = $re . $refBase;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
110 $altBase = $re . $altBase;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
111 $pos = $begin;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
112 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
113 $ref{$refBase} = 1;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
114 push (@alt, $altBase);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
115 push (@id, $line->[0]);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
116 if ($line->[7] =~ /dbsnp/) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
117 $info_DB = 'DB';
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
118 $line->[7] =~ s/;/-/g; #testvar use ; as delimiter which is reserved by the INFO column
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
119 push (@info_RSid, $line->[7]);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
120 } else {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
121 push (@info_RSid, '.');
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
122 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
123 #collect testvar's allele flags (00, 01, 11, 0N, NN, ...), which starts at column 8.
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
124 for ($i=0; $i<scalar @$samples; $i++) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
125 push (@{$alleleCalls{$samples->[$i]}}, $line->[$i+8]);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
126 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
127 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
128 $pos = $begin + 1 if ($snpOnly);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
129 my (@refBases) = keys %ref;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
130 die "mismatched_ref_base, " . join (",", (@id, @refBases)) . "\n" if ($#refBases != 0);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
131 my ($idstr) = join (";", @id);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
132 my ($altstr) = join (",", @alt);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
133 print join ("\t", ($chr, $pos, $idstr, $refBases[0], $altstr,$qual)), "\t";
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
134 my (@allGTcalls) = ();
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
135 #convert allele flags to genotype calls
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
136 foreach my $sample (@$samples) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
137 my ($sampleAlleleCalls) = $alleleCalls{$sample};
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
138 my ($sampleGTcall);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
139 if ($sampleAlleleCalls->[0] =~ /\S\S/) { #diploid site
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
140 $sampleGTcall = &mkDiploidGTcall ($sampleAlleleCalls);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
141 } else {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
142 $sampleGTcall = &mkHaploidGTcall ($sampleAlleleCalls);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
143 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
144 if ($sampleGTcall =~ /warning/) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
145 print STDERR "$sampleGTcall,$chr-$begin-$end,$idstr,$altstr,$sample,", join ("-", @$sampleAlleleCalls), "\n";
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
146 $sampleGTcall = '.';
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
147 $sampleGTcall = './.' if ($sampleAlleleCalls->[0] =~ /\S\S/);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
148 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
149 $info_NS-- if ($sampleGTcall =~ /\./);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
150 push (@allGTcalls, $sampleGTcall);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
151 $info_AF[$1]++ if ($sampleGTcall =~ /^(\d+)/);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
152 $info_AF[$1]++ if ($sampleGTcall =~ /^\d+\/(\d+)$/);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
153 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
154 $filter = 's50' if ($info_NS/scalar @$samples < 0.5);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
155 print "$filter\t";
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
156 my ($numAlleles) = 0;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
157 foreach (@info_AF) {$numAlleles += $_;}
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
158 $numAlleles = 1 if ($numAlleles == 0); #In some loci, all samples are no-called
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
159 my (@alleleFreq) = map (sprintf ('%.2f', $_/$numAlleles), @info_AF);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
160 shift @alleleFreq; #don't need to print the reference allele frequency
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
161 print "NS=$info_NS;RSID=", join (",", @info_RSid), ";AF=", join (",", @alleleFreq), ";$info_DB\t$format\t", join ("\t", @allGTcalls), "\n";
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
162 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
163
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
164 #Convert a list of allele calls to diploid VCF genotype calls
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
165 #[11] becomes 1/1
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
166 #[01] becomes 0/1
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
167 #[00] becomes 0/0
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
168 #[11, 00] becomes 1/1
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
169 #[01, 01] becomes 1/2
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
170 #[00, 01, 1N] becomes 2/3
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
171 #[01, 01, 01] becomes ./. (unknown)
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
172 #[1N, 00, 00, 00] becomes 1/.
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
173 sub mkDiploidGTcall {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
174 my ($sampleAlleleCalls) = @_;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
175 #[11, 00, NN, 00], [1N, NN, 01, 00], [1N, NN, 1N, 00], [01, NN, 01, 00], [1N, 00, 00, 00]
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
176 my ($numOfOnes) = 0;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
177 my ($sampleGTcall) = './.'; #diploid no-call in VCF
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
178 my (%merged);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
179 for (my $i=0; $i<scalar @$sampleAlleleCalls; $i++) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
180 push (@{$merged{$sampleAlleleCalls->[$i]}}, $i+1);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
181 my ($ones) = $sampleAlleleCalls->[$i] =~ tr/1/1/;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
182 $numOfOnes += $ones;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
183 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
184 return 'warning-more_than_two_1s' if ($numOfOnes > 2);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
185 return $merged{'11'}->[0] . '/' . $merged{'11'}->[0] if (exists $merged{'11'});
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
186 if (exists $merged{'01'}) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
187 return $merged{'01'}->[0] . '/' . $merged{'01'}->[1] if (defined $merged{'01'}->[1]);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
188 return $merged{'01'}->[0] . '/' . $merged{'1N'}->[0] if (exists $merged{'1N'});
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
189 return '0/' . $merged{'01'}->[0] if (!exists $merged{'NN'} && !exists $merged{'0N'});
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
190 return $merged{'01'}->[0] . '/.';
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
191 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
192 if (exists $merged{'1N'}) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
193 return $merged{'1N'}->[0] . '/' . $merged{'1N'}->[1] if (defined $merged{'1N'}->[1]);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
194 return $merged{'1N'}->[0] . '/.';
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
195 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
196 return '0/0' if (!exists $merged{'NN'} && !exists $merged{'0N'});
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
197 return '0/.' if (exists $merged{'0N'} && scalar @{$merged{'0N'}} == 1);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
198 return $sampleGTcall;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
199 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
200
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
201 #Convert a list of allele calls to haploid VCF genotype calls
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
202 #[1] becomes 1
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
203 #[N] becomes .
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
204 #[0] becomes 0
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
205 #[1, 0] becomes 1
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
206 #[0, 1, N] becomes 2
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
207 #[1, 1] becomes .
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
208 sub mkHaploidGTcall {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
209 my ($sampleAlleleCalls) = @_;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
210 my ($sampleGTcall) = '.'; #haploid no-call in VCF
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
211 my (%merged);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
212 for (my $i=0; $i<scalar @$sampleAlleleCalls; $i++) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
213 push (@{$merged{$sampleAlleleCalls->[$i]}}, $i+1);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
214 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
215 if (exists $merged{'1'}) {
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
216 return "warning-more_than_one_1_haploid" if (defined $merged{'1'}->[1]);
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
217 return $merged{'1'}->[0];
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
218 }
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
219 return '0' if (!exists $merged{'N'});
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
220 return $sampleGTcall;
3a2e0f376f26 Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff changeset
221 }