Mercurial > repos > saskia-hiltemann > cgatools_v17
annotate tools/cgatools17/testvariants2VCF-v2.pl @ 6:3b532dc3864b draft
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit c475b4222a15cdadc6085865f4d13426249fec25-dirty
author | yhoogstrate |
---|---|
date | Wed, 11 Nov 2015 03:55:01 -0500 |
parents | 3a2e0f376f26 |
children |
rev | line source |
---|---|
1
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
1 #!/usr/bin/perl -w |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
2 use strict; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
3 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
4 #Converts a cgatools testvariant output to a multi-sample VCF file (VCF spec 4.1) |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
5 #Requires cgatools to access reference genome encoded in .crr format (hg18.crr or hg19.crr) |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
6 #make sure cgatools is in the $PATH |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
7 # |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
8 #Example testvariant file: |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
9 #variantId chromosome begin end varType reference alleleSeq xRef GS19238 GS19239 GS19240 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
10 #6874944 chr5 20584031 20584032 snp C T dbsnp.119:rs10037487 00 01 11 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
11 #6874945 chr5 20584031 20584032 sub C TA 01 01 00 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
12 #6874946 chr5 20584031 20584032 sub C TG 00 01 00 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
13 #After converting to VCF: |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
14 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT GS19238 GS19239 GS19240 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
15 #chr5 20584031 6874944;6874945;6874946 AC AT,ATA,ATG . PASS NS=51;RSID=dbsnp.119:rs10037487,.,.;AF=0.57,0.02,0.00;DB GT 0/2 ./. 1/1 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
16 # |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
17 #Variants that share the same location (chr,begin,end) will be merged into one locus and their flags (0,1,N) will be converted into genotype calls |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
18 #Samples that are positive for more than two alleles within the same locus will be flagged and their genotype calls set to unknown (./.) |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
19 #Look at the sample GS19239 as such an example |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
20 # |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
21 #For non-SNP locus, VCF requires an extra reference base immediately upstream of the variant locus be included in the REF and ALT columns |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
22 # |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
23 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
24 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
25 die "Usage: $0 testvarOutput.txt hg19.crr > vcf.txt 2> runlog.txt\n\t" . |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
26 "testvarOutput.txt = output file from cgatools testvariants\n\t" . |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
27 "hg19.crr = reference genome in .crr format (must be the same as used in testvariants)\n\t" . |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
28 "vcf.txt = converted file in vcf format\n\t" . |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
29 "runlog.txt = log file\n" unless ($#ARGV == 1); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
30 die "Fail to find input file " . $ARGV[0] . "\n" unless (-e $ARGV[0]); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
31 die "Fail to find .crr file " . $ARGV[1] . "\n" unless (-e $ARGV[1]); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
32 die "Fail to execute cgatools: $!\n" if (system ('cgatools > /dev/null 2>&1') != 0); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
33 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
34 my (undef, undef, undef, $mday, $mon, $year) = localtime; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
35 my($timestamp)=sprintf ('%s%02d%02d', $year+1900, $mon+1, $mday); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
36 print <<EOF; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
37 ##fileformat=VCFv4.1 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
38 ##fileDate=$timestamp |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
39 ##source=CGA Tools v1.7.1 listvariants/testvariants |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
40 ##reference=$ARGV[1] |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
41 ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples with Fully Called Data"> |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
42 ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency"> |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
43 ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership"> |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
44 ##INFO=<ID=RSID,Number=A,Type=String,Description="dbSNP rs ids"> |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
45 ##FILTER=<ID=s50,Description="Less than 50% of samples are fully called"> |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
46 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
47 EOF |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
48 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
49 &testvar2VCF (@ARGV); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
50 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
51 sub testvar2VCF { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
52 #variantId=0 chromosome=1 begin=2 end=3 varType=4 reference=5 alleleSeq=6 xRef=7 GS12885-1100-37-ASM=8 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
53 my ($inFile, $crrFile) = @_; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
54 my (@samples, %seen); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
55 open (IN, "$inFile") or die "Fail to open input file $inFile\n"; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
56 while (<IN>) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
57 chomp; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
58 my (@fs) = split (/\t/); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
59 if (/^variantId/) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
60 @samples = @fs[8..$#fs]; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
61 print join ("\t", ('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', @samples)), "\n"; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
62 next; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
63 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
64 next unless (/^\d/); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
65 my ($key) = $fs[1] . '-' . $fs[2] . '-' . $fs[3]; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
66 if (!%seen || exists $seen{$key}) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
67 push (@{$seen{$key}}, [@fs]); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
68 next; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
69 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
70 #has a new locus, need to process saved locus |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
71 &doSavedLocus (\%seen, \@samples, $crrFile); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
72 #reinitialize with the new locus |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
73 %seen = (); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
74 push (@{$seen{$key}}, [@fs]); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
75 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
76 close IN; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
77 &doSavedLocus (\%seen, \@samples, $crrFile); #do the last locus |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
78 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
79 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
80 sub doSavedLocus { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
81 my ($seen, $samples, $crrFile) = @_; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
82 my ($lines) = values %$seen; #get all the lines that belong to the same locus |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
83 my ($chr, $begin, $end) = ($lines->[0]->[1], $lines->[0]->[2], $lines->[0]->[3]); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
84 my ($info_NS) = scalar @$samples; #num. of samples fully called |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
85 die "no_samples,$chr-$begin-$end\n" unless ($info_NS > 0); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
86 my ($info_DB) = ''; #dbSNP memebership |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
87 my (@info_RSid) = (); #dbSNP rs ids |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
88 my (%ref, @alt, @id, %alleleCalls); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
89 my ($format) = 'GT'; #GT (genotype) is the only value currently populated for each sample |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
90 my ($qual, $filter, $i) = ('.', 'PASS', 0); #no quality scores provided in the testvar output |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
91 my (@info_AF); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
92 #initialize every allele count to 0 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
93 for ($i=0; $i<=scalar @$lines; $i++) {$info_AF[$i] = 0;} |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
94 my ($pos) = $begin; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
95 #check the vartypes |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
96 my (%varTypes) = map {$_->[4] => 1} @$lines; #column 4 in the testvar output is varType |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
97 my (@varTypeKeys) = keys %varTypes; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
98 #check if this locus is SNP only |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
99 my ($snpOnly) = 0; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
100 $snpOnly = 1 if ($#varTypeKeys == 0 && $varTypeKeys[0] eq 'snp'); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
101 foreach my $line (@$lines) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
102 my ($refBase) = $line->[5]; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
103 my ($altBase) = $line->[6]; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
104 if (!$snpOnly) { #not a SNP only locus |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
105 my ($decodecrrCmd) = join ('', ("cgatools decodecrr --reference $crrFile --range $chr:", $begin - 1 , "-$begin 2> /dev/null")); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
106 my ($re) = `$decodecrrCmd`; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
107 die "cgatools decodecrr failed $decodecrrCmd ", join ("\t", @$line), "\n" unless (defined $re); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
108 chomp ($re); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
109 $refBase = $re . $refBase; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
110 $altBase = $re . $altBase; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
111 $pos = $begin; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
112 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
113 $ref{$refBase} = 1; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
114 push (@alt, $altBase); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
115 push (@id, $line->[0]); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
116 if ($line->[7] =~ /dbsnp/) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
117 $info_DB = 'DB'; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
118 $line->[7] =~ s/;/-/g; #testvar use ; as delimiter which is reserved by the INFO column |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
119 push (@info_RSid, $line->[7]); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
120 } else { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
121 push (@info_RSid, '.'); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
122 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
123 #collect testvar's allele flags (00, 01, 11, 0N, NN, ...), which starts at column 8. |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
124 for ($i=0; $i<scalar @$samples; $i++) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
125 push (@{$alleleCalls{$samples->[$i]}}, $line->[$i+8]); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
126 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
127 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
128 $pos = $begin + 1 if ($snpOnly); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
129 my (@refBases) = keys %ref; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
130 die "mismatched_ref_base, " . join (",", (@id, @refBases)) . "\n" if ($#refBases != 0); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
131 my ($idstr) = join (";", @id); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
132 my ($altstr) = join (",", @alt); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
133 print join ("\t", ($chr, $pos, $idstr, $refBases[0], $altstr,$qual)), "\t"; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
134 my (@allGTcalls) = (); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
135 #convert allele flags to genotype calls |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
136 foreach my $sample (@$samples) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
137 my ($sampleAlleleCalls) = $alleleCalls{$sample}; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
138 my ($sampleGTcall); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
139 if ($sampleAlleleCalls->[0] =~ /\S\S/) { #diploid site |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
140 $sampleGTcall = &mkDiploidGTcall ($sampleAlleleCalls); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
141 } else { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
142 $sampleGTcall = &mkHaploidGTcall ($sampleAlleleCalls); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
143 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
144 if ($sampleGTcall =~ /warning/) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
145 print STDERR "$sampleGTcall,$chr-$begin-$end,$idstr,$altstr,$sample,", join ("-", @$sampleAlleleCalls), "\n"; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
146 $sampleGTcall = '.'; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
147 $sampleGTcall = './.' if ($sampleAlleleCalls->[0] =~ /\S\S/); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
148 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
149 $info_NS-- if ($sampleGTcall =~ /\./); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
150 push (@allGTcalls, $sampleGTcall); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
151 $info_AF[$1]++ if ($sampleGTcall =~ /^(\d+)/); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
152 $info_AF[$1]++ if ($sampleGTcall =~ /^\d+\/(\d+)$/); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
153 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
154 $filter = 's50' if ($info_NS/scalar @$samples < 0.5); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
155 print "$filter\t"; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
156 my ($numAlleles) = 0; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
157 foreach (@info_AF) {$numAlleles += $_;} |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
158 $numAlleles = 1 if ($numAlleles == 0); #In some loci, all samples are no-called |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
159 my (@alleleFreq) = map (sprintf ('%.2f', $_/$numAlleles), @info_AF); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
160 shift @alleleFreq; #don't need to print the reference allele frequency |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
161 print "NS=$info_NS;RSID=", join (",", @info_RSid), ";AF=", join (",", @alleleFreq), ";$info_DB\t$format\t", join ("\t", @allGTcalls), "\n"; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
162 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
163 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
164 #Convert a list of allele calls to diploid VCF genotype calls |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
165 #[11] becomes 1/1 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
166 #[01] becomes 0/1 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
167 #[00] becomes 0/0 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
168 #[11, 00] becomes 1/1 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
169 #[01, 01] becomes 1/2 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
170 #[00, 01, 1N] becomes 2/3 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
171 #[01, 01, 01] becomes ./. (unknown) |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
172 #[1N, 00, 00, 00] becomes 1/. |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
173 sub mkDiploidGTcall { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
174 my ($sampleAlleleCalls) = @_; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
175 #[11, 00, NN, 00], [1N, NN, 01, 00], [1N, NN, 1N, 00], [01, NN, 01, 00], [1N, 00, 00, 00] |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
176 my ($numOfOnes) = 0; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
177 my ($sampleGTcall) = './.'; #diploid no-call in VCF |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
178 my (%merged); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
179 for (my $i=0; $i<scalar @$sampleAlleleCalls; $i++) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
180 push (@{$merged{$sampleAlleleCalls->[$i]}}, $i+1); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
181 my ($ones) = $sampleAlleleCalls->[$i] =~ tr/1/1/; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
182 $numOfOnes += $ones; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
183 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
184 return 'warning-more_than_two_1s' if ($numOfOnes > 2); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
185 return $merged{'11'}->[0] . '/' . $merged{'11'}->[0] if (exists $merged{'11'}); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
186 if (exists $merged{'01'}) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
187 return $merged{'01'}->[0] . '/' . $merged{'01'}->[1] if (defined $merged{'01'}->[1]); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
188 return $merged{'01'}->[0] . '/' . $merged{'1N'}->[0] if (exists $merged{'1N'}); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
189 return '0/' . $merged{'01'}->[0] if (!exists $merged{'NN'} && !exists $merged{'0N'}); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
190 return $merged{'01'}->[0] . '/.'; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
191 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
192 if (exists $merged{'1N'}) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
193 return $merged{'1N'}->[0] . '/' . $merged{'1N'}->[1] if (defined $merged{'1N'}->[1]); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
194 return $merged{'1N'}->[0] . '/.'; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
195 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
196 return '0/0' if (!exists $merged{'NN'} && !exists $merged{'0N'}); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
197 return '0/.' if (exists $merged{'0N'} && scalar @{$merged{'0N'}} == 1); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
198 return $sampleGTcall; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
199 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
200 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
201 #Convert a list of allele calls to haploid VCF genotype calls |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
202 #[1] becomes 1 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
203 #[N] becomes . |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
204 #[0] becomes 0 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
205 #[1, 0] becomes 1 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
206 #[0, 1, N] becomes 2 |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
207 #[1, 1] becomes . |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
208 sub mkHaploidGTcall { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
209 my ($sampleAlleleCalls) = @_; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
210 my ($sampleGTcall) = '.'; #haploid no-call in VCF |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
211 my (%merged); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
212 for (my $i=0; $i<scalar @$sampleAlleleCalls; $i++) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
213 push (@{$merged{$sampleAlleleCalls->[$i]}}, $i+1); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
214 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
215 if (exists $merged{'1'}) { |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
216 return "warning-more_than_one_1_haploid" if (defined $merged{'1'}->[1]); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
217 return $merged{'1'}->[0]; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
218 } |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
219 return '0' if (!exists $merged{'N'}); |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
220 return $sampleGTcall; |
3a2e0f376f26
Minor change to tv2vcf.xml to allow for workflow automation
dgdekoning
parents:
diff
changeset
|
221 } |