annotate vcf2lv.sh @ 4:58815aed4ec3 draft default tip

few bugfixes in VCF-2-variantlist
author saskia-hiltemann
date Wed, 04 Nov 2015 05:06:12 -0500
parents 885ba15c2564
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
1 #!/bin/bash
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
2
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
3 vcffile=$1
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
4 outputfile=$2
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
5
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
6 # vcf columns: CHROM-POS-ID-REF-ALT
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
7 # LV cloumns: variantId-chromosome-start-end-reference-alleleSeq-xRef
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
8
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
9
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
10 # add chr prefix if not present
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
11 # determine varType (snp, ins, del, sub)
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
12 # convert coordinates to 0-based halfopen
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
13 # calculate end coordinate from position and length
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
14 # remove leading reference base from the non-SNP variants, update position
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
15
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
16 awk 'BEGIN{
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
17 FS="\t";
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
18 OFS="\t";
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
19 count=0;
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
20
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
21 #output new header
4
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
22 print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq", "xRef" > "headerline.txt"
0
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
23 }{
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
24
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
25 if(substr($0,1,1)!="#" && $5 != "."){ #skip header or nonvariant entries (period in ALT column)
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
26
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
27 # detect multivariants
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
28 chrom=$1
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
29 pos=$2
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
30 ref=$4
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
31 #alt=$5
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
32 reflen=length($4)
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
33
4
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
34 # excel adds quotes sometimes :s
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
35 gsub(/"/,"",ref)
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
36 gsub(/"/,"",alt)
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
37
0
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
38 # add chr prefix if needed
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
39 if(substr($1,1,3)!="chr")
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
40 chromosome="chr"$1
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
41 else
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
42 chromosome=chrom
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
43
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
44 # split ALT column in case of multiple variant alleles
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
45 split($5,alleles,",");
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
46
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
47 for (i in alleles) {
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
48 alt=alleles[i]
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
49
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
50
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
51 # determine varType
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
52 if(length(ref) == 1 && length(alt) == 1)
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
53 varType="snp"
4
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
54 else if (length(ref) == 1 && substr(ref,1,1)==substr(alt,1,1) )
0
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
55 varType="ins"
4
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
56 else if (length(alt) == 1 && substr(ref,1,1)==substr(alt,1,1) )
0
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
57 varType="del"
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
58 else
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
59 varType="sub"
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
60
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
61 # determine start and end coordinates in 0-based half-open coordinate system
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
62
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
63 if (varType=="snp"){
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
64 start=pos-1
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
65 end=pos
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
66 }
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
67 else if (varType=="ins"){
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
68 start=pos
4
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
69 end=pos
0
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
70 }
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
71 else if (varType=="del"){
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
72 start=pos
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
73 end=pos+(reflen-1)
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
74 }
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
75 else if (varType=="sub"){
4
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
76 start=pos-1
0
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
77 end=pos+(reflen-1)
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
78 }
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
79
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
80 # remove leading reference base
4
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
81 if ( varType!="snp" && substr(ref,1,1)==substr(alt,1,1) ){ #subs not mandatory leading reference base :s
0
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
82 reference=substr(ref,2)
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
83 alleleSeq=substr(alt,2)
4
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
84 if (varType =="sub"){
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
85 start+=1
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
86 }
0
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
87 }
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
88 else{
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
89 reference=ref
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
90 alleleSeq=alt
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
91 }
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
92
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
93 #print output variant(s)
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
94
4
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
95 if(chromosome == "chr1" || chromosome == "chr2" || chromosome == "chr3" || chromosome == "chr4" || chromosome == "chr5" || chromosome == "chr6" || chromosome == "chr7" || chromosome == "chr8" || chromosome == "chr9" || chromosome == "chr10" || chromosome == "chr11" || chromosome == "chr12" || chromosome == "chr13" || chromosome == "chr14" || chromosome == "chr15" || chromosome == "chr16" || chromosome == "chr17" ||chromosome == "chr18" ||chromosome == "chr19" ||chromosome == "chr20" ||chromosome == "chr21" ||chromosome == "chr22" ||chromosome == "chrX" ||chromosome == "chrY" )
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
96 print count, chromosome, start, end, varType, reference, alleleSeq, ""
0
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
97
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
98 count+=1
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
99 }
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
100 }
4
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
101 }END{}' $vcffile > $outputfile.almost
0
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
102
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
103
4
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
104
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
105 # due to overlapping variants that we reduce to more canonical forms, variants may have become out of order, so resort to be sure
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
106 sort -k2,2V -k3,3g $outputfile.almost > $outputfile.almost2
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
107
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
108 cat headerline.txt $outputfile.almost2 > $outputfile
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
109
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
110
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
111
0
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
112
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
113 #from 100Genomes site:
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
114
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
115 #CHROM chromosome: an identifier from the reference genome. All entries for a specific CHROM should form a contiguous block within the VCF file.(Alphanumeric String, Required)
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
116 #POS position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. (Integer, Required)
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
117 #ID semi-colon separated list of unique identifiers where available. If this is a dbSNP variant it is encouraged to use the rs number(s). No identifier should be present in more than one data record. If there is no identifier available, then the missing value should be used. (Alphanumeric String)
1209f18a5a83 Uploaded
saskia-hiltemann
parents:
diff changeset
118 #REF reference base(s): Each base must be one of A,C,G,T,N. Bases should be in uppercase. Multiple bases are permitted. The value in the POS field refers to the position of the first base in the String. For InDels, the reference String must include the base before the event (which must be reflected in the POS field). (String, Required).
4
58815aed4ec3 few bugfixes in VCF-2-variantlist
saskia-hiltemann
parents: 2
diff changeset
119 #ALT comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N, or an angle-bracketed ID String (”<ID>”). If there are no alternative alleles, then the missing value should be used. Bases should be in uppercase. (Alphanumeric String; no whitespace, commas, or angle-brackets are permitted in the ID String itself)