comparison vcf2lv.sh @ 4:58815aed4ec3 draft default tip

few bugfixes in VCF-2-variantlist
author saskia-hiltemann
date Wed, 04 Nov 2015 05:06:12 -0500
parents 885ba15c2564
children
comparison
equal deleted inserted replaced
3:ac09a5aaed0b 4:58815aed4ec3
17 FS="\t"; 17 FS="\t";
18 OFS="\t"; 18 OFS="\t";
19 count=0; 19 count=0;
20 20
21 #output new header 21 #output new header
22 print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq" 22 print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq", "xRef" > "headerline.txt"
23 }{ 23 }{
24 24
25 if(substr($0,1,1)!="#" && $5 != "."){ #skip header or nonvariant entries (period in ALT column) 25 if(substr($0,1,1)!="#" && $5 != "."){ #skip header or nonvariant entries (period in ALT column)
26 26
27 # detect multivariants 27 # detect multivariants
28 chrom=$1 28 chrom=$1
29 pos=$2 29 pos=$2
30 ref=$4 30 ref=$4
31 #alt=$5 31 #alt=$5
32 reflen=length($4) 32 reflen=length($4)
33
34 # excel adds quotes sometimes :s
35 gsub(/"/,"",ref)
36 gsub(/"/,"",alt)
33 37
34 # add chr prefix if needed 38 # add chr prefix if needed
35 if(substr($1,1,3)!="chr") 39 if(substr($1,1,3)!="chr")
36 chromosome="chr"$1 40 chromosome="chr"$1
37 else 41 else
45 49
46 50
47 # determine varType 51 # determine varType
48 if(length(ref) == 1 && length(alt) == 1) 52 if(length(ref) == 1 && length(alt) == 1)
49 varType="snp" 53 varType="snp"
50 else if (length(ref) == 1 ) 54 else if (length(ref) == 1 && substr(ref,1,1)==substr(alt,1,1) )
51 varType="ins" 55 varType="ins"
52 else if (length(alt) == 1 ) 56 else if (length(alt) == 1 && substr(ref,1,1)==substr(alt,1,1) )
53 varType="del" 57 varType="del"
54 else 58 else
55 varType="sub" 59 varType="sub"
56 60
57 # determine start and end coordinates in 0-based half-open coordinate system 61 # determine start and end coordinates in 0-based half-open coordinate system
60 start=pos-1 64 start=pos-1
61 end=pos 65 end=pos
62 } 66 }
63 else if (varType=="ins"){ 67 else if (varType=="ins"){
64 start=pos 68 start=pos
65 end=pos 69 end=pos
66 } 70 }
67 else if (varType=="del"){ 71 else if (varType=="del"){
68 start=pos 72 start=pos
69 end=pos+(reflen-1) 73 end=pos+(reflen-1)
70 } 74 }
71 else if (varType=="sub"){ 75 else if (varType=="sub"){
72 start=pos 76 start=pos-1
73 end=pos+(reflen-1) 77 end=pos+(reflen-1)
74 } 78 }
75 79
76 # remove leading reference base 80 # remove leading reference base
77 if (varType!="snp" && substr(ref,1,1)==substr(alt,1,1)){ #subs not mandatory leading reference base :s 81 if ( varType!="snp" && substr(ref,1,1)==substr(alt,1,1) ){ #subs not mandatory leading reference base :s
78 reference=substr(ref,2) 82 reference=substr(ref,2)
79 alleleSeq=substr(alt,2) 83 alleleSeq=substr(alt,2)
84 if (varType =="sub"){
85 start+=1
86 }
80 } 87 }
81 else{ 88 else{
82 reference=ref 89 reference=ref
83 alleleSeq=alt 90 alleleSeq=alt
84 } 91 }
85 92
86 #print output variant(s) 93 #print output variant(s)
87 94
88 print count, chromosome, start, end, varType, reference, alleleSeq 95 if(chromosome == "chr1" || chromosome == "chr2" || chromosome == "chr3" || chromosome == "chr4" || chromosome == "chr5" || chromosome == "chr6" || chromosome == "chr7" || chromosome == "chr8" || chromosome == "chr9" || chromosome == "chr10" || chromosome == "chr11" || chromosome == "chr12" || chromosome == "chr13" || chromosome == "chr14" || chromosome == "chr15" || chromosome == "chr16" || chromosome == "chr17" ||chromosome == "chr18" ||chromosome == "chr19" ||chromosome == "chr20" ||chromosome == "chr21" ||chromosome == "chr22" ||chromosome == "chrX" ||chromosome == "chrY" )
96 print count, chromosome, start, end, varType, reference, alleleSeq, ""
89 97
90 count+=1 98 count+=1
91 } 99 }
92 } 100 }
93 }END{}' $vcffile > $outputfile 101 }END{}' $vcffile > $outputfile.almost
94 102
95 103
104
105 # due to overlapping variants that we reduce to more canonical forms, variants may have become out of order, so resort to be sure
106 sort -k2,2V -k3,3g $outputfile.almost > $outputfile.almost2
107
108 cat headerline.txt $outputfile.almost2 > $outputfile
109
110
111
96 112
97 #from 100Genomes site: 113 #from 100Genomes site:
98 114
99 #CHROM chromosome: an identifier from the reference genome. All entries for a specific CHROM should form a contiguous block within the VCF file.(Alphanumeric String, Required) 115 #CHROM chromosome: an identifier from the reference genome. All entries for a specific CHROM should form a contiguous block within the VCF file.(Alphanumeric String, Required)
100 #POS position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. (Integer, Required) 116 #POS position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. (Integer, Required)