Mercurial > repos > saskia-hiltemann > virtual_normal_analysis
comparison vcf2lv.sh @ 4:58815aed4ec3 draft default tip
few bugfixes in VCF-2-variantlist
author | saskia-hiltemann |
---|---|
date | Wed, 04 Nov 2015 05:06:12 -0500 |
parents | 885ba15c2564 |
children |
comparison
equal
deleted
inserted
replaced
3:ac09a5aaed0b | 4:58815aed4ec3 |
---|---|
17 FS="\t"; | 17 FS="\t"; |
18 OFS="\t"; | 18 OFS="\t"; |
19 count=0; | 19 count=0; |
20 | 20 |
21 #output new header | 21 #output new header |
22 print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq" | 22 print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq", "xRef" > "headerline.txt" |
23 }{ | 23 }{ |
24 | 24 |
25 if(substr($0,1,1)!="#" && $5 != "."){ #skip header or nonvariant entries (period in ALT column) | 25 if(substr($0,1,1)!="#" && $5 != "."){ #skip header or nonvariant entries (period in ALT column) |
26 | 26 |
27 # detect multivariants | 27 # detect multivariants |
28 chrom=$1 | 28 chrom=$1 |
29 pos=$2 | 29 pos=$2 |
30 ref=$4 | 30 ref=$4 |
31 #alt=$5 | 31 #alt=$5 |
32 reflen=length($4) | 32 reflen=length($4) |
33 | |
34 # excel adds quotes sometimes :s | |
35 gsub(/"/,"",ref) | |
36 gsub(/"/,"",alt) | |
33 | 37 |
34 # add chr prefix if needed | 38 # add chr prefix if needed |
35 if(substr($1,1,3)!="chr") | 39 if(substr($1,1,3)!="chr") |
36 chromosome="chr"$1 | 40 chromosome="chr"$1 |
37 else | 41 else |
45 | 49 |
46 | 50 |
47 # determine varType | 51 # determine varType |
48 if(length(ref) == 1 && length(alt) == 1) | 52 if(length(ref) == 1 && length(alt) == 1) |
49 varType="snp" | 53 varType="snp" |
50 else if (length(ref) == 1 ) | 54 else if (length(ref) == 1 && substr(ref,1,1)==substr(alt,1,1) ) |
51 varType="ins" | 55 varType="ins" |
52 else if (length(alt) == 1 ) | 56 else if (length(alt) == 1 && substr(ref,1,1)==substr(alt,1,1) ) |
53 varType="del" | 57 varType="del" |
54 else | 58 else |
55 varType="sub" | 59 varType="sub" |
56 | 60 |
57 # determine start and end coordinates in 0-based half-open coordinate system | 61 # determine start and end coordinates in 0-based half-open coordinate system |
60 start=pos-1 | 64 start=pos-1 |
61 end=pos | 65 end=pos |
62 } | 66 } |
63 else if (varType=="ins"){ | 67 else if (varType=="ins"){ |
64 start=pos | 68 start=pos |
65 end=pos | 69 end=pos |
66 } | 70 } |
67 else if (varType=="del"){ | 71 else if (varType=="del"){ |
68 start=pos | 72 start=pos |
69 end=pos+(reflen-1) | 73 end=pos+(reflen-1) |
70 } | 74 } |
71 else if (varType=="sub"){ | 75 else if (varType=="sub"){ |
72 start=pos | 76 start=pos-1 |
73 end=pos+(reflen-1) | 77 end=pos+(reflen-1) |
74 } | 78 } |
75 | 79 |
76 # remove leading reference base | 80 # remove leading reference base |
77 if (varType!="snp" && substr(ref,1,1)==substr(alt,1,1)){ #subs not mandatory leading reference base :s | 81 if ( varType!="snp" && substr(ref,1,1)==substr(alt,1,1) ){ #subs not mandatory leading reference base :s |
78 reference=substr(ref,2) | 82 reference=substr(ref,2) |
79 alleleSeq=substr(alt,2) | 83 alleleSeq=substr(alt,2) |
84 if (varType =="sub"){ | |
85 start+=1 | |
86 } | |
80 } | 87 } |
81 else{ | 88 else{ |
82 reference=ref | 89 reference=ref |
83 alleleSeq=alt | 90 alleleSeq=alt |
84 } | 91 } |
85 | 92 |
86 #print output variant(s) | 93 #print output variant(s) |
87 | 94 |
88 print count, chromosome, start, end, varType, reference, alleleSeq | 95 if(chromosome == "chr1" || chromosome == "chr2" || chromosome == "chr3" || chromosome == "chr4" || chromosome == "chr5" || chromosome == "chr6" || chromosome == "chr7" || chromosome == "chr8" || chromosome == "chr9" || chromosome == "chr10" || chromosome == "chr11" || chromosome == "chr12" || chromosome == "chr13" || chromosome == "chr14" || chromosome == "chr15" || chromosome == "chr16" || chromosome == "chr17" ||chromosome == "chr18" ||chromosome == "chr19" ||chromosome == "chr20" ||chromosome == "chr21" ||chromosome == "chr22" ||chromosome == "chrX" ||chromosome == "chrY" ) |
96 print count, chromosome, start, end, varType, reference, alleleSeq, "" | |
89 | 97 |
90 count+=1 | 98 count+=1 |
91 } | 99 } |
92 } | 100 } |
93 }END{}' $vcffile > $outputfile | 101 }END{}' $vcffile > $outputfile.almost |
94 | 102 |
95 | 103 |
104 | |
105 # due to overlapping variants that we reduce to more canonical forms, variants may have become out of order, so re-sort to be sure | |
106 sort -k2,2V -k3,3g $outputfile.almost > $outputfile.almost2 | |
107 | |
108 cat headerline.txt $outputfile.almost2 > $outputfile | |
109 | |
110 | |
111 | |
96 | 112 |
97 #from 1000 Genomes site: | 113 #from 1000 Genomes site: |
98 | 114 |
99 #CHROM chromosome: an identifier from the reference genome. All entries for a specific CHROM should form a contiguous block within the VCF file.(Alphanumeric String, Required) | 115 #CHROM chromosome: an identifier from the reference genome. All entries for a specific CHROM should form a contiguous block within the VCF file.(Alphanumeric String, Required) |
100 #POS position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. (Integer, Required) | 116 #POS position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. (Integer, Required) |