diff tools/annovar/annovar.sh @ 2:565c0e690238 draft

Added support for LJB2, COSMIC67, CLINVAR and NCI60. Fixed dgv annotation to use new UCSC table dgvMerged instead.
author saskia-hiltemann
date Mon, 18 Nov 2013 10:32:33 -0500
parents d3a72e55deca
children ff5325029a8e
line wrap: on
line diff
--- a/tools/annovar/annovar.sh	Tue Nov 05 07:16:32 2013 -0500
+++ b/tools/annovar/annovar.sh	Mon Nov 18 10:32:33 2013 -0500
@@ -1,7 +1,12 @@
 #!/bin/bash
 
 test="N"
+dofilter="N"
 
+#########################
+#	   DEFINE SOME
+#	    FUNCTIONS
+#########################
 
 function usage(){
 	echo "usage: $0 todo"
@@ -167,7 +172,14 @@
 
 
 
-set -- `getopt -n$0 -u -a --longoptions="inputfile: buildver: humandb: varfile: VCF: chrcol: startcol: endcol: refcol: obscol: vartypecol: convertcoords: geneanno: verdbsnp: tfbs: mce: cytoband: segdup: dgv: gwas: ver1000g: cg46: cg69: impactscores: esp: gerp: cosmic61: cosmic63: cosmic64: cosmic65: outall: outfilt: outinvalid: scriptsdir: dorunannovar: dofilter: filt_dbsnp: filt1000GALL: filt1000GAFR: filt1000GAMR: filt1000GASN: filt1000GEUR: filtESP6500ALL: filtESP6500EA: filtESP6500AA: filtcg46: filtcg69: dummy:" "h:" "$@"` || usage
+#################################
+#
+#	   PARSE PARAMETERS
+#
+#################################
+
+
+set -- `getopt -n$0 -u -a --longoptions="inputfile: buildver: humandb: varfile: VCF: chrcol: startcol: endcol: refcol: obscol: vartypecol: convertcoords: geneanno: hgvs: verdbsnp: tfbs: mce: cytoband: segdup: dgv: gwas: ver1000g: cg46: cg69: impactscores: newimpactscores: otherinfo: esp: gerp: cosmic61: cosmic63: cosmic64: cosmic65: cosmic67: clinvar: nci60: outall: outfilt: outinvalid: scriptsdir: dorunannovar: dofilter: filt_dbsnp: filt1000GALL: filt1000GAFR: filt1000GAMR: filt1000GASN: filt1000GEUR: filtESP6500ALL: filtESP6500EA: filtESP6500AA: filtcg46: filtcg69: dummy:" "h:" "$@"` || usage
 [ $# -eq 0 ] && usage
 
 
@@ -176,8 +188,8 @@
 do
     case "$1" in
        	--inputfile)      			infile=$2;shift;;  # inputfile
-		--buildver)					buildver=$2;shift;; # hg18 or hg19
-		--humandb)					humandb=$2;shift;; # location of humandb database
+		--buildver)					buildvertmp=$2;shift;; # hg18 or hg19
+		--humandb)					humandbtmp=$2;shift;; # location of humandb database
 	 	--varfile)      			varfile=$2;shift;; # Y or N  
 		--VCF)						vcf=$2;shift;; #Y or N
 		--chrcol)      				chrcol=$2;shift;;  # which column has chr 
@@ -188,6 +200,7 @@
 		--vartypecol)      			vartypecol=$2;shift;;  # which column has vartype
 		--convertcoords)			convertcoords=$2;shift;;  # Y or N convert coordinate from CG to 1-based?		
 		--geneanno)      			geneanno=$2;shift;; # comma-separated list of strings refSeq, knowngene, ensgene  
+		--hgvs)						hgvs=$2;shift;;
 		--verdbsnp)					verdbsnp=$2;shift;; #comma-separated list of dbsnp version to annotate with (e.g. "132,135NonFlagged,137")"
 		--tfbs)      				tfbs=$2;shift;; 	# Y or N 
 		--mce)      				mce=$2;shift;; 	# Y or N 
@@ -199,13 +212,18 @@
 		--cg46)						cg46=$2;shift;;
 		--cg69)						cg69=$2;shift;;		
 		--impactscores)      		impactscores=$2;shift;; # Y or N 
-		--scriptsdir)	      		scriptsdir=$2;shift;; # Y or N 
+		--newimpactscores)      	newimpactscores=$2;shift;; # Y or N 
+		--otherinfo)				otherinfo=$2;shift;; 
+		--scriptsdir)	      		scriptsdirtmp=$2;shift;; # Y or N 
 		--esp)      				esp=$2;shift;; 	# Y or N 
 		--gerp)      				gerp=$2;shift;; 	# Y or N 
 		--cosmic61)					cosmic61=$2;shift;;  # Y or N
 		--cosmic63)					cosmic63=$2;shift;;  # Y or N
 		--cosmic64)					cosmic64=$2;shift;;  # Y or N
 		--cosmic65)					cosmic65=$2;shift;;  # Y or N
+		--cosmic67)					cosmic67=$2;shift;;  # Y or N
+		--nci60)					nci60=$2;shift;;  # Y or N
+		--clinvar)					clinvar=$2;shift;;  # Y or N
 		--filt_dbsnp)				filt_dbsnp=$2;shift;;
 		--filt1000GALL)				threshold_1000g_ALL=$2;shift;; #threshold value
 		--filt1000GAFR)				threshold_1000g_AFR=$2;shift;; #threshold value
@@ -220,8 +238,7 @@
 		--outall)      				outfile_all=$2;shift;; # file 
 		--outfilt)      			outfile_filt=$2;shift;; # file
 		--outinvalid)				outfile_invalid=$2;shift;; #file
-		--dorunannovar)				dorunannovar=$2;shift;; 	#Y or N
-		--dofilter)					dofilter=$2;shift;; #Y or N	
+		--dorunannovar)				dorunannovar=$2;shift;; 	#Y or N		
        -h)        	shift;;
 	   --)        	shift;break;;
        -*)        	usage;;
@@ -230,6 +247,11 @@
     shift
 done
 
+#sometimes galaxy screws up these variables after updates, if comma-separated list, use only what is before first comma
+humandb=${humandbtmp%,*}
+buildver=${buildvertmp%,*}
+scriptsdir=${scriptsdirtmp%,*}
+
 
 if [ $test == "Y" ]
 then
@@ -249,6 +271,7 @@
 	echo "cg46: ${cg46}"	
 	echo "cg69: ${cg69}"
 	echo "impactscores: $impactscores"
+	echo "impactscores: $newimpactscores"	
 	echo "esp: $esp"
 	echo "gerp: $gerp"
 	echo "cosmic: $cosmic"
@@ -277,10 +300,18 @@
 fi
 
 
+
+############################################
+#
+#       Annotate Variants 
+#
+############################################
+
+#parse geneanno param
 refgene="N"
 knowngene="N"
 ensgene="N"	
-#parse geneanno param
+
 if [[ $geneanno =~ "refSeq" ]]
 then
 	refgene="Y"
@@ -293,7 +324,10 @@
 then
 	ensgene="Y"
 fi
-
+if [ $hgvs == "N" ]
+then
+	hgvs=""
+fi
 
 #parse verdbsnp/1000g/esp strings
 dbsnpstr=${verdbsnp//,/ }
@@ -314,7 +348,8 @@
 polyphen2="N"
 phylop="N"
 ljbsift="N"
-#parse impactscores param
+
+#parse old impactscores param (obsolete)
 if [[ $impactscores =~ "mutationtaster" ]]
 then
 	mutationtaster="Y"
@@ -331,6 +366,10 @@
 then
 	ljbsift="Y"
 fi
+if [[ $impactscores =~ "ljb2sift" ]]
+then
+	ljb2sift="Y"
+fi
 if [[ $impactscores =~ "pp2" ]]
 then
 	polyphen2="Y"
@@ -347,6 +386,65 @@
 
 #ljb refers to Liu, Jian, Boerwinkle paper in Human Mutation with pubmed ID 21520341. Cite this paper if you use the scores
 
+ljb2_sift="N"
+ljb2_pp2hdiv="N"
+ljb2_pp2hvar="N"
+ljb2_lrt="N"
+ljb2_mt="N"
+ljb2_ma="N"
+ljb2_fathmm="N"
+ljb2_gerp="N"
+ljb2_phylop="N"
+ljb2_siphy="N"
+
+# parse ljb2 newimpactscores param
+# ljb2_sift, ljb2_pp2hdiv, ljb2_pp2hvar, ljb2_lrt, ljb2_mt, ljb2_ma, ljb2_fathmm, ljb2_gerp++, ljb2_phylop, ljb2_siphy
+if [[ $newimpactscores =~ "ljb2_sift" ]]
+then
+	ljb2_sift="Y"
+fi
+if [[ $newimpactscores =~ "ljb2_pp2hdiv" ]]
+then
+	ljb2_pp2hdiv="Y"
+fi
+if [[ $newimpactscores =~ "ljb2_pp2hvar" ]]
+then
+	ljb2_pp2hvar="Y"
+fi
+if [[ $newimpactscores =~ "ljb2_lrt" ]]
+then
+	ljb2_lrt="Y"
+fi
+if [[ $newimpactscores =~ "ljb2_mt" ]]
+then
+	ljb2_mt="Y"
+fi
+if [[ $newimpactscores =~ "ljb2_ma" ]]
+then
+	ljb2_ma="Y"
+fi
+if [[ $newimpactscores =~ "ljb2_fathmm" ]]
+then
+	ljb2_fathmm="Y"
+fi
+if [[ $newimpactscores =~ "ljb2_gerp" ]]
+then
+	ljb2_gerp="Y"
+fi
+if [[ $newimpactscores =~ "ljb2_phylop" ]]
+then
+	ljb2_phylop="Y"
+fi
+if [[ $newimpactscores =~ "ljb2_siphy" ]]
+then
+	ljb2_siphy="Y"
+fi
+
+if [ $otherinfo == "N" ]
+then
+	otherinfo=""
+fi
+
 
 #column header names we will be adding
 # ESP 6500
@@ -498,7 +596,7 @@
 	if [ $refgene == "Y" ]
 	then
 		echo -e "\nrefSeq gene"
-		$scriptsdir/annotate_variation.pl --geneanno --buildver $buildver -dbtype gene annovarinput $humandb 2>&1
+		$scriptsdir/annotate_variation.pl --geneanno --buildver $buildver -dbtype gene ${hgvs} annovarinput $humandb 2>&1
 		
 		annovarout=annovarinput.variant_function
 		sed -i '1i\RefSeq_Func\tRefSeq_Gene\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
@@ -612,9 +710,9 @@
 	if [ $dgv == "Y" ]
 	then
 		echo -e "\nDGV Annotation"
-		$scriptsdir/annotate_variation.pl --regionanno --buildver $buildver -dbtype dgv annovarinput $humandb 2>&1
+		$scriptsdir/annotate_variation.pl --regionanno --buildver $buildver -dbtype dgvMerged annovarinput $humandb 2>&1
 	
-		annovarout=annovarinput.${buildver}_dgv
+		annovarout=annovarinput.${buildver}_dgvMerged
 		sed -i '1i\db\tDGV\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
 		joinresults originalfile $annovarout 3 4 5 6 7 B.DGV	
 	fi
@@ -807,6 +905,115 @@
 	fi
 
 
+	
+	
+	#### IMPACT SCORE ANNOTATIONS
+
+
+	if [ $ljb2_sift == "Y" ]
+	then
+		echo -e "\nLJB2 SIFT Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_sift annovarinput $humandb 2>&1
+	
+		annovarout=annovarinput.${buildver}_ljb2_sift_dropped
+		sed -i '1i\db\tLJB2_SIFT\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_SIFT	
+	fi
+	
+	if [ $ljb2_pp2hdiv == "Y" ]
+	then
+		echo -e "\nLJB2 pp2hdiv Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_pp2hdiv annovarinput $humandb 2>&1
+	
+		annovarout=annovarinput.${buildver}_ljb2_pp2hdiv_dropped
+		sed -i '1i\db\tLJB2_PolyPhen2_HDIV\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_PolyPhen2_HDIV	
+	fi
+	
+	if [ $ljb2_pp2hvar == "Y" ]
+	then
+		echo -e "\nLJB2 pp2hvar Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_pp2hvar annovarinput $humandb 2>&1
+	
+		annovarout=annovarinput.${buildver}_ljb2_pp2hvar_dropped
+		sed -i '1i\db\tLJB2_PolyPhen2_HVAR\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_PolyPhen2_HVAR	
+	fi
+	
+	if [ $ljb2_lrt == "Y" ]
+	then
+		echo -e "\nLJB2 LRT Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_lrt annovarinput $humandb 2>&1
+	
+		annovarout=annovarinput.${buildver}_ljb2_lrt_dropped
+		sed -i '1i\db\tLJB2_LRT\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_LRT	
+	fi
+	
+	if [ $ljb2_mt == "Y" ]
+	then
+		echo -e "\nLJB2 mutationtaster Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_mt annovarinput $humandb 2>&1
+	
+		annovarout=annovarinput.${buildver}_ljb2_mt_dropped
+		sed -i '1i\db\tLJB2_MutationTaster\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_MutationTaster	
+	fi
+	
+	if [ $ljb2_ma == "Y" ]
+	then
+		echo -e "\nLJB2 mutationassessor Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_ma annovarinput $humandb 2>&1
+	
+		annovarout=annovarinput.${buildver}_ljb2_ma_dropped
+		sed -i '1i\db\tLJB2_MutationAssessor\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_MutationAssessor	
+	fi
+	
+	if [ $ljb2_fathmm == "Y" ]
+	then
+		echo -e "\nLJB2 FATHMM Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_fathmm annovarinput $humandb 2>&1
+	
+		annovarout=annovarinput.${buildver}_ljb2_fathmm_dropped
+		sed -i '1i\db\tLJB2_FATHMM\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_FATHMM	
+	fi
+	
+	if [ $ljb2_gerp == "Y" ]
+	then
+		echo -e "\nLJB2 GERP++ Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_gerp++ annovarinput $humandb 2>&1
+	
+		annovarout=annovarinput.${buildver}_ljb2_gerp++_dropped
+		sed -i '1i\db\tLJB2_GERP++\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_GERP++	
+	fi
+	
+	if [ $ljb2_phylop == "Y" ]
+	then
+		echo -e "\nLJB2 PhyloP Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_phylop annovarinput $humandb 2>&1
+	
+		annovarout=annovarinput.${buildver}_ljb2_phylop_dropped
+		sed -i '1i\db\tLJB2_PhyloP\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_PhyloP	
+	fi
+	
+	if [ $ljb2_siphy == "Y" ]
+	then
+		echo -e "\nLJB2 SiPhy Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_siphy annovarinput $humandb 2>&1
+	
+		annovarout=annovarinput.${buildver}_ljb2_siphy_dropped
+		sed -i '1i\db\tLJB2_SiPhy\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_SiPhy	
+	fi
+
+
+
+	### OLD IMPACT SCORE ANNOTATIONS
+
 	# SIFT
 	if [ $avsift == "Y" ]
 	then
@@ -822,7 +1029,7 @@
 	# SIFT2
 	if [ $ljbsift == "Y" ]
 	then
-		echo -e "\nSIFT Annotation"
+		echo -e "\nLJB SIFT Annotation"
 		$scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype ljb_sift annovarinput $humandb 2>&1
 	
 		annovarout=annovarinput.${buildver}_ljb_sift_dropped
@@ -984,35 +1191,6 @@
 	fi	
 
 
-	#ESP6500
-	if [ $esp == "Y" ]
-	then
-		echo -e "\nESP Annotation OLD"
-		# ALL
-		$scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp6500si_all annovarinput $humandb 2>&1
-	
-		annovarout=annovarinput.${buildver}_esp6500si_all_dropped
-		sed -i '1i\db\t'$esp6500_colheader_ALL'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
-		joinresults originalfile $annovarout 3 4 5 6 7 B.$esp6500_colheader_ALL
-
-
-		# European American
-		$scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp6500si_ea annovarinput $humandb 2>&1
-	
-		annovarout=annovarinput.${buildver}_esp6500si_ea_dropped
-		sed -i '1i\db\t'$esp6500_colheader_EA'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
-		joinresults originalfile $annovarout 3 4 5 6 7 B.$esp6500_colheader_EA
-
-		# African Americans
-		$scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp6500si_aa annovarinput $humandb 2>&1
-	
-		annovarout=annovarinput.${buildver}_esp6500si_aa_dropped
-		sed -i '1i\db\t'$esp6500_colheader_AA'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
-		joinresults originalfile $annovarout 3 4 5 6 7 B.$esp6500_colheader_AA
-	fi
-
-
-
 	#GERP++
 	if [ $gerp == "Y" ]
 	then
@@ -1070,6 +1248,39 @@
 
 	fi
 
+	if [[ $cosmic67 == "Y" && $buildver == "hg19" ]]
+	then
+		echo -e "\nCOSMIC67 Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype cosmic67 annovarinput $humandb 2>&1
+	
+		annovarout="annovarinput.${buildver}_cosmic67_dropped"
+		sed -i '1i\db\tCOSMIC67\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.COSMIC67
+
+	fi
+	
+	if [[ $clinvar == "Y" && $buildver == "hg19" ]]
+	then
+		echo -e "\nCLINVAR Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype clinvar_20131105 annovarinput $humandb 2>&1
+	
+		annovarout="annovarinput.${buildver}_clinvar_20131105_dropped"
+		sed -i '1i\db\tCLINVAR\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.CLINVAR
+
+	fi
+	
+	if [[ $nci60 == "Y" && $buildver == "hg19" ]]
+	then
+		echo -e "\nNCI60 Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype nci60 annovarinput $humandb 2>&1
+	
+		annovarout="annovarinput.${buildver}_nci60_dropped"
+		sed -i '1i\db\tNCI60\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.NCI60
+
+	fi
+	
 	#cg46
 	if [[ $cg46 == "Y"  ]]
 	then
@@ -1138,52 +1349,6 @@
 
 
 
-############################################
-#
-#       Filter Annotated Variants 
-#
-############################################
-
-
-if [[ $dofilter == "Y" ]]
-then
-	echo "starting filtering"
-	cp originalfile filteredfile
-	
-	### do the filtering
-	# usage: runfilter <column name> <threshold>   (-1=do not filter, 0=filter any value)
-	
-	#1000genomes
-	runfilter filteredfile ${g1000_colheader_ALL} ${threshold_1000g_ALL}
-	runfilter filteredfile ${g1000_colheader_AFR} ${threshold_1000g_AFR}
-	runfilter filteredfile ${g1000_colheader_AMR} ${threshold_1000g_AMR}
-	runfilter filteredfile ${g1000_colheader_ASN} ${threshold_1000g_ASN}
-	runfilter filteredfile ${g1000_colheader_EUR} ${threshold_1000g_EUR}
-
-	#esp
-	runfilter filteredfile ${esp6500_colheader_ALL} ${threshold_ESP6500_ALL}
-	runfilter filteredfile ${esp6500_colheader_EA} ${threshold_ESP6500_EA}
-	runfilter filteredfile ${esp6500_colheader_AA} ${threshold_ESP6500_AA}		
-	
-	#dbsnp
-	for version in $filt_dbsnpstr
-	do
-		if [ $version == "None" ] 
-		then
-			break
-		fi 
-		runfilter filteredfile "db$version" "text"  #-42 will filter any non-empty string in that field
-
-	done
-	
-	#complete genomics
-	runfilter filteredfile ${cg46_colheader} ${threshold_cg46}
-	runfilter filteredfile ${cg69_colheader} ${threshold_cg69}
-
-	#move filtered output file to galaxy output file
-	cp filteredfile $outfile_filt
-	
-fi
 
 
 
@@ -1201,3 +1366,4 @@
 
 
 
+