diff tools/annovar/annovar.sh @ 5:4600be69b96f draft

Added databases 1000g2015aug, SPIDEX, avsnp138, avsnp142, exac03
author saskia-hiltemann
date Thu, 01 Oct 2015 04:24:45 -0400
parents ff5325029a8e
children 69e2067a120d
line wrap: on
line diff
--- a/tools/annovar/annovar.sh	Thu Apr 10 09:31:26 2014 -0400
+++ b/tools/annovar/annovar.sh	Thu Oct 01 04:24:45 2015 -0400
@@ -179,7 +179,7 @@
 #################################
 
 
-set -- `getopt -n$0 -u -a --longoptions="inputfile: buildver: humandb: varfile: VCF: chrcol: startcol: endcol: refcol: obscol: vartypecol: convertcoords: geneanno: hgvs: verdbsnp: tfbs: mce: cytoband: segdup: dgv: gwas: ver1000g: cg46: cg69: impactscores: newimpactscores: otherinfo: esp: gerp: cosmic61: cosmic63: cosmic64: cosmic65: cosmic67: cosmic68: clinvar: nci60: outall: outfilt: outinvalid: scriptsdir: dorunannovar: dofilter: filt_dbsnp: filt1000GALL: filt1000GAFR: filt1000GAMR: filt1000GASN: filt1000GEUR: filtESP6500ALL: filtESP6500EA: filtESP6500AA: filtcg46: filtcg69: dummy:" "h:" "$@"` || usage
+set -- `getopt -n$0 -u -a --longoptions="inputfile: buildver: humandb: varfile: VCF: chrcol: startcol: endcol: refcol: obscol: vartypecol: convertcoords: geneanno: hgvs: verdbsnp: tfbs: mce: cytoband: segdup: dgv: gwas: ver1000g: cg46: cg69: impactscores: newimpactscores: otherinfo: esp: exac03: spidex: gonl: gerp: cosmic61: cosmic63: cosmic64: cosmic65: cosmic67: cosmic68: clinvar: nci60: outall: outfilt: outinvalid: scriptsdir: dorunannovar: dofilter: filt_dbsnp: filt1000GALL: filt1000GAFR: filt1000GAMR: filt1000GASN: filt1000GEUR: filtESP6500ALL: filtESP6500EA: filtESP6500AA: filtcg46: filtcg69: dummy:" "h:" "$@"` || usage
 [ $# -eq 0 ] && usage
 
 
@@ -216,6 +216,9 @@
 		--otherinfo)				otherinfo=$2;shift;; 
 		--scriptsdir)	      		scriptsdirtmp=$2;shift;; # Y or N 
 		--esp)      				esp=$2;shift;; 	# Y or N 
+		--exac03)                   exac03=$2;shift;;
+		--gonl)                     gonl=$2;shift;;
+		--spidex)                   spidex=$2;shift;;
 		--gerp)      				gerp=$2;shift;; 	# Y or N 
 		--cosmic61)					cosmic61=$2;shift;;  # Y or N
 		--cosmic63)					cosmic63=$2;shift;;  # Y or N
@@ -524,6 +527,7 @@
 				OFS="\t";
 			}{
 				if(FNR>1) { 
+                                        gsub(/chr/,"",$"'"${chrcol}"'")
 					if( $"'"${vartypecol}"'" == "snp" ){ $"'"${startcol}"'" += 1 }; 
 					if( $"'"${vartypecol}"'" == "ins" ){ $"'"${refcol}"'" = "-" };
 					if( $"'"${vartypecol}"'" == "del" ){ $"'"${startcol}"'" +=1; $"'"${obscol}"'" = "-" };
@@ -536,13 +540,15 @@
 			}' $infile > annovarinput
 
 			#remove any "chr" prefixes
-			sed -i 's/chr//g' annovarinput
+			#sed -i '2,$s/chr//g' annovarinput
 
 			awk 'BEGIN{
 				FS="\t";
-				OFS="\t";
+				OFS="\t";				
 			}{
+                                
 				if(FNR>=1) { 
+				        gsub(/chr/,"",$"'"${chrcol}"'")
 					if( $"'"${vartypecol}"'" == "snp" ){ $"'"${startcol}"'" += 1 }; 
 					if( $"'"${vartypecol}"'" == "ins" ){ $"'"${refcol}"'" = "-" };
 					if( $"'"${vartypecol}"'" == "del" ){ $"'"${startcol}"'" +=1; $"'"${obscol}"'" = "-" };
@@ -555,7 +561,7 @@
 			}' $infile > originalfile
 
 			#remove any "chr" prefixes
-			sed -i 's/chr//g' originalfile
+			#sed -i '2,$s/chr//g' originalfile
 			sed -i 's/omosome/chromosome/g' originalfile
 
 
@@ -565,16 +571,16 @@
 				FS="\t";
 				OFS="\t";
 			}{
-				if(FNR>1) { 			
-					printf("%s\t%s\t%s\t%s\t%s\n",$"'"${chrcol}"'",$"'"${startcol}"'",$"'"${endcol}"'",$"'"${refcol}"'",$"'"${obscol}"'");				
+				if(FNR>1) { 		                                    
+                                    printf("%s\t%s\t%s\t%s\t%s\n",$"'"${chrcol}"'",$"'"${startcol}"'",$"'"${endcol}"'",$"'"${refcol}"'",$"'"${obscol}"'");				
 				}
 			}	
 			END{
 			}' $infile > annovarinput
 
 			#remove any "chr" prefixes
-			sed -i 's/chr//g' annovarinput
-			sed 's/chr//g' $infile > originalfile
+			sed -i '2,$s/chr//g' annovarinput
+			sed '2,$s/chr//g' $infile > originalfile
 			sed -i 's/omosome/chromosome/g' originalfile
 	fi
 
@@ -745,9 +751,15 @@
 		echo -e "\ndbSNP region Annotation, version: $version"
 		$scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype ${version} annovarinput $humandb 2>&1
 	
+		columnname=${version}
+		if [[ $columnname == snp* ]]
+		then
+			columnname="db${version}"
+		fi
+
 		annovarout=annovarinput.${buildver}_${version}_dropped		
-		sed -i '1i\db\tdb'${version}'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
-		joinresults originalfile $annovarout 3 4 5 6 7 B.db${version}	
+		sed -i '1i\db\t'${columnname}'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.${columnname}	
 	
 
 	done
@@ -767,6 +779,8 @@
 			g1000_colheader_AMR="${version}_AMR"
 			g1000_colheader_ASN="${version}_ASN"
 			g1000_colheader_EUR="${version}_EUR"
+			g1000_colheader_EAS="${version}_EAS"
+			g1000_colheader_SAS="${version}_SAS"
 			g1000_colheader_CEU="${version}_CEU"
 			g1000_colheader_YRI="${version}_YRI"
 			g1000_colheader_JPTCHB="${version}_JPTCHB"
@@ -775,6 +789,8 @@
 			doAMR="N"
 			doAFR="N"
 			doASN="N"
+			doEAS="N"
+			doSAS="N"
 			doEUR="N"
 			doCEU="N"
 			doYRI="N"
@@ -791,7 +807,30 @@
 					doAFR="Y"
 					doASN="Y"
 					doEUR="Y"
-				fi		
+				fi
+			elif [ $version == "1000g2014oct" ]
+			then
+				fileID="2014_10"
+				doALL="Y"
+				doAMR="Y"
+				doAFR="Y"
+				doEUR="Y"
+				doEAS="Y"
+				if [ $buildver == "hg19" ]
+				then
+					doSAS="Y"
+				fi
+				
+			elif [[ $version == "1000g2015aug" && $buildver == "hg19" ]]
+			then
+				fileID="2015_08"				
+				doALL="Y"
+				doAMR="Y"
+				doAFR="Y"
+				doEUR="Y"
+				doEAS="Y"
+				doSAS="Y"
+					
 			elif [[ $version == "1000g2012feb" && $buildver == "hg19" ]]
 			then
 				fileID="2012_02"				
@@ -855,7 +894,29 @@
 				sed -i '1i\db\t'$g1000_colheader_ASN'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
 				joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_ASN	
 			fi
-
+			
+			# EAS
+			if [ $doEAS == "Y"  ]
+			then
+				echo -e "\n1000Genomes EAS"
+				$scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_eas" annovarinput $humandb 2>&1
+	
+				annovarout=annovarinput.${buildver}_EAS.sites.${fileID}_dropped
+				sed -i '1i\db\t'$g1000_colheader_EAS'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+				joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_EAS	
+			fi
+			
+			# SAS
+			if [ $doSAS == "Y"  ]
+			then
+				echo -e "\n1000Genomes SAS"
+				$scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_sas" annovarinput $humandb 2>&1
+	
+				annovarout=annovarinput.${buildver}_SAS.sites.${fileID}_dropped
+				sed -i '1i\db\t'$g1000_colheader_SAS'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+				joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_SAS	
+			fi
+			
 			# EUR
 			if [ $doEUR == "Y"  ]
 			then
@@ -937,6 +998,8 @@
 		$scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_pp2hvar annovarinput $humandb 2>&1
 	
 		annovarout=annovarinput.${buildver}_ljb2_pp2hvar_dropped
+		
+		head $annovarout
 		sed -i '1i\db\tLJB2_PolyPhen2_HVAR\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
 		joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_PolyPhen2_HVAR	
 	fi
@@ -1191,7 +1254,76 @@
 		done
 	fi	
 
+	
+	#ExAC-03 database 
+	if [ $exac03 == "Y" ]
+	then
+		echo -e "\nExAC03 Annotation"
+		$scriptsdir/annotate_variation.pl --filter -otherinfo --buildver $buildver --otherinfo -dbtype exac03 annovarinput $humandb 2>&1
+	        
+		#annovarout=annovarinput.${buildver}_exac03_dropped
+		
+		# split allelefrequency column into several columns, one per population
+		awk 'BEGIN{FS="\t"
+		           OFS="\t"		           
+		           }{		           
+		           gsub(",","\t",$2)
+		           print $0		           
+		           }END{}' annovarinput.${buildver}_exac03_dropped > $annovarout
+		
+		sed -i '1i\db\tExAC_Freq\tExAC_AFR\tExAC_AMR\tExAC_EAS\tExAC_FIN\tExAC_NFE\tExAC_OTH\tExAC_SAS\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 10 11 12 13 14 B.ExAC_Freq,B.ExAC_AFR,B.ExAC_AMR,B.ExAC_EAS,B.ExAC_FIN,B.ExAC_NFE,B.ExAC_OTH,B.ExAC_SAS	
+	fi
 
+    #GoNL database 
+	if [ $gonl == "Y" ]
+	then
+	
+            if [ $buildver == "hg19" ]
+            then
+		echo -e "\nGoNL Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver --otherinfo -dbtype generic -genericdbfile ${buildver}_gonl.txt annovarinput $humandb 2>&1
+	        
+	        ls
+		annovarout=annovarinput.${buildver}_generic_dropped
+		
+		head $annovarout
+		
+		sed -i '1i\db\tGoNL\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.GoNL	
+		
+            fi
+            
+	fi
+	
+	#SPIDEX database 
+	if [ $spidex == "Y" ]
+	then
+	
+        if [ $buildver == "hg19" ]
+        then
+			echo -e "\nSPIDEX Annotation"
+			$scriptsdir/annotate_variation.pl --filter --buildver $buildver --otherinfo -dbtype spidex annovarinput $humandb 2>&1
+			    
+			# split allelefrequency column into several columns, one per population
+		    awk 'BEGIN{FS="\t"
+		           OFS="\t"		           
+		           }{		           
+		           gsub(",","\t",$2)
+		           print $0		           
+		    }END{}' annovarinput.${buildver}_spidex_dropped > $annovarout    
+			
+			#annovarout=annovarinput.${buildver}_spidex_dropped
+		    #head $annovarout
+		
+			sed -i '1i\db\tSPIDEX_dpsi_max_tissue\tSPIDEX_dpsi_zscore\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+			joinresults originalfile $annovarout 4 5 6 7 8 B.SPIDEX_dpsi_max_tissue,B.SPIDEX_dpsi_zscore	
+		
+        fi
+            
+	fi
+	
+	
 	#GERP++
 	if [ $gerp == "Y" ]
 	then
@@ -1271,6 +1403,16 @@
 
 	fi
 	
+	if [[ $cosmic70 == "Y" && $buildver == "hg19" ]]
+	then
+		echo -e "\nCOSMIC70 Annotation"
+		$scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype cosmic70 annovarinput $humandb 2>&1
+	
+		annovarout="annovarinput.${buildver}_cosmic70_dropped"
+		sed -i '1i\db\tCOSMIC70\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 
+		joinresults originalfile $annovarout 3 4 5 6 7 B.COSMIC70
+
+	fi
 
 	if [[ $clinvar == "Y" && $buildver == "hg19" ]]
 	then
@@ -1356,6 +1498,10 @@
 	
 	cp originalfile_coords $outfile_all
 	cp annovarinput.invalid_input $outfile_invalid 2>&1
+	
+	sed -i 's/chrchr/chr/g' $outfile_all
+	sed -i 's/chrchr/chr/g' $outfile_invalid
+	
 fi #if $dorunannovar