Mercurial > repos > saskia-hiltemann > annovar
diff tools/annovar/annovar.sh @ 5:4600be69b96f draft
Added databases 1000g2015aug, SPIDEX, avsnp138, avsnp142, exac03
author | saskia-hiltemann |
---|---|
date | Thu, 01 Oct 2015 04:24:45 -0400 |
parents | ff5325029a8e |
children | 69e2067a120d |
line wrap: on
line diff
--- a/tools/annovar/annovar.sh Thu Apr 10 09:31:26 2014 -0400 +++ b/tools/annovar/annovar.sh Thu Oct 01 04:24:45 2015 -0400 @@ -179,7 +179,7 @@ ################################# -set -- `getopt -n$0 -u -a --longoptions="inputfile: buildver: humandb: varfile: VCF: chrcol: startcol: endcol: refcol: obscol: vartypecol: convertcoords: geneanno: hgvs: verdbsnp: tfbs: mce: cytoband: segdup: dgv: gwas: ver1000g: cg46: cg69: impactscores: newimpactscores: otherinfo: esp: gerp: cosmic61: cosmic63: cosmic64: cosmic65: cosmic67: cosmic68: clinvar: nci60: outall: outfilt: outinvalid: scriptsdir: dorunannovar: dofilter: filt_dbsnp: filt1000GALL: filt1000GAFR: filt1000GAMR: filt1000GASN: filt1000GEUR: filtESP6500ALL: filtESP6500EA: filtESP6500AA: filtcg46: filtcg69: dummy:" "h:" "$@"` || usage +set -- `getopt -n$0 -u -a --longoptions="inputfile: buildver: humandb: varfile: VCF: chrcol: startcol: endcol: refcol: obscol: vartypecol: convertcoords: geneanno: hgvs: verdbsnp: tfbs: mce: cytoband: segdup: dgv: gwas: ver1000g: cg46: cg69: impactscores: newimpactscores: otherinfo: esp: exac03: spidex: gonl: gerp: cosmic61: cosmic63: cosmic64: cosmic65: cosmic67: cosmic68: clinvar: nci60: outall: outfilt: outinvalid: scriptsdir: dorunannovar: dofilter: filt_dbsnp: filt1000GALL: filt1000GAFR: filt1000GAMR: filt1000GASN: filt1000GEUR: filtESP6500ALL: filtESP6500EA: filtESP6500AA: filtcg46: filtcg69: dummy:" "h:" "$@"` || usage [ $# -eq 0 ] && usage @@ -216,6 +216,9 @@ --otherinfo) otherinfo=$2;shift;; --scriptsdir) scriptsdirtmp=$2;shift;; # Y or N --esp) esp=$2;shift;; # Y or N + --exac03) exac03=$2;shift;; + --gonl) gonl=$2;shift;; + --spidex) spidex=$2;shift;; --gerp) gerp=$2;shift;; # Y or N --cosmic61) cosmic61=$2;shift;; # Y or N --cosmic63) cosmic63=$2;shift;; # Y or N @@ -524,6 +527,7 @@ OFS="\t"; }{ if(FNR>1) { + gsub(/chr/,"",$"'"${chrcol}"'") if( $"'"${vartypecol}"'" == "snp" ){ $"'"${startcol}"'" += 1 }; if( $"'"${vartypecol}"'" == "ins" ){ $"'"${refcol}"'" = "-" }; if( $"'"${vartypecol}"'" == "del" ){ $"'"${startcol}"'" +=1; $"'"${obscol}"'" = "-" }; @@ -536,13 +540,15 @@ }' $infile > annovarinput #remove any "chr" prefixes - sed -i 's/chr//g' annovarinput + #sed -i '2,$s/chr//g' annovarinput awk 'BEGIN{ FS="\t"; - OFS="\t"; + OFS="\t"; }{ + if(FNR>=1) { + gsub(/chr/,"",$"'"${chrcol}"'") if( $"'"${vartypecol}"'" == "snp" ){ $"'"${startcol}"'" += 1 }; if( $"'"${vartypecol}"'" == "ins" ){ $"'"${refcol}"'" = "-" }; if( $"'"${vartypecol}"'" == "del" ){ $"'"${startcol}"'" +=1; $"'"${obscol}"'" = "-" }; @@ -555,7 +561,7 @@ }' $infile > originalfile #remove any "chr" prefixes - sed -i 's/chr//g' originalfile + #sed -i '2,$s/chr//g' originalfile sed -i 's/omosome/chromosome/g' originalfile @@ -565,16 +571,16 @@ FS="\t"; OFS="\t"; }{ - if(FNR>1) { - printf("%s\t%s\t%s\t%s\t%s\n",$"'"${chrcol}"'",$"'"${startcol}"'",$"'"${endcol}"'",$"'"${refcol}"'",$"'"${obscol}"'"); + if(FNR>1) { + printf("%s\t%s\t%s\t%s\t%s\n",$"'"${chrcol}"'",$"'"${startcol}"'",$"'"${endcol}"'",$"'"${refcol}"'",$"'"${obscol}"'"); } } END{ }' $infile > annovarinput #remove any "chr" prefixes - sed -i 's/chr//g' annovarinput - sed 's/chr//g' $infile > originalfile + sed -i '2,$s/chr//g' annovarinput + sed '2,$s/chr//g' $infile > originalfile sed -i 's/omosome/chromosome/g' originalfile fi @@ -745,9 +751,15 @@ echo -e "\ndbSNP region Annotation, version: $version" $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype ${version} annovarinput $humandb 2>&1 + columnname=${version} + if [[ $columnname == snp* ]] + then + columnname="db${version}" + fi + annovarout=annovarinput.${buildver}_${version}_dropped - sed -i '1i\db\tdb'${version}'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout - joinresults originalfile $annovarout 3 4 5 6 7 B.db${version} + sed -i '1i\db\t'${columnname}'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout + joinresults originalfile $annovarout 3 4 5 6 7 B.${columnname} done @@ -767,6 +779,8 @@ g1000_colheader_AMR="${version}_AMR" g1000_colheader_ASN="${version}_ASN" g1000_colheader_EUR="${version}_EUR" + g1000_colheader_EAS="${version}_EAS" + g1000_colheader_SAS="${version}_SAS" g1000_colheader_CEU="${version}_CEU" g1000_colheader_YRI="${version}_YRI" g1000_colheader_JPTCHB="${version}_JPTCHB" @@ -775,6 +789,8 @@ doAMR="N" doAFR="N" doASN="N" + doEAS="N" + doSAS="N" doEUR="N" doCEU="N" doYRI="N" @@ -791,7 +807,30 @@ doAFR="Y" doASN="Y" doEUR="Y" - fi + fi + elif [ $version == "1000g2014oct" ] + then + fileID="2014_10" + doALL="Y" + doAMR="Y" + doAFR="Y" + doEUR="Y" + doEAS="Y" + if [ $buildver == "hg19" ] + then + doSAS="Y" + fi + + elif [[ $version == "1000g2015aug" && $buildver == "hg19" ]] + then + fileID="2015_08" + doALL="Y" + doAMR="Y" + doAFR="Y" + doEUR="Y" + doEAS="Y" + doSAS="Y" + elif [[ $version == "1000g2012feb" && $buildver == "hg19" ]] then fileID="2012_02" @@ -855,7 +894,29 @@ sed -i '1i\db\t'$g1000_colheader_ASN'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_ASN fi - + + # EAS + if [ $doEAS == "Y" ] + then + echo -e "\n1000Genomes EAS" + $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_eas" annovarinput $humandb 2>&1 + + annovarout=annovarinput.${buildver}_EAS.sites.${fileID}_dropped + sed -i '1i\db\t'$g1000_colheader_EAS'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout + joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_EAS + fi + + # SAS + if [ $doSAS == "Y" ] + then + echo -e "\n1000Genomes SAS" + $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_sas" annovarinput $humandb 2>&1 + + annovarout=annovarinput.${buildver}_SAS.sites.${fileID}_dropped + sed -i '1i\db\t'$g1000_colheader_SAS'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout + joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_SAS + fi + # EUR if [ $doEUR == "Y" ] then @@ -937,6 +998,8 @@ $scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_pp2hvar annovarinput $humandb 2>&1 annovarout=annovarinput.${buildver}_ljb2_pp2hvar_dropped + + head $annovarout sed -i '1i\db\tLJB2_PolyPhen2_HVAR\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_PolyPhen2_HVAR fi @@ -1191,7 +1254,76 @@ done fi + + #ExAC-03 database + if [ $exac03 == "Y" ] + then + echo -e "\nExAC03 Annotation" + $scriptsdir/annotate_variation.pl --filter -otherinfo --buildver $buildver --otherinfo -dbtype exac03 annovarinput $humandb 2>&1 + + #annovarout=annovarinput.${buildver}_exac03_dropped + + # split allelefrequency column into several columns, one per population + awk 'BEGIN{FS="\t" + OFS="\t" + }{ + gsub(",","\t",$2) + print $0 + }END{}' annovarinput.${buildver}_exac03_dropped > $annovarout + + sed -i '1i\db\tExAC_Freq\tExAC_AFR\tExAC_AMR\tExAC_EAS\tExAC_FIN\tExAC_NFE\tExAC_OTH\tExAC_SAS\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout + joinresults originalfile $annovarout 10 11 12 13 14 B.ExAC_Freq,B.ExAC_AFR,B.ExAC_AMR,B.ExAC_EAS,B.ExAC_FIN,B.ExAC_NFE,B.ExAC_OTH,B.ExAC_SAS + fi + #GoNL database + if [ $gonl == "Y" ] + then + + if [ $buildver == "hg19" ] + then + echo -e "\nGoNL Annotation" + $scriptsdir/annotate_variation.pl --filter --buildver $buildver --otherinfo -dbtype generic -genericdbfile ${buildver}_gonl.txt annovarinput $humandb 2>&1 + + ls + annovarout=annovarinput.${buildver}_generic_dropped + + head $annovarout + + sed -i '1i\db\tGoNL\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout + joinresults originalfile $annovarout 3 4 5 6 7 B.GoNL + + fi + + fi + + #SPIDEX database + if [ $spidex == "Y" ] + then + + if [ $buildver == "hg19" ] + then + echo -e "\nSPIDEX Annotation" + $scriptsdir/annotate_variation.pl --filter --buildver $buildver --otherinfo -dbtype spidex annovarinput $humandb 2>&1 + + # split allelefrequency column into several columns, one per population + awk 'BEGIN{FS="\t" + OFS="\t" + }{ + gsub(",","\t",$2) + print $0 + }END{}' annovarinput.${buildver}_spidex_dropped > $annovarout + + #annovarout=annovarinput.${buildver}_spidex_dropped + #head $annovarout + + sed -i '1i\db\tSPIDEX_dpsi_max_tissue\tSPIDEX_dpsi_zscore\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout + joinresults originalfile $annovarout 4 5 6 7 8 B.SPIDEX_dpsi_max_tissue,B.SPIDEX_dpsi_zscore + + fi + + fi + + #GERP++ if [ $gerp == "Y" ] then @@ -1271,6 +1403,16 @@ fi + if [[ $cosmic70 == "Y" && $buildver == "hg19" ]] + then + echo -e "\nCOSMIC70 Annotation" + $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype cosmic70 annovarinput $humandb 2>&1 + + annovarout="annovarinput.${buildver}_cosmic70_dropped" + sed -i '1i\db\tCOSMIC70\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout + joinresults originalfile $annovarout 3 4 5 6 7 B.COSMIC70 + + fi if [[ $clinvar == "Y" && $buildver == "hg19" ]] then @@ -1356,6 +1498,10 @@ cp originalfile_coords $outfile_all cp annovarinput.invalid_input $outfile_invalid 2>&1 + + sed -i 's/chrchr/chr/g' $outfile_all + sed -i 's/chrchr/chr/g' $outfile_invalid + fi #if $dorunannovar