comparison tools/annovar/annovar.sh @ 5:4600be69b96f draft

Added databases 1000g2015aug, SPIDEX, avsnp138, avsnp142, exac03
author saskia-hiltemann
date Thu, 01 Oct 2015 04:24:45 -0400
parents ff5325029a8e
children 69e2067a120d
comparison
equal deleted inserted replaced
4:e423536a0780 5:4600be69b96f
177 # PARSE PARAMETERS 177 # PARSE PARAMETERS
178 # 178 #
179 ################################# 179 #################################
180 180
181 181
182 set -- `getopt -n$0 -u -a --longoptions="inputfile: buildver: humandb: varfile: VCF: chrcol: startcol: endcol: refcol: obscol: vartypecol: convertcoords: geneanno: hgvs: verdbsnp: tfbs: mce: cytoband: segdup: dgv: gwas: ver1000g: cg46: cg69: impactscores: newimpactscores: otherinfo: esp: gerp: cosmic61: cosmic63: cosmic64: cosmic65: cosmic67: cosmic68: clinvar: nci60: outall: outfilt: outinvalid: scriptsdir: dorunannovar: dofilter: filt_dbsnp: filt1000GALL: filt1000GAFR: filt1000GAMR: filt1000GASN: filt1000GEUR: filtESP6500ALL: filtESP6500EA: filtESP6500AA: filtcg46: filtcg69: dummy:" "h:" "$@"` || usage 182 set -- `getopt -n$0 -u -a --longoptions="inputfile: buildver: humandb: varfile: VCF: chrcol: startcol: endcol: refcol: obscol: vartypecol: convertcoords: geneanno: hgvs: verdbsnp: tfbs: mce: cytoband: segdup: dgv: gwas: ver1000g: cg46: cg69: impactscores: newimpactscores: otherinfo: esp: exac03: spidex: gonl: gerp: cosmic61: cosmic63: cosmic64: cosmic65: cosmic67: cosmic68: clinvar: nci60: outall: outfilt: outinvalid: scriptsdir: dorunannovar: dofilter: filt_dbsnp: filt1000GALL: filt1000GAFR: filt1000GAMR: filt1000GASN: filt1000GEUR: filtESP6500ALL: filtESP6500EA: filtESP6500AA: filtcg46: filtcg69: dummy:" "h:" "$@"` || usage
183 [ $# -eq 0 ] && usage 183 [ $# -eq 0 ] && usage
184 184
185 185
186 186
187 while [ $# -gt 0 ] 187 while [ $# -gt 0 ]
214 --impactscores) impactscores=$2;shift;; # Y or N 214 --impactscores) impactscores=$2;shift;; # Y or N
215 --newimpactscores) newimpactscores=$2;shift;; # Y or N 215 --newimpactscores) newimpactscores=$2;shift;; # Y or N
216 --otherinfo) otherinfo=$2;shift;; 216 --otherinfo) otherinfo=$2;shift;;
217 --scriptsdir) scriptsdirtmp=$2;shift;; # Y or N 217 --scriptsdir) scriptsdirtmp=$2;shift;; # Y or N
218 --esp) esp=$2;shift;; # Y or N 218 --esp) esp=$2;shift;; # Y or N
219 --exac03) exac03=$2;shift;;
220 --gonl) gonl=$2;shift;;
221 --spidex) spidex=$2;shift;;
219 --gerp) gerp=$2;shift;; # Y or N 222 --gerp) gerp=$2;shift;; # Y or N
220 --cosmic61) cosmic61=$2;shift;; # Y or N 223 --cosmic61) cosmic61=$2;shift;; # Y or N
221 --cosmic63) cosmic63=$2;shift;; # Y or N 224 --cosmic63) cosmic63=$2;shift;; # Y or N
222 --cosmic64) cosmic64=$2;shift;; # Y or N 225 --cosmic64) cosmic64=$2;shift;; # Y or N
223 --cosmic65) cosmic65=$2;shift;; # Y or N 226 --cosmic65) cosmic65=$2;shift;; # Y or N
522 awk 'BEGIN{ 525 awk 'BEGIN{
523 FS="\t"; 526 FS="\t";
524 OFS="\t"; 527 OFS="\t";
525 }{ 528 }{
526 if(FNR>1) { 529 if(FNR>1) {
530 gsub(/chr/,"",$"'"${chrcol}"'")
527 if( $"'"${vartypecol}"'" == "snp" ){ $"'"${startcol}"'" += 1 }; 531 if( $"'"${vartypecol}"'" == "snp" ){ $"'"${startcol}"'" += 1 };
528 if( $"'"${vartypecol}"'" == "ins" ){ $"'"${refcol}"'" = "-" }; 532 if( $"'"${vartypecol}"'" == "ins" ){ $"'"${refcol}"'" = "-" };
529 if( $"'"${vartypecol}"'" == "del" ){ $"'"${startcol}"'" +=1; $"'"${obscol}"'" = "-" }; 533 if( $"'"${vartypecol}"'" == "del" ){ $"'"${startcol}"'" +=1; $"'"${obscol}"'" = "-" };
530 if( $"'"${vartypecol}"'" == "sub" ){ $"'"${startcol}"'" += 1 }; 534 if( $"'"${vartypecol}"'" == "sub" ){ $"'"${startcol}"'" += 1 };
531 535
534 } 538 }
535 END{ 539 END{
536 }' $infile > annovarinput 540 }' $infile > annovarinput
537 541
538 #remove any "chr" prefixes 542 #remove any "chr" prefixes
539 sed -i 's/chr//g' annovarinput 543 #sed -i '2,$s/chr//g' annovarinput
540 544
541 awk 'BEGIN{ 545 awk 'BEGIN{
542 FS="\t"; 546 FS="\t";
543 OFS="\t"; 547 OFS="\t";
544 }{ 548 }{
549
545 if(FNR>=1) { 550 if(FNR>=1) {
551 gsub(/chr/,"",$"'"${chrcol}"'")
546 if( $"'"${vartypecol}"'" == "snp" ){ $"'"${startcol}"'" += 1 }; 552 if( $"'"${vartypecol}"'" == "snp" ){ $"'"${startcol}"'" += 1 };
547 if( $"'"${vartypecol}"'" == "ins" ){ $"'"${refcol}"'" = "-" }; 553 if( $"'"${vartypecol}"'" == "ins" ){ $"'"${refcol}"'" = "-" };
548 if( $"'"${vartypecol}"'" == "del" ){ $"'"${startcol}"'" +=1; $"'"${obscol}"'" = "-" }; 554 if( $"'"${vartypecol}"'" == "del" ){ $"'"${startcol}"'" +=1; $"'"${obscol}"'" = "-" };
549 if( $"'"${vartypecol}"'" == "sub" ){ $"'"${startcol}"'" += 1 }; 555 if( $"'"${vartypecol}"'" == "sub" ){ $"'"${startcol}"'" += 1 };
550 556
553 } 559 }
554 END{ 560 END{
555 }' $infile > originalfile 561 }' $infile > originalfile
556 562
557 #remove any "chr" prefixes 563 #remove any "chr" prefixes
558 sed -i 's/chr//g' originalfile 564 #sed -i '2,$s/chr//g' originalfile
559 sed -i 's/omosome/chromosome/g' originalfile 565 sed -i 's/omosome/chromosome/g' originalfile
560 566
561 567
562 else #only rearrange columns if already 1-based coordinates 568 else #only rearrange columns if already 1-based coordinates
563 echo "rearranging columns " 569 echo "rearranging columns "
564 awk 'BEGIN{ 570 awk 'BEGIN{
565 FS="\t"; 571 FS="\t";
566 OFS="\t"; 572 OFS="\t";
567 }{ 573 }{
568 if(FNR>1) { 574 if(FNR>1) {
569 printf("%s\t%s\t%s\t%s\t%s\n",$"'"${chrcol}"'",$"'"${startcol}"'",$"'"${endcol}"'",$"'"${refcol}"'",$"'"${obscol}"'"); 575 printf("%s\t%s\t%s\t%s\t%s\n",$"'"${chrcol}"'",$"'"${startcol}"'",$"'"${endcol}"'",$"'"${refcol}"'",$"'"${obscol}"'");
570 } 576 }
571 } 577 }
572 END{ 578 END{
573 }' $infile > annovarinput 579 }' $infile > annovarinput
574 580
575 #remove any "chr" prefixes 581 #remove any "chr" prefixes
576 sed -i 's/chr//g' annovarinput 582 sed -i '2,$s/chr//g' annovarinput
577 sed 's/chr//g' $infile > originalfile 583 sed '2,$s/chr//g' $infile > originalfile
578 sed -i 's/omosome/chromosome/g' originalfile 584 sed -i 's/omosome/chromosome/g' originalfile
579 fi 585 fi
580 586
581 echo "...finished conversion" 587 echo "...finished conversion"
582 588
743 break 749 break
744 fi 750 fi
745 echo -e "\ndbSNP region Annotation, version: $version" 751 echo -e "\ndbSNP region Annotation, version: $version"
746 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype ${version} annovarinput $humandb 2>&1 752 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype ${version} annovarinput $humandb 2>&1
747 753
754 columnname=${version}
755 if [[ $columnname == snp* ]]
756 then
757 columnname="db${version}"
758 fi
759
748 annovarout=annovarinput.${buildver}_${version}_dropped 760 annovarout=annovarinput.${buildver}_${version}_dropped
749 sed -i '1i\db\tdb'${version}'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 761 sed -i '1i\db\t'${columnname}'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
750 joinresults originalfile $annovarout 3 4 5 6 7 B.db${version} 762 joinresults originalfile $annovarout 3 4 5 6 7 B.${columnname}
751 763
752 764
753 done 765 done
754 766
755 767
765 g1000_colheader_ALL="${version}_ALL" 777 g1000_colheader_ALL="${version}_ALL"
766 g1000_colheader_AFR="${version}_AFR" 778 g1000_colheader_AFR="${version}_AFR"
767 g1000_colheader_AMR="${version}_AMR" 779 g1000_colheader_AMR="${version}_AMR"
768 g1000_colheader_ASN="${version}_ASN" 780 g1000_colheader_ASN="${version}_ASN"
769 g1000_colheader_EUR="${version}_EUR" 781 g1000_colheader_EUR="${version}_EUR"
782 g1000_colheader_EAS="${version}_EAS"
783 g1000_colheader_SAS="${version}_SAS"
770 g1000_colheader_CEU="${version}_CEU" 784 g1000_colheader_CEU="${version}_CEU"
771 g1000_colheader_YRI="${version}_YRI" 785 g1000_colheader_YRI="${version}_YRI"
772 g1000_colheader_JPTCHB="${version}_JPTCHB" 786 g1000_colheader_JPTCHB="${version}_JPTCHB"
773 787
774 doALL="N" 788 doALL="N"
775 doAMR="N" 789 doAMR="N"
776 doAFR="N" 790 doAFR="N"
777 doASN="N" 791 doASN="N"
792 doEAS="N"
793 doSAS="N"
778 doEUR="N" 794 doEUR="N"
779 doCEU="N" 795 doCEU="N"
780 doYRI="N" 796 doYRI="N"
781 doJPTCHB="N" 797 doJPTCHB="N"
782 798
789 then 805 then
790 doAMR="Y" 806 doAMR="Y"
791 doAFR="Y" 807 doAFR="Y"
792 doASN="Y" 808 doASN="Y"
793 doEUR="Y" 809 doEUR="Y"
794 fi 810 fi
811 elif [ $version == "1000g2014oct" ]
812 then
813 fileID="2014_10"
814 doALL="Y"
815 doAMR="Y"
816 doAFR="Y"
817 doEUR="Y"
818 doEAS="Y"
819 if [ $buildver == "hg19" ]
820 then
821 doSAS="Y"
822 fi
823
824 elif [[ $version == "1000g2015aug" && $buildver == "hg19" ]]
825 then
826 fileID="2015_08"
827 doALL="Y"
828 doAMR="Y"
829 doAFR="Y"
830 doEUR="Y"
831 doEAS="Y"
832 doSAS="Y"
833
795 elif [[ $version == "1000g2012feb" && $buildver == "hg19" ]] 834 elif [[ $version == "1000g2012feb" && $buildver == "hg19" ]]
796 then 835 then
797 fileID="2012_02" 836 fileID="2012_02"
798 doALL="Y" 837 doALL="Y"
799 elif [[ $version == "1000g2010nov" && $buildver == "hg19" ]] 838 elif [[ $version == "1000g2010nov" && $buildver == "hg19" ]]
853 892
854 annovarout=annovarinput.${buildver}_ASN.sites.${fileID}_dropped 893 annovarout=annovarinput.${buildver}_ASN.sites.${fileID}_dropped
855 sed -i '1i\db\t'$g1000_colheader_ASN'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 894 sed -i '1i\db\t'$g1000_colheader_ASN'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
856 joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_ASN 895 joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_ASN
857 fi 896 fi
858 897
898 # EAS
899 if [ $doEAS == "Y" ]
900 then
901 echo -e "\n1000Genomes EAS"
902 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_eas" annovarinput $humandb 2>&1
903
904 annovarout=annovarinput.${buildver}_EAS.sites.${fileID}_dropped
905 sed -i '1i\db\t'$g1000_colheader_EAS'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
906 joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_EAS
907 fi
908
909 # SAS
910 if [ $doSAS == "Y" ]
911 then
912 echo -e "\n1000Genomes SAS"
913 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_sas" annovarinput $humandb 2>&1
914
915 annovarout=annovarinput.${buildver}_SAS.sites.${fileID}_dropped
916 sed -i '1i\db\t'$g1000_colheader_SAS'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
917 joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_SAS
918 fi
919
859 # EUR 920 # EUR
860 if [ $doEUR == "Y" ] 921 if [ $doEUR == "Y" ]
861 then 922 then
862 echo -e "\n1000Genomes EUR" 923 echo -e "\n1000Genomes EUR"
863 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_eur" annovarinput $humandb 2>&1 924 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_eur" annovarinput $humandb 2>&1
935 then 996 then
936 echo -e "\nLJB2 pp2hvar Annotation" 997 echo -e "\nLJB2 pp2hvar Annotation"
937 $scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_pp2hvar annovarinput $humandb 2>&1 998 $scriptsdir/annotate_variation.pl --filter --buildver $buildver $otherinfo -dbtype ljb2_pp2hvar annovarinput $humandb 2>&1
938 999
939 annovarout=annovarinput.${buildver}_ljb2_pp2hvar_dropped 1000 annovarout=annovarinput.${buildver}_ljb2_pp2hvar_dropped
1001
1002 head $annovarout
940 sed -i '1i\db\tLJB2_PolyPhen2_HVAR\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 1003 sed -i '1i\db\tLJB2_PolyPhen2_HVAR\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
941 joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_PolyPhen2_HVAR 1004 joinresults originalfile $annovarout 3 4 5 6 7 B.LJB2_PolyPhen2_HVAR
942 fi 1005 fi
943 1006
944 if [ $ljb2_lrt == "Y" ] 1007 if [ $ljb2_lrt == "Y" ]
1189 fi 1252 fi
1190 1253
1191 done 1254 done
1192 fi 1255 fi
1193 1256
1194 1257
1258 #ExAC-03 database
1259 if [ $exac03 == "Y" ]
1260 then
1261 echo -e "\nExAC03 Annotation"
1262 $scriptsdir/annotate_variation.pl --filter -otherinfo --buildver $buildver --otherinfo -dbtype exac03 annovarinput $humandb 2>&1
1263
1264 #annovarout=annovarinput.${buildver}_exac03_dropped
1265
1266 # split allelefrequency column into several columns, one per population
1267 awk 'BEGIN{FS="\t"
1268 OFS="\t"
1269 }{
1270 gsub(",","\t",$2)
1271 print $0
1272 }END{}' annovarinput.${buildver}_exac03_dropped > $annovarout
1273
1274 sed -i '1i\db\tExAC_Freq\tExAC_AFR\tExAC_AMR\tExAC_EAS\tExAC_FIN\tExAC_NFE\tExAC_OTH\tExAC_SAS\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
1275 joinresults originalfile $annovarout 10 11 12 13 14 B.ExAC_Freq,B.ExAC_AFR,B.ExAC_AMR,B.ExAC_EAS,B.ExAC_FIN,B.ExAC_NFE,B.ExAC_OTH,B.ExAC_SAS
1276 fi
1277
1278 #GoNL database
1279 if [ $gonl == "Y" ]
1280 then
1281
1282 if [ $buildver == "hg19" ]
1283 then
1284 echo -e "\nGoNL Annotation"
1285 $scriptsdir/annotate_variation.pl --filter --buildver $buildver --otherinfo -dbtype generic -genericdbfile ${buildver}_gonl.txt annovarinput $humandb 2>&1
1286
1287 ls
1288 annovarout=annovarinput.${buildver}_generic_dropped
1289
1290 head $annovarout
1291
1292 sed -i '1i\db\tGoNL\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
1293 joinresults originalfile $annovarout 3 4 5 6 7 B.GoNL
1294
1295 fi
1296
1297 fi
1298
1299 #SPIDEX database
1300 if [ $spidex == "Y" ]
1301 then
1302
1303 if [ $buildver == "hg19" ]
1304 then
1305 echo -e "\nSPIDEX Annotation"
1306 $scriptsdir/annotate_variation.pl --filter --buildver $buildver --otherinfo -dbtype spidex annovarinput $humandb 2>&1
1307
1308 # split allelefrequency column into several columns, one per population
1309 awk 'BEGIN{FS="\t"
1310 OFS="\t"
1311 }{
1312 gsub(",","\t",$2)
1313 print $0
1314 }END{}' annovarinput.${buildver}_spidex_dropped > $annovarout
1315
1316 #annovarout=annovarinput.${buildver}_spidex_dropped
1317 #head $annovarout
1318
1319 sed -i '1i\db\tSPIDEX_dpsi_max_tissue\tSPIDEX_dpsi_zscore\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
1320 joinresults originalfile $annovarout 4 5 6 7 8 B.SPIDEX_dpsi_max_tissue,B.SPIDEX_dpsi_zscore
1321
1322 fi
1323
1324 fi
1325
1326
1195 #GERP++ 1327 #GERP++
1196 if [ $gerp == "Y" ] 1328 if [ $gerp == "Y" ]
1197 then 1329 then
1198 echo -e "\nGERP++ Annotation" 1330 echo -e "\nGERP++ Annotation"
1199 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype gerp++gt2 annovarinput $humandb 2>&1 1331 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype gerp++gt2 annovarinput $humandb 2>&1
1269 sed -i '1i\db\tCOSMIC68\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout 1401 sed -i '1i\db\tCOSMIC68\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
1270 joinresults originalfile $annovarout 3 4 5 6 7 B.COSMIC68 1402 joinresults originalfile $annovarout 3 4 5 6 7 B.COSMIC68
1271 1403
1272 fi 1404 fi
1273 1405
1406 if [[ $cosmic70 == "Y" && $buildver == "hg19" ]]
1407 then
1408 echo -e "\nCOSMIC70 Annotation"
1409 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype cosmic70 annovarinput $humandb 2>&1
1410
1411 annovarout="annovarinput.${buildver}_cosmic70_dropped"
1412 sed -i '1i\db\tCOSMIC70\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
1413 joinresults originalfile $annovarout 3 4 5 6 7 B.COSMIC70
1414
1415 fi
1274 1416
1275 if [[ $clinvar == "Y" && $buildver == "hg19" ]] 1417 if [[ $clinvar == "Y" && $buildver == "hg19" ]]
1276 then 1418 then
1277 echo -e "\nCLINVAR Annotation" 1419 echo -e "\nCLINVAR Annotation"
1278 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype clinvar_20140211 annovarinput $humandb 2>&1 1420 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype clinvar_20140211 annovarinput $humandb 2>&1
1354 echo "Congrats, your input file contained no invalid lines!" > annovarinput.invalid_input 1496 echo "Congrats, your input file contained no invalid lines!" > annovarinput.invalid_input
1355 fi 1497 fi
1356 1498
1357 cp originalfile_coords $outfile_all 1499 cp originalfile_coords $outfile_all
1358 cp annovarinput.invalid_input $outfile_invalid 2>&1 1500 cp annovarinput.invalid_input $outfile_invalid 2>&1
1501
1502 sed -i 's/chrchr/chr/g' $outfile_all
1503 sed -i 's/chrchr/chr/g' $outfile_invalid
1504
1359 fi #if $dorunannovar 1505 fi #if $dorunannovar
1360 1506
1361 1507
1362 1508
1363 1509