Mercurial > repos > rdaveau > gfap
view gfapts/inc/annovar/convert2annovar.pl @ 0:f753b30013e6 draft
Uploaded
author | rdaveau |
---|---|
date | Fri, 29 Jun 2012 10:20:55 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl use warnings; use strict; use Getopt::Long; use Pod::Usage; our $VERSION = '$Revision: 466 $'; our $LAST_CHANGED_DATE = '$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $'; our ($verbose, $help, $man); our ($variantfile); our ($outfile, $format, $includeinfo, $snpqual, $snppvalue, $coverage, $maxcoverage, $chr, $chrmt, $altcov, $fraction, $species, $filterword, $confraction, $allallele); our %iupac = (R=>'AG', Y=>'CT', S=>'GC', W=>'AT', K=>'GT', M=>'AC', A=>'AA', C=>'CC', G=>'GG', T=>'TT', B=>'CGT', D=>'AGT', H=>'ACT', V=>'ACG', N=>'ACGT', '.'=>'-', '-'=>'-'); GetOptions('verbose'=>\$verbose, 'help|h'=>\$help, 'man'=>\$man, 'outfile=s'=>\$outfile, 'format=s'=>\$format, 'includeinfo'=>\$includeinfo, 'snpqual=f'=>\$snpqual, 'snppvalue=f'=>\$snppvalue, 'coverage=i'=>\$coverage, 'maxcoverage=i'=>\$maxcoverage, 'chr=s'=>\$chr, 'chrmt=s'=>\$chrmt, 'fraction=f'=>\$fraction, 'altcov=i'=>\$altcov, 'species'=>\$species, 'filter=s'=>\$filterword, 'confraction=f'=>\$confraction, 'allallele!'=>\$allallele) or pod2usage (); $help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT); $man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT); @ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT); @ARGV == 1 or pod2usage ("Syntax error"); ($variantfile) = @ARGV; $chrmt ||= 'M'; if (not $format) { $format = 'pileup'; print STDERR "NOTICE: the default --format argument is set as 'pileup'\n"; } if (defined $outfile) { open (STDOUT, ">$outfile") or die "Error: cannot write to output file $outfile: $!\n"; } defined $snpqual and $format eq 'pileup' || $format eq 'vcf4' || pod2usage ("Error in argument: the --snpqual is supported only for the 'pileup' or 'vcf4' format"); defined $snppvalue and $format eq 'gff3-solid' || pod2usage ("Error in argument: the --snppvalue is supported only for the 'gff3-solid' format"); if (not defined $snpqual and $format eq 'pileup') { $snpqual = 20; print STDERR "NOTICE: the default --snpqual argument for pileup format is set as 20\n"; } if (not defined $snppvalue) { $snppvalue = 1; #by default, do not use any of the P-value cutoff in filtering out GFF3-SOLID files (this is differnt from handling pileup files) } if (not defined $coverage) { $coverage = 0; } if (defined $fraction) { $format eq 'pileup' or $format eq 'vcf4' or pod2usage ("Error in argument: the '--fraction' argument is supported for the pileup or vcf4 format only"); $format eq 'vcf4' and print STDERR "NOTICE: the --fraction argument works ONLY on indels for vcf4 format\n"; $fraction >= 0 and $fraction <=1 or pod2suage ("Error in argument: the --fraction argument must be between 0 and 1 inclusive"); } else { $fraction = 0; } if (defined $confraction) { $format eq 'vcf4' and print STDERR "NOTICE: the --confraction argument works ONLY on indels for vcf4 format\n"; $confraction >= 0 and $fraction <=1 or pod2suage ("Error in argument: the --confraction argument must be between 0 and 1 inclusive"); } else { $confraction = 0; } if (defined $altcov) { $format eq 'pileup' or pod2usage ("Error in argument: the '--altcov' argument is supported for the '--format pileup' only"); $altcov < $coverage or pod2suage ("Error in argument: the --altcov argument must be less than --coverage"); $altcov > 0 or pod2suage ("Error in argument: the --altcov argument must be a positive integer"); } if (defined $species) { $format eq 'gff3-solid' or pod2usage ("Error in argument: the '--species' argument is only necessary for the '--format gff3-solid'"); } if ($allallele) { $format eq 'vcf4' or pod2usage ("Error in argument: the '--allallele' argument is only supported for the '--format vcf4'"); } if ($format eq 'pileup') { convertPileup ($variantfile); } elsif ($format eq 'cg') { convertCG ($variantfile); } elsif ($format eq 'gff3-solid') { convertGFF3SolidSNP ($variantfile); } elsif ($format eq 'soap') { print STDERR "WARNING: the support for '--format soap' is not well developed yet and may contain bugs for indel analysis.\n"; convertSOAP ($variantfile); } elsif ($format eq 'maq') { print STDERR "WARNING: the support for '--format maq' is not well developed yet and may contain bugs.\n"; convertMAQSNP ($variantfile); } elsif ($format eq 'casava') { if (not defined $chr) { pod2usage ("Error in argument: please supply --chr argument for the '--format casava'"); } convertCASAVA ($variantfile, $chr); } elsif ($format eq 'vcf4') { convertVCF4 ($variantfile); } elsif ($format eq 'annovar') { convertANNOVAR ($variantfile); } else { pod2usage ("Error in argument: the --format $format is not currently supported. Please contact ANNOVAR developer for adding the support"); } sub convertPileup { my ($variantfile) = @_; my ($countline, $countvar, $counthom, $counthet, $countindel, $countsnp, $countti, $counttv) = qw/0 0 0 0 0 0 0 0/; if ($variantfile eq 'stdin') { *VAR = *STDIN; } else { open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; } print STDERR "NOTICE: Column 6-9 in output are heterozygosity status, SNP quality, total reads, reads with mutation\n"; while (<VAR>) { s/[\r\n]+$//; $countline++; my $hom = 'hom'; my @field = split (/\t/, $_); @field >= 10 or die "Error: invalid record found in pileupfile $variantfile (at least 10 fields expected): <$_>\n"; my ($chr, $pos, $wt, $call, @other) = @field; my ($cons_qual, $snp_quality, $readcount, $readallele) = @field[4,5,7,8]; $chr =~ s/^chr//; $wt = uc $wt; #wt may or may not in upper case, it depends on the input FASTA file $call = uc $call; #usually call is in upper case $readallele = uc $readallele; #lower case indicate the opposite strand $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed $snp_quality >= $snpqual or next; #quality of the variant call $readcount >= $coverage or next; #read count of the variant call $maxcoverage and $readcount <= $maxcoverage || next; #maximum coverage of the variant call if ($wt eq '*') { #indel #example: #1 970271 * +C/+C 39 106 44 5 +C * 1 4 0 0 0 #1 1548977 * */+CCG 29 29 42 3 * +CCG 2 1 0 0 0 #1 1674810 * */+C 24 24 42 6 * +C 5 1 0 0 0 #1 968466 * -CT/-CT 53 339 55 5 -CT * 5 0 0 0 0 #1 1093600 * -GAAA/* 29 29 53 3 -GAAA * 1 2 0 0 0 #1 1110101 * */-A 41 41 17 6 * -A 5 1 0 0 0 #1 1215395 * */-TC 26 26 32 4 * -TC 3 1 0 0 0 my @obs = split (/\//, $call); #example: '+AG/+AG' as homozygotes, '*/-TA' or '*/+T' as heterozygotes @obs == 2 or die "Error: pileup record contains more than two alternative alleles: <$_>\n"; my ($end, $ref, $alt); my ($indelreadcount); #number of reads supporting the indel if ($obs[0] eq $obs[1]) { #something weird may occur in SamTools output: 22 15231121 * */* 360 0 32 158 * +GA 156 2 0 0 0 $obs[0] eq '*' and next; #for deletions, SAMTOOLS represent deletion using a location before the first deleted base in the reference sequence coordinate system #for example, a deletion in Samtools output is "1 109266688 * */-CTT 1429 1429 58 43 * -CTT 24 19 0 0 0" #the correct ANNOVAR input (for rs35029887) should be "1 109266689 109266691 CTT - het 1429" #insertions are fine without change; for example, check rs5745466 in Genome Browser; samtools report "1 76119508 * +AT/+AT" #for this insertion, ANNOVAR input file (for rs5745466) becomes "1 76119508 76119508 - AT hom 1601" if ($obs[0] =~ m/^\-/) { $pos++; #add 1 to position in deletion } $indelreadcount = calculateIndelReadCount ($obs[0], \@field); $indelreadcount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold defined $altcov and $indelreadcount >= $altcov || next; ($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[0]); print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n"; $counthom++; } else { $hom = 'het'; if ($obs[0] =~ m/^[\-\+]/) { $obs[0] =~ m/^\-/ and $pos++; ($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[0]); $indelreadcount = calculateIndelReadCount ($obs[0], \@field); $indelreadcount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold defined $altcov and $indelreadcount >= $altcov || next; print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n"; $counthet++; } if ($obs[1] =~ m/^[\-\+]/) { $obs[1] =~ m/^\-/ and $pos++; ($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[1]); $indelreadcount = calculateIndelReadCount ($obs[1], \@field); $indelreadcount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold defined $altcov and $indelreadcount >= $altcov || next; print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n"; $counthet++; } } $countindel++; } else { #1 798494 G A 36 36 58 3 AAA bbb #1 798677 T K 33 33 52 26 ,$.,,G.GG,.,......,..G,,... b`bbBaJIbFbZWaTNQbb_VZcbbb #1 856182 G A 42 42 50 5 AAAAA B\bbb #1 861034 A M 48 48 49 14 ,$,.,..,cc.c.,c bbBbb`]BFbHbBB #1 864289 T K 22 22 56 6 .g,,g, BH^_BB $wt eq $call and next; #this is not a SNP my $obs = $iupac{$call} or die "Error: invalid best call ($call) in <$_>\n"; my @obs = split (//, $obs); @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n"; if ($obs[0] ne $obs[1]) { $hom = 'het'; } if ($obs[0] eq $wt) { #obs[0] is guaranteed to be an alternative allele @obs = @obs[1,0]; } if ($wt eq 'A' and $obs[0] eq 'G' or $wt eq 'G' and $obs[0] eq 'A' or $wt eq 'C' and $obs[0] eq 'T' or $wt eq 'T' and $obs[0] eq 'C') { unless ($wt ne $obs[0] and $wt ne $obs[1] and $obs[0] ne $obs[1]) { $countti++; } } else { unless ($wt ne $obs[0] and $wt ne $obs[1] and $obs[0] ne $obs[1]) { $counttv++; } } my $mutallelecount; if ($obs[1] eq $wt) { #het SNP if ($chr eq $chrmt) { $hom = calculateAllelicFraction ($obs[0], $field[8], $readcount); } $mutallelecount = calculateMutAlleleCount ($obs[0], $readallele); $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold defined $altcov and $mutallelecount >= $altcov || next; print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n"; $counthet++; } elsif ($obs[1] ne $obs[0]) { #het SNP but both differ from reference allele if ($chr eq $chrmt) { $hom = calculateAllelicFraction ($obs[1], $field[8], $readcount); } $mutallelecount = calculateMutAlleleCount ($obs[1], $readallele); $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold defined $altcov and $mutallelecount >= $altcov || next; print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[1], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n"; if ($chr eq $chrmt) { $hom = calculateAllelicFraction ($obs[0], $field[8], $readcount); } $mutallelecount = calculateMutAlleleCount ($obs[0], $readallele); $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold defined $altcov and $mutallelecount >= $altcov || next; print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n"; $counthet++; $counthet++; } else { #homo SNP if ($chr eq $chrmt) { $hom = calculateAllelicFraction ($obs[0], $field[8], $readcount); } $mutallelecount = calculateMutAlleleCount ($obs[0], $readallele); $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold defined $altcov and $mutallelecount >= $altcov || next; print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n"; $counthom++; } $countsnp++; } $countvar++; } my $triallelic = $countsnp-$countti-$counttv; print STDERR "NOTICE: Read $countline lines and wrote ${\($counthet+$counthom)} different variants at $countvar genomic positions ($countsnp SNPs and $countindel indels)\n"; print STDERR "NOTICE: Among ${\($counthet+$counthom)} different variants at $countvar positions, $counthet are heterozygotes, $counthom are homozygotes\n"; print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions", $triallelic?", $triallelic have more than 2 alleles\n":"\n"; } sub calculateIndelReadCount { my ($obs, $field) = @_; #make sure to use upper case in the comparison, for example: #chr10 130133 * */-ca 189 189 59 31 * -ca 27 4 0 0 0 if ($obs eq uc $field->[8]) { return $field->[10]; } elsif ($obs eq uc $field->[9]) { return $field->[11]; } else { die "Error: invalid record in pileup file (indel counts cannot be inferred): <$obs> vs <@$field>\n"; } } sub calculateMutAlleleCount { my ($allele, $string) = @_; #they should be already both in upper case $string =~ s/\^.//g; #^ is followed by mapping quality $string =~ s/\$//g; $string =~ s/[+-]1[^\d]//g; #1 followed by a non-number $string =~ s/[+-]2..//g; $string =~ s/[+-]3...//g; $string =~ s/[+-]4....//g; $string =~ s/[+-]5.....//g; $string =~ s/[+-]6......//g; $string =~ s/[+-]7.......//g; $string =~ s/[+-]8........//g; $string =~ s/[+-]9.........//g; $string =~ s/[+-]10..........//g; #make sure to use upper case letter my @string = split (//, uc $string); my $count = 0; for my $i (0 .. @string-1) { $allele eq $string[$i] and $count++; } return $count; } sub calculateAllelicFraction { my ($obs, $readbase, $readcount) = @_; my @readbase = split (//, $readbase); my $count=0; for my $i (0 .. @readbase-1) { uc $obs eq uc $readbase[$i] and $count++; } my $hom = $count/$readcount; length ($hom) > 5 and $hom > 0.001 and $hom = sprintf ("%.3f", $hom); return $hom; } sub recalculateEndRefObs { #recalculate end position, reference allele and observed allele my ($end, $ref, $obs) = @_; if ($obs =~ m/^\-(\w+)/) { #deletion $end += (length ($1)-1); $ref = $1; $obs = '-'; } elsif ($obs =~ m/^\+(\w+)/) { #insertion $ref = '-'; $obs = $1; } else { die "Error: cannot handle $end, $ref, $obs\n"; } return ($end, $ref, $obs); } sub convertCG { my ($variantfile) = @_; my ($foundheader, $countline, @field); my ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = qw/0 0 0 0 0 0 0 0/; open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; print STDERR "NOTICE: Converting variants from $variantfile\n"; while (<VAR>) { s/[\r\n]+$//; $countline++; if (m/^>locus/) { $foundheader++; } if (not $foundheader) { $countline > 50 and die "Error: invalid CG-var file format for $variantfile (>locus record is not found within the first 50 lines)\n"; next; } my ($locus, $ploidy, $haplo, $chr, $start, $end, $vartype, $ref, $obs, $score, $haplolink, $xref) = split (/\t/, $_); $chr =~ s/^chr//; $vartype eq 'ins' or $start++; #CG use zero-start, half-open coordinate. Insertion does not need to be processed (example, "16476 2 2 chr1 751820 751820 ins T 49 dbsnp:rs59038458") $obs eq '' and $obs = '-'; $ref eq '' and $ref = '-'; if ($vartype =~ m/^snp|ins|del|delins|sub$/) { #in new versions of the files, "sub" is used instead of "delins". #$chr eq 'M' and next; #ignore chrM markers as they are not diploid if ($chr eq $prechr and $start eq $prestart and $end eq $preend and $obs eq $preobs) { #homozygous mutation print $chr, "\t", $start, "\t", $end, "\t", $ref, "\t", $obs, "\t", $vartype, "\t", ($score+$prescore)/2, "\t", "hom\t", $xref, "\n"; ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = qw/0 0 0 0 0 0 0 0/; } else { if ($prestart and $preend) { print $prechr, "\t", $prestart, "\t", $preend, "\t", $preref, "\t", $preobs, "\t", $prevartype, "\t", $prescore, "\thet\t", $prexref, "\n"; } ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = ($chr, $start, $end, $vartype, $ref, $obs, $score, $xref); } } } if ($prestart and $preend) { print $prechr, "\t", $prestart, "\t", $preend, "\t", $preref, "\t", $preobs, "\t", $prevartype, "\t", $prescore, "\thet\t", $prexref, "\n"; } print STDERR "NOTICE: Done with $countline lines\n"; } sub convertGFF3SolidSNP { my ($variantfile) = @_; my ($countline, $countvar, $countallvar, @other) = (0, 0, 0); my ($unknown_count); #count of variants with 'unknown' variation type open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; $_ = <VAR>; s/[\r\n]+$//; m/^##gff-version\s+3/ or die "Error: invalid first line in GFF3 file ('##gff-version 3' expected): <$_>\n"; $_ = <VAR>; s/[\r\n]+$//; m/^##solid-gff-version/ or print STDERR "WARNING: problematic second line in GFF3-SOLiD file ('##solid-gff-version' expected): <$_>\n"; print STDERR "NOTICE: Column 6-9 in output are heterozygosity status, variant score (P-value), total clipped normal coverage reads, total reads with mutated allele\n"; while (<VAR>) { s/[\r\n]+$//; $countline++; m/^##/ and next; #header of comment lines m/^#/ and next; #header of results lines my @field = split (/\t/, $_); @field == 9 or die "Error: invalid record found in $variantfile (10 fields expected): <$_>\n"; my ($chr, $program, $type, $pos, $end, $score, $attribute) = @field[0,1,2,3,4,5,8]; #score is the P-value for the SNP calls $chr eq 'chr_name' and next; #header line if ($score ne '.') { $score >=0 and $score <=1 or die "Error: invalid score record found in file (0-1 range expected): <$_>\n"; $score <= $snppvalue or next; } if ($species and $species eq 'human') { $chr eq '23' and $chr = 'X'; $chr eq '24' and $chr = 'Y'; $chr eq '25' and $chr = 'M'; } $includeinfo and @other = ($attribute); #unless -includeinfo is set, the other will not be printed my ($readcount, $mutallelecount) = ('.', '.'); #total coverage, coverage for mutated alleles if ($type eq 'unknown') { #SOLiD GDD3 may have unknown variation types #chr1 AB_SOLiD Small Indel Tool unknown 3833062 3833153 1 . . ID=5483;len=no_call;allele-call-pos=3833062;allele-call=/CCAC;allele-pos=3833057;alleles=atccatccacccatc/aTCCATCCACCCACCCATC/NO_CALL;allele-counts=REF,2,2;tight_chrom_pos=none;loose_chrom_pos=3833058-3833069;no_nonred_reads=3;coverage_ratio=8.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_50_10_r,L1_1_50_15_r,L1_1_50_15_r,L1_1_50_12_r;bead_ids=1018_196_970,699_1263_465,220_513_923,2022_1532_1071;overall_qvs=4,6,2,50;no_mismatches=5,4,2,0;read_pos=27,29,31,13;from_end_pos=23,21,19,37;strands=+,+,+,+;tags=R3,F3,F3,F3;indel_sizes=-92,-112,4,4;non_indel_no_mismatches=0,0,8,0;unmatched-lengths=50,50,50,50;ave-unmatched=50.0000;anchor-match-lengths=48,49,49,49;ave-anchor-length=48.7500;read_seqs=G23223321322112233223100132013201320110011001322332,T33223321322112233223100132013201320110013021322332,T33223321322112233223100132013201320110011001322332,T31001320132013201100110013223322113030332233113032;base_qvs=;non_indel_seqs=T21322332211221121322332230321212121223322332233221,G12020202202020012001200213022002130012332310122030,G12020202202020012001000210022012110312331331122030,G22111012101031010100002002321020002202121121313021;non_indel_qvs= $unknown_count++; next; #do not count this one! } if ($program eq 'SOLiD_diBayes' or $program eq 'AB_SOLiD SNP caller') { #SNP variants #detailed description can be found at http://solidsoftwaretools.com/gf/download/docmanfileversion/208/866/DiBayes_Documentation_v1.2.pdf #chr1 SOLiD_diBayes SNP 559817 559817 0.094413 . . genotype=Y;reference=T;coverage=9;refAlleleCounts=5;refAlleleStarts=4;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=14;diColor1=11;diColor2=33;het=1;flag= #chr1 SOLiD_diBayes SNP 714068 714068 0.000000 . . genotype=M;reference=C;coverage=13;refAlleleCounts=7;refAlleleStarts=6;refAlleleMeanQV=25;novelAlleleCounts=6;novelAlleleStarts=4;novelAlleleMeanQV=22;diColor1=00;diColor2=11;het=1;flag= #chr1 SOLiD_diBayes SNP 714835 714835 0.041579 . . genotype=R;reference=A;coverage=5;refAlleleCounts=3;refAlleleStarts=3;refAlleleMeanQV=18;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=20;diColor1=02;diColor2=20;het=1;flag= $pos == $end or die "Error: invalid record found in GFF3-SOLiD file: start and end discordant: <$_>\n"; my ($wt, $call); if ($attribute =~ m/ref_base=(\w)/) { $wt = $1; } elsif ($attribute =~ m/reference=(\w)/) { $wt = $1; } else { die "Error: invalid record found in GFF3-SOLiD file (ref_base/reference was not found): <$_>\n"; } if ($attribute =~ m/consen_base=(\w)/) { $call = $1; } elsif ($attribute =~ m/genotype=(\w)/) { $call = $1; } else { die "Error: invalid record found in GFF3-SOLiD file (consen_base was not found): <$_>\n"; } if ($attribute =~ m/coverage=(\d+)/) { $readcount = $1; $readcount >= $coverage or next; #read count of the variant call $maxcoverage and $readcount <= $maxcoverage || next; } if ($attribute =~ m/novelAlleleCounts=(\d+)/) { $mutallelecount = $1; $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold defined $altcov and $mutallelecount >= $altcov || next; } my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n"; my @obs = split (//, $obs); @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n"; if ($obs[0] eq $wt and $obs[1] eq $wt) { die "Error: reference alleles are identical to observed alleles: <$_>\n"; } elsif ($obs[0] eq $wt) { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; } elsif ($obs[1] eq $wt) { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; } elsif ($obs[1] ne $obs[0]) { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; $countallvar++; } else { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; } } elsif ($program eq 'AB_CNV_PIPELINE') { #CNV if ($attribute =~ m/copynum=(\d+)/ or $attribute =~ m/copynumber=(\d+)/) { if ($1 < 2) { print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", '-', "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n"; } elsif ($1 > 2) { print $chr, "\t", $end, "\t", $end, "\t", '-', "\t", 0, "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n"; } } else { print $chr, "\t", $end, "\t", $end, "\t", '-', "\t", 0, "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n"; } } elsif ($program eq 'AB_SOLiD Large Indel Tool') { #CNV #http://solidsoftwaretools.com/gf/download/docmanfileversion/182/780/Large_Indel_Documentation_v1.0.0.pdf ## [FIELDS] (1) chromosome (2) version (3) indel type (4) breakpoint start (5) breakpoint end (6) p-value (7) NA (8) NA (9) attributes #chr10 AB_SOLiD Large Indel Tool insertion 151910 151969 2.77548e-11 . . dev=-71;avgDev=-1.63884;zygosity=HOMOZYGOUS;nRef=0;nDev=14;refDev=0;devDev=-1.60924;refVar=0;devVar=0.0159438;beadIds=1750_720_1641,649_1680_794,503_1756_608,1726_174_1362,1208_1772_353,872_594_1604,1924_990_858,1899_961_1848,901_1226_378,323_1750_1017,1185_180_1237,1519_490_1074,1291_94_324,285_758_922,1135_95_1594,1055_218_1279, #chr10 AB_SOLiD Large Indel Tool insertion 154109 154729 2.1559e-11 . . dev=-66;avgDev=-1.51253;zygosity=HOMOZYGOUS;nRef=0;nDev=15;refDev=0;devDev=-1.02864;refVar=0;devVar=0.133236;beadIds=1728_1671_1739,1250_231_25,811_783_1090,1035_908_491,649_1680_794,503_1756_608,1726_174_1362,1208_1772_353,872_594_1604,1924_990_858,1899_961_1848,901_1226_378,323_1750_1017,1185_180_1237,1519_490_1074,1291_94_324,285_758_922,1135_95_1594,1055_218_1279, my ($call, @call, $zygosity); if ($attribute =~ m#zygosity=HEMIZYGOUS#) { $zygosity = 'het'; } elsif ($attribute =~ m#zygosity=HOMOZYGOUS#) { $zygosity = 'hom'; } else { $zygosity = 'unk'; } if ($type eq 'insertion') { #the true boundary is unknown (start is different from end) so we cannot use "-" to represent reference allele. print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", 0, "\t", $zygosity, "\t", "$score\t.\t.\t", join ("\t", @other), "\n"; } elsif ($type eq 'deletion') { print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", '-', "\t", $zygosity, "\t", "$score\t.\t.\t", join ("\t", @other), "\n"; } } elsif ($program eq 'AB_SOLiD Small Indel Tool') { #small indels #typical simple insertion and deletions #chr1 AB_SOLiD Small Indel Tool deletion 1352612 1352614 1 . . ID=1290;del_len=3;allele-call-pos=1352612;allele-call=cct/;allele-pos=1352610;alleles=cccctccat/cCCCAT;allele-counts=REF,2;tight_chrom_pos=1352612-1352614;loose_chrom_pos=1352612-1352614;no_nonred_reads=2;coverage_ratio=11.5000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_3_r,L1_1_25_8_r;bead_ids=1470_2000_506,822_1710_1767;overall_qvs=18,19;no_mismatches=3,3;read_pos=6,13;from_end_pos=19,12;strands=-,+;tags=R3,R3;indel_sizes=-3,-3;non_indel_no_mismatches=1,-1;unmatched-lengths=25,25;ave-unmatched=25.0000;anchor-match-lengths=24,99;ave-anchor-length=61.5000;read_seqs=G0112310001100003120031200,G0300213000011000132110021;base_qvs=;non_indel_seqs=T2120033002022200220000002,;non_indel_qvs= #chr1 AB_SOLiD Small Indel Tool insertion_site 1311162 1311162 1 . . ID=1249;ins_len=1;allele-call-pos=1311162;allele-call=/G;allele-pos=1311161;alleles=gaggggggg/GAGGGGGGGG/NO_CALL;allele-counts=REF,3,1;tight_chrom_pos=none;loose_chrom_pos=1311160-1311169;no_nonred_reads=3;coverage_ratio=4.6667;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_6_r,L1_1_50_10_r,L1_1_25_2_r,L1_1_25_3_r;bead_ids=850_837_429,1160_878_181,404_1050_1881,1084_64_1343;overall_qvs=20,56,25,25;no_mismatches=3,2,2,1;read_pos=11,22,11,11;from_end_pos=14,28,14,14;strands=+,-,-,-;tags=R3,F3,F3,F3;indel_sizes=1,1,1,1;non_indel_no_mismatches=-1,1,0,1;unmatched-lengths=25,50,25,25;ave-unmatched=31.2500;anchor-match-lengths=99,49,24,24;ave-anchor-length=49.0000;read_seqs=G1020001130221020000000020,T03223323210110021000000022122030100020221222222122,T0102210000000221223301000,T0102210000000221220301000;base_qvs=;non_indel_seqs=,G21202030032202013220021321131212021000122300013132,G1331133120001221220120120,G1331133120001221220120220;non_indel_qvs= #sometimes, allele-call is ambiguous that requires a "block substitution" representation (although they were annotated as insertion or deletion by SOLiD, they should be treated as block substitution by ANNOVAR) #sometimes, mutiple allele calls may be present at the same site #chr1 AB_SOLiD Small Indel Tool deletion 1209357 1209360 1 . . ID=1101;del_len=4;allele-call-pos=1209357;allele-call=ggtggg/TT;allele-pos=1209355;alleles=ggggtgggggggtt/gGTTGGGGTT/gGTGTTTTGCCTT/NO_CALL;allele-counts=REF,3,1,1;tight_chrom_pos=none;loose_chrom_pos=1209357-1209363;no_nonred_reads=4;coverage_ratio=3.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=0.9888;run_names=L1_1_25_1_r,L1_1_25_2_r,L1_1_25_4_r,L1_1_25_3_r,L1_1_25_7_r;bead_ids=1017_1024_53,1493_1896_615,1794_647_1473,307_1116_687,773_1492_1671;overall_qvs=24,24,28,24,8;no_mismatches=2,3,2,3,2;read_pos=14,9,14,9,15;from_end_pos=11,16,11,16,10;strands=-,+,-,+,+;tags=F3,R3,F3,F3,F3;indel_sizes=-4,-4,-4,-4,3;non_indel_no_mismatches=1,0,0,0,0;unmatched-lengths=25,25,25,25,25;ave-unmatched=25.0000;anchor-match-lengths=24,24,24,24,24;ave-anchor-length=24.0000;read_seqs=T2221100101000101000221100,G0001122000100000101001020,T2221100101000101000221100,T1112200010100010100112000,T1011220000111000130200001;base_qvs=;non_indel_seqs=G0312033221312111022200300,T0111113210210112100001130,G0312133221312111022200300,G0231003132222112000012020,G3121331033101113122312020;non_indel_qvs= #chr1 AB_SOLiD Small Indel Tool deletion 1209436 1209436 1 . . ID=1103;del_len=1;allele-call-pos=1209436;allele-call=ag/A/G;allele-pos=1209434;alleles=tgagggggtt/tGAGGGGTT/tGGGGGGTT;allele-counts=REF,1,1;tight_chrom_pos=none;loose_chrom_pos=1209436-1209441;no_nonred_reads=2;coverage_ratio=5.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_6_r,L1_1_25_2_r;bead_ids=1315_1584_2005,1706_194_437;overall_qvs=28,21;no_mismatches=0,3;read_pos=9,7;from_end_pos=16,18;strands=-,-;tags=R3,R3;indel_sizes=-1,-1;non_indel_no_mismatches=-1,0;unmatched-lengths=25,25;ave-unmatched=25.0000;anchor-match-lengths=99,24;ave-anchor-length=61.5000;read_seqs=G3001010000011001010000001,G3010100022110010111000110;base_qvs=;non_indel_seqs=,T1112003220020013202122300;non_indel_qvs= #chr1 AB_SOLiD Small Indel Tool insertion_site 1424540 1424540 1 . . ID=1376;ins_len=3;allele-call-pos=1424540;allele-call=tt/CCCAC;allele-pos=1424537;alleles=ttttttg/TTTCCCACTG/NO_CALL;allele-counts=REF,1,1;tight_chrom_pos=none;loose_chrom_pos=1424536-1424543;no_nonred_reads=2;coverage_ratio=11.5000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_7_r,L1_1_50_16_r;bead_ids=703_437_370,1089_878_1744;overall_qvs=1,9;no_mismatches=3,4;read_pos=5,35;from_end_pos=20,15;strands=-,-;tags=R3,F3;indel_sizes=3,3;non_indel_no_mismatches=2,0;unmatched-lengths=25,50;ave-unmatched=37.5000;anchor-match-lengths=24,47;ave-anchor-length=35.5000;read_seqs=G2032002200200000000000020,T30100102220312202103112130230322210121100200002100;base_qvs=;non_indel_seqs=T2121120003012303000000000,G22213300221101011121030022002222300220322213303102;non_indel_qvs= my ($call, @call, $zygosity); if ($attribute =~ m#experimental-zygosity=HEMIZYGOUS#) { $zygosity = 'het'; } elsif ($attribute =~ m#experimental-zygosity=HOMOZYGOUS#) { $zygosity = 'hom'; } else { $zygosity = 'unk'; } $score = '.'; #by default, score=1 in the output #no_nonred_reads: Number of reads with unique start positions (non-redundant reads). #coverage_ratio: Clipped normal coverage/number of non-redundant reads.Clipped coverage is where the parts of the read that are within 5 bp at either end are not counted as a part of coverage. if ($attribute =~ m/no_nonred_reads=(\d+);coverage_ratio=([\d\.]+)/) { $readcount = int ($1*$2); $readcount >= $coverage or next; #clipped normal read count of the variant call (this could even be lower than mut allele count) $maxcoverage and $readcount <= $maxcoverage || next; } else { $readcount = '.'; } if ($attribute =~ m/allele-counts=REF,(\d+)/) { $mutallelecount = $1; } if ($attribute =~ m#allele\-call=([\w\/]+)#) { @call = split (/\//, $1); if (@call == 1) { #a simple deletion print $chr, "\t", $pos, "\t", $end, "\t", $call[0], "\t", '-', "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; } elsif ($call[0] eq '') { #a simple insertion (single or multiple allele) for my $i (1 .. @call-1) { print $chr, "\t", $pos, "\t", $pos, "\t", '-', "\t", $call[$i], "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; $i > 1 and $countallvar++; } } else { #an indel that may have several alleles, or may require a block substitution representation for my $i (1 .. @call-1) { print $chr, "\t", $pos, "\t", $pos+length($call[0])-1, "\t", $call[0], "\t", $call[$i], "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; $i > 1 and $countallvar++; } } } else { $call = '0'; print $chr, "\t", $pos, "\t", $end, "\t", $call, "\t", '-', "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; } } else { die "Error: unrecognizable genotype calling program encountered (valid types are SOLiD_diBayes, AB_CNV_PIPELINE, AB_SOLiD Large Indel Tool, AB_SOLiD Small Indel Tool): <$_>\n"; } $countvar++; #variation positions $countallvar++; #all variants (maybe several at one variation position) } print STDERR "NOTICE: Finished processing $variantfile with $countline input lines\n"; print STDERR "NOTICE: Wrote variants in $countvar variation positions ($countallvar variants considering multi-allelic ones)\n"; $unknown_count and print STDERR "WARNING: $unknown_count variants with 'unknown' variation type were skipped\n"; } sub convertSOAP { my ($variantfile) = @_; my ($countline, $countvar, @other); open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; while (<VAR>) { s/[\r\n]+$//; $countline++; my @field = split (/\t/, $_); if (@field == 18) { #snp file my ($chr, $pos, $wt, $call, @other) = @field; $chr =~ s/^chr//; $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n"; my @obs = split (//, $obs); @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n"; if ($obs[0] eq $wt and $obs[1] eq $wt) { die "Error: reference alleles are identical to observed alleles: <$_>\n"; } elsif ($obs[0] eq $wt) { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; } elsif ($obs[1] eq $wt) { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n"; } elsif ($obs[1] ne $obs[0]) { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n"; print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; $countvar++; } else { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n"; } } elsif (@field == 6) { #indel file my ($chr, $pos, $strand, $indellen, $call, $homo) = @field; $homo eq 'Homo' and $homo = 'hom'; $homo eq 'Hete' and $homo = 'het'; $chr =~ s/^chr//; $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed if ($indellen =~ m/^\+(\d+)$/) { #insertion length ($call) == $1 or die "Error: invalid record found in SOAPindel file: <$_>\n"; print join("\t", $chr, $pos, $pos, '-', $call, $homo), "\n"; } elsif ($indellen =~ m/^\-(\d+)$/) { #deletion length ($call) == $1 or die "Error: invalid record found in SOAPindel file: <$_>\n"; print join("\t", $chr, $pos, $pos+$1-1, $call, '-', $homo), "\n"; } else { die "Error: invalid record found in SOAPindel file: <$_>\n"; } } else { die "Error: invalid record found in $variantfile (18 or 6 fields expected, observed ${\(scalar @field)} fields): <$_>\n"; } $countvar++; } print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n"; } sub convertANNOVAR { my ($variantfile) = @_; my ($countline, $countvar, $countsnp); my ($countti, $counttv) = (0, 0); open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; while (<VAR>) { $countline++; my @field = split (/\t/, $_); my ($chr, $start, $end, $ref, $obs) = @field; if ($ref =~ m/^[ACGT]$/ and $obs =~ m/^[ACGT]$/) { if ($ref eq 'A' and $obs eq 'G' or $ref eq 'G' or $obs eq 'A' or $ref eq 'C' and $obs eq 'T' or $ref eq 'T' and $obs eq 'C') { $countti++; } else { $counttv++; } $countsnp++; } print; $countvar++; } print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n"; print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions\n"; } sub convertMAQSNP { my ($variantfile) = @_; my ($countline, $countvar, @other); open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; while (<VAR>) { s/[\r\n]+$//; $countline++; my @field = split (/\t/, $_); if (@field == 12) { #SNPs my ($chr, $pos, $wt, $call, @other) = @field; $chr =~ s/^chr//; $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n"; my @obs = split (//, $obs); @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n"; if ($obs[0] eq $wt and $obs[1] eq $wt) { die "Error: reference alleles are identical to observed alleles: <$_>\n"; } elsif ($obs[0] eq $wt) { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; } elsif ($obs[1] eq $wt) { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n"; } elsif ($obs[1] ne $obs[0]) { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n"; print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; $countvar++; } else { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n"; } $countvar++; } elsif (@field == 13) { #indels; the deletion start site do not need changes; the duplication start site need additional investigation by ANNOVAR developers my ($chr, $pos, $type, $numread, $call, @other) = @field; $chr =~ s/^chr//; $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed my @obs = split (/:/, $call); @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n"; if ($obs[0] =~ m/^\-(\d+)/) { my $len = $1; print $chr, "\t", $pos, "\t", $pos+$len-1, "\t", $obs[1], "\t", '-', "\t", "het\t", join ("\t", @other), "\n"; } elsif ($obs[0] =~ m/^(\d+)/) { my $len = $1; print $chr, "\t", $pos, "\t", $pos, "\t", '-', "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; } $countvar++; } else { die "Error: invalid record found in $variantfile (12 or 13 fields expected, observed ${\(scalar @field)} fields): <$_>\n"; } } print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n"; } sub convertCASAVA { my ($variantfile, $chr) = @_; my ($countline, $countvar, @other); my ($intype); my ($pos_index, $call_index, $reference_index, $type_index, $score_index, $total_index, $used_index); my ($ref_indel_index, $quality_index, $maxgtype_index, $bp1_reads_index, $ref_reads_index, $indel_reads_index, $other_reads_index); open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; while (<VAR>) { s/[\r\n]+$//; $countline++; my @field; if (m/^#/) { s/^#//; if (s/^\$\sCOLUMNS\s//) { @field = split (/\s+/, $_); } else { @field = split (/\t/, $_); } if (m/\bposition\b/ or m/\bpos\b/) { for my $i (0 .. @field-1) { if ($field[$i] eq 'position' or $field[$i] eq 'pos') { $pos_index = $i; } elsif ($field[$i] eq 'modified_call') { $intype = 'snp'; print STDERR "NOTICE: Automatically detected input type as $intype\n"; $call_index = $i; } elsif ($field[$i] eq 'reference') { $reference_index = $i; } elsif ($field[$i] eq 'type') { $type_index = $i; } elsif ($field[$i] eq 'score') { $score_index = $i; } elsif ($field[$i] eq 'total') { $total_index = $i; } elsif ($field[$i] eq 'used') { $used_index = $i; } elsif ($field[$i] eq 'ref/indel') { $intype = 'indel'; print STDERR "NOTICE: Automatically detected input type as $intype\n"; $ref_indel_index = $i; } elsif ($field[$i] eq 'Q(indel)') { $quality_index = $i; } elsif ($field[$i] eq 'max_gtype') { $maxgtype_index = $i; } elsif ($field[$i] eq 'bp1_reads') { $bp1_reads_index = $i; } elsif ($field[$i] eq 'ref_reads') { $ref_reads_index = $i; } elsif ($field[$i] eq 'indel_reads') { $indel_reads_index = $i; } elsif ($field[$i] eq 'other_reads') { $other_reads_index = $i; } } } next; } $intype or die "Error: unable to recognize the correct type of the input file (make sure that header line is present in $variantfile)\n"; @field = split (/\t/, $_); if ($intype eq 'snp') { #SNPs defined $pos_index and defined $reference_index and defined $call_index or die "Error: unalbe to find the position, reference and modified_call column header in $variantfile\n"; my ($pos, $wt, $obs) = @field[$pos_index, $reference_index, $call_index]; my (@other); defined $pos and defined $wt and defined $obs or die; if ($includeinfo) { defined $score_index and push @other, $field[$score_index]; defined $total_index and push @other, $field[$total_index]; defined $used_index and push @other, $field[$used_index]; defined $type_index and push @other, $field[$type_index]; } length ($obs) == 1 and $obs .= $obs; my @obs = split (//, $obs); @obs == 2 or die "Error: observed allele $obs should correspond to two nucleotide alleles: <$_>\n"; if ($obs[0] eq $wt and $obs[1] eq $wt) { die "Error: reference alleles are identical to observed alleles: <$_>\n"; } elsif ($obs[0] eq $wt) { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; } elsif ($obs[1] eq $wt) { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n"; } elsif ($obs[1] ne $obs[0]) { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n"; print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; $countvar++; } else { print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n"; } $countvar++; } elsif ($intype eq 'indel') { #indels defined $pos_index and defined $ref_indel_index and defined $maxgtype_index or die "Error: unable to find the pos, ref_indel and max_gtype column header in $variantfile\n"; my ($pos, $call, $hom, @other) = @field[$pos_index, $ref_indel_index, $maxgtype_index]; if ($includeinfo) { defined $quality_index and push @other, $field[$quality_index]; defined $maxgtype_index and push @other, $field[$maxgtype_index]; defined $bp1_reads_index and push @other, $field[$bp1_reads_index]; defined $ref_reads_index and push @other, $field[$ref_reads_index]; defined $indel_reads_index and push @other, $field[$indel_reads_index]; defined $other_reads_index and push @other, $field[$other_reads_index]; } #hg19 coordinate below; insertion needs position adjustment!!! deletion is fine #948847 1I CCTCAGGCTT -/A ATAATAGGGC 969 hom 47 het 22 0 16 6 A 1 2 #978604 2D CACTGAGCCC CT/-- GTGTCCTTCC 251 hom 20 het 8 0 4 4 CT 1 0 #1276974 4I CCTCATGCAG ----/ACAC ACACATGCAC 838 hom 39 het 18 0 14 4 AC 2 4 #1289368 2D AGCCCGGGAC TG/-- GGAGCCGCGC 1376 hom 83 het 33 0 25 9 TG 1 0 #185137455 11I10M2I TATGTGTCCT -----------TTTTTTATTT--/AAATGATAGACTTTTTTTTTTAA ATTTCAGAAA 1126 het 988 hom 45 20 24 7 N/A 0 0 #1276931 2D41M4I CACACACATG CACACACACGCACACACGTGCAATGTGAAAACACCTCATGCAG----/--CACACACGCACACACGTGCAATGTGAAAACACCTCATGCAGACAC ACACATGCAC 548 hom 16 het 8 0 11 11 N/A 0 0 my @obs = split (/\//, $call); @obs == 2 or die "Error: observed indel allele $call should correspond to two alleles: <$_>\n"; if ($obs[0] =~ m/^\-+$/) { #insertion my $len = length ($obs[0]); print $chr, "\t", $pos-1, "\t", $pos-1, "\t", '-', "\t", $obs[1], "\t", $hom, "\t", join ("\t", @other), "\n"; } elsif ($obs[1] =~ m/^\-+$/) { #deletion my $len = length ($obs[0]); print $chr, "\t", $pos, "\t", $pos+$len-1, "\t", $obs[0], "\t", '-', "\t", $hom, "\t", join ("\t", @other), "\n"; } elsif (length ($obs[0]) eq length ($obs[1])) { #block substitution $obs[0] =~ s/\-//g; $obs[1] =~ s/\-//g; print $chr, "\t", $pos, "\t", $pos+length($obs[0])-1, "\t", $obs[0], "\t", $obs[1], "\t", $hom, "\t", join ("\t", @other), "\n"; } else { die "Error: invalid record found in indel line: <$_>\n"; } $countvar++; } else { die "Error: invalid record found in $variantfile (11 or 15 fields expected, observed ${\(scalar @field)} fields): <$_>\n"; } } print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n"; } sub convertVCF4 { my ($variantfile) = @_; my ($countline, $countvar, $counthom, $counthet, $countunknown, $countindel, $countsnp, $countti, $counttv) = qw/0 0 0 0 0 0 0 0 0/; my ($source_program); #the program that generated the VCF4 file open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; while (<VAR>) { $countline++; if (m/^##fileformat=VCFv(\d+\.)/) { $1<4 and print STDERR "ERROR: Input file is not in VCF version 4 format but is $_" and exit; } if (m/^##UnifiedGenotyper/) { $source_program = 'gatksnp'; print STDERR "NOTICE: Detected that the VCF4 file is generated by GATK UnifiedGenotyper\n"; print STDERR "NOTICE: column 6-10 represent heterozygosity status, quality score, read depth, RMS mapping quality, quality by depth\n"; $fraction and print STDERR "WARNING: the --fraction argument will be ignored for GATK SNP calls!!!\n"; $confraction and print STDERR "WARNING: the --confraction argument will be ignored for GATK SNP calls!!!\n"; } if (m/^##IndelGenotyper/) { $source_program = 'gatkindel'; print STDERR "NOTICE: Detected that the VCF4 file is generated by GATK IndelGenotyper\n"; print STDERR "NOTICE: column 6-10 represent heterozygosity status, quality score, read depth, read count supporting indel call, RMS mapping quality\n"; } m/^#/ and next; #skip comment lines s/[\r\n]+$//; #delete trailing new lines my $otherinfo = $_; #this is the complete line (when -includeinfo is set, the entire line will be included in output file) #format description: http://www.1000genomes.org/wiki/Analysis/vcf4.0 #standard VCF4 should have 8 columns, but some software may produce more columns (for example, for genotype calls). The first 8 columns should follow the specification #example of VCF4 generated by GATK SNP caller #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE #1 55 . T G 34.82 . DP=2;Dels=0.00;HRun=0;HaplotypeScore=0.00;MQ=14.16;MQ0=0;QD=17.41;SB=-10.00 GT:DP:GL:GQ 0/1:1:-6.66,-0.30,-0.00:1.76 #1 2646 . G A 40.91 . DP=4;Dels=0.00;HRun=0;HaplotypeScore=0.00;MQ=7.50;MQ0=3;QD=10.23;SB=-10.00 GT:DP:GL:GQ 0/1:1:-7.27,-0.30,-0.00:1.76 #example of VCF4 generated by GATK indel caller #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE #1 2525324 . G GC . PASS AC=5,5;DP=12;MM=4.8,3.7142856;MQ=29.0,42.285713;NQSBQ=33.0,46.463768;NQSMM=0.24,0.20289855;SC=0,5,1,6 GT 0/1 #1 3553372 . GC G . PASS AC=6,6;DP=6;MM=0.8333333,0.0;MQ=60.0,0.0;NQSBQ=63.533333,0.0;NQSMM=0.0,0.0;SC=0,6,0,0 GT 1/0 #1 6093011 . CG C . PASS AC=31,31;DP=32;MM=0.7096774,2.0;MQ=59.64516,60.0;NQSBQ=64.192184,39.666668;NQSMM=0.0,0.11111111;SC=23,8,0,1 GT 1/0 #example of VCF4 generated by 1000G #CHROM POS ID REF ALT QUAL FILTER INFO #1 533 . G C . PASS AA=.;AC=6;AN=120;DP=423 #1 41342 . T A . PASS AA=.;AC=29;AN=120;DP=188 #1 41791 . G A . PASS AA=.;AC=5;AN=120;DP=192 #1 44449 . T C . PASS AA=C;AC=2;AN=120;DP=166 #1 44539 rs2462492 C T . PASS AA=T;AC=2;AN=120;DP=131 #example of VCF4 generated by 1000G #CHROM POS ID REF ALT QUAL FILTER INFO #1 1000153 . TCACA T 100 PASS AF=0.115095;HP=1;NF=16;NR=13;NS=52;CA=0;DP=615 #1 1000906 . CA C 48 PASS AF=0.0772696;HP=1;NF=2;NR=9;NS=51;CA=0;DP=281 #1 1000950 rs60561655;-/G CG C 100 PASS AF=0.447771;HP=5;DB;NF=10;NR=20;NS=50;CA=M;DP=291 #1 1010786 rs36095298;-/G,mills,venter A AG 100 PASS AF=0.774334;HP=1;DB;NF=21;NR=27;NS=51;CA=0;DP=306 #1 1026158 . T TGGGGG 100 PASS AF=0.115637;HP=1;NF=5;NR=2;NS=52;CA=0;DP=591 #reserved VCF4 sub-fields in the INFO field # * AA ancestral allele # * AC allele count in genotypes, for each ALT allele, in the same order as listed # * AF allele frequency for each ALT allele in the same order as listed: use this when estimated from primary data, not called genotypes # * AN total number of alleles in called genotypes # * BQ RMS base quality at this position # * CIGAR cigar string describing how to align an alternate allele to the reference allele # * DB dbSNP membership # * DP combined depth across samples, e.g. DP=154 # * END end position of the variant described in this record (esp. for CNVs) # * H2 membership in hapmap2 # * MQ RMS mapping quality, e.g. MQ=52 # * MQ0 Number of MAPQ == 0 reads covering this record # * NS Number of samples with data # * SB strand bias at this position # * SOMATIC indicates that the record is a somatic mutation, for cancer genomics # * VALIDATED validated by follow-up experiment #SAMtools/BCFtools specific information #SAMtools/BCFtools may write the following tags in the INFO field in VCF/BCF. #Tag Description #I16 16 integers: #1 #reference Q13 bases on the forward strand 2 #reference Q13 bases on the reverse strand #3 #non-ref Q13 bases on the forward strand 4 #non-ref Q13 bases on the reverse strand #5 sum of reference base qualities 6 sum of squares of reference base qualities #7 sum of non-ref base qualities 8 sum of squares of non-ref base qualities #9 sum of ref mapping qualities 10 sum of squares of ref mapping qualities #11 sum of non-ref mapping qualities 12 sum of squares of non-ref mapping qualities #13 sum of tail distance for ref bases 14 sum of squares of tail distance for ref bases #15 sum of tail distance for non-ref bases 16 sum of squares of tail distance for non-ref #INDEL Indicating the variant is an INDEL. #DP The number of reads covering or bridging POS. #DP4 Number of 1) forward ref alleles; 2) reverse ref; 3) forward non-ref; 4) reverse non-ref alleles, used in variant calling. Sum can be smaller than DP because low-quality bases are not counted. #PV4 P-values for 1) strand bias (exact test); 2) baseQ bias (t-test); 3) mapQ bias (t); 4) tail distance bias (t) #FQ Consensus quality. If positive, FQ equals the phred-scaled probability of there being two or more different alleles. If negative, FQ equals the minus phred-scaled probability of all chromosomes being identical. Notably, given one sample, FQ is positive at hets and negative at homs. #AF1 EM estimate of the site allele frequency of the strongest non-reference allele. #CI95 Equal-tail (Bayesian) credible interval of the site allele frequency at the 95% level. #PC2 Phred-scaled probability of the alternate allele frequency of group1 samples being larger (,smaller) than of group2 samples. #PCHI2 Posterior weighted chi^2 P-value between group1 and group2 samples. This P-value is conservative. #QCHI2 Phred-scaled PCHI2 #RP Number of permutations yeilding a smaller PCHI2 #example of triallelic variants generated by mpileup/bcftools #1 156706559 . A C,G 114 . DP=20;AF1=1;CI95=1,1;DP4=0,0,1,19;MQ=60;FQ=-63 GT:PL:GQ 1/2:237,126,90,162,0,138:99 #6 31129642 . A G,C 76 . DP=31;AF1=1;CI95=1,1;DP4=0,0,28,3;MQ=60;FQ=-75 GT:PL:GQ 1/2:255,194,146,164,0,119:99 #1 11297762 . T C,A 98 . DP=19;AF1=1;CI95=1,1;DP4=0,0,17,1;MQ=60;FQ=-78 GT:PL:GQ 1/1:131,51,0,120,28,117:99 my @field=split(/\t/,$_); @field >=8 or die "Error: invalid record found in VCF4 file (at least 8 tab-delimited fields expected): <$_>\n"; my ($chr, $start, $ID, $ref_allele, $mut_allele, $quality_score, $filter, $info, $format, $sample) = @field; my ($end); my ($mut_allele2, $zygosity); if ($filterword) { #ensure that the filter field contains the filterword $filter =~ m/\b$filterword\b/i or next; } #sometimes the alleles are not in the same case #chr1 1869771 1869774 actc aCTctc 43.5 13 INDEL;DP=13;AF1=0.5;CI95=0.5,0.5;DP4=0,4,4,0;MQ=37;PV4=0.029,0.45,1,0.46 $ref_allele = uc $ref_allele; $mut_allele = uc $mut_allele; #if ($ID eq '.' || $ID =~ /^rs/) { #per MISHIMA, Hiroyuki suggestion (vcf4's third column (ID column) are not always ".") # $end = $start; #this block is commented out on 2011feb19 #} if ($mut_allele eq '.') { #no variant call was made at this position next; } if ($mut_allele =~ m/([^,]+),([^,]+)/) { $mut_allele = $1; $mut_allele2 = $2; } if(length($ref_allele)==1 && length($mut_allele)==1) { ### output snv my ($unfiltered_read_depth) = $info =~ /DP=(\d+)/; my ($MappingQuality) = $info =~ /MQ=([^;]+)/; my ($QualityByDepth) = $info =~ /QD=([^;]+)/; if ($coverage) { defined $unfiltered_read_depth and $unfiltered_read_depth >= $coverage || next; if ($maxcoverage) { defined $unfiltered_read_depth and $unfiltered_read_depth <= $maxcoverage || next; } } if ($snpqual) { defined $QualityByDepth and $QualityByDepth >= $snpqual || next; #the QD was used here as quality score } if (defined $sample) { if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) { $zygosity = 'het'; $counthet++; } elsif ($sample =~ m#^1/1#) { $zygosity = 'hom'; $counthom++; } else { $zygosity = 'unknown'; $countunknown++; } } else { $zygosity = 'unknown'; $countunknown++; } #the subject is called as homozygous for the first alternative allele (genotype 1/1. i.e. C/C), but since there was one read containing A, samtools still keep both alleles in the VCF file (but gives a very low probabilities for it). #1 11297762 . T C,A 98 . DP=19;AF1=1;CI95=1,1;DP4=0,0,17,1;MQ=60;FQ=-78 GT:PL:GQ 1/1:131,51,0,120,28,117:99 if ($mut_allele2 and $zygosity eq 'hom') { $mut_allele2 = ''; } if (not $mut_allele2) { if ($ref_allele eq 'A' and $mut_allele eq 'G' or $ref_allele eq 'G' and $mut_allele eq 'A' or $ref_allele eq 'C' and $mut_allele eq 'T' or $ref_allele eq 'T' and $mut_allele eq 'C') { $countti++; } else { $counttv++; } } print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele, "\t$zygosity", "\t", $quality_score, (defined $unfiltered_read_depth)? "\t$unfiltered_read_depth" : '', (defined $MappingQuality) ? "\t$MappingQuality" : '', (defined $QualityByDepth) ? "\t$QualityByDepth" : '', $includeinfo ? "\t$otherinfo" : '', "\n"; if ($allallele) { if ($mut_allele2) { print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele2, "\t$zygosity", "\t", $quality_score, (defined $unfiltered_read_depth)? "\t$unfiltered_read_depth" : '', (defined $MappingQuality) ? "\t$MappingQuality" : '', (defined $QualityByDepth) ? "\t$QualityByDepth" : '', $includeinfo ? "\t$otherinfo" : '', "\n"; } } $countsnp++; } elsif (length($ref_allele) > 1 || length($mut_allele) > 1) { ### output indel my ($indel_read_depth1, $indel_read_depth2) = $info =~ /AC=([^,;]+),([^,;]+)/; #number of reads supporting consensus indel, any indel my ($unfiltered_read_depth) = $info =~ /DP=(\d+)/; if ($coverage) { defined $unfiltered_read_depth and $unfiltered_read_depth >= $coverage || next; if ($maxcoverage) { defined $unfiltered_read_depth and $unfiltered_read_depth <= $maxcoverage || next; } } if (defined $indel_read_depth1 and defined $unfiltered_read_depth) { $indel_read_depth1/$unfiltered_read_depth >= $fraction or next; #do not meet minimum alternative allele fraction threshold $indel_read_depth1/$indel_read_depth2 >= $confraction or next; } my ($MappingQuality) = $info =~ /MQ=([^;]+),/; #example VCF4 records below: #20 2 . TCG T . PASS DP=100 #Chr1 5473 . AT ATT 23.5 . INDEL;DP=16;AF1=0.5;CI95=0.5,0.5;DP4=4,2,3,1;MQ=42;PV4=1,0.41,0.042,0.24 #Chr1 6498 . ATTTT ATTTTT 53.5 . INDEL;DP=9;AF1=1;CI95=1,1;DP4=0,0,5,3;MQ=28 if(length($ref_allele) > length ($mut_allele)) { # deletion or block substitution my $head = substr($ref_allele, 0, length ($mut_allele)); if ($head eq $mut_allele) { print $chr,"\t"; print $start+length($head),"\t"; print $start+length($ref_allele)-1,"\t"; $ref_allele = substr ($ref_allele, length ($mut_allele)); print $ref_allele,"\t"; print "-"; } else { print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele, "\t"; } } elsif(length($mut_allele) >= length ($ref_allele)) { # insertion or block substitution my $head = substr ($mut_allele, 0, length ($ref_allele)); if ($head eq $ref_allele) { print $chr,"\t"; print $start+length($ref_allele)-1,"\t"; print $start+length($ref_allele)-1,"\t"; $mut_allele = substr ($mut_allele, length ($ref_allele)); print "-\t"; print $mut_allele; } else { print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele, "\t"; } } if (defined $sample) { if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) { print "\thet"; $counthet++; } elsif ($sample =~ m#^1/1#) { print "\thom"; $counthom++; } # BEGIN ARQ elsif ($sample =~ m#^./.#) { print "\tunknown"; $countunknown++; } # END ARQ } print "\t", $quality_score; defined $unfiltered_read_depth and print "\t", $unfiltered_read_depth; defined $indel_read_depth1 and print "\t", $indel_read_depth1; defined $MappingQuality and print "\t", $MappingQuality; $includeinfo and print "\t", $otherinfo; print "\n"; $countindel++; #do the same thing again, exactly like above, except that we work on second mutation; #in the future, consider rewrite this paragraph to make the code more elegant if ($allallele) { if ($mut_allele2) { if(length($ref_allele) > length ($mut_allele2)) { # deletion or block substitution my $head = substr($ref_allele, 0, length ($mut_allele2)); if ($head eq $mut_allele2) { print $chr,"\t"; print $start+length($head),"\t"; print $start+length($ref_allele)-1,"\t"; $ref_allele = substr ($ref_allele, length ($mut_allele2)); print $ref_allele,"\t"; print "-"; } else { print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele2; } } elsif(length($mut_allele2) > length ($ref_allele)) { # insertion or block substitution my $head = substr ($mut_allele2, 0, length ($ref_allele)); if ($head eq $ref_allele) { print $chr,"\t"; print $start+length($ref_allele)-1,"\t"; print $start+length($ref_allele)-1,"\t"; $mut_allele = substr ($mut_allele2, length ($ref_allele)); print "-\t"; print $mut_allele2; } else { print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele2; } } if (defined $sample) { if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) { print "\thet"; $counthet++; } elsif ($sample =~ m#^1/1#) { print "\thom"; $counthom++; } # BEGIN ARQ elsif ($sample =~ m#^./.#) { print "\tunknown"; $countunknown++; } # END ARQ } print "\t", $quality_score; defined $unfiltered_read_depth and print "\t", $unfiltered_read_depth; defined $indel_read_depth1 and print "\t", $indel_read_depth1; defined $MappingQuality and print "\t", $MappingQuality; $includeinfo and print "\t", $otherinfo; print "\n"; } } } $countvar++; } my $triallelic = $countsnp-$countti-$counttv; print STDERR "NOTICE: Read $countline lines and wrote ${\($counthet+$counthom)} different variants at $countvar genomic positions ($countsnp SNPs and $countindel indels)\n"; print STDERR "NOTICE: Among ${\($counthet+$counthom+$countunknown)} different variants at $countvar positions, $counthet are heterozygotes, $counthom are homozygotes\n"; print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions", $triallelic?", $triallelic have more than 2 alleles\n":"\n"; } =head1 SYNOPSIS convert2annovar.pl [arguments] <variantfile> Optional arguments: -h, --help print help message -m, --man print complete documentation -v, --verbose use verbose output --format <string> input format (default: pileup) --outfile <file> output file name (default: STDOUT) --snpqual <float> quality score threshold in pileup file (default: 20) --snppvalue <float> SNP P-value threshold in GFF3-SOLiD file (default: 1) --coverage <int> read coverage threshold in pileup file (default: 0) --maxcoverage <int> maximum coverage threshold (default: none) --includeinfo include supporting information in output --chr <string> specify the chromosome (for CASAVA format) --chrmt <string> chr identifier for mitochondria (default: M) --altcov <int> alternative allele coverage threshold (for pileup format) --fraction <float> minimum allelic fraction to claim a mutation (for pileup/vcf4_indel format) --species <string> if human, convert chr23/24/25 to X/Y/M (for gff3-solid format) --filter <string> output variants with this filter (case insensitive, for vcf4 format) --confraction <float> minimum consensus indel / all indel fraction (for vcf4_indel format) --allallele print all alleles when multiple calls are present (for vcf4 format) Function: convert variant call file generated from various software programs into ANNOVAR input format Example: convert2annovar.pl -format pileup -outfile variant.query variant.pileup convert2annovar.pl -format cg -outfile variant.query variant.cg convert2annovar.pl -format gff3-solid -outfile variant.query variant.snp.gff convert2annovar.pl -format soap variant.snp > variant.avinput convert2annovar.pl -format maq variant.snp > variant.avinput convert2annovar.pl -format casava -chr 1 variant.snp > variant.avinput convert2annovar.pl -format vcf4 variantfile > variant.avinput convert2annovar.pl -format vcf4 -filter pass variantfile > variant.avinput Version: $LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $ =head1 OPTIONS =over 8 =item B<--help> print a brief usage message and detailed explanation of options. =item B<--man> print the complete manual of the program. =item B<--verbose> use verbose output. =item B<--format> the format of the input files. =item B<--outfile> specify the output file name. By default, output is written to STDOUT. =item B<--snpqual> quality score threshold in the pileup file, such that variant calls with lower quality scores will not be printed out in the output file. When VCF4 file is used, this argument works on the Quality-by-Depth measure, rather than the raw quality measure. =item B<--coverage> read coverage threshold in the pileup file, such that variants calls generated with lower coverage will not be printed in the output file. =item B<--includeinfo> specify that the output should contain additional information in the input line. By default, only the chr, start, end, reference allele, observed allele and homozygosity status are included in output files. =item B<--chr> specify the chromosome for CASAVA format =item B<--chrmt> specify the name of mitochondria chromosome (default is MT) =item B<--altcov> the minimum coverage of the alternative (mutated) allele to be printed out in output =item B<--fraction> specify the minimum fraction of alternative allele, to print out the mutation. For example, a site has 10 reads, 3 supports alternative allele. A -fraction of 0.4 will not allow the mutation to be printed out. =item B<--species> specify the species from which the sequencing data is obtained. For the GFF3- SOLiD format, when species is human, the chromosome 23, 24 and 25 will be converted to X, Y and M, respectively. =item B<--filter> for VCF4 file, only print out variant calls with this filter annotated. For example, if using GATK VariantFiltration walker, you will see PASS, GATKStandard, HARD_TO_VALIDATE, etc in the filter field. Using 'pass' as a filter is recommended in this case. =item B<--confraction> consesus indel fraction, calculated as reads supporting consensus indels divided by reads supporting any indels =item B<--allallele> print all alleles for mutations at a locus, rather than the first allele, if the input VCF4 file contains multiple alternative alleles for a mutation. By default, this option is off. When it is on, two lines will be printed out in the output, and both will have the same quality scores as VCF4 does not provide separate quality scores for individual alleles. =back =head1 DESCRIPTION This program is used to convert variant call file generated from various software programs into ANNOVAR input format. Currently, the program can handle Samtools genotype-calling pileup format, Solid GFF format, Complete Genomics variant format, SOAP format. These formats are described below. =over 8 =item * B<pileup format> The pileup format can be produced by the Samtools genotyping calling subroutine. Note that the phrase pileup format can be used in several instances, and here I am only referring to the pileup files that contains the actual genotype calls. Using SamTools, given an alignment file in BAM format, a pileup file with genotype calls can be produced by the command below: samtools pileup -vcf ref.fa aln.bam> raw.pileup samtools.pl varFilter raw.pileup > final.pileup ANNOVAR will automatically filter the pileup file so that only SNPs reaching a quality threshold are printed out (default is 20, use --snpqual argument to change this). Most likely, users may want to also apply a coverage threshold, such that SNPs calls from only a few reads are not considered. This can be achieved using the -coverage argument (default value is 0). An example of pileup files for SNPs is shown below: chr1 556674 G G 54 0 60 16 a,.....,...,.... (B%A+%7B;0;%=B<: chr1 556675 C C 55 0 60 16 ,,..A..,...,.... CB%%5%,A/+,%.... chr1 556676 C C 59 0 60 16 g,.....,...,.... .B%%.%.?.=/%...1 chr1 556677 G G 75 0 60 16 ,$,.....,...,.... .B%%9%5A6?)%;?:< chr1 556678 G K 60 60 60 24 ,$.....,...,....^~t^~t^~t^~t^~t^~t^~t^~t^~t B%%B%<A;AA%??<=??;BA%B89 chr1 556679 C C 61 0 60 23 .....a...a....,,,,,,,,, %%1%&?*:2%*&)(89/1A@B@@ chr1 556680 G K 88 93 60 23 ..A..,..A,....ttttttttt %%)%7B:B0%55:7=>>A@B?B; chr1 556681 C C 102 0 60 25 .$....,...,....,,,,,,,,,^~,^~. %%3%.B*4.%.34.6./B=?@@>5. chr1 556682 A A 70 0 60 24 ...C,...,....,,,,,,,,,,. %:%(B:A4%7A?;A><<999=<< chr1 556683 G G 99 0 60 24 ....,...,....,,,,,,,,,,. %A%3B@%?%C?AB@BB/./-1A7? The columns are chromosome, 1-based coordinate, reference base, consensus base, consensus quality, SNP quality, maximum mapping quality of the reads covering the sites, the number of reads covering the site, read bases and base qualities. An example of pileup files for indels is shown below: seq2 156 * +AG/+AG 71 252 99 11 +AG * 3 8 0 ANNOVAR automatically recognizes both SNPs and indels in pileup file, and process them correctly. =item * B<GFF3-SOLiD format> The SOLiD provides a GFF3-compatible format for SNPs, indels and structural variants. A typical example file is given below: ##gff-version 3 ##solid-gff-version 0.3 ##source-version 2 ##type DNA ##date 2009-03-13 ##time 0:0:0 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141 ##reference-file ##input-files Yoruban_snp_10x.txt ##run-path chr_name AB_SOLiD SNP caller SNP coord coord 1 . . coverage=# cov;ref_base=ref;ref_score=score;ref_confi=confi;ref_single=Single;ref_paired=Paired;consen_base=consen;consen_score=score;consen_confi=conf;consen_single=Single;consen_paired=Paired;rs_id=rs_id,dbSNP129 1 AB_SOLiD SNP caller SNP 997 997 1 . . coverage=3;ref_base=A;ref_score=0.3284;ref_confi=0.9142;ref_single=0/0;ref_paired=1/1;consen_base=G;consen_score=0.6716;consen_confi=0.9349;consen_single=0/0;consen_paired=2/2 1 AB_SOLiD SNP caller SNP 2061 2061 1 . . coverage=2;ref_base=G;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=C;consen_score=1.0000;consen_confi=0.8985;consen_single=0/0;consen_paired=2/2 1 AB_SOLiD SNP caller SNP 4770 4770 1 . . coverage=2;ref_base=A;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=G;consen_score=1.0000;consen_confi=0.8854;consen_single=0/0;consen_paired=2/2 1 AB_SOLiD SNP caller SNP 4793 4793 1 . . coverage=14;ref_base=A;ref_score=0.0723;ref_confi=0.8746;ref_single=0/0;ref_paired=1/1;consen_base=G;consen_score=0.6549;consen_confi=0.8798;consen_single=0/0;consen_paired=9/9 1 AB_SOLiD SNP caller SNP 6241 6241 1 . . coverage=2;ref_base=T;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=C;consen_score=1.0000;consen_confi=0.7839;consen_single=0/0;consen_paired=2/2 Newer version of ABI BioScope now use diBayes caller, and the output file is given below: ##gff-version 3 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141 ##List of SNPs. Date Sat Dec 18 10:30:45 2010 Stringency: medium Mate Pair: 1 Read Length: 50 Polymorphism Rate: 0.003000 Bayes Coverage: 60 Bayes_Single_SNP: 1 Filter_Single_SNP: 1 Quick_P_Threshold: 0.997000 Bayes_P_Threshold: 0.040000 Minimum_Allele_Ratio: 0.150000 Minimum_Allele_Ratio_Multiple_of_Dicolor_Error: 100 ##1 chr1 ##2 chr2 ##3 chr3 ##4 chr4 ##5 chr5 ##6 chr6 ##7 chr7 ##8 chr8 ##9 chr9 ##10 chr10 ##11 chr11 ##12 chr12 ##13 chr13 ##14 chr14 ##15 chr15 ##16 chr16 ##17 chr17 ##18 chr18 ##19 chr19 ##20 chr20 ##21 chr21 ##22 chr22 ##23 chrX ##24 chrY ##25 chrM # source-version SOLiD BioScope diBayes(SNP caller) #Chr Source Type Pos_Start Pos_End Score Strand Phase Attributes chr1 SOLiD_diBayes SNP 221367 221367 0.091151 . . genotype=R;reference=G;coverage=3;refAlleleCounts=1;refAlleleStarts=1;refAlleleMeanQV=29;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=27;diColor1=11;diColor2=33;het=1;flag= chr1 SOLiD_diBayes SNP 555317 555317 0.095188 . . genotype=Y;reference=T;coverage=13;refAlleleCounts=11;refAlleleStarts=10;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=29;diColor1=00;diColor2=22;het=1;flag= chr1 SOLiD_diBayes SNP 555327 555327 0.037582 . . genotype=Y;reference=T;coverage=12;refAlleleCounts=6;refAlleleStarts=6;refAlleleMeanQV=19;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=29;diColor1=12;diColor2=30;het=1;flag= chr1 SOLiD_diBayes SNP 559817 559817 0.094413 . . genotype=Y;reference=T;coverage=9;refAlleleCounts=5;refAlleleStarts=4;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=14;diColor1=11;diColor2=33;het=1;flag= chr1 SOLiD_diBayes SNP 714068 714068 0.000000 . . genotype=M;reference=C;coverage=13;refAlleleCounts=7;refAlleleStarts=6;refAlleleMeanQV=25;novelAlleleCounts=6;novelAlleleStarts=4;novelAlleleMeanQV=22;diColor1=00;diColor2=11;het=1;flag= The file conforms to standard GFF3 specifications, but the last column is solid- specific and it gives certain parameters for the SNP calls. An example of the short indel format by GFF3-SOLiD is given below: ##gff-version 3 ##solid-gff-version 0.3 ##source-version SOLiD Corona Lite v.4.0r2.0, find-small-indels.pl v 1.0.1, process-small-indels v 0.2.2, 2009-01-12 12:28:49 ##type DNA ##date 2009-01-26 ##time 18:33:20 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141 ##reference-file ##input-files ../../mp-results/JOAN_20080104_1.pas,../../mp-results/BARB_20071114_1.pas,../../mp-results/BARB_20080227_2.pas ##run-path /data/results2/Yoruban-frag-indel/try.01.06/mp-w2x25-2x-4x-8x-10x/2x ##Filter-settings: max-ave-read-pos=none,min-ave-from-end-pos=9.1,max-nonreds-4filt=2,min-insertion-size=none,min-deletion-size=none,max-insertion-size=none,max-deletion-size=none,require-called-indel-size?=T chr1 AB_SOLiD Small Indel Tool deletion 824501 824501 1 . . del_len=1;tight_chrom_pos=824501-824502;loose_chrom_pos=824501-824502;no_nonred_reads=2;no_mismatches=1,0;read_pos=4,6;from_end_pos=21,19;strands=+,-;tags=R3,F3;indel_sizes=-1,-1;read_seqs=G3021212231123203300032223,T3321132212120222323222101;dbSNP=rs34941678,chr1:824502-824502(-),EXACT,1,/GG chr1 AB_SOLiD Small Indel Tool insertion_site 1118641 1118641 1 . . ins_len=3;tight_chrom_pos=1118641-1118642;loose_chrom_pos=1118641-1118642;no_nonred_reads=2;no_mismatches=0,1;read_pos=17,6;from_end_pos=8,19;strands=+,+;tags=F3,R3;indel_sizes=3,3;read_seqs=T0033001100022331122033112,G3233112203311220000001002 The keyword deletion or insertion_site is used in the fourth column to indicate that file format. An example of the medium CNV format by GFF3-SOLiD is given below: ##gff-version 3 ##solid-gff-version 0.3 ##source-version SOLiD Corona Lite v.4.0r2.0, find-small-indels.pl v 1.0.1, process-small-indels v 0.2.2, 2009-01-12 12:28:49 ##type DNA ##date 2009-01-27 ##time 15:54:36 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141 ##reference-file ##input-files big_d20e5-del12n_up-ConsGrp-2nonred.pas.sum ##run-path /data/results2/Yoruban-frag-indel/try.01.06/mp-results-lmp-e5/big_d20e5-indel_950_2050 chr1 AB_SOLiD Small Indel Tool deletion 3087770 3087831 1 . . del_len=62;tight_chrom_pos=none;loose_chrom_pos=3087768-3087773;no_nonred_reads=2;no_mismatches=2,2;read_pos=27,24;from_end_pos=23,26;strands=-,+;tags=F3,F3;indel_sizes=-62,-62;read_seqs=T11113022103331111130221213201111302212132011113022,T02203111102312122031111023121220311111333012203111 chr1 AB_SOLiD Small Indel Tool deletion 4104535 4104584 1 . . del_len=50;tight_chrom_pos=4104534-4104537;loose_chrom_pos=4104528-4104545;no_nonred_reads=3;no_mismatches=0,4,4;read_pos=19,19,27;from_end_pos=31,31,23;strands=+,+,-;tags=F3,R3,R3;indel_sizes=-50,-50,-50;read_seqs=T31011011013211110130332130332132110110132020312332,G21031011013211112130332130332132110132132020312332,G20321302023001101123123303103303101113231011011011 chr1 AB_SOLiD Small Indel Tool insertion_site 2044888 2044888 1 . . ins_len=18;tight_chrom_pos=2044887-2044888;loose_chrom_pos=2044887-2044889;no_nonred_reads=2;bead_ids=1217_1811_209,1316_908_1346;no_mismatches=0,2;read_pos=13,15;from_end_pos=37,35;strands=-,-;tags=F3,F3;indel_sizes=18,18;read_seqs=T31002301231011013121000101233323031121002301231011,T11121002301231011013121000101233323031121000101231;non_indel_no_mismatches=3,1;non_indel_seqs=NIL,NIL chr1 AB_SOLiD Small Indel Tool insertion_site 74832565 74832565 1 . . ins_len=16;tight_chrom_pos=74832545-74832565;loose_chrom_pos=74832545-74832565;no_nonred_reads=2;bead_ids=1795_181_514,1651_740_519;no_mismatches=0,2;read_pos=13,13;from_end_pos=37,37;strands=-,-;tags=F3,R3;indel_sizes=16,16;read_seqs=T33311111111111111111111111111111111111111111111111,G23311111111111111111111111111111111111111311011111;non_indel_no_mismatches=1,0;non_indel_seqs=NIL,NIL An example of the large indel format by GFF3-SOLiD is given below: ##gff-version 3 ##solid-gff-version 0.3 ##source-version ??? ##type DNA ##date 2009-03-13 ##time 0:0:0 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141 ##reference-file ##input-files /data/results5/yoruban_strikes_back_large_indels/LMP/five_mm_unique_hits_no_rescue/5_point_6x_del_lib_1/results/NA18507_inter_read_indels_5_point_6x.dat ##run-path chr1 AB_SOLiD Large Indel Tool insertion_site 1307279 1307791 1 . . deviation=-742;stddev=7.18;ref_clones=-;dev_clones=4 chr1 AB_SOLiD Large Indel Tool insertion_site 2042742 2042861 1 . . deviation=-933;stddev=8.14;ref_clones=-;dev_clones=3 chr1 AB_SOLiD Large Indel Tool insertion_site 2443482 2444342 1 . . deviation=-547;stddev=11.36;ref_clones=-;dev_clones=17 chr1 AB_SOLiD Large Indel Tool insertion_site 2932046 2932984 1 . . deviation=-329;stddev=6.07;ref_clones=-;dev_clones=14 chr1 AB_SOLiD Large Indel Tool insertion_site 3166925 3167584 1 . . deviation=-752;stddev=13.81;ref_clones=-;dev_clones=14 An example of the CNV format by GFF3-SOLiD if given below: ##gff-version 3 ##solid-gff-version 0.3 ##source-version ??? ##type DNA ##date 2009-03-13 ##time 0:0:0 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141 ##reference-file ##input-files Yoruban_cnv.coords ##run-path chr1 AB_CNV_PIPELINE repeat_region 1062939 1066829 . . . fraction_mappable=51.400002;logratio=-1.039300;copynum=1;numwindows=1 chr1 AB_CNV_PIPELINE repeat_region 1073630 1078667 . . . fraction_mappable=81.000000;logratio=-1.409500;copynum=1;numwindows=2 chr1 AB_CNV_PIPELINE repeat_region 2148325 2150352 . . . fraction_mappable=98.699997;logratio=-1.055000;copynum=1;numwindows=1 chr1 AB_CNV_PIPELINE repeat_region 2245558 2248109 . . . fraction_mappable=78.400002;logratio=-1.042900;copynum=1;numwindows=1 chr1 AB_CNV_PIPELINE repeat_region 3489252 3492632 . . . fraction_mappable=59.200001;logratio=-1.119900;copynum=1;numwindows=1 chr1 AB_CNV_PIPELINE repeat_region 5654415 5657276 . . . fraction_mappable=69.900002;logratio=1.114500;copynum=4;numwindows=1 chr1 AB_CNV_PIPELINE repeat_region 9516165 9522726 . . . fraction_mappable=65.850006;logratio=-1.316700;numwindows=2 chr1 AB_CNV_PIPELINE repeat_region 16795117 16841025 . . . fraction_mappable=44.600002;logratio=1.880778;copynum=7;numwindows=9 The keyword repeat_region is used here, although it actually refers to CNVs. An example of the inversion format by GFF3-SOLiD is given below: ##gff-version 3 ##solid-gff-version 0.2 ##generated by SOLiD inversion tool chr10 AB_SOLiD inversion 46443107 46479585 268.9 . . left=chr10:46443107-46443146;right=chr10:46479583-46479585;leftscore=295.0;rightscore=247.0;count_AAA_further_left=117;count_AAA_left=3;count_AAA_right=3;count_AAA_further_right=97;left_min_count_AAA=chr10:46443107-46443112;count_AAA_min_left=0;count_AAA_max_left=3;right_min_count_AAA=chr10:46479585-46479585;count_AAA_min_right=1;count_AAA_max_right=3;homozygous=UNKNOWN chr4 AB_SOLiD inversion 190822813 190850112 214.7 . . left=chr4:190822813-190822922;right=chr4:190850110-190850112;leftscore=140.0;rightscore=460.0;count_AAA_further_left=110;count_AAA_left=78;count_AAA_right=74;count_AAA_further_right=77;left_min_count_AAA=chr4:190822813-190822814;count_AAA_min_left=69;count_AAA_max_left=77;right_min_count_AAA=chr4:190850110-190850112;count_AAA_min_right=74;count_AAA_max_right=74;homozygous=NO chr6 AB_SOLiD inversion 168834969 168837154 175.3 . . left=chr6:168834969-168835496;right=chr6:168836643-168837154;leftscore=185.4;rightscore=166.2;count_AAA_further_left=67;count_AAA_left=43;count_AAA_right=40;count_AAA_further_right=59;left_min_count_AAA=chr6:168835058-168835124,chr6:168835143-168835161,chr6:168835176-168835181,chr6:168835231-168835262;count_AAA_min_left=23;count_AAA_max_left=29;right_min_count_AAA=chr6:168836643-168836652;count_AAA_min_right=23;count_AAA_max_right=31;homozygous=NO The program should be able to recognize all the above GFF3-SOLiD format automatically, and handle them accordingly. =item * B<Complete Genomics format> This format is provided by the Complete Genomics company to their customers. The file var-[ASM-ID].tsv.bz2 includes a description of all loci where the assembled genome differs from the reference genome. An example of the Complete Genomics format is shown below: #BUILD 1.5.0.5 #GENERATED_AT 2009-Nov-03 19:52:21.722927 #GENERATED_BY dbsnptool #TYPE VAR-ANNOTATION #VAR_ANN_SET /Proj/Pipeline/Production_Data/REF/HUMAN-F_06-REF/dbSNP.csv #VAR_ANN_TYPE dbSNP #VERSION 0.3 >locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef 1 2 all chr1 0 959 no-call = ? 2 2 all chr1 959 972 = = = 3 2 all chr1 972 1001 no-call = ? 4 2 all chr1 1001 1008 = = = 5 2 all chr1 1008 1114 no-call = ? 6 2 all chr1 1114 1125 = = = 7 2 all chr1 1125 1191 no-call = ? 8 2 all chr1 1191 1225 = = = 9 2 all chr1 1225 1258 no-call = ? 10 2 all chr1 1258 1267 = = = 12 2 all chr1 1267 1275 no-call = ? 13 2 all chr1 1275 1316 = = = 14 2 all chr1 1316 1346 no-call = ? 15 2 all chr1 1346 1367 = = = 16 2 all chr1 1367 1374 no-call = ? 17 2 all chr1 1374 1388 = = = 18 2 all chr1 1388 1431 no-call = ? 19 2 all chr1 1431 1447 = = = 20 2 all chr1 1447 1454 no-call = ? The following information is provided in documentation from Complete Genomics, that describes the var-ASM format. 1. locus. Identifier of a particular genomic locus 2. ploidy. The ploidy of the reference genome at the locus (= 2 for autosomes, 2 for pseudoautosomal regions on the sex chromosomes, 1 for males on the non-pseudoautosomal parts of the sex chromosomes, 1 for mitochondrion, '?' if varType is 'no-ref' or 'PAR-called-in-X'). The reported ploidy is fully determined by gender, chromosome and location, and is not inferred from the sequence data. 3. haplotype. Identifier for each haplotype at the variation locus. For diploid genomes, 1 or 2. Shorthand of 'all' is allowed where the varType field is one of 'ref', 'no-call', 'no-ref', or 'PAR-called-in-X'. Haplotype numbering does not imply phasing; haplotype 1 in locus 1 is not necessarily in phase with haplotype 1 in locus 2. See hapLink, below, for phasing information. 4. chromosome. Chromosome name in text: 'chr1','chr2', ... ,'chr22','chrX','chrY'. The mitochondrion is represented as 'chrM'. The pseudoautosomal regions within the sex chromosomes X and Y are reported at their coordinates on chromosome X. 5. begin. Reference coordinate specifying the start of the variation (not the locus) using the half-open zero-based coordinate system. See section 'Sequence Coordinate System' for more information. 6. end. Reference coordinate specifying the end of the variation (not the locus) using the half-open zero-based coordinate system. See section 'Sequence Coordinate System' for more information. 7. varType. Type of variation, currently one of: snp: single-nucleotide polymorphism ins: insertion del: deletion sub: Substitution of one or more reference bases with the bases in the allele column 'ref' : no variation; the sequence is identical to the reference sequence on the indicated haplotype no-call-rc: 'no-call reference consistent 'one or more bases are ambiguous, but the allele is potentially consistent with the reference no-call-ri: 'no-call reference inconsistent' one or more bases are ambiguous, but the allele is definitely inconsistent with the reference no-call: an allele is completely indeterminate in length and composition, i.e. alleleSeq = '?' no-ref: the reference sequence is unspecified at this locus. PAR-called-in-X: this locus overlaps one of the pseudoautosomal regions on the sex chromosomes. The called sequence is reported as diploid sequence on Chromosome X; on chromosome Y the sequence is reported as varType = 'PAR-called-in-X'. 8. reference. The reference sequence for the locus of variation. Empty when varType is ins. A value of '=' indicates that the user must consult the reference for the sequence; this shorthand is only used in regions where no haplotype deviates from the reference sequence. 9. alleleSeq. The observed sequence at the locus of variation. Empty when varType is del. '?' isused to indicate 0 or more unknown bases within the sequence; 'N' is used to indicate exactly one unknown base within the sequence.'=' is used as shorthand to indicate identity to the reference sequence for non-variant sequence, i.e. when varType is 'ref'. 10. totalScore. A score corresponding to a single variation and haplotype, representing the confidence in the call. 11. hapLink. Identifier that links a haplotype at one locus to haplotypes at other loci. Currently only populated for very proximate variations that were assembled together. Two calls that share a hapLink identifier are expected to be on the same haplotype, 12. xRef. Field containing external variation identifiers, currently only populated for variations corroborated directly by dbSNP. Format: dbsnp:[rsID], with multiple entries separated by the semicolon (;). In older versions of the format specification, the sub keyword used to be insdel keyword. ANNOVAR takes care of this. =item * B<SOAPsnp format> An example of the SOAP SNP caller format is shown below: chr8 35782 A R 1 A 27 1 2 G 26 1 2 5 0.500000 2.00000 1 5 chr8 35787 G R 0 G 25 4 6 A 17 2 4 10 0.266667 1.60000 0 5 The following information is provided in documentation from BGI who developed SOAP suite. It differs slightly from the description at the SOAPsnp website, and presumably the website is outdated. Format description:(left to right) 1. Chromosome name 2. Position of locus 3. Nucleotide at corresponding locus of reference sequence 4. Genotype of sequencing sample 5. Quality value 6. nucleotide with the highest probability(first nucleotide) 7. Quality value of the nucleotide with the highest probability 8. Number of supported reads that can only be aligned to this locus 9. Number of all supported reads that can be aligned to this locus 10. Nucleotide with higher probability 11. Quality value of nucleotide with higher probability 12. Number of supported reads that can only be aligned to this locus 13. Number of all supported reads that can be aligned to this locus 14. Total number of reads that can be aligned to this locus 15. Order and quality value 16. Estimated copy number for this locus 17. Presence of this locus in the dbSNP database. 1 refers to presence and 0 refers to inexistence 18. The distance between this locus and another closest SNP =item * B<SOAPindel format> The current version of ANNOVAR handles SoapSNP and SoapIndel automatically via a single argument '--format soap'. An example of SOAP indel caller format is shown below: chr11 44061282 - +2 CT Hete chr11 45901572 + +1 C Hete chr11 48242562 * -3 TTC Homo chr11 57228723 * +4 CTTT Homo chr11 57228734 * +4 CTTT Homo chr11 57555685 * -1 C Hete chr11 61482191 - +3 TCC Hete chr11 64608031 * -1 T Homo chr11 64654936 * +1 C Homo chr11 71188303 + -1 T Hete chr11 75741034 + +1 T Hete chr11 76632438 * +1 A Hete chr11 89578266 * -2 AG Homo chr11 104383261 * +1 T Hete chr11 124125940 + +4 CCCC Hete chr12 7760052 * +1 T Homo chr12 8266049 * +3 ACG Homo I do not see a documentation describing this format yet as of September 2010. =item B<--SOAPsv format> An example is given below: Chr2 Deletion 42894 43832 43167 43555 388 0-0-0 FR 41 An explanation of the structural variation format is given below: Format description (from left to right) 1. Chromosome name 2. Type of structure variation 3. Minimal value of start position in cluster 4. Maximal value of end position in cluster 5. Estimated start position of this structure variation 6. Estimated end position of this structure variation 7. Length of SV 8. Breakpoint of SV (only for insertion) 9. Unusual matching mode (F refers to align with forward sequence, R refers to align with reverse sequence) 10. number of paired-end read which support this structure variation =item * B<MAQ format> MAQ can perform alignment and generate genotype calls, including SNP calls and indel calls. The format is described below: For indel header: The output is TAB delimited with each line consisting of chromosome, start position, type of the indel, number of reads across the indel, size of the indel and inserted/deleted nucleotides (separated by colon), number of indels on the reverse strand, number of indels on the forward strand, 5' sequence ahead of the indel, 3' sequence following the indel, number of reads aligned without indels and three additional columns for filters. An example is below: chr10 110583 - 2 -2:AG 0 1 GCGAGACTCAGTATCAAAAAAAAAAAAAAAAA AGAAAGAAAGAAAAAGAAAAAAATAGAAAGAA 1 @2, @72, @0, chr10 120134 - 8 -2:CA 0 1 CTCTTGCCCGCTCACACATGTACACACACGCG CACACACACACACACACATCAGCTACCTACCT 7 @65,62,61,61,45,22,7, @9,12,13,13,29,52,67, @0,0,0,0,0,0,0, chr10 129630 - 1 -1:T 1 0 ATGTTGTGACTCTTAATGGATAAGTTCAGTCA TTTTTTTTTAGCTTTTAACCGGACAAAAAAAG 0 @ @ @ chr10 150209 - 1 4:TTCC 1 0 GCATATAGGGATGGGCACTTTACCTTTCTTTT TTCCTTCCTTCCTTCCTTCCCTTTCCTTTCCT 0 @ @ @ chr10 150244 - 2 -4:TTCT 0 1 CTTCCTTCCTTCCTTCCCTTTCCTTTCCTTTC TTCTTTCTTTCTTTCTTTCTTTTTTTTTTTTT 0 @ @ @ chr10 159622 - 1 3:AGG 0 1 GAAGGAGGAAGGACGGAAGGAGGAAGGAAGGA AGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGA 0 @ @ @ chr10 206372 - 2 2:GT 1 0 ATAATAGTAACTGTGTATTTGATTATGTGTGC GTGTGTGTGTGTGTGTGTGTGTGTGCGTGCTT 1 @37, @37, @8, chr10 245751 - 11 -1:C 0 1 CTCATAAATACAAGTCATAATGAAAGAAATTA CCACCATTTTCTTATTTTCATTCATTTTTAGT 10 @69,64,53,41,30,25,22,14,5,4, @5,10,21,33,44,49,52,60,69,70, @0,0,0,0,0,0,0,0,0,0, chr10 253066 - 1 2:TT 0 1 TATTGATGAGGGTGGATTATACTTTAGAACAC TATTCAAACAGTTCTTCCACATATCTCCCTTT 0 @ @ @ chr10 253455 - 2 -3:AAA 1 0 GTTGCACTCCAGCCTGGCGAGATTCTGTCTCC AAAAAAAAAAAAAAAAATTGTTGTGAAATACA 1 @55, @19, @4, For snp output file: Each line consists of chromosome, position, reference base, consensus base, Phred-like consensus quality, read depth, the average number of hits of reads covering this position, the highest mapping quality of the reads covering the position, the minimum consensus quality in the 3bp flanking regions at each side of the site (6bp in total), the second best call, log likelihood ratio of the second best and the third best call, and the third best call. An example is below: chr10 83603 C T 28 12 2.81 63 34 Y 26 C chr10 83945 G R 59 61 4.75 63 62 A 47 G chr10 83978 G R 47 40 3.31 63 62 A 21 G chr10 84026 G R 89 22 2.44 63 62 G 49 A chr10 84545 C T 54 9 1.69 63 30 N 135 N chr10 85074 G A 42 5 1.19 63 38 N 108 N chr10 85226 A T 42 5 1.00 63 42 N 107 N chr10 85229 C T 42 5 1.00 63 42 N 112 N chr10 87518 A G 39 4 3.25 63 38 N 9 N chr10 116402 T C 39 4 1.00 63 38 N 76 N =item * B<CASAVA format> An example of Illumina CASAVA format is given below: #position A C G T modified_call total used score reference type 14930 3 0 8 0 GA 11 11 29.10:11.10 A SNP_het2 14933 4 0 7 0 GA 11 11 23.50:13.70 G SNP_het1 14976 3 0 8 0 GA 11 11 24.09:9.10 G SNP_het1 15118 2 1 4 0 GA 8 7 10.84:6.30 A SNP_het2 An example of the indels is given below: # ** CASAVA depth-filtered indel calls ** #$ CMDLINE /illumina/pipeline/install/CASAVA_v1.7.0/libexec/CASAVA-1.7.0/filterIndelCalls.pl--meanReadDepth=2.60395068970547 --indelsCovCutoff=-1 --chrom=chr1.fa /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0000.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0001.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0002.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0003.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0004.txt #$ CHROMOSOME chr1.fa #$ MAX_DEPTH undefined # #$ COLUMNS pos CIGAR ref_upstream ref/indel ref_downstream Q(indel) max_gtype Q(max_gtype) max2_gtype bp1_reads ref_reads indel_reads other_reads repeat_unit ref_repeat_count indel_repeat_count 948847 1I CCTCAGGCTT -/A ATAATAGGGC 969 hom 47 het 22 0 16 6 A 1 2 978604 2D CACTGAGCCC CT/-- GTGTCCTTCC 251 hom 20 het 8 0 4 4 CT 1 0 1276974 4I CCTCATGCAG ----/ACAC ACACATGCAC 838 hom 39 het 18 0 14 4 AC 2 4 1289368 2D AGCCCGGGAC TG/-- GGAGCCGCGC 1376 hom 83 het 33 0 25 9 TG 1 0 =item * B<VCF4 format> VCF4 can be used to describe both population-level variation information, or for reads derived from a single individual. One example of the indel format for one individual is given below: ##fileformat=VCFv4.0 ##IGv2_bam_file_used=MIAPACA2.alnReAln.bam ##INFO=<ID=AC,Number=2,Type=Integer,Description="# of reads supporting consensus indel/any indel at the site"> ##INFO=<ID=DP,Number=1,Type=Integer,Description="total coverage at the site"> ##INFO=<ID=MM,Number=2,Type=Float,Description="average # of mismatches per consensus indel-supporting read/per reference-supporting read"> ##INFO=<ID=MQ,Number=2,Type=Float,Description="average mapping quality of consensus indel-supporting reads/reference-supporting reads"> ##INFO=<ID=NQSBQ,Number=2,Type=Float,Description="Within NQS window: average quality of bases from consensus indel-supporting reads/from reference-supporting reads"> ##INFO=<ID=NQSMM,Number=2,Type=Float,Description="Within NQS window: fraction of mismatching bases in consensus indel-supporting reads/in reference-supporting reads"> ##INFO=<ID=SC,Number=4,Type=Integer,Description="strandness: counts of forward-/reverse-aligned indel-supporting reads / forward-/reverse-aligned reference supporting reads"> ##IndelGenotyperV2="" ##reference=hg18.fa ##source=IndelGenotyperV2 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Miapaca_trimmed_sorted.bam chr1 439 . AC A . PASS AC=5,5;DP=7;MM=7.0,3.0;MQ=23.4,1.0;NQSBQ=23.98,25.5;NQSMM=0.04,0.0;SC=2,3,0,2 GT 1/0 chr1 714048 . T TCAAC . PASS AC=3,3;DP=9;MM=3.0,7.1666665;MQ=1.0,10.833333;NQSBQ=23.266666,21.932203;NQSMM=0.0,0.15254237;SC=3,0,3,3 GT 0/1 chr1 714049 . G GC . PASS AC=3,3;DP=9;MM=3.0,7.1666665;MQ=1.0,10.833333;NQSBQ=23.233334,21.83051;NQSMM=0.0,0.15254237;SC=3,0,3,3 GT 0/1 chr1 813675 . A AATAG . PASS AC=5,5;DP=8;MM=0.4,1.0;MQ=5.0,67.0;NQSBQ=25.74,25.166666;NQSMM=0.0,0.033333335;SC=4,1,1,2 GT 0/1 chr1 813687 . AGAGAGAGAGAAG A . PASS AC=5,5;DP=8;MM=0.4,1.0;MQ=5.0,67.0;NQSBQ=24.54,25.2;NQSMM=0.02,0.06666667;SC=4,1,1,2 GT 1/0 =back The code was written by Dr. Kai Wang and modified by Dr. Germán Gastón Leparc. Various users have provided sample input files for many SNP callin software, for the development of conversion subroutines. We thank these users for their continued support to improve the functionality of the script. For questions or comments, please contact kai@openbioinformatics.org. =cut