Mercurial > repos > iuc > varscan_somatic
changeset 2:2fe9ebb98aad draft
planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit 30867f1f022bed18ba1c3b8dc9c54226890b3a9c
author | iuc |
---|---|
date | Tue, 04 Dec 2018 05:15:50 -0500 |
parents | 31a38ce7e8ae |
children | 7903598ccbaf |
files | macros.xml test-data/control_chrM.bam test-data/hg19_chrM.fa test-data/tumor_chrM.bam test-data/varscan_somatic_indel_result1.vcf test-data/varscan_somatic_indel_result2.vcf test-data/varscan_somatic_snp_result1.vcf test-data/varscan_somatic_snp_result2.vcf varscan.py varscan_somatic.xml |
diffstat | 10 files changed, 2030 insertions(+), 195 deletions(-) [+] |
line wrap: on
line diff
--- a/macros.xml Sun Jul 15 09:19:25 2018 -0400 +++ b/macros.xml Tue Dec 04 05:15:50 2018 -0500 @@ -2,7 +2,6 @@ <xml name="requirements"> <requirements> <requirement type="package" version="@VERSION@">varscan</requirement> - <requirement type="package" version="4.2.1">gawk</requirement> <yield/> </requirements> </xml> @@ -20,6 +19,7 @@ <xml name="citations"> <citations> <citation type="doi">10.1101/gr.129684.111</citation> + <citation type="doi">10.1002/0471250953.bi1504s44</citation> </citations> </xml> @@ -52,29 +52,34 @@ </token> - <xml name="min_coverage"> + <xml name="min_coverage" token_help="Minimum depth at a position to make a call"> <param argument="--min-coverage" name="min_coverage" type="integer" value="8" min="1" max="200" - label="Minimum read depth" help="Minimum depth at a position to make a call"/> + label="Minimum coverage" help="@HELP@"/> </xml> <xml name="min_reads2"> <param argument="--min-reads2" name="min_reads2" type="integer" value="2" min="1" max="200" - label="Minimum supporting reads" help="Minimum supporting reads at a position to make a call"/> + label="Minimum supporting reads" help="Minimum number of variant-supporting reads at a position required to make a call"/> </xml> <xml name="min_avg_qual"> <param argument="--min-avg-qual" name="min_avg_qual" type="integer" value="15" min="1" max="50" - label="Minimum base quality at a position to count a read"/> + label="Minimum base quality" + help="The minimum base quality at the variant position required to use a read for calling" /> </xml> <xml name="min_var_freq" token_value="0.01"> <param argument="--min-var-freq" name="min_var_freq" type="float" value="@VALUE@" min="0" max="1" - label="Minimum variant allele frequency threshold"/> + label="Minimum variant allele frequency" + help="Minimum variant allele frequency required for calling a variant"/> </xml> <xml name="min_freq_for_hom"> <param argument="--min-freq-for-hom" name="min_freq_for_hom" type="float" value="0.75" min="0" max="1" - label="Minimum frequency to call homozygote"/> + label="Minimum homozygous variant allele frequency" + help="Minimum variant allele frequency required for calling a homozygous genotype" /> </xml> - <xml name="p_value" token_label="p-value threshold for calling variants" token_value="0.01"> - <param argument="--p-value" name="p_value" type="float" value="@VALUE@" min="0.0" max="1.0" - label="@LABEL@"/> + <xml name="p_value" token_value="0.01" + token_label="p-value threshold for calling variants" + token_help=""> + <param argument="--p-value" name="p_value" type="float" value="@VALUE@" min="0" max="1" + label="@LABEL@" help="@HELP@"/> </xml> <xml name="strand_filter"> <param name="strand_filter" type="select" label="Ignore variants with >90% support on one strand"> @@ -83,4 +88,12 @@ </param> </xml> + <token name="@HELP_HEADER@"><![CDATA[ +**VarScan Overview** + +VarScan_ performs variant detection for massively parallel sequencing data, such as exome, WGS, and transcriptome data. Full documentation of the command line package is available here_. + +.. _VarScan: http://dkoboldt.github.io/varscan/ +.. _here: http://dkoboldt.github.io/varscan/using-varscan.html + ]]></token> </macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/hg19_chrM.fa Tue Dec 04 05:15:50 2018 -0500 @@ -0,0 +1,333 @@ +>chrM +GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT +TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG +GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT +CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA +AAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAAT +GTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCA +AACCCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGC +CAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAAT +TTTATCTTTAGGCGGTATGCACTTTTAACAGTCACCCCCCAACTAACACA +TTATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATACAACCCCC +GCCCATCCTACCCAGCACACACACACCGCTGCTAACCCCATACCCCGAAC +CAACCAAACCCCAAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCA +AAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAA +ATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGC +AAGCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAG +GGACAAGCATCAAGCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCC +ACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAAACGAAAGT +TTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACC +GCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTT +TAGATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAA +CTCCAGTTGACACAAAATAGACTACGAAAGTGGCTTTAACATATCTGAAC +ACACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGC +CCTAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAGAACACTACGA +GCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGA +GGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGC +TCAGCCTATATACCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGT +AAGCGCAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGAGG +TGGCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTT +ATGAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTGAGAGTAGAGT +GCTTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGTCACCCTCC +TCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCATTTATATA +GAGGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACG +AACCAGAGTGTAGCTTAACACAAAGCACCCAACTTACACTTAGGAGATTT +CAACTTAACTTGACCGCTCTGAGCTAAACCTAGCCCCAAACCCACTCCAC +CTTACTACCAGACAACCTTAGCCAAACCATTTACCCAAATAAAGTATAGG +CGATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGA +TGAAAAATTATAACCAAGCATAATATAGCAAGGACTAACCCCTATACCTT +CTGCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCCAAAGCTAA +GACCCCCGAAACCAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCC +GTCTATGTAGCAAAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCT +ACCGAGCCTGGTGATAGCTGGTTGTCCAAGATAGAATCTTAGTTCAACTT +TAAATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAG +TCCAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGA +GTAAAAAATTTAACACCCATAGTAGGCCTAAAAGCAGCCACCAATTAAGA +AAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAACATATAACT +GAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAA +TGTTAGTATAAGTAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCA +GATCAAAACACTGAACTGACAATTAACAGCCCAATATCTACAATCAACCA +ACAAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAG +GAAAGGTTAAAAAAAGTAAAAGGAACTCGGCAAACCTTACCCCGCCTGTT +TACCAAAAACATCACCTCTAGCATCACCAGTATTAGAGGCACCGCCTGCC +CAGTGACACATGTTTAACGGCCGCGGTACCCTAACCGTGCAaaggtagca +taatcacttgttccttaaatagggacctgtatgaatggctccacgagggt +tcagctgtctcttacttttaaccagtgaaattgacctgcccgtgaagagg +cgggcatgacacagcaagacgagaagaccctatggagctttaatttaTTA +ATGCAAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCA +TTAAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCA +GTACATGCTAAGACTTCACCAGTCAAAGCGAACTACTATACTCAATTGAT +CCAATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATC +CTATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGAT +CAGGACATCCCGATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGA +TTAAAGTCCTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTT +CTATCTACTTCAAATTCCTCCCTGTACGAAAGGACAAGAGAAATAAGGCC +TACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTAT +TATACCCACACCCACCCAAGAACAGGGTTTgttaagatggcagagcccgg +taatcgcataaaacttaaaactttacagtcagaggttcaattcctcttct +taacaacaTACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAA +TCGCAATGGCATTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATA +CAACTACGCAAAGGCCCCAACGTTGTAGGCCCCTACGGGCTACTACAACC +CTTCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCA +CATCTACCATCACCCTCTACATCACCGCCCCGACCTTAGCTCTCACCATC +GCTCTTCTACTATGAACCCCCCTCCCCATACCCAACCCCCTGGTCAACCT +CAACCTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAGCCGTTTACT +CAATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGC +GCACTGCGAGCAGTAGCCCAAACAATCTCATATGAAGTCACCCTAGCCAT +CATTCTACTATCAACATTACTAATAAGTGGCTCCTTTAACCTCTCCACCC +TTATCACAACACAAGAACACCTCTGATTACTCCTGCCATCATGACCCTTG +GCCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTT +CGACCTTGCCGAAGGGGAGTCCGAACTAGTCTCAGGCTTCAACATCGAAT +ACGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATACACAAACATT +ATTATAATAAACACCCTCACCACTACAATCTTCCTAGGAACAACATATGA +CGCACTCTCCCCTGAACTCTACACAACATATTTTGTCACCAAGACCCTAC +TTCTAACCTCCCTGTTCTTATGAATTCGAACAGCATACCCCCGATTCCGC +TACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCT +AGCATTACTTATATGATATGTCTCCATACCCATTACAATCTCCAGCATTC +CCCCTCAAACCTAAGAAATATGTCTGATAAAAGAGTTACTTTGATAGAGT +AAATAATAGGAGCTTAAACCCCCTTATTTctaggactatgagaatcgaac +ccatccctgagaatccaaaattctccgtgccacctatcacaccccatcct +aAAGTAAGGTCAGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTG +GTTATACCCTTCCCGTACTAATTAATCCCCTGGCCCAACCCGTCATCTAC +TCTACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGCACTGATT +TTTTACCTGAGTAGGCCTAGAAATAAACATGCTAGCTTTTATTCCAGTTC +TAACCAAAAAAATAAACCCTCGTTCCACAGAAGCTGCCATCAAGTATTTC +CTCACGCAAGCAACCGCATCCATAATCCTTCTAATAGCTATCCTCTTCAA +CAATATACTCTCCGGACAATGAACCATAACCAATACTACCAATCAATACT +CATCATTAATAATCATAATGGCTATAGCAATAAAACTAGGAATAGCCCCC +TTTCACTTCTGAGTCCCAGAGGTTACCCAAGGCACCCCTCTGACATCCGG +CCTGCTTCTTCTCACATGACAAAAACTAGCCCCCATCTCAATCATATACC +AAATCTCTCCCTCACTAAACGTAAGCCTTCTCCTCACTCTCTCAATCTTA +TCCATCATAGCAGGCAGTTGAGGTGGATTAAACCAAACCCAGCTACGCAA +AATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAATAGCAGTTC +TACCGTACAACCCTAACATAACCATTCTTAATTTAACTATTTATATTATC +CTAACTACTACCGCATTCCTACTACTCAACTTAAACTCCAGCACCACGAC +CCTACTACTATCTCGCACCTGAAACAAGCTAACATGACTAACACCCTTAA +TTCCATCCACCCTCCTCTCCCTAGGAGGCCTGCCCCCGCTAACCGGCTTT +TTGCCCAAATGGGCCATTATCGAAGAATTCACAAAAAACAATAGCCTCAT +CATCCCCACCATCATAGCCACCATCACCCTCCTTAACCTCTACTTCTACC +TACGCCTAATCTACTCCACCTCAATCACACTACTCCCCATATCTAACAAC +GTAAAAATAAAATGACAGTTTGAACATACAAAACCCACCCCATTCCTCCC +CACACTCATCGCCCTTACCACGCTACTCCTACCTATCTCCCCTTTTATAC +TAATAATCTTATAGAAATTTAGGTTAAATACAGACCAAGAGCCTTCAAAG +CCCTCAGTAAGTTGCAATACTTAATTTCTGCAACAGCTAAGGACTGCAAA +ACCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTAATTAAGCTAA +GCCCTTACTAGACCAATGGGACTTAAACCCACAAACACTTAGTTAACAGC +TAAGCACCCTAATCAACTGGCTTCAATCTACTTCTCCCGCCGCCGGGAAA +AAAGGCGGGAGAAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCA +ATTCAATATGAAAATCACCTCGGAGCTGGTAAAAAGAGGCCTAACCCCTG +TCTTTAGATTTACAGTCCAATGCTTCACTCAGCCATTTTACCTCACCCCC +ACTGATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTG +GAACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGCACAGCT +CTAAGCCTCCTTATTCGAGCCGAGCTGGGCCAGCCAGGCAACCTTCTAGG +TAACGACCACATCTACAACGTTATCGTCACAGCCCATGCATTTGTAATAA +TCTTCTTCATAGTAATACCCATCATAATCGGAGGCTTTGGCAACTGACTA +GTTCCCCTAATAATCGGTGCCCCCGATATGGCGTTTCCCCGCATAAACAA +CATAAGCTTCTGACTCTTACCTCCCTCTCTCCTACTCCTGCTCGCATCTG +CTATAGTGGAGGCCGGAGCAGGAACAGGTTGAACAGTCTACCCTCCCTTA +GCAGGGAACTACTCCCACCCTGGAGCCTCCGTAGACCTAACCATCTTCTC +CTTACACCTAGCAGGTGTCTCCTCTATCTTAGGGGCCATCAATTTCATCA +CAACAATTATCAATATAAAACCCCCTGCCATAACCCAATACCAAACGCCC +CTCTTCGTCTGATCCGTCCTAATCACAGCAGTCCTACTTCTCCTATCTCT +CCCAGTCCTAGCTGCTGGCATCACTATACTACTAACAGACCGCAACCTCA +ACACCACCTTCTTCGACCCCGCCGGAGGAGGAGACCCCATTCTATACCAA +CACCTATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACC +AGGCTTCGGAATAATCTCCCATATTGTAACTTACTACTCCGGAAAAAAAG +AACCATTTGGATACATAGGTATGGTCTGAGCTATGATATCAATTGGCTTC +CTAGGGTTTATCGTGTGAGCACACCATATATTTACAGTAGGAATAGACGT +AGACACACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCA +CCGGCGTCAAAGTATTTAGCTGACTCGCCACACTCCACGGAAGCAATATG +AAATGATCTGCTGCAGTGCTCTGAGCCCTAGGATTCATCTTTCTTTTCAC +CGTAGGTGGCCTGACTGGCATTGTATTAGCAAACTCATCACTAGACATCG +TACTACACGACACGTACTACGTTGTAGCTCACTTCCACTATGTCCTATCA +ATAGGAGCTGTATTTGCCATCATAGGAGGCTTCATTCACTGATTTCCCCT +ATTCTCAGGCTACACCCTAGACCAAACCTACGCCAAAATCCATTTCACTA +TCATATTCATCGGCGTAAATCTAACTTTCTTCCCACAACACTTTCTCGGC +CTATCCGGAATGCCCCGACGTTACTCGGACTACCCCGATGCATACACCAC +ATGAAACATCCTATCATCTGTAGGCTCATTCATTTCTCTAACAGCAGTAA +TATTAATAATTTTCATGATTTGAGAAGCCTTCGCTTCGAAGCGAAAAGTC +CTAATAGTAGAAGAACCCTCCATAAACCTGGAGTGACTATATGGATGCCC +CCCACCCTACCACACATTCGAAGAACCCGTATACATAAAATCTAGACAaa +aaaggaaggaatcgaaccccccaaagctggtttcaagccaaccccatggc +ctccatgactttttcAAAAAGGTATTAGAAAAACCATTTCATAACTTTGT +CAAAGTTAAATTATAGGCTAAATCCTATATATCTTAATGGCACATGCAGC +GCAAGTAGGTCTACAAGACGCTACTTCCCCTATCATAGAAGAGCTTATCA +CCTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTC +CTGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACAT +CTCAGACGCTCAGGAAATAGAAACCGTCTGAACTATCCTGCCCGCCATCA +TCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGAC +GAGGTCAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTA +CTGAACCTACGAGTACACCGACTACGGCGGACTAATCTTCAACTCCTACA +TACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCCTTGACGTT +GACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTAC +ATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAA +CAGATGCAATTCCCGGACGTCTAAACCAAACCACTTTCACCGCTACACGA +CCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAG +TTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAG +GGCCCGTATTTACCCTATAGCACCCCCTCTACCCCCTCTAGAGCCCACTG +TAAAGCTAACTTAGCATTAACCTTTTAAGTTAAAGATTAAGAGAACCAAC +ACCTCTTTACAGTGAAATGCCCCAACTAAATACTACCGTATGGCCCACCA +TAATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATA +TTAAACACAAACTACCACCTACCTCCCTCACCAAAGCCCATAAAAATAAA +AAATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCTGTTCGCTTC +ATTCATTGCCCCCACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTC +TATTTCCCCCTCTATTGATCCCCACCTCCAAATATCTCATCAACAACCGA +CTAATCACCACCCAACAATGACTAATCAAACTAACCTCAAAACAAATGAT +AGCCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCT +TAATCATTTTTATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCA +TTTACACCAACCACCCAACTATCTATAAACCTAGCCATGGCCATCCCCTT +ATGAGCGGGCGCAGTGATTATAGGCTTTCGCTCTAAGATTAAAAATGCCC +TAGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTA +GTTATTATCGAAACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGT +ACGCCTAACCGCTAACATTACTGCAGGCCACCTACTCATGCACCTAATTG +GAAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATC +ATCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTT +AATCCAAGCCTACGTTTTCACACTTCTAGTAAGCCTCTACCTGCACGACA +ACACATAATGACCCACCAATCACATGCCTATCATATAGTAAAACCCAGCC +CATGACCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTA +GCCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACT +AACCAACACACTAACCATATACCAATGGTGGCGCGATGTAACACGAGAAA +GCACATACCAAGGCCACCACACACCACCTGTCCAAAAAGGCCTTCGATAC +GGGATAATCCTATTTATTACCTCAGAAGTTTTTTTCTTCGCAGGATTTTT +CTGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCCCAACTAGGAGGGC +ACTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAAGTCCCACTC +CTAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCA +CCATAGTCTAATAGAAAACAACCGAAACCAAATAATTCAAGCACTGCTTA +TTACAATTTTACTGGGTCTCTATTTTACCCTCCTACAAGCCTCAGAGTAC +TTCGAGTCTCCCTTCACCATTTCCGACGGCATCTACGGCTCAACATTTTT +TGTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGCTCAACTTTCC +TCACTATCTGCTTCATCCGCCAACTAATATTTCACTTTACATCCAAACAT +CACTTTGGCTTCGAAGCCGCCGCCTGATACTGGCATTTTGTAGATGTGGT +TTGACTATTTCTGTATGTCTCCATCTATTGATGAGGGTCTTACTCTTTTA +GTATAAATAGTACCGTTAACTTCCAATTAACTAGTTTTGACAACATTCAA +AAAAGAGTAATAAACTTCGCCTTAATTTTAATAATCAACACCCTCCTAGC +CTTACTACTAATAATTATTACATTTTGACTACCACAACTCAACGGCTACA +TAGAAAAATCCACCCCTTACGAGTGCGGCTTCGACCCTATATCCCCCGCC +CGCGTCCCTTTCTCCATAAAATTCTTCTTAGTAGCTATTACCTTCTTATT +ATTTGATCTAGAAATTGCCCTCCTTTTACCCCTACCATGAGCCCTACAAA +CAACTAACCTGCCACTAATAGTTATGTCATCCCTCTTATTAATCATCATC +CTAGCCCTAAGTCTGGCCTATGAGTGACTACAAAAAGGATTAGACTGAGC +CGAATTGGTATATAGTTTAAACAAAACGAATGATTTCGACTCATTAAATT +ATGATAATCATATTTACCAAATGCCCCTCATTTACATAAATATTATACTA +GCATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCAT +ATCCTCCCTACTATGCCTAGAAGGAATAATACTATCGCTGTTCATTATAG +CTACTCTCATAACCCTCAACACCCACTCCCTCTTAGCCAATATTGTGCCT +ATTGCCATACTAGTCTTTGCCGCCTGCGAAGCAGCGGTGGGCCTAGCCCT +ACTAGTCTCAATCTCCAACACATATGGCCTAGACTACGTACATAACCTAA +ACCTACTCCAATGCTAAAACTAATCGTCCCAACAATTATATTACTACCAC +TGACATGACTTTCCAAAAAGCACATAATTTGAATCAACACAACCACCCAC +AGCCTAATTATTAGCATCATCCCCCTACTATTTTTTAACCAAATCAACAA +CAACCTATTTAGCTGTTCCCCAACCTTTTCCTCCGACCCCCTAACAACCC +CCCTCCTAATACTAACTACCTGACTCCTACCCCTCACAATCATGGCAAGC +CAACGCCACTTATCCAGCGAACCACTATCACGAAAAAAACTCTACCTCTC +TATACTAATCTCCCTACAAATCTCCTTAATTATAACATTCACAGCCACAG +AACTAATCATATTTTATATCTTCTTCGAAACCACACTTATCCCCACCTTG +GCTATCATCACCCGATGAGGCAACCAGCCAGAACGCCTGAACGCAGGCAC +ATACTTCCTATTCTACACCCTAGTAGGCTCCCTTCCCCTACTCATCGCAC +TAATTTACACTCACAACACCCTAGGCTCACTAAACATTCTACTACTCACT +CTCACTGCCCAAGAACTATCAAACTCCTGAGCCAACAACTTAATATGACT +AGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACT +TATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTA +CTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATACGCCTCAC +ACTCATTCTCAACCCCCTGACAAAACACATAGCCTACCCCTTCCTTGTAC +TATCCCTATGAGGCATAATTATAACAAGCTCCATCTGCCTACGACAAACA +GACCTAAAATCGCTCATTGCATACTCTTCAATCAGCCACATAGCCCTCGT +AGTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCGGCGCAGTCA +TTCTCATAATCGCCCACGGACTCACATCCTCATTACTATTCTGCCTAGCA +AACTCAAACTACGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGG +ACTTCAAACTCTACTCCCACTAATAGCTTTTTGATGACTTCTAGCAAGCC +TCGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCT +GTGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGG +ACTCAACATACTAGTCACAGCCCTATACTCCCTCTACATATTTACCACAA +CACAATGGGGCTCACTCACCCACCACATTAACAACATAAAACCCTCATTC +ACACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCT +ATCCCTCAACCCCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTT +TAACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTTACGACCCCTT +ATTTACCGAGAAAGCTCACAAGAACTGCTAACTCATGCCCCCATGTCTAA +CAACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTA +GGCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGCACA +CTACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACC +ACCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATC +CATTGTCGCATCCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCA +TGTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTGAGCCACAACC +CAAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAAT +ATTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCAC +TGTGATATATAAACTCAGACCCAAACATTAATCAGTTCTTCAAATATCTA +CTCATTTTCCTAATTACCATACTAATCTTAGTTACCGCTAACAACCTATT +CCAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCA +TCAGTTGATGATACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCA +GTCCTATACAACCGTATCGGCGATATCGGTTTCATCCTCGCCTTAGCATG +ATTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAA +ACGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCA +GGCAAATCAGCCCAATTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGA +AGGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTATAGTTGTAG +CAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCA +CTAATCCAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGC +AGCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCT +CCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAA +CCACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCAT +ACTATTTATGTGCTCCGGGTCCATCATCCACAACCTTAACAATGAACAAG +ATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTCACTTCAACC +TCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGG +TTTCTACTCCAAAGACCACATCATCGAAACCGCAAACATATCATACACAA +ACGCCTGAGCCCTATCTATTACTCTCATCGCTACCTCCCTGACAAGCGCC +TATAGCACTCGAATAATTCTTCTCACCCTAACAGGTCAACCTCGCTTCCC +CACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCATTA +AACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACTAACAAC +ATTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTACCTAAAACT +CACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCA +ACTACCTAACCAACAAACTTAAAATAAAATCCCCACTATGCACATTTTAT +TTCTCCAACATACTCGGATTCTACCCTAGCATCACACACCGCACAATCCC +CTATCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACC +TAACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACAGCACCAAATC +TCCACCTCCATCATCACCTCAACCCAAAAAGGCATAATTAAACTTTACTT +CCTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAATCACATAAC +CTATTCCCCCGAGCAATCTCAATTACAATATATACACCAACAAACAATGT +TCAACCAGTAACCACTACTAATCAACGCCCATAATCATACAAAGCCCCCG +CACCAATAGGATCCTCCCGAATCAACCCTGACCCCTCTCCTTCATAAATT +ATTCAGCTTCCTACACTATTAAAGTTTACCACAACCACCACCCCATCATA +CTCTTTCACCCACAGCACCAATCCTACCTCCATCGCTAACCCCACTAAAA +CACTCACCAAGACCTCAACCCCTGACCCCCATGCCTCAGGATACTCCTCA +ATAGCCATCGCTGTAGTATATCCAAAGACAACCATCATTCCCCCTAAATA +AATTAAAAAAACTATTAAACCCATATAACCTCCCCCAAAATTCAGAATAA +TAACACACCCGACCACACCGCTAACAATCAGTACTAAACCCCCATAAATA +GGAGAAGGCTTAGAAGAAAACCCCACAAACCCCATTACTAAACCCACACT +CAACAGAAACAAAGCATACATCATTATTCTCGCACGGACTACAACCACGA +CCAATGATATGAAAAACCATCGTTGTATTTCAACTACAAGAACACCAATG +ACCCCAATACGCAAAATTAACCCCCTAATAAAATTAATTAACCACTCATT +CATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCT +CACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTA +GCCATACACTACTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCA +CATCACTCGAGACGTAAATTATGGCTGAATCATCCGCTACCTTCACGCCA +ATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGGCGAGGC +CTATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTAT +CCTCCTGCTTGCAACTATAGCAACAGCCTTCATAGGCTATGTCCTCCCGT +GAGGCCAAATATCATTCTGAGGGGCCACAGTAATTACAAACTTACTATCC +GCCATCCCATACATTGGGACAGACCTAGTTCAATGAATCTGAGGAGGCTA +CTCAGTAGACAGTCCCACCCTCACACGATTCTTTACCTTTCACTTCATCT +TACCCTTCATTATTGCAGCCCTAGCAGCACTCCACCTCCTATTCTTGCAC +GAAACGGGATCAAACAACCCCCTAGGAATCACCTCCCATTCCGATAAAAT +CACCTTCCACCCTTACTACACAATCAAAGACGCCCTCGGCTTACTTCTCT +TCCTTCTCTCCTTAATGACATTAACACTATTCTCACCAGACCTCCTAGGC +GACCCAGACAATTATACCCTAGCCAACCCCTTAAACACCCCTCCCCACAT +CAAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCC +CTAACAAACTAGGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTA +GCAATAATCCCCATCCTCCATATATCCAAACAACAAAGCATAATATTTCG +CCCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCTCATTC +TAACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGA +CAAGTAGCATCCGTACTATACTTCACAACAATCCTAATCCTAATACCAAC +TATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTGTCCTTGTAGTA +TAAACTAATACACCAGTCTTGTAAACCGGAGACGAAAACCTTTTTCCAAG +GACAAATCAGAGAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAA +GATTCTAATTTAAACTATTCTCTGTTCTTTCATGGGGAAGCAGATTTGGG +TACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTAC +ATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCA +CCTGTAGTACATAAAAACCCAACCCACATCAAACCCCCCCCCCCCATGCT +TACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAAC +TCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTT +AACAGTACATAGTACATAAAGTCATTTACCGTACATAGCACATTACAGTC +AAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTG +ACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCT +CGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGAC +ATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCC +CTTAAATAAGACATCACGATG
--- a/test-data/varscan_somatic_indel_result1.vcf Sun Jul 15 09:19:25 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -##fileformat=VCFv4.1 -##source=VarScan2 -##INFO=<ID=DP,Number=1,Type=Integer,Description="Total depth of quality bases"> -##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Indicates if record is a somatic mutation"> -##INFO=<ID=SS,Number=1,Type=String,Description="Somatic status of variant (0=Reference,1=Germline,2=Somatic,3=LOH, or 5=Unknown)"> -##INFO=<ID=SSC,Number=1,Type=String,Description="Somatic score in Phred scale (0-255) derived from somatic p-value"> -##INFO=<ID=GPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor+normal versus no variant for Germline calls"> -##INFO=<ID=SPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor versus normal for Somatic/LOH calls"> -##FILTER=<ID=str10,Description="Less than 10% or more than 90% of variant supporting reads on one strand"> -##FILTER=<ID=indelError,Description="Likely artifact due to indel reads at this position"> -##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> -##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> -##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> -##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)"> -##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)"> -##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency"> -##FORMAT=<ID=DP4,Number=1,Type=String,Description="Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev"> -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
--- a/test-data/varscan_somatic_indel_result2.vcf Sun Jul 15 09:19:25 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -##fileformat=VCFv4.1 -##source=VarScan2 -##INFO=<ID=DP,Number=1,Type=Integer,Description="Total depth of quality bases"> -##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Indicates if record is a somatic mutation"> -##INFO=<ID=SS,Number=1,Type=String,Description="Somatic status of variant (0=Reference,1=Germline,2=Somatic,3=LOH, or 5=Unknown)"> -##INFO=<ID=SSC,Number=1,Type=String,Description="Somatic score in Phred scale (0-255) derived from somatic p-value"> -##INFO=<ID=GPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor+normal versus no variant for Germline calls"> -##INFO=<ID=SPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor versus normal for Somatic/LOH calls"> -##FILTER=<ID=str10,Description="Less than 10% or more than 90% of variant supporting reads on one strand"> -##FILTER=<ID=indelError,Description="Likely artifact due to indel reads at this position"> -##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> -##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> -##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> -##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)"> -##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)"> -##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency"> -##FORMAT=<ID=DP4,Number=1,Type=String,Description="Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev"> -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
--- a/test-data/varscan_somatic_snp_result1.vcf Sun Jul 15 09:19:25 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -##fileformat=VCFv4.1 -##source=VarScan2 -##INFO=<ID=DP,Number=1,Type=Integer,Description="Total depth of quality bases"> -##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Indicates if record is a somatic mutation"> -##INFO=<ID=SS,Number=1,Type=String,Description="Somatic status of variant (0=Reference,1=Germline,2=Somatic,3=LOH, or 5=Unknown)"> -##INFO=<ID=SSC,Number=1,Type=String,Description="Somatic score in Phred scale (0-255) derived from somatic p-value"> -##INFO=<ID=GPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor+normal versus no variant for Germline calls"> -##INFO=<ID=SPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor versus normal for Somatic/LOH calls"> -##FILTER=<ID=str10,Description="Less than 10% or more than 90% of variant supporting reads on one strand"> -##FILTER=<ID=indelError,Description="Likely artifact due to indel reads at this position"> -##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> -##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> -##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> -##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)"> -##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)"> -##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency"> -##FORMAT=<ID=DP4,Number=1,Type=String,Description="Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev"> -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR -chr1 51436072 . C A . PASS DP=47;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.4681E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:26:26:0:0%:23,3,0,0 0/1:.:21:20:1:4.76%:19,1,1,0 -chr1 51436311 . T C . PASS DP=16;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.375E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:9:9:0:0%:1,8,0,0 0/1:.:7:6:1:14.29%:0,6,0,1 -chr1 51436320 . G A . PASS DP=19;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.2632E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:9:9:0:0%:1,8,0,0 0/1:.:10:9:1:10%:0,9,0,1 -chr1 51439628 . T C . str10 DP=237;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.8101E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:123:123:0:0%:77,46,0,0 0/1:.:114:113:1:0.88%:74,39,0,1 -chr1 51439638 . G A . str10 DP=234;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.9145E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:119:119:0:0%:72,47,0,0 0/1:.:115:114:1:0.87%:75,39,0,1 -chr1 51439665 . C T . PASS DP=226;SOMATIC;SS=2;SSC=9;GPV=1E0;SPV=1.2006E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:114:114:0:0%:56,58,0,0 0/1:.:112:109:3:2.68%:63,46,2,1 -chr1 51439671 . G A . str10 DP=222;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.045E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:110:110:0:0%:53,57,0,0 0/1:.:112:111:1:0.89%:59,52,1,0 -chr1 51439684 . G T . str10 DP=210;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.9524E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:106:106:0:0%:51,55,0,0 0/1:.:104:103:1:0.96%:53,50,1,0 -chr1 51439703 . C T . str10 DP=202;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.099E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:99:99:0:0%:46,53,0,0 0/1:.:103:102:1:0.97%:48,54,0,1 -chr1 51439705 . G T . str10 DP=204;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.1961E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:98:98:0:0%:42,56,0,0 0/1:.:106:105:1:0.94%:48,57,0,1 -chr1 51439706 . G T . str10 DP=201;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.1741E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:97:97:0:0%:41,56,0,0 0/1:.:104:103:1:0.96%:46,57,1,0 -chr1 51439726 . C G . str10 DP=187;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.1872E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:90:90:0:0%:37,53,0,0 0/1:.:97:96:1:1.03%:44,52,1,0 -chr1 51439751 . C G . str10 DP=168;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.3293E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:79:78:0:0%:28,50,0,0 0/1:.:89:88:1:1.12%:35,53,0,1 -chr1 51439763 . G A . PASS DP=159;SOMATIC;SS=2;SSC=5;GPV=1E0;SPV=2.7092E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:76:76:0:0%:34,42,0,0 0/1:.:83:81:2:2.41%:32,49,1,1 -chr1 51439766 . G T . str10 DP=154;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.1299E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:75:75:0:0%:34,41,0,0 0/1:.:79:78:1:1.27%:30,48,0,1 -chr1 51439788 . T C . str10 DP=136;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.1471E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:66:66:0:0%:21,45,0,0 0/1:.:70:69:1:1.43%:24,45,1,0 -chr1 51439828 . G A . str10 DP=122;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.7377E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:52:52:0:0%:14,38,0,0 0/1:.:70:69:1:1.43%:27,42,0,1 -chr1 51439832 . C G . str10 DP=125;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.52E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:56:56:0:0%:14,42,0,0 0/1:.:69:68:1:1.45%:25,43,0,1 -chr1 51439876 . T G . str10 DP=105;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.619E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:46:46:0:0%:10,36,0,0 0/1:.:59:58:1:1.69%:23,35,0,1 -chr1 51439882 . G T . str10 DP=105;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.2381E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:50:50:0:0%:13,37,0,0 0/1:.:55:54:1:1.82%:22,32,0,1 -chr1 51439889 . G T . str10 DP=97;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.1546E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:47:47:0:0%:14,33,0,0 0/1:.:50:49:1:2%:21,28,0,1 -chr1 51439953 . G T . str10 DP=59;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.9322E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:24:24:0:0%:5,19,0,0 0/1:.:35:34:1:2.86%:7,27,0,1 -chr1 51440035 . G T . PASS DP=21;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=6.1905E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:8:8:0:0%:1,7,0,0 0/1:.:13:12:1:7.69%:1,11,0,1
--- a/test-data/varscan_somatic_snp_result2.vcf Sun Jul 15 09:19:25 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -##fileformat=VCFv4.1 -##source=VarScan2 -##INFO=<ID=DP,Number=1,Type=Integer,Description="Total depth of quality bases"> -##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Indicates if record is a somatic mutation"> -##INFO=<ID=SS,Number=1,Type=String,Description="Somatic status of variant (0=Reference,1=Germline,2=Somatic,3=LOH, or 5=Unknown)"> -##INFO=<ID=SSC,Number=1,Type=String,Description="Somatic score in Phred scale (0-255) derived from somatic p-value"> -##INFO=<ID=GPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor+normal versus no variant for Germline calls"> -##INFO=<ID=SPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor versus normal for Somatic/LOH calls"> -##FILTER=<ID=str10,Description="Less than 10% or more than 90% of variant supporting reads on one strand"> -##FILTER=<ID=indelError,Description="Likely artifact due to indel reads at this position"> -##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> -##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> -##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> -##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)"> -##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)"> -##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency"> -##FORMAT=<ID=DP4,Number=1,Type=String,Description="Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev"> -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR -chr1 51436072 . C A . PASS DP=47;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.4681E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:26:26:0:0%:23,3,0,0 0/1:.:21:20:1:4.76%:19,1,1,0 -chr1 51436311 . T C . PASS DP=16;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.375E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:9:9:0:0%:1,8,0,0 0/1:.:7:6:1:14.29%:0,6,0,1 -chr1 51436320 . G A . PASS DP=19;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.2632E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:9:9:0:0%:1,8,0,0 0/1:.:10:9:1:10%:0,9,0,1 -chr1 51439665 . C T . PASS DP=226;SOMATIC;SS=2;SSC=9;GPV=1E0;SPV=1.2006E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:114:114:0:0%:56,58,0,0 0/1:.:112:109:3:2.68%:63,46,2,1 -chr1 51439763 . G A . PASS DP=159;SOMATIC;SS=2;SSC=5;GPV=1E0;SPV=2.7092E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:76:76:0:0%:34,42,0,0 0/1:.:83:81:2:2.41%:32,49,1,1 -chr1 51440025 . A C . PASS DP=27;SOMATIC;SS=2;SSC=1;GPV=1E0;SPV=6.6667E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:9:9:0:0%:1,8,0,0 0/1:.:18:17:1:5.56%:1,16,0,1 -chr1 51440035 . G T . PASS DP=21;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=6.1905E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:8:8:0:0%:1,7,0,0 0/1:.:13:12:1:7.69%:1,11,0,1 -chr1 51440056 . T G . PASS DP=22;SOMATIC;SS=2;SSC=1;GPV=1E0;SPV=6.3636E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:8:8:0:0%:1,7,0,0 0/1:.:14:13:1:7.14%:1,12,0,1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/varscan.py Tue Dec 04 05:15:50 2018 -0500 @@ -0,0 +1,1178 @@ +#!/usr/bin/env python3 +from __future__ import print_function + +import argparse +import io +import os +import subprocess +import sys +import tempfile +import time +from contextlib import ExitStack +from functools import partial +from threading import Thread + +import pysam + + +class VariantCallingError (RuntimeError): + """Exception class for issues with samtools and varscan subprocesses.""" + + def __init__(self, message=None, call='', error=''): + self.message = message + self.call = call.strip() + self.error = error.strip() + + def __str__(self): + if self.message is None: + return '' + if self.error: + msg_header = '"{0}" failed with:\n{1}\n\n'.format( + self.call, self.error + ) + else: + msg_header = '{0} failed.\n' + 'No further information about this error is available.\n\n'.format( + self.call + ) + return msg_header + self.message + + +class VarScanCaller (object): + def __init__(self, ref_genome, bam_input_files, + max_depth=None, + min_mapqual=None, min_basequal=None, + threads=1, verbose=False, quiet=True + ): + self.ref_genome = ref_genome + self.bam_input_files = bam_input_files + self.max_depth = max_depth + self.min_mapqual = min_mapqual + self.min_basequal = min_basequal + self.threads = threads + self.verbose = verbose + self.quiet = quiet + + with pysam.FastaFile(ref_genome) as ref_fa: + self.ref_contigs = ref_fa.references + self.ref_lengths = ref_fa.lengths + + self.pileup_engine = ['samtools', 'mpileup'] + self.varcall_engine = ['varscan', 'somatic'] + self.requires_stdout_redirect = False + self.TemporaryContigVCF = partial( + tempfile.NamedTemporaryFile, + mode='wb', suffix='', delete=False, dir=os.getcwd() + ) + self.tmpfiles = [] + + def _get_pysam_pileup_args(self): + param_dict = {} + if self.max_depth is not None: + param_dict['max_depth'] = self.max_depth + if self.min_mapqual is not None: + param_dict['min_mapping_quality'] = self.min_mapqual + if self.min_basequal is not None: + param_dict['min_base_quality'] = self.min_basequal + param_dict['compute_baq'] = False + param_dict['stepper'] = 'samtools' + return param_dict + + def varcall_parallel(self, normal_purity=None, tumor_purity=None, + min_coverage=None, + min_var_count=None, + min_var_freq=None, min_hom_freq=None, + p_value=None, somatic_p_value=None, + threads=None, verbose=None, quiet=None + ): + if not threads: + threads = self.threads + if verbose is None: + verbose = self.verbose + if quiet is None: + quiet = self.quiet + # mapping of method parameters to varcall engine command line options + varcall_engine_option_mapping = [ + ('--normal-purity', normal_purity), + ('--tumor-purity', tumor_purity), + ('--min-coverage', min_coverage), + ('--min-reads2', min_var_count), + ('--min-var-freq', min_var_freq), + ('--min-freq-for-hom', min_hom_freq), + ('--p-value', p_value), + ('--somatic-p-value', somatic_p_value), + ('--min-avg-qual', self.min_basequal) + ] + varcall_engine_options = [] + for option, value in varcall_engine_option_mapping: + if value is not None: + varcall_engine_options += [option, str(value)] + pileup_engine_options = ['-B'] + if self.max_depth is not None: + pileup_engine_options += ['-d', str(self.max_depth)] + if self.min_mapqual is not None: + pileup_engine_options += ['-q', str(self.min_mapqual)] + if self.min_basequal is not None: + pileup_engine_options += ['-Q', str(self.min_basequal)] + + # Create a tuple of calls to samtools mpileup and varscan for + # each contig. The contig name is stored as the third element of + # that tuple. + # The calls are stored in the reverse order of the contig list so + # that they can be popped off later in the original order + calls = [( + self.pileup_engine + pileup_engine_options + [ + '-r', contig + ':', + '-f', self.ref_genome + ] + self.bam_input_files, + self.varcall_engine + [ + '-', '{out}', '--output-vcf', '1', '--mpileup', '1' + ] + varcall_engine_options, + contig + ) for contig in self.ref_contigs[::-1]] + + if verbose: + print('Starting variant calling ..') + + # launch subprocesses and monitor their status + subprocesses = [] + error_table = {} + tmp_io_started = [] + tmp_io_finished = [] + self.tmpfiles = [] + + def enqueue_stderr_output(out, stderr_buffer): + for line in iter(out.readline, b''): + # Eventually we are going to print the contents of + # the stderr_buffer to sys.stderr so we can + # decode things here using its encoding. + # We do a 'backslashreplace' just to be on the safe side. + stderr_buffer.write(line.decode(sys.stderr.encoding, + 'backslashreplace')) + out.close() + + try: + while subprocesses or calls: + while calls and len(subprocesses) < threads: + # There are still calls waiting for execution and we + # have unoccupied threads so we launch a new combined + # call to samtools mpileup and the variant caller. + + # pop the call arguments from our call stack + call = calls.pop() + # get the name of the contig that this call is going + # to work on + contig = call[2] + # Based on the contig name, generate a readable and + # file system-compatible prefix and use it to create + # a named temporary file, to which the call output + # will be redirected. + # At the moment we create the output file we add it to + # the list of all temporary output files so that we can + # remove it eventually during cleanup. + call_out = self.TemporaryContigVCF( + prefix=''.join( + c if c.isalnum() else '_' for c in contig + ) + '_', + ) + # maintain a list of variant call outputs + # in the order the subprocesses got launched + tmp_io_started.append(call_out.name) + if self.requires_stdout_redirect: + # redirect stdout to the temporary file just created + stdout_p2 = call_out + stderr_p2 = subprocess.PIPE + else: + # variant caller wants to write output to file directly + stdout_p2 = subprocess.PIPE + stderr_p2 = subprocess.STDOUT + call[1][call[1].index('{out}')] = call_out.name + call_out.close() + # for reporting purposes, join the arguments for the + # samtools and the variant caller calls into readable + # strings + c_str = (' '.join(call[0]), ' '.join(call[1])) + error_table[c_str] = [io.StringIO(), io.StringIO()] + # start the subprocesses + p1 = subprocess.Popen( + call[0], + stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + p2 = subprocess.Popen( + call[1], + stdin=p1.stdout, + stdout=stdout_p2, + stderr=stderr_p2 + ) + # subprocess bookkeeping + subprocesses.append((c_str, p1, p2, call_out, contig)) + # make sure our newly launched call does not block + # because its buffers fill up + p1.stdout.close() + t1 = Thread( + target=enqueue_stderr_output, + args=(p1.stderr, error_table[c_str][0]) + ) + t2 = Thread( + target=enqueue_stderr_output, + args=( + p2.stderr + if self.requires_stdout_redirect else + p2.stdout, + error_table[c_str][1] + ) + ) + t1.daemon = t2.daemon = True + t1.start() + t2.start() + + if verbose: + print( + 'Calling variants for contig: {0}' + .format(call[2]) + ) + + # monitor all running calls to see if any of them are done + for call, p1, p2, ofo, contig in subprocesses: + p1_stat = p1.poll() + p2_stat = p2.poll() + if p1_stat is not None or p2_stat is not None: + # There is an outcome for this process! + # Lets see what it is + if p1_stat or p2_stat: + print() + print( + error_table[call][0].getvalue(), + error_table[call][1].getvalue(), + file=sys.stderr + ) + raise VariantCallingError( + 'Variant Calling for contig {0} failed.' + .format(contig), + call='{0} | {1}'.format(call[0], call[1]) + ) + if p1_stat == 0 and p2_stat is None: + # VarScan is not handling the no output from + # samtools mpileup situation correctly so maybe + # that's the issue here + last_words = error_table[call][1].getvalue( + ).splitlines()[-4:] + if len(last_words) < 4 or any( + not msg.startswith('Input stream not ready') + for msg in last_words + ): + break + # lets give this process a bit more time + # VarScan is waiting for input it will never + # get, stop it. + p2.terminate() + subprocesses.remove((call, p1, p2, ofo, contig)) + ofo.close() + break + if p2_stat == 0: + # Things went fine. + # maintain a list of variant call outputs + # that finished successfully (in the order + # they finished) + tmp_io_finished.append(ofo.name) + if verbose: + print() + print('Contig {0} finished.'.format(contig)) + if not quiet: + print() + print( + 'stderr output from samtools mpileup/' + 'bcftools:'.upper(), + file=sys.stderr + ) + print( + error_table[call][0].getvalue(), + error_table[call][1].getvalue(), + file=sys.stderr + ) + # Discard the collected stderr output from + # the call, remove the call from the list of + # running calls and close its output file. + del error_table[call] + subprocesses.remove((call, p1, p2, ofo, contig)) + # Closing the output file is important or we + # may hit the file system limit for open files + # if there are lots of contigs. + ofo.close() + break + # wait a bit in between monitoring cycles + time.sleep(2) + finally: + for call, p1, p2, ofo, contig in subprocesses: + # make sure we do not leave running subprocesses behind + for proc in (p1, p2): + try: + proc.terminate() + except Exception: + pass + # close currently open files + ofo.close() + # store the files with finished content in the order that + # the corresponding jobs were launched + self.tmpfiles = [f for f in tmp_io_started if f in tmp_io_finished] + # Make sure remaining buffered stderr output of + # subprocesses does not get lost. + # Currently, we don't screen this output for real errors, + # but simply rewrite everything. + if not quiet and error_table: + print() + print( + 'stderr output from samtools mpileup/bcftools:'.upper(), + file=sys.stderr + ) + for call, errors in error_table.items(): + print(' | '.join(call), ':', file=sys.stderr) + print('-' * 20, file=sys.stderr) + print('samtools mpileup output:', file=sys.stderr) + print(errors[0].getvalue(), file=sys.stderr) + print('varscan somatic output:', file=sys.stderr) + print(errors[1].getvalue(), file=sys.stderr) + + def _add_ref_contigs_to_header(self, header): + for chrom, length in zip(self.ref_contigs, self.ref_lengths): + header.add_meta( + 'contig', + items=[('ID', chrom), ('length', length)] + ) + + def _add_filters_to_header(self, header): + varscan_fpfilters = { + 'VarCount': 'Fewer than {min_var_count2} variant-supporting reads', + 'VarFreq': 'Variant allele frequency below {min_var_freq2}', + 'VarAvgRL': + 'Average clipped length of variant-supporting reads < ' + '{min_var_len}', + 'VarReadPos': 'Relative average read position < {min_var_readpos}', + 'VarDist3': + 'Average distance to effective 3\' end < {min_var_dist3}', + 'VarMMQS': + 'Average mismatch quality sum for variant reads > ' + '{max_var_mmqs}', + 'VarMapQual': + 'Average mapping quality of variant reads < {min_var_mapqual}', + 'VarBaseQual': + 'Average base quality of variant reads < {min_var_basequal}', + 'Strand': + 'Strand representation of variant reads < {min_strandedness}', + 'RefAvgRL': + 'Average clipped length of ref-supporting reads < ' + '{min_ref_len}', + 'RefReadPos': + 'Relative average read position < {min_ref_readpos}', + 'RefDist3': + 'Average distance to effective 3\' end < {min_ref_dist3}', + 'RefMapQual': + 'Average mapping quality of reference reads < ' + '{min_ref_mapqual}', + 'RefBaseQual': + 'Average base quality of ref-supporting reads < ' + '{min_ref_basequal}', + 'RefMMQS': + 'Average mismatch quality sum for ref-supporting reads > ' + '{max_ref_mmqs}', + 'MMQSdiff': + 'Mismatch quality sum difference (var - ref) > ' + '{max_mmqs_diff}', + 'MinMMQSdiff': + 'Mismatch quality sum difference (var - ref) < ' + '{max_mmqs_diff}', + 'MapQualDiff': + 'Mapping quality difference (ref - var) > {max_mapqual_diff}', + 'MaxBAQdiff': + 'Average base quality difference (ref - var) > ' + '{max_basequal_diff}', + 'ReadLenDiff': + 'Average supporting read length difference (ref - var) > ' + '{max_relative_len_diff}', + } + for filter_id, description in varscan_fpfilters.items(): + header.filters.add(filter_id, None, None, description) + + def _add_indel_info_flag_to_header(self, header): + header.info.add( + 'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL' + ) + + def _compile_common_header(self, varcall_template, no_filters=False): + # fix the header generated by VarScan + # by adding reference and contig information + common_header = pysam.VariantHeader() + common_header.add_meta('reference', value=self.ref_genome) + self._add_ref_contigs_to_header(common_header) + if not no_filters: + # add filter info + self._add_filters_to_header(common_header) + # change the source information + common_header.add_meta('source', value='varscan.py') + # declare an INDEL flag for record INFO fields + self._add_indel_info_flag_to_header(common_header) + # take the remaining metadata from the template header produced by + # VarScan + with pysam.VariantFile(varcall_template, 'r') as original_data: + varscan_header = original_data.header + for sample in varscan_header.samples: + common_header.samples.add(sample) + common_header.merge(varscan_header) + return common_header + + def pileup_masker(self, mask): + def apply_mask_on_pileup(piled_items): + for item, status in zip(piled_items, mask): + if status: + yield item + return apply_mask_on_pileup + + def get_allele_specific_pileup_column_stats( + self, allele, pile_column, ref_fetch + ): + # number of reads supporting the given allele on + # forward and reverse strand, and in total + var_reads_plus = var_reads_minus = 0 + var_supp_read_mask = [] + for base in pile_column.get_query_sequences(): + if base == allele: + # allele supporting read on + strand + var_reads_plus += 1 + var_supp_read_mask.append(True) + elif base.upper() == allele: + # allele supporting read on - strand + var_reads_minus += 1 + var_supp_read_mask.append(True) + else: + var_supp_read_mask.append(False) + var_reads_total = var_reads_plus + var_reads_minus + + if var_reads_total == 0: + # No stats without reads! + return None + + var_supp_only = self.pileup_masker(var_supp_read_mask) + + # average mapping quality of the reads supporting the + # given allele + avg_mapping_quality = sum( + mq for mq in var_supp_only( + pile_column.get_mapping_qualities() + ) + ) / var_reads_total + + # for the remaining stats we need access to complete + # read information + piled_reads = [ + p for p in var_supp_only(pile_column.pileups) + ] + assert len(piled_reads) == var_reads_total + sum_avg_base_qualities = 0 + sum_dist_from_center = 0 + sum_dist_from_3prime = 0 + sum_clipped_length = 0 + sum_unclipped_length = 0 + sum_num_mismatches_as_fraction = 0 + sum_mismatch_qualities = 0 + + for p in piled_reads: + sum_avg_base_qualities += sum( + p.alignment.query_qualities + ) / p.alignment.infer_query_length() + sum_clipped_length += p.alignment.query_alignment_length + unclipped_length = p.alignment.infer_read_length() + sum_unclipped_length += unclipped_length + read_center = p.alignment.query_alignment_length / 2 + sum_dist_from_center += 1 - abs( + p.query_position - read_center + ) / read_center + if p.alignment.is_reverse: + sum_dist_from_3prime += p.query_position / unclipped_length + else: + sum_dist_from_3prime += 1 - p.query_position / unclipped_length + + sum_num_mismatches = 0 + for qpos, rpos in p.alignment.get_aligned_pairs(): + if qpos is not None and rpos is not None: + if p.alignment.query_sequence[qpos] != ref_fetch( + rpos, rpos + 1 + ).upper(): # ref bases can be lowercase! + sum_num_mismatches += 1 + sum_mismatch_qualities += p.alignment.query_qualities[ + qpos + ] + sum_num_mismatches_as_fraction += ( + sum_num_mismatches / p.alignment.query_alignment_length + ) + avg_basequality = sum_avg_base_qualities / var_reads_total + avg_pos_as_fraction = sum_dist_from_center / var_reads_total + avg_num_mismatches_as_fraction = ( + sum_num_mismatches_as_fraction / var_reads_total + ) + avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total + avg_clipped_length = sum_clipped_length / var_reads_total + avg_distance_to_effective_3p_end = ( + sum_dist_from_3prime / var_reads_total + ) + + return ( + avg_mapping_quality, + avg_basequality, + var_reads_plus, + var_reads_minus, + avg_pos_as_fraction, + avg_num_mismatches_as_fraction, + avg_sum_mismatch_qualities, + avg_clipped_length, + avg_distance_to_effective_3p_end + ) + + def _postprocess_variant_records(self, invcf, + min_var_count2, min_var_count2_lc, + min_var_freq2, max_somatic_p, + max_somatic_p_depth, + min_ref_readpos, min_var_readpos, + min_ref_dist3, min_var_dist3, + min_ref_len, min_var_len, + max_relative_len_diff, + min_strandedness, min_strand_reads, + min_ref_basequal, min_var_basequal, + max_basequal_diff, + min_ref_mapqual, min_var_mapqual, + max_mapqual_diff, + max_ref_mmqs, max_var_mmqs, + min_mmqs_diff, max_mmqs_diff, + **args): + # set FILTER field according to Varscan criteria + # multiple FILTER entries must be separated by semicolons + # No filters applied should be indicated with MISSING + + # since posterior filters are always applied to just one sample, + # a better place to store the info is in the FT genotype field: + # can be PASS, '.' to indicate that filters have not been applied, + # or a semicolon-separated list of filters that failed + # unfortunately, gemini does not support this field + + with ExitStack() as io_stack: + normal_reads, tumor_reads = ( + io_stack.enter_context( + pysam.Samfile(fn, 'rb')) for fn in self.bam_input_files + ) + refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome)) + pileup_args = self._get_pysam_pileup_args() + for record in invcf: + if any(len(allele) > 1 for allele in record.alleles): + # skip indel postprocessing for the moment + yield record + continue + # get pileup for genomic region affected by this variant + if record.info['SS'] == '2': + # a somatic variant => generate pileup from tumor data + pile = tumor_reads.pileup( + record.chrom, record.start, record.stop, + **pileup_args + ) + sample_of_interest = 'TUMOR' + elif record.info['SS'] in ['1', '3']: + # a germline or LOH variant => pileup from normal data + pile = normal_reads.pileup( + record.chrom, record.start, record.stop, + **pileup_args + ) + sample_of_interest = 'NORMAL' + else: + # TO DO: figure out if there is anything interesting to do + # for SS status codes 0 (reference) and 5 (unknown) + yield record + continue + + # apply false-positive filtering a la varscan fpfilter + # find the variant site in the pileup columns + for pile_column in pile: + if pile_column.reference_pos == record.start: + break + # extract required information + # overall read depth at the site + read_depth = pile_column.get_num_aligned() + assert read_depth > 0 + # no multiallelic sites in varscan + assert len(record.alleles) == 2 + if record.samples[sample_of_interest]['RD'] > 0: + ref_stats, alt_stats = [ + self.get_allele_specific_pileup_column_stats( + allele, + pile_column, + partial( + pysam.FastaFile.fetch, refseq, record.chrom + ) + ) + for allele in record.alleles + ] + else: + ref_stats = None + alt_stats = self.get_allele_specific_pileup_column_stats( + record.alleles[1], + pile_column, + partial(pysam.FastaFile.fetch, refseq, record.chrom) + ) + ref_count = 0 + if ref_stats: + ref_count = ref_stats[2] + ref_stats[3] + if ref_stats[1] < min_ref_basequal: + record.filter.add('RefBaseQual') + if ref_count >= 2: + if ref_stats[0] < min_ref_mapqual: + record.filter.add('RefMapQual') + if ref_stats[4] < min_ref_readpos: + record.filter.add('RefReadPos') + # ref_stats[5] (avg_num_mismatches_as_fraction + # is not a filter criterion in VarScan fpfilter + if ref_stats[6] > max_ref_mmqs: + record.filter.add('RefMMQS') + if ref_stats[7] < min_ref_len: + # VarScan fpfilter does not apply this filter + # for indels, but there is no reason + # not to do it. + record.filter.add('RefAvgRL') + if ref_stats[8] < min_ref_dist3: + record.filter.add('RefDist3') + if alt_stats: + alt_count = alt_stats[2] + alt_stats[3] + if ( + alt_count < min_var_count2_lc + ) or ( + read_depth >= max_somatic_p_depth and + alt_count < min_var_count2 + ): + record.filter.add('VarCount') + if alt_count / read_depth < min_var_freq2: + record.filter.add('VarFreq') + if alt_stats[1] < min_var_basequal: + record.filter.add('VarBaseQual') + if alt_count > min_strand_reads: + if ( + alt_stats[2] / alt_count < min_strandedness + ) or ( + alt_stats[3] / alt_count < min_strandedness + ): + record.filter.add('Strand') + if alt_stats[2] + alt_stats[3] >= 2: + if alt_stats[0] < min_var_mapqual: + record.filter.add('VarMapQual') + if alt_stats[4] < min_var_readpos: + record.filter.add('VarReadPos') + # alt_stats[5] (avg_num_mismatches_as_fraction + # is not a filter criterion in VarScan fpfilter + if alt_stats[6] > max_var_mmqs: + record.filter.add('VarMMQS') + if alt_stats[7] < min_var_len: + # VarScan fpfilter does not apply this filter + # for indels, but there is no reason + # not to do it. + record.filter.add('VarAvgRL') + if alt_stats[8] < min_var_dist3: + record.filter.add('VarDist3') + if ref_count >= 2 and alt_count >= 2: + if (ref_stats[0] - alt_stats[0]) > max_mapqual_diff: + record.filter.add('MapQualDiff') + if (ref_stats[1] - alt_stats[1]) > max_basequal_diff: + record.filter.add('MaxBAQdiff') + mmqs_diff = alt_stats[6] - ref_stats[6] + if mmqs_diff < min_mmqs_diff: + record.filter.add('MinMMQSdiff') + if mmqs_diff > max_mmqs_diff: + record.filter.add('MMQSdiff') + if ( + 1 - alt_stats[7] / ref_stats[7] + ) > max_relative_len_diff: + record.filter.add('ReadLenDiff') + else: + # No variant-supporting reads for this record! + # This can happen in rare cases because of + # samtools mpileup issues, but indicates a + # rather unreliable variant call. + record.filter.add('VarCount') + record.filter.add('VarFreq') + yield record + + def _indel_flagged_records(self, vcf): + for record in vcf: + record.info['INDEL'] = True + yield record + + def _merge_generator(self, vcf1, vcf2): + try: + record1 = next(vcf1) + except StopIteration: + for record2 in vcf2: + yield record2 + return + try: + record2 = next(vcf2) + except StopIteration: + yield record1 + for record1 in vcf1: + yield record1 + return + while True: + if (record1.start, record1.stop) < (record2.start, record2.stop): + yield record1 + try: + record1 = next(vcf1) + except StopIteration: + yield record2 + for record2 in vcf2: + yield record2 + return + else: + yield record2 + try: + record2 = next(vcf2) + except StopIteration: + yield record1 + for record1 in vcf1: + yield record1 + return + + def merge_and_postprocess(self, snps_out, indels_out=None, + no_filters=False, **filter_args): + temporary_data = self.tmpfiles + self.tmpfiles = [] + temporary_snp_files = [f + '.snp.vcf' for f in temporary_data] + temporary_indel_files = [f + '.indel.vcf' for f in temporary_data] + + for f in temporary_data: + try: + os.remove(f) + except Exception: + pass + + def noop_gen(data, **kwargs): + for d in data: + yield d + + if no_filters: + apply_filters = noop_gen + else: + apply_filters = self._postprocess_variant_records + + output_header = self._compile_common_header( + temporary_snp_files[0], + no_filters + ) + if indels_out is None: + with open(snps_out, 'w') as o: + o.write(str(output_header).format(**filter_args)) + for snp_f, indel_f in zip( + temporary_snp_files, temporary_indel_files + ): + with pysam.VariantFile(snp_f, 'r') as snp_invcf: + # fix the input header on the fly + # to avoid Warnings from htslib about missing contig + # info + self._add_ref_contigs_to_header(snp_invcf.header) + self._add_filters_to_header(snp_invcf.header) + self._add_indel_info_flag_to_header(snp_invcf.header) + with pysam.VariantFile(indel_f, 'r') as indel_invcf: + # fix the input header on the fly + # to avoid Warnings from htslib about missing + # contig info + self._add_ref_contigs_to_header(indel_invcf.header) + self._add_filters_to_header(indel_invcf.header) + self._add_indel_info_flag_to_header( + indel_invcf.header + ) + for record in apply_filters( + self._merge_generator( + snp_invcf, + self._indel_flagged_records(indel_invcf) + ), + **filter_args + ): + o.write(str(record)) + try: + os.remove(snp_f) + except Exception: + pass + try: + os.remove(indel_f) + except Exception: + pass + + else: + with open(snps_out, 'w') as o: + o.write(str(output_header).format(**filter_args)) + for f in temporary_snp_files: + with pysam.VariantFile(f, 'r') as invcf: + # fix the input header on the fly + # to avoid Warnings from htslib about missing + # contig info and errors because of undeclared + # filters + self._add_ref_contigs_to_header(invcf.header) + self._add_filters_to_header(invcf.header) + for record in apply_filters( + invcf, **filter_args + ): + o.write(str(record)) + try: + os.remove(f) + except Exception: + pass + with open(indels_out, 'w') as o: + o.write(str(output_header)) + for f in temporary_indel_files: + with pysam.VariantFile(f, 'r') as invcf: + # fix the input header on the fly + # to avoid Warnings from htslib about missing + # contig info and errors because of undeclared + # filters + self._add_ref_contigs_to_header(invcf.header) + self._add_filters_to_header(invcf.header) + self._add_indel_info_flag_to_header(invcf.header) + for record in apply_filters( + self._indel_flagged_records(invcf), **filter_args + ): + o.write(str(record)) + try: + os.remove(f) + except Exception: + pass + + +def varscan_call(ref_genome, normal, tumor, output_path, **args): + """Preparse arguments and orchestrate calling and postprocessing.""" + + if args.pop('split_output'): + if '%T' in output_path: + out = ( + output_path.replace('%T', 'snp'), + output_path.replace('%T', 'indel') + ) + else: + out = ( + output_path + '.snp', + output_path + '.indel' + ) + else: + out = (output_path, None) + + instance_args = { + k: args.pop(k) for k in [ + 'max_depth', + 'min_mapqual', + 'min_basequal', + 'threads', + 'verbose', + 'quiet' + ] + } + varscan_somatic_args = { + k: args.pop(k) for k in [ + 'normal_purity', + 'tumor_purity', + 'min_coverage', + 'min_var_count', + 'min_var_freq', + 'min_hom_freq', + 'somatic_p_value', + 'p_value' + ] + } + + v = VarScanCaller(ref_genome, [normal, tumor], **instance_args) + v.varcall_parallel(**varscan_somatic_args) + v.merge_and_postprocess(*out, **args) + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument( + 'ref_genome', + metavar='reference_genome', + help='the reference genome (in fasta format)' + ) + p.add_argument( + '--normal', + metavar='BAM_file', required=True, + help='the BAM input file of aligned reads from the normal sample' + ) + p.add_argument( + '--tumor', + metavar='BAM_file', required=True, + help='the BAM input file of aligned reads from the tumor sample' + ) + p.add_argument( + '-o', '--ofile', required=True, + metavar='OFILE', dest='output_path', + help='Name of the variant output file. With --split-output, the name ' + 'may use the %%T replacement token or will be used as the ' + 'basename for the two output files to be generated (see ' + '-s|--split-output below).' + ) + p.add_argument( + '-s', '--split-output', + dest='split_output', action='store_true', default=False, + help='indicate that separate output files for SNPs and indels ' + 'should be generated (original VarScan behavior). If specified, ' + '%%T in the --ofile file name will be replaced with "snp" and ' + '"indel" to generate the names of the SNP and indel output ' + 'files, respectively. If %%T is not found in the file name, it ' + 'will get interpreted as a basename to which ".snp"/".indel" ' + 'will be appended.' + ) + p.add_argument( + '-t', '--threads', + type=int, default=1, + help='level of parallelism' + ) + p.add_argument( + '-v', '--verbose', + action='store_true', + help='be verbose about progress' + ) + p.add_argument( + '-q', '--quiet', + action='store_true', + help='suppress output from wrapped tools' + ) + call_group = p.add_argument_group('Variant calling parameters') + call_group.add_argument( + '--normal-purity', + dest='normal_purity', type=float, + default=1.0, + help='Estimated purity of the normal sample (default: 1.0)' + ) + call_group.add_argument( + '--tumor-purity', + dest='tumor_purity', type=float, + default=1.0, + help='Estimated purity of the tumor sample (default: 1.0)' + ) + call_group.add_argument( + '--max-pileup-depth', + dest='max_depth', type=int, default=8000, + help='Maximum depth of generated pileups (samtools mpileup -d option; ' + 'default: 8000)' + ) + call_group.add_argument( + '--min-basequal', + dest='min_basequal', type=int, + default=13, + help='Minimum base quality at the variant position to use a read ' + '(default: 13)' + ) + call_group.add_argument( + '--min-mapqual', + dest='min_mapqual', type=int, + default=0, + help='Minimum mapping quality required to use a read ' + '(default: 0)' + ) + call_group.add_argument( + '--min-coverage', + dest='min_coverage', type=int, + default=8, + help='Minimum site coverage required in the normal and in the tumor ' + 'sample to call a variant (default: 8)' + ) + call_group.add_argument( + '--min-var-count', + dest='min_var_count', type=int, + default=2, + help='Minimum number of variant-supporting reads required to call a ' + 'variant (default: 2)' + ) + call_group.add_argument( + '--min-var-freq', + dest='min_var_freq', type=float, + default=0.1, + help='Minimum variant allele frequency for calling (default: 0.1)' + ) + call_group.add_argument( + '--min-hom-freq', + dest='min_hom_freq', type=float, + default=0.75, + help='Minimum variant allele frequency for homozygous call ' + '(default: 0.75)' + ) + call_group.add_argument( + '--p-value', + dest='p_value', type=float, + default=0.99, + help='P-value threshold for heterozygous call (default: 0.99)' + ) + call_group.add_argument( + '--somatic-p-value', + dest='somatic_p_value', type=float, + default=0.05, + help='P-value threshold for somatic call (default: 0.05)' + ) + filter_group = p.add_argument_group('Posterior variant filter parameters') + filter_group.add_argument( + '--no-filters', + dest='no_filters', action='store_true', + help='Disable all posterior variant filters. ' + 'If specified, all following options will be ignored' + ) + filter_group.add_argument( + '--min-var-count2', + dest='min_var_count2', type=int, + default=4, + help='Minimum number of variant-supporting reads (default: 4)' + ) + filter_group.add_argument( + '--min-var-count2-lc', + dest='min_var_count2_lc', type=int, + default=2, + help='Minimum number of variant-supporting reads when depth below ' + '--somatic-p-depth (default: 2)' + ) + filter_group.add_argument( + '--min-var-freq2', + dest='min_var_freq2', type=float, + default=0.05, + help='Minimum variant allele frequency (default: 0.05)' + ) + filter_group.add_argument( + '--max-somatic-p', + dest='max_somatic_p', type=float, + default=0.05, + help='Maximum somatic p-value (default: 0.05)' + ) + filter_group.add_argument( + '--max-somatic-p-depth', + dest='max_somatic_p_depth', type=int, + default=10, + help='Depth required to run --max-somatic-p filter (default: 10)' + ) + filter_group.add_argument( + '--min-ref-readpos', + dest='min_ref_readpos', type=float, + default=0.1, + help='Minimum average relative distance of site from the ends of ' + 'ref-supporting reads (default: 0.1)' + ) + filter_group.add_argument( + '--min-var-readpos', + dest='min_var_readpos', type=float, + default=0.1, + help='Minimum average relative distance of site from the ends of ' + 'variant-supporting reads (default: 0.1)' + ) + filter_group.add_argument( + '--min-ref-dist3', + dest='min_ref_dist3', type=float, + default=0.1, + help='Minimum average relative distance of site from the effective ' + '3\'end of ref-supporting reads (default: 0.1)' + ) + filter_group.add_argument( + '--min-var-dist3', + dest='min_var_dist3', type=float, + default=0.1, + help='Minimum average relative distance of site from the effective ' + '3\'end of variant-supporting reads (default: 0.1)' + ) + filter_group.add_argument( + '--min-ref-len', + dest='min_ref_len', type=int, + default=90, + help='Minimum average trimmed length of reads supporting the ref ' + 'allele (default: 90)' + ) + filter_group.add_argument( + '--min-var-len', + dest='min_var_len', type=int, + default=90, + help='Minimum average trimmed length of reads supporting the variant ' + 'allele (default: 90)' + ) + filter_group.add_argument( + '--max-len-diff', + dest='max_relative_len_diff', type=float, + default=0.25, + help='Maximum average relative read length difference (ref - var; ' + 'default: 0.25)' + ) + filter_group.add_argument( + '--min-strandedness', + dest='min_strandedness', type=float, + default=0.01, + help='Minimum fraction of variant reads from each strand ' + '(default: 0.01)' + ) + filter_group.add_argument( + '--min-strand-reads', + dest='min_strand_reads', type=int, + default=5, + help='Minimum allele depth required to run --min-strandedness filter ' + '(default: 5)' + ) + filter_group.add_argument( + '--min-ref-basequal', + dest='min_ref_basequal', type=int, + default=15, + help='Minimum average base quality for the ref allele (default: 15)' + ) + filter_group.add_argument( + '--min-var-basequal', + dest='min_var_basequal', type=int, + default=15, + help='Minimum average base quality for the variant allele ' + '(default: 15)' + ) + filter_group.add_argument( + '--max-basequal-diff', + dest='max_basequal_diff', type=int, + default=50, + help='Maximum average base quality diff (ref - var; default: 50)' + ) + filter_group.add_argument( + '--min-ref-mapqual', + dest='min_ref_mapqual', type=int, + default=15, + help='Minimum average mapping quality of reads supporting the ref ' + 'allele (default: 15)' + ) + filter_group.add_argument( + '--min-var-mapqual', + dest='min_var_mapqual', type=int, + default=15, + help='Minimum average mapping quality of reads supporting the variant ' + 'allele (default: 15)' + ) + filter_group.add_argument( + '--max-mapqual-diff', + dest='max_mapqual_diff', type=int, + default=50, + help='Maximum average mapping quality difference (ref - var; ' + 'default: 50)' + ) + filter_group.add_argument( + '--max-ref-mmqs', + dest='max_ref_mmqs', type=int, + default=100, + help='Maximum mismatch quality sum of reads supporting the ref ' + 'allele (default: 100)' + ) + filter_group.add_argument( + '--max-var-mmqs', + dest='max_var_mmqs', type=int, + default=100, + help='Maximum mismatch quality sum of reads supporting the variant ' + 'allele (default: 100)' + ) + filter_group.add_argument( + '--min-mmqs-diff', + dest='min_mmqs_diff', type=int, + default=0, + help='Minimum mismatch quality sum difference (var - ref; default: 0)' + ) + filter_group.add_argument( + '--max-mmqs-diff', + dest='max_mmqs_diff', type=int, + default=50, + help='Maximum mismatch quality sum difference (var - ref; default: 50)' + ) + args = vars(p.parse_args()) + varscan_call(**args)
--- a/varscan_somatic.xml Sun Jul 15 09:19:25 2018 -0400 +++ b/varscan_somatic.xml Tue Dec 04 05:15:50 2018 -0500 @@ -1,120 +1,534 @@ <tool id="varscan_somatic" name="VarScan somatic" version="@VERSION@.1"> - <description>Call germline/somatic variants from tumor-normal pileups</description> + <description>Call germline/somatic and LOH variants from tumor-normal sample pairs</description> <macros> <import>macros.xml</import> + <macro name="test_mentions_contig"> + <assert_contents> + <has_line_matching + expression="##contig=.ID=chrM,length=16571." /> + </assert_contents> + </macro> + <macro name="test_mentions_filters"> + <assert_contents> + <has_line_matching + expression="##FILTER=.ID=VarCount,Description=.+" /> + <has_line_matching + expression="##FILTER=.ID=ReadLenDiff,Description=.+" /> + <has_line_matching + expression="##FILTER=.ID=RefDist3,Description=.+" /> + </assert_contents> + </macro> + <macro name="test_not_mentions_filters"> + <assert_contents> + <not_has_text + text="##FILTER=<ID=VarCount,Description=" /> + <not_has_text + text="##FILTER=<ID=ReadLenDiff,Description=" /> + <not_has_text + text="##FILTER=<ID=RefDist3,Description=" /> + </assert_contents> + </macro> </macros> - <expand macro="requirements" /> - <expand macro="stdio" /> + <expand macro="requirements"> + <requirement type="package" version="3.6.7">python</requirement> + <requirement type="package" version="0.15.1">pysam</requirement> + </expand> + <stdio> + <exit_code range="1:" /> + </stdio> <command><![CDATA[ - varscan somatic - @INPUT_PILEUPS@ - --min-coverage ${min_coverage} - --min-reads2 ${min_reads2} - --min-avg-qual ${min_avg_qual} - --min-var-freq ${min_var_freq} - --min-freq-for-hom ${min_freq_for_hom} + #if str($reference.source) == "history": + #set ref_genome = 'ref.fa' + ln -s -f '$reference.genome' $ref_genome && + #else: + #set ref_genome = '$reference.genome.fields.path' + #end if + #set normal_data = 'normal.bam' + #set tumor_data = 'tumor.bam' + ln -s -f '$normal_bam' $normal_data && + ln -s -f '$tumor_bam' $tumor_data && + ln -s -f '${normal_bam.metadata.bam_index}' ${normal_data}.bai && + ln -s -f '${tumor_bam.metadata.bam_index}' ${tumor_data}.bai && + python3 $__tool_directory__/varscan.py + --normal '$normal_data' + --tumor '$tumor_data' --normal-purity ${normal_purity} --tumor-purity ${tumor_purity} - --tumor-purity ${tumor_purity} - --min-coverage-normal ${min_coverage_normal} - --somatic-p-value ${somatic_p_value} - --p-value ${p_value} - #if str($strand_filter) == 'yes': - --strand-filter 1 + #if str($split_output): + --ofile variants_out + $split_output + #else: + --ofile '$output' + #end if + --threads \${GALAXY_SLOTS:-2} + #if str($call_params.settings) == "custom": + ## samtools mpileup parameters + --min-basequal ${call_params.min_avg_qual} + --min-mapqual ${call_params.min_mapqual} + ## VarScan parameters + --min-coverage ${call_params.min_coverage} + --min-var-count ${call_params.min_reads2} + --min-var-freq ${call_params.min_var_freq} + --min-hom-freq ${call_params.min_freq_for_hom} + --p-value ${call_params.p_value} + --somatic-p-value ${call_params.somatic_p_value} #end if - - --output-vcf 1 + #if str($filter_params.settings) == "no_filter": + --no-filters + #elif str($filter_params.settings) == "dream3_settings": + --min-var-count2 3 + --min-var-count2-lc 1 + --min-var-freq2 0.05 + --max-somatic-p 0.05 + --max-somatic-p-depth 10 + --min-ref-readpos 0.2 + --min-var-readpos 0.15 + --min-ref-dist3 0.2 + --min-var-dist3 0.15 + --min-ref-len 90 + --min-var-len 90 + --max-len-diff 0.05 + --min-strandedness 0 + --min-strand-reads 5 + --min-ref-basequal 15 + --min-var-basequal 30 + --max-basequal-diff 50 + --min-ref-mapqual 20 + --min-var-mapqual 30 + --max-mapqual-diff 10 + --max-ref-mmqs 50 + --max-var-mmqs 100 + --min-mmqs-diff 0 + --max-mmqs-diff 50 + #elif str($filter_params.settings) == "custom": + --min-var-count2 ${filter_params.min_var_count} + --min-var-count2-lc ${filter_params.min_var_count_lc} + --min-var-freq2 ${filter_params.min_var_freq2} + --max-somatic-p ${filter_params.max_somatic_p} + --max-somatic-p-depth ${filter_params.max_somatic_p_depth} + --min-ref-readpos ${filter_params.min_ref_readpos} + --min-var-readpos ${filter_params.min_var_readpos} + --min-ref-dist3 ${filter_params.min_ref_dist3} + --min-var-dist3 ${filter_params.min_var_dist3} + --min-ref-len ${filter_params.min_ref_len} + --min-var-len ${filter_params.min_var_len} + --max-len-diff ${filter_params.max_len_diff} + --min-strandedness ${filter_params.min_strandedness} + --min-strand-reads ${filter_params.min_strand_reads} + --min-ref-basequal ${filter_params.min_ref_basequal} + --min-var-basequal ${filter_params.min_var_basequal} + --max-basequal-diff ${filter_params.max_basequal_diff} + --min-ref-mapqual ${filter_params.min_ref_mapqual} + --min-var-mapqual ${filter_params.min_var_mapqual} + --max-mapqual-diff ${filter_params.max_mapqual_diff} + --max-ref-mmqs ${filter_params.max_ref_mmqs} + --max-var-mmqs ${filter_params.max_var_mmqs} + --min-mmqs-diff ${filter_params.min_mmqs_diff} + --max-mmqs-diff ${filter_params.max_mmqs_diff} + #end if + --verbose + $ref_genome ]]></command> <inputs> - - <expand macro="input_pileups"/> - - <expand macro="min_coverage" /> - <param argument="--min-coverage-normal" name="min_coverage_normal" type="integer" value="8" min="1" max="200" - label="Minimum read depth from the normal sample" help="Minimum depth at a position to make a call" /> - <param argument="--min-coverage-tumor" name="min_coverage_tumor" type="integer" value="6" min="1" max="200" - label="Minimum read depth from the tumor sample" help="Minimum depth at a position to make a call" /> - <expand macro="min_reads2" /> - <expand macro="min_avg_qual" /> - <expand macro="min_var_freq" value="0.10" /> - <expand macro="min_freq_for_hom" /> - <param argument="--normal-purity" name="normal_purity" type="float" value="1.00" min="0" max="1.00" - label="Estimated purity (non-tumor content) of normal sample"/> - <param argument="--tumor-purity" name="tumor_purity" type="float" value="1.00" min="0" max="1.00" - label="Estimated purity (tumor content) of tumor sample"/> - <expand macro="p_value" label="P-value threshold to call a heterozygote" value="0.99"/> - <param argument="--somatic-p-value" name="somatic_p_value" type="float" value="0.05" min="0" max="1" - label="p-value threshold for calling somatic sites"/> - <expand macro="strand_filter" /> + <conditional name="reference"> + <param name="source" type="select" + label="Will you select a reference genome from your history or use a built-in genome?"> + <option value="cached">Use a built-in genome</option> + <option value="history">Use a genome from my history</option> + </param> + <when value="cached"> + <param name="genome" type="select" + label="reference genome" + help="The fasta reference genome that variants should be called against."> + <options from_data_table="fasta_indexes" /> + </param> + </when> + <when value="history"> + <param name="genome" type="data" format="fasta" + label="reference genome" + help="The fasta reference genome that variants should be called against."/> + </when> + </conditional> + <param name="normal_bam" type="data" format="bam" + label="aligned reads from normal sample" /> + <param name="tumor_bam" type="data" format="bam" + label="aligned reads from tumor sample" /> + <param argument="--normal-purity" name="normal_purity" type="float" value="1.0" min="0" max="1.0" + label="Estimated purity (non-tumor content) of normal sample"/> + <param argument="--tumor-purity" name="tumor_purity" type="float" value="1.0" min="0" max="1.0" + label="Estimated purity (tumor content) of tumor sample"/> + <param name="split_output" type="boolean" truevalue="--split-output" falsevalue="" checked="false" + label="Generate separate output datasets for SNP and indel calls?" /> + <conditional name="call_params"> + <param name="settings" label="Settings for Variant Calling" type="select"> + <option value="varscan_defaults" selected="true">Use default values</option> + <option value="custom">Customize settings</option> + </param> + <when value="custom"> + <param argument="samtools mpileup -Q" name="min_avg_qual" type="integer" value="13" min="0" max="50" + label="Minimum base quality" + help="The minimum base quality at the variant position required to use a read for calling" /> + <param argument="samtools mpileup -q" name="min_mapqual" type="integer" value="0" min="0" max="60" + label="Minimum mapping quality" + help="The minimum mapping quality required for a read to be considered in variant calling" /> + <expand macro="min_coverage" + help="Minimum site coverage required in the normal and in the tumor sample to call a variant. This threshold gets applied after eliminating reads with low base and mapping qualitiy as defined above." /> + <expand macro="min_reads2" /> + <expand macro="min_var_freq" value="0.1" /> + <expand macro="min_freq_for_hom" /> + <expand macro="p_value" value="0.99" + help="The p-value threshold used to determine if a variant should be called for either sample" /> + <param argument="--somatic-p-value" name="somatic_p_value" type="float" value="0.05" min="0" max="1" + label="P-value threshold for calling somatic variants and LOH events" + help="The p-value threshold used to determine if read count differences between the normal and the tumor sample justify classification of a variant as somatic or as an LOH event" /> + </when> + <when value="varscan_defaults" /> + </conditional> + <conditional name="filter_params"> + <param name="settings" label="Settings for Posterior Variant Filtering" type="select"> + <option value="varscan_defaults" selected="true">Use default values</option> + <option value="dream3_settings">Use settings optimized for DREAM-3</option> + <option value="no_filter">Do not perform posterior filtering</option> + <option value="custom">Customize settings</option> + </param> + <when value="varscan_defaults" /> + <when value="dream3_settings" /> + <when value="no_filter" /> + <when value="custom"> + <param argument="--min-var-count" name="min_var_count" type="integer" value="4" min="1" max="200" + label="Minimum number of variant-supporting reads" + help="" /> + <param argument="--min-var-count-lc" name="min_var_count_lc" type="integer" value="2" min="1" max="200" + label="Low coverage minimum number of variant-supporting reads" + help="Will be applied instead of the --min-var-count limit for sites with poor overall (less than --max-somatic-p-depth) coverage" /> + <param argument="--min-var-freq" name="min_var_freq2" type="float" value="0.05" min="0" max="1" + label="Minimum variant allele frequency" + help="" /> + <param argument="--max-somatic-p" name="max_somatic_p" type="float" value="0.05" min="0" max="1" + label="Maximum somatic p-value allowed for a somatic call" + help="" /> + <param argument="--max-somatic-p-depth" name="max_somatic_p_depth" type="integer" value="10" min="2" max="200" + label="Depth required at variant site to run --max-somatic-p filter" + help="" /> + <param argument="--min-ref-readpos" name="min_ref_readpos" type="float" value="0.1" min="0" max="1" + label="Minimum relative variant position in ref-supporting reads" + help="The minimum average relative distance from the ends of ref-supporting reads required for variant sites" /> + <param argument="--min-var-readpos" name="min_var_readpos" type="float" value="0.1" min="0" max="1" + label="Minimum relative variant position in variant-supporting reads" + help="The minimum average relative distance from the ends of variant-supporting reads required for variant sites" /> + <param argument="--min-ref-dist3" name="min_ref_dist3" type="float" value="0.1" min="0" max="1" + label="Minimum distance of variant site from 3'-end of ref-supporting reads" + help="The minimum average relative distance from the effective 3'end of ref-supporting reads required for variant sites" /> + <param argument="--min-var-dist3" name="min_var_dist3" type="float" value="0.1" min="0" max="1" + label="Minimum distance of variant site from 3'-end of variant-supporting reads" + help="The minimum average relative distance from the effective 3'end of variant-supporting reads required for variant sites" /> + <param argument="--min-ref-avgrl" name="min_ref_len" type="integer" value="90" min="0" max="200" + label="Minimum length of ref-supporting reads" + help="The minimum average trimmed length required for reads supporting the reference allele" /> + <param argument="--min-var-avgrl" name="min_var_len" type="integer" value="90" min="0" max="200" + label="Minimum length of variant-supporting reads" + help="The minimum average trimmed length required for reads supporting the variant allele" /> + <param argument="--max-rl-diff" name="max_len_diff" type="float" value="0.25" min="0" max="1" + label="Maximum relative read length difference" + help="The maximum allowed average relative read length difference (ref - var) between reads supporting the reference and the variant allele" /> + <param argument="--min-strandedness" name="min_strandedness" type="float" value="0.01" min="0" max="0.5" + label="Minimum fraction of variant reads from each strand" + help="The minimum fraction of variant reads that are required to come from the forward and from the reverse strand" /> + <param argument="--min-strand-reads" name="min_strand_reads" type="integer" value="5" min="2" max="200" + label="Minimum variant allele depth required to apply the --min-strandedness filter" + help="" /> + <param argument="--min-ref-basequal" name="min_ref_basequal" type="integer" value="15" min="1" max="50" + label="Minimum average base quality for the ref allele" + help="The minimum average base quality required at the variant site for reads supporting the reference allele" /> + <param argument="--min-var-basequal" name="min_var_basequal" type="integer" value="15" min="1" max="50" + label="Minimum average base quality for the variant allele" + help="The minimum average base quality required at the variant site for reads supporting the variant allele" /> + <param argument="max-basequal-diff" name="max_basequal_diff" type="integer" value="50" min="0" max="50" + label="Maximum base quality difference between ref- and variant-supporting reads" + help="The maximum average base quality difference (ref - var) allowed between the variant site positions of reads supporting the reference and the variant allele" /> + <param argument="--min-ref-mapqual" name="min_ref_mapqual" type="integer" value="15" min="1" max="60" + label="Minimum average mapping quality of ref-supporting reads" + help="The minimum average mapping quality required for reads supporting the reference allele" /> + <param argument="--min-var-mapqual" name="min_var_mapqual" type="integer" value="15" min="1" max="60" + label="Minimum average mapping quality of variant-supporting reads" + help="The minimum average mapping quality required for reads supporting the variant allele" /> + <param argument="--max-mapqual-diff" name="max_mapqual_diff" type="integer" value="50" min="0" max="60" + label="Maximum mapping quality difference between ref- and variant-supporting reads" + help="The maximum average mapping quality difference (ref - var) allowed between reads supporting the reference and the variant allele" /> + <param argument="--max-ref-mmqs" name="max_ref_mmqs" type="integer" value="100" min="0" + label="Maximum mismatch base quality sum of ref-supporting reads" + help="The maximum mismatch base quality sum allowed for reads supporting the reference allele" /> + <param argument="--max-var-mmqs" name="max_var_mmqs" type="integer" value="100" min="0" + label="Maximum mismatch base quality sum of var-supporting reads" + help="The maximum mismatch base quality sum allowed for reads supporting the variant allele" /> + <param argument="--min-mmqs-diff" name="min_mmqs_diff" type="integer" value="0" min="0" + label="Minimum difference between mismatch base quality sums of variant- and ref-supporting reads" + help="The minimum difference in the mismatch base quality sums (var - ref) required between reads supporting the variant and the reference allele" /> + <param argument="--max-mmqs-diff" name="max_mmqs_diff" type="integer" value="50" min="1" + label="Maximum difference between mismatch base quality sums of variant- and ref-supporting reads" + help="The maximum difference in the mismatch base quality sums (var - ref) allowed between reads supporting the variant and the reference allele" /> + </when> + </conditional> </inputs> <outputs> - <data name="output_indel" from_work_dir="galaxy_out.indel.vcf" format="vcf"/> - <data name="output_snp" from_work_dir="galaxy_out.snp.vcf" format="vcf"/> + <data name="output" format="vcf"> + <filter>not split_output</filter> + </data> + <data name="output_snp" from_work_dir="variants_out.snp" format="vcf" + label="Varscan somatic SNP calls on ${on_string}"> + <filter>split_output</filter> + </data> + <data name="output_indel" from_work_dir="variants_out.indel" format="vcf" + label="Varscan somatic indel calls on ${on_string}"> + <filter>split_output</filter> + </data> </outputs> <tests> - <test> - <conditional name="pileup"> - <param name="pileup_select" value="separated" /> - <param name="normal_pileup" value="N_Region_Chr1_CDKN2C.pileup.gz" /> - <param name="tumor_pileup" value="T_Region_Chr1_CDKN2C.pileup.gz" /> + <test expect_num_outputs="1"> + <conditional name="reference"> + <param name="source" value="history" /> + <param name="genome" value="hg19_chrM.fa" /> + </conditional> + <param name="normal_bam" value="control_chrM.bam" /> + <param name="tumor_bam" value="tumor_chrM.bam" /> + <param name="split_output" value="false" /> + <conditional name="call_params"> + <param name="settings" value="varscan_defaults" /> + </conditional> + <conditional name="filter_params"> + <param name="settings" value="varscan_defaults" /> + </conditional> + <output name="output"> + <expand macro="test_mentions_contig" /> + <expand macro="test_mentions_filters" /> + </output> + </test> + <test expect_num_outputs="2"> + <conditional name="reference"> + <param name="source" value="history" /> + <param name="genome" value="hg19_chrM.fa" /> + </conditional> + <param name="normal_bam" value="control_chrM.bam" /> + <param name="tumor_bam" value="tumor_chrM.bam" /> + <param name="split_output" value="true" /> + <conditional name="call_params"> + <param name="settings" value="varscan_defaults" /> </conditional> - <param name="min_coverage" value="2" /> - <param name="min_coverage_normal" value="2" /> - <param name="min_coverage_tumor" value="2" /> - <param name="min_reads2" value="1" /> - <param name="min_avg_qual" value="5" /> - <param name="min_var_freq" value="0.01" /> - <param name="min_freq_for_hom" value="0.75" /> + <conditional name="filter_params"> + <param name="settings" value="varscan_defaults" /> + </conditional> + <output name="output_indel"> + <expand macro="test_mentions_contig" /> + <expand macro="test_mentions_filters" /> + </output> + <output name="output_snp"> + <expand macro="test_mentions_contig" /> + <expand macro="test_mentions_filters" /> + </output> + </test> + <test expect_num_outputs="1"> + <conditional name="reference"> + <param name="source" value="history" /> + <param name="genome" value="hg19_chrM.fa" /> + </conditional> + <param name="normal_bam" value="control_chrM.bam" /> + <param name="tumor_bam" value="tumor_chrM.bam" /> + <param name="split_output" value="false" /> + <conditional name="call_params"> + <param name="settings" value="custom" /> + <param name="min_coverage" value="2" /> + <param name="min_reads2" value="1" /> + <param name="min_avg_qual" value="5" /> + <param name="min_var_freq" value="0.01" /> + <param name="min_freq_for_hom" value="0.66" /> + <param name="p_value" value="0.97" /> + <param name="somatic_p_value" value="0.09" /> + </conditional> <param name="normal_purity" value="0.6" /> <param name="tumor_purity" value="0.6" /> - <param name="p_value" value="0.99" /> - <output name="output_indel" file="varscan_somatic_indel_result1.vcf" lines_diff="0" /> - <output name="output_snp" file="varscan_somatic_snp_result1.vcf" lines_diff="0" /> + <conditional name="filter_params"> + <param name="settings" value="varscan_defaults" /> + </conditional> + <assert_stderr> + <has_line_matching + expression=" Min coverage:	2x for Normal, 2x for Tumor" /> + <has_line_matching + expression="Min reads2:	1" /> + <has_line_matching + expression="Min var freq:	0.01" /> + <has_line_matching + expression="Min freq for hom:	0.66" /> + <has_line_matching + expression="Normal purity:	0.6" /> + <has_line_matching + expression="Tumor purity:	0.6" /> + <has_line_matching + expression="Min avg qual:	5" /> + <has_line_matching + expression="P-value thresh:	0.97" /> + <has_line_matching + expression="Somatic p-value:	0.09" /> + </assert_stderr> + <output name="output"> + <expand macro="test_mentions_contig" /> + <expand macro="test_mentions_filters" /> + </output> </test> - <test> - <conditional name="pileup"> - <param name="pileup_select" value="combined" /> - <param name="combined_pileup" value="NT.pileup.gz" /> + <test expect_num_outputs="1"> + <conditional name="reference"> + <param name="source" value="history" /> + <param name="genome" value="hg19_chrM.fa" /> + </conditional> + <param name="normal_bam" value="control_chrM.bam" /> + <param name="tumor_bam" value="tumor_chrM.bam" /> + <param name="split_output" value="false" /> + <conditional name="call_params"> + <param name="settings" value="varscan_defaults" /> + </conditional> + <conditional name="filter_params"> + <param name="settings" value="dream3_settings" /> </conditional> - <param name="min_coverage" value="2" /> - <param name="min_coverage_normal" value="2" /> - <param name="min_coverage_tumor" value="2" /> - <param name="min_reads2" value="1" /> - <param name="min_avg_qual" value="5" /> - <param name="min_var_freq" value="0.01" /> - <param name="min_freq_for_hom" value="0.75" /> - <param name="normal_purity" value="0.6" /> - <param name="tumor_purity" value="0.6" /> - <param name="p_value" value="0.99" /> - <output name="output_indel" file="varscan_somatic_indel_result2.vcf" lines_diff="0" /> - <output name="output_snp" file="varscan_somatic_snp_result2.vcf" lines_diff="0" /> + <output name="output"> + <expand macro="test_mentions_contig" /> + <expand macro="test_mentions_filters" /> + </output> + </test> + <test expect_num_outputs="1"> + <conditional name="reference"> + <param name="source" value="history" /> + <param name="genome" value="hg19_chrM.fa" /> + </conditional> + <param name="normal_bam" value="control_chrM.bam" /> + <param name="tumor_bam" value="tumor_chrM.bam" /> + <param name="split_output" value="false" /> + <conditional name="call_params"> + <param name="settings" value="varscan_defaults" /> + </conditional> + <conditional name="filter_params"> + <param name="settings" value="no_filter" /> + </conditional> + <output name="output"> + <expand macro="test_mentions_contig" /> + <expand macro="test_not_mentions_filters" /> + </output> + </test> + <test expect_num_outputs="1"> + <conditional name="reference"> + <param name="source" value="history" /> + <param name="genome" value="hg19_chrM.fa" /> + </conditional> + <param name="normal_bam" value="control_chrM.bam" /> + <param name="tumor_bam" value="tumor_chrM.bam" /> + <param name="split_output" value="false" /> + <conditional name="call_params"> + <param name="settings" value="varscan_defaults" /> + </conditional> + <conditional name="filter_params"> + <param name="settings" value="custom" /> + <param name="min_ref_basequal" value="28" /> + <param name="min_var_basequal" value="28" /> + </conditional> + <output name="output"> + <expand macro="test_mentions_contig" /> + <expand macro="test_mentions_filters" /> + </output> </test> </tests> <help> -**VarScan Overview** +@HELP_HEADER@ + +**The Varscan Somatic tool for Galaxy** + +This tool wraps the functionality of the ``varscan somatic`` and the +``varscan fpfilter`` command line tools. -VarScan_ performs variant detection for massively parallel sequencing data, such as exome, WGS, and transcriptome data. -It calls variants from a mpileup dataset and produces a VCF 4.1. Full documentation is available online_. +.. class:: infomark + + The wrapper aims at providing the same functionality as the + ``varscan fpfilter`` tool, but implements it using ``pysam`` internally. + Note that, as one limitation compared to the original ``varscan`` tool, + the current version does not apply filters to indels! -This tool calls germline/somatic variants from tumor-normal pileups. +The tool is designed to detect genetic variants in a **pair of samples** +representing normal and tumor tissue from the same individual. It classifies +the variants, according to their most likely origin, as **somatic** (variant is +found in the tumor, but not in the normal sample, *i.e.*, is the consequence of +a somatic mutation event), **germline** (variant is found in both samples => +germline mutation event) and **LOH** (variant is found in both samples, but +only the tumor sample appears to be homozygous for it => loss of heterozygosity +event). +This classification is encoded in the variant ``INFO`` fields of the VCF output +produced by the tool in the form of a status code ``SS`` (somatic status), +where: + +- ``SS=1`` signifies a likely germline variant, +- ``SS=2`` a somatic variant +- ``SS=3`` a LOH variant -.. _VarScan: http://dkoboldt.github.io/varscan/ -.. _online: http://dkoboldt.github.io/varscan/using-varscan.html +In addition, ``SS=0`` indicates a possible variant, but with insufficient +evidence for an, at least, heterozygous state in either individual sample, and +``SS=5`` is used for variants of unexplained origin (*e.g.*, variants found in +the normal, but not in the tumor tissue sample). + +In a second step, following variant calling, the tool can try to detect likely +false-positive calls by re-inspecting the data at the variant sites more +carefully and looking for signs that may indicate problems with the +sequencing data or its mapping. If a called variant is deemed a possible +false-positive at this step, this gets indicated in the ``FILTER`` field of the +variant record in the VCF output. For high confidence variants passing all +posterior (applied after variant calling) filters the value of the field will +be ``PASS``, for variants failing any of the posterior filters the value will +be a ``;``-separated list of the problematic filters. + **Input** -:: - - mpileup file - The SAMtools mpileup files for the normal and tumor tissue - +The tool takes as input a reference genome (in fasta format) and a pair of +aligned reads datasets (bam format). **Output** -VarScan produces a VCF 4.1 dataset as output. +A VCF dataset of called variants. When asked to *Generate separate output +datasets for SNP and indel calls*, the tool will behave like the +``varscan somatic`` command line tool and produce two VCF datasets - one with +just the single nucleotide variants, while the other one will store +insertion/deletion variants. + +**Options** + +*Estimated purity of normal sample / of tumor sample* + +Since, in practice, it is often impossible to isolate tissue samples without +contamination from surrounding tissue or from invading cells, these two fields +let you indicate your estimate of the purity of the two samples (as fractions +between 0 and 1, where 1 would indicate a contamination-free sample and 0.5 a +sample to which the desired tissue contributes only 50%, while the other 50% +consist of cells from the other tissue type). + +*Settings for Variant Calling* +Settings in this section will affect the steps of variant calling and +classification. You can accept VarScan's default values for the corresponding +parameters or customize them according to your needs. +*Settings for Posterior Variant Filtering* + +Use the parameters in this section to configure the false-positive filtering +step that follows variant calling and classification. These settings will not +influence the number of variants detected nor their classification, but may +change the ``FILTER`` field of variant records to indicate which variants +failed to pass certain filters. You can use this information with downstream +tools to exclude certain variants from further analysis steps or include only +high confidence variants that passed all filters (those with ``PASS`` as their +``INFO`` field value. You can accept the orignal filter defaults of the +``varscan fpfilter`` command line tool, use the settings established for the +tool in the `DREAM3 challenge`_, or choose to customize the settings. +Alternatively, you can also choose to skip posterior filtering entirely, in +which case all variants will have their ``INFO`` field set to ``PASS``. + +.. _DREAM3 challenge: https://www.synapse.org/#!Synapse:syn312572/wiki/58893 </help> <expand macro="citations" /> </tool>