changeset 0:179342c7b86c draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 4d3dfd4bcb567178107dcfd808ff03f9fec0bdbd
author iuc
date Wed, 12 Oct 2022 07:43:59 +0000
parents
children 630e6aeeb7e8
files all_fasta.loc.sample convert_VCF_info_fields.py macros.xml snp.xml test-data/alignment.bam test-data/all_fasta.loc test-data/annotate_vcf_test.pileup test-data/annotate_vcf_test.vcf test-data/annotated.vcf test-data/assembly.fasta test-data/basecalls.fastq.gz test-data/consensus.hdf test-data/medaka_test.bam test-data/medaka_test.bam.bai test-data/medaka_test.hdf test-data/ref.fasta test-data/ref.fasta.fai test-data/ref.fasta.gz test-data/variants.vcf.gz tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 21 files changed, 641 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/all_fasta.loc.sample	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id>	<dbkey>	<display_name>	<file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3	apiMel3	Honeybee (Apis mellifera): apiMel3	/path/to/genome/apiMel3/apiMel3.fa
+#hg19canon	hg19	Human (Homo sapiens): hg19 Canonical	/path/to/genome/hg19/hg19canon.fa
+#hg19full	hg19	Human (Homo sapiens): hg19 Full	/path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_VCF_info_fields.py	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+
+# Takes in a VCF file annotated with medaka tools annotate and converts its INFO fields.
+#
+# Usage statement:
+# python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf
+
+# 10/21/2020 - Nathan P. Roach, natproach@gmail.com
+
+import re
+import sys
+from collections import OrderedDict
+from math import log10
+
+import scipy.stats
+
+
+def pval_to_phredqual(pval):
+    try:
+        ret = round(-10 * log10(pval))
+    except ValueError:
+        ret = 2147483647  # transform pval of 0.0 to max signed 32 bit int
+    return ret
+
+
+def parseInfoField(info):
+    info_fields = info.split(";")
+    info_dict = OrderedDict()
+    for info_field in info_fields:
+        code, val = info_field.split("=")
+        info_dict[code] = val
+    return info_dict
+
+
+def annotateVCF(in_vcf_filepath, out_vcf_filepath):
+    """Postprocess output of medaka tools annotate.
+
+    Splits multiallelic sites into separate records.
+    Replaces medaka INFO fields that might represent information of the ref
+    and multiple alternate alleles with simple ref, alt allele counterparts.
+    """
+
+    in_vcf = open(in_vcf_filepath, "r")
+    # medaka INFO fields that do not make sense after splitting of
+    # multi-allelic records
+    # DP will be overwritten with the value of DPSP because medaka tools
+    # annotate currently only calculates the latter correctly
+    # (https://github.com/nanoporetech/medaka/issues/192).
+    # DPS, which is as unreliable as DP, gets skipped and the code
+    # calculates the spanning reads equivalent DPSPS instead.
+    to_skip = {"SC", "SR", "AR", "DP", "DPSP", "DPS"}
+    struct_meta_pat = re.compile("##(.+)=<ID=([^,]+)(,.+)?>")
+    header_lines = []
+    contig_ids = set()
+    contig_ids_simple = set()
+    # parse the metadata lines of the input VCF and drop:
+    # - duplicate lines
+    # - INFO lines declaring keys we are not going to write
+    # - redundant contig information
+    while True:
+        line = in_vcf.readline()
+        if line[:2] != "##":
+            assert line.startswith("#CHROM")
+            break
+        if line in header_lines:
+            # the annotate tool may generate lines already written by
+            # medaka variant again (example: medaka version line)
+            continue
+        match = struct_meta_pat.match(line)
+        if match:
+            match_type, match_id, match_misc = match.groups()
+            if match_type == "INFO":
+                if match_id == "DPSP":
+                    line = line.replace("DPSP", "DP")
+                elif match_id in to_skip:
+                    continue
+            elif match_type == "contig":
+                contig_ids.add(match_id)
+                if not match_misc:
+                    # the annotate tool writes its own contig info,
+                    # which is redundant with contig info generated by
+                    # medaka variant, but lacks a length value.
+                    # We don't need the incomplete line.
+                    contig_ids_simple.add(match_id)
+                    continue
+        header_lines.append(line)
+    # Let's check the above assumption about each ID-only contig line
+    # having a more complete counterpart.
+    assert not (contig_ids_simple - contig_ids)
+    header_lines.insert(1, "##convert_VCF_info_fields=0.2\n")
+    header_lines += [
+        '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n',
+        '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n',
+        '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n',
+        '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n',
+        '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n',
+        '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n',
+        '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n',
+        line,
+    ]
+
+    with open(out_vcf_filepath, "w") as out_vcf:
+        out_vcf.writelines(header_lines)
+        for line in in_vcf:
+            fields = line.split("\t")
+            info_dict = parseInfoField(fields[7])
+            sr_list = [int(x) for x in info_dict["SR"].split(",")]
+            sc_list = [int(x) for x in info_dict["SC"].split(",")]
+            if len(sr_list) != len(sc_list):
+                print("WARNING - SR and SC are different lengths, " "skipping variant")
+                print(line.strip())  # Print the line for debugging purposes
+                continue
+            variant_list = fields[4].split(",")
+            dpsp = int(info_dict["DPSP"])
+            ref_fwd, ref_rev = 0, 1
+            dpspf, dpspr = (int(x) for x in info_dict["AR"].split(","))
+            for i in range(0, len(sr_list), 2):
+                dpspf += sr_list[i]
+                dpspr += sr_list[i + 1]
+            for j, i in enumerate(range(2, len(sr_list), 2)):
+                dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1])
+                dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]
+                _, p_val = scipy.stats.fisher_exact(dp2x2)
+                sb = pval_to_phredqual(p_val)
+
+                as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1])
+
+                info = []
+                for code in info_dict:
+                    if code in to_skip:
+                        continue
+                    val = info_dict[code]
+                    info.append("%s=%s" % (code, val))
+
+                info.append("DP=%d" % dpsp)
+                info.append("DPSPS=%d,%d" % (dpspf, dpspr))
+
+                if dpsp == 0:
+                    info.append("AF=NaN")
+                else:
+                    af = (dp4[2] + dp4[3]) / dpsp
+                    info.append("AF=%.6f" % af)
+                if dpspf == 0:
+                    info.append("FAF=NaN")
+                else:
+                    faf = dp4[2] / dpspf
+                    info.append("FAF=%.6f" % faf)
+                if dpspr == 0:
+                    info.append("RAF=NaN")
+                else:
+                    raf = dp4[3] / dpspr
+                    info.append("RAF=%.6f" % raf)
+                info.append("SB=%d" % sb)
+                info.append("DP4=%d,%d,%d,%d" % dp4)
+                info.append("AS=%d,%d,%d,%d" % as_)
+                new_info = ";".join(info)
+                fields[4] = variant_list[j]
+                fields[7] = new_info
+                out_vcf.write("\t".join(fields))
+    in_vcf.close()
+
+
+if __name__ == "__main__":
+    annotateVCF(sys.argv[1], sys.argv[2])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,188 @@
+<macros>
+    <token name="@TOOL_VERSION@">1.7.2</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">21.01</token>
+    <xml name="bio_tools">
+        <xrefs>
+            <xref type="bio.tools">medaka</xref>
+        </xrefs>
+    </xml>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">medaka</requirement>
+            <yield />
+        </requirements>
+    </xml>
+    <xml name="version_command">
+        <version_command>medaka --version</version_command>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">@online{medaka,
+              author = {Oxford Nanopore Technologies Ltd.},
+              title = {medaka},
+              year = 2020,
+              url = {https://github.com/nanoporetech/medaka},
+              urldate = {2020-05-06}
+            }</citation>
+        </citations>
+    </xml>
+
+    <!--
+        command
+    -->
+
+    <token name="@REF_FASTA@"><![CDATA[
+        #if $reference_source.reference_source_selector == 'history':
+            #if $reference_source.ref_file.ext.endswith(".gz")
+                gunzip -c '$reference_source.ref_file' > reference.fa &&
+            #else
+                ln -f -s '$reference_source.ref_file' reference.fa &&
+            #end if
+        #else:
+            ln -f -s '$reference_source.ref_file.fields.path' reference.fa &&
+        #end if
+    ]]></token>
+
+    <!--
+        input
+    -->
+
+    <xml name="b" token_argument="-b">
+        <param argument="@ARGUMENT@" type="integer" value="100" min="1" label="Set inference batch size"/>
+    </xml>
+    <xml name="model" token_argument="-m" token_label="Select model">
+        <param argument="@ARGUMENT@" type="select" label="@LABEL@" help="For best results it is important to specify the correct model, 
+            according to the basecaller used. Medaka models are named to indicate i) the pore type, ii) the sequencing device (MinION 
+            or PromethION), iii) the basecaller variant, and iv) the basecaller version">
+            <option value="r103_fast_g507">r103_fast_g507</option>
+            <option value="r103_fast_snp_g507">r103_fast_snp_g507</option>
+            <option value="r103_fast_variant_g507">r103_fast_variant_g507</option>
+            <option value="r103_hac_g507">r103_hac_g507</option>
+            <option value="r103_hac_snp_g507">r103_hac_snp_g507</option>
+            <option value="r103_hac_variant_g507">r103_hac_variant_g507</option>
+            <option value="r103_min_high_g345">r103_min_high_g345</option>
+            <option value="r103_min_high_g360">r103_min_high_g360</option>
+            <option value="r103_prom_high_g360">r103_prom_high_g360</option>
+            <option value="r103_prom_snp_g3210">r103_prom_snp_g3210</option>
+            <option value="r103_prom_variant_g3210">r103_prom_variant_g3210</option>
+            <option value="r103_sup_g507">r103_sup_g507</option>
+            <option value="r103_sup_snp_g507">r103_sup_snp_g507</option>
+            <option value="r103_sup_variant_g507">r103_sup_variant_g507</option>
+            <option value="r1041_e82_400bps_fast_g615">r1041_e82_400bps_fast_g615</option>
+            <option value="r1041_e82_400bps_fast_variant_g615">r1041_e82_400bps_fast_variant_g615</option>
+            <option value="r1041_e82_400bps_hac_g615">r1041_e82_400bps_hac_g615</option>
+            <option value="r1041_e82_400bps_hac_variant_g615">r1041_e82_400bps_hac_variant_g615</option>
+            <option value="r1041_e82_400bps_sup_g615">r1041_e82_400bps_sup_g615</option>
+            <option value="r1041_e82_400bps_sup_variant_g615">r1041_e82_400bps_sup_variant_g615</option>
+            <option value="r104_e81_fast_g5015">r104_e81_fast_g5015</option>
+            <option value="r104_e81_hac_g5015">r104_e81_hac_g5015</option>
+            <option value="r104_e81_sup_g5015">r104_e81_sup_g5015</option>
+            <option value="r104_e81_fast_variant_g5015">r104_e81_fast_variant_g5015</option>
+            <option value="r104_e81_hac_variant_g5015">r104_e81_hac_variant_g5015</option>
+            <option value="r104_e81_sup_g610">r104_e81_sup_g610</option>
+            <option value="r104_e81_sup_variant_g610">r104_e81_sup_variant_g610</option>            
+            <option value="r10_min_high_g303">r10_min_high_g303</option>
+            <option value="r10_min_high_g340">r10_min_high_g340</option>   
+            <option value="r941_e81_fast_variant_g514">r941_e81_fast_variant_g514</option>
+            <option value="r941_e81_hac_g514">r941_e81_hac_g514</option>
+            <option value="r941_e81_hac_variant_g514">r941_e81_hac_variant_g514</option>
+            <option value="r941_e81_sup_g514">r941_e81_sup_g514</option>
+            <option value="r941_e81_sup_variant_g514">r941_e81_sup_variant_g514</option>
+            <option value="r941_min_fast_g303">r941_min_fast_g303</option>
+            <option value="r941_min_fast_g507">r941_min_fast_g507</option>
+            <option value="r941_min_fast_snp_g507">r941_min_fast_snp_g507</option>
+            <option value="r941_min_fast_variant_g507">r941_min_fast_variant_g507</option>
+            <option value="r941_min_hac_g507">r941_min_hac_g507</option>
+            <option value="r941_min_hac_snp_g507">r941_min_hac_snp_g507</option>
+            <option value="r941_min_hac_variant_g507">r941_min_hac_variant_g507</option>
+            <option value="r941_min_high_g303">r941_min_high_g303</option>
+            <option value="r941_min_high_g330">r941_min_high_g330</option>
+            <option value="r941_min_high_g340_rle">r941_min_high_g340_rle</option>
+            <option value="r941_min_high_g344">r941_min_high_g344</option>
+            <option value="r941_min_high_g351">r941_min_high_g351</option>
+            <option value="r941_min_high_g360" selected="true">r941_min_high_g360</option>
+            <option value="r941_min_sup_g507">r941_min_sup_g507</option>
+            <option value="r941_min_sup_snp_g507">r941_min_sup_snp_g507</option>
+            <option value="r941_min_sup_variant_g507">r941_min_sup_variant_g507</option>
+            <option value="r941_prom_fast_g303">r941_prom_fast_g303</option>
+            <option value="r941_prom_fast_g507">r941_prom_fast_g507</option>
+            <option value="r941_prom_fast_snp_g507">r941_prom_fast_snp_g507</option>
+            <option value="r941_prom_fast_variant_g507">r941_prom_fast_variant_g507</option>
+            <option value="r941_prom_hac_g507">r941_prom_hac_g507</option>
+            <option value="r941_prom_hac_snp_g507">r941_prom_hac_snp_g507</option>
+            <option value="r941_prom_hac_variant_g507">r941_prom_hac_variant_g507</option>
+            <option value="r941_prom_high_g303">r941_prom_high_g303</option>
+            <option value="r941_prom_high_g330">r941_prom_high_g330</option>
+            <option value="r941_prom_high_g344">r941_prom_high_g344</option>
+            <option value="r941_prom_high_g360">r941_prom_high_g360</option>
+            <option value="r941_prom_high_g4011">r941_prom_high_g4011</option>
+            <option value="r941_prom_snp_g303">r941_prom_snp_g303</option>
+            <option value="r941_prom_snp_g322">r941_prom_snp_g322</option>
+            <option value="r941_prom_snp_g360">r941_prom_snp_g360</option>
+            <option value="r941_prom_sup_g507">r941_prom_sup_g507</option>
+            <option value="r941_prom_sup_snp_g507">r941_prom_sup_snp_g507</option>
+            <option value="r941_prom_sup_variant_g507">r941_prom_sup_variant_g507</option>
+            <option value="r941_prom_variant_g303">r941_prom_variant_g303</option>
+            <option value="r941_prom_variant_g322">r941_prom_variant_g322</option>
+            <option value="r941_prom_variant_g360">r941_prom_variant_g360</option>
+            <option value="r941_sup_plant_g610">r941_sup_plant_g610</option>
+            <option value="r941_sup_plant_variant_g610">r941_sup_plant_variant_g610</option>
+        </param>
+    </xml>
+    <xml name="reference">
+        <conditional name="reference_source">
+            <param name="reference_source_selector" type="select" label="Choose the source for the reference genome">
+                <option value="cached">Use a built-in genome</option>
+                <option value="history">Use a genome from history</option>
+            </param>
+            <when value="cached">
+                <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list">
+                    <options from_data_table="all_fasta">
+                        <filter type="sort_by" column="2"/>
+                        <validator type="no_options" message="No reference genomes are available"/>
+                    </options>
+                    <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
+                </param>
+            </when>
+            <when value="history">
+                <param name="ref_file" type="data" format="fasta,fasta.gz" label="Use the following dataset as the reference sequence" help="You can upload a FASTA sequence (optionally gzip-compressed) to the history and use it as reference"/>
+            </when>
+        </conditional>
+    </xml>
+
+    <!--
+        Help
+    -->
+
+    <token name="@WID@"><![CDATA[
+*medaka* is a tool suite to create a consensus sequence from nanopore sequencing data.
+
+This task is performed using neural networks applied from a pileup of individual sequencing reads against a draft assembly. It outperforms graph-based methods operating on basecalled data, and can be competitive with state-of-the-art signal-based methods, whilst being much faster.
+    ]]></token>
+
+    <token name="@MODELS@"><![CDATA[
+
+----
+
+.. class:: infomark
+
+**Models**
+
+For best results it is important to specify the correct model, -m in the above, according to the basecaller used. Allowed values can be found by running medaka tools list\_models.
+
+Medaka models are named to indicate i) the pore type, ii) the sequencing device (MinION or PromethION), iii) the basecaller variant, and iv) the basecaller version, with the format:
+
+    ::
+
+        {pore}_{device}_{caller variant}_{caller version}
+
+For example the model named r941_min_fast_g303 should be used with data from MinION (or GridION) R9.4.1 flowcells using the fast Guppy basecaller version 3.0.3. By contrast the model 
+r941_prom_hac_g303 should be used with PromethION data and the high accuracy basecaller (termed "hac" in Guppy configuration files). Where a version of Guppy has been used without an exactly corresponding medaka model, the medaka model with the highest version equal to or less than the guppy version should be selected.
+          
+    ]]></token>
+
+    <token name="@REFERENCES@"><![CDATA[
+More information is available in the `manual <https://github.com/nanoporetech/medaka/tree/master/docs>`_ and `github <https://github.com/nanoporetech/medaka>`_.
+    ]]></token>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp.xml	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,169 @@
+<tool id="medaka_snp" name="medaka SNP tool" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>decodes probabilities to SNPs</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <expand macro="version_command"/>
+    <command detect_errors="exit_code"><![CDATA[
+## initialize
+@REF_FASTA@
+
+
+#if $pool.pool_mode == "Yes":
+    ## run
+    medaka snp
+    ## optional
+    --debug
+    #if $regions
+        --regions '${regions}'
+    #end if
+    --threshold $threshold
+    #if $ref_vcf
+        --ref_vcf '$ref_vcf'
+    #end if
+    $verbose
+    ## required
+    reference.fa
+    #for $current in $pool.inputs
+        '$current'
+    #end for
+#elif $pool.pool_mode == "No":
+    ## run
+    medaka snp
+    ## optional
+    --debug
+    #if $regions
+        --regions '${regions}'
+    #end if
+    --threshold $threshold
+    #if $ref_vcf
+        --ref_vcf '$ref_vcf'
+    #end if
+    $verbose
+    ## required
+    reference.fa
+    '$pool.input'
+#end if
+#if str($output_annotated.output_annotated_select) == 'false':
+    '$out_SNPs' ##output
+    2>&1 | tee '$out_log'
+#else
+    raw.vcf ##output of medaka snp
+    2>&1 | tee '$out_log'
+    && ln -s '$output_annotated.in_bam' in.bam
+    && ln -s '$output_annotated.in_bam.metadata.bam_index' in.bai
+    && medaka tools annotate --dpsp --pad $output_annotated.pad raw.vcf reference.fa in.bam tmp.vcf
+    && python '$__tool_directory__/convert_VCF_info_fields.py' tmp.vcf '$out_SNPs'
+#end if
+    ]]></command>
+    <inputs>
+        <conditional name="pool">
+            <param name="pool_mode"  type="select" label="Are you pooling HDF5 datasets?">
+                <option value="No" selected="true">No</option>
+                <option value="Yes">Yes</option>
+            </param>
+            <when value="Yes">
+                <param name="inputs" type="data" format="h5" multiple="true" label="Select consensus file(s)"/>
+            </when>
+            <when value="No">
+                <param name="input" type="data" format="h5" label="Select consensus file(s)"/>
+            </when>
+        </conditional>
+        <expand macro="reference"/>
+        <param argument="--regions" type="text" value="" optional="true" label="Set reference names to limit SNP calling" help="Separated by ','.">
+            <sanitizer invalid_char="">
+                <valid initial="string.ascii_letters,string.digits">
+                    <add value="_"/>
+                    <add value=","/>
+                    <add value="."/>
+                </valid>
+            </sanitizer>
+        </param>
+        <param argument="--threshold" type="float" label="Threshold for considering secondary calls" value="0.04" min="0" max="1" help="A value of 1 will result in haploid decoding" optional="true"/>
+        <param name="ref_vcf" type="data" format="vcf" optional="true" label="Reference vcf"/>        
+        <param argument="--verbose" type="boolean" truevalue="--verbose" falsevalue="" label="Populate VCF info fields?"/>
+        <conditional name="output_annotated">
+            <param name="output_annotated_select" type="select"
+            label="Type of VCF to generate"
+            help="SNP INFO fields in the VCF can be extended to include allele frequency, depth of coverage, etc., but this requires a BAM dataset to calculate those values from.">
+                <option value="true" selected="true">Write annotated VCF with extended INFO</option>
+                <option value="false">Write original decoded VCF with minimal INFO field</option>
+            </param>
+            <when value="true">
+                <param name="in_bam" type="data" format="bam" optional="false" label="BAM to calculate additional INFO fields from"/>
+                <param name="pad" type="integer" min="1" value="25"
+                label="Padding width on either side of SNP for realignment"
+                help="To calculate the additional INFO fields the tool will run medaka tools annotate, which performs local realignment of the region +- this width around each SNP. All calculated new fields will depend on the width chosen, so only change this value if you know what you are doing." />
+            </when>
+            <when value="false"/>
+        </conditional>
+        <param name="output_log_bool" type="boolean" label="Output log file?" checked="true"/>
+    </inputs>
+    <outputs>
+        <data name="out_SNPs" format="vcf" label="${tool.name} on ${on_string}: called SNPs"/>
+        <data name="out_log" format="tabular" label="${tool.name} on ${on_string}: Log">
+            <filter>output_log_bool</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!--No annotation or log-->
+        <test expect_num_outputs="1">
+            <conditional name="pool">
+                <param name="pool_mode" value="No"/>
+                <param name="input" value="medaka_test.hdf"/>
+            </conditional>
+            <conditional name="reference_source">
+                <param name="reference_source_selector" value="history"/>
+                <param name="ref_file" value="ref.fasta"/>
+            </conditional>
+            <conditional name="output_annotated">
+                <param name="output_annotated_select" value="false"/>
+            </conditional>
+            <param name="output_log_bool" value="false"/>
+            <output name="out_SNPs">
+                <assert_contents>
+                    <has_n_lines n="7"/>
+                    <has_line line="##fileformat=VCFv4.1" />
+                    <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+.. class:: infomark
+
+**What it does**
+
+@WID@
+
+This module decodes probabilities to SNPs but NOT indels.
+For a more general solution see the medaka *variant* tool.
+
+----
+
+.. class:: infomark
+
+**Input**
+
+- reference sequence (FASTA)
+- (several) consensus files (H5/HDF)
+
+----
+
+.. class:: infomark
+
+**Output**
+
+- decoded SNP probabilities (VCF)
+
+----
+
+.. class:: infomark
+
+**References**
+
+@REFERENCES@
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
Binary file test-data/alignment.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/all_fasta.loc	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,1 @@
+ref_fasta	ref_fasta	ref_fasta	${__HERE__}/ref.fasta
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/annotate_vcf_test.pileup	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,3 @@
+NC_045512.2	45	G	31	CCCCACCCCCCccccccccc,ccCCC*,+3ttccCC	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+NC_045512.2	685	A	75	.-9AAGTCATTT,-9aagtcattt,-9aagtcattt*.-9AAGTCATTT*G.-9AAGTCATTT.-9AAGTCATTT,-9aagtcattt*,-9aagtcattt,-9aagtcattt,-9aagtcattt,-9aagtcattt,,-9aagtcattt,-9aagtcattt,-9aagtcatttG.-9AAGTCATTT*,-9aagtcatttG.-9AAGTCATTT.-9AAGTCATTTG.-9AAGTCATTTG.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT*.-9AAGTCATTT*.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTTG.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT*,-9aagtcattt,-9aagtcattt*,-9aagtcattt,-9aagtcattt,-9aagtcattt,-9aagtcattt,-9aagtcattt,-9aagtcattt*,-4aagt,-9aagtcattt,-9aagtcattt**G,-9aagtcattt,-9aagtcattt,-9aagtcatttG,-9aagtcattt,-9aagtcattt*,-9aagtcattt*	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+NC_045512.2	1813	T	51	..-3AAA..-2AA..-1A*.-1A.-3AAA*..-1A..-1A.-2AA.-1A.-3AAA.-2AA.-2AA.-3AAA.,-1a,-1a,-6aaaaaa,,,-1a,,-2aa,-2aa,,-1a,-1a,*,-1a,-1a,+1a,-2aa,-1a,+1a,-1a,,-2aa.-3AAA,-1a.-2AA,-1a*,-2aa,-2aa	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/annotate_vcf_test.vcf	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.1
+##medaka_version=1.0.3
+##contig=<ID=NC_045512.2>
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Medaka genotype.">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Medaka genotype quality score">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE
+NC_045512.2	45	.	G	C	46.359	PASS		GT:GQ	1:46
+NC_045512.2	685	.	AAAGTCATTT	A	260.487	PASS		GT:GQ	1:260
+NC_045512.2	1813	.	TA	T	11.034	PASS		GT:GQ	1:11
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/annotated.vcf	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,16 @@
+##fileformat=VCFv4.1
+##medaka_version=1.0.3
+##contig=<ID=NC_045512.2>
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Medaka genotype.">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Medaka genotype quality score">
+##annotateVCFVersion=0.1
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw Depth">
+##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">
+##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Allele Frequency">
+##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Allele Frequency">
+##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias at this position">
+##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE
+NC_045512.2	45	.	G	C	46.359	PASS	DP=31;AF=0.900000;FAF=0.937500;RAF=0.857143;SB=6;DP4=0,2,15,12	GT:GQ	1:46
+NC_045512.2	685	.	AAAGTCATTT	A	260.487	PASS	DP=75;AF=0.880000;FAF=0.789474;RAF=0.972973;SB=0;DP4=0,0,30,36	GT:GQ	1:260
+NC_045512.2	1813	.	TA	T	11.034	PASS	DP=51;AF=0.607843;FAF=0.521739;RAF=0.678571;SB=0;DP4=0,0,12,19	GT:GQ	1:11
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/assembly.fasta	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,56 @@
+>4 length=3792 depth=114.52x circular=true
+TTGCGGTGTCTGCGTCAGAATCGCGTTCAGCGCGTTTCAGCGGTGCGTACAATTACAGCATTATGTTAAA
+TTTTATAATTGTCTTTAGTCATTGCGCTATGATTTTGTAGGTGGAGTTGTTTCATATTTTTGATATAAAT
+TCTTTTATCTCACTGAAGCAGTCATCGTCAGTATCAATGGTTTTGTTCTTCATTGATATGCCTATTCTAG
+AATCTGATTCCTTATTACAACTAAGGTAAACTTGAACTTTTGCTTTCTTTTCATTATTTGGGTAAGCGTA
+ATCAGGATCTAAGGATGTAATAATAGATTCTTGTTCTTTTATTTTATTGAATAGATCGTCTATATATCTA
+AACACCTCTTTCTTACACGCGTGAGCAATGATTAGATCTTCTATTGCGCCATTCTTGTTTTTTCCTGGCA
+TGATAAAGAAAGATATTTTTGTGCTTCCGTTAATGGCTATTTCATTATGCGAGCTTGGTGATGAAATGTT
+AAGCTCATTTAATGTTGAAATTATCGATCTTTCCGTGTCAGAAAACGATTCGTCGGCATCGATGATTAAG
+AATATACTTTCTTTTTCATTAAAAGACTCTATGCTTTTAAATGCTCTAAGGGTTTGTTTTAGTTTTGTCT
+TTCCATTTATTTTTTCAACATAAACATCATTAATTCCTTCTATTTCAAGTAGATTGCAAAGGAAGTGAAA
+ATCATCATCACCCTCAACGAAAATGACTTTTTTATAATCATTTATTTTTCTCGGTTTCATTTATCTCACC
+TCCCAGCCATTTTCAACTGAAAAAGAAAAATCATCCATGTTAAATTGCATTGCTGTAGGTTTTTGAGATT
+CCTTGTCTCTTCCCAGTCTTATGTAAGATATATCTTTTCTTTTTTCATCTAATTCGTTTATCGCACATAT
+TACATCATGGCTGTGTGTTGTTGTAAATATTTGATTATTATTGGTTTTTGCTGCTTCTATCAAAGCGTTG
+ATCATATCTTTGATTACTGAGTAATGAATTCCATTTTCTATCTCATCAATGAGAATAATTGAGTCTTTTT
+GAACTAAGATTGAGCTAATGAAAGACAGCGCTCTGCTTACTCCTTCACCAAGCATTGATATTTCATTTAA
+TTCACTTAGTCCTATATCTAAAAAAATCTCTTTGTTTTGATCTGTTGCTATTATTGCAATATCGTTAATT
+CTATTATCTATTCTTTTTAGGTTGCCTATTATTTCATCTTTTCTTTTTTGTGTTAGTAGTTTTGATACGT
+TTTTTATCGTTGCTTCTTTGTTTATGTTTCTTGATGTGGTTATTATCGCTGCGGTTTTGAATTCATATAG
+CTTACCTTTTTTAGTTTGTTTTATTTCTCCGTTTATTTCTTGTGCTTTTTGTGTTATTTTTATAGTAATC
+AATTCTTCTGGGGGGGTTCTTTTTCCGTTCTGTGTTTCACTAACCTTAATCATTAAAACATTCCCCTTTT
+CCTGAGAGCCGAGGGTTTGTCGCAATGGAGGGGTGTTTGATAATATTGACTCAACATTGAGGGACACGCT
+TGATTCACTTTGTGTGTTATCTATCGTTGAGTATTTTTGTGTTACTTCTGACCTTTCATCTCTTATTGTT
+ATGGACATTTCTTTGGATGTGTCCATGTTATAAAAATAGCTATCCCAAAATGAGTTTTCTTTATTTAAAA
+GAGATGCTTCATTTCTAAAGCTTTTTACTTTTAACAGAGCTGCTGGGTTTTTTACGTCATAAAATGAAAA
+TATCGCGTCTAATATCGATGTTTTCCCAAAGTTGTTTTTTCCGGATATTACATTAAGTGTTTTTAGTTGT
+TTGAGAGTTATGCTCTCGAAACACTTAAAATTTTTTAAGTGGATATCTTTTATCATGTTTAAACTTGCTC
+TCCATTTCTGTTTACGTTTACATAGTAATAATGTGTTTATTTATGCATCACTATGCCACTGTTTCTTTTC
+TAGAAGTATAATCAAAATAATATTACTTTATAGTCCTGTTTGGATTGATTTTAACCAGTTTGTCGGAGTT
+TCTTCTTGTGATTATTGTGTTCTGGTCGTAATCTCCTAAACTGAAAACGAAAAAACCACCTGGGGAGGTG
+GTTTGATCGAAGGTTAAGTCAGTTGGGGAACTGCTTAACCAGGTAACTGTCTTGTGCGAGACGAGTCACC
+AAAACTGTCCTTTCAGTGTAGCTGTAGTCAGGCCATCACTTCAAGTACTCTCTTGAACCTCTCGCACATC
+GACTTTTACCAATGGCTGCTGCCAGTGGCGCTTTGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTT
+ACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTCGTGCACACAGCCCAGCTTGGAGCGAACGACC
+TACACCGAACCGAGATACCTACAGCGTGAGCTATGAGAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGG
+ACAGGTATCCGGTAAGCGGCAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCAGGGGGAAACACCTG
+GTATCTTTATAGTCCTGTCGGGTTTCGCCACCTCTGGCTTGAGCGTCGATTTTTGTGATGCTCGTCAGGG
+GGGCGGAGCCTATGGAAAAACGCCTGCGGCGCTGGCTTCCTCCGGTGCTTTGCTTTTTGCTCACATGTTC
+TTTCCGACTTTATCCCCCGATTCTGTGGATAACCGTATTACCGCTTTTGAGTGAGCTGACACCGCTCGCC
+GCAGTCGAACGACCGAGCGTAGCGAGTCAGTGAGCGAGGAAGCGGAAGAGCGCCTTATGTGACATTTTCT
+CCTTACGCTCTGTTGTGCCGTTCGGCATCCTGTCCTGAGCGTTATCTCTCTGTGCTATTTTCTACTTCAA
+AGCGTGTCTGGATGCTGTTCTGGAGTTCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGCAGCCTGTTC
+GTCAGCATCGGGTGCGTCCAGTTTTTCGAGCAGCGTCAGGCTCTGACTTTTTATGAATCCCGCCATGTTG
+AGTACGGCTTGCTGCTGCTTGTTCATCTTTTCGTTTTCTCCGTTCTGTCTGTCATCTGCGTTGTGTGATT
+ATATCGCGTACCACTTTTCGACTGTTTTGCTGCCGTTATTCTGCCGCTTGGCTTTTTGACGGGCATTTCT
+GTCAGACAACACTGTCACTGCCAAAAAACTGCCGTGCCTTTGTCGGTAATTCGAGCTTGCTGACAGGACA
+GGATGTGCAATTGTTATACCGCGCATACATGCACGCTATTACAATTGCCCTGGTCAGGGCTTTGCCCCGA
+CACCCATGTCAGATACGGAGCCATGTTTTATGACAAAACGAAGTGGAAGTAATACGCGCAGGCGGGCTAT
+CAGTCGCCCTGTTCGTCTGACGGCAGAAGAAGACCAGGAAATCAGAAAAAGGGCTGCTGAATGCGGCAAG
+ACCGTTTCCGGTTTTTTACGGGCGGCAGCTCTCGGTAAGAAAGTTAACTCACTGACTGATGATCGGGTAC
+TGAAAGAAGTTATGAGACTGGGGGCGTTACAGAAAAAACTCTTTATCGACGGCAAGCGTGTCGGGGACAG
+GGAGTATGCGGAGGTGCTGATCGCTATTACGGAGTATCACCGTGCCCTGTTATCCAGACTTATGGCAGAT
+TAGCTTCCCGGAGAGAAAACTGTCGAAAACAGACGGTATGAACACCGTAAGCTCCCAAAGTGATCACCAT
+TCGCTTTCATGCATAGCTATGCAGCGAGCTGAAAACGATCCTGACGCATCCTTCCTGTTTTCCCGGGGTA
+AAACATCTCTTT
\ No newline at end of file
Binary file test-data/basecalls.fastq.gz has changed
Binary file test-data/consensus.hdf has changed
Binary file test-data/medaka_test.bam has changed
Binary file test-data/medaka_test.bam.bai has changed
Binary file test-data/medaka_test.hdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ref.fasta	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,2 @@
+>NC_045512.2
+attaaaggtttataccttcccaggtaacaaaccaaccaactttcgatctcttgtagatctgttctctaaacgaactttaaaatctgtgtggctGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGAGATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTG
AGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ref.fasta.fai	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,1 @@
+NC_045512.2	2940	13	2940	2941
Binary file test-data/ref.fasta.gz has changed
Binary file test-data/variants.vcf.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Locations of all fasta files under genome directory -->
+    <table name="all_fasta" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, path</columns>
+        <file path="all_fasta.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Wed Oct 12 07:43:59 2022 +0000
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Locations of all fasta files under genome directory -->
+    <table name="all_fasta" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, path</columns>
+        <file path="${__HERE__}/test-data/all_fasta.loc"/>
+    </table>
+</tables>
\ No newline at end of file