Mercurial > repos > iuc > medaka_snp
changeset 0:179342c7b86c draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 4d3dfd4bcb567178107dcfd808ff03f9fec0bdbd
author | iuc |
---|---|
date | Wed, 12 Oct 2022 07:43:59 +0000 |
parents | |
children | 630e6aeeb7e8 |
files | all_fasta.loc.sample convert_VCF_info_fields.py macros.xml snp.xml test-data/alignment.bam test-data/all_fasta.loc test-data/annotate_vcf_test.pileup test-data/annotate_vcf_test.vcf test-data/annotated.vcf test-data/assembly.fasta test-data/basecalls.fastq.gz test-data/consensus.hdf test-data/medaka_test.bam test-data/medaka_test.bam.bai test-data/medaka_test.hdf test-data/ref.fasta test-data/ref.fasta.fai test-data/ref.fasta.gz test-data/variants.vcf.gz tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 21 files changed, 641 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/all_fasta.loc.sample Wed Oct 12 07:43:59 2022 +0000 @@ -0,0 +1,18 @@ +#This file lists the locations and dbkeys of all the fasta files +#under the "genome" directory (a directory that contains a directory +#for each build). The script extract_fasta.py will generate the file +#all_fasta.loc. This file has the format (white space characters are +#TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_path> +# +#So, all_fasta.loc could look something like this: +# +#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa +#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa +#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa +# +#Your all_fasta.loc file should contain an entry for each individual +#fasta file. So there will be multiple fasta files for each build, +#such as with hg19 above. +#
#!/usr/bin/env python3

# convert_VCF_info_fields.py
#
# Takes in a VCF file annotated with medaka tools annotate and converts
# its multi-allelic INFO annotations into simple per-allele (ref, alt)
# fields, writing one record per alternate allele.
#
# Usage statement:
# python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf

# 10/21/2020 - Nathan P. Roach, natproach@gmail.com

import re
import sys
from collections import OrderedDict
from math import log10

import scipy.stats

# medaka INFO fields that do not make sense after splitting of
# multi-allelic records.
# DP will be overwritten with the value of DPSP because medaka tools
# annotate currently only calculates the latter correctly
# (https://github.com/nanoporetech/medaka/issues/192).
# DPS, which is as unreliable as DP, gets skipped and the code
# calculates the spanning reads equivalent DPSPS instead.
_TO_SKIP = frozenset({"SC", "SR", "AR", "DP", "DPSP", "DPS"})

# Matches structured metadata lines of the form ##TYPE=<ID=xyz[,more]>
_STRUCT_META_PAT = re.compile("##(.+)=<ID=([^,]+)(,.+)?>")

# Phred value reported for a p-value of 0.0 (max signed 32 bit int)
_MAX_PHRED = 2147483647

# Declarations of the INFO keys this script adds to every record
_NEW_INFO_HEADER_LINES = (
    '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n',
    '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n',
    '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n',
    '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n',
    '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n',
    '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n',
    '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n',
)


def pval_to_phredqual(pval):
    """Return the rounded Phred-scaled quality (-10 * log10) of *pval*.

    A p-value for which the logarithm is undefined (0.0 or negative) is
    mapped to the maximum signed 32 bit integer.
    """
    try:
        return round(-10 * log10(pval))
    except ValueError:
        # log10 raises for pval <= 0; report the largest value we can
        return _MAX_PHRED


def parseInfoField(info):
    """Parse a VCF INFO column into an ordered mapping of key -> value.

    Args:
        info: the raw INFO string, i.e. ';'-separated KEY=VALUE items.

    Returns:
        An OrderedDict preserving the order of keys in *info*. Keys given
        without a value (flags) map to None. Only the first '=' of an
        item separates key and value. Empty items and the missing-value
        placeholder '.' are ignored.
    """
    info_dict = OrderedDict()
    for info_field in info.split(";"):
        if not info_field or info_field == ".":
            continue
        code, sep, val = info_field.partition("=")
        info_dict[code] = val if sep else None
    return info_dict


def _read_header(in_vcf):
    """Consume the header of *in_vcf* and return the output header lines.

    Reads up to and including the #CHROM line, so that *in_vcf* is left
    positioned at the first record. Metadata lines are dropped when they
    are exact duplicates, declare INFO keys that will not be written, or
    are ID-only contig lines made redundant by a more complete contig
    line with the same ID.

    Raises:
        ValueError: if the metadata lines are not followed by a #CHROM
            line.
    """
    entries = []  # (output line, contig ID if ID-only contig line else None)
    seen = set()  # raw metadata lines encountered so far
    complete_contigs = set()  # contig IDs declared with extra attributes
    column_line = None
    for line in in_vcf:
        if not line.startswith("##"):
            column_line = line
            break
        if line in seen:
            # the annotate tool may generate lines already written by
            # medaka variant again (example: medaka version line)
            continue
        seen.add(line)
        id_only_contig = None
        match = _STRUCT_META_PAT.match(line)
        if match:
            match_type, match_id, match_misc = match.groups()
            if match_type == "INFO":
                if match_id == "DPSP":
                    # DPSP gets reported under the DP key
                    line = line.replace("DPSP", "DP")
                elif match_id in _TO_SKIP:
                    continue
            elif match_type == "contig":
                if match_misc:
                    complete_contigs.add(match_id)
                else:
                    # the annotate tools writes its own contig info,
                    # which is redundant with contig info generated by
                    # medaka variant, but lacks a length value.
                    # Whether it is really redundant can only be decided
                    # once all metadata lines have been seen.
                    id_only_contig = match_id
        entries.append((line, id_only_contig))
    if column_line is None or not column_line.startswith("#CHROM"):
        raise ValueError(
            "Malformed VCF: metadata lines are not followed by a #CHROM line"
        )

    # Drop ID-only contig lines only if a more complete counterpart
    # exists; otherwise they are the only contig information available.
    header_lines = [
        line
        for line, contig_id in entries
        if contig_id is None or contig_id not in complete_contigs
    ]
    header_lines.insert(1, "##convert_VCF_info_fields=0.2\n")
    header_lines.extend(_NEW_INFO_HEADER_LINES)
    header_lines.append(column_line)
    return header_lines


def _split_record(line):
    """Yield one converted output line per alternate allele of *line*.

    Yields nothing (after printing a warning) if the SR and SC INFO
    values of the record differ in length.
    """
    # Separate the line terminator so that rewriting the INFO column
    # cannot swallow it when INFO is the last column of the record.
    body = line.rstrip("\r\n")
    eol = line[len(body):] or "\n"
    fields = body.split("\t")
    info_dict = parseInfoField(fields[7])
    sr_list = [int(x) for x in info_dict["SR"].split(",")]
    sc_list = [int(x) for x in info_dict["SC"].split(",")]
    if len(sr_list) != len(sc_list):
        print("WARNING - SR and SC are different lengths, " "skipping variant")
        print(line.strip())  # Print the line for debugging purposes
        return
    variant_list = fields[4].split(",")
    dpsp = int(info_dict["DPSP"])
    ref_fwd, ref_rev = 0, 1
    # Spanning depth by strand: ambiguous reads (AR) plus the reads
    # supporting the ref and each alt allele (SR holds fwd, rev pairs).
    dpspf, dpspr = (int(x) for x in info_dict["AR"].split(","))
    for i in range(0, len(sr_list), 2):
        dpspf += sr_list[i]
        dpspr += sr_list[i + 1]

    # INFO items copied unchanged to every split record
    passthrough = [
        code if val is None else "%s=%s" % (code, val)
        for code, val in info_dict.items()
        if code not in _TO_SKIP
    ]

    for j, i in enumerate(range(2, len(sr_list), 2)):
        dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1])
        dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]
        _, p_val = scipy.stats.fisher_exact(dp2x2)
        sb = pval_to_phredqual(p_val)
        as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1])

        info = list(passthrough)
        info.append("DP=%d" % dpsp)
        info.append("DPSPS=%d,%d" % (dpspf, dpspr))
        if dpsp == 0:
            info.append("AF=NaN")
        else:
            info.append("AF=%.6f" % ((dp4[2] + dp4[3]) / dpsp))
        if dpspf == 0:
            info.append("FAF=NaN")
        else:
            info.append("FAF=%.6f" % (dp4[2] / dpspf))
        if dpspr == 0:
            info.append("RAF=NaN")
        else:
            info.append("RAF=%.6f" % (dp4[3] / dpspr))
        info.append("SB=%d" % sb)
        info.append("DP4=%d,%d,%d,%d" % dp4)
        info.append("AS=%d,%d,%d,%d" % as_)

        out_fields = list(fields)
        out_fields[4] = variant_list[j]
        out_fields[7] = ";".join(info)
        yield "\t".join(out_fields) + eol


def annotateVCF(in_vcf_filepath, out_vcf_filepath):
    """Postprocess output of medaka tools annotate.

    Splits multiallelic sites into separate records.
    Replaces medaka INFO fields that might represent information of the ref
    and multiple alternate alleles with simple ref, alt allele counterparts.

    Args:
        in_vcf_filepath: path of the VCF written by medaka tools annotate.
        out_vcf_filepath: path the converted VCF gets written to.

    Raises:
        ValueError: if the input lacks a #CHROM header line.
        KeyError: if a record lacks one of the SR, SC, AR or DPSP INFO keys.
    """
    with open(in_vcf_filepath, "r") as in_vcf:
        header_lines = _read_header(in_vcf)
        with open(out_vcf_filepath, "w") as out_vcf:
            out_vcf.writelines(header_lines)
            for line in in_vcf:
                if not line.strip():
                    # tolerate blank lines, e.g. at the end of the file
                    continue
                out_vcf.writelines(_split_record(line))


if __name__ == "__main__":
    annotateVCF(sys.argv[1], sys.argv[2])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Wed Oct 12 07:43:59 2022 +0000 @@ -0,0 +1,188 @@ +<macros> + <token name="@TOOL_VERSION@">1.7.2</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">21.01</token> + <xml name="bio_tools"> + <xrefs> + <xref type="bio.tools">medaka</xref> + </xrefs> + </xml> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">medaka</requirement> + <yield /> + </requirements> + </xml> + <xml name="version_command"> + <version_command>medaka --version</version_command> + </xml> + <xml name="citations"> + <citations> + <citation type="bibtex">@online{medaka, + author = {Oxford Nanopore Technologies Ltd.}, + title = {medaka}, + year = 2020, + url = {https://github.com/nanoporetech/medaka}, + urldate = {2020-05-06} + }</citation> + </citations> + </xml> + + <!-- + command + --> + + <token name="@REF_FASTA@"><![CDATA[ + #if $reference_source.reference_source_selector == 'history': + #if $reference_source.ref_file.ext.endswith(".gz") + gunzip -c '$reference_source.ref_file' > reference.fa && + #else + ln -f -s '$reference_source.ref_file' reference.fa && + #end if + #else: + ln -f -s '$reference_source.ref_file.fields.path' reference.fa && + #end if + ]]></token> + + <!-- + input + --> + + <xml name="b" token_argument="-b"> + <param argument="@ARGUMENT@" type="integer" value="100" min="1" label="Set inference batch size"/> + </xml> + <xml name="model" token_argument="-m" token_label="Select model"> + <param argument="@ARGUMENT@" type="select" label="@LABEL@" help="For best results it is important to specify the correct model, + according to the basecaller used. 
Medaka models are named to indicate i) the pore type, ii) the sequencing device (MinION + or PromethION), iii) the basecaller variant, and iv) the basecaller version"> + <option value="r103_fast_g507">r103_fast_g507</option> + <option value="r103_fast_snp_g507">r103_fast_snp_g507</option> + <option value="r103_fast_variant_g507">r103_fast_variant_g507</option> + <option value="r103_hac_g507">r103_hac_g507</option> + <option value="r103_hac_snp_g507">r103_hac_snp_g507</option> + <option value="r103_hac_variant_g507">r103_hac_variant_g507</option> + <option value="r103_min_high_g345">r103_min_high_g345</option> + <option value="r103_min_high_g360">r103_min_high_g360</option> + <option value="r103_prom_high_g360">r103_prom_high_g360</option> + <option value="r103_prom_snp_g3210">r103_prom_snp_g3210</option> + <option value="r103_prom_variant_g3210">r103_prom_variant_g3210</option> + <option value="r103_sup_g507">r103_sup_g507</option> + <option value="r103_sup_snp_g507">r103_sup_snp_g507</option> + <option value="r103_sup_variant_g507">r103_sup_variant_g507</option> + <option value="r1041_e82_400bps_fast_g615">r1041_e82_400bps_fast_g615</option> + <option value="r1041_e82_400bps_fast_variant_g615">r1041_e82_400bps_fast_variant_g615</option> + <option value="r1041_e82_400bps_hac_g615">r1041_e82_400bps_hac_g615</option> + <option value="r1041_e82_400bps_hac_variant_g615">r1041_e82_400bps_hac_variant_g615</option> + <option value="r1041_e82_400bps_sup_g615">r1041_e82_400bps_sup_g615</option> + <option value="r1041_e82_400bps_sup_variant_g615">r1041_e82_400bps_sup_variant_g615</option> + <option value="r104_e81_fast_g5015">r104_e81_fast_g5015</option> + <option value="r104_e81_hac_g5015">r104_e81_hac_g5015</option> + <option value="r104_e81_sup_g5015">r104_e81_sup_g5015</option> + <option value="r104_e81_fast_variant_g5015">r104_e81_fast_variant_g5015</option> + <option value="r104_e81_hac_variant_g5015">r104_e81_hac_variant_g5015</option> + <option 
value="r104_e81_sup_g610">r104_e81_sup_g610</option> + <option value="r104_e81_sup_variant_g610">r104_e81_sup_variant_g610</option> + <option value="r10_min_high_g303">r10_min_high_g303</option> + <option value="r10_min_high_g340">r10_min_high_g340</option> + <option value="r941_e81_fast_variant_g514">r941_e81_fast_variant_g514</option> + <option value="r941_e81_hac_g514">r941_e81_hac_g514</option> + <option value="r941_e81_hac_variant_g514">r941_e81_hac_variant_g514</option> + <option value="r941_e81_sup_g514">r941_e81_sup_g514</option> + <option value="r941_e81_sup_variant_g514">r941_e81_sup_variant_g514</option> + <option value="r941_min_fast_g303">r941_min_fast_g303</option> + <option value="r941_min_fast_g507">r941_min_fast_g507</option> + <option value="r941_min_fast_snp_g507">r941_min_fast_snp_g507</option> + <option value="r941_min_fast_variant_g507">r941_min_fast_variant_g507</option> + <option value="r941_min_hac_g507">r941_min_hac_g507</option> + <option value="r941_min_hac_snp_g507">r941_min_hac_snp_g507</option> + <option value="r941_min_hac_variant_g507">r941_min_hac_variant_g507</option> + <option value="r941_min_high_g303">r941_min_high_g303</option> + <option value="r941_min_high_g330">r941_min_high_g330</option> + <option value="r941_min_high_g340_rle">r941_min_high_g340_rle</option> + <option value="r941_min_high_g344">r941_min_high_g344</option> + <option value="r941_min_high_g351">r941_min_high_g351</option> + <option value="r941_min_high_g360" selected="true">r941_min_high_g360</option> + <option value="r941_min_sup_g507">r941_min_sup_g507</option> + <option value="r941_min_sup_snp_g507">r941_min_sup_snp_g507</option> + <option value="r941_min_sup_variant_g507">r941_min_sup_variant_g507</option> + <option value="r941_prom_fast_g303">r941_prom_fast_g303</option> + <option value="r941_prom_fast_g507">r941_prom_fast_g507</option> + <option value="r941_prom_fast_snp_g507">r941_prom_fast_snp_g507</option> + <option 
value="r941_prom_fast_variant_g507">r941_prom_fast_variant_g507</option> + <option value="r941_prom_hac_g507">r941_prom_hac_g507</option> + <option value="r941_prom_hac_snp_g507">r941_prom_hac_snp_g507</option> + <option value="r941_prom_hac_variant_g507">r941_prom_hac_variant_g507</option> + <option value="r941_prom_high_g303">r941_prom_high_g303</option> + <option value="r941_prom_high_g330">r941_prom_high_g330</option> + <option value="r941_prom_high_g344">r941_prom_high_g344</option> + <option value="r941_prom_high_g360">r941_prom_high_g360</option> + <option value="r941_prom_high_g4011">r941_prom_high_g4011</option> + <option value="r941_prom_snp_g303">r941_prom_snp_g303</option> + <option value="r941_prom_snp_g322">r941_prom_snp_g322</option> + <option value="r941_prom_snp_g360">r941_prom_snp_g360</option> + <option value="r941_prom_sup_g507">r941_prom_sup_g507</option> + <option value="r941_prom_sup_snp_g507">r941_prom_sup_snp_g507</option> + <option value="r941_prom_sup_variant_g507">r941_prom_sup_variant_g507</option> + <option value="r941_prom_variant_g303">r941_prom_variant_g303</option> + <option value="r941_prom_variant_g322">r941_prom_variant_g322</option> + <option value="r941_prom_variant_g360">r941_prom_variant_g360</option> + <option value="r941_sup_plant_g610">r941_sup_plant_g610</option> + <option value="r941_sup_plant_variant_g610">r941_sup_plant_variant_g610</option> + </param> + </xml> + <xml name="reference"> + <conditional name="reference_source"> + <param name="reference_source_selector" type="select" label="Choose the source for the reference genome"> + <option value="cached">Use a built-in genome</option> + <option value="history">Use a genome from history</option> + </param> + <when value="cached"> + <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list"> + <options from_data_table="all_fasta"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No reference 
genomes are available"/> + </options> + <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/> + </param> + </when> + <when value="history"> + <param name="ref_file" type="data" format="fasta,fasta.gz" label="Use the following dataset as the reference sequence" help="You can upload a FASTA or FASTQ sequence to the history and use it as reference"/> + </when> + </conditional> + </xml> + + <!-- + Help + --> + + <token name="@WID@"><![CDATA[ +*medaka* is a tool suite to create a consensus sequence from nanopore sequencing data. + +This task is performed using neural networks applied from a pileup of individual sequencing reads against a draft assembly. It outperforms graph-based methods operating on basecalled data, and can be competitive with state-of-the-art signal-based methods, whilst being much faster. + ]]></token> + + <token name="@MODELS@"><![CDATA[ + +---- + +.. class:: infomark + +**Models** + +For best results it is important to specify the correct model, -m in the above, according to the basecaller used. Allowed values can be found by running medaka tools list\_models. + +Medaka models are named to indicate i) the pore type, ii) the sequencing device (MinION or PromethION), iii) the basecaller variant, and iv) the basecaller version, with the format: + + :: + + {pore}_{device}_{caller variant}_{caller version} + +For example the model named r941_min_fast_g303 should be used with data from MinION (or GridION) R9.4.1 flowcells using the fast Guppy basecaller version 3.0.3. By contrast the model +r941_prom_hac_g303 should be used with PromethION data and the high accuracy basecaller (termed "hac" in Guppy configuration files). Where a version of Guppy has been used without an exactly corresponding medaka model, the medaka model with the highest version equal to or less than the guppy version should be selected. 
+ + ]]></token> + + <token name="@REFERENCES@"><![CDATA[ +More information is available in the `manual <https://github.com/nanoporetech/medaka/tree/master/docs>`_ and `github <https://github.com/nanoporetech/medaka>`_. + ]]></token> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp.xml Wed Oct 12 07:43:59 2022 +0000 @@ -0,0 +1,169 @@ +<tool id="medaka_snp" name="medaka SNP tool" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>decodes probabilities to SNPs</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <expand macro="version_command"/> + <command detect_errors="exit_code"><![CDATA[ +## initialize +@REF_FASTA@ + + +#if $pool.pool_mode == "Yes": + ## run + medaka snp + ## optional + --debug + #if $regions + --regions '${regions}' + #end if + --threshold $threshold + #if $ref_vcf + --ref_vcf '$ref_vcf' + #end if + $verbose + ## required + reference.fa + #for $current in $pool.inputs + '$current' + #end for +#elif $pool.pool_mode == "No": + ## run + medaka snp + ## optional + --debug + #if $regions + --regions '${regions}' + #end if + --threshold $threshold + #if $ref_vcf + --ref_vcf '$ref_vcf' + #end if + $verbose + ## required + reference.fa + '$pool.input' +#end if +#if str($output_annotated.output_annotated_select) == 'false': + '$out_SNPs' ##output + 2>&1 | tee '$out_log' +#else + raw.vcf ##output of medaka snp + 2>&1 | tee '$out_log' + && ln -s '$output_annotated.in_bam' in.bam + && ln -s '$output_annotated.in_bam.metadata.bam_index' in.bai + && medaka tools annotate --dpsp --pad $output_annotated.pad raw.vcf reference.fa in.bam tmp.vcf + && python '$__tool_directory__/convert_VCF_info_fields.py' tmp.vcf '$out_SNPs' +#end if + ]]></command> + <inputs> + <conditional name="pool"> + <param name="pool_mode" type="select" label="Are you pooling HDF5 datasets?"> + <option value="No" selected="true">No</option> + <option value="Yes">Yes</option> + </param> + <when value="Yes"> + <param name="inputs" type="data" format="h5" multiple="true" label="Select consensus file(s)"/> + </when> + <when value="No"> + <param name="input" type="data" format="h5" label="Select consensus file(s)"/> + </when> + 
</conditional> + <expand macro="reference"/> + <param argument="--regions" type="text" value="" optional="true" label="Set reference names to limit SNP calling" help="Separated by ','."> + <sanitizer invalid_char=""> + <valid initial="string.ascii_letters,string.digits"> + <add value="_"/> + <add value=","/> + <add value="."/> + </valid> + </sanitizer> + </param> + <param argument="--threshold" type="float" label="Threshold for considering secondary calls" value="0.04" min="0" max="1" help="A value of 1 will result in haploid decoding" optional="true"/> + <param name="ref_vcf" type="data" format="vcf" optional="true" label="Reference vcf"/> + <param argument="--verbose" type="boolean" truevalue="--verbose" falsevalue="" label="Populate VCF info fields?"/> + <conditional name="output_annotated"> + <param name="output_annotated_select" type="select" + label="Type of VCF to generate" + help="SNP INFO fields in the VCF can be extended to include allele frequency, depth of coverage, etc., but this requires a BAM dataset to calculate those values from."> + <option value="true" selected="true">Write annotated VCF with extended INFO</option> + <option value="false">Write original decoded VCF with minimal INFO field</option> + </param> + <when value="true"> + <param name="in_bam" type="data" format="bam" optional="false" label="BAM to calculate additional INFO fields from"/> + <param name="pad" type="integer" min="1" value="25" + label="Padding width on either side of SNP for realignment" + help="To calculate the additional INFO fields the tool will run medaka tools annotate, which performs local realignment of the region +- this width around each SNP. All calculated new fields will depend on the width chosen, so only change this value if you know what you are doing." /> + </when> + <when value="false"/> + </conditional> + <param name="output_log_bool" type="boolean" label="Output log file?" 
checked="true"/> + </inputs> + <outputs> + <data name="out_SNPs" format="vcf" label="${tool.name} on ${on_string}: called SNPs"/> + <data name="out_log" format="tabular" label="${tool.name} on ${on_string}: Log"> + <filter>output_log_bool</filter> + </data> + </outputs> + <tests> + <!--No annotation or log--> + <test expect_num_outputs="1"> + <conditional name="pool"> + <param name="pool_mode" value="No"/> + <param name="input" value="medaka_test.hdf"/> + </conditional> + <conditional name="reference_source"> + <param name="reference_source_selector" value="history"/> + <param name="ref_file" value="ref.fasta"/> + </conditional> + <conditional name="output_annotated"> + <param name="output_annotated_select" value="false"/> + </conditional> + <param name="output_log_bool" value="false"/> + <output name="out_SNPs"> + <assert_contents> + <has_n_lines n="7"/> + <has_line line="##fileformat=VCFv4.1" /> + <has_line_matching expression="##medaka_version=[0-9]+\.[0-9]+\.[0-9]+" /> + </assert_contents> + </output> + </test> + </tests> + <help><![CDATA[ +.. class:: infomark + +**What it does** + +@WID@ + +This module decodes probabilities to SNPs but NOT indels. + +For a more general solution see the medaka *variant* tool. +---- + +.. class:: infomark + +**Input** + +- reference sequence (FASTA) +- (several) consensus files (H5/HDF) + +---- + +.. class:: infomark + +**Output** + +- decoded SNP probabilities (VCF) + +---- + +.. class:: infomark + +**References** + +@REFERENCES@ + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/all_fasta.loc Wed Oct 12 07:43:59 2022 +0000 @@ -0,0 +1,1 @@ +ref_fasta ref_fasta ref_fasta ${__HERE__}/ref.fasta \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/annotate_vcf_test.pileup Wed Oct 12 07:43:59 2022 +0000 @@ -0,0 +1,3 @@ +NC_045512.2 45 G 31 CCCCACCCCCCccccccccc,ccCCC*,+3ttccCC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +NC_045512.2 685 A 75 .-9AAGTCATTT,-9aagtcattt,-9aagtcattt*.-9AAGTCATTT*G.-9AAGTCATTT.-9AAGTCATTT,-9aagtcattt*,-9aagtcattt,-9aagtcattt,-9aagtcattt,-9aagtcattt,,-9aagtcattt,-9aagtcattt,-9aagtcatttG.-9AAGTCATTT*,-9aagtcatttG.-9AAGTCATTT.-9AAGTCATTTG.-9AAGTCATTTG.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT*.-9AAGTCATTT*.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTTG.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT.-9AAGTCATTT*,-9aagtcattt,-9aagtcattt*,-9aagtcattt,-9aagtcattt,-9aagtcattt,-9aagtcattt,-9aagtcattt,-9aagtcattt*,-4aagt,-9aagtcattt,-9aagtcattt**G,-9aagtcattt,-9aagtcattt,-9aagtcatttG,-9aagtcattt,-9aagtcattt*,-9aagtcattt* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +NC_045512.2 1813 T 51 ..-3AAA..-2AA..-1A*.-1A.-3AAA*..-1A..-1A.-2AA.-1A.-3AAA.-2AA.-2AA.-3AAA.,-1a,-1a,-6aaaaaa,,,-1a,,-2aa,-2aa,,-1a,-1a,*,-1a,-1a,+1a,-2aa,-1a,+1a,-1a,,-2aa.-3AAA,-1a.-2AA,-1a*,-2aa,-2aa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/annotate_vcf_test.vcf Wed Oct 12 07:43:59 2022 +0000 @@ -0,0 +1,9 @@ +##fileformat=VCFv4.1 +##medaka_version=1.0.3 +##contig=<ID=NC_045512.2> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Medaka genotype."> +##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Medaka genotype quality score"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE +NC_045512.2 45 . G C 46.359 PASS GT:GQ 1:46 +NC_045512.2 685 . AAAGTCATTT A 260.487 PASS GT:GQ 1:260 +NC_045512.2 1813 . TA T 11.034 PASS GT:GQ 1:11
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/annotated.vcf Wed Oct 12 07:43:59 2022 +0000 @@ -0,0 +1,16 @@ +##fileformat=VCFv4.1 +##medaka_version=1.0.3 +##contig=<ID=NC_045512.2> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Medaka genotype."> +##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Medaka genotype quality score"> +##annotateVCFVersion=0.1 +##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw Depth"> +##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency"> +##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Allele Frequency"> +##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Allele Frequency"> +##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias at this position"> +##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE +NC_045512.2 45 . G C 46.359 PASS DP=31;AF=0.900000;FAF=0.937500;RAF=0.857143;SB=6;DP4=0,2,15,12 GT:GQ 1:46 +NC_045512.2 685 . AAAGTCATTT A 260.487 PASS DP=75;AF=0.880000;FAF=0.789474;RAF=0.972973;SB=0;DP4=0,0,30,36 GT:GQ 1:260 +NC_045512.2 1813 . TA T 11.034 PASS DP=51;AF=0.607843;FAF=0.521739;RAF=0.678571;SB=0;DP4=0,0,12,19 GT:GQ 1:11
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/assembly.fasta Wed Oct 12 07:43:59 2022 +0000 @@ -0,0 +1,56 @@ +>4 length=3792 depth=114.52x circular=true +TTGCGGTGTCTGCGTCAGAATCGCGTTCAGCGCGTTTCAGCGGTGCGTACAATTACAGCATTATGTTAAA +TTTTATAATTGTCTTTAGTCATTGCGCTATGATTTTGTAGGTGGAGTTGTTTCATATTTTTGATATAAAT +TCTTTTATCTCACTGAAGCAGTCATCGTCAGTATCAATGGTTTTGTTCTTCATTGATATGCCTATTCTAG +AATCTGATTCCTTATTACAACTAAGGTAAACTTGAACTTTTGCTTTCTTTTCATTATTTGGGTAAGCGTA +ATCAGGATCTAAGGATGTAATAATAGATTCTTGTTCTTTTATTTTATTGAATAGATCGTCTATATATCTA +AACACCTCTTTCTTACACGCGTGAGCAATGATTAGATCTTCTATTGCGCCATTCTTGTTTTTTCCTGGCA +TGATAAAGAAAGATATTTTTGTGCTTCCGTTAATGGCTATTTCATTATGCGAGCTTGGTGATGAAATGTT +AAGCTCATTTAATGTTGAAATTATCGATCTTTCCGTGTCAGAAAACGATTCGTCGGCATCGATGATTAAG +AATATACTTTCTTTTTCATTAAAAGACTCTATGCTTTTAAATGCTCTAAGGGTTTGTTTTAGTTTTGTCT +TTCCATTTATTTTTTCAACATAAACATCATTAATTCCTTCTATTTCAAGTAGATTGCAAAGGAAGTGAAA +ATCATCATCACCCTCAACGAAAATGACTTTTTTATAATCATTTATTTTTCTCGGTTTCATTTATCTCACC +TCCCAGCCATTTTCAACTGAAAAAGAAAAATCATCCATGTTAAATTGCATTGCTGTAGGTTTTTGAGATT +CCTTGTCTCTTCCCAGTCTTATGTAAGATATATCTTTTCTTTTTTCATCTAATTCGTTTATCGCACATAT +TACATCATGGCTGTGTGTTGTTGTAAATATTTGATTATTATTGGTTTTTGCTGCTTCTATCAAAGCGTTG +ATCATATCTTTGATTACTGAGTAATGAATTCCATTTTCTATCTCATCAATGAGAATAATTGAGTCTTTTT +GAACTAAGATTGAGCTAATGAAAGACAGCGCTCTGCTTACTCCTTCACCAAGCATTGATATTTCATTTAA +TTCACTTAGTCCTATATCTAAAAAAATCTCTTTGTTTTGATCTGTTGCTATTATTGCAATATCGTTAATT +CTATTATCTATTCTTTTTAGGTTGCCTATTATTTCATCTTTTCTTTTTTGTGTTAGTAGTTTTGATACGT +TTTTTATCGTTGCTTCTTTGTTTATGTTTCTTGATGTGGTTATTATCGCTGCGGTTTTGAATTCATATAG +CTTACCTTTTTTAGTTTGTTTTATTTCTCCGTTTATTTCTTGTGCTTTTTGTGTTATTTTTATAGTAATC +AATTCTTCTGGGGGGGTTCTTTTTCCGTTCTGTGTTTCACTAACCTTAATCATTAAAACATTCCCCTTTT +CCTGAGAGCCGAGGGTTTGTCGCAATGGAGGGGTGTTTGATAATATTGACTCAACATTGAGGGACACGCT +TGATTCACTTTGTGTGTTATCTATCGTTGAGTATTTTTGTGTTACTTCTGACCTTTCATCTCTTATTGTT +ATGGACATTTCTTTGGATGTGTCCATGTTATAAAAATAGCTATCCCAAAATGAGTTTTCTTTATTTAAAA +GAGATGCTTCATTTCTAAAGCTTTTTACTTTTAACAGAGCTGCTGGGTTTTTTACGTCATAAAATGAAAA 
+TATCGCGTCTAATATCGATGTTTTCCCAAAGTTGTTTTTTCCGGATATTACATTAAGTGTTTTTAGTTGT +TTGAGAGTTATGCTCTCGAAACACTTAAAATTTTTTAAGTGGATATCTTTTATCATGTTTAAACTTGCTC +TCCATTTCTGTTTACGTTTACATAGTAATAATGTGTTTATTTATGCATCACTATGCCACTGTTTCTTTTC +TAGAAGTATAATCAAAATAATATTACTTTATAGTCCTGTTTGGATTGATTTTAACCAGTTTGTCGGAGTT +TCTTCTTGTGATTATTGTGTTCTGGTCGTAATCTCCTAAACTGAAAACGAAAAAACCACCTGGGGAGGTG +GTTTGATCGAAGGTTAAGTCAGTTGGGGAACTGCTTAACCAGGTAACTGTCTTGTGCGAGACGAGTCACC +AAAACTGTCCTTTCAGTGTAGCTGTAGTCAGGCCATCACTTCAAGTACTCTCTTGAACCTCTCGCACATC +GACTTTTACCAATGGCTGCTGCCAGTGGCGCTTTGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTT +ACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTCGTGCACACAGCCCAGCTTGGAGCGAACGACC +TACACCGAACCGAGATACCTACAGCGTGAGCTATGAGAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGG +ACAGGTATCCGGTAAGCGGCAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCAGGGGGAAACACCTG +GTATCTTTATAGTCCTGTCGGGTTTCGCCACCTCTGGCTTGAGCGTCGATTTTTGTGATGCTCGTCAGGG +GGGCGGAGCCTATGGAAAAACGCCTGCGGCGCTGGCTTCCTCCGGTGCTTTGCTTTTTGCTCACATGTTC +TTTCCGACTTTATCCCCCGATTCTGTGGATAACCGTATTACCGCTTTTGAGTGAGCTGACACCGCTCGCC +GCAGTCGAACGACCGAGCGTAGCGAGTCAGTGAGCGAGGAAGCGGAAGAGCGCCTTATGTGACATTTTCT +CCTTACGCTCTGTTGTGCCGTTCGGCATCCTGTCCTGAGCGTTATCTCTCTGTGCTATTTTCTACTTCAA +AGCGTGTCTGGATGCTGTTCTGGAGTTCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGCAGCCTGTTC +GTCAGCATCGGGTGCGTCCAGTTTTTCGAGCAGCGTCAGGCTCTGACTTTTTATGAATCCCGCCATGTTG +AGTACGGCTTGCTGCTGCTTGTTCATCTTTTCGTTTTCTCCGTTCTGTCTGTCATCTGCGTTGTGTGATT +ATATCGCGTACCACTTTTCGACTGTTTTGCTGCCGTTATTCTGCCGCTTGGCTTTTTGACGGGCATTTCT +GTCAGACAACACTGTCACTGCCAAAAAACTGCCGTGCCTTTGTCGGTAATTCGAGCTTGCTGACAGGACA +GGATGTGCAATTGTTATACCGCGCATACATGCACGCTATTACAATTGCCCTGGTCAGGGCTTTGCCCCGA +CACCCATGTCAGATACGGAGCCATGTTTTATGACAAAACGAAGTGGAAGTAATACGCGCAGGCGGGCTAT +CAGTCGCCCTGTTCGTCTGACGGCAGAAGAAGACCAGGAAATCAGAAAAAGGGCTGCTGAATGCGGCAAG +ACCGTTTCCGGTTTTTTACGGGCGGCAGCTCTCGGTAAGAAAGTTAACTCACTGACTGATGATCGGGTAC +TGAAAGAAGTTATGAGACTGGGGGCGTTACAGAAAAAACTCTTTATCGACGGCAAGCGTGTCGGGGACAG +GGAGTATGCGGAGGTGCTGATCGCTATTACGGAGTATCACCGTGCCCTGTTATCCAGACTTATGGCAGAT 
+TAGCTTCCCGGAGAGAAAACTGTCGAAAACAGACGGTATGAACACCGTAAGCTCCCAAAGTGATCACCAT +TCGCTTTCATGCATAGCTATGCAGCGAGCTGAAAACGATCCTGACGCATCCTTCCTGTTTTCCCGGGGTA +AAACATCTCTTT \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ref.fasta Wed Oct 12 07:43:59 2022 +0000 @@ -0,0 +1,2 @@ +>NC_045512.2 +attaaaggtttataccttcccaggtaacaaaccaaccaactttcgatctcttgtagatctgttctctaaacgaactttaaaatctgtgtggctGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGAGATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTAT
GCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ref.fasta.fai Wed Oct 12 07:43:59 2022 +0000 @@ -0,0 +1,1 @@ +NC_045512.2 2940 13 2940 2941
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Wed Oct 12 07:43:59 2022 +0000 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of all fasta files under genome directory --> + <table name="all_fasta" comment_char="#" allow_duplicate_entries="False"> + <columns>value, dbkey, name, path</columns> + <file path="all_fasta.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Wed Oct 12 07:43:59 2022 +0000 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of all fasta files under genome directory --> + <table name="all_fasta" comment_char="#" allow_duplicate_entries="False"> + <columns>value, dbkey, name, path</columns> + <file path="${__HERE__}/test-data/all_fasta.loc"/> + </table> +</tables> \ No newline at end of file