view eventaligncollapse.xml @ 2:74d7b06c2b92 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/nanopolishcomp commit fc685e81241eb196e57669961a42d03c60e7c9b6
author iuc
date Wed, 13 Mar 2024 11:51:58 +0000
parents ad011fc670d6
children
line wrap: on
line source

<?xml version="1.0"?>
<tool id="nanopolishcomp_eventaligncollapse" name="NanopolishComp: EventalignCollapse" version="@TOOL_VERSION@+@WRAPPER_VERSION@">
    <!-- Short tagline for this tool -->
    <description>by kmers rather than by event</description>
    <!-- Shared definitions are pulled in from macros.xml. The macros expanded in
         this file (xrefs, requirements, version_command, citations) and the
         @...@ tokens used here are presumably defined there; macros.xml is not
         visible from this file, so confirm against it. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="xrefs"/>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command detect_errors="exit_code"><![CDATA[
        ## initialize
        ## requires a minimum of 3 threads: use max(3, GALAXY_SLOTS),
        ## where GALAXY_SLOTS falls back to 3 when it is unset
        threads=\$((3 > \${GALAXY_SLOTS:-3} ? 3 : \${GALAXY_SLOTS:-3})) &&

        ## run
        NanopolishComp
            Eventalign_collapse
            -i '$i'
            ## boolean param: expands to "-s" when checked and to nothing otherwise
            $s
            -r $r
            ## only emit -f when at least one statistical field is selected,
            ## so the flag is never passed without values
            #if $f
                -f
                #for $current in $f
                    $current
                #end for
            #end if
            -o 'results'
            -p 'out'
            -t \$threads
            -v
        ]]></command>
    <inputs>
        <!-- Input table, passed to Eventalign_collapse as -i -->
        <param argument="-i" type="data" format="tabular" label="Select nanopolish eventalign file"/>
        <!-- Flag parameter: its value is the literal "-s" when checked, empty otherwise -->
        <param argument="-s" type="boolean" truevalue="-s" falsevalue="" label="Should raw samples be written?" help="You need to run nanopolish eventalign with the --samples option to make use of this feature."/>
        <!-- Read cap; 0 is the value documented as "no limit" -->
        <param argument="-r" type="integer" value="0" min="0" label="Set maximum number of reads to parse" help="Use 0 to deactivate this option."/>
        <!-- Statistics to compute; the selected values are passed after -f -->
        <param argument="-f" type="select" multiple="true" label="Select statistical fields to compute" help="You need to run nanopolish eventalign with the --samples option to make use of this feature.">
            <option value="mean" selected="true">Mean</option>
            <option value="std">Std</option>
            <option value="median" selected="true">Median</option>
            <option value="mad">Mad</option>
            <option value="num_signals" selected="true">Number of signals</option>
        </param>
        <!-- Not a command-line argument: selects which result files are kept as
             Galaxy datasets (evaluated by the filters in the outputs section) -->
        <param name="out" type="select" multiple="true" label="Select output file(s)">
            <option value="eventalign_collapse" selected="true">Eventalign Collapse</option>
            <option value="index" selected="true">Index</option>
            <option value="log">Log</option>
        </param>
    </inputs>
    <outputs>
        <!-- Each dataset is collected from the "results" working directory and is
             only created when its key is selected in the "out" parameter. -->

        <!-- Main collapsed table -->
        <data name="out_eventalign_collapse"
              format="tabular"
              label="${tool.name} on ${on_string}: Eventalign Collapse"
              from_work_dir="results/out_eventalign_collapse.tsv">
            <filter>'eventalign_collapse' in out</filter>
        </data>

        <!-- Index file written next to the main table -->
        <data name="out_index"
              format="tabular"
              label="${tool.name} on ${on_string}: Index"
              from_work_dir="results/out_eventalign_collapse.tsv.idx">
            <filter>'index' in out</filter>
        </data>

        <!-- Log file -->
        <data name="out_log"
              format="txt"
              label="${tool.name} on ${on_string}: Log"
              from_work_dir="results/out_eventalign_collapse.log">
            <filter>'log' in out</filter>
        </data>
    </outputs>
    <tests>
        <!-- #1 default parameters, with all three output files requested -->
        <test expect_num_outputs="3">
            <param name="out" value="eventalign_collapse,index,log"/>
            <param name="i" value="sample_event.tsv"/>
            <output name="out_eventalign_collapse">
                <assert_contents>
                    <!-- header row, one data row, then the total size -->
                    <has_text_matching expression="ref_pos&#009;ref_kmer"/>
                    <has_text_matching expression="22102&#009;GGAAA"/>
                    <has_n_lines n="236"/>
                </assert_contents>
            </output>
            <output name="out_index">
                <assert_contents>
                    <has_text_matching expression="ref_id&#009;ref_start"/>
                    <has_text_matching expression="chr&#009;22102"/>
                    <has_n_lines n="60"/>
                </assert_contents>
            </output>
            <output name="out_log">
                <assert_contents>
                    <has_line line="General options:"/>
                    <has_n_lines n="13"/>
                </assert_contents>
            </output>
        </test>
        <!-- #2 raw samples on, read cap of 10, every statistic selected, no log output -->
        <test expect_num_outputs="2">
            <param name="out" value="eventalign_collapse,index"/>
            <param name="i" value="sample_event.tsv"/>
            <param name="f" value="mean,std,median,mad,num_signals"/>
            <param name="r" value="10"/>
            <param name="s" value="true"/>
            <output name="out_eventalign_collapse">
                <assert_contents>
                    <has_text_matching expression="ref_pos&#009;ref_kmer"/>
                    <has_text_matching expression="22102&#009;GGAAA"/>
                    <has_n_lines n="236"/>
                </assert_contents>
            </output>
            <output name="out_index">
                <assert_contents>
                    <has_text_matching expression="ref_id&#009;ref_start"/>
                    <has_text_matching expression="chr&#009;22102"/>
                    <has_n_lines n="60"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**What it does**

@WID@

Eventalign_collapse collapses the raw file generated by nanopolish eventalign by kmers rather than by event.

**Input**

A nanopolish eventalign tabular output file.

**Output**

Contrary to the nanopolish eventalign output text file, in Eventalign_collapse the reads are separated by a hashtag header containing the read_id and ref_id. This reduces the redundancy and makes it easier to find the start and end of a read.

::

    Example : #7ef1d7b9-5824-4382-b23b-78d82c07ebbd YHR055C.

The main data file contains the following fields:

- ref_pos: Position of the kmer on the reference sequence.
- ref_kmer: Sequence of the reference kmers.
- num_events: Number of events for this kmer before collapsing.
- dwell_time: dwell time for this kmer in seconds
- NNNNN_dwell_time: dwell time of events for this kmer with a model sequence "NNNNN" (events ignored by nanopolish HMM).
- mismatch_dwell_time: dwell time of events for this kmer with a model sequence different from the reference kmer
- start_idx: Only if nanopolish eventalign called with --signal_idx. Start coordinate on original raw signal in fast5 file
- end_idx: Only if nanopolish eventalign called with --signal_idx. End coordinate on original raw signal in fast5 file
- mean: Only if nanopolish eventalign called with --samples. Mean of the normalised signal values provided by Nanopolish eventalign
- median: Only if nanopolish eventalign called with --samples. Median of the normalised signal values provided by Nanopolish eventalign
- std: Only if nanopolish eventalign called with --samples. Standard deviation of the normalised signal values provided by Nanopolish eventalign
- mad: Only if nanopolish eventalign called with --samples. Median absolute deviation of the normalised signal values provided by Nanopolish eventalign
- num_signals: Only if nanopolish eventalign called with --samples. Number of raw signal points.
- samples: Only if nanopolish eventalign called with --samples and Eventalign_collapse called with --write_samples. List of normalised signal intensity values for this kmer

In addition, Eventalign_collapse also generates a useful index file containing read-level information. It contains the following fields:

- read_id: Name or index of the read
- ref_id: Name of the reference sequence the read was aligned on (contig)
- ref_start: Start coordinate of the alignment on the reference sequence
- ref_end: End coordinate of the alignment on the reference sequence
- dwell_time: Cumulative dwell time in seconds for the entire resquiggled sequence
- kmers: Overall number of resquiggled kmers
- NNNNN_kmers: Number of resquiggled kmers containing at least 1 event for which the model sequence was "NNNNN"
- mismatching_kmers: Number of resquiggled kmers containing at least 1 event for which the model sequence diverged from the reference sequence
- missing_kmers: Number of skipped/missing reference positions in nanopolish output
- byte_offset: Number of characters before the start of the sequence in the main output file. This can be used in conjunction with file.seek() to directly access the start of a read. An example is provided in the Usage notebook.
- byte_len: Length of characters after byte_offset to the end of the read, excluding the last newline. This can be used in conjunction with read() to read all the text chunk corresponding to the read.

.. class:: infomark

**References**

@REFERENCES@
    ]]></help>
    <expand macro="citations"/>
</tool>