Mercurial > repos > iuc > data_manager_hisat2_index_builder

<tool id="hisat2_index_builder_data_manager" name="HISAT2 index" tool_type="manage_data" version="1.0.0">
    <!-- Short description; read together with the name attribute of the tool element it gives "HISAT2 index builder". -->
    <description>builder</description>
    <!--
        Tool dependencies. The command template calls helper scripts located
        under $HISAT2_ROOT_DIR/bin; NOTE(review): that variable is presumably
        exported by this package, confirm against the dependency recipe.
    -->
    <requirements>
        <requirement version="2.0" type="package">hisat</requirement>
    </requirements>
    <!--
        Job failure detection: every non-zero exit status (negative or
        positive) marks the job as failed. The level is spelled out here;
        "fatal" is also what Galaxy applies when the attribute is omitted.
    -->
    <stdio>
        <!-- any negative exit code -->
        <exit_code range=":-1" level="fatal" />
        <!-- any positive exit code -->
        <exit_code range="1:" level="fatal" />
    </stdio>
    <!--
        Cheetah command template. Optional pre-processing steps derive the
        splice site, exon and SNP files in the job working directory, then
        hisat2_index_builder.py is run with the selected FASTA entry and,
        when advanced mode is on, a quoted string of extra indexer options.
        All parameters that live inside the "advanced" conditional are
        referenced through their fully qualified name.
    -->
    <command><![CDATA[
        ## Step 1 (optional): a GTF was supplied, so derive the splice site and
        ## exon tables that are later handed to the indexer via --ss / --exon.
        #if $advanced.adv_param_select == 'yes' and $advanced.gtf_input:
            ln -s "${advanced.gtf_input}" gtf_file.gtf &&
            python \$HISAT2_ROOT_DIR/bin/extract_splice_sites.py gtf_file.gtf > splice_sites.txt &&
            python \$HISAT2_ROOT_DIR/bin/extract_exons.py gtf_file.gtf > exon.txt &&
        #end if
        ## Step 2 (optional): a SNP table was supplied, so convert it against the
        ## selected genome into snps.txt, later handed to the indexer via --snps.
        #if $advanced.adv_param_select == 'yes' and $advanced.snps:
            ln -s "${all_fasta_source.fields.path}" genome.fa &&
            ln -s "${advanced.snps}" snps.tabular &&
            python \$HISAT2_ROOT_DIR/bin/extract_snps.py --genome_file genome.fa --snp_file snps.tabular > snps.txt &&
        #end if
        ## Step 3: run the data manager script. The script path is quoted so that
        ## an installation directory containing spaces does not split the argument.
        python "$__tool_directory__/hisat2_index_builder.py" --output "${out_file}"
            --fasta_filename "${all_fasta_source.fields.path}"
            --fasta_dbkey "${all_fasta_source.fields.dbkey}"
            --fasta_description "${all_fasta_source.fields.name}"
            --data_table_name "hisat2_indexes"
            ## Everything between the double quotes below is passed as ONE argument;
            ## the shell still expands GALAXY_SLOTS and the pwd backticks inside it.
            #if $advanced.adv_param_select == 'yes':
                --indexer_options "
                --noauto
                -p \${GALAXY_SLOTS:-1}
                #if $advanced.snps:
                    --snps `pwd`/snps.txt
                #end if
                #if $advanced.gtf_input:
                    --ss `pwd`/splice_sites.txt
                    --exon `pwd`/exon.txt
                #end if
                --bmax $advanced.bmax
                --bmaxdivn $advanced.bmaxdivn
                --dcv $advanced.dcv
                --offrate $advanced.offrate
                "
            #end if
        ]]>
    </command>
    <!--
        User-facing form. "all_fasta_source" picks an entry of the all_fasta
        data table; the "advanced" conditional exposes the indexer tuning
        values and the optional SNP / GTF datasets used by the command
        template. "sequence_name" and "sequence_id" are not referenced in the
        command template; NOTE(review): they are presumably read by
        hisat2_index_builder.py from the job parameters, verify there.
    -->
    <inputs>
        <param name="all_fasta_source"
               type="select"
               label="Source FASTA Sequence">
            <options from_data_table="all_fasta" />
        </param>
        <conditional name="advanced" label="Advanced parameters">
            <param name="adv_param_select"
                   type="select"
                   label="Advanced parameters">
                <option value="no">Use defaults</option>
                <option value="yes">Fine-tune indexing parameters</option>
            </param>
            <when value="yes">
                <!--
                    NOTE(review): the command template passes the bmax option
                    first and the bmaxdivn option after it; check the indexer
                    manual for how the two interact when both are given.
                -->
                <param name="bmax"
                       type="integer"
                       value="4"
                       label="Maximum number of suffixes allowed in a block."
                       help="--bmax" />
                <param name="bmaxdivn"
                       type="integer"
                       value="4"
                       label="Maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference."
                       help="--bmaxdivn" />
                <param name="dcv"
                       type="integer"
                       value="1024"
                       min="2"
                       max="4096"
                       label="Period for the difference-cover sample."
                       help="--dcv: A larger period yields less memory overhead, but may make suffix sorting slower, especially if repeats are present. Must be a power of 2 no greater than 4096. " />
                <param name="offrate"
                       type="integer"
                       value="4"
                       label="Mark rows in the Burrows-Wheeler transform"
                       help="--offrate: To map alignments back to positions on the reference sequences, it's necessary to annotate (&quot;mark&quot;) some or all of the Burrows-Wheeler rows with their corresponding location on the genome. This parameter governs how many rows get marked: the indexer will mark every 2^&lt;int&gt; rows. Marking more rows makes reference-position lookups faster, but requires more memory to hold the annotations at runtime. The default is 4 (every 16th row is marked; for human genome, annotations occupy about 680 megabytes)." />
                <!-- Optional datasets; each one switches on the matching pre-processing step of the command template. -->
                <param name="snps"
                       type="data"
                       format="tabular"
                       optional="True"
                       label="Provide a list of SNPs in the UCSC dbSNP format"
                       help="This should be a dataset in the Data Manager History (automatically created). If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction." />
                <param name="gtf_input"
                       type="data"
                       format="gtf"
                       optional="True"
                       label="Provide a GTF file for HISAT2 to extract splice sites from"
                       help="This should be a dataset in the Data Manager History (automatically created). If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction." />
            </when>
            <when value="no" />
        </conditional>
        <param name="sequence_name"
               type="text"
               value=""
               label="Name of sequence" />
        <param name="sequence_id"
               type="text"
               value=""
               label="ID for sequence" />
    </inputs>
    <!-- Single output: the JSON file whose path the command template passes to the script via its output option. -->
    <outputs>
        <data name="out_file" format="data_manager_json" />
    </outputs>
    <help>
<![CDATA[
.. class:: infomark

**Notice:** If you leave the name or ID of the sequence blank, it will be generated automatically.

What is HISAT2?
---------------

`HISAT2 <http://ccb.jhu.edu/software/hisat2>`__ is a fast and sensitive alignment
program for mapping next-generation sequencing reads (both DNA and RNA) against
the general human population (as well as against a single reference genome).
Based on an extension of BWT for graphs (`BWT <http://dl.acm.org/citation.cfm?id=2674828>`__)
we designed and implemented a graph FM index (GFM), an original approach and
its first implementation to the best of our knowledge. In addition to using one
global GFM index that represents the general population, HISAT2 uses a large set
of small GFM indexes that collectively cover the whole genome (each index
representing a genomic region of 56 Kbp, with 55,000 indexes needed to cover
the human population). These small indexes (called local indexes), combined
with several alignment strategies, enable rapid and accurate alignment of
sequencing reads. This new indexing scheme is called a Hierarchical Graph
FM index (HGFM). In addition to spliced alignment, HISAT2 handles reads
involving indels and supports a paired-end alignment mode. Multiple processors
can be used simultaneously to achieve greater alignment speed. HISAT2 outputs
alignments in `SAM <http://samtools.sourceforge.net/SAM1.pdf>`__ format, enabling
interoperation with a large number of other tools (e.g. `SAMtools <http://samtools.sourceforge.net>`__,
`GATK <http://www.broadinstitute.org/gsa/wiki/index.php/The_Genome_Analysis_Toolkit>`__)
that use SAM. HISAT2 is distributed under the `GPLv3 license <http://www.gnu.org/licenses/gpl-3.0.html>`__,
and it runs on the command line under Linux, Mac OS X and Windows.
]]>
    </help>
    <citations>
        <!-- NOTE(review): this DOI presumably refers to the original HISAT publication; confirm it is the preferred citation for HISAT2. -->
        <citation type="doi">10.1038/nmeth.3317</citation>
    </citations>
</tool>
author	iuc
date	Sat, 10 Oct 2015 16:23:50 -0400
parents	b13b84cb11b5
children	d210e1f185bd