<!-- Mercurial > repos > iuc > extract_genomic_dna -->

<tool id="Extract genomic DNA 1" name="Extract Genomic DNA" version="3.0.1">
    <description>using coordinates from assembled/unassembled genomes</description>
    <!--
        Packages that must be available when the tool runs:
        bx-python 0.7.1 and faToTwoBit 35x1.
        NOTE(review): presumably bx-python is used by extract_genomic_dna.py to read
        the reference sequence and faToTwoBit to convert a history fasta to 2bit;
        confirm against the script.
    -->
    <requirements>
        <requirement type="package" version="0.7.1">bx-python</requirement>
        <requirement type="package" version="35x1">faToTwoBit</requirement>
    </requirements>
    <!--
        Cheetah template for the command line that runs extract_genomic_dna.py.

        - The genome build is taken from the dbkey metadata of the input dataset.
        - For gff input the column list is fixed (1,4,5,7) and the
          interpret_features choice is forwarded; for any other input the column
          list comes from the dataset's chrom/start/end/strand/name metadata.
        - The reference genome is either the path of the selected cached entry or
          the dataset chosen from the history.

        Every substituted value is single-quoted so that paths or values containing
        spaces or shell metacharacters reach the script as a single argument.
    -->
    <command>
        <![CDATA[
            ## Genome build (dbkey) recorded on the selected input dataset.
            #set genome = $input.metadata.dbkey
            mkdir -p output_dir &&
            python '$__tool_directory__/extract_genomic_dna.py'
            --input '$input'
            --genome '$genome'
            #if $input.is_of_type("gff"):
                --input_format 'gff'
                --columns '1,4,5,7'
                --interpret_features '$interpret_features'
            #else:
                --input_format 'interval'
                --columns '${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol},${input.metadata.nameCol}'
            #end if
            --reference_genome_source '$reference_genome_cond.reference_genome_source'
            #if str($reference_genome_cond.reference_genome_source) == "cached":
                ## Cached genomes are data table entries; pass the file path of the selected entry.
                --reference_genome '$reference_genome_cond.reference_genome.fields.path'
            #else:
                ## History genomes are datasets; the dataset substitutes as its file path.
                --reference_genome '$reference_genome_cond.reference_genome'
            #end if
            --output_format '$output_format'
            --output '$output'
        ]]>
    </command>
    <!--
        Tool form parameters. The names declared here are the ones referenced by
        the command template (input, interpret_features, reference_genome_cond,
        output_format).
    -->
    <inputs>
        <!-- Dataset of regions to extract; the unspecified_build validator rejects datasets without a genome build. -->
        <param name="input" type="data" format="gff,interval" label="Fetch sequences for intervals in">
            <validator type="unspecified_build" />
        </param>
        <!-- Forwarded to the script only when the input is of type gff (see the command template). -->
        <param name="interpret_features" type="select" label="Interpret features when possible" help="Applicable only when input dataset format is in the gff family">
            <option value="yes">Yes</option>
            <option value="no">No</option>
        </param>
        <!-- Chooses where the reference genome comes from; each branch defines its own reference_genome parameter. -->
        <conditional name="reference_genome_cond">
            <param name="reference_genome_source" type="select" label="Choose the source for the reference genome">
                <option value="cached">locally cached</option>
                <option value="history">from history</option>
            </param>
            <!-- Cached: entries of the "twobit" data table, filtered on column 0 by the dbkey of the input dataset. -->
            <when value="cached">
                <param name="reference_genome" type="select" label="Using reference genome">
                    <options from_data_table="twobit">
                        <filter type="data_meta" key="dbkey" ref="input" column="0"/>
                    </options>
                    <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
                </param>
            </when>
            <!-- History: fasta datasets filtered by the dbkey of the input dataset. -->
            <when value="history">
                <param name="reference_genome" type="data" format="fasta" label="Using reference genome">
                    <options>
                        <filter type="data_meta" key="dbkey" ref="input"/>
                    </options>
                    <validator type="no_options" message="The current history does not include a fasta dataset with the build associated with the selected input file"/>
                </param>
            </when>
        </conditional>
        <!-- Output flavour; fasta is preselected. The same value drives change_format on the output dataset. -->
        <param name="output_format" type="select" label="Select output format">
            <option value="fasta" selected="True">fasta</option>
            <option value="interval">interval</option>
        </param>
    </inputs>
    <!--
        Single output dataset named "output". Its format and metadata are taken
        from the input dataset, except that the format is switched to fasta when
        the output_format parameter is set to "fasta".
    -->
    <outputs>
        <data format_source="input" name="output" metadata_source="input">
            <change_format>
                <when input="output_format" value="fasta" format="fasta" />
            </change_format>
        </data>
    </outputs>
    <!--
        Functional tests. Parameter and output names must match the names declared
        in <inputs> and <outputs>: reference_genome_source, reference_genome,
        output_format and output.
    -->
    <tests>
        <!-- Interval (bed) input against cached genomes. -->
        <test>
            <param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
            <param name="interpret_features" value="yes"/>
            <param name="reference_genome_source" value="cached"/>
            <param name="output_format" value="fasta"/>
            <output name="output" file="extract_genomic_dna_out1.fasta" compare="contains" />
        </test>
        <test>
            <param name="input" value="droPer1.bed" dbkey="droPer1" ftype="bed" />
            <param name="interpret_features" value="yes"/>
            <param name="reference_genome_source" value="cached"/>
            <param name="output_format" value="fasta"/>
            <output name="output" file="extract_genomic_dna_out2.fasta" compare="contains" />
        </test>
        <test>
            <param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
            <param name="interpret_features" value="yes"/>
            <param name="reference_genome_source" value="cached"/>
            <param name="output_format" value="interval"/>
            <output name="output" file="extract_genomic_dna_out3.interval" compare="contains" />
        </test>
        <!-- Test GFF file support. -->
        <test>
            <param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
            <param name="interpret_features" value="no"/>
            <param name="reference_genome_source" value="cached"/>
            <param name="output_format" value="interval"/>
            <output name="output" file="extract_genomic_dna_out4.gff" compare="contains" />
        </test>
        <test>
            <param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
            <param name="interpret_features" value="no"/>
            <param name="reference_genome_source" value="cached"/>
            <param name="output_format" value="fasta"/>
            <output name="output" file="extract_genomic_dna_out5.fasta" compare="contains" />
        </test>
        <!-- Test custom sequences support and GFF feature interpretation. -->
        <test>
            <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
            <param name="interpret_features" value="no"/>
            <param name="reference_genome_source" value="history"/>
            <param name="reference_genome" value="tophat_in1.fasta"/>
            <param name="output_format" value="fasta"/>
            <output name="output" file="extract_genomic_dna_out6.fasta" compare="contains" />
        </test>
        <test>
            <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
            <param name="interpret_features" value="yes"/>
            <param name="reference_genome_source" value="history"/>
            <param name="reference_genome" value="tophat_in1.fasta"/>
            <param name="output_format" value="fasta"/>
            <output name="output" file="extract_genomic_dna_out7.fasta" compare="contains" />
        </test>
    </tests>
    <help>

.. class:: warningmark

This tool requires input in interval or gff format (both are special tab-delimited tabular formats).  If your data is not TAB delimited, first use *Text Manipulation-&gt;Convert*.

.. class:: warningmark

Make sure that the genome build is specified for the dataset from which you are extracting sequences (click the pencil icon in the history item if it is not specified).

.. class:: warningmark

All of the following will cause a line from the input dataset to be skipped and a warning generated.  The number of warnings and skipped lines is documented in the resulting history item.

- Any lines that do not contain at least 3 columns: a chromosome, and numerical start and end coordinates.
- Sequences that fall outside of the range of a line's start and end coordinates.
- Chromosome, start or end coordinates that are invalid for the specified build.
- Any lines whose data columns are not separated by a **TAB** character (other white-space characters are invalid).

.. class:: infomark

**Extract genomic DNA using coordinates from ASSEMBLED genomes and UNassembled genomes** was previously performed by two separate tools.

-----

**What it does**

This tool uses coordinate, strand, and build information to fetch genomic DNA sequences in FASTA or interval format.

If strand is not defined, the default value is "+".

-----

**Example**

If the input dataset is::

    chr7  127475281  127475310  NM_000230  0  +
    chr7  127485994  127486166  NM_000230  0  +
    chr7  127486011  127486166  D49487     0  +

Extracting sequences with **FASTA** output data type returns::

    &gt;hg17_chr7_127475281_127475310_+ NM_000230
    GTAGGAATCGCAGCGCCAGCGGTTGCAAG
    &gt;hg17_chr7_127485994_127486166_+ NM_000230
    GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCG
    GATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATC
    CAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAG
    GATCAATGACATTTCACACACG
    &gt;hg17_chr7_127486011_127486166_+ D49487
    TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGG
    CCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGA
    CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC
    ACACG

Extracting sequences with **Interval** output data type returns::

    chr7    127475281       127475310       NM_000230       0       +       GTAGGAATCGCAGCGCCAGCGGTTGCAAG
    chr7    127485994       127486166       NM_000230       0       +       GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
    chr7    127486011       127486166       D49487  0       +       TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG

    </help>
    <!--
        Citation for the tool authors. BibTeX separates multiple authors with
        " and "; a comma inside the author field is read as "Last, First" of a
        single person.
        NOTE(review): the entry key, title, year and eprint are still the
        placeholder "None"; replace them once the real values are known.
    -->
    <citations>
        <citation type="bibtex">
            @unpublished{None,
            author = {Guru Ananda and Greg Von Kuster},
            title = {None},
            year = {None},
            eprint = {None},
            url = {http://www.bx.psu.edu/~anton/labSite/}
        }</citation>
    </citations>
</tool>
<!--
author	iuc
date	Wed, 20 Jan 2016 09:49:37 -0500
parents	8dd8e89c0603
children	702970e4a134
-->