Mercurial > repos > iuc > getorganelle

<tool id="get_annotated_regions_from_gb" name="Get annotated regions from genbank files (getorganelle)" version="0.1.0" python_template_version="3.5">
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Packages the tool needs at runtime. The @TOOL_VERSION@ and @BIOPYTHON_VERSION@
         tokens are not defined in this file; presumably they come from the imported
         macros.xml (confirm there before bumping versions). -->
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">getorganelle</requirement>
        <requirement type="package" version="@BIOPYTHON_VERSION@">biopython</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        #import re

        ## Step 1: link every input dataset into the working directory under a
        ## shell-safe version of its history name, and remember the link names.
        ## NOTE: two datasets that sanitise to the same name will make the second
        ## ln call fail, because the link already exists.

        #set $link_names = []
        #for $input in $inputs
            ## Every character that is not a word character, a hyphen or whitespace
            ## becomes an underscore, so the name cannot break out of the single quotes.
            #set $link_name = re.sub('[^\w\-\s]', '_', str($input.name))
            ## Sanitising turns a trailing .gb extension into _gb. Restore it at the
            ## end of the name only, so an _gb elsewhere in the name is left alone.
            #if $link_name.endswith('_gb')
                #set $link_name = $link_name[:-3] + '.gb'
            #end if
            ln -s '$input' '$link_name' &&
            ## silent: append returns None and must not be written into the command.
            #silent $link_names.append($link_name)
        #end for

        ## Step 2: run the extraction script on all linked files.
        ## The select parameter only offers CDS, tRNA and rRNA, so its value can be
        ## passed to -t directly.

        get_annotated_regions_from_gb.py
        #for $link_name in $link_names
            '$link_name'
        #end for
        -o results_directory
        -t '$gene_type_selector'
        --mix

    ]]></command>
    <inputs>
        <!-- One or more GenBank datasets; each is linked into the working
             directory and passed to the script as a positional argument. -->
        <param name="inputs"
               type="data"
               format="genbank"
               multiple="true"
               label="Annotated genbank file(s)"
               help="Genbank files with annotated regions to extract. Multiple files can be selected."/>
        <!-- Annotation type passed to the script with -t; CDS is preselected. -->
        <param name="gene_type_selector"
               type="select"
               label="Gene type">
            <option value="CDS" selected="true">CDS</option>
            <option value="tRNA">tRNA</option>
            <option value="rRNA">rRNA</option>
        </param>
    </inputs>
    <outputs>
        <!-- Single FASTA output, collected from the gene/gene.fasta file inside the
             directory given to the script with -o. -->
        <data name="output_fasta"
              format="fasta"
              from_work_dir="results_directory/gene/gene.fasta"
              label="${tool.name} on ${on_string}: Annotated genes"/>
    </outputs>
    <tests>
        <!-- All three tests use the same three GenBank files and differ only in the
             selected gene type. Each checks that stdout contains "Time cost" and
             that one expected header line per input file is present in the FASTA.
             The leading ">" of each FASTA header is written as &gt; in the
             attribute values; it parses to the same character. -->

        <!-- Gene type: CDS -->
        <test>
            <param name="inputs" value="NC_047059.gb,NC_047060.gb,NC_047400.gb"/>
            <param name="gene_type_selector" value="CDS"/>
            <assert_stdout>
                <has_text text="Time cost"/>
            </assert_stdout>
            <output name="output_fasta">
                <assert_contents>
                    <has_line line="&gt;matK CDS - NC_047059--Styphnolobium_japonicum_voucher_Yi15212-KUN_plastid__complete_genome"/>
                    <has_line line="&gt;matK CDS - NC_047060--Haematoxylum_brasiletto_voucher_N._Zamora6857-Costa_Rica_plastid__complete_genome"/>
                    <has_line line="&gt;matK CDS - NC_047400--Chamaecrista_mimosoides_voucher_Yi15441-KUN_plastid__complete_genome"/>
                </assert_contents>
            </output>
        </test>

        <!-- Gene type: tRNA -->
        <test>
            <param name="inputs" value="NC_047059.gb,NC_047060.gb,NC_047400.gb"/>
            <param name="gene_type_selector" value="tRNA"/>
            <assert_stdout>
                <has_text text="Time cost"/>
            </assert_stdout>
            <output name="output_fasta">
                <assert_contents>
                    <has_line line="&gt;trnA-UGC tRNA - NC_047059--Styphnolobium_japonicum_voucher_Yi15212-KUN_plastid__complete_genome"/>
                    <has_line line="&gt;trnA-UGC tRNA - NC_047060--Haematoxylum_brasiletto_voucher_N._Zamora6857-Costa_Rica_plastid__complete_genome"/>
                    <has_line line="&gt;trnA-UGC tRNA - NC_047400--Chamaecrista_mimosoides_voucher_Yi15441-KUN_plastid__complete_genome"/>
                </assert_contents>
            </output>
        </test>

        <!-- Gene type: rRNA -->
        <test>
            <param name="inputs" value="NC_047059.gb,NC_047060.gb,NC_047400.gb"/>
            <param name="gene_type_selector" value="rRNA"/>
            <assert_stdout>
                <has_text text="Time cost"/>
            </assert_stdout>
            <output name="output_fasta">
                <assert_contents>
                    <has_line line="&gt;rrn16 rRNA - NC_047059--Styphnolobium_japonicum_voucher_Yi15212-KUN_plastid__complete_genome"/>
                    <has_line line="&gt;rrn16 rRNA - NC_047060--Haematoxylum_brasiletto_voucher_N._Zamora6857-Costa_Rica_plastid__complete_genome"/>
                    <has_line line="&gt;rrn16 rRNA - NC_047400--Chamaecrista_mimosoides_voucher_Yi15441-KUN_plastid__complete_genome"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[

            Python script to extract annotated genes from genbank files. The annotated genes are in the correct format to be used as seed sequences in GetOrganelle.
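            The help text of the Python script is reproduced below. Only the options for the input genbank files and the gene type can be set in this tool; the script is always run with the mix option, so all genes are written to a single fasta file.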

            By jinjianjun@mail.kib.ac.cn 2017
            Usage: get_annotated_regions_from_gb.py gb_files -o out_dir

            Options:
              -h, --help            show this help message and exit
              -o OUT_PUT            Output.
              -t GENE_TYPES         Annotation type taken as gene. Default: CDS,tRNA,rRNA
              --separate-copy       By default, only keep one copy (see '--copy-mode' for
                                    more) if there are several regions with the same name.
                                    Exception: if there are one copy with intron(s) and
                                    another copy without intron, they would be both kept.
                                    This exception was specially made for the convenience
                                    of commonly-incorrectly-annotated rps12 gene of
                                    plastome.
              --copy-mode=COPY_MODE
                                    first|longest|leastN|leastN_longest (default).
              --separate-exon       By default, combining exons.
              --keys=GENE_KEYS      The key to the gene name: gene, label, product or
                                    other keys in the qualifiers region.Default:
                                    gene,label,product,note.
              --mix                 Mix different genes into a single fasta file. In this
                                    mode, the sequence header will be gene_name - gb_info
              --case-mode=CASE_TREATMENT
                                    first: Gene name case-non-sensitive. Consistent to the
                                    first appearance.  lower: Gene name case-non-
                                    sensitive. All gene name set to lower case.  upper:
                                    Gene name case-non-sensitive. All gene name set to
                                    Upper case.  raw: Gene name case-sensitive.
              --ignore-format-error
                                    Skip the Error: key "*" not found in annotation. Not
                                    suggested.
              --translate-to-product
                                    Translate the tRNA gene name to the form of their
                                    product. Default: False
              --overwrite           Choose to overwrite previous result.

    ]]></help>
    <!-- Publication to cite when using this tool, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1093/sysbio/syaa047</citation>
    </citations>
</tool>
author	iuc
date	Mon, 15 Apr 2024 06:27:37 +0000
parents	7348b69e5109
children