view retrieve_ensembl_bed.xml @ 0:da1b538b87e5 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 88cf1e923a8c9e5bc6953ad412d15a7c70f054d1
author galaxyp
date Mon, 22 Jan 2018 13:13:47 -0500
parents
children 9c4a48f5d4e7
line wrap: on
line source

<tool id="retrieve_ensembl_bed" name="Retrieve Ensembl features in BED format" version="0.1.0">
    <!-- Tool description text. -->
    <description>using Ensembl REST API</description>
    <!-- Imports macros.xml; the expand elements in this file refer to macros by name
         (presumably defined in that file; it is not visible here). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Dependencies come from two macros, ensembl_requirements and bedutil_requirements;
         their contents are not visible in this file. -->
    <requirements>
        <expand macro="ensembl_requirements" />
        <expand macro="bedutil_requirements" />
    </requirements>
    <!--
        Cheetah command template. Runs retrieve_ensembl_bed.py from the tool directory with:
          * the species option, always;
          * the extended_bed flag when $extended_bed evaluates true;
          * $ucsc_chrom_names, which renders as that boolean parameter's truevalue flag
            or as its empty falsevalue;
          * the biotypes and regions options only when those text parameters are non-empty;
          * the transcript_bed output path as the final positional argument.
    -->
    <command detect_errors="exit_code"><![CDATA[
        python '$__tool_directory__/retrieve_ensembl_bed.py'  
            --species '$species'
            #if $extended_bed:
                --extended_bed
            #end if
            $ucsc_chrom_names
            #if $biotypes:
                --biotypes '$biotypes'
            #end if
            #if $regions:
                --regions '$regions'
            #end if
            '$transcript_bed'
    ]]></command>
    <inputs>
        <!-- Species handed to the script's species option; the validator requires the value
             to start with a word character, so an empty value is rejected. -->
        <param name="species" type="text" value="" label="Ensembl species">
            <help>Ensembl species name or alias, e.g. human</help>
            <expand macro="species_options" />
            <validator type="regex" message="Enter an Ensembl organism">^\w+.*$</validator>
        </param>
        <!-- The truevalue doubles as the comma-prefixed list of extra column names that the
             output's column_names metadata appends via ${extended_bed}; the command template
             only tests this parameter for truth. -->
        <param name="extended_bed" type="boolean" truevalue=",second_name,cds_start_status,cds_end_status,exon_frames,type,gene_name,second_gene_name,gene_type" falsevalue="" checked="true"
               label="Keep extra columns from ensembl BED"/>
        <!-- Rendered verbatim into the command line: the flag when checked, nothing otherwise. -->
        <param name="ucsc_chrom_names" type="boolean" truevalue="--ucsc_chrom_names" falsevalue="" checked="false"
               label="Use the UCSC names for Chromosomes"/>
        <!-- Optional filter; omitted from the command line when left empty. -->
        <param name="biotypes" type="text" value="" optional="true" label="Restrict Feature retrieval to these biotypes">
            <expand macro="biotypes_help" />
        </param>
        <!-- Optional filter; the validator accepts an empty value or a comma-separated list in
             which each item is chr, chr:pos or chr:from-to. -->
        <param name="regions" type="text" value="" optional="true" label="Restrict Feature retrieval to comma-separated list of regions">
            <help>Each region is specified as: chr or chr:pos or chr:from-to</help>
            <validator type="regex" message="Enter a comma-separated list of regions, each given as chr, chr:pos or chr:from-to">^(\w+(:\d+(-\d+)?)?(,\w+(:\d+(-\d+)?)?)*)?$</validator>
        </param>
    </inputs>
    <outputs>
        <!-- BED dataset whose path is passed to the script as its positional output argument.
             The column_names metadata default is the twelve standard BED column names followed
             by ${extended_bed}: that parameter's truevalue is a comma-prefixed list of the extra
             column names and its falsevalue is empty. -->
        <data name="transcript_bed"
              format="bed"
              label="Ensembl ${species} transcripts.bed">
            <actions>
                <action name="column_names"
                        type="metadata"
                        default="chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts${extended_bed}"/>
            </actions>
        </data>
    </outputs>
    <tests>
        <!-- Single test: species "human", biotype protein_coding, region 1:51194990-51275150.
             Passes when the output contains text matching chromosome 1 (with or without a
             "chr" prefix), two tab-separated numeric columns, and a name starting with ENST.
             NOTE(review): the tool is described as using the Ensembl REST API, so this test
             presumably needs network access; confirm before running offline. -->
        <test>
            <param name="species" value="human"/>
            <param name="biotypes" value="protein_coding"/>
            <param name="regions" value="1:51194990-51275150"/>
            <output name="transcript_bed">
                <assert_contents>
                    <has_text_matching expression="(chr)?1\t\d+\t\d+\tENST" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
Retrieve Ensembl features in BED format using the Ensembl REST API.

Command-line usage of the underlying script::

    usage: retrieve_ensembl_bed.py [-h] [-s SPECIES] [-R REGIONS] [-B BIOTYPES]
                                   [-X] [-U] [-t] [-v] [-d]
                                   output

    positional arguments:
      output                Output BED filepath, or for stdout: "-"

    optional arguments:
      -h, --help            show this help message and exit
      -s SPECIES, --species SPECIES
                            Ensembl Species to retrieve
      -R REGIONS, --regions REGIONS
                            Restrict Ensembl retrieval to regions e.g.:
                            X,2:20000-25000,3:100-500+
      -B BIOTYPES, --biotypes BIOTYPES
                            Restrict Ensembl biotypes to retrieve
      -X, --extended_bed    Include the extended columns returned from Ensembl
      -U, --ucsc_chrom_names
                            Use the UCSC names for Chromosomes
      -t, --toplevel        Print Ensembl toplevel for species
      -v, --verbose         Verbose
      -d, --debug           Debug


Ensembl REST API returns an extended BED format with these additional columns::

  second_name, cds_start_status, cds_end_status, exon_frames, type, gene_name, second_gene_name, gene_type

    ]]></help>
    <!-- DOI citations for this tool. NOTE(review): presumably the Ensembl REST API and Ensembl
         project publications; confirm by resolving the DOIs. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/btu613</citation>
        <citation type="doi">10.1093/nar/gku1010</citation>
    </citations>
</tool>