view RepeatMasker.xml @ 1:f8f1a3878edd draft default tip

0.1.1 version, prevent a crash if no repeat is found. Thanks to Simon Guest
author bjoern-gruening
date Fri, 01 Feb 2013 03:29:37 -0500
parents
children
line wrap: on
line source

<tool id="repeatmasker_wrapper" name="RepeatMasker" version="0.1.1">
    <description>Masks different kind of repeats</description>
    <command>
## The command is a Cheetah template which allows some Python based syntax.
## Lines starting hash hash are comments. Galaxy will turn newlines into spaces,
## so every shell command below MUST be terminated with a semicolon; otherwise
## it is merged with the command on the following line.

## Create a temp directory that receives all RepeatMasker result files.
## RepeatMasker names its results after the input file name, so the expected
## result paths are built from the temp directory plus the query file name.
#import tempfile, os
#set $dirname = os.path.abspath(tempfile.mkdtemp())
#set $input_filename = os.path.split(str($query))[-1]
#set $output_basename = os.path.join($dirname, $input_filename)


RepeatMasker 
-parallel 8

## Each of these boolean params expands to its flag or to an empty string.
$nolow
$noint
$norna

## The selected option value already contains the complete -species argument.
#if str($species)!="all":
    $species
#end if


-dir $dirname

#if $adv_opts.adv_opts_selector=="advanced":

    ## A value of 0 means that no fixed GC level was requested.
    #if str($adv_opts.gc)!="0":
        -gc $adv_opts.gc
    #end if

    $adv_opts.gccalc

    ## The multi-select arrives as a comma separated string.
    #set $output_files_list = str($adv_opts.output_files).split(',')
    #if "gff" in $output_files_list:
        -gff
    #end if
    #if "html" in $output_files_list:
        -html
    #end if

    $adv_opts.slow_search
    $adv_opts.quick_search
    $adv_opts.rush_search
    $adv_opts.only_alus
    $adv_opts.is_only

#else:
    ## Set defaults
    -gff

## End of advanced options:
#end if

$query

## RepeatMasker output on stdout and stderr is discarded.
> /dev/null 2> /dev/null;
## Copy the output files to galaxy
## AgR: if there are no repeats, the output files may not exist.
## This causes the job to fail, so touch files to ensure they exist.
## Note the semicolon after every touch: without it the joined command line
## would read "touch FILE cp FILE OUTPUT" and nothing would be copied.
#if $adv_opts.adv_opts_selector=="advanced":

    #if "summary" in $output_files_list:
        ## Write out the summary file (default)
        #set $summary_file = $output_basename + '.tbl'
        touch $summary_file;
        cp $summary_file $output_summary;
    #end if

    #if "gff" in $output_files_list:
        ## Write out the gff file (default)
        #set $gff_file = $output_basename + '.out.gff'
        touch $gff_file;
        cp $gff_file $output_gff;
    #end if

    #if "html" in $output_files_list:
        ## Write out the html file
        #set $html_file = $output_basename + '.out.html'
        touch $html_file;
        cp $html_file $output_html;
    #end if

    #if "mask" in $output_files_list:
        ## Write out mask sequence file. The mask output only exists when it
        ## was selected in the advanced options, so it is only copied then.
        #set $mask_sequence_file = $output_basename + '.masked'
        touch $mask_sequence_file;
        cp $mask_sequence_file $output_mask;
    #end if

#else:

    ## Write out the summary file (default)
    #set $summary_file = $output_basename + '.tbl'
    touch $summary_file;
    cp $summary_file $output_summary;

    ## Write out the gff file (default)
    #set $gff_file = $output_basename + '.out.gff'
    touch $gff_file;
    cp $gff_file $output_gff;


## End of advanced options:
#end if

## Write out standard file (default)
## The default '.out' file from RepeatMasker has a 3-line header and spaces rather
## than tabs. Remove the header and replace the whitespaces with tab
#set $standard_file = $output_basename + '.out'
tail -n +4 $standard_file | tr -s ' ' '\t' > $output_std;

## Delete all temporary files (options placed before the operand for portability)
rm -r $dirname;


    </command>
    <inputs>
        <!-- The FASTA dataset that is handed to RepeatMasker as the query. -->
        <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/> 

        <!--
            Boolean switches: the truevalue is the literal command line flag that is
            inserted into the command template, the falsevalue is an empty string.
        -->
        <param name="nolow" type="boolean" label="No low complexity DNA" truevalue="-nolow" falsevalue="" checked="false" help="Does not mask low_complexity DNA or simple repeats."/>
        <param name="noint" type="boolean" label="No interspersed repeats" truevalue="-noint" falsevalue="" checked="false" help="Only masks low complex/simple repeats (no interspersed repeats)."/>

        <param name="norna" type="boolean" label="No small RNA genes" truevalue="-norna" falsevalue="" checked="false" help="Does not mask small RNA (pseudo) genes."/>

        <!--
            Specify the species or clade of the input sequence. The species name
            must be a valid NCBI Taxonomy Database species name and be contained
            in the RepeatMasker repeat database. The following collection is not complete.
            Each option value is the complete "-species NAME" argument, so the command
            template can insert it verbatim; names containing a space are quoted.
        -->
        <param name="species" type="select" label="Species" help="The list is not complete, if you need other species contact your administrator.">
            <option value="-species anopheles">anopheles</option>
            <option value="-species arabidopsis">arabidopsis</option>
            <option value="-species artiodactyl">artiodactyl</option>
            <option value="-species aspergillus">aspergillus</option>
            <option value="-species carnivore">carnivore</option>
            <option value="-species cat">cat</option>
            <option value="-species chicken">chicken</option>
            <option value="-species 'ciona intestinalis'">ciona intestinalis</option>
            <option value="-species 'ciona savignyi'">ciona savignyi</option>
            <option value="-species cow">cow</option>
            <option value="-species danio">danio</option>
            <option value="-species diatoaea">diatoaea</option>
            <option value="-species dog">dog</option>
            <option value="-species drosophila">drosophila</option>
            <option value="-species elegans">elegans</option>
            <option value="-species fugu">fugu</option>
            <option value="-species fungi" selected="true">fungi</option>
            <option value="-species human">human</option>
            <option value="-species maize">maize</option>
            <option value="-species mammal">mammal</option>
            <option value="-species mouse">mouse</option>
            <option value="-species pig">pig</option>
            <option value="-species rat">rat</option>
            <option value="-species rice">rice</option>
            <option value="-species rodentia">rodentia</option>
            <option value="-species wheat">wheat</option>
        </param>

        <!--
            Advanced options are only defined in the "advanced" branch. The command
            template and the output filters check adv_opts_selector before reading
            any of the parameters declared below.
        -->
        <conditional name="adv_opts">
            <param name="adv_opts_selector" type="select" label="Advanced Options">
              <option value="basic" selected="True">Hide Advanced Options</option>
              <option value="advanced">Show Advanced Options</option>
            </param>
            <when value="basic" />
            <when value="advanced">


              <param name="is_only" type="boolean" label="Mask only E coli insertion elements" truevalue="-is_only" falsevalue="" checked="false" help="Only clips E coli insertion elements out of fasta and .qual files."/>


              <!-- Search sensitivity / speed switches. -->
              <param name="slow_search" type="boolean" label="Slow search" truevalue="-s" falsevalue="" checked="false" help="0-5% more sensitive, 2-3 times slower than default."/>
              <param name="quick_search" type="boolean" label="Quick search" truevalue="-q" falsevalue="" checked="false" help="5-10% less sensitive, 2-5 times faster than default."/>
              <param name="rush_search" type="boolean" label="Rush search" truevalue="-qq" falsevalue="" checked="false" help="about 10% less sensitive, 4->10 times faster than default."/>

              <param name="only_alus" type="boolean" label="Only Alus" truevalue="-alu" falsevalue="" checked="false" help="Only masks Alus (and 7SLRNA, SVA and LTR5)(only for primate DNA)."/>

              <param name="gccalc" type="boolean" label="Use GC depended matrices, automaticly" truevalue="-gccalc" falsevalue="" checked="true" help="RepeatMasker calculates the GC content even for batch files/small seqs"/>

              <!--
                  The selected values are matched by name ("summary", "gff", "html",
                  "mask") in the command template and in the output filters.
              -->
              <param name="output_files" type="select" multiple="true" label="Additional output files">
                  <option selected="true" value="summary">Summary file</option>
                  <option value="gff">GFF file</option>
                  <option value="html">HTML file</option>
                  <option value="mask">Mask FastA file</option>
              </param>


              <!--
                  The command template treats the default 0 as "no fixed GC level"
                  and omits the -gc option in that case. The allowed range 0..100 is
                  enforced by a single in_range validator.
              -->
              <param name="gc" type="integer" value="0" label="Use GC depended matrices" help="Use matrices calculated for 'number' percentage background GC level">
                    <validator type="in_range" min="0" max="100" />
              </param>

            </when>
        </conditional>

    </inputs>
    <outputs>
        <!-- The tab-separated annotation table is always produced. -->
        <data name="output_std" format="tabular" label="${tool.name} on ${on_string}: Standard" />
        <!-- The masked sequence is only offered when selected in the advanced options. -->
        <data name="output_mask" format="fasta" label="${tool.name} on ${on_string}: Mask sequence">
            <filter>
                    (adv_opts['adv_opts_selector'] == 'advanced' and 'mask' in adv_opts['output_files'])
            </filter>
        </data>
        <!-- The summary is a default output: shown in basic mode, optional in advanced mode. -->
        <data name="output_summary" format="txt" label="${tool.name} on ${on_string}: Summary">
            <filter>(
                    (adv_opts['adv_opts_selector'] == 'advanced' and 'summary' in adv_opts['output_files'])
                    or
                    (adv_opts['adv_opts_selector'] == 'basic')
                    )
            </filter>
        </data>
        <!-- The HTML report is only offered when selected in the advanced options. -->
        <data name="output_html" format="html" label="${tool.name} on ${on_string}: HTML">
            <filter>(adv_opts['adv_opts_selector'] == 'advanced' and 'html' in adv_opts['output_files'])</filter>
        </data>
        <!--
            The command runs RepeatMasker with -gff and copies the GFF result in basic
            mode as well, so the GFF output must also pass the filter in basic mode
            (same rule as for the summary output).
        -->
        <data name="output_gff" format="gff" label="${tool.name} on ${on_string}: GFF">
            <filter>(
                    (adv_opts['adv_opts_selector'] == 'advanced' and 'gff' in adv_opts['output_files'])
                    or
                    (adv_opts['adv_opts_selector'] == 'basic')
                    )
            </filter>
        </data>
    </outputs>
    <!-- Declares the RepeatMasker executable, which the command template invokes by this name. -->
    <requirements>
        <requirement type="binary">RepeatMasker</requirement>
    </requirements>
    <help>
    
.. class:: warningmark

**What it does**

RepeatMasker is a program that screens DNA sequences for *interspersed repeats*
and *low complexity* DNA sequences. The output of the program is a detailed
annotation of the repeats that are present in the query sequence as well as a
modified version of the query sequence in which all the annotated repeats have
been masked (default: replaced by Ns).

-----

**How to read the results**



The annotation file contains the cross_match output lines. It lists all best matches
(above a set minimum score) between the query sequence and any of the sequences in
the repeat database or with low complexity DNA. The term "best matches" reflects
that a match is not shown if its domain is over 80% contained within the domain
of a higher scoring match, where the "domain" of a match is the region in
the query sequence that is defined by the alignment start and stop. These domains
have been masked in the returned masked sequence file. In the output, matches are
ordered by query name, and for each query by position of the start of the alignment.

Example:

======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
SW score perc div. perc del. perc ins. query seq. q-pos begin q-pos end (left)    w complement matching repeat repeat class/family repeat-pos begin repeat-pos end (left)  ID
======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
    1306 15.6      6.2       0.0       HSU08988   6563        6781      \(22462)  C            MER7A           DNA/MER2_type       336              103            \(0)    1
   12204 10.0      2.4       1.8       HSU08988   6782        7714      \(21529)  C            TIGGER1         DNA/MER2_type       2418             1493           \(0)    2
     279  3.0      0.0       0.0       HSU08988   7719        7751      \(21492)  +            (TTTTA)n        Simple_repeat       1                33             \(0)    3
    1765 13.4      6.5       1.8       HSU08988   7752        8022      \(21221)  C            AluSx           SINE/Alu            289              1              \(23)   4
   12204 10.0      2.4       1.8       HSU08988   8023        8694      \(20549)  C            TIGGER1         DNA/MER2_type       1493             827            \(925)  5
    1984 11.1      0.3       0.7       HSU08988   8695        9000      \(20243)  C            AluSg           SINE/Alu            305              1              \(5)    6
   12204 10.0      2.4       1.8       HSU08988   9001        9695      \(19548)  C            TIGGER1         DNA/MER2_type       827              2              \(1591) 7
     711 21.2      1.4       0.0       HSU08988   9696        9816      \(19427)  C            MER7A           DNA/MER2_type       122              2              \(224)  8
======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==

This is a sequence in which a Tigger1 DNA transposon has integrated into a MER7 DNA transposon copy.
Subsequently two Alus integrated in the Tigger1 sequence. The simple repeat is derived from the
poly A of the Alu element. The first line is interpreted like this:

:Table description:

1. **1306** = Smith-Waterman score of the match, usually complexity adjusted
        The SW scores are not always directly comparable. Sometimes
        the complexity adjustment has been turned off, and a variety of
        scoring-matrices are used.

#. **15.6** = % substitutions in matching region compared to the consensus
#. **6.2** = % of bases opposite a gap in the query sequence (deleted bp)
#. **0.0** = % of bases opposite a gap in the repeat consensus (inserted bp)
#. **HSU08988** = name of query sequence
#. **6563** = starting position of match in query sequence
#. **6781** = ending position of match in query sequence
#. **(22462)** = no. of bases in query sequence past the ending position of match
#. **C**       = match is with the Complement of the consensus sequence in the database
#. **MER7A**   = name of the matching interspersed repeat
#. **DNA/MER2_type** = the class of the repeat, in this case a DNA transposon fossil of the MER2 group (see below for list and references)
#. **336**     = starting position of match in database sequence (using top-strand numbering)
#. **103**     = ending position of match in database sequence
#. **(0)**     = no. of bases in (complement of) the repeat consensus sequence prior to beginning of the match (so 0 means that the match extended all the way to the end of the repeat consensus sequence)
#. **1**    = Identifier

An asterisk (\*) in the final column (no example shown) indicates that there is
a higher-scoring match whose domain partly (&lt;80%) includes the domain of this match. 

Note that the SW score and divergence numbers for the three Tigger1 lines are identical.
This is because the information is derived from a single alignment (the Alus were deleted
from the query before the alignment with the Tigger element was performed).
The program makes educated guesses about many fragments if they are derived from
the same element (e.g. it knows that the MER7A fragments represent one insert).
In a next version I can identify each element with a unique ID, if interest exists
(this could help to represent repeats cleaner in graphic displays). 


-------

**References**

Smit, AFA, Hubley, R and Green, P. RepeatMasker Open-3.0.

http://www.repeatmasker.org/

    </help>
</tool>