Mercurial > repos > bgruening > repeat_masker
view RepeatMasker.xml @ 1:880265000696 draft
Uploaded
author | bgruening |
---|---|
date | Tue, 25 Jun 2013 08:58:21 -0400 |
parents | d4a2c739da3f |
children | 5673e72241aa |
line wrap: on
line source
<!--
    Galaxy tool wrapper for RepeatMasker.

    The <command> template runs RepeatMasker inside a freshly created temporary
    directory, copies the requested result files onto the Galaxy output
    datasets, converts the whitespace-aligned '.out' report into a tab-separated
    table and finally removes the temporary directory.
-->
<tool id="repeatmasker_wrapper" name="RepeatMasker" version="0.1.2">
    <description>Masks different kind of repeats</description>
    <!-- CDATA keeps shell metacharacters such as '&' and '>' legal inside XML. -->
    <command><![CDATA[
## The command is a Cheetah template which allows some Python based syntax.
## Lines starting hash hash are comments. Galaxy will turn newlines into spaces.

## Create a temp directory; RepeatMasker names its result files after the
## basename of the query file, so build that prefix once.
#import tempfile, os
#set $dirname = os.path.abspath( tempfile.mkdtemp() )
#set $input_filename = os.path.split( str($query) )[-1]
#set $output_basename = os.path.join( $dirname, $input_filename )

## Decide once which optional result files are wanted. Without the advanced
## options the defaults are the summary and the GFF file.
#if $adv_opts.adv_opts_selector == "advanced":
    #set $output_files_list = str($adv_opts.output_files).split(',')
#else:
    #set $output_files_list = ['summary', 'gff']
#end if

## Use the number of job slots assigned by Galaxy; fall back to 8 when the
## GALAXY_SLOTS variable is not set.
RepeatMasker -parallel \${GALAXY_SLOTS:-8} $nolow $noint $norna
#if str($species) != "all":
    $species
#end if
-dir $dirname
#if $adv_opts.adv_opts_selector == "advanced":
    #if str($adv_opts.gc) != "0":
        -gc $adv_opts.gc
    #end if
    $adv_opts.gccalc
    $adv_opts.slow_search
    $adv_opts.quick_search
    $adv_opts.rush_search
    $adv_opts.only_alus
    $adv_opts.is_only
#end if
#if "gff" in $output_files_list:
    -gff
#end if
#if "html" in $output_files_list:
    -html
#end if
$query 2>&1;

## Copy the output files to galaxy.
## AgR: if there are no repeats, the output files may not exist.
## This causes the job to fail, so touch files to ensure they exist.
#if "summary" in $output_files_list:
    ## Write out the summary file
    #set $summary_file = $output_basename + '.tbl'
    touch $summary_file;
    cp $summary_file $output_summary;
#end if
#if "gff" in $output_files_list:
    ## Write out the gff file
    #set $gff_file = $output_basename + '.out.gff'
    touch $gff_file;
    cp $gff_file $output_gff;
#end if
#if "html" in $output_files_list:
    ## Write out the html file
    #set $html_file = $output_basename + '.out.html'
    touch $html_file;
    cp $html_file $output_html;
#end if
#if "mask" in $output_files_list:
    ## Write out mask sequence file; the matching output dataset is only
    ## declared when "mask" was selected.
    #set $mask_sequence_file = $output_basename + '.masked'
    touch $mask_sequence_file;
    cp $mask_sequence_file $output_mask;
#end if

## Write out standard file (default).
## The default '.out' file from RepeatMasker has a 3-line header and spaces rather
## than tabs. Remove the header and replace the whitespaces with tab.
## The file is touched first so that tail does not fail when it is missing.
#set $standard_file = $output_basename + '.out'
touch $standard_file;
tail -n +4 $standard_file | tr -s ' ' '\t' > $output_std;

## Delete all temporary files
rm -rf $dirname
    ]]></command>
    <inputs>
        <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/>
        <param name="nolow" type="boolean" label="No low complexity DNA" truevalue="-nolow" falsevalue="" checked="false"
               help="Does not mask low_complexity DNA or simple repeats."/>
        <param name="noint" type="boolean" label="No interspersed repeats" truevalue="-noint" falsevalue="" checked="false"
               help="Only masks low complex/simple repeats (no interspersed repeats)."/>
        <param name="norna" type="boolean" label="No small RNA genes" truevalue="-norna" falsevalue="" checked="false"
               help="Does not mask small RNA (pseudo) genes."/>
        <!--
            Specify the species or clade of the input sequence. The species name
            must be a valid NCBI Taxonomy Database species name and be contained
            in the RepeatMasker repeat database.
            The following collection is not complete.
        -->
        <param name="species" type="select" label="Species"
               help="The list is not complete, if you need other species contact your administrator.">
            <option value="-species anopheles">anopheles</option>
            <option value="-species arabidopsis">arabidopsis</option>
            <option value="-species artiodactyl">artiodactyl</option>
            <option value="-species aspergillus">aspergillus</option>
            <option value="-species carnivore">carnivore</option>
            <option value="-species cat">cat</option>
            <option value="-species chicken">chicken</option>
            <option value="-species 'ciona intestinalis'">ciona intestinalis</option>
            <option value="-species 'ciona savignyi'">ciona savignyi</option>
            <option value="-species cow">cow</option>
            <option value="-species danio">danio</option>
            <option value="-species diatoaea">diatoaea</option>
            <option value="-species dog">dog</option>
            <option value="-species drosophila">drosophila</option>
            <option value="-species elegans">elegans</option>
            <option value="-species fugu">fugu</option>
            <option value="-species fungi" selected="true">fungi</option>
            <option value="-species human">human</option>
            <option value="-species maize">maize</option>
            <option value="-species mammal">mammal</option>
            <option value="-species mouse">mouse</option>
            <option value="-species pig">pig</option>
            <option value="-species rat">rat</option>
            <option value="-species rice">rice</option>
            <option value="-species rodentia">rodentia</option>
            <option value="-species wheat">wheat</option>
        </param>
        <conditional name="adv_opts">
            <param name="adv_opts_selector" type="select" label="Advanced Options">
                <option value="basic" selected="true">Hide Advanced Options</option>
                <option value="advanced">Show Advanced Options</option>
            </param>
            <when value="basic" />
            <when value="advanced">
                <param name="is_only" type="boolean" label="Mask only E coli insertion elements" truevalue="-is_only" falsevalue="" checked="false"
                       help="Only clips E coli insertion elements out of fasta and .qual files."/>
                <param name="slow_search" type="boolean" label="Slow search" truevalue="-s" falsevalue="" checked="false"
                       help="0-5% more sensitive, 2-3 times slower than default."/>
                <param name="quick_search" type="boolean" label="Quick search" truevalue="-q" falsevalue="" checked="false"
                       help="5-10% less sensitive, 2-5 times faster than default."/>
                <param name="rush_search" type="boolean" label="Rush search" truevalue="-qq" falsevalue="" checked="false"
                       help="about 10% less sensitive, 4-&gt;10 times faster than default."/>
                <param name="only_alus" type="boolean" label="Only Alus" truevalue="-alu" falsevalue="" checked="false"
                       help="Only masks Alus (and 7SLRNA, SVA and LTR5)(only for primate DNA)."/>
                <param name="gccalc" type="boolean" label="Use GC depended matrices, automaticly" truevalue="-gccalc" falsevalue="" checked="true"
                       help="RepeatMasker calculates the GC content even for batch files/small seqs"/>
                <param name="output_files" type="select" multiple="true" label="Additional output files">
                    <option selected="true" value="summary">Summary file</option>
                    <option value="gff">GFF file</option>
                    <option value="html">HTML file</option>
                    <option value="mask">Mask FastA file</option>
                </param>
                <!-- A value of 0 means "not set": the command only passes -gc for non-zero values. -->
                <param name="gc" type="integer" value="0" label="Use GC depended matrices"
                       help="Use matrices calculated for 'number' percentage background GC level">
                    <validator type="in_range" min="0" max="100" />
                </param>
            </when>
        </conditional>
    </inputs>
    <!--
        The filters below must stay in sync with $output_files_list in the
        command template: summary and GFF are produced by default (basic mode),
        HTML and the masked sequence only when selected in the advanced options.
    -->
    <outputs>
        <data name="output_std" format="tabular" label="${tool.name} on ${on_string}: Standard" />
        <data name="output_mask" format="fasta" label="${tool.name} on ${on_string}: Mask sequence">
            <filter>(adv_opts['adv_opts_selector'] == 'advanced' and 'mask' in adv_opts['output_files'])</filter>
        </data>
        <data name="output_summary" format="txt" label="${tool.name} on ${on_string}: Summary">
            <filter>( (adv_opts['adv_opts_selector'] == 'advanced' and 'summary' in adv_opts['output_files']) or (adv_opts['adv_opts_selector'] == 'basic') )</filter>
        </data>
        <data name="output_html" format="html" label="${tool.name} on ${on_string}: HTML">
            <filter>(adv_opts['adv_opts_selector'] == 'advanced' and 'html' in adv_opts['output_files'])</filter>
        </data>
        <data name="output_gff" format="gff" label="${tool.name} on ${on_string}: GFF">
            <filter>( (adv_opts['adv_opts_selector'] == 'advanced' and 'gff' in adv_opts['output_files']) or (adv_opts['adv_opts_selector'] == 'basic') )</filter>
        </data>
    </outputs>
    <requirements>
        <requirement type="binary">RepeatMasker</requirement>
    </requirements>
    <!-- The help is reStructuredText: blank lines and table column alignment are significant. -->
    <help><![CDATA[
.. class:: warningmark

**What it does**

RepeatMasker is a program that screens DNA sequences for *interspersed repeats* and *low complexity* DNA sequences.
The output of the program is a detailed annotation of the repeats that are present in the query sequence as well
as a modified version of the query sequence in which all the annotated repeats have been masked (default: replaced by Ns).

-----

**How to read the results**

The annotation file contains the cross_match output lines. It lists all best matches
(above a set minimum score) between the query sequence and any of the sequences in the repeat database or with low complexity DNA.
The term "best matches" reflects that a match is not shown if its domain is over 80% contained within the domain of a higher
scoring match, where the "domain" of a match is the region in the query sequence that is defined by the alignment start and stop.
These domains have been masked in the returned masked sequence file. In the output, matches are ordered by query name,
and for each query by position of the start of the alignment.

Example:

======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
SW score perc div. perc del. perc ins. query seq. q-pos begin q-pos end (left)    w complement matching repeat repeat class/family repeat-pos begin repeat-pos end (left)  ID
======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
1306     15.6      6.2       0.0       HSU08988   6563        6781      \(22462)  C            MER7A           DNA/MER2_type       336              103            \(0)    1
12204    10.0      2.4       1.8       HSU08988   6782        7714      \(21529)  C            TIGGER1         DNA/MER2_type       2418             1493           \(0)    2
279      3.0       0.0       0.0       HSU08988   7719        7751      \(21492)  +            (TTTTA)n        Simple_repeat       1                33             \(0)    3
1765     13.4      6.5       1.8       HSU08988   7752        8022      \(21221)  C            AluSx           SINE/Alu            289              1              \(23)   4
12204    10.0      2.4       1.8       HSU08988   8023        8694      \(20549)  C            TIGGER1         DNA/MER2_type       1493             827            \(925)  5
1984     11.1      0.3       0.7       HSU08988   8695        9000      \(20243)  C            AluSg           SINE/Alu            305              1              \(5)    6
12204    10.0      2.4       1.8       HSU08988   9001        9695      \(19548)  C            TIGGER1         DNA/MER2_type       827              2              \(1591) 7
711      21.2      1.4       0.0       HSU08988   9696        9816      \(19427)  C            MER7A           DNA/MER2_type       122              2              \(224)  8
======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==

This is a sequence in which a Tigger1 DNA transposon has integrated into a MER7 DNA transposon copy.
Subsequently two Alus integrated in the Tigger1 sequence. The simple repeat is derived from the poly A of the Alu element.
The first line is interpreted like this:

:Table description:

1. **1306** = Smith-Waterman score of the match, usually complexity adjusted

   The SW scores are not always directly comparable. Sometimes the complexity adjustment has been turned off,
   and a variety of scoring-matrices are used.
#. **15.6** = % substitutions in matching region compared to the consensus
#. **6.2** = % of bases opposite a gap in the query sequence (deleted bp)
#. **0.0** = % of bases opposite a gap in the repeat consensus (inserted bp)
#. **HSU08988** = name of query sequence
#. **6563** = starting position of match in query sequence
#. **6781** = ending position of match in query sequence
#. **(22462)** = no. of bases in query sequence past the ending position of match
#. **C** = match is with the Complement of the consensus sequence in the database
#. **MER7A** = name of the matching interspersed repeat
#. **DNA/MER2_type** = the class of the repeat, in this case a DNA transposon fossil of the MER2 group (see below for list and references)
#. **336** = starting position of match in database sequence (using top-strand numbering)
#. **103** = ending position of match in database sequence
#. **(0)** = no. of bases in (complement of) the repeat consensus sequence prior to beginning of the match (so 0 means that the match extended all the way to the end of the repeat consensus sequence)
#. **1** = Identifier

An asterisk (\*) in the final column (no example shown) indicates that there is a higher-scoring match whose domain partly (<80%) includes the domain of this match.

Note that the SW score and divergence numbers for the three Tigger1 lines are identical.
This is because the information is derived from a single alignment (the Alus were deleted from the query before the alignment with the Tigger element was performed).
The program makes educated guesses about many fragments if they are derived from the same element (e.g. it knows that the MER7A fragments represent one insert).
In a next version I can identify each element with a unique ID, if interest exists (this could help to represent repeats cleaner in graphic displays).

-------

**References**

Smit, AFA, Hubley, R and Green, P. RepeatMasker Open-3.0.

http://www.repeatmasker.org/
    ]]></help>
</tool>