comparison RepeatMasker.xml @ 0:d4a2c739da3f draft

Initial release under a consistent username. Fixes for stdout and the trailing semicolon.
author bgruening
date Tue, 25 Jun 2013 04:33:41 -0400
parents
children 880265000696
comparison
equal deleted inserted replaced
-1:000000000000 0:d4a2c739da3f
1 <tool id="repeatmasker_wrapper" name="RepeatMasker" version="0.1.2">
2 <description>Masks different kinds of repeats</description>
3 <command>
4 ## The command is a Cheetah template which allows some Python based syntax.
5 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces,
6 ## so every shell command below has to be terminated with a semicolon.
7 ## create temp directory
8 #import tempfile, os
9 #set $dirname = os.path.abspath( tempfile.mkdtemp() )
10 #set $input_filename = os.path.split( str($query) )[-1]
11 #set $output_basename = os.path.join( $dirname, $input_filename )
12
13
14 RepeatMasker
15 -parallel 8
16
17 $nolow
18 $noint
19 $norna
20
21 #if str($species)!="all":
22 $species
23 #end if
24
25
26 -dir $dirname
27
28 #if $adv_opts.adv_opts_selector=="advanced":
29
30 #if str($adv_opts.gc)!="0":
31 -gc $adv_opts.gc
32 #end if
33
34 $adv_opts.gccalc
35
36 #set $output_files_list = str($adv_opts.output_files).split(',')
37 #if "gff" in $output_files_list:
38 -gff
39 #end if
40 #if "html" in $output_files_list:
41 -html
42 #end if
43
44 $adv_opts.slow_search
45 $adv_opts.quick_search
46 $adv_opts.rush_search
47 $adv_opts.only_alus
48 $adv_opts.is_only
49
50 #else:
51 ## Set defaults
52 -gff
53
54 ## End of advanced options:
55 #end if
56
57 $query
58
59 2>&#38;1;
60
61 ## Copy the output files to galaxy
62 ## AgR: if there are no repeats, the output files may not exist.
63 ## That would make the cp fail, so touch each file first. touch and cp are separate commands, hence the ';' after each touch.
64 #if $adv_opts.adv_opts_selector=="advanced":
65
66 #if "summary" in $output_files_list:
67 ## Write out the summary file (default)
68 #set $summary_file = $output_basename + '.tbl'
69 touch $summary_file;
70 cp $summary_file $output_summary;
71 #end if
72
73 #if "gff" in $output_files_list:
74 ## Write out the gff file (default)
75 #set $gff_file = $output_basename + '.out.gff'
76 touch $gff_file;
77 cp $gff_file $output_gff;
78 #end if
79
80 #if "html" in $output_files_list:
81 ## Write out the html file
82 #set $html_file = $output_basename + '.out.html'
83 touch $html_file;
84 cp $html_file $output_html;
85 #end if
86
87 #else:
88
89 ## Write out the summary file (default)
90 #set $summary_file = $output_basename + '.tbl'
91 touch $summary_file;
92 cp $summary_file $output_summary;
93
94 ## Write out the gff file (default)
95 #set $gff_file = $output_basename + '.out.gff'
96 touch $gff_file;
97 cp $gff_file $output_gff;
98
99
100 ## End of advanced options:
101 #end if
102
103 ## Write out mask sequence file
104 #set $mask_sequence_file = $output_basename + '.masked'
105 touch $mask_sequence_file;
106 cp $mask_sequence_file $output_mask;
107
108 ## Write out standard file (default)
109 ## The default '.out' file from RepeatMasker has a 3-line header and spaces rather
110 ## than tabs. Remove the header and replace the whitespaces with tab
111 #set $standard_file = $output_basename + '.out'
112 tail -n +4 $standard_file | tr -s ' ' '\t' > $output_std;
113
114 ## Delete all temporary files (options before the operand, for portability)
115 rm -rf $dirname
116
117 </command>
118 <inputs>
119 <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/>
120
121 <param name="nolow" type="boolean" label="No low complexity DNA" truevalue="-nolow" falsevalue="" checked="false" help="Does not mask low_complexity DNA or simple repeats."/>
122 <param name="noint" type="boolean" label="No interspersed repeats" truevalue="-noint" falsevalue="" checked="false" help="Only masks low complex/simple repeats (no interspersed repeats)."/>
123
124 <param name="norna" type="boolean" label="No small RNA genes" truevalue="-norna" falsevalue="" checked="false" help="Does not mask small RNA (pseudo) genes."/>
125
126 <!--
127 Specify the species or clade of the input sequence. The species name
128 must be a valid NCBI Taxonomy Database species name and be contained
129 in the RepeatMasker repeat database. The following collection is not complete.
130 -->
131 <param name="species" type="select" label="Species" help="The list is not complete, if you need other species contact your administrator.">
132 <option value="-species anopheles">anopheles</option>
133 <option value="-species arabidopsis">arabidopsis</option>
134 <option value="-species artiodactyl">artiodactyl</option>
135 <option value="-species aspergillus">aspergillus</option>
136 <option value="-species carnivore">carnivore</option>
137 <option value="-species cat">cat</option>
138 <option value="-species chicken">chicken</option>
139 <option value="-species 'ciona intestinalis'">ciona intestinalis</option>
140 <option value="-species 'ciona savignyi'">ciona savignyi</option>
141 <option value="-species cow">cow</option>
142 <option value="-species danio">danio</option>
143 <option value="-species diatoaea">diatoaea</option>
144 <option value="-species dog">dog</option>
145 <option value="-species drosophila">drosophila</option>
146 <option value="-species elegans">elegans</option>
147 <option value="-species fugu">fugu</option>
148 <option value="-species fungi" selected="true">fungi</option>
149 <option value="-species human">human</option>
150 <option value="-species maize">maize</option>
151 <option value="-species mammal">mammal</option>
152 <option value="-species mouse">mouse</option>
153 <option value="-species pig">pig</option>
154 <option value="-species rat">rat</option>
155 <option value="-species rice">rice</option>
156 <option value="-species rodentia">rodentia</option>
157 <option value="-species wheat">wheat</option>
158 </param>
159
160 <conditional name="adv_opts">
161 <param name="adv_opts_selector" type="select" label="Advanced Options">
162 <option value="basic" selected="True">Hide Advanced Options</option>
163 <option value="advanced">Show Advanced Options</option>
164 </param>
165 <when value="basic" />
166 <when value="advanced">
167 <param name="is_only" type="boolean" label="Mask only E coli insertion elements" truevalue="-is_only" falsevalue="" checked="false" help="Only clips E coli insertion elements out of fasta and .qual files."/>
168 <param name="slow_search" type="boolean" label="Slow search" truevalue="-s" falsevalue="" checked="false" help="0-5% more sensitive, 2-3 times slower than default."/>
169 <param name="quick_search" type="boolean" label="Quick search" truevalue="-q" falsevalue="" checked="false" help="5-10% less sensitive, 2-5 times faster than default."/>
170 <param name="rush_search" type="boolean" label="Rush search" truevalue="-qq" falsevalue="" checked="false" help="about 10% less sensitive, 4->10 times faster than default."/>
171 <param name="only_alus" type="boolean" label="Only Alus" truevalue="-alu" falsevalue="" checked="false" help="Only masks Alus (and 7SLRNA, SVA and LTR5)(only for primate DNA)."/>
172 <param name="gccalc" type="boolean" label="Use GC-dependent matrices automatically" truevalue="-gccalc" falsevalue="" checked="true" help="RepeatMasker calculates the GC content even for batch files/small seqs"/>
173
174 <param name="output_files" type="select" multiple="true" label="Additional output files">
175 <option selected="true" value="summary">Summary file</option>
176 <option value="gff">GFF file</option>
177 <option value="html">HTML file</option>
178 <option value="mask">Mask FastA file</option>
179 </param>
180
181 <param name="gc" type="integer" value="0" label="Use GC-dependent matrices" help="Use matrices calculated for 'number' percentage background GC level">
182 <validator type="in_range" min="0" />
183 <validator type="in_range" max="100" />
184 </param>
185
186 </when>
187 </conditional>
188
189 </inputs>
190 <outputs> <!-- Optional outputs are kept only in advanced mode when selected; the "is not None" guards cover an empty multi-select. -->
191 <data name="output_std" format="tabular" label="${tool.name} on ${on_string}: Standard" />
192 <data name="output_mask" format="fasta" label="${tool.name} on ${on_string}: Mask sequence">
193 <filter>
194 (adv_opts['adv_opts_selector'] == 'advanced' and adv_opts['output_files'] is not None and 'mask' in adv_opts['output_files'])
195 </filter>
196 </data>
197 <data name="output_summary" format="txt" label="${tool.name} on ${on_string}: Summary">
198 <filter>(
199 (adv_opts['adv_opts_selector'] == 'advanced' and adv_opts['output_files'] is not None and 'summary' in adv_opts['output_files'])
200 or
201 (adv_opts['adv_opts_selector'] == 'basic')
202 )
203 </filter>
204 </data>
205 <data name="output_html" format="html" label="${tool.name} on ${on_string}: HTML">
206 <filter>(adv_opts['adv_opts_selector'] == 'advanced' and adv_opts['output_files'] is not None and 'html' in adv_opts['output_files'])</filter>
207 </data>
208 <data name="output_gff" format="gff" label="${tool.name} on ${on_string}: GFF">
209 <filter>
210 (adv_opts['adv_opts_selector'] == 'advanced' and adv_opts['output_files'] is not None and 'gff' in adv_opts['output_files'])
211 </filter>
212 </data>
213 </outputs>
214 <requirements> <!-- Declares the RepeatMasker executable that the command template calls. -->
215 <requirement type="binary">RepeatMasker</requirement>
216 </requirements>
217 <help>
218
219 .. class:: warningmark
220
221 **What it does**
222
223 RepeatMasker is a program that screens DNA sequences for *interspersed repeats*
224 and *low complexity* DNA sequences. The output of the program is a detailed
225 annotation of the repeats that are present in the query sequence as well as a
226 modified version of the query sequence in which all the annotated repeats have
227 been masked (default: replaced by Ns).
228
229 -----
230
231 **How to read the results**
232
233
234
235 The annotation file contains the cross_match output lines. It lists all best matches
236 (above a set minimum score) between the query sequence and any of the sequences in
237 the repeat database or with low complexity DNA. The term "best matches" reflects
238 that a match is not shown if its domain is over 80% contained within the domain
239 of a higher scoring match, where the "domain" of a match is the region in
240 the query sequence that is defined by the alignment start and stop. These domains
241 have been masked in the returned masked sequence file. In the output, matches are
242 ordered by query name, and for each query by position of the start of the alignment.
243
244 Example:
245
246 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
247 SW score perc div. perc del. perc ins. query seq. q-pos begin q-pos end (left) w complement matching repeat repeat class/family repeat-pos begin repeat-pos end (left) ID
248 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
249 1306 15.6 6.2 0.0 HSU08988 6563 6781 \(22462) C MER7A DNA/MER2_type 336 103 \(0) 1
250 12204 10.0 2.4 1.8 HSU08988 6782 7714 \(21529) C TIGGER1 DNA/MER2_type 2418 1493 \(0) 2
251 279 3.0 0.0 0.0 HSU08988 7719 7751 \(21492) + (TTTTA)n Simple_repeat 1 33 \(0) 3
252 1765 13.4 6.5 1.8 HSU08988 7752 8022 \(21221) C AluSx SINE/Alu 289 1 \(23) 4
253 12204 10.0 2.4 1.8 HSU08988 8023 8694 \(20549) C TIGGER1 DNA/MER2_type 1493 827 \(925) 5
254 1984 11.1 0.3 0.7 HSU08988 8695 9000 \(20243) C AluSg SINE/Alu 305 1 \(5) 6
255 12204 10.0 2.4 1.8 HSU08988 9001 9695 \(19548) C TIGGER1 DNA/MER2_type 827 2 \(1591) 7
256 711 21.2 1.4 0.0 HSU08988 9696 9816 \(19427) C MER7A DNA/MER2_type 122 2 \(224) 8
257 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
258
259 This is a sequence in which a Tigger1 DNA transposon has integrated into a MER7 DNA transposon copy.
260 Subsequently two Alus integrated in the Tigger1 sequence. The simple repeat is derived from the
261 poly A of the Alu element. The first line is interpreted like this:
262
263 :Table description:
264
265 1. **1306** = Smith-Waterman score of the match, usually complexity adjusted
266 The SW scores are not always directly comparable. Sometimes
267 the complexity adjustment has been turned off, and a variety of
268 scoring-matrices are used.
269
270 #. **15.6** = % substitutions in matching region compared to the consensus
271 #. **6.2** = % of bases opposite a gap in the query sequence (deleted bp)
272 #. **0.0** = % of bases opposite a gap in the repeat consensus (inserted bp)
273 #. **HSU08988** = name of query sequence
274 #. **6563** = starting position of match in query sequence
275 #. **6781** = ending position of match in query sequence
276 #. **(22462)** = no. of bases in query sequence past the ending position of match
277 #. **C** = match is with the Complement of the consensus sequence in the database
278 #. **MER7A** = name of the matching interspersed repeat
279 #. **DNA/MER2_type** = the class of the repeat, in this case a DNA transposon fossil of the MER2 group (see below for list and references)
280 #. **336** = starting position of match in database sequence (using top-strand numbering)
281 #. **103** = ending position of match in database sequence
282 #. **(0)** = no. of bases in (complement of) the repeat consensus sequence prior to beginning of the match (so 0 means that the match extended all the way to the end of the repeat consensus sequence)
283 #. **1** = Identifier
284
285 An asterisk (\*) in the final column (no example shown) indicates that there is
286 a higher-scoring match whose domain partly (&lt;80%) includes the domain of this match.
287
288 Note that the SW score and divergence numbers for the three Tigger1 lines are identical.
289 This is because the information is derived from a single alignment (the Alus were deleted
290 from the query before the alignment with the Tigger element was performed).
291 The program makes educated guesses about many fragments if they are derived from
292 the same element (e.g. it knows that the MER7A fragments represent one insert).
293 In a next version I can identify each element with a unique ID, if interest exists
294 (this could help to represent repeats cleaner in graphic displays).
295
296
297 -------
298
299 **References**
300
301 Smit, AFA, Hubley, R and Green, P. RepeatMasker Open-3.0.
302
303 http://www.repeatmasker.org/
304
305 </help>
306 </tool>