comparison RepeatMasker.xml @ 1:f8f1a3878edd draft default tip

0.1.1 version, prevent a crash if no repeat is found. Thanks to Simon Guest
author bjoern-gruening
date Fri, 01 Feb 2013 03:29:37 -0500
parents
children
comparison
equal deleted inserted replaced
0:13df908a02b0 1:f8f1a3878edd
1 <tool id="repeatmasker_wrapper" name="RepeatMasker" version="0.1.1">
2 <description>Masks different kinds of repeats</description>
3 <command>
4 ## The command is a Cheetah template which allows some Python based syntax.
5 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
6 ## so every shell statement below must end with ';' or it merges with the next line.
7 ## create temp directory
8 #import tempfile, os
9 #set $dirname = os.path.abspath(tempfile.mkdtemp())
10 #set $input_filename = os.path.split(str($query))[-1]
11 #set $output_basename = os.path.join($dirname, $input_filename)
12 ## RepeatMasker is run with -dir $dirname; its result files are looked up below
13 ## as $output_basename plus a suffix (.tbl, .out, .out.gff, .out.html, .masked).
14 RepeatMasker
15 -parallel 8
16
17 $nolow
18 $noint
19 $norna
20 ## A species value of "all" omits the species switch entirely.
21 #if str($species)!="all":
22 $species
23 #end if
24
25
26 -dir $dirname
27
28 #if $adv_opts.adv_opts_selector=="advanced":
29 ## A gc value of 0 means "not set": -gc is only passed for non-zero values.
30 #if str($adv_opts.gc)!="0":
31 -gc $adv_opts.gc
32 #end if
33
34 $adv_opts.gccalc
35 ## The multi-select arrives as a comma separated string; split it once and reuse it below.
36 #set $output_files_list = str($adv_opts.output_files).split(',')
37 #if "gff" in $output_files_list:
38 -gff
39 #end if
40 #if "html" in $output_files_list:
41 -html
42 #end if
43
44 $adv_opts.slow_search
45 $adv_opts.quick_search
46 $adv_opts.rush_search
47 $adv_opts.only_alus
48 $adv_opts.is_only
49
50 #else:
51 ## Set defaults
52 -gff
53
54 ## End of advanced options:
55 #end if
56
57 $query
58
59 ## Both stdout and stderr of RepeatMasker are discarded.
60 > /dev/null 2> /dev/null;
61 ## Copy the output files to galaxy
62 ## AgR: if there are no repeats, the output files may not exist.
63 ## This causes the job to fail, so touch files to ensure they exist.
64 #if $adv_opts.adv_opts_selector=="advanced":
65
66 #if "summary" in $output_files_list:
67 ## Write out the summary file (default)
68 #set $summary_file = $output_basename + '.tbl'
69 touch $summary_file;
70 cp $summary_file $output_summary;
71 #end if
72
73 #if "gff" in $output_files_list:
74 ## Write out the gff file (default)
75 #set $gff_file = $output_basename + '.out.gff'
76 touch $gff_file;
77 cp $gff_file $output_gff;
78 #end if
79
80 #if "html" in $output_files_list:
81 ## Write out the html file
82 #set $html_file = $output_basename + '.out.html'
83 touch $html_file;
84 cp $html_file $output_html;
85 #end if
86
87 #else:
88
89 ## Write out the summary file (default)
90 #set $summary_file = $output_basename + '.tbl'
91 touch $summary_file;
92 cp $summary_file $output_summary;
93
94 ## Write out the gff file (default)
95 #set $gff_file = $output_basename + '.out.gff'
96 touch $gff_file;
97 cp $gff_file $output_gff;
98
99
100 ## End of advanced options:
101 #end if
102 ## The masked sequence is copied regardless of the selected output files.
103 ## Write out mask sequence file
104 #set $mask_sequence_file = $output_basename + '.masked'
105 touch $mask_sequence_file;
106 cp $mask_sequence_file $output_mask;
107
108 ## Write out standard file (default)
109 ## The default '.out' file from RepeatMasker has a 3-line header and spaces rather
110 ## than tabs. Remove the header and replace the whitespaces with tab
111 #set $standard_file = $output_basename + '.out'
112 tail -n +4 $standard_file | tr -s ' ' '\t' > $output_std;
113
114 ## Delete all temporary files
115 rm -r $dirname;
116
117
118 </command>
119 <inputs>
120 <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/>
121 <!-- Each boolean below expands to the literal switch in truevalue when checked and to an empty string otherwise; the command template inserts the value verbatim. -->
122 <param name="nolow" type="boolean" label="No low complexity DNA" truevalue="-nolow" falsevalue="" checked="false" help="Does not mask low_complexity DNA or simple repeats."/>
123 <param name="noint" type="boolean" label="No interspersed repeats" truevalue="-noint" falsevalue="" checked="false" help="Only masks low complex/simple repeats (no interspersed repeats)."/>
124
125 <param name="norna" type="boolean" label="No small RNA genes" truevalue="-norna" falsevalue="" checked="false" help="Does not mask small RNA (pseudo) genes."/>
126
127 <!--
128 Specify the species or clade of the input sequence. The species name
129 must be a valid NCBI Taxonomy Database species name and be contained
130 in the RepeatMasker repeat database. The following collection is not complete.
131 -->
132 <param name="species" type="select" label="Species" help="The list is not complete, if you need other species contact your administrator.">
133 <option value="-species anopheles">anopheles</option>
134 <option value="-species arabidopsis">arabidopsis</option>
135 <option value="-species artiodactyl">artiodactyl</option>
136 <option value="-species aspergillus">aspergillus</option>
137 <option value="-species carnivore">carnivore</option>
138 <option value="-species cat">cat</option>
139 <option value="-species chicken">chicken</option>
140 <option value="-species 'ciona intestinalis'">ciona intestinalis</option>
141 <option value="-species 'ciona savignyi'">ciona savignyi</option>
142 <option value="-species cow">cow</option>
143 <option value="-species danio">danio</option>
144 <option value="-species diatoaea">diatoaea</option>
145 <option value="-species dog">dog</option>
146 <option value="-species drosophila">drosophila</option>
147 <option value="-species elegans">elegans</option>
148 <option value="-species fugu">fugu</option>
149 <option value="-species fungi" selected="true">fungi</option>
150 <option value="-species human">human</option>
151 <option value="-species maize">maize</option>
152 <option value="-species mammal">mammal</option>
153 <option value="-species mouse">mouse</option>
154 <option value="-species pig">pig</option>
155 <option value="-species rat">rat</option>
156 <option value="-species rice">rice</option>
157 <option value="-species rodentia">rodentia</option>
158 <option value="-species wheat">wheat</option>
159 </param>
160 <!-- The command template and the output filters both branch on adv_opts_selector ("basic" or "advanced"). -->
161 <conditional name="adv_opts">
162 <param name="adv_opts_selector" type="select" label="Advanced Options">
163 <option value="basic" selected="True">Hide Advanced Options</option>
164 <option value="advanced">Show Advanced Options</option>
165 </param>
166 <when value="basic" />
167 <when value="advanced">
168
169
170 <param name="is_only" type="boolean" label="Mask only E coli insertion elements" truevalue="-is_only" falsevalue="" checked="false" help="Only clips E coli insertion elements out of fasta and .qual files."/>
171
172 <!-- The three search-speed flags are independent checkboxes; nothing in this file prevents selecting more than one. -->
173 <param name="slow_search" type="boolean" label="Slow search" truevalue="-s" falsevalue="" checked="false" help="0-5% more sensitive, 2-3 times slower than default."/>
174 <param name="quick_search" type="boolean" label="Quick search" truevalue="-q" falsevalue="" checked="false" help="5-10% less sensitive, 2-5 times faster than default."/>
175 <param name="rush_search" type="boolean" label="Rush search" truevalue="-qq" falsevalue="" checked="false" help="about 10% less sensitive, 4->10 times faster than default."/>
176
177 <param name="only_alus" type="boolean" label="Only Alus" truevalue="-alu" falsevalue="" checked="false" help="Only masks Alus (and 7SLRNA, SVA and LTR5)(only for primate DNA)."/>
178
179 <param name="gccalc" type="boolean" label="Use GC dependent matrices, automatically" truevalue="-gccalc" falsevalue="" checked="true" help="RepeatMasker calculates the GC content even for batch files/small seqs"/>
180 <!-- The selected values are matched by name ("summary", "gff", "html", "mask") in the command template and in the output filters. -->
181 <param name="output_files" type="select" multiple="true" label="Additional output files">
182 <option selected="true" value="summary">Summary file</option>
183 <option value="gff">GFF file</option>
184 <option value="html">HTML file</option>
185 <option value="mask">Mask FastA file</option>
186 </param>
187
188 <!-- The default of 0 acts as "not set": the command template only emits -gc for a non-zero value. -->
189 <param name="gc" type="integer" value="0" label="Use GC dependent matrices" help="Use matrices calculated for 'number' percentage background GC level">
190 <validator type="in_range" min="0" />
191 <validator type="in_range" max="100" />
192 </param>
193
194 </when>
195 </conditional>
196
197 </inputs>
198 <outputs> <!-- Output datasets; each filter is a Python expression over the tool parameters that decides whether the dataset is offered. -->
199 <data name="output_std" format="tabular" label="${tool.name} on ${on_string}: Standard" /> <!-- No filter. Filled by the tail/tr pipeline in the command section. -->
200 <data name="output_mask" format="fasta" label="${tool.name} on ${on_string}: Mask sequence"> <!-- Only in advanced mode with "mask" selected. -->
201 <filter>
202 (adv_opts['adv_opts_selector'] == 'advanced' and 'mask' in adv_opts['output_files'])
203 </filter>
204 </data>
205 <data name="output_summary" format="txt" label="${tool.name} on ${on_string}: Summary"> <!-- Basic mode, or advanced mode with "summary" selected. -->
206 <filter>(
207 (adv_opts['adv_opts_selector'] == 'advanced' and 'summary' in adv_opts['output_files'])
208 or
209 (adv_opts['adv_opts_selector'] == 'basic')
210 )
211 </filter>
212 </data>
213 <data name="output_html" format="html" label="${tool.name} on ${on_string}: HTML"> <!-- Only in advanced mode with "html" selected. -->
214 <filter>(adv_opts['adv_opts_selector'] == 'advanced' and 'html' in adv_opts['output_files'])</filter>
215 </data>
216 <data name="output_gff" format="gff" label="${tool.name} on ${on_string}: GFF"> <!-- Only in advanced mode with "gff" selected. NOTE(review): the command section passes -gff and copies the GFF in basic mode as well, yet this filter does not pass in basic mode; confirm which behaviour is intended. -->
217 <filter>
218 (adv_opts['adv_opts_selector'] == 'advanced' and 'gff' in adv_opts['output_files'])
219 </filter>
220 </data>
221 </outputs>
222 <requirements> <!-- The command section invokes the RepeatMasker executable by its bare name; presumably it must be resolvable on the job's PATH (confirm for your deployment). -->
223 <requirement type="binary">RepeatMasker</requirement>
224 </requirements>
225 <help>
226
227 .. class:: warningmark
228
229 **What it does**
230
231 RepeatMasker is a program that screens DNA sequences for *interspersed repeats*
232 and *low complexity* DNA sequences. The output of the program is a detailed
233 annotation of the repeats that are present in the query sequence as well as a
234 modified version of the query sequence in which all the annotated repeats have
235 been masked (default: replaced by Ns).
236
237 -----
238
239 **How to read the results**
240
241
242
243 The annotation file contains the cross_match output lines. It lists all best matches
244 (above a set minimum score) between the query sequence and any of the sequences in
245 the repeat database or with low complexity DNA. The term "best matches" reflects
246 that a match is not shown if its domain is over 80% contained within the domain
247 of a higher scoring match, where the "domain" of a match is the region in
248 the query sequence that is defined by the alignment start and stop. These domains
249 have been masked in the returned masked sequence file. In the output, matches are
250 ordered by query name, and for each query by position of the start of the alignment.
251
252 Example:
253
254 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
255 SW score perc div. perc del. perc ins. query seq. q-pos begin q-pos end (left) w complement matching repeat repeat class/family repeat-pos begin repeat-pos end (left) ID
256 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
257 1306 15.6 6.2 0.0 HSU08988 6563 6781 \(22462) C MER7A DNA/MER2_type 336 103 \(0) 1
258 12204 10.0 2.4 1.8 HSU08988 6782 7714 \(21529) C TIGGER1 DNA/MER2_type 2418 1493 \(0) 2
259 279 3.0 0.0 0.0 HSU08988 7719 7751 \(21492) + (TTTTA)n Simple_repeat 1 33 \(0) 3
260 1765 13.4 6.5 1.8 HSU08988 7752 8022 \(21221) C AluSx SINE/Alu 289 1 \(23) 4
261 12204 10.0 2.4 1.8 HSU08988 8023 8694 \(20549) C TIGGER1 DNA/MER2_type 1493 827 \(925) 5
262 1984 11.1 0.3 0.7 HSU08988 8695 9000 \(20243) C AluSg SINE/Alu 305 1 \(5) 6
263 12204 10.0 2.4 1.8 HSU08988 9001 9695 \(19548) C TIGGER1 DNA/MER2_type 827 2 \(1591) 7
264 711 21.2 1.4 0.0 HSU08988 9696 9816 \(19427) C MER7A DNA/MER2_type 122 2 \(224) 8
265 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
266
267 This is a sequence in which a Tigger1 DNA transposon has integrated into a MER7 DNA transposon copy.
268 Subsequently two Alus integrated in the Tigger1 sequence. The simple repeat is derived from the
269 poly A of the Alu element. The first line is interpreted like this:
270
271 :Table description:
272
273 1. **1306** = Smith-Waterman score of the match, usually complexity adjusted
274 The SW scores are not always directly comparable. Sometimes
275 the complexity adjustment has been turned off, and a variety of
276 scoring-matrices are used.
277
278 #. **15.6** = % substitutions in matching region compared to the consensus
279 #. **6.2** = % of bases opposite a gap in the query sequence (deleted bp)
280 #. **0.0** = % of bases opposite a gap in the repeat consensus (inserted bp)
281 #. **HSU08988** = name of query sequence
282 #. **6563** = starting position of match in query sequence
283 #. **6781** = ending position of match in query sequence
284 #. **(22462)** = no. of bases in query sequence past the ending position of match
285 #. **C** = match is with the Complement of the consensus sequence in the database
286 #. **MER7A** = name of the matching interspersed repeat
287 #. **DNA/MER2_type** = the class of the repeat, in this case a DNA transposon fossil of the MER2 group (see below for list and references)
288 #. **336** = starting position of match in database sequence (using top-strand numbering)
289 #. **103** = ending position of match in database sequence
290 #. **(0)** = no. of bases in (complement of) the repeat consensus sequence prior to beginning of the match (so 0 means that the match extended all the way to the end of the repeat consensus sequence)
291 #. **1** = Identifier
292
293 An asterisk (\*) in the final column (no example shown) indicates that there is
294 a higher-scoring match whose domain partly (&lt;80%) includes the domain of this match.
295
296 Note that the SW score and divergence numbers for the three Tigger1 lines are identical.
297 This is because the information is derived from a single alignment (the Alus were deleted
298 from the query before the alignment with the Tigger element was performed).
299 The program makes educated guesses about many fragments if they are derived from
300 the same element (e.g. it knows that the MER7A fragments represent one insert).
301 In a next version I can identify each element with a unique ID, if interest exists
302 (this could help to represent repeats cleaner in graphic displays).
303
304
305 -------
306
307 **References**
308
309 Smit, AFA, Hubley, R and Green, P. RepeatMasker Open-3.0.
310
311 http://www.repeatmasker.org/
312
313 </help>
314 </tool>