Mercurial > repos > bjoern-gruening > repeat_masker
comparison RepeatMasker.xml @ 1:f8f1a3878edd draft default tip
0.1.1 version, prevent a crash if no repeat is found. Thanks to Simon Guest
author | bjoern-gruening |
---|---|
date | Fri, 01 Feb 2013 03:29:37 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:13df908a02b0 | 1:f8f1a3878edd |
---|---|
1 <tool id="repeatmasker_wrapper" name="RepeatMasker" version="0.1.1"> | |
2 <description>Masks different kinds of repeats</description> | |
3 <command> | |
4 ## The command is a Cheetah template which allows some Python based syntax. | |
5 ## Lines starting with hash hash are comments. Galaxy will turn newlines into spaces, | |
6 ## so every shell command below must be terminated with an explicit ';'. | |
7 ## create temp directory | |
8 #import tempfile, os | |
9 #set $dirname = os.path.abspath(tempfile.mkdtemp()) | |
10 #set $input_filename = os.path.split(str($query))[-1] | |
11 #set $output_basename = os.path.join($dirname, $input_filename) | |
12 | |
13 ## Build the RepeatMasker call; results are written into the temp directory via -dir | |
14 RepeatMasker | |
15 -parallel 8 | |
16 | |
17 $nolow | |
18 $noint | |
19 $norna | |
20 | |
21 #if str($species)!="all": | |
22 $species | |
23 #end if | |
24 | |
25 | |
26 -dir $dirname | |
27 | |
28 #if $adv_opts.adv_opts_selector=="advanced": | |
29 | |
30 #if str($adv_opts.gc)!="0": | |
31 -gc $adv_opts.gc | |
32 #end if | |
33 | |
34 $adv_opts.gccalc | |
35 | |
36 #set $output_files_list = str($adv_opts.output_files).split(',') | |
37 #if "gff" in $output_files_list: | |
38 -gff | |
39 #end if | |
40 #if "html" in $output_files_list: | |
41 -html | |
42 #end if | |
43 | |
44 $adv_opts.slow_search | |
45 $adv_opts.quick_search | |
46 $adv_opts.rush_search | |
47 $adv_opts.only_alus | |
48 $adv_opts.is_only | |
49 | |
50 #else: | |
51 ## Set defaults | |
52 -gff | |
53 | |
54 ## End of advanced options: | |
55 #end if | |
56 | |
57 $query | |
58 | |
59 ## Both stdout and stderr of RepeatMasker are discarded | |
60 > /dev/null 2> /dev/null; | |
61 ## Copy the output files to galaxy | |
62 ## AgR: if there are no repeats, the output files may not exist. | |
63 ## This causes the job to fail, so touch files to ensure they exist. | |
64 #if $adv_opts.adv_opts_selector=="advanced": | |
65 | |
66 #if "summary" in $output_files_list: | |
67 ## Write out the summary file (default) | |
68 #set $summary_file = $output_basename + '.tbl' | |
69 touch $summary_file; | |
70 cp $summary_file $output_summary; | |
71 #end if | |
72 | |
73 #if "gff" in $output_files_list: | |
74 ## Write out the gff file (default) | |
75 #set $gff_file = $output_basename + '.out.gff' | |
76 touch $gff_file; | |
77 cp $gff_file $output_gff; | |
78 #end if | |
79 | |
80 #if "html" in $output_files_list: | |
81 ## Write out the html file | |
82 #set $html_file = $output_basename + '.out.html' | |
83 touch $html_file; | |
84 cp $html_file $output_html; | |
85 #end if | |
86 | |
87 #else: | |
88 | |
89 ## Write out the summary file (default) | |
90 #set $summary_file = $output_basename + '.tbl' | |
91 touch $summary_file; | |
92 cp $summary_file $output_summary; | |
93 | |
94 ## Write out the gff file (default) | |
95 #set $gff_file = $output_basename + '.out.gff' | |
96 touch $gff_file; | |
97 cp $gff_file $output_gff; | |
98 | |
99 | |
100 ## End of advanced options: | |
101 #end if | |
102 | |
103 ## Write out mask sequence file | |
104 #set $mask_sequence_file = $output_basename + '.masked' | |
105 touch $mask_sequence_file; | |
106 cp $mask_sequence_file $output_mask; | |
107 | |
108 ## Write out standard file (default) | |
109 ## The default '.out' file from RepeatMasker has a 3-line header and spaces rather | |
110 ## than tabs. Remove the header and replace the whitespaces with tab | |
111 #set $standard_file = $output_basename + '.out' | |
112 tail -n +4 $standard_file | tr -s ' ' '\t' > $output_std; | |
113 | |
114 ## Delete all temporary files | |
115 rm -r $dirname; | |
116 | |
117 | |
118 </command> | |
119 <inputs> | |
120 <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/> | |
121 <!-- Boolean flags: each one expands to its RepeatMasker switch when checked and to an empty string otherwise --> | |
122 <param name="nolow" type="boolean" label="No low complexity DNA" truevalue="-nolow" falsevalue="" checked="false" help="Does not mask low_complexity DNA or simple repeats."/> | |
123 <param name="noint" type="boolean" label="No interspersed repeats" truevalue="-noint" falsevalue="" checked="false" help="Only masks low complexity/simple repeats (no interspersed repeats)."/> | |
124 | |
125 <param name="norna" type="boolean" label="No small RNA genes" truevalue="-norna" falsevalue="" checked="false" help="Does not mask small RNA (pseudo) genes."/> | |
126 | |
127 <!-- | |
128 Specify the species or clade of the input sequence. The species name | |
129 must be a valid NCBI Taxonomy Database species name and be contained | |
130 in the RepeatMasker repeat database. The following collection is not complete. | |
131 --> | |
132 <param name="species" type="select" label="Species" help="The list is not complete, if you need other species contact your administrator."> | |
133 <option value="-species anopheles">anopheles</option> | |
134 <option value="-species arabidopsis">arabidopsis</option> | |
135 <option value="-species artiodactyl">artiodactyl</option> | |
136 <option value="-species aspergillus">aspergillus</option> | |
137 <option value="-species carnivore">carnivore</option> | |
138 <option value="-species cat">cat</option> | |
139 <option value="-species chicken">chicken</option> | |
140 <option value="-species 'ciona intestinalis'">ciona intestinalis</option> | |
141 <option value="-species 'ciona savignyi'">ciona savignyi</option> | |
142 <option value="-species cow">cow</option> | |
143 <option value="-species danio">danio</option> | |
144 <option value="-species diatoaea">diatoaea</option> | |
145 <option value="-species dog">dog</option> | |
146 <option value="-species drosophila">drosophila</option> | |
147 <option value="-species elegans">elegans</option> | |
148 <option value="-species fugu">fugu</option> | |
149 <option value="-species fungi" selected="true">fungi</option> | |
150 <option value="-species human">human</option> | |
151 <option value="-species maize">maize</option> | |
152 <option value="-species mammal">mammal</option> | |
153 <option value="-species mouse">mouse</option> | |
154 <option value="-species pig">pig</option> | |
155 <option value="-species rat">rat</option> | |
156 <option value="-species rice">rice</option> | |
157 <option value="-species rodentia">rodentia</option> | |
158 <option value="-species wheat">wheat</option> | |
159 </param> | |
160 <!-- The command template only reads the parameters below when the selector is "advanced" --> | |
161 <conditional name="adv_opts"> | |
162 <param name="adv_opts_selector" type="select" label="Advanced Options"> | |
163 <option value="basic" selected="True">Hide Advanced Options</option> | |
164 <option value="advanced">Show Advanced Options</option> | |
165 </param> | |
166 <when value="basic" /> | |
167 <when value="advanced"> | |
168 | |
169 | |
170 <param name="is_only" type="boolean" label="Mask only E coli insertion elements" truevalue="-is_only" falsevalue="" checked="false" help="Only clips E coli insertion elements out of fasta and .qual files."/> | |
171 | |
172 | |
173 <param name="slow_search" type="boolean" label="Slow search" truevalue="-s" falsevalue="" checked="false" help="0-5% more sensitive, 2-3 times slower than default."/> | |
174 <param name="quick_search" type="boolean" label="Quick search" truevalue="-q" falsevalue="" checked="false" help="5-10% less sensitive, 2-5 times faster than default."/> | |
175 <param name="rush_search" type="boolean" label="Rush search" truevalue="-qq" falsevalue="" checked="false" help="about 10% less sensitive, 4->10 times faster than default."/> | |
176 | |
177 <param name="only_alus" type="boolean" label="Only Alus" truevalue="-alu" falsevalue="" checked="false" help="Only masks Alus (and 7SLRNA, SVA and LTR5)(only for primate DNA)."/> | |
178 | |
179 <param name="gccalc" type="boolean" label="Use GC-dependent matrices, automatically" truevalue="-gccalc" falsevalue="" checked="true" help="RepeatMasker calculates the GC content even for batch files/small seqs"/> | |
180 | |
181 <param name="output_files" type="select" multiple="true" label="Additional output files"> | |
182 <option selected="true" value="summary">Summary file</option> | |
183 <option value="gff">GFF file</option> | |
184 <option value="html">HTML file</option> | |
185 <option value="mask">Mask FastA file</option> | |
186 </param> | |
187 | |
188 | |
189 <param name="gc" type="integer" value="0" label="Use GC-dependent matrices" help="Use matrices calculated for 'number' percentage background GC level"> | |
190 <!-- 0 acts as "not set": the command template only passes -gc when the value is not 0 --> | |
191 <validator type="in_range" min="0" max="100" /> | |
192 </param> | |
193 | |
194 </when> | |
195 </conditional> | |
196 | |
197 </inputs> | |
198 <outputs> <!-- Filters mirror the command template: basic mode writes the summary and GFF files, advanced mode writes what "Additional output files" selects; the standard table is always produced --> | |
199 <data name="output_std" format="tabular" label="${tool.name} on ${on_string}: Standard" /> | |
200 <data name="output_mask" format="fasta" label="${tool.name} on ${on_string}: Mask sequence"> | |
201 <filter> | |
202 (adv_opts['adv_opts_selector'] == 'advanced' and 'mask' in adv_opts['output_files']) | |
203 </filter> | |
204 </data> | |
205 <data name="output_summary" format="txt" label="${tool.name} on ${on_string}: Summary"> | |
206 <filter>( | |
207 (adv_opts['adv_opts_selector'] == 'advanced' and 'summary' in adv_opts['output_files']) | |
208 or | |
209 (adv_opts['adv_opts_selector'] == 'basic') | |
210 ) | |
211 </filter> | |
212 </data> | |
213 <data name="output_html" format="html" label="${tool.name} on ${on_string}: HTML"> | |
214 <filter>(adv_opts['adv_opts_selector'] == 'advanced' and 'html' in adv_opts['output_files'])</filter> | |
215 </data> | |
216 <data name="output_gff" format="gff" label="${tool.name} on ${on_string}: GFF"> <!-- basic mode runs RepeatMasker with -gff and copies the result here, so the dataset must exist in basic mode too --> | |
217 <filter> | |
218 (adv_opts['adv_opts_selector'] == 'advanced' and 'gff' in adv_opts['output_files']) or (adv_opts['adv_opts_selector'] == 'basic') | |
219 </filter> | |
220 </data> | |
221 </outputs> | |
222 <requirements> <!-- Declares the RepeatMasker executable that the command template invokes --> | |
223 <requirement type="binary">RepeatMasker</requirement> | |
224 </requirements> | |
225 <help> | |
226 | |
227 .. class:: warningmark | |
228 | |
229 **What it does** | |
230 | |
231 RepeatMasker is a program that screens DNA sequences for *interspersed repeats* | |
232 and *low complexity* DNA sequences. The output of the program is a detailed | |
233 annotation of the repeats that are present in the query sequence as well as a | |
234 modified version of the query sequence in which all the annotated repeats have | |
235 been masked (default: replaced by Ns). | |
236 | |
237 ----- | |
238 | |
239 **How to read the results** | |
240 | |
241 | |
242 | |
243 The annotation file contains the cross_match output lines. It lists all best matches | |
244 (above a set minimum score) between the query sequence and any of the sequences in | |
245 the repeat database or with low complexity DNA. The term "best matches" reflects | |
246 that a match is not shown if its domain is over 80% contained within the domain | |
247 of a higher scoring match, where the "domain" of a match is the region in | |
248 the query sequence that is defined by the alignment start and stop. These domains | |
249 have been masked in the returned masked sequence file. In the output, matches are | |
250 ordered by query name, and for each query by position of the start of the alignment. | |
251 | |
252 Example: | |
253 | |
254 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == | |
255 SW score perc div. perc del. perc ins. query seq. q-pos begin q-pos end (left) w complement matching repeat repeat class/family repeat-pos begin repeat-pos end (left) ID | |
256 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == | |
257 1306 15.6 6.2 0.0 HSU08988 6563 6781 \(22462) C MER7A DNA/MER2_type 336 103 \(0) 1 | |
258 12204 10.0 2.4 1.8 HSU08988 6782 7714 \(21529) C TIGGER1 DNA/MER2_type 2418 1493 \(0) 2 | |
259 279 3.0 0.0 0.0 HSU08988 7719 7751 \(21492) + (TTTTA)n Simple_repeat 1 33 \(0) 3 | |
260 1765 13.4 6.5 1.8 HSU08988 7752 8022 \(21221) C AluSx SINE/Alu 289 1 \(23) 4 | |
261 12204 10.0 2.4 1.8 HSU08988 8023 8694 \(20549) C TIGGER1 DNA/MER2_type 1493 827 \(925) 5 | |
262 1984 11.1 0.3 0.7 HSU08988 8695 9000 \(20243) C AluSg SINE/Alu 305 1 \(5) 6 | |
263 12204 10.0 2.4 1.8 HSU08988 9001 9695 \(19548) C TIGGER1 DNA/MER2_type 827 2 \(1591) 7 | |
264 711 21.2 1.4 0.0 HSU08988 9696 9816 \(19427) C MER7A DNA/MER2_type 122 2 \(224) 8 | |
265 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == | |
266 | |
267 This is a sequence in which a Tigger1 DNA transposon has integrated into a MER7 DNA transposon copy. | |
268 Subsequently two Alus integrated in the Tigger1 sequence. The simple repeat is derived from the | |
269 poly A of the Alu element. The first line is interpreted like this: | |
270 | |
271 :Table description: | |
272 | |
273 1. **1306** = Smith-Waterman score of the match, usually complexity adjusted | |
274 The SW scores are not always directly comparable. Sometimes | |
275 the complexity adjustment has been turned off, and a variety of | |
276 scoring-matrices are used. | |
277 | |
278 #. **15.6** = % substitutions in matching region compared to the consensus | |
279 #. **6.2** = % of bases opposite a gap in the query sequence (deleted bp) | |
280 #. **0.0** = % of bases opposite a gap in the repeat consensus (inserted bp) | |
281 #. **HSU08988** = name of query sequence | |
282 #. **6563** = starting position of match in query sequence | |
283 #. **6781** = ending position of match in query sequence | |
284 #. **(22462)** = no. of bases in query sequence past the ending position of match | |
285 #. **C** = match is with the Complement of the consensus sequence in the database | |
286 #. **MER7A** = name of the matching interspersed repeat | |
287 #. **DNA/MER2_type** = the class of the repeat, in this case a DNA transposon fossil of the MER2 group (see below for list and references) | |
288 #. **336** = starting position of match in database sequence (using top-strand numbering) | |
289 #. **103** = ending position of match in database sequence | |
290 #. **(0)** = no. of bases in (complement of) the repeat consensus sequence prior to beginning of the match (so 0 means that the match extended all the way to the end of the repeat consensus sequence) | |
291 #. **1** = Identifier | |
292 | |
293 An asterisk (\*) in the final column (no example shown) indicates that there is | |
294 a higher-scoring match whose domain partly (<80%) includes the domain of this match. | |
295 | |
296 Note that the SW score and divergence numbers for the three Tigger1 lines are identical. | |
297 This is because the information is derived from a single alignment (the Alus were deleted | |
298 from the query before the alignment with the Tigger element was performed). | |
299 The program makes educated guesses about many fragments if they are derived from | |
300 the same element (e.g. it knows that the MER7A fragments represent one insert). | |
301 In a next version I can identify each element with a unique ID, if interest exists | |
302 (this could help to represent repeats cleaner in graphic displays). | |
303 | |
304 | |
305 ------- | |
306 | |
307 **References** | |
308 | |
309 Smit, AFA, Hubley, R and Green, P. RepeatMasker Open-3.0. | |
310 | |
311 http://www.repeatmasker.org/ | |
312 | |
313 </help> | |
314 </tool> |