Mercurial > repos > bgruening > repeat_masker
comparison RepeatMasker.xml @ 0:d4a2c739da3f draft
Initial release under a consistent username. Fixes for stdout and the trailing semicolon.
author | bgruening |
---|---|
date | Tue, 25 Jun 2013 04:33:41 -0400 |
parents | |
children | 880265000696 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d4a2c739da3f |
---|---|
1 <tool id="repeatmasker_wrapper" name="RepeatMasker" version="0.1.2"> | |
2 <description>Masks different kind of repeats</description> | |
3 <command> | |
4 ## The command is a Cheetah template which allows some Python based syntax. | |
5 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces, | |
6 ## so every shell command below must end with ';' to stay separate from the next one. | |
7 ## Create a temp directory; all output paths are built from it and the query file name. | |
8 #import tempfile, os | |
9 #set $dirname = os.path.abspath( tempfile.mkdtemp() ) | |
10 #set $input_filename = os.path.split( str($query) )[-1] | |
11 #set $output_basename = os.path.join( $dirname, $input_filename ) | |
12 | |
13 ## Run RepeatMasker; the boolean and select parameters expand to their flag text or to nothing. | |
14 RepeatMasker | |
15 -parallel 8 | |
16 | |
17 $nolow | |
18 $noint | |
19 $norna | |
20 | |
21 #if str($species)!="all": | |
22 $species | |
23 #end if | |
24 | |
25 | |
26 -dir $dirname | |
27 | |
28 #if $adv_opts.adv_opts_selector=="advanced": | |
29 | |
30 #if str($adv_opts.gc)!="0": | |
31 -gc $adv_opts.gc | |
32 #end if | |
33 | |
34 $adv_opts.gccalc | |
35 | |
36 #set $output_files_list = str($adv_opts.output_files).split(',') | |
37 #if "gff" in $output_files_list: | |
38 -gff | |
39 #end if | |
40 #if "html" in $output_files_list: | |
41 -html | |
42 #end if | |
43 | |
44 $adv_opts.slow_search | |
45 $adv_opts.quick_search | |
46 $adv_opts.rush_search | |
47 $adv_opts.only_alus | |
48 $adv_opts.is_only | |
49 | |
50 #else: | |
51 ## Set defaults | |
52 -gff | |
53 | |
54 ## End of advanced options: | |
55 #end if | |
56 | |
57 $query | |
58 | |
59 2>&1; | |
60 | |
61 ## Copy the output files to galaxy | |
62 ## AgR: if there are no repeats, the output files may not exist. | |
63 ## This causes the job to fail, so touch files to ensure they exist. | |
64 #if $adv_opts.adv_opts_selector=="advanced": | |
65 ## Each touch is terminated with ';' so that the following cp runs as its own command. | |
66 #if "summary" in $output_files_list: | |
67 ## Write out the summary file (default) | |
68 #set $summary_file = $output_basename + '.tbl' | |
69 touch $summary_file; | |
70 cp $summary_file $output_summary; | |
71 #end if | |
72 | |
73 #if "gff" in $output_files_list: | |
74 ## Write out the gff file (default) | |
75 #set $gff_file = $output_basename + '.out.gff' | |
76 touch $gff_file; | |
77 cp $gff_file $output_gff; | |
78 #end if | |
79 | |
80 #if "html" in $output_files_list: | |
81 ## Write out the html file | |
82 #set $html_file = $output_basename + '.out.html' | |
83 touch $html_file; | |
84 cp $html_file $output_html; | |
85 #end if | |
86 | |
87 #else: | |
88 | |
89 ## Write out the summary file (default) | |
90 #set $summary_file = $output_basename + '.tbl' | |
91 touch $summary_file; | |
92 cp $summary_file $output_summary; | |
93 | |
94 ## Write out the gff file (default) | |
95 #set $gff_file = $output_basename + '.out.gff' | |
96 touch $gff_file; | |
97 cp $gff_file $output_gff; | |
98 | |
99 ## NOTE(review): output_gff (basic mode) and output_mask are copied here even when their filters in the outputs section exclude them - confirm this is intended. | |
100 ## End of advanced options: | |
101 #end if | |
102 | |
103 ## Write out mask sequence file | |
104 #set $mask_sequence_file = $output_basename + '.masked' | |
105 touch $mask_sequence_file; | |
106 cp $mask_sequence_file $output_mask; | |
107 | |
108 ## Write out standard file (default) | |
109 ## The default '.out' file from RepeatMasker has a 3-line header and spaces rather | |
110 ## than tabs. Remove the header and replace the whitespaces with tab | |
111 #set $standard_file = $output_basename + '.out' | |
112 tail -n +4 $standard_file | tr -s ' ' '\t' > $output_std; | |
113 | |
114 ## Delete all temporary files (option placed before the operand for portability) | |
115 rm -r $dirname | |
116 | |
117 </command> | |
118 <inputs> | |
119 <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/> | |
120 <!-- Masking switches: each boolean contributes its truevalue flag to the command line when checked and an empty string otherwise. --> | |
121 <param name="nolow" type="boolean" label="No low complexity DNA" truevalue="-nolow" falsevalue="" checked="false" help="Does not mask low_complexity DNA or simple repeats."/> | |
122 <param name="noint" type="boolean" label="No interspersed repeats" truevalue="-noint" falsevalue="" checked="false" help="Only masks low complex/simple repeats (no interspersed repeats)."/> | |
123 | |
124 <param name="norna" type="boolean" label="No small RNA genes" truevalue="-norna" falsevalue="" checked="false" help="Does not mask small RNA (pseudo) genes."/> | |
125 <!-- Each option value below already contains the complete "-species ..." argument, which the command template inserts verbatim. NOTE(review): the command template tests for a value "all", which is not offered here. --> | |
126 <!-- | |
127 Specify the species or clade of the input sequence. The species name | |
128 must be a valid NCBI Taxonomy Database species name and be contained | |
129 in the RepeatMasker repeat database. The following collection is not complete. | |
130 --> | |
131 <param name="species" type="select" label="Species" help="The list is not complete, if you need other species contact your administrator."> | |
132 <option value="-species anopheles">anopheles</option> | |
133 <option value="-species arabidopsis">arabidopsis</option> | |
134 <option value="-species artiodactyl">artiodactyl</option> | |
135 <option value="-species aspergillus">aspergillus</option> | |
136 <option value="-species carnivore">carnivore</option> | |
137 <option value="-species cat">cat</option> | |
138 <option value="-species chicken">chicken</option> | |
139 <option value="-species 'ciona intestinalis'">ciona intestinalis</option> | |
140 <option value="-species 'ciona savignyi'">ciona savignyi</option> | |
141 <option value="-species cow">cow</option> | |
142 <option value="-species danio">danio</option> | |
143 <option value="-species diatoaea">diatoaea</option> | |
144 <option value="-species dog">dog</option> | |
145 <option value="-species drosophila">drosophila</option> | |
146 <option value="-species elegans">elegans</option> | |
147 <option value="-species fugu">fugu</option> | |
148 <option value="-species fungi" selected="true">fungi</option> | |
149 <option value="-species human">human</option> | |
150 <option value="-species maize">maize</option> | |
151 <option value="-species mammal">mammal</option> | |
152 <option value="-species mouse">mouse</option> | |
153 <option value="-species pig">pig</option> | |
154 <option value="-species rat">rat</option> | |
155 <option value="-species rice">rice</option> | |
156 <option value="-species rodentia">rodentia</option> | |
157 <option value="-species wheat">wheat</option> | |
158 </param> | |
159 <!-- Advanced options: the parameters inside the "advanced" case are only declared for that case; the "basic" case declares none. --> | |
160 <conditional name="adv_opts"> | |
161 <param name="adv_opts_selector" type="select" label="Advanced Options"> | |
162 <option value="basic" selected="True">Hide Advanced Options</option> | |
163 <option value="advanced">Show Advanced Options</option> | |
164 </param> | |
165 <when value="basic" /> | |
166 <when value="advanced"> | |
167 <param name="is_only" type="boolean" label="Mask only E coli insertion elements" truevalue="-is_only" falsevalue="" checked="false" help="Only clips E coli insertion elements out of fasta and .qual files."/> | |
168 <param name="slow_search" type="boolean" label="Slow search" truevalue="-s" falsevalue="" checked="false" help="0-5% more sensitive, 2-3 times slower than default."/> | |
169 <param name="quick_search" type="boolean" label="Quick search" truevalue="-q" falsevalue="" checked="false" help="5-10% less sensitive, 2-5 times faster than default."/> | |
170 <param name="rush_search" type="boolean" label="Rush search" truevalue="-qq" falsevalue="" checked="false" help="about 10% less sensitive, 4->10 times faster than default."/> | |
171 <param name="only_alus" type="boolean" label="Only Alus" truevalue="-alu" falsevalue="" checked="false" help="Only masks Alus (and 7SLRNA, SVA and LTR5)(only for primate DNA)."/> | |
172 <param name="gccalc" type="boolean" label="Use GC depended matrices, automaticly" truevalue="-gccalc" falsevalue="" checked="true" help="RepeatMasker calculates the GC content even for batch files/small seqs"/> | |
173 <!-- The selection is split on commas by the command template to decide which optional flags are passed and which files are copied. --> | |
174 <param name="output_files" type="select" multiple="true" label="Additional output files"> | |
175 <option selected="true" value="summary">Summary file</option> | |
176 <option value="gff">GFF file</option> | |
177 <option value="html">HTML file</option> | |
178 <option value="mask">Mask FastA file</option> | |
179 </param> | |
180 <!-- A value of 0 acts as "not set": the command template only passes -gc for values other than 0. --> | |
181 <param name="gc" type="integer" value="0" label="Use GC depended matrices" help="Use matrices calculated for 'number' percentage background GC level"> | |
182 <validator type="in_range" min="0" /> | |
183 <validator type="in_range" max="100" /> | |
184 </param> | |
185 <!-- NOTE(review): slow, quick and rush search are independent checkboxes, so several of -s, -q and -qq can be passed together; confirm RepeatMasker accepts that. --> | |
186 </when> | |
187 </conditional> | |
188 | |
189 </inputs> | |
190 <outputs> <!-- Output datasets; each filter expression refers to the adv_opts conditional declared in the inputs section. --> | |
191 <data name="output_std" format="tabular" label="${tool.name} on ${on_string}: Standard" /> <!-- No filter; written by the command template from the tab-converted .out file. --> | |
192 <data name="output_mask" format="fasta" label="${tool.name} on ${on_string}: Mask sequence"> <!-- Advanced mode with "mask" selected. --> | |
193 <filter> | |
194 (adv_opts['adv_opts_selector'] == 'advanced' and 'mask' in adv_opts['output_files']) | |
195 </filter> | |
196 </data> | |
197 <data name="output_summary" format="txt" label="${tool.name} on ${on_string}: Summary"> <!-- Basic mode, or advanced mode with "summary" selected. --> | |
198 <filter>( | |
199 (adv_opts['adv_opts_selector'] == 'advanced' and 'summary' in adv_opts['output_files']) | |
200 or | |
201 (adv_opts['adv_opts_selector'] == 'basic') | |
202 ) | |
203 </filter> | |
204 </data> | |
205 <data name="output_html" format="html" label="${tool.name} on ${on_string}: HTML"> <!-- Advanced mode with "html" selected. --> | |
206 <filter>(adv_opts['adv_opts_selector'] == 'advanced' and 'html' in adv_opts['output_files'])</filter> | |
207 </data> | |
208 <data name="output_gff" format="gff" label="${tool.name} on ${on_string}: GFF"> <!-- Advanced mode with "gff" selected. NOTE(review): the command template also passes -gff and copies a GFF file in basic mode, which this filter excludes; confirm which is intended. --> | |
209 <filter> | |
210 (adv_opts['adv_opts_selector'] == 'advanced' and 'gff' in adv_opts['output_files']) | |
211 </filter> | |
212 </data> | |
213 </outputs> | |
214 <requirements> <!-- Executable that the command template invokes by name. --> | |
215 <requirement type="binary">RepeatMasker</requirement> | |
216 </requirements> | |
217 <help> | |
218 | |
219 .. class:: warningmark | |
220 | |
221 **What it does** | |
222 | |
223 RepeatMasker is a program that screens DNA sequences for *interspersed repeats* | |
224 and *low complexity* DNA sequences. The output of the program is a detailed | |
225 annotation of the repeats that are present in the query sequence as well as a | |
226 modified version of the query sequence in which all the annotated repeats have | |
227 been masked (default: replaced by Ns). | |
228 | |
229 ----- | |
230 | |
231 **How to read the results** | |
232 | |
233 | |
234 | |
235 The annotation file contains the cross_match output lines. It lists all best matches | |
236 (above a set minimum score) between the query sequence and any of the sequences in | |
237 the repeat database or with low complexity DNA. The term "best matches" reflects | |
238 that a match is not shown if its domain is over 80% contained within the domain | |
239 of a higher scoring match, where the "domain" of a match is the region in | |
240 the query sequence that is defined by the alignment start and stop. These domains | |
241 have been masked in the returned masked sequence file. In the output, matches are | |
242 ordered by query name, and for each query by position of the start of the alignment. | |
243 | |
244 Example: | |
245 | |
246 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == | |
247 SW score perc div. perc del. perc ins. query seq. q-pos begin q-pos end (left) w complement matching repeat repeat class/family repeat-pos begin repeat-pos end (left) ID | |
248 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == | |
249 1306 15.6 6.2 0.0 HSU08988 6563 6781 \(22462) C MER7A DNA/MER2_type 336 103 \(0) 1 | |
250 12204 10.0 2.4 1.8 HSU08988 6782 7714 \(21529) C TIGGER1 DNA/MER2_type 2418 1493 \(0) 2 | |
251 279 3.0 0.0 0.0 HSU08988 7719 7751 \(21492) + (TTTTA)n Simple_repeat 1 33 \(0) 3 | |
252 1765 13.4 6.5 1.8 HSU08988 7752 8022 \(21221) C AluSx SINE/Alu 289 1 \(23) 4 | |
253 12204 10.0 2.4 1.8 HSU08988 8023 8694 \(20549) C TIGGER1 DNA/MER2_type 1493 827 \(925) 5 | |
254 1984 11.1 0.3 0.7 HSU08988 8695 9000 \(20243) C AluSg SINE/Alu 305 1 \(5) 6 | |
255 12204 10.0 2.4 1.8 HSU08988 9001 9695 \(19548) C TIGGER1 DNA/MER2_type 827 2 \(1591) 7 | |
256 711 21.2 1.4 0.0 HSU08988 9696 9816 \(19427) C MER7A DNA/MER2_type 122 2 \(224) 8 | |
257 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == | |
258 | |
259 This is a sequence in which a Tigger1 DNA transposon has integrated into a MER7 DNA transposon copy. | |
260 Subsequently two Alus integrated in the Tigger1 sequence. The simple repeat is derived from the | |
261 poly A of the Alu element. The first line is interpreted like this: | |
262 | |
263 :Table description: | |
264 | |
265 1. **1306** = Smith-Waterman score of the match, usually complexity adjusted | |
266 The SW scores are not always directly comparable. Sometimes | |
267 the complexity adjustment has been turned off, and a variety of | |
268 scoring-matrices are used. | |
269 | |
270 #. **15.6** = % substitutions in matching region compared to the consensus | |
271 #. **6.2** = % of bases opposite a gap in the query sequence (deleted bp) | |
272 #. **0.0** = % of bases opposite a gap in the repeat consensus (inserted bp) | |
273 #. **HSU08988** = name of query sequence | |
274 #. **6563** = starting position of match in query sequence | |
275 #. **6781** = ending position of match in query sequence | |
276 #. **(22462)** = no. of bases in query sequence past the ending position of match | |
277 #. **C** = match is with the Complement of the consensus sequence in the database | |
278 #. **MER7A** = name of the matching interspersed repeat | |
279 #. **DNA/MER2_type** = the class of the repeat, in this case a DNA transposon fossil of the MER2 group (see below for list and references) | |
280 #. **336** = starting position of match in database sequence (using top-strand numbering) | |
281 #. **103** = ending position of match in database sequence | |
282 #. **(0)** = no. of bases in (complement of) the repeat consensus sequence prior to beginning of the match (so 0 means that the match extended all the way to the end of the repeat consensus sequence) | |
283 #. **1** = Identifier | |
284 | |
285 An asterisk (\*) in the final column (no example shown) indicates that there is | |
286 a higher-scoring match whose domain partly (<80%) includes the domain of this match. | |
287 | |
288 Note that the SW score and divergence numbers for the three Tigger1 lines are identical. | |
289 This is because the information is derived from a single alignment (the Alus were deleted | |
290 from the query before the alignment with the Tigger element was performed). | |
291 The program makes educated guesses about many fragments if they are derived from | |
292 the same element (e.g. it knows that the MER7A fragments represent one insert). | |
293 In a next version I can identify each element with a unique ID, if interest exists | |
294 (this could help to represent repeats cleaner in graphic displays). | |
295 | |
296 | |
297 ------- | |
298 | |
299 **References** | |
300 | |
301 Smit, AFA, Hubley, R and Green, P. RepeatMasker Open-3.0. | |
302 | |
303 http://www.repeatmasker.org/ | |
304 | |
305 </help> | |
306 </tool> |