Mercurial > repos > petr-novak > repeatexplorer2_cerit
comparison repex_full_clustering.xml @ 0:e2b8e71b85b9 draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 08 Jan 2020 06:25:59 -0500 |
parents | |
children | 968f0867acc5 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e2b8e71b85b9 |
---|---|
1 <tool id="repeatexplorer2" name="RepeatExplorer2 clustering: " version="2.3.7" > | |
2 <stdio> | |
<!-- Classify the job result from the text found on stderr and from the exit code. --> | |
3 <regex match="lastdb: can't open file: NEAR" source="stderr" level="fatal" description="Version of last is too old, use ver 956 or higher\n" /> | |
4 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" /> | |
5 <regex match="error" source="stderr" level="fatal" description="Unknown error" /> | |
<!-- This rule only raises a warning, so its description must not call it an error. --> | |
6 <regex match="Warning" source="stderr" level="warning" description="Warning reported on stderr" /> | |
7 <exit_code range="1:" level="fatal" description="Error" /> | |
8 </stdio> | |
9 <description>Improved version of repeat discovery and characterization using graph based sequence clustering</description> | |
10 <requirements> | |
<!-- Software the tool needs at run time. Order is left as found. --> | |
<!-- Search and alignment programs; last, blastx and diamond are referred to elsewhere in this file. --> | |
11 <requirement type="package">last</requirement> | |
12 <requirement type="package">imagemagick</requirement> | |
13 <requirement type="package">mafft</requirement> | |
14 <requirement type="package">blast</requirement> | |
15 <requirement type="package">diamond</requirement> | |
16 <requirement type="package">blast-legacy</requirement> | |
<!-- Packages with an r- or bioconductor- prefix (presumably R libraries; confirm against the package channel). --> | |
17 <requirement type="package">r-igraph</requirement> | |
18 <requirement type="package">r-data.tree</requirement> | |
19 <requirement type="package">r-stringr</requirement> | |
20 <requirement type="package">r-r2html</requirement> | |
21 <requirement type="package">r-hwriter</requirement> | |
22 <requirement type="package">r-dt</requirement> | |
23 <requirement type="package">r-scales</requirement> | |
24 <requirement type="package">r-plotrix</requirement> | |
25 <requirement type="package">r-png</requirement> | |
26 <requirement type="package">r-plyr</requirement> | |
27 <requirement type="package">r-dplyr</requirement> | |
28 <requirement type="package">r-optparse</requirement> | |
29 <requirement type="package">r-dbi</requirement> | |
30 <requirement type="package">r-rsqlite</requirement> | |
31 <requirement type="package">r-rserve</requirement> | |
32 <requirement type="package">bioconductor-biostrings</requirement> | |
<!-- The pipeline itself, pinned to the same version as the tool element (2.3.7). --> | |
33 <requirement type="package" version="2.3.7">repex_tarean</requirement> | |
<!-- REPEX is the directory the command section runs seqclust and stderr_filter.py from. --> | |
34 <requirement type="set_environment">REPEX</requirement> | |
35 <requirement type="set_environment">REPEX_VERSION</requirement> | |
36 <requirement type="package" version="0.9.1" >pyrserve</requirement> | |
37 </requirements> | |
<!-- Runs seqclust on the input reads with stdout and stderr captured to files, appends both to the log output, then, as a best-effort chain, zips tarean_output into the archive output and copies index.html plus its supporting files into the files_path of the HTML report. The chain ends in "|| :", so a failing step is hidden and skips every later step. mkdir is called with -p so that an already existing files_path cannot abort the copies. --> | |
38 <command > | |
39 export PYTHONHASHSEED=0; | |
40 \${REPEX}/seqclust --sample ${sample} --output_dir=tarean_output --logfile=${log} --cleanup $paired --taxon $taxon | |
41 | |
42 #if $advanced_options.advanced: | |
43 --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering -D $advanced_options.blastx.options_blastx | |
44 --assembly_min $advanced_options.assembly_min_cluster_size | |
45 | |
46 #if $advanced_options.comparative.options_comparative: | |
47 --prefix_length $advanced_options.comparative.prefix_length | |
48 #end if | |
49 | |
50 #if $advanced_options.custom_library.options_custom_library: | |
51 -d $advanced_options.custom_library.library extra_database | |
52 #end if | |
53 | |
54 #if $advanced_options.options.options: | |
55 -opt $advanced_options.options.options | |
56 #end if | |
57 #end if | |
58 ${FastaFile} >stdout.log 2> stderr.log ; | |
59 echo "STDOUT CONTENT:" >> ${log} ; | |
60 cat stdout.log >> ${log} ; | |
61 echo "STDERR CONTENT:" >> ${log}; | |
62 cat stderr.log >> ${log} && | |
63 \${REPEX}/stderr_filter.py stderr.log && | |
64 cd tarean_output && | |
65 zip -r ${ReportArchive}.zip * && | |
66 mv ${ReportArchive}.zip ${ReportArchive} && | |
67 cp index.html ${ReportFile} && | |
68 mkdir -p ${ReportFile.files_path} && | |
69 cp -r --parents libdir ${ReportFile.files_path} && | |
70 cp -r --parents seqclust/clustering/superclusters ${ReportFile.files_path} && | |
71 cp -r --parents seqclust/clustering/clusters ${ReportFile.files_path} && | |
72 cp seqclust/clustering/hitsort.cls ${ReportFile.files_path}/seqclust/clustering/hitsort.cls && | |
73 cp *.png ${ReportFile.files_path}/ && | |
74 cp *.csv ${ReportFile.files_path}/ && | |
75 cp *.html ${ReportFile.files_path}/ && | |
76 cp *.css ${ReportFile.files_path}/ && | |
77 cp *.fasta ${ReportFile.files_path}/ 2>>$log && rm -r ../tarean_output || : | |
78 | |
79 </command> | |
80 <inputs> | |
<!-- Basic inputs: the reads file (last argument of seqclust), the paired-end switch (expands to the paired flag or to nothing) and the number of reads to sample. --> | |
81 <param name="FastaFile" label="NGS reads" type="data" format="fasta" | |
82 help="Input file must contain fasta-formatted NGS reads. If paired end reads are used, reads must be interlaced and all pairs must be complete. Example of input data format is provided in the help below. "/> | |
83 <param name="paired" type="boolean" truevalue="--paired" falsevalue="" checked="True" label="Paired-end reads" help="Check if you are using paired-end reads and the input sequences contain both read mates, with left mates alternating with their right mates" /> | |
84 | |
85 <param name="sample" label="Sample size" type="integer" value="500000" min="10000"/> | |
<!-- Single-value select passed to seqclust as the taxon argument. Exactly one option may be marked selected; version 3.0 is the default. --> | |
86 <param name="taxon" label="Select taxon and protein domain database version (REXdb)" type="select" help="Reference database of transposable element protein domains - REXdb - is used for annotation of repeats"> | |
87 <option value="VIRIDIPLANTAE3.0" selected="true">Viridiplantae version 3.0 </option> | |
88 <option value="VIRIDIPLANTAE2.2" >Viridiplantae version 2.2</option> | |
89 <option value="METAZOA3.0" >Metazoa version 3.0</option> | |
90 <option value="METAZOA2.0" >Metazoa version 2.0</option> | |
91 <!-- Modify setting in config.py accordingly --> | |
92 </param> | |
93 | |
<!-- Optional settings. The command section only passes them to seqclust when "advanced" is true; otherwise none of the values below are used. --> | |
94 <conditional name="advanced_options"> | |
95 <param name="advanced" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Advanced options" /> | |
96 <when value="false"> | |
97 <!-- pass --> | |
98 </when> | |
99 <when value="true"> | |
100 <conditional name="comparative"> | |
101 <param name="options_comparative" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Perform comparative analysis" help="Use this option when you want to compare sequences from multiple groups"/> | |
102 <when value="false"> | |
103 <!-- do nothing here --> | |
104 </when> | |
105 <when value="true"> | |
106 <param name="prefix_length" label="Group code length" type="integer" value="3" min="1" max="10" help="For comparative analysis, sequences are from individual groups distinguished by sample code which must be used as prefix for sequence name. See example below."/> | |
107 </when> | |
108 </conditional> | |
109 | |
110 <conditional name="blastx"> | |
111 <param name="options_blastx" type="select" label="Select parameters for protein domain search"> | |
112 <option value="BLASTX_W2" selected="false">blastx with word size 2 (the most sensitive, slowest)</option> | |
113 <option value="BLASTX_W3" selected="true">blastx with word size 3 (default)</option> | |
114 <option value="DIAMOND" selected="false">diamond program (the least sensitive, fastest)</option> | |
115 </param> | |
116 </conditional> | |
117 | |
118 <conditional name="options"> | |
119 <param name="options" type="select" label="Similarity search options" help="Different similarity search parameters are used depending on the used input data to adjust search to differences in length and error rate"> | |
120 <option value="ILLUMINA" selected="true">Illumina reads, read length 100nt or more </option> | |
121 <option value="ILLUMINA_SHORT" selected="false">Illumina reads, shorter than 100nt (Do not use reads shorter than 50nt!) </option> | |
122 <option value="ILLUMINA_DUST_OFF" selected="false">Illumina reads, no masking of low complexity repeats </option> | |
123 <option value="OXFORD_NANOPORE" selected="false"> | |
124 Pseudo short reads simulated from Oxford Nanopore data (experimental feature) | |
125 </option> | |
126 </param> | |
127 </conditional> | |
128 | |
129 <conditional name="custom_library"> | |
130 <param name="options_custom_library" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use custom repeat database"/> | |
131 <when value="false"> | |
132 <!-- do nothing here --> | |
133 </when> | |
134 <when value="true"> | |
135 <param name="library" format="fasta" type="data" label="Custom library of repeats" help="Library of repeats as DNA sequences in fasta format. The required format for IDs in a custom library is : '>repeatname#class/subclass'"/> | |
136 </when> | |
137 </conditional> | |
138 <param name="size_threshold" label="Cluster size threshold for detailed analysis" type="float" value="0.01" min="0.0001" max="100" help="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with fewer than 20 reads are not considered at all."/> | |
139 <param name="automatic_filtering" label="Perform automatic filtering of abundant satellite repeats" help="Automatic filtering tries to identify the most abundant tandem repeats and remove such sequences partially from analysis. Removing abundant tandem repeats makes it possible to analyze a higher proportion of other, less abundant repeats." type="boolean" truevalue="--automatic_filtering" falsevalue="" checked="false"/> | |
140 <param name="keep_names" label="Keep original sequences names" type="boolean" truevalue="--keep_names" falsevalue="" checked="false" help="By default sequences are relabeled using integers. If you want to keep original names, use this option."/> | |
141 <param name="assembly_min_cluster_size" type="integer" label="min cluster size for assembly" value="5" min="2" max="100"/> | |
142 </when> | |
143 </conditional> | |
144 | |
<!-- Queue choice with an editable resource string per queue. queue_specification is not referenced in the command section of this file; presumably it is read by the server's job configuration (TODO confirm). The ampersand in the option text is escaped as required in XML character data. --> | |
145 <conditional name="queue_definition"> | |
146 <param name="queue_select" type="select" label="Select queue"> | |
147 <option value="basic_fast_queue">basic &amp; fast</option> | |
148 <option value="long_slow_queue">long &amp; slow</option> | |
149 <option value="extra_long_slow_queue">extra long &amp; slow</option> | |
150 </param> | |
151 <when value="basic_fast_queue"> | |
152 <param name="queue_specification" type="text" label="Modify parameters (optional)" | |
153 value="-l select=1:ncpus=10:mem=32gb:scratch_local=50gb -l walltime=48:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=4000000,TAREAN_CPU=4" /> | |
154 </when> | |
155 | |
156 <when value="long_slow_queue"> | |
157 <param name="queue_specification" type="text" label="Modify parameters (optional)" | |
158 value="-l select=1:ncpus=16:mem=112gb:scratch_local=50gb -l walltime=336:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=64000000,TAREAN_CPU=15" /> | |
159 </when> | |
160 <when value="extra_long_slow_queue"> | |
161 <param name="queue_specification" type="text" label="Modify parameters (optional)" | |
162 value="-l select=1:ncpus=16:mem=112gb:scratch_local=50gb -l walltime=720:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=64000000,TAREAN_CPU=15" /> | |
163 </when> | |
164 </conditional> | |
165 | |
166 | |
167 | |
168 </inputs> | |
169 <outputs> | |
<!-- Three datasets written by the command section: the text log, the zipped result directory, and the HTML report whose supporting files go to its files_path. --> | |
170 <data format="txt" name="log" label="RepeatExplorer2 - log file" /> | |
171 <data format="zip" name="ReportArchive" label="RepeatExplorer2 - Archive with HTML report from data ${FastaFile.hid}" /> | |
172 <data format="html" name="ReportFile" label="RepeatExplorer2 - HTML report from data ${FastaFile.hid}" /> | |
173 </outputs> | |
174 | |
175 <help> | |
176 **HELP** | |
177 | |
178 RepeatExplorer2 clustering is a computational pipeline for unsupervised | |
179 identification of repeats from unassembled sequence reads. The | |
180 pipeline uses low-pass whole genome sequence reads and performs graph-based | |
181 clustering. Resulting clusters, representing all types of repeats, are then | |
182 examined to identify and classify them into repeat groups. | |
183 | |
184 **Input data** | |
185 | |
186 The analysis requires either **single** or **paired-end reads** generated | |
187 by whole genome shotgun sequencing provided as a single fasta-formatted file. | |
188 Generally, paired-end reads provide significantly better results than single | |
189 reads. Reads should be of uniform length (optimal size range is 100-200 nt) and | |
190 the number of analyzed reads should represent less than 1x genome equivalent | |
191 (genome coverage of 0.01 - 0.50 x is recommended). Reads should be | |
192 quality-filtered (recommended filtering : quality score >=10 over 95% of bases | |
193 and no Ns allowed) and only **complete read pairs** should be submitted for | |
194 analysis. When paired reads are used, input data must be in **interlaced** format | |
195 as a fasta file: | |
196 | |
197 example of interlaced input format:: | |
198 | |
199 >0001_f | |
200 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG | |
201 >0001_r | |
202 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT | |
203 >0002_f | |
204 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG | |
205 >0002_r | |
206 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC | |
207 >0003_f | |
208 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT | |
209 >0003_r | |
210 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT | |
211 ... | |
212 | |
213 | |
214 **Comparative analysis** | |
215 | |
216 For comparative analysis, sequence names must contain a code (prefix) for each group. | |
217 The prefix in sequence names must be of fixed length. | |
218 | |
219 Example of labeling two groups, where **group code length** is 2 and the codes AA and BB are used to distinguish the groups:: | |
220 | |
221 >AA0001_f | |
222 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG | |
223 >AA0001_r | |
224 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT | |
225 >AA0002_f | |
226 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG | |
227 >AA0002_r | |
228 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC | |
229 >BB0001_f | |
230 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT | |
231 >BB0001_r | |
232 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT | |
233 >BB0002_f | |
234 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT | |
235 >BB0002_r | |
236 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT | |
237 | |
238 | |
239 To prepare quality filtered and interlaced input fasta file from fastq | |
240 files, use `Preprocessing of paired-reads`__ tool. | |
241 | |
242 .. __: tool_runner?tool_id=paired_fastq_filtering | |
243 | |
244 | |
245 **Additional parameters** | |
246 | |
247 **Sample size** defines how many reads should be used in calculation. | |
248 Default setting with 500,000 reads will enable detection of high copy | |
249 repeats within several hours of computation time. For higher | |
250 sensitivity the sample size can be set higher. Since sample size affects | |
251 the memory usage, this parameter may be automatically adjusted to a lower | |
252 value during the run. Maximum sample size which can be processed depends on | |
253 the repetitiveness of analyzed genome. | |
254 | |
255 | |
256 **Select taxon and protein domain database version (REXdb)**. Classification | |
257 of transposable elements is based on the similarity to our reference database | |
258 of transposable element protein domains (**REXdb**). Standalone database for Viridiplantae species | |
259 can be obtained on `repeatexplorer.org`__. Classification | |
260 system used in REXdb is described in article `Systematic survey of plant | |
261 LTR-retrotransposons elucidates phylogenetic relationships of their | |
262 polyprotein domains and provides a reference for element classification`__. | |
263 Database for Metazoa species is still under development so use it with caution. | |
264 | |
265 .. __: http://repeatexplorer.org | |
266 .. __: https://doi.org/10.1186/s13100-018-0144-1 | |
267 | |
268 **Select parameters for protein domain search** REXdb is compared with | |
269 sequence clusters either using blastx or diamond aligner. Diamond program | |
270 is about three times faster than blastx with word size 3. | |
271 | |
272 **Similarity search options** By default sequence reads are compared using | |
273 mgblast program. Default threshold is explicitly set to 90% sequence | |
274 similarity spanning at least 55% of the read length (in the case of reads | |
275 differing in length it applies to the longer one). Additionally, sequence | |
276 overlap must be at least 55 nt. If you select the option for reads shorter | |
277 than 100 nt, the minimum overlap of 55 nt is not required. | |
278 | |
279 By default, | |
280 mgblast search uses the DUST program to filter out | |
281 low-complexity sequences. If you want | |
282 to increase sensitivity of detection of satellites with shorter monomers, | |
283 use the option with '*no masking of low complexity repeats*'. Note that omitting | |
284 DUST filtering will significantly increase running times. | |
285 | |
286 | |
287 **Automatic filtering of abundant satellite repeats** performs clustering on | |
288 a smaller dataset of sequence reads to detect abundant high confidence | |
289 satellite repeats. If such satellites are detected, sequence reads derived | |
290 from these satellites are depleted from the input dataset. This step enables more | |
291 sensitive detection of less abundant repeats as more reads can be used | |
292 in the clustering step. | |
293 | |
294 **Use custom repeat database**. This option allows users to perform similarity | |
295 comparison of identified repeats to their custom databases. The repeat class must | |
296 be encoded in FASTA headers of database entries in order to allow correct | |
297 parsing of similarity hits. Required format for custom database sequence name is: :: | |
298 | |
299 >repeatname#class/subclass | |
300 | |
301 | |
302 **Output** | |
303 | |
304 List of clusters identified as putative satellite repeats, their genomic | |
305 abundance and various cluster characteristics. | |
306 | |
307 Output includes an **HTML summary** with a table listing all analyzed | |
308 clusters. More detailed information about clusters is provided in | |
309 additional files and directories. All results are also provided as | |
310 downloadable **zip archive**. Additionally a **log file** reporting | |
311 the progress of the computational pipeline is provided. | |
312 | |
313 </help> | |
314 | |
315 </tool> |