comparison repex_full_clustering.xml @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children 394a697ffa49
comparison
equal deleted inserted replaced
-1:000000000000 0:1d1b9e1b2e2f
1 <tool id="repeatexplorer2" name="RepeatExplorer2 clustering: " >
2 <stdio> <!-- Rules applied to the job's stderr text and exit code to decide the job state -->
3 <regex match="lastdb: can't open file: NEAR" source="stderr" level="fatal" description="Version of last is too old, use ver 956 or higher\n" />
4 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
5 <regex match="error" source="stderr" level="fatal" description="Unknown error" />
6 <regex match="Warning" source="stderr" level="warning" description="Warning reported on stderr" /> <!-- warning level only; described as a warning rather than an error -->
7 <exit_code range="1:" level="fatal" description="Error" /> <!-- any exit status of 1 or higher is treated as fatal -->
8 </stdio>
9 <description>Improved version of repeat discovery and characterization using graph based sequence clustering</description>
10 <requirements> <!-- Packages the tool depends on; only python and pyrserve declare a version, the rest are unpinned -->
11 <requirement type="package" version="3.7">python</requirement>
12 <requirement type="package" version="0.9.1" >pyrserve</requirement>
13 <requirement type="package">mafft</requirement>
14 <requirement type="package">imagemagick</requirement>
15 <requirement type="package">blast</requirement>
16 <requirement type="package">diamond</requirement>
17 <requirement type="package">blast-legacy</requirement>
18 <requirement type="package">r-igraph</requirement>
19 <requirement type="package">r-data.tree</requirement>
20 <requirement type="package">r-stringr</requirement>
21 <requirement type="package">r-r2html</requirement>
22 <requirement type="package">r-hwriter</requirement>
23 <requirement type="package">r-dt</requirement>
24 <requirement type="package">r-scales</requirement>
25 <requirement type="package">r-plotrix</requirement>
26 <requirement type="package">r-png</requirement>
27 <requirement type="package">r-plyr</requirement>
28 <requirement type="package">r-dplyr</requirement>
29 <requirement type="package">r-optparse</requirement>
30 <requirement type="package">r-dbi</requirement>
31 <requirement type="package">r-rsqlite</requirement>
32 <requirement type="package">r-rserve</requirement>
33 <requirement type="package">bioconductor-biostrings</requirement>
34 </requirements>
35 <!-- Runs seqclust on the input reads with stdout/stderr captured to files, appends both captures to the log output, then zips tarean_output and copies the report files into the HTML output's files_path. Paths are single-quoted against word splitting and files_path is created with mkdir -p so an existing directory does not abort the copy chain. NOTE(review): the trailing "|| :" makes the post-processing chain always exit 0, so a failed copy step is not reported through the exit code. --> <command >
36 export PYTHONHASHSEED=0;
37 '${__tool_directory__}/seqclust' --sample ${sample} --output_dir=tarean_output --logfile='${log}' --cleanup $paired --taxon $taxon
38
39 #if $advanced_options.advanced:
40 --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering -D $advanced_options.blastx.options_blastx
41 --assembly_min $advanced_options.assembly_min_cluster_size
42
43 #if $advanced_options.comparative.options_comparative:
44 --prefix_length $advanced_options.comparative.prefix_length
45 #end if
46
47 #if $advanced_options.custom_library.options_custom_library:
48 -d '${advanced_options.custom_library.library}' extra_database
49 #end if
50
51 #if $advanced_options.options.options:
52 -opt $advanced_options.options.options
53 #end if
54 #end if
55 '${FastaFile}' >stdout.log 2> stderr.log ;
56 echo "STDOUT CONTENT:" >> '${log}' ;
57 cat stdout.log >> '${log}' ;
58 echo "STDERR CONTENT:" >> '${log}';
59 cat stderr.log >> '${log}' &amp;&amp;
60 cd tarean_output &amp;&amp;
61 zip -r '${ReportArchive}.zip' * &amp;&amp;
62 mv '${ReportArchive}.zip' '${ReportArchive}' &amp;&amp;
63 cp index.html '${ReportFile}' &amp;&amp;
64 mkdir -p '${ReportFile.files_path}' &amp;&amp;
65 cp -r --parents libdir '${ReportFile.files_path}' &amp;&amp;
66 cp -r --parents seqclust/clustering/superclusters '${ReportFile.files_path}' &amp;&amp;
67 cp -r --parents seqclust/clustering/clusters '${ReportFile.files_path}' &amp;&amp;
68 cp seqclust/clustering/hitsort.cls '${ReportFile.files_path}/seqclust/clustering/hitsort.cls' &amp;&amp;
69 cp *.png '${ReportFile.files_path}/' &amp;&amp;
70 cp *.csv '${ReportFile.files_path}/' &amp;&amp;
71 cp *.html '${ReportFile.files_path}/' &amp;&amp;
72 cp *.css '${ReportFile.files_path}/' &amp;&amp;
73 cp *.fasta '${ReportFile.files_path}/' 2>>'${log}' &amp;&amp; rm -r ../tarean_output || :
74
75 </command>
76 <inputs>
77 <param name="FastaFile" label="NGS reads" type="data" format="fasta"
78 help="Input file must contain fasta-formatted NGS reads. If paired end reads are used, reads must be interlaced and all pairs must be complete. Example of input data format is provided in the help below. "/> <!-- passed to seqclust as the final positional argument -->
79 <param name="paired" type="boolean" truevalue="--paired" falsevalue="" checked="True" label="Paired-end reads" help="Check if you are using paired-end reads and the input contains both mates of every pair, with left mates alternating with their right mates" /> <!-- the true/false value is inserted verbatim into the command line -->
80
81 <param name="sample" label="Sample size" type="integer" value="500000" min="10000"/>
82 <param name="taxon" label="Select taxon and protein domain database version (REXdb)" type="select" help="Reference database of transposable element protein domains - REXdb - is used for annotation of repeats">
83 <option value="VIRIDIPLANTAE3.0" selected="true">Viridiplantae version 3.0 </option>
84 <option value="VIRIDIPLANTAE2.2" >Viridiplantae version 2.2</option>
85 <option value="METAZOA3.0" >Metazoa version 3.0</option>
86 <option value="METAZOA2.0" >Metazoa version 2.0</option>
87 <!-- Modify setting in config.py accordingly. Exactly one option is pre-selected because this is a single-value select. -->
88 </param>
89
90 <conditional name="advanced_options">
91 <param name="advanced" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Advanced options" />
92 <when value="false">
93 <!-- pass -->
94 </when>
95 <when value="true">
96 <conditional name="comparative"> <!-- when enabled, the command adds the prefix_length argument -->
97 <param name="options_comparative" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Perform comparative analysis" help="Use this option when you want to compare sequences from multiple groups"/>
98 <when value="false">
99 <!-- no extra parameters when comparative analysis is off -->
100 </when>
101 <when value="true">
102 <param name="prefix_length" label="Group code length" type="integer" value="3" min="1" max="10" help="For comparative analysis, sequences from individual groups are distinguished by a sample code which must be used as a prefix of the sequence name. See example below."/>
103 </when>
104 </conditional>
105
106 <conditional name="blastx"> <!-- Protein domain search setting; the command passes the chosen value to seqclust after -D. NOTE(review): no when branches are declared for this conditional. -->
107 <param name="options_blastx" type="select" label="Select parameters for protein domain search">
108 <option value="BLASTX_W2" selected="false">blastx with word size 2 (the most sensitive, slowest)</option>
109 <option value="BLASTX_W3" selected="true">blastx with word size 3 (default)</option>
110 <option value="DIAMOND" selected="false">diamond program (the least sensitive, fastest)</option>
111 </param>
112 </conditional>
113
114 <conditional name="options"> <!-- Read similarity search preset; the command passes the chosen value to seqclust after -opt. NOTE(review): no when branches are declared for this conditional. -->
115 <param name="options" type="select" label="Similarity search options" help="Different similarity search parameters are used depending on the used input data to adjust search to differences in length and error rate">
116 <option value="ILLUMINA" selected="true">Illumina reads, read length 100nt or more </option>
117 <option value="ILLUMINA_SHORT" selected="false">Illumina reads, shorter than 100nt (Do not use reads shorter than 50nt!) </option>
118 <option value="ILLUMINA_DUST_OFF" selected="false">Illumina reads, no masking of low complexity repeats </option>
119 <option value="OXFORD_NANOPORE" selected="false">
120 Pseudo short reads simulated from Oxford Nanopore data (experimental feature)
121 </option>
122 </param>
123 </conditional>
124
125 <conditional name="custom_library"> <!-- when enabled, the command passes the library to seqclust after -d, followed by the literal name extra_database -->
126 <param name="options_custom_library" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use custom repeat database"/>
127 <when value="false">
128 <!-- no extra parameters when no custom library is used -->
129 </when>
130 <when value="true">
131 <param name="library" format="fasta" type="data" label="Custom library of repeats" help="Library of repeats as DNA sequences in fasta format. The required format for IDs in a custom library is : '>repeatname#class/subclass'"/>
132 </when>
133 </conditional>
134 <param name="size_threshold" label="Cluster size threshold for detailed analysis" type="float" value="0.01" min="0.0001" max="100" help="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with fewer than 20 reads are not considered at all."/> <!-- passed to seqclust after the mincl argument -->
135 <param name="automatic_filtering" label="Perform automatic filtering of abundant satellite repeats" help="Automatic filtering tries to identify the most abundant tandem repeats and partially remove such sequences from the analysis. Removal of abundant tandem repeats can make it possible to analyze a higher proportion of other, less abundant repeats." type="boolean" truevalue="--automatic_filtering" falsevalue="" checked="false"/>
136 <param name="keep_names" label="Keep original sequences names" type="boolean" truevalue="--keep_names" falsevalue="" checked="false" help="By default sequences are relabeled using integers. If you want to keep original names, use this option."/>
137 <param name="assembly_min_cluster_size" type="integer" label="min cluster size for assembly" value="5" min="2" max="100"/>
138 </when>
139 </conditional>
140
141 </inputs>
142 <outputs> <!-- Datasets filled in by the command: the run log, the zipped tarean_output directory, and the HTML report copied from index.html -->
143 <data name="log" format="txt" label="RepeatExplorer2 - log file"/>
144 <data name="ReportArchive" format="zip" label="RepeatExplorer2 - Archive with HTML report from data ${FastaFile.hid}"/>
145 <data name="ReportFile" format="html" label="RepeatExplorer2 - HTML report from data ${FastaFile.hid}"/>
146 </outputs>
147
148 <help>
149 **HELP**
150
151 RepeatExplorer2 clustering is a computational pipeline for unsupervised
152 identification of repeats from unassembled sequence reads. The
153 pipeline uses low-pass whole genome sequence reads and performs graph-based
154 clustering. Resulting clusters, representing all types of repeats, are then
155 examined and classified into repeat groups.
156
157 **Input data**
158
159 The analysis requires either **single** or **paired-end reads** generated
160 by whole genome shotgun sequencing provided as a single fasta-formatted file.
161 Generally, paired-end reads provide significantly better results than single
162 reads. Reads should be of uniform length (optimal size range is 100-200 nt) and
163 the number of analyzed reads should represent less than 1x genome equivalent
164 (genome coverage of 0.01 - 0.50 x is recommended). Reads should be
165 quality-filtered (recommended filtering : quality score >=10 over 95% of bases
166 and no Ns allowed) and only **complete read pairs** should be submitted for
167 analysis. When paired reads are used, input data must be in **interlaced** format
168 as a fasta file:
169
170 example of interlaced input format::
171
172 >0001_f
173 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
174 >0001_r
175 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
176 >0002_f
177 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
178 >0002_r
179 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
180 >0003_f
181 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
182 >0003_r
183 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
184 ...
185
186
187 **Comparative analysis**
188
189 For comparative analysis sequence names must contain code (prefix) for each group.
190 Prefix in sequences names must be of fixed length.
191
192 Example of labeling two groups where the **group code length** is 2 and the prefixes AA and BB are used to distinguish the groups ::
193
194 >AA0001_f
195 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
196 >AA0001_r
197 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
198 >AA0002_f
199 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
200 >AA0002_r
201 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
202 >BB0001_f
203 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
204 >BB0001_r
205 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
206 >BB0002_f
207 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
208 >BB0002_r
209 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
210
211
212 To prepare quality filtered and interlaced input fasta file from fastq
213 files, use `Preprocessing of paired-reads`__ tool.
214
215 .. __: tool_runner?tool_id=paired_fastq_filtering
216
217
218 **Additional parameters**
219
220 **Sample size** defines how many reads should be used in calculation.
221 Default setting with 500,000 reads will enable detection of high copy
222 repeats within several hours of computation time. For higher
223 sensitivity the sample size can be set higher. Since sample size affects
224 the memory usage, this parameter may be automatically adjusted to lower
225 value during the run. Maximum sample size which can be processed depends on
226 the repetitiveness of analyzed genome.
227
228
229 **Select taxon and protein domain database version (REXdb)**. Classification
230 of transposable elements is based on the similarity to our reference database
231 of transposable element protein domains (**REXdb**). Standalone database for Viridiplantae species
232 can be obtained on `repeatexplorer.org`__. Classification
233 system used in REXdb is described in article `Systematic survey of plant
234 LTR-retrotransposons elucidates phylogenetic relationships of their
235 polyprotein domains and provides a reference for element classification`__.
236 Database for Metazoa species is still under development so use it with caution.
237
238 .. __: http://repeatexplorer.org
239 .. __: https://doi.org/10.1186/s13100-018-0144-1
240
241 **Select parameters for protein domain search** REXdb is compared with
242 sequence clusters using either the blastx or the diamond aligner. The diamond program
243 is about three times faster than blastx with word size 3.
244
245 **Similarity search options** By default sequence reads are compared using
246 mgblast program. Default threshold is explicitly set to 90% sequence
247 similarity spanning at least 55% of the read length (in the case of reads
248 differing in length it applies to the longer one). Additionally, sequence
249 overlap must be at least 55 nt. If you select option for shorter reads
250 than 100 nt, minimum overlap 55 nt is not required.
251
252 By default,
253 mgblast search uses the DUST program to filter out
254 low-complexity sequences. If you want
255 to increase sensitivity of detection of satellites with shorter monomers,
256 use the option with '*no masking of low complexity repeats*'. Note that omitting
257 DUST filtering will significantly increase running times.
258
259
260 **Automatic filtering of abundant satellite repeats** performs clustering on
261 a smaller dataset of sequence reads to detect abundant high confidence
262 satellite repeats. If such satellites are detected, sequence reads derived
263 from these satellites are depleted from the input dataset. This step enables more
264 sensitive detection of less abundant repeats as more reads can be used
265 in the clustering step.
266
267 **Use custom repeat database**. This option allows users to perform similarity
268 comparison of identified repeats to their custom databases. The repeat class must
269 be encoded in FASTA headers of database entries in order to allow correct
270 parsing of similarity hits. Required format for custom database sequence name is: ::
271
272 >repeatname#class/subclass
273
274
275 **Output**
276
277 List of clusters identified as putative satellite repeats, their genomic
278 abundance and various cluster characteristics.
279
280 Output includes a **HTML summary** with table listing of all analyzed
281 clusters. More detailed information about clusters is provided in
282 additional files and directories. All results are also provided as
283 downloadable **zip archive**. Additionally a **log file** reporting
284 the progress of the computational pipeline is provided.
285
286 </help>
287
288 </tool>