comparison repex_full_clustering.xml @ 0:6eec21828dd4 draft default tip

planemo upload for repository https://github.com/galaxy-genome-annotation/galaxy-tools/tree/master/tools/repeatexplorer2 commit 3407a4e6a60ff89a0ab5eab87ab94b0d9a209500
author gga
date Thu, 02 Nov 2023 16:20:35 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6eec21828dd4
1 <tool id="repeatexplorer_clustering" name="RepeatExplorer (clustering)" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>repeat discovery and characterization using graph-based sequence clustering</description>
3 <macros>
4 <import>macros.xml</import> <!-- NOTE(review): the @TOOL_VERSION@, @VERSION_SUFFIX@ and @PROFILE@ tokens and the creator/requirements/citations macros are presumably defined in macros.xml; confirm -->
5 </macros>
6 <expand macro="creator"/>
7 <expand macro="requirements"/>
8 <command><![CDATA[
9 ## convert the memory allotment from MB to kB (falls back to 8192 MB when GALAXY_MEMORY_MB is unset)
10 export GALAXY_MEMORY_KB=\$((\${GALAXY_MEMORY_MB:-8192}*1024))
11 &&
12 ## fixed hash seed (disables Python hash randomization), presumably for reproducible runs
13 export PYTHONHASHSEED=0
14 &&
15
16 ## output will go here
17 mkdir -p '${reportfile.extra_files_path}'
18 &&
19 ## the paired flag is emitted unquoted: it renders as "--paired" or as nothing, whereas a quoted empty value would reach seqclust as an empty argument
20 /repex_tarean/seqclust
21 --cpu \${GALAXY_SLOTS:-1}
22 --max_memory \${GALAXY_MEMORY_KB}
23 ${paired}
24 #if $sample:
25 --sample '${sample}'
26 #end if
27 --taxon '${taxon}'
28 --output_dir='${reportfile.extra_files_path}'
29 #if $advanced.mincl:
30 --mincl '${advanced.mincl}'
31 #end if
32 --assembly_min '${advanced.assembly_min}'
33 #if $advanced.keep_names:
34 --keep_names
35 #end if
36 '${fastafile}'
37 &&
38
39 ## copy the HTML index into the working directory, where the reportfile output collects it (from_work_dir)
40 cp '${reportfile.extra_files_path}/index.html' ./index.html
41
42 ]]></command>
43 <inputs>
44 <param name="fastafile" label="NGS reads" type="data" format="fasta" help="Input file must contain FASTA-formatted NGS reads. Illumina paired-end reads are recommended."/>
45 <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" checked="true" label="Paired-end reads" help="If paired-end reads are used, they must be interleaved and all pairs must be complete. Example of the correct format is provided in the help below."/>
46 <param argument="--sample" type="integer" min="2" optional="true" label="Subsample reads (number)" help="Use an integer &gt; 1 to select a specific number of reads to use. Leave this field blank to use the entire dataset."/>
47 <param argument="--taxon" label="Select taxon and protein domain database version (REXdb)" type="select" help="Reference database of transposable element protein domains - REXdb - is used for annotation of repeats">
48 <option value="VIRIDIPLANTAE3.0" selected="true">Viridiplantae version 3.0</option> <!-- the single default; a single-value select must not preselect more than one option -->
49 <option value="VIRIDIPLANTAE2.2">Viridiplantae version 2.2</option>
50 <option value="METAZOA3.0">Metazoa version 3.0</option>
51 <option value="METAZOA2.0">Metazoa version 2.0</option>
52 </param>
53 <section name="advanced" title="Advanced options" expanded="false"> <!-- referenced in the command template as $advanced.* -->
54 <param argument="--mincl" label="Cluster size threshold for detailed analysis" type="float" value="" min="0.0001" max="100" optional="true" help="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with less than 20 reads are not considered."/>
55 <param argument="--assembly_min" type="integer" label="Minimal cluster size for assembly" value="5" min="2" max="100"/>
56 <param argument="--keep_names" label="Keep original read names" type="boolean" checked="false" help="By default, reads are renamed using integers. Use this option to keep original names."/>
57 </section>
58 </inputs>
59 <outputs>
60 <data format="html" name="reportfile" label="RepeatExplorer - HTML report on ${on_string}" from_work_dir="index.html"/> <!-- index.html is the file the command copies into the working directory -->
61 </outputs>
62 <tests>
63 <!-- test1: paired reads with the Viridiplantae 3.0 database, all other options left at their defaults -->
64 <test expect_num_outputs="1">
65 <param name="taxon" value="VIRIDIPLANTAE3.0"/>
66 <param name="paired" value="True"/>
67 <param name="fastafile" ftype="fasta.gz" value="LAS_paired_10k.fa.gz"/>
68 <output name="reportfile">
69 <assert_contents>
70 <has_text text="Clustering summary"/>
71 </assert_contents>
72 </output>
73 </test>
74 <!-- test2: same input, subsampled to 5000 reads -->
75 <test expect_num_outputs="1">
76 <param name="taxon" value="VIRIDIPLANTAE3.0"/>
77 <param name="sample" value="5000"/>
78 <param name="paired" value="True"/>
79 <param name="fastafile" ftype="fasta.gz" value="LAS_paired_10k.fa.gz"/>
80 <output name="reportfile">
81 <assert_contents>
82 <has_text text="Clustering summary"/>
83 </assert_contents>
84 </output>
85 </test>
86 <!-- test3: advanced section options (mincl and keep_names) -->
87 <test expect_num_outputs="1">
88 <param name="keep_names" value="True"/>
89 <param name="mincl" value="0.01"/>
90 <param name="taxon" value="VIRIDIPLANTAE3.0"/>
91 <param name="paired" value="True"/>
92 <param name="fastafile" ftype="fasta.gz" value="LAS_paired_10k.fa.gz"/>
93 <output name="reportfile">
94 <assert_contents>
95 <has_text text="Clustering summary"/>
96 </assert_contents>
97 </output>
98 </test>
99 </tests>
100 <help><![CDATA[
101 **HELP**
102
103 RepeatExplorer2 clustering is a computational pipeline for unsupervised
104 identification of repeats from unassembled sequence reads. The
105 pipeline uses low-pass whole genome sequence reads and performs graph-based
106 clustering. The resulting clusters, representing all types of repeats, are then
107 examined to identify the repeats and classify them into repeat groups.
108
109 **Input data**
110
111 The analysis requires either **single** or **paired-end reads** generated
112 by whole genome shotgun sequencing provided as a single fasta-formatted file.
113 Generally, paired-end reads provide significantly better results than single
114 reads. Reads should be of uniform length (optimal size range is 100-200 nt) and
115 the number of analyzed reads should represent less than 1x genome equivalent
116 (genome coverage of 0.01 - 0.50 x is recommended). Reads should be
117 quality-filtered (recommended filtering : quality score >=10 over 95% of bases
118 and no Ns allowed) and only **complete read pairs** should be submitted for
119 analysis. When paired reads are used, input data must be in **interlaced** format
120 as a FASTA file:
121
122 example of interlaced input format::
123
124 >0001_f
125 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
126 >0001_r
127 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
128 >0002_f
129 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
130 >0002_r
131 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
132 >0003_f
133 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
134 >0003_r
135 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
136 ...
137
138
139 **Comparative analysis**
140
141 For comparative analysis, sequence names must contain a code (prefix) for each group.
142 The prefix in sequence names must be of fixed length.
143
144 Example of labeling two groups, where the **group code length** is 2 and the codes AA and BB are used to distinguish the groups ::
145
146 >AA0001_f
147 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
148 >AA0001_r
149 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
150 >AA0002_f
151 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
152 >AA0002_r
153 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
154 >BB0001_f
155 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
156 >BB0001_r
157 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
158 >BB0002_f
159 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
160 >BB0002_r
161 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
162
163
164 To prepare a quality-filtered and interlaced input FASTA file from FASTQ
165 files, use the `Preprocessing of paired-reads`__ tool.
166
167 .. __: tool_runner?tool_id=paired_fastq_filtering
168
169
170 **Additional parameters**
171
172 **Sample size** defines how many reads should be used in the calculation.
173 The default setting of 500,000 reads will enable detection of high-copy
174 repeats within several hours of computation time. For higher
175 sensitivity, the sample size can be set higher. Since sample size affects
176 memory usage, this parameter may be automatically adjusted to a lower
177 value during the run. The maximum sample size that can be processed depends on
178 the repetitiveness of the analyzed genome.
179
180
181 **Select taxon and protein domain database version (REXdb)**. Classification
182 of transposable elements is based on the similarity to our reference database
183 of transposable element protein domains (**REXdb**). Standalone database for Viridiplantae species
184 can be obtained on `repeatexplorer.org`__. Classification
185 system used in REXdb is described in article `Systematic survey of plant
186 LTR-retrotransposons elucidates phylogenetic relationships of their
187 polyprotein domains and provides a reference for element classification`__.
188 The database for Metazoa species is still under development, so use it with caution.
189
190 .. __: http://repeatexplorer.org
191 .. __: https://doi.org/10.1186/s13100-018-0144-1
192
193 **Select parameters for protein domain search** REXdb is compared with
194 sequence clusters using either the blastx or the diamond aligner. The diamond program
195 is about three times faster than blastx with word size 3.
196
197 **Similarity search options** By default sequence reads are compared using
198 mgblast program. Default threshold is explicitly set to 90% sequence
199 similarity spanning at least 55% of the read length (in the case of reads
200 differing in length it applies to the longer one). Additionally, sequence
201 overlap must be at least 55 nt. If you select option for shorter reads
202 than 100 nt, minimum overlap 55 nt is not required.
203
204 By default,
205 the mgblast search uses the DUST program to filter out
206 low-complexity sequences. If you want
207 to increase the sensitivity of detection of satellites with shorter monomers,
208 use the option '*no masking of low complexity repeats*'. Note that omitting
209 DUST filtering will significantly increase running times.
210
211
212 **Automatic filtering of abundant satellite repeats** performs clustering on
213 a smaller dataset of sequence reads to detect abundant high-confidence
214 satellite repeats. If such satellites are detected, sequence reads derived
215 from these satellites are depleted from the input dataset. This step enables more
216 sensitive detection of less abundant repeats, as more reads can be used
217 in the clustering step.
218
219 **Use custom repeat database**. This option allows users to perform similarity
220 comparison of identified repeats to their custom databases. The repeat class must
221 be encoded in FASTA headers of database entries in order to allow correct
222 parsing of similarity hits. Required format for custom database sequence name is: ::
223
224 >repeatname#class/subclass
225
226
227 **Output**
228
229 List of clusters identified as putative satellite repeats, their genomic
230 abundance and various cluster characteristics.
231
232 Output includes a **HTML summary** with table listing of all analyzed
233 clusters. More detailed information about clusters is provided in
234 additional files and directories. All results are also provided as
235 downloadable **zip archive**. Additionally a **log file** reporting
236 the progress of the computational pipeline is provided.
237
238 ]]></help>
239 <expand macro="citations"/>
240 </tool>