comparison repex_tarean.xml @ 0:e2b8e71b85b9 draft

Uploaded
author petr-novak
date Wed, 08 Jan 2020 06:25:59 -0500
parents
children 968f0867acc5
comparison
equal deleted inserted replaced
-1:000000000000 0:e2b8e71b85b9
1 <tool id="tarean" name="Tandem Repeat Analyzer" version="2.3.7" >
<!-- Job failure detection rules: markers searched for on the job's stderr, plus an exit code rule. -->
<!-- NOTE(review): the command section redirects the stderr of seqclust to stderr.log and afterwards runs
     stderr_filter.py on that file, so these regexes presumably see only what that filter (and the later
     shell commands) write to stderr. Confirm against stderr_filter.py. -->
2 <stdio>
<!-- The strings "Traceback" or "error" on stderr mark the job as failed. -->
3 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
4 <regex match="error" source="stderr" level="fatal" description="Unknown error" />
<!-- The string "warning" on stderr is reported at warning level only. -->
5 <regex match="warning" source="stderr" level="warning" description="Unknown warning" />
<!-- Any exit status of 1 or higher is fatal. -->
6 <exit_code range="1:" level="fatal" description="Error" />
7 </stdio>
8 <description>Identification of genomic tandem repeats from NGS data</description>
<!-- Runtime dependencies of the tool. Only repex_tarean and pyrserve carry a version attribute;
     every other package is listed without a version. -->
9 <requirements>
10 <requirement type="package">imagemagick</requirement>
11 <requirement type="package">mafft</requirement>
12 <requirement type="package">blast</requirement>
13 <requirement type="package">diamond</requirement>
14 <requirement type="package">blast-legacy</requirement>
<!-- NOTE(review): the r- and bioconductor- prefixes look like conda package names for R libraries;
     confirm which dependency resolver the target Galaxy server uses. -->
15 <requirement type="package">r-igraph</requirement>
16 <requirement type="package">r-data.tree</requirement>
17 <requirement type="package">r-stringr</requirement>
18 <requirement type="package">r-r2html</requirement>
19 <requirement type="package">r-hwriter</requirement>
20 <requirement type="package">r-dt</requirement>
21 <requirement type="package">r-scales</requirement>
22 <requirement type="package">r-plotrix</requirement>
23 <requirement type="package">r-png</requirement>
24 <requirement type="package">r-plyr</requirement>
25 <requirement type="package">r-dplyr</requirement>
26 <requirement type="package">r-optparse</requirement>
27 <requirement type="package">r-dbi</requirement>
28 <requirement type="package">r-rsqlite</requirement>
29 <requirement type="package">r-rserve</requirement>
30 <requirement type="package">bioconductor-biostrings</requirement>
<!-- The pipeline package itself; its version equals the version attribute of this tool (2.3.7). -->
31 <requirement type="package" version="2.3.7">repex_tarean</requirement>
<!-- Environment variables the tool expects to be defined. The command section uses REPEX as the directory
     holding seqclust and stderr_filter.py; REPEX_VERSION is not referenced by the command in this file. -->
32 <requirement type="set_environment">REPEX</requirement>
33 <requirement type="set_environment">REPEX_VERSION</requirement>
34 <requirement type="package" version="0.9.1">pyrserve</requirement>
35 </requirements>
<!--
  Runs seqclust in TAREAN mode on the interlaced paired-end FASTA input, then packages the results.

  Error detection: no detect_errors attribute is set on purpose. Failure detection is defined by the
  stdio section of this tool (stderr regexes plus exit code rule); on Galaxy releases where detect_errors
  takes precedence over stdio, setting it here would switch those regexes off.

  Flow:
    1. seqclust is run with stdout and stderr captured to stdout.log / stderr.log; its exit status is
       not checked (the statement ends with ";").
    2. Both captured streams are appended to the log dataset.
    3. stderr_filter.py is run on stderr.log.
    4. Inside tarean_output the whole directory is zipped into ReportArchive, index.html becomes
       ReportFile, and the cluster directories plus png/csv/html/css/fasta files are copied into the
       extra files directory of ReportFile.
    5. The trailing "|| :" makes steps 3 and 4 best-effort: whatever fails there, the script still
       exits with status 0.

  Dataset paths are single-quoted so that unusual characters in a path cannot split shell arguments.
  mkdir uses -p so that an already existing extra files directory does not abort the copy chain.
-->
36 <command>
37 export PYTHONHASHSEED=0;
38 \${REPEX}/seqclust --paired --sample ${sample} --output_dir=tarean_output --logfile='${log}' --cleanup --tarean_mode
39 #if $advanced_options.advanced:
40 --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering -M $advanced_options.merging
41 #if $advanced_options.custom_library.options_custom_library :
42 -d '$advanced_options.custom_library.library' extra_database
43 #end if
44 #if $advanced_options.options.options:
45 -opt $advanced_options.options.options
46 #end if
47 #else:
48 -M 0.2
49
50 #end if
51 '${FastaFile}' >stdout.log 2> stderr.log ;
52 echo "STDOUT CONTENT:" >> '${log}' ;
53 cat stdout.log >> '${log}' ;
54 echo "STDERR CONTENT:" >> '${log}' ;
55 cat stderr.log >> '${log}' &amp;&amp;
56 \${REPEX}/stderr_filter.py stderr.log &amp;&amp;
57 cd tarean_output &amp;&amp;
58 zip -r '${ReportArchive}.zip' * &amp;&amp;
59 mv '${ReportArchive}.zip' '${ReportArchive}' &amp;&amp;
60 cp index.html '${ReportFile}' &amp;&amp;
61 mkdir -p '${ReportFile.files_path}' &amp;&amp;
62 cp -r --parents libdir '${ReportFile.files_path}' &amp;&amp;
63 cp -r --parents seqclust/clustering/superclusters '${ReportFile.files_path}' &amp;&amp;
64 cp -r --parents seqclust/clustering/clusters '${ReportFile.files_path}' &amp;&amp;
65 cp seqclust/clustering/hitsort.cls '${ReportFile.files_path}/seqclust/clustering/hitsort.cls' &amp;&amp;
66 cp *.png '${ReportFile.files_path}/' &amp;&amp;
67 cp *.csv '${ReportFile.files_path}/' &amp;&amp;
68 cp *.html '${ReportFile.files_path}/' &amp;&amp;
69 cp *.css '${ReportFile.files_path}/' &amp;&amp;
70 cp *.fasta '${ReportFile.files_path}/' 2>> '${log}' &amp;&amp; rm -r ../tarean_output || :
71
72
73 </command>
74
75 <inputs>
<!-- Interlaced paired-end reads in FASTA format; the command passes this dataset to seqclust as its
     positional input argument. -->
76 <param name="FastaFile" label="paired-end NGS reads" type="data" format="fasta"
77 help="Input file must contain fasta-formatted interlaced read pairs from paired-end sequencing. All pairs must be complete. Example of input data format is provided in the help below."/>
<!-- Number of reads handed to seqclust through its sample option; defaults to 500000 and may not be
     set below 10000. -->
78 <param name="sample" label="Sample size" type="integer" value="500000" min="10000"/>
79
<!-- Optional tuning parameters. When "advanced" is off, the command runs seqclust with -M 0.2 and none
     of the options defined below. -->
80 <conditional name="advanced_options">
81 <param name="advanced" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Advanced options" />
82 <when value="false">
83 <!-- pass -->
84 </when>
85 <when value="true">
<!-- Value given to -M: 0.2 when merging is enabled, 0 when it is disabled. -->
86 <param name="merging" type="boolean" truevalue="0.2" falsevalue="0" checked="true" label="Perform cluster merging" help="By default, clusters connected through paired-end reads are merged"/>
<!-- Optional user-supplied FASTA database; when enabled it is passed to seqclust with -d. -->
87 <conditional name="custom_library">
88 <param name="options_custom_library" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Use custom repeat database"/>
89 <when value="false">
90 <!-- do nothing here -->
91 </when>
92 <when value="true">
93 <param name="library" format="fasta" type="data" label="Use custom repeat database" help="Perform additional similarity search to user-provided repeat database. The database should contain FASTA-formatted DNA sequences with headers (sequence names) in the format: '>repeatname#class/subclass'"/>
94 </when>
95 </conditional>
<!-- Passed to seqclust as the mincl value (percentage of input reads). -->
96 <param name="size_threshold" label="Cluster size threshold for detailed analysis" type="float" value="0.01" min="0.0001" max="100" help="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed, cluster with less than 20 reads are not considered at all."/>
<!-- Each of the next two booleans expands to its command-line flag when checked and to an empty
     string otherwise. -->
97 <param name="automatic_filtering" label="Perform automatic filtering of abundant satellite repeats" type="boolean" truevalue="--automatic_filtering" falsevalue="" checked="false"/>
98 <param name="keep_names" label="Keep original sequences names" type="boolean" truevalue="--keep_names" falsevalue="" checked="false" help="By default sequence are relabeled using integers. If you want to keep original names, use this option."/>
<!-- Similarity search preset passed to seqclust with -opt. The conditional wrapper is kept so that the
     parameter path used by the command (advanced_options.options.options) stays unchanged; every select
     value gets an explicit, empty when block. -->
99 <conditional name="options">
100 <param name="options" type="select" label="Similarity search options" help="Different similarity search parameters are used depending on the used input data to adjust search to differences in length and error rate">
101 <option value="ILLUMINA" selected="true">Illumina reads, read length 100nt or more </option>
102 <option value="ILLUMINA_SHORT" selected="false">Illumina reads, shorter than 100nt (Do not use reads shorter than 50nt!) </option>
103 <option value="ILLUMINA_DUST_OFF" selected="false">Illumina reads, no masking of low complexity repeats </option>
104 </param>
<when value="ILLUMINA" />
<when value="ILLUMINA_SHORT" />
<when value="ILLUMINA_DUST_OFF" />
105 </conditional>
106 </when>
107 </conditional>
108
<!-- Queue preset selector. Each preset exposes an editable text field pre-filled with a resource request
     string: CPU count, memory, local scratch size, walltime, target queue and the TAREAN_MAX_MEM /
     TAREAN_CPU variables. The three presets differ in CPU/memory (first preset) and walltime.
     NOTE(review): queue_specification is not referenced by the command section of this tool; presumably
     it is consumed by the job configuration of the hosting Galaxy server. Confirm before changing the
     parameter names or the default strings. -->
109 <conditional name="queue_definition">
110 <param name="queue_select" type="select" label="Select queue">
111 <option value="basic_fast_queue">basic &amp; fast</option>
112 <option value="long_slow_queue">long &amp; slow</option>
113 <option value="extra_long_slow_queue">extra long &amp; slow</option>
114 </param>
<!-- 10 CPUs, 32 GB memory, 48 h walltime. -->
115 <when value="basic_fast_queue">
116 <param name="queue_specification" type="text" label="Modify parameters (optional)"
117 value="-l select=1:ncpus=10:mem=32gb:scratch_local=50gb -l walltime=48:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=4000000,TAREAN_CPU=4" />
118 </when>
119
<!-- 16 CPUs, 112 GB memory, 336 h walltime. -->
120 <when value="long_slow_queue">
121 <param name="queue_specification" type="text" label="Modify parameters (optional)"
122 value="-l select=1:ncpus=16:mem=112gb:scratch_local=50gb -l walltime=336:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=64000000,TAREAN_CPU=15" />
123 </when>
<!-- Same resources as the long queue, with a 720 h walltime. -->
124 <when value="extra_long_slow_queue">
125 <param name="queue_specification" type="text" label="Modify parameters (optional)"
126 value="-l select=1:ncpus=16:mem=112gb:scratch_local=50gb -l walltime=720:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=64000000,TAREAN_CPU=15" />
127 </when>
128 </conditional>
129
130
131
132 </inputs>
<!-- Result datasets written by the command section. -->
133 <outputs>
<!-- Given to seqclust as its logfile; the captured stdout and stderr of the run are appended to it. -->
134 <data name="log" format="txt" label="TAREAN log file"/>
<!-- Zip archive of the complete tarean_output directory. -->
135 <data name="ReportArchive" format="zip" label="TAREAN Archive with HTML report from data ${FastaFile.hid}"/>
<!-- Copy of index.html from tarean_output; its extra files directory receives libdir, the cluster and
     supercluster directories, hitsort.cls and the png/csv/html/css/fasta files. -->
136 <data name="ReportFile" format="html" label="TAREAN HTML report from data ${FastaFile.hid}"/>
137 </outputs>
138
139 <help>
140 **HELP**
141
142 TAREAN - TAndem REpeat ANalyzer is a computational pipeline for
143 **unsupervised identification of satellite repeats** from unassembled
144 sequence reads. The pipeline uses low-pass paired-end whole genome
145 sequence reads and performs graph-based clustering. The resulting
146 clusters, representing all types of repeats present in the genome, are
147 then examined to identify those containing circular structures indicative
148 of tandem repeats. A poster summarizing TAREAN principles and
149 implementation can be found `here.`__
150
151
152 .. __: http://w3lamc.umbr.cas.cz/lamc/?page_id=312
153
154 **Input data**
155
156
157 The analysis requires **paired-end reads** generated by whole genome
158 shotgun sequencing. The data should be provided as a single input file in
159 fasta format with the reads interlaced (see example below). All the pairs
160 must be complete, i.e. both "forward" and "reverse" sequence reads must be
161 present. The reads should all be trimmed to the same length. The optimal
162 size range is between 100 and 200 nucleotides. The number of reads to be
163 analyzed should not exceed 1x coverage of the genome. Genome coverage
164 between 0.01 and 0.5x is recommended. The reads should be filtered for
165 quality. The recommended quality filtering is as follows: each read should
166 have a quality score >=10 for 95% of the bases, i.e. if your reads are 100
167 base pairs long, then a read only passes this quality threshold if 95
168 bases have a quality of 10 or higher. Additionally, any reads containing
169 indeterminate base pairs (indicated as N in the reads) should be removed.
170 Finally, if either one of the reads in a pair fails to meet the
171 aforementioned thresholds, **both** sequences should be removed.
172 Example of interlaced input format::
173
174 >0001_f
175 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
176 >0001_r
177 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
178 >0002_f
179 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
180 >0002_r
181 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
182 >0003_f
183 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
184 >0003_r
185 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
186 ...
187
188
189 To perform the quality filtering on your FASTQ-formatted data as described
190 above, and to interlace your paired-end sequence reads,
191 please use the `Preprocessing of paired-reads`__ tool.
192
193 .. __: tool_runner?tool_id=paired_fastq_filtering
194
195
196 **Additional parameters**
197
198 **Sample size** defines how many reads will be used during the computation.
199 The default setting of 500,000 reads will enable detection of high copy
200 number satellites within several hours. For higher
201 sensitivity the sample size can be increased. Since the sample size affects
202 memory usage, this parameter may be automatically adjusted to a lower value
203 during the run. The maximum sample size which can be processed depends on the
204 repetitiveness of the analyzed genome. This significantly limits the number of reads
205 that can be analyzed with the TAREAN pipeline.
206
207 **Perform cluster merging**. Families of repetitive elements are
208 frequently split into multiple clusters rather than being represented as a
209 single one. If you do not want to merge clusters based on the presence
210 of broken read pairs, disable this option.
211
212 **Use custom repeat database**. This option allows users to perform similarity
213 comparison of identified repeats to their custom databases. The repeat class should
214 be encoded in FASTA headers of database entries in order to allow correct
215 parsing of similarity hits.
216
217 **Similarity search options**. By default, sequence reads are compared using
218 the mgblast program. The default threshold is explicitly set to 90% sequence
219 similarity spanning at least 55% of the read length (in the case of reads
220 differing in length, it applies to the longer one). Additionally, the sequence
221 overlap must be at least 55 nt. If you select the option for reads shorter
222 than 100 nt, the minimum overlap of 55 nt is not required.
223
224 By default,
225 the mgblast search uses the DUST program to filter out
226 low-complexity sequences. If you want
227 to increase the sensitivity of detection of satellites with shorter monomers,
228 use the option with '*no masking of low complexity repeats*'. Note that omitting
229 DUST filtering will significantly increase running times.
230
231 **Output**
232
233 A list of clusters identified as putative satellite repeats, their genomic
234 abundance and various cluster characteristics are provided. Length and
235 consensus sequences of reconstructed monomers are also shown and
236 accompanied by a detailed output from kmer-based reconstruction including
237 sequences and sequence logos of alternative variants of monomer sequences.
238
239 The output includes an **HTML summary** with a table listing all analyzed
240 clusters. More detailed information about clusters is provided in
241 additional files and directories. All results are also provided as a
242 downloadable **zip archive**. Since read clustering results in
243 thousands of clusters, the search for satellite repeats is limited to
244 a subset of the largest ones corresponding to the most abundant genomic
245 repeats. The default setting of the pipeline is to analyze all clusters containing at least
246 0.01% of the input reads. Besides the satellite repeats, three other
247 groups of clusters are reported in the output (1) LTR-retrotransposons,
248 (2) 45S and 5S rDNA and (3) all remaining clusters passing the size
249 threshold. As (1) and (2) contain sequences with circular
250 graphs, their consensus is calculated in the same way as for satellite
251 repeats. Additionally a **log file** reporting the progress of the
252 computational pipeline is provided.
253
254
255 </help>
256
257 </tool>