comparison repex_tarean.xml @ 2:349b197133dc draft

Uploaded
author petr-novak
date Fri, 24 Jul 2020 07:26:59 -0400
parents
children d1f67a13b70f
comparison
equal deleted inserted replaced
1:422485508110 2:349b197133dc
1 <tool id="tarean" name="Tandem Repeat Analyzer" version="2.3.8" >
2 <stdio> <!-- Job-state rules: stderr text patterns and exit codes mapped to severity levels. -->
3     <regex source="stderr" match="Traceback" level="fatal" description="Unknown error"/>
4     <regex source="stderr" match="error" level="fatal" description="Unknown error"/>
5     <regex source="stderr" match="warning" level="warning" description="Unknown warning"/>
6     <exit_code level="fatal" range="1:" description="Error"/> <!-- open-ended range: exit status 1 or above -->
7 </stdio>
8 <description>Identification of genomic tandem repeats from NGS data</description>
9 <requirements> <!-- Software and environment the tool needs at run time. -->
10     <requirement type="package">imagemagick</requirement> <!-- command-line tools -->
11     <requirement type="package">mafft</requirement>
12     <requirement type="package">blast</requirement>
13     <requirement version="0.9.29" type="package">diamond</requirement>
14     <requirement type="package">blast-legacy</requirement>
15     <requirement type="package">r-igraph</requirement> <!-- r-* / bioconductor-* packages -->
16     <requirement type="package">r-data.tree</requirement>
17     <requirement type="package">r-stringr</requirement>
18     <requirement type="package">r-r2html</requirement>
19     <requirement type="package">r-hwriter</requirement>
20     <requirement type="package">r-dt</requirement>
21     <requirement type="package">r-scales</requirement>
22     <requirement type="package">r-plotrix</requirement>
23     <requirement type="package">r-png</requirement>
24     <requirement type="package">r-plyr</requirement>
25     <requirement type="package">r-dplyr</requirement>
26     <requirement type="package">r-optparse</requirement>
27     <requirement type="package">r-dbi</requirement>
28     <requirement type="package">r-rsqlite</requirement>
29     <requirement type="package">r-rserve</requirement>
30     <requirement type="package">bioconductor-biostrings</requirement>
31     <requirement version="2.3.8" type="package">repex_tarean</requirement> <!-- same version as the tool wrapper -->
32     <requirement type="set_environment">REPEX</requirement> <!-- the command template calls executables under ${REPEX} -->
33     <requirement type="set_environment">REPEX_VERSION</requirement>
34     <requirement version="0.9.1" type="package">pyrserve</requirement>
35 </requirements>
36 <!-- Runs seqclust in TAREAN mode on the interlaced FASTA input, appends the captured stdout/stderr to the log dataset, then zips tarean_output and copies the report files into the extra-files directory of the HTML dataset. Dataset paths are single-quoted so unusual characters in a path cannot split shell words; mkdir uses -p so an already existing extra-files directory does not abort the copy chain. NOTE(review): the seqclust exit status is discarded by ';' and the trailing '|| :' forces a zero exit status, so detect_errors="exit_code" cannot flag a failed run; confirm this best-effort behaviour is intended. --> <command detect_errors="exit_code">
37 export PYTHONHASHSEED=0;
38 \${REPEX}/seqclust --paired --sample ${read_sampling.sample} --output_dir=tarean_output --logfile='${log}' --cleanup --tarean_mode
39 #if $advanced_options.advanced:
40     --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering -M $advanced_options.merging
41     #if $advanced_options.custom_library.options_custom_library :
42         -d '${advanced_options.custom_library.library}' extra_database
43     #end if
44     #if $advanced_options.options.options:
45         -opt $advanced_options.options.options
46     #end if
47 #else:
48     -M 0.2
49
50 #end if
51 '${FastaFile}' >stdout.log 2> stderr.log ;
52 echo "STDOUT CONTENT:" >> '${log}' ;
53 cat stdout.log >> '${log}' ;
54 echo "STDERR CONTENT:" >> '${log}' ;
55 cat stderr.log >> '${log}' &amp;&amp;
56 \${REPEX}/stderr_filter.py stderr.log &amp;&amp;
57 cd tarean_output &amp;&amp;
58 zip -r '${ReportArchive}.zip' * &amp;&amp;
59 mv '${ReportArchive}.zip' '${ReportArchive}' &amp;&amp;
60 cp index.html '${ReportFile}' &amp;&amp;
61 mkdir -p '${ReportFile.files_path}' &amp;&amp;
62 cp -r --parents libdir '${ReportFile.files_path}' &amp;&amp;
63 cp -r --parents seqclust/clustering/superclusters '${ReportFile.files_path}' &amp;&amp;
64 cp -r --parents seqclust/clustering/clusters '${ReportFile.files_path}' &amp;&amp;
65 cp seqclust/clustering/hitsort.cls '${ReportFile.files_path}/seqclust/clustering/hitsort.cls' &amp;&amp;
66 cp *.png '${ReportFile.files_path}/' &amp;&amp;
67 cp *.csv '${ReportFile.files_path}/' &amp;&amp;
68 cp *.html '${ReportFile.files_path}/' &amp;&amp;
69 cp *.css '${ReportFile.files_path}/' &amp;&amp;
70 cp *.fasta '${ReportFile.files_path}/' 2>> '${log}' &amp;&amp; rm -r ../tarean_output || :
71
72
73 </command>
74
75 <inputs> <!-- User-facing parameters; their names are referenced from the Cheetah command template. -->
76     <param name="FastaFile" label="Paired-end Illumina reads" type="data" format="fasta"
77            help="Input file must contain FASTA-formatted interlaced read pairs from paired-end sequencing. All pairs must be complete. Example of the input data format is provided in the help below."/>
78 <!-- Optional subsampling of the input; the chosen sample value (0 when sampling is off) is passed to seqclust. -->
79     <conditional name="read_sampling">
80         <param name="do_sampling" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Read sampling" help="Use this option if you want to analyze only a part of the reads" />
81         <when value="false">
82             <!-- sampling disabled: hidden placeholder keeps read_sampling.sample defined for the command -->
83             <param name="sample" label="Sample size" hidden="true" type="integer" value="0" help="Number of analyzed reads"/>
84         </when>
85         <when value="true">
86             <param name="sample" label="Sample size" type="integer" value="500000" min="10000" help="Number of analyzed reads"/>
87         </when>
88     </conditional>
89 <!-- Advanced settings; when they are disabled the command template falls back to "-M 0.2". -->
90     <conditional name="advanced_options">
91         <param name="advanced" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Advanced options" />
92         <when value="false">
93             <!-- no extra parameters; defaults are supplied by the command template -->
94         </when>
95         <when value="true">
96             <param name="merging" type="boolean" truevalue="0.2" falsevalue="0" checked="true" label="Perform cluster merging" help="By default, clusters connected through paired-end reads are merged"/> <!-- the true/false value is what follows -M on the command line -->
97             <conditional name="custom_library">
98                 <param name="options_custom_library" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Use custom repeat database"/>
99                 <when value="false">
100                     <!-- no custom database: nothing to configure -->
101                 </when>
102                 <when value="true">
103                     <param name="library" format="fasta" type="data" label="Custom repeat database" help="Perform additional similarity search to user-provided repeat database. The database should contain FASTA-formatted DNA sequences with headers (sequence names) in the format: '>repeatname#class/subclass'"/>
104                 </when>
105             </conditional>
106             <param name="size_threshold" label="Cluster size threshold for detailed analysis" type="float" value="0.01" min="0.0001" max="100" help="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with fewer than 20 reads are not considered."/>
107             <param name="automatic_filtering" label="Perform automatic filtering of abundant satellite repeats" type="boolean" truevalue="--automatic_filtering" falsevalue="" checked="false"/>
108             <param name="keep_names" label="Keep original read names" type="boolean" truevalue="--keep_names" falsevalue="" checked="false" help="By default, reads are renamed using integers. Use this option if you want to keep original names."/>
109             <conditional name="options"> <!-- NOTE(review): no when cases are declared for the values of this select; confirm whether explicit empty cases should be added -->
110                 <param name="options" type="select" label="Similarity search options">
111                     <option value="ILLUMINA" selected="true">Default </option>
112                     <option value="ILLUMINA_DUST_OFF" selected="false">Masking of low complexity repeats disabled </option>
113
114                     <!-- <option value="ILLUMINA_SENSITIVE_MGBLAST" selected="false">Illumina reads, sensitive search (search parameters: mgblast, min PID 80, -W8) slow, experimental feature!</option> -->
115                     <!-- <option value="ILLUMINA_SENSITIVE_BLASTPLUS" selected="false">Illumina reads, more sensitive search (search parameters: blastn, min PID 80, -W6) extremely slow, experimental feature!</option> -->
116                     <!-- <option value="OXFORD_NANOPORE" selected="false"> -->
117                     <!-- Pseudo short reads simulated from Oxford Nanopore data, experimental feature! -->
118                     <!-- </option> -->
119                 </param>
120             </conditional>
121         </when>
122     </conditional>
123
124
125
126 </inputs>
127 <outputs> <!-- Datasets written by the command: run log, zipped results, and the HTML report. -->
128     <data format="txt" name="log" label="TAREAN log file"/>
129     <data format="zip" name="ReportArchive" label="TAREAN Archive with HTML report from data ${FastaFile.hid}"/>
130     <data format="html" name="ReportFile" label="TAREAN HTML report from data ${FastaFile.hid}"/> <!-- report assets are copied into this dataset's files_path -->
131 </outputs>
132
133 <help>
134 **HELP**
135
136 TAREAN - TAndem REpeat ANalyzer is a computational pipeline for
137 **unsupervised identification of satellite repeats** from unassembled
138 sequence reads. The pipeline uses low-pass paired-end whole genome
139 sequence reads and performs graph-based clustering. The resulting
140 clusters, representing all types of repeats present in the genome, are
141 then examined to identify those containing circular structures indicative
142 of tandem repeats. A poster summarizing TAREAN principles and
143 implementation can be found `here`__.
144
145
146 .. __: http://w3lamc.umbr.cas.cz/lamc/?page_id=312
147
148 **Input data**
149
150
151 The analysis requires **paired-end reads** generated by whole genome
152 shotgun sequencing. The data should be provided as a single input file in
153 fasta format with the reads interlaced (see example below). All the pairs
154 must be complete, i.e. both "forward" and "reverse" sequence reads must be
155 present. The reads should all be trimmed to the same length. The optimal
156 size range is between 100 and 200 nucleotides. The number of reads to be
157 analyzed should not exceed 1x coverage of the genome. Genome coverage
158 between 0.01 and 0.5x is recommended. The reads should be filtered for
159 quality. The recommended quality filtering is as follows: each read should
160 have a quality score >=10 for 95% of the bases, i.e. if your reads are 100
161 base pairs long, then a read only passes this quality threshold if 95
162 bases have a quality of 10 or higher. Additionally, any reads containing
163 indeterminate base pairs (indicated as N in the reads) should be removed.
164 Finally, if either one of the reads in a pair fails to meet the
165 aforementioned thresholds, **both** sequences should be removed.
166 Example of interlaced input format::
167
168 >0001_f
169 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
170 >0001_r
171 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
172 >0002_f
173 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
174 >0002_r
175 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
176 >0003_f
177 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
178 >0003_r
179 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
180 ...
181
182
183 To perform the quality filtering on your FASTQ-formatted data as described
184 above, and to interlace your paired-end sequence reads,
185 please use the `Preprocessing of paired-reads`__ tool.
186
187 .. __: tool_runner?tool_id=paired_fastq_filtering
188
189
190 **Additional parameters**
191
192 **Sample size** defines how many reads will be used during the computation.
193 The default setting of 500,000 reads will enable detection of high copy
194 number satellites within several hours. For higher
195 sensitivity the sample size can be increased. Since the sample size affects
196 memory usage, this parameter may be automatically adjusted to a lower value
197 during the run. The maximum sample size which can be processed depends on the
198 repetitiveness of the analyzed genome. This significantly limits the number of reads
199 that can be analyzed with the TAREAN pipeline.
200
201 **Perform cluster merging**. Families of repetitive elements are
202 frequently split into multiple clusters rather than being represented as a
203 single one. If you do not want to merge clusters based on the presence
204 of broken read pairs, disable this option.
205
206 **Use custom repeat database**. This option allows users to perform similarity
207 comparison of identified repeats to their custom databases. The repeat class should
208 be encoded in FASTA headers of database entries in order to allow correct
209 parsing of similarity hits.
210
211 **Similarity search options**. By default, sequence reads are compared using
212 the mgblast program. The default threshold is explicitly set to 90% sequence
213 similarity spanning at least 55% of the read length (in the case of reads
214 differing in length it applies to the longer one). Additionally, sequence
215 overlap must be at least 55 nt. If you select the option for reads shorter
216 than 100 nt, the minimum overlap of 55 nt is not required.
217
218 By default,
219 the mgblast search uses the DUST program to filter out
220 low-complexity sequences. If you want
221 to increase the sensitivity of detection of satellites with shorter monomers,
222 use the option '*Masking of low complexity repeats disabled*'. Note that omitting
223 DUST filtering will significantly increase running times.
224
225 **Output**
226
227 A list of clusters identified as putative satellite repeats, their genomic
228 abundance and various cluster characteristics are provided. Length and
229 consensus sequences of reconstructed monomers are also shown and
230 accompanied by a detailed output from kmer-based reconstruction including
231 sequences and sequence logos of alternative variants of monomer sequences.
232
233 The output includes an **HTML summary** with a table listing all analyzed
234 clusters. More detailed information about clusters is provided in
235 additional files and directories. All results are also provided as a
236 downloadable **zip archive**. Since read clustering results in
237 thousands of clusters, the search for satellite repeats is limited to
238 a subset of the largest ones corresponding to the most abundant genomic
239 repeats. The default setting of the pipeline is to analyze all clusters containing at least
240 0.01% of the input reads. Besides the satellite repeats, three other
241 groups of clusters are reported in the output: (1) LTR-retrotransposons,
242 (2) 45S and 5S rDNA and (3) all remaining clusters passing the size
243 threshold. As (1) and (2) contain sequences with circular
244 graphs, their consensus is calculated in the same way as for satellite
245 repeats. Additionally, a **log file** reporting the progress of the
246 computational pipeline is provided.
247
248
249 </help>
250
251 </tool>