annotate repex_full_clustering.xml @ 0:e2b8e71b85b9 draft

Uploaded
author petr-novak
date Wed, 08 Jan 2020 06:25:59 -0500
parents
children 968f0867acc5
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
1 <tool id="repeatexplorer2" name="RepeatExplorer2 clustering: " version="2.3.7" >
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
2 <stdio>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
3 <regex match="lastdb: can't open file: NEAR" source="stderr" level="fatal" description="Version of last is too old, use ver 956 or higher\n" />
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
4 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
5 <regex match="error" source="stderr" level="fatal" description="Unknown error" />
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
6 <regex match="Warning" source="stderr" level="warning" description="Warning reported on stderr" />
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
7 <exit_code range="1:" level="fatal" description="Error" />
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
8 </stdio>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
9 <description>Improved version of repeat discovery and characterization using graph based sequence clustering</description>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
10 <requirements>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
11 <requirement type="package">last</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
12 <requirement type="package">imagemagick</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
13 <requirement type="package">mafft</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
14 <requirement type="package">blast</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
15 <requirement type="package">diamond</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
16 <requirement type="package">blast-legacy</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
17 <requirement type="package">r-igraph</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
18 <requirement type="package">r-data.tree</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
19 <requirement type="package">r-stringr</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
20 <requirement type="package">r-r2html</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
21 <requirement type="package">r-hwriter</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
22 <requirement type="package">r-dt</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
23 <requirement type="package">r-scales</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
24 <requirement type="package">r-plotrix</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
25 <requirement type="package">r-png</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
26 <requirement type="package">r-plyr</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
27 <requirement type="package">r-dplyr</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
28 <requirement type="package">r-optparse</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
29 <requirement type="package">r-dbi</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
30 <requirement type="package">r-rsqlite</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
31 <requirement type="package">r-rserve</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
32 <requirement type="package">bioconductor-biostrings</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
33 <requirement type="package" version="2.3.7">repex_tarean</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
34 <requirement type="set_environment">REPEX</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
35 <requirement type="set_environment">REPEX_VERSION</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
36 <requirement type="package" version="0.9.1" >pyrserve</requirement>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
37 </requirements>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
38 <command >
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
39 export PYTHONHASHSEED=0;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
40 \${REPEX}/seqclust --sample ${sample} --output_dir=tarean_output --logfile=${log} --cleanup $paired --taxon $taxon
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
41
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
42 #if $advanced_options.advanced:
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
43 --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering -D $advanced_options.blastx.options_blastx
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
44 --assembly_min $advanced_options.assembly_min_cluster_size
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
45
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
46 #if $advanced_options.comparative.options_comparative:
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
47 --prefix_length $advanced_options.comparative.prefix_length
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
48 #end if
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
49
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
50 #if $advanced_options.custom_library.options_custom_library:
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
51 -d $advanced_options.custom_library.library extra_database
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
52 #end if
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
53
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
54 #if $advanced_options.options.options:
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
55 -opt $advanced_options.options.options
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
56 #end if
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
57 #end if
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
58 ${FastaFile} >stdout.log 2> stderr.log ;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
59 echo "STDOUT CONTENT:" >> ${log} ;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
60 cat stdout.log >> ${log} ;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
61 echo "STDERR CONTENT:" >> ${log};
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
62 cat stderr.log >> ${log} &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
63 \${REPEX}/stderr_filter.py stderr.log &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
64 cd tarean_output &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
65 zip -r ${ReportArchive}.zip * &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
66 mv ${ReportArchive}.zip ${ReportArchive} &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
67 cp index.html ${ReportFile} &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
68 mkdir -p ${ReportFile.files_path} &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
69 cp -r --parents libdir ${ReportFile.files_path} &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
70 cp -r --parents seqclust/clustering/superclusters ${ReportFile.files_path} &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
71 cp -r --parents seqclust/clustering/clusters ${ReportFile.files_path} &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
72 cp seqclust/clustering/hitsort.cls ${ReportFile.files_path}/seqclust/clustering/hitsort.cls &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
73 cp *.png ${ReportFile.files_path}/ &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
74 cp *.csv ${ReportFile.files_path}/ &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
75 cp *.html ${ReportFile.files_path}/ &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
76 cp *.css ${ReportFile.files_path}/ &amp;&amp;
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
77 cp *.fasta ${ReportFile.files_path}/ 2>>$log &amp;&amp; rm -r ../tarean_output || :
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
78
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
79 </command>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
80 <inputs>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
81 <param name="FastaFile" label="NGS reads" type="data" format="fasta"
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
82 help="Input file must contain fasta-formatted NGS reads. If paired end reads are used, reads must be interlaced and all pairs must be complete. Example of input data format is provided in the help below. "/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
83 <param name="paired" type="boolean" truevalue="--paired" falsevalue="" checked="True" label="Paired-end reads" help="Check if you are using paired reads and input sequences contain both read mates and left mates alternate with their right mates" />
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
84
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
85 <param name="sample" label="Sample size" type="integer" value="500000" min="10000"/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
86 <param name="taxon" label="Select taxon and protein domain database version (REXdb)" type="select" help="Reference database of transposable element protein domains - REXdb - is used for annotation of repeats">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
87 <option value="VIRIDIPLANTAE3.0" selected="true">Viridiplantae version 3.0 </option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
88 <option value="VIRIDIPLANTAE2.2">Viridiplantae version 2.2</option> <!-- not preselected: a single-choice select may have only one default (version 3.0) -->
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
89 <option value="METAZOA3.0" >Metazoa version 3.0</option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
90 <option value="METAZOA2.0" >Metazoa version 2.0</option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
91 <!-- Modify setting in config.py accordingly -->
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
92 </param>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
93
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
94 <conditional name="advanced_options">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
95 <param name="advanced" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Advanced options" />
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
96 <when value="false">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
97 <!-- pass -->
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
98 </when>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
99 <when value="true">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
100 <conditional name="comparative">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
101 <param name="options_comparative" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Perform comparative analysis" help="Use this option when you want to compare sequences from multiple groups"/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
102 <when value="false">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
103 <!-- do nothing here -->
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
104 </when>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
105 <when value="true">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
106 <param name="prefix_length" label="Group code length" type="integer" value="3" min="1" max="10" help="For comparative analysis, sequences are from individual groups distinguished by sample code which must be used as prefix for sequence name. See example below."/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
107 </when>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
108 </conditional>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
109
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
110 <conditional name="blastx">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
111 <param name="options_blastx" type="select" label="Select parameters for protein domain search">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
112 <option value="BLASTX_W2" selected="false">blastx with word size 2 (the most sensitive, slowest)</option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
113 <option value="BLASTX_W3" selected="true">blastx with word size 3 (default)</option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
114 <option value="DIAMOND" selected="false">diamond program (the least sensitive, fastest)</option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
115 </param>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
116 </conditional>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
117
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
118 <conditional name="options">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
119 <param name="options" type="select" label="Similarity search options" help="Different similarity search parameters are used depending on the used input data to adjust search to differences in length and error rate">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
120 <option value="ILLUMINA" selected="true">Illumina reads, read length 100nt or more </option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
121 <option value="ILLUMINA_SHORT" selected="false">Illumina reads, shorter than 100nt (Do not use reads shorter than 50nt!) </option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
122 <option value="ILLUMINA_DUST_OFF" selected="false">Illumina reads, no masking of low complexity repeats </option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
123 <option value="OXFORD_NANOPORE" selected="false">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
124 Pseudo short reads simulated from Oxford Nanopore data (experimental feature)
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
125 </option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
126 </param>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
127 </conditional>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
128
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
129 <conditional name="custom_library">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
130 <param name="options_custom_library" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use custom repeat database"/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
131 <when value="false">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
132 <!-- do nothing here -->
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
133 </when>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
134 <when value="true">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
135 <param name="library" format="fasta" type="data" label="Custom library of repeats" help="Library of repeats as DNA sequences in fasta format. The required format for IDs in a custom library is : '>repeatname#class/subclass'"/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
136 </when>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
137 </conditional>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
138 <param name="size_threshold" label="Cluster size threshold for detailed analysis" type="float" value="0.01" min="0.0001" max="100" help ="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with fewer than 20 reads are not considered at all."/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
139 <param name="automatic_filtering" label="Perform automatic filtering of abundant satellite repeats" help="Automatic filtering tries to identify the most abundant tandem repeats and remove such sequences partially from analysis. Removal of abundant tandem repeats can enable analysis of a higher proportion of other less abundant repeats." type="boolean" truevalue="--automatic_filtering" falsevalue="" checked="false"/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
140 <param name="keep_names" label="Keep original sequences names" type="boolean" truevalue="--keep_names" falsevalue="" checked="false" help="By default sequences are relabeled using integers. If you want to keep original names, use this option."/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
141 <param name="assembly_min_cluster_size" type="integer" label="min cluster size for assembly" value="5" min="2" max="100"/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
142 </when>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
143 </conditional>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
144
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
145 <conditional name="queue_definition">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
146 <param name="queue_select" type="select" label="Select queue">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
147 <option value="basic_fast_queue">basic &amp; fast</option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
148 <option value="long_slow_queue">long &amp; slow</option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
149 <option value="extra_long_slow_queue">extra long &amp; slow</option>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
150 </param>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
151 <when value="basic_fast_queue">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
152 <param name="queue_specification" type="text" label="Modify parameters (optional)"
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
153 value="-l select=1:ncpus=10:mem=32gb:scratch_local=50gb -l walltime=48:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=4000000,TAREAN_CPU=4" />
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
154 </when>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
155
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
156 <when value="long_slow_queue">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
157 <param name="queue_specification" type="text" label="Modify parameters (optional)"
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
158 value="-l select=1:ncpus=16:mem=112gb:scratch_local=50gb -l walltime=336:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=64000000,TAREAN_CPU=15" />
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
159 </when>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
160 <when value="extra_long_slow_queue">
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
161 <param name="queue_specification" type="text" label="Modify parameters (optional)"
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
162 value="-l select=1:ncpus=16:mem=112gb:scratch_local=50gb -l walltime=720:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=64000000,TAREAN_CPU=15" />
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
163 </when>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
164 </conditional>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
165
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
166
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
167
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
168 </inputs>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
169 <outputs>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
170 <data name="log" format="txt" label="RepeatExplorer2 - log file"/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
171 <data name="ReportArchive" format="zip" label="RepeatExplorer2 - Archive with HTML report from data ${FastaFile.hid}"/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
172 <data name="ReportFile" format="html" label="RepeatExplorer2 - HTML report from data ${FastaFile.hid}"/>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
173 </outputs>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
174
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
175 <help>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
176 **HELP**
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
177
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
178 RepeatExplorer2 clustering is a computational pipeline for unsupervised
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
179 identification of repeats from unassembled sequence reads. The
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
180 pipeline uses low-pass whole genome sequence reads and performs graph-based
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
181 clustering. Resulting clusters, representing all types of repeats, are then
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
182 examined to identify repeats and classify them into repeat groups.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
183
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
184 **Input data**
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
185
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
186 The analysis requires either **single** or **paired-end reads** generated
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
187 by whole genome shotgun sequencing provided as a single fasta-formatted file.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
188 Generally, paired-end reads provide significantly better results than single
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
189 reads. Reads should be of uniform length (optimal size range is 100-200 nt) and
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
190 the number of analyzed reads should represent less than 1x genome equivalent
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
191 (genome coverage of 0.01 - 0.50 x is recommended). Reads should be
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
192 quality-filtered (recommended filtering : quality score >=10 over 95% of bases
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
193 and no Ns allowed) and only **complete read pairs** should be submitted for
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
194 analysis. When paired reads are used, input data must be in **interlaced** format
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
195 as a fasta file:
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
196
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
197 example of interlaced input format::
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
198
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
199 >0001_f
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
200 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
201 >0001_r
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
202 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
203 >0002_f
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
204 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
205 >0002_r
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
206 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
207 >0003_f
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
208 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
209 >0003_r
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
210 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
211 ...
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
212
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
213
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
214 **Comparative analysis**
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
215
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
216 For comparative analysis sequence names must contain code (prefix) for each group.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
217 The prefix in sequence names must be of fixed length.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
218
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
219 Example of labeling two groups where **group code length** is 2 and is used to distinguish groups - AA and BB ::
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
220
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
221 >AA0001_f
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
222 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
223 >AA0001_r
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
224 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
225 >AA0002_f
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
226 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
227 >AA0002_r
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
228 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
229 >BB0001_f
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
230 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
231 >BB0001_r
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
232 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
233 >BB0002_f
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
234 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
235 >BB0002_r
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
236 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
237
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
238
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
239 To prepare quality filtered and interlaced input fasta file from fastq
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
240 files, use `Preprocessing of paired-reads`__ tool.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
241
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
242 .. __: tool_runner?tool_id=paired_fastq_filtering
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
243
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
244
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
245 **Additional parameters**
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
246
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
247 **Sample size** defines how many reads should be used in calculation.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
248 Default setting with 500,000 reads will enable detection of high copy
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
249 repeats within several hours of computation time. For higher
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
250 sensitivity the sample size can be set higher. Since sample size affects
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
251 the memory usage, this parameter may be automatically adjusted to lower
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
252 value during the run. Maximum sample size which can be processed depends on
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
253 the repetitiveness of analyzed genome.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
254
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
255
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
256 **Select taxon and protein domain database version (REXdb)**. Classification
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
257 of transposable elements is based on the similarity to our reference database
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
258 of transposable element protein domains (**REXdb**). Standalone database for Viridiplantae species
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
259 can be obtained on `repeatexplorer.org`__. Classification
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
260 system used in REXdb is described in article `Systematic survey of plant
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
261 LTR-retrotransposons elucidates phylogenetic relationships of their
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
262 polyprotein domains and provides a reference for element classification`__.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
263 Database for Metazoa species is still under development so use it with caution.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
264
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
265 .. __: http://repeatexplorer.org
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
266 .. __: https://doi.org/10.1186/s13100-018-0144-1
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
267
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
268 **Select parameters for protein domain search** REXdb is compared with
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
269 sequence clusters either using blastx or diamond aligner. Diamond program
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
270 is about three times faster than blastx with word size 3.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
271
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
272 **Similarity search options** By default sequence reads are compared using
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
273 mgblast program. Default threshold is explicitly set to 90% sequence
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
274 similarity spanning at least 55% of the read length (in the case of reads
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
275 differing in length it applies to the longer one). Additionally, sequence
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
276 overlap must be at least 55 nt. If you select option for shorter reads
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
277 than 100 nt, minimum overlap 55 nt is not required.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
278
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
279 By default,
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
280 mgblast search uses the DUST program to filter out
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
281 low-complexity sequences. If you want
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
282 to increase sensitivity of detection of satellites with shorter monomer
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
283 use option with '*no masking of low complexity repeats*'. Note that omitting
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
284 DUST filtering will significantly increase running times.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
285
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
286
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
287 **Automatic filtering of abundant satellite repeats** performs clustering on
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
288 smaller dataset of sequence reads to detect abundant high confidence
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
289 satellite repeats. If such satellites are detected, sequence reads derived
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
290 from these satellites are depleted from input dataset. This step enables more
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
291 sensitive detection of less abundant repeats as more reads can be used
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
292 in clustering step.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
293
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
294 **Use custom repeat database**. This option allows users to perform similarity
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
295 comparison of identified repeats to their custom databases. The repeat class must
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
296 be encoded in FASTA headers of database entries in order to allow correct
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
297 parsing of similarity hits. Required format for custom database sequence name is: ::
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
298
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
299 >repeatname#class/subclass
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
300
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
301
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
302 **Output**
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
303
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
304 List of clusters identified as putative satellite repeats, their genomic
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
305 abundance and various cluster characteristics.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
306
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
307 Output includes a **HTML summary** with table listing of all analyzed
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
308 clusters. More detailed information about clusters is provided in
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
309 additional files and directories. All results are also provided as
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
310 downloadable **zip archive**. Additionally a **log file** reporting
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
311 the progress of the computational pipeline is provided.
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
312
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
313 </help>
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
314
e2b8e71b85b9 Uploaded
petr-novak
parents:
diff changeset
315 </tool>