annotate repex_full_clustering.xml @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children 394a697ffa49
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1 <tool id="repeatexplorer2" name="RepeatExplorer2 clustering: " >
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
2 <stdio>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
3 <regex match="lastdb: can't open file: NEAR" source="stderr" level="fatal" description="Version of last is too old, use ver 956 or higher\n" />
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
4 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
5 <regex match="error" source="stderr" level="fatal" description="Unknown error" />
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
6 <regex match="Warning" source="stderr" level="warning" description="Unknown error" />
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
7 <exit_code range="1:" level="fatal" description="Error" />
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
8 </stdio>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
9 <description>Improved version of repeat discovery and characterization using graph based sequence clustering</description>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
10 <requirements>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
11 <requirement type="package" version="3.7">python</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
12 <requirement type="package" version="0.9.1" >pyrserve</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
13 <requirement type="package">mafft</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
14 <requirement type="package">imagemagick</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
15 <requirement type="package">blast</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
16 <requirement type="package">diamond</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
17 <requirement type="package">blast-legacy</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
18 <requirement type="package">r-igraph</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
19 <requirement type="package">r-data.tree</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
20 <requirement type="package">r-stringr</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
21 <requirement type="package">r-r2html</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
22 <requirement type="package">r-hwriter</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
23 <requirement type="package">r-dt</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
24 <requirement type="package">r-scales</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
25 <requirement type="package">r-plotrix</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
26 <requirement type="package">r-png</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
27 <requirement type="package">r-plyr</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
28 <requirement type="package">r-dplyr</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
29 <requirement type="package">r-optparse</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
30 <requirement type="package">r-dbi</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
31 <requirement type="package">r-rsqlite</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
32 <requirement type="package">r-rserve</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
33 <requirement type="package">bioconductor-biostrings</requirement>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
34 </requirements>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
35 <command >
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
36 export PYTHONHASHSEED=0;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
37 ${__tool_directory__}/seqclust --sample ${sample} --output_dir=tarean_output --logfile=${log} --cleanup $paired --taxon $taxon
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
38
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
39 #if $advanced_options.advanced:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
40 --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering -D $advanced_options.blastx.options_blastx
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
41 --assembly_min $advanced_options.assembly_min_cluster_size
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
42
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
43 #if $advanced_options.comparative.options_comparative:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
44 --prefix_length $advanced_options.comparative.prefix_length
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
45 #end if
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
46
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
47 #if $advanced_options.custom_library.options_custom_library:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
48 -d $advanced_options.custom_library.library extra_database
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
49 #end if
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
50
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
51 #if $advanced_options.options.options:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
52 -opt $advanced_options.options.options
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
53 #end if
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
54 #end if
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
55 ${FastaFile} >stdout.log 2> stderr.log ;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
56 echo "STDOUT CONTENT:" >> ${log} ;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
57 cat stdout.log >> ${log} ;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
58 echo "STDERR CONTENT:" >> ${log};
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
59 cat stderr.log >> ${log} &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
60 cd tarean_output &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
61 zip -r ${ReportArchive}.zip * &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
62 mv ${ReportArchive}.zip ${ReportArchive} &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
63 cp index.html ${ReportFile} &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
64 mkdir ${ReportFile.files_path} &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
65 cp -r --parents libdir ${ReportFile.files_path} &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
66 cp -r --parents seqclust/clustering/superclusters ${ReportFile.files_path} &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
67 cp -r --parents seqclust/clustering/clusters ${ReportFile.files_path} &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
68 cp seqclust/clustering/hitsort.cls ${ReportFile.files_path}/seqclust/clustering/hitsort.cls &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
69 cp *.png ${ReportFile.files_path}/ &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
70 cp *.csv ${ReportFile.files_path}/ &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
71 cp *.html ${ReportFile.files_path}/ &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
72 cp *.css ${ReportFile.files_path}/ &amp;&amp;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
73 cp *.fasta ${ReportFile.files_path}/ 2>>$log &amp;&amp; rm -r ../tarean_output || :
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
74
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
75 </command>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
76 <inputs>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
77 <param name="FastaFile" label="NGS reads" type="data" format="fasta"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
78 help="Input file must contain fasta-formatted NGS reads. If paired end reads are used, reads must be interlaced and all pairs must be complete. Example of input data format is provided in the help below. "/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
79 <param name="paired" type="boolean" truevalue="--paired" falsevalue="" checked="True" label="Paired-end reads" help="Check if you are using paired reads and the input sequences contain both read mates, with left mates alternating with their right mates" />
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
80
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
81 <param name="sample" label="Sample size" type="integer" value="500000" min="10000"/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
82 <param name="taxon" label="Select taxon and protein domain database version (REXdb)" type="select" help="Reference database of transposable element protein domains - REXdb - is used for annotation of repeats">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
83 <option value="VIRIDIPLANTAE3.0" selected="true">Viridiplantae version 3.0 </option>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
84 <option value="VIRIDIPLANTAE2.2" selected="true">Viridiplantae version 2.2</option>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
85 <option value="METAZOA3.0" >Metazoa version 3.0</option>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
86 <option value="METAZOA2.0" >Metazoa version 2.0</option>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
87 <!-- Modify setting in config.py accordingly -->
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
88 </param>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
89
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
90 <conditional name="advanced_options">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
91 <param name="advanced" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Advanced options" />
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
92 <when value="false">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
93 <!-- pass -->
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
94 </when>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
95 <when value="true">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
96 <conditional name="comparative">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
97 <param name="options_comparative" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Perform comparative analysis" help="Use this option when you want to compare sequences from multiple groups"/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
98 <when value="false">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
99 <!-- do nothing here -->
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
100 </when>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
101 <when value="true">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
102 <param name="prefix_length" label="Group code length" type="integer" value="3" min="1" max="10" help="For comparative analysis, sequences from individual groups are distinguished by a sample code which must be used as a prefix of the sequence name. See example below."/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
103 </when>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
104 </conditional>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
105
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
106 <conditional name="blastx">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
107 <param name="options_blastx" type="select" label="Select parameters for protein domain search">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
108 <option value="BLASTX_W2" selected="false">blastx with word size 2 (the most sensitive, slowest)</option>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
109 <option value="BLASTX_W3" selected="true">blastx with word size 3 (default)</option>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
110 <option value="DIAMOND" selected="false">diamond program (the least sensitive, fastest)</option>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
111 </param>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
112 </conditional>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
113
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
114 <conditional name="options">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
115 <param name="options" type="select" label="Similarity search options" help="Different similarity search parameters are used depending on the used input data to adjust search to differences in length and error rate">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
116 <option value="ILLUMINA" selected="true">Illumina reads, read length 100nt or more </option>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
117 <option value="ILLUMINA_SHORT" selected="false">Illumina reads, shorter than 100nt (Do not use reads shorter than 50nt!) </option>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
118 <option value="ILLUMINA_DUST_OFF" selected="false">Illumina reads, no masking of low complexity repeats </option>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
119 <option value="OXFORD_NANOPORE" selected="false">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
120 Pseudo short reads simulated from Oxford Nanopore data (experimental feature)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
121 </option>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
122 </param>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
123 </conditional>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
124
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
125 <conditional name="custom_library">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
126 <param name="options_custom_library" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use custom repeat database"/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
127 <when value="false">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
128 <!-- do nothing here -->
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
129 </when>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
130 <when value="true">
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
131 <param name="library" format="fasta" type="data" label="Custom library of repeats" help="Library of repeats as DNA sequences in fasta format. The required format for IDs in a custom library is: '>repeatname#class/subclass'"/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
132 </when>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
133 </conditional>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
134 <param name="size_threshold" label="Cluster size threshold for detailed analysis" type="float" value="0.01" min="0.0001" max="100" help ="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with fewer than 20 reads are not considered at all."/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
135 <param name="automatic_filtering" label="Perform automatic filtering of abundant satellite repeats" help="Automatic filtering tries to identify the most abundant tandem repeats and partially remove such sequences from the analysis. Removal of abundant tandem repeats can enable analysis of a higher proportion of other less abundant repeats." type="boolean" truevalue="--automatic_filtering" falsevalue="" checked="false"/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
136 <param name="keep_names" label="Keep original sequences names" type="boolean" truevalue="--keep_names" falsevalue="" checked="false" help="By default sequences are relabeled using integers. If you want to keep original names, use this option."/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
137 <param name="assembly_min_cluster_size" type="integer" label="min cluster size for assembly" value="5" min="2" max="100"/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
138 </when>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
139 </conditional>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
140
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
141 </inputs>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
142 <outputs>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
143 <data name="log" format="txt" label="RepeatExplorer2 - log file"/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
144 <data name="ReportArchive" format="zip" label="RepeatExplorer2 - Archive with HTML report from data ${FastaFile.hid}"/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
145 <data name="ReportFile" format="html" label="RepeatExplorer2 - HTML report from data ${FastaFile.hid}"/>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
146 </outputs>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
147
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
148 <help>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
149 **HELP**
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
150
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
151 RepeatExplorer2 clustering is a computational pipeline for unsupervised
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
152 identification of repeats from unassembled sequence reads. The
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
153 pipeline uses low-pass whole genome sequence reads and performs graph-based
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
154 clustering. Resulting clusters, representing all types of repeats, are then
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
155 examined to identify repeats and classify them into repeat groups.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
156
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
157 **Input data**
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
158
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
159 The analysis requires either **single** or **paired-end reads** generated
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
160 by whole genome shotgun sequencing provided as a single fasta-formatted file.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
161 Generally, paired-end reads provide significantly better results than single
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
162 reads. Reads should be of uniform length (optimal size range is 100-200 nt) and
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
163 the number of analyzed reads should represent less than 1x genome equivalent
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
164 (genome coverage of 0.01 - 0.50 x is recommended). Reads should be
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
165 quality-filtered (recommended filtering : quality score >=10 over 95% of bases
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
166 and no Ns allowed) and only **complete read pairs** should be submitted for
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
167 analysis. When paired reads are used, input data must be in **interlaced** format
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
168 as fasta file:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
169
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
170 example of interlaced input format::
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
171
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
172 >0001_f
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
173 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
174 >0001_r
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
175 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
176 >0002_f
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
177 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
178 >0002_r
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
179 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
180 >0003_f
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
181 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
182 >0003_r
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
183 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
184 ...
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
185
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
186
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
187 **Comparative analysis**
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
188
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
189 For comparative analysis sequence names must contain code (prefix) for each group.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
190 Prefix in sequences names must be of fixed length.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
191
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
192 Example of labeling two groups where **group code length** is 2 and is used to distinguish groups - AA and BB ::
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
193
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
194 >AA0001_f
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
195 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
196 >AA0001_r
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
197 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
198 >AA0002_f
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
199 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
200 >AA0002_r
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
201 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
202 >BB0001_f
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
203 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
204 >BB0001_r
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
205 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
206 >BB0002_f
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
207 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
208 >BB0002_r
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
209 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
210
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
211
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
212 To prepare quality filtered and interlaced input fasta file from fastq
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
213 files, use `Preprocessing of paired-reads`__ tool.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
214
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
215 .. __: tool_runner?tool_id=paired_fastq_filtering
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
216
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
217
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
218 **Additional parameters**
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
219
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
220 **Sample size** defines how many reads should be used in calculation.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
221 Default setting with 500,000 reads will enable detection of high copy
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
222 repeats within several hours of computation time. For higher
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
223 sensitivity the sample size can be set higher. Since sample size affects
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
224 the memory usage, this parameter may be automatically adjusted to lower
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
225 value during the run. Maximum sample size which can be processed depends on
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
226 the repetitiveness of analyzed genome.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
227
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
228
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
229 **Select taxon and protein domain database version (REXdb)**. Classification
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
230 of transposable elements is based on the similarity to our reference database
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
231 of transposable element protein domains (**REXdb**). Standalone database for Viridiplantae species
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
232 can be obtained on `repeatexplorer.org`__. Classification
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
233 system used in REXdb is described in article `Systematic survey of plant
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
234 LTR-retrotransposons elucidates phylogenetic relationships of their
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
235 polyprotein domains and provides a reference for element classification`__.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
236 Database for Metazoa species is still under development so use it with caution.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
237
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
238 .. __: http://repeatexplorer.org
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
239 .. __: https://doi.org/10.1186/s13100-018-0144-1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
240
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
241 **Select parameters for protein domain search** REXdb is compared with
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
242 sequence clusters either using blastx or diamond aligner. Diamond program
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
243 is about three times faster than blastx with word size 3.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
244
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
245 **Similarity search options** By default sequence reads are compared using
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
246 mgblast program. Default threshold is explicitly set to 90% sequence
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
247 similarity spanning at least 55% of the read length (in the case of reads
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
248 differing in length it applies to the longer one). Additionally, sequence
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
249 overlap must be at least 55 nt. If you select option for shorter reads
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
250 than 100 nt, minimum overlap 55 nt is not required.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
251
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
252 By default,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
253 mgblast search uses the DUST program to filter out
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
254 low-complexity sequences. If you want
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
255 to increase sensitivity of detection of satellites with shorter monomer
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
256 use option with '*no masking of low complexity repeats*'. Note that omitting
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
257 DUST filtering will significantly increase running times.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
258
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
259
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
260 **Automatic filtering of abundant satellite repeats** performs clustering on a
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
261 smaller dataset of sequence reads to detect abundant high confidence
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
262 satellite repeats. If such satellites are detected, sequence reads derived
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
263 from these satellites are depleted from the input dataset. This step enables more
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
264 sensitive detection of less abundant repeats as more reads can be used
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
265 in clustering step.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
266
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
267 **Use custom repeat database**. This option allows users to perform similarity
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
268 comparison of identified repeats to their custom databases. The repeat class must
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
269 be encoded in FASTA headers of database entries in order to allow correct
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
270 parsing of similarity hits. Required format for custom database sequence name is: ::
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
271
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
272 >repeatname#class/subclass
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
273
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
274
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
275 **Output**
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
276
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
277 List of clusters identified as putative satellite repeats, their genomic
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
278 abundance and various cluster characteristics.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
279
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
280 Output includes a **HTML summary** with table listing of all analyzed
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
281 clusters. More detailed information about clusters is provided in
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
282 additional files and directories. All results are also provided as
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
283 downloadable **zip archive**. Additionally a **log file** reporting
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
284 the progress of the computational pipeline is provided.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
285
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
286 </help>
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
287
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
288 </tool>