changeset 0:e2b8e71b85b9 draft

Uploaded
author petr-novak
date Wed, 08 Jan 2020 06:25:59 -0500
parents
children e6fb0f2b2097
files repex_full_clustering.xml repex_tarean.xml
diffstat 2 files changed, 572 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/repex_full_clustering.xml	Wed Jan 08 06:25:59 2020 -0500
@@ -0,0 +1,315 @@
+<tool id="repeatexplorer2" name="RepeatExplorer2 clustering: " version="2.3.7" >
+    <stdio> <!-- Rules applied to the job's stderr text and exit code; each match is reported with the given level and description -->
+      <regex match="lastdb: can't open file: NEAR" source="stderr" level="fatal" description="Version of last is too old, use ver 956 or higher\n" />
+      <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
+      <regex match="error" source="stderr" level="fatal" description="Unknown error" />
+      <regex match="Warning" source="stderr" level="warning" description="Unknown warning" /> <!-- warning level only, so the description must not call it an error -->
+      <exit_code range="1:" level="fatal" description="Error" />
+    </stdio>
+    <description>Improved version of repeat discovery and characterization using graph based sequence clustering</description>
+   <requirements> <!-- Declared dependencies: packages (only repex_tarean and pyrserve carry a pinned version) plus the REPEX and REPEX_VERSION environment settings; the command section uses REPEX to locate seqclust and stderr_filter.py -->
+     <requirement type="package">last</requirement>
+     <requirement type="package">imagemagick</requirement>
+     <requirement type="package">mafft</requirement>
+     <requirement type="package">blast</requirement>
+     <requirement type="package">diamond</requirement>
+     <requirement type="package">blast-legacy</requirement>
+     <requirement type="package">r-igraph</requirement>
+     <requirement type="package">r-data.tree</requirement>
+     <requirement type="package">r-stringr</requirement>
+     <requirement type="package">r-r2html</requirement>
+     <requirement type="package">r-hwriter</requirement>
+     <requirement type="package">r-dt</requirement>
+     <requirement type="package">r-scales</requirement>
+     <requirement type="package">r-plotrix</requirement>
+     <requirement type="package">r-png</requirement>
+     <requirement type="package">r-plyr</requirement>
+     <requirement type="package">r-dplyr</requirement>
+     <requirement type="package">r-optparse</requirement>
+     <requirement type="package">r-dbi</requirement>
+     <requirement type="package">r-rsqlite</requirement>
+     <requirement type="package">r-rserve</requirement>
+     <requirement type="package">bioconductor-biostrings</requirement>
+     <requirement type="package" version="2.3.7">repex_tarean</requirement>
+     <requirement type="set_environment">REPEX</requirement>
+     <requirement type="set_environment">REPEX_VERSION</requirement>
+     <requirement type="package" version="0.9.1" >pyrserve</requirement>
+   </requirements>
+    <command >
+      export PYTHONHASHSEED=0;
+      \${REPEX}/seqclust --sample ${sample} --output_dir=tarean_output --logfile=${log} --cleanup $paired --taxon $taxon
+      ## The arguments below extend the seqclust call above and are emitted only when the Advanced options switch is on.
+      #if $advanced_options.advanced:
+      --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering  -D $advanced_options.blastx.options_blastx
+      --assembly_min $advanced_options.assembly_min_cluster_size
+      ## The prefix length is passed only when comparative analysis was requested.
+        #if $advanced_options.comparative.options_comparative:
+          --prefix_length $advanced_options.comparative.prefix_length
+        #end if
+      ## The custom repeat library is handed to -d followed by the literal word extra_database.
+        #if $advanced_options.custom_library.options_custom_library:
+          -d $advanced_options.custom_library.library extra_database
+        #end if
+      ## The similarity search preset chosen in the form is forwarded through -opt.
+        #if $advanced_options.options.options:
+         -opt $advanced_options.options.options
+        #end if 
+      #end if
+      ${FastaFile}  >stdout.log 2> stderr.log ;
+      echo "STDOUT CONTENT:" >> ${log} ;
+      cat stdout.log >> ${log} ;
+      echo "STDERR CONTENT:" >> ${log};
+      cat stderr.log >> ${log} &amp;&amp;
+      \${REPEX}/stderr_filter.py stderr.log &amp;&amp;
+      cd tarean_output &amp;&amp;
+      zip -r  ${ReportArchive}.zip * &amp;&amp;
+      mv ${ReportArchive}.zip ${ReportArchive} &amp;&amp;
+      cp index.html ${ReportFile} &amp;&amp;
+      mkdir -p ${ReportFile.files_path} &amp;&amp;
+      cp -r --parents libdir ${ReportFile.files_path} &amp;&amp;
+      cp -r --parents seqclust/clustering/superclusters ${ReportFile.files_path} &amp;&amp;
+      cp -r --parents seqclust/clustering/clusters ${ReportFile.files_path} &amp;&amp;
+      cp seqclust/clustering/hitsort.cls ${ReportFile.files_path}/seqclust/clustering/hitsort.cls &amp;&amp;
+      cp *.png ${ReportFile.files_path}/ &amp;&amp;
+      cp *.csv ${ReportFile.files_path}/ &amp;&amp;
+      cp *.html ${ReportFile.files_path}/  &amp;&amp;
+      cp *.css ${ReportFile.files_path}/  &amp;&amp;
+      cp *.fasta ${ReportFile.files_path}/ 2>>$log  &amp;&amp; rm -r ../tarean_output || :
+      ## mkdir is called with -p so that an already existing files_path directory does not break the copy chain. NOTE(review): the closing || : turns any failure in that chain into exit status 0.
+    </command>
+ <inputs>
+	<param name="FastaFile" label="NGS reads" type="data" format="fasta"
+	       help="Input file must contain fasta-formatted NGS reads. If paired end reads are used, reads must be interlaced  and all pairs must be complete. Example of input data format is provided in the help below. "/> <!-- used as the final positional argument of the seqclust call in the command section -->
+  <param name="paired" type="boolean" truevalue="--paired" falsevalue="" checked="True" label="Paired-end reads" help="Check if you are using pair reads and input sequences contain both read mates and left mates alternate with their right mates" /> <!-- its truevalue is inserted verbatim into the seqclust call via $paired; unchecked yields an empty string -->
+ 
+	<param name="sample" label="Sample size" type="integer" value="500000" min="10000"/> <!-- forwarded to seqclust as the value of its sample option -->
+  <param name="taxon" label="Select taxon and protein domain database version (REXdb)" type="select" help="Reference database of transposable element protein domains - REXdb - is used for annotation of repeats"> <!-- single-choice select: exactly one option may be pre-selected -->
+    <option value="VIRIDIPLANTAE3.0" selected="true">Viridiplantae version 3.0 </option>
+    <option value="VIRIDIPLANTAE2.2">Viridiplantae version 2.2</option>
+    <option value="METAZOA3.0" >Metazoa version 3.0</option>
+    <option value="METAZOA2.0" >Metazoa version 2.0</option>
+    <!-- Modify setting in config.py accordingly -->
+  </param>
+
+  <conditional name="advanced_options"> <!-- Optional settings; the nested parameters exist only in the branch where "advanced" is checked, and the command section reads them only in that case -->
+    <param name="advanced" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Advanced options" />
+    <when value="false">
+      <!-- pass -->
+    </when>
+    <when value="true">
+      <conditional name="comparative">
+        <param name="options_comparative" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Perform comparative analysis" help="Use this option when you want to compare sequences from multiple groups"/>
+        <when value="false">
+          <!-- do nothing here -->
+        </when>
+        <when value="true">
+          <param name="prefix_length" label="Group code length" type="integer" value="3" min="1" max="10" help="For comparative analysis, sequences from individual groups are distinguished by a group code which must be used as a prefix of the sequence name. See example below."/>
+        </when>
+      </conditional>
+
+      <conditional name="blastx">
+        <param name="options_blastx" type="select" label="Select parameters for protein domain search">
+          <option value="BLASTX_W2" selected="false">blastx with word size 2 (the most sensitive, slowest)</option>
+          <option value="BLASTX_W3" selected="true">blastx with word size 3 (default)</option>
+          <option value="DIAMOND" selected="false">diamond program (the least sensitive, fastest)</option>
+        </param>
+      </conditional>
+
+      <conditional name="options">
+        <param name="options" type="select" label="Similarity search options" help="Different similarity search parameters are used depending on the used input data to adjust search to differences in length and error rate">
+          <option value="ILLUMINA" selected="true">Illumina reads, read length 100nt or more </option>
+          <option value="ILLUMINA_SHORT" selected="false">Illumina reads, shorter than 100nt (Do not use reads shorter than 50nt!) </option>
+          <option value="ILLUMINA_DUST_OFF" selected="false">Illumina reads, no masking of low complexity repeats  </option>
+          <option value="OXFORD_NANOPORE" selected="false">
+            Pseudo short reads simulated from Oxford Nanopore data (experimental feature)
+          </option>
+        </param>
+      </conditional>
+      
+      <conditional name="custom_library">
+        <param name="options_custom_library" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use custom repeat database"/>
+        <when value="false">
+          <!-- do nothing here -->
+        </when>
+        <when value="true">
+          <param name="library" format="fasta" type="data" label="Custom library of repeats" help="Library of repeats as DNA sequences in fasta format. The required format for IDs in a custom library is : '>repeatname#class/subclass'"/>
+        </when>
+      </conditional>
+      <param name="size_threshold" label="Cluster size threshold  for detailed analysis" type="float" value="0.01" min="0.0001" max="100" help="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with fewer than 20 reads are not considered at all."/>
+      <param name="automatic_filtering" label="Perform automatic filtering of abundant satellite repeats" help="Automatic filtering tries to identify the most abundant tandem repeats and partially removes such sequences from the analysis. Removal of abundant tandem repeats can make it possible to analyze a higher proportion of other, less abundant repeats." type="boolean" truevalue="--automatic_filtering" falsevalue="" checked="false"/>
+      <param name="keep_names" label="Keep original sequences names" type="boolean" truevalue="--keep_names" falsevalue="" checked="false" help="By default sequences are relabeled using integers. If you want to keep original names, use this option."/>
+      <param name="assembly_min_cluster_size" type="integer" label="min cluster size for assembly" value="5" min="2" max="100"/>
+    </when>
+  </conditional>
+
+       <conditional name="queue_definition"> <!-- Queue choice with an editable default parameter string per queue; not referenced by the command section of this tool. NOTE(review): presumably consumed by the server's job configuration - confirm -->
+         <param type="select" name="queue_select" label="Select queue">
+           <option value="basic_fast_queue">basic &amp; fast</option>
+           <option value="long_slow_queue">long &amp; slow</option>
+           <option value="extra_long_slow_queue">extra long &amp; slow</option>
+         </param>
+         <when value="basic_fast_queue">
+           <param type="text" name="queue_specification" label="Modify parameters (optional)"
+                  value="-l select=1:ncpus=10:mem=32gb:scratch_local=50gb -l walltime=48:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=4000000,TAREAN_CPU=4" />
+         </when>
+
+         <when value="long_slow_queue">
+           <param type="text" name="queue_specification" label="Modify parameters (optional)"
+                  value="-l select=1:ncpus=16:mem=112gb:scratch_local=50gb -l walltime=336:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=64000000,TAREAN_CPU=15" />
+         </when>
+         <when value="extra_long_slow_queue">
+           <param type="text" name="queue_specification" label="Modify parameters (optional)"
+                  value="-l select=1:ncpus=16:mem=112gb:scratch_local=50gb -l walltime=720:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=64000000,TAREAN_CPU=15" />
+         </when>
+       </conditional>
+ 
+
+
+ </inputs>
+    <outputs> <!-- Three datasets per run: the text log, the zipped result directory and the HTML report; the command section writes to each of them by name -->
+      <data format="txt" name="log" label="RepeatExplorer2 - log file"/>
+      <data format="zip" name="ReportArchive" label="RepeatExplorer2 - Archive with HTML report from data ${FastaFile.hid}"/>
+      <data format="html" name="ReportFile" label="RepeatExplorer2 - HTML report from data ${FastaFile.hid}"/>
+    </outputs>
+
+    <help>
+      **HELP**
+      
+      RepeatExplorer2 clustering is a computational pipeline for unsupervised
+      identification of repeats from unassembled sequence reads. The
+      pipeline uses low-pass whole genome sequence reads and performs graph-based
+      clustering. Resulting clusters, representing all types of repeats, are then
+      examined in order to identify and classify them into repeat groups.
+
+      **Input data**
+      
+      The analysis requires either **single** or **paired-end reads** generated
+      by whole genome shotgun sequencing provided as a single fasta-formatted file.
+      Generally, paired-end reads provide significantly better results than single
+      reads. Reads should be of uniform length (optimal size range is 100-200 nt) and
+      the number of analyzed reads should represent less than 1x genome equivalent
+      (genome coverage of 0.01 - 0.50 x is recommended). Reads should be
+      quality-filtered (recommended filtering : quality score >=10 over 95% of bases
+      and no Ns allowed) and only **complete read pairs** should be submitted for
+      analysis. When paired reads are used, input data must be **interlaced** format
+      as fasta file:
+
+      example of interlaced input format::
+      
+        >0001_f
+        CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
+        >0001_r
+        GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
+        >0002_f
+        ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
+        >0002_r
+        TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
+        >0003_f
+        TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
+        >0003_r
+        TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
+        ...
+
+
+      **Comparative analysis**
+
+      For comparative analysis sequence names must contain code (prefix) for each group.
+      Prefix in sequences names  must be of fixed length.
+
+      Example of labeling two groups where **group code length** is 2 and the prefix is used to distinguish the groups AA and BB ::
+
+        >AA0001_f
+        CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
+        >AA0001_r
+        GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
+        >AA0002_f
+        ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
+        >AA0002_r
+        TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
+        >BB0001_f
+        TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
+        >BB0001_r
+        TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
+        >BB0002_f
+        TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
+        >BB0002_r
+        TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
+        
+
+      To prepare quality filtered and interlaced input fasta file from fastq
+      files, use `Preprocessing of paired-reads`__  tool.
+
+      .. __: tool_runner?tool_id=paired_fastq_filtering
+
+
+      **Additional parameters**
+
+      **Sample size** defines how many reads should be used in calculation.
+      Default setting with 500,000 reads will enable detection of high copy
+      repeats within several hours of computation time. For higher
+      sensitivity the sample size can be set higher. Since sample size affects
+      the memory usage, this parameter may be automatically adjusted to lower
+      value during the run. Maximum sample size which can be processed depends on
+      the repetitiveness of analyzed genome.
+
+      
+      **Select taxon and protein domain database version (REXdb)**. Classification
+      of transposable elements is based on the similarity to our reference database
+      of transposable element protein domains (**REXdb**). Standalone database for Viridiplantae species
+      can be obtained on `repeatexplorer.org`__. Classification
+      system used in REXdb is described in article `Systematic survey of plant
+      LTR-retrotransposons elucidates phylogenetic relationships of their
+      polyprotein domains and provides a reference for element classification`__
+      Database for Metazoa species is still under development so use it with caution.
+
+      .. __: http://repeatexplorer.org
+      .. __: https://doi.org/10.1186/s13100-018-0144-1
+
+      **Select parameters for protein domain search** REXdb is compared with
+      sequence clusters using either the blastx or the diamond aligner. The diamond
+      program is about three times faster than blastx with word size 3.
+
+      **Similarity search options** By default sequence reads are compared using
+      mgblast program. Default threshold is explicitly set to 90% sequence
+      similarity spanning at least 55% of the read length (in the case of reads
+      differing in length it applies to the longer one). Additionally, sequence
+      overlap must be at least 55 nt. If you select option for shorter reads
+      than 100 nt,  minimum overlap 55 nt is not required.
+
+      By default,
+      the mgblast search uses the DUST program to filter out
+      low-complexity sequences. If you want
+      to increase the sensitivity of detection of satellites with shorter monomers,
+      use the option with '*no masking of low complexity repeats*'. Note that omitting
+      DUST filtering will significantly increase running times.
+     
+
+      **Automatic filtering of abundant satellite repeats** performs clustering on
+      a smaller dataset of sequence reads to detect abundant high confidence
+      satellite repeats. If such satellites are detected, sequence reads derived
+      from these satellites are depleted from the input dataset. This step enables more
+      sensitive detection of less abundant repeats as more reads can be used
+      in the clustering step.
+
+      **Use custom repeat database**. This option allows users to perform similarity
+      comparison of identified repeats to their custom databases. The repeat class must
+      be encoded in FASTA headers of database entries in order to allow correct 
+      parsing of similarity hits. Required format for custom database sequence name is: ::
+
+        >repeatname#class/subclass
+
+
+      **Output**
+
+      List of clusters identified as putative satellite repeats, their genomic
+      abundance and various cluster characteristics. 
+
+      Output includes a **HTML summary** with table listing of all analyzed
+      clusters. More detailed information about clusters is provided in
+      additional files and directories. All results are also provided as
+      downloadable **zip archive**. Additionally a **log file** reporting
+      the progress of the computational pipeline is provided.
+      
+    </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/repex_tarean.xml	Wed Jan 08 06:25:59 2020 -0500
@@ -0,0 +1,257 @@
+<tool id="tarean" name="Tandem Repeat Analyzer"  version="2.3.7" >
+    <stdio> <!-- Patterns matched against the job's stderr plus an exit-code rule, each reported with the given level. NOTE(review): the command tag of this tool also sets detect_errors="exit_code" - confirm that the Galaxy version in use still evaluates this block alongside it -->
+      <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
+      <regex match="error" source="stderr" level="fatal" description="Unknown error" />
+      <regex match="warning" source="stderr" level="warning" description="Unknown warning" />
+      <exit_code range="1:" level="fatal" description="Error" />
+    </stdio>
+    <description>Identification of genomic tandem repeats from NGS data</description>
+    <requirements> <!-- Declared dependencies: packages (only repex_tarean and pyrserve carry a pinned version) plus the REPEX and REPEX_VERSION environment settings; the command section uses REPEX to locate seqclust and stderr_filter.py -->
+      <requirement type="package">imagemagick</requirement>
+      <requirement type="package">mafft</requirement>
+      <requirement type="package">blast</requirement>
+      <requirement type="package">diamond</requirement>
+      <requirement type="package">blast-legacy</requirement>
+      <requirement type="package">r-igraph</requirement>
+      <requirement type="package">r-data.tree</requirement>
+      <requirement type="package">r-stringr</requirement>
+      <requirement type="package">r-r2html</requirement>
+      <requirement type="package">r-hwriter</requirement>
+      <requirement type="package">r-dt</requirement>
+      <requirement type="package">r-scales</requirement>
+      <requirement type="package">r-plotrix</requirement>
+      <requirement type="package">r-png</requirement>
+      <requirement type="package">r-plyr</requirement>
+      <requirement type="package">r-dplyr</requirement>
+      <requirement type="package">r-optparse</requirement>
+      <requirement type="package">r-dbi</requirement>
+      <requirement type="package">r-rsqlite</requirement>
+      <requirement type="package">r-rserve</requirement>
+      <requirement type="package">bioconductor-biostrings</requirement>
+      <requirement type="package" version="2.3.7">repex_tarean</requirement>
+      <requirement type="set_environment">REPEX</requirement>
+      <requirement type="set_environment">REPEX_VERSION</requirement>
+      <requirement type="package" version="0.9.1">pyrserve</requirement>
+    </requirements>
+  <command detect_errors="exit_code">
+    export PYTHONHASHSEED=0;
+    \${REPEX}/seqclust --paired --sample ${sample} --output_dir=tarean_output --logfile=${log} --cleanup --tarean_mode
+    #if $advanced_options.advanced:
+      --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering -M $advanced_options.merging
+      #if $advanced_options.custom_library.options_custom_library :
+        -d $advanced_options.custom_library.library extra_database
+      #end if
+      #if $advanced_options.options.options:
+        -opt $advanced_options.options.options
+      #end if   
+    #else:
+      -M 0.2
+    ## Without advanced options the merge argument is fixed to 0.2, the same value the merging checkbox emits when checked.
+    #end if
+    ${FastaFile} >stdout.log 2> stderr.log ;
+    echo "STDOUT CONTENT:" >> ${log} ;
+    cat stdout.log >> ${log} ;
+    echo "STDERR CONTENT:" >> ${log} ;
+    cat stderr.log >> ${log} &amp;&amp;
+    \${REPEX}/stderr_filter.py stderr.log &amp;&amp;
+    cd tarean_output &amp;&amp;
+    zip -r  ${ReportArchive}.zip * &amp;&amp;
+    mv ${ReportArchive}.zip ${ReportArchive} &amp;&amp;
+    cp index.html ${ReportFile} &amp;&amp;
+    mkdir -p ${ReportFile.files_path} &amp;&amp;
+    cp -r --parents libdir ${ReportFile.files_path} &amp;&amp;
+    cp -r --parents seqclust/clustering/superclusters ${ReportFile.files_path} &amp;&amp;
+    cp -r --parents seqclust/clustering/clusters ${ReportFile.files_path} &amp;&amp;
+    cp seqclust/clustering/hitsort.cls ${ReportFile.files_path}/seqclust/clustering/hitsort.cls &amp;&amp;
+    cp *.png ${ReportFile.files_path}/ &amp;&amp;
+    cp *.csv ${ReportFile.files_path}/ &amp;&amp;
+    cp *.html ${ReportFile.files_path}/  &amp;&amp;
+    cp *.css ${ReportFile.files_path}/  &amp;&amp;
+    cp *.fasta ${ReportFile.files_path}/ 2>>$log  &amp;&amp; rm -r ../tarean_output || :
+    ## mkdir is called with -p so that an already existing files_path directory does not break the copy chain. NOTE(review): the closing || : turns any failure in that chain into exit status 0.
+    
+  </command>
+
+  <inputs>
+	  <param name="FastaFile" label="paired-end NGS reads" type="data" format="fasta"
+	         help="Input file must contain fasta-formatted interlaced read pairs from paired-end sequencing. All pairs must be complete. Example of input data format is provided in the help below."/> <!-- used as the final positional argument of the seqclust call, which this tool always runs with the paired flag -->
+	  <param name="sample" label="Sample size" type="integer" value="500000" min="10000"/> <!-- forwarded to seqclust as the value of its sample option -->
+
+    <conditional name="advanced_options"> <!-- Optional settings; when "advanced" is unchecked the command section falls back to a fixed merge value instead of reading these parameters -->
+      <param name="advanced" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Advanced options" />
+      <when value="false">
+        <!-- pass -->
+      </when>
+      <when value="true">
+        <param name="merging" type="boolean" truevalue="0.2" falsevalue="0" checked="True" label="Perform cluster merging" help="By default, clusters connected through paired-end reads are merged"/>
+        <conditional name="custom_library">
+          <param name="options_custom_library" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use custom repeat database"/>
+          <when value="false">
+            <!-- do nothing here -->
+          </when>
+          <when value="true">
+            <param name="library" format="fasta" type="data" label="Custom library of repeats" help="Perform additional similarity search to user-provided repeat database. The database should contain FASTA-formatted DNA sequences with headers (sequence names) in the format: '>repeatname#class/subclass'"/>
+          </when>
+        </conditional>
+        <param name="size_threshold" label="Cluster size threshold  for detailed analysis" type="float" value="0.01" min="0.0001" max="100" help="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with fewer than 20 reads are not considered at all."/>
+        <param name="automatic_filtering" label="Perform automatic filtering of abundant satellite repeats" type="boolean" truevalue="--automatic_filtering" falsevalue="" checked="false"/>
+        <param name="keep_names" label="Keep original sequences names" type="boolean" truevalue="--keep_names" falsevalue="" checked="false" help="By default sequences are relabeled using integers. If you want to keep original names, use this option."/>
+        <conditional name="options">
+          <param name="options" type="select" label="Similarity search options" help="Different similarity search parameters are used depending on the used input data to adjust search to differences in length and error rate">
+            <option value="ILLUMINA" selected="true">Illumina reads, read length 100nt or more </option>
+            <option value="ILLUMINA_SHORT" selected="false">Illumina reads, shorter than 100nt (Do not use reads shorter than 50nt!) </option>
+            <option value="ILLUMINA_DUST_OFF" selected="false">Illumina reads, no masking of low complexity repeats  </option>
+          </param>
+        </conditional>
+      </when>
+    </conditional>
+
+         <conditional name="queue_definition"> <!-- Queue choice with an editable default parameter string per queue; not referenced by the command section of this tool. NOTE(review): presumably consumed by the server's job configuration - confirm -->
+           <param type="select" name="queue_select" label="Select queue">
+             <option value="basic_fast_queue">basic &amp; fast</option>
+             <option value="long_slow_queue">long &amp; slow</option>
+             <option value="extra_long_slow_queue">extra long &amp; slow</option>
+           </param>
+           <when value="basic_fast_queue">
+             <param type="text" name="queue_specification" label="Modify parameters (optional)"
+                    value="-l select=1:ncpus=10:mem=32gb:scratch_local=50gb -l walltime=48:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=4000000,TAREAN_CPU=4" />
+           </when>
+
+           <when value="long_slow_queue">
+             <param type="text" name="queue_specification" label="Modify parameters (optional)"
+                    value="-l select=1:ncpus=16:mem=112gb:scratch_local=50gb -l walltime=336:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=64000000,TAREAN_CPU=15" />
+           </when>
+           <when value="extra_long_slow_queue">
+             <param type="text" name="queue_specification" label="Modify parameters (optional)"
+                    value="-l select=1:ncpus=16:mem=112gb:scratch_local=50gb -l walltime=720:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=64000000,TAREAN_CPU=15" />
+           </when>
+         </conditional>
+ 
+
+
+  </inputs>
+  <outputs> <!-- Three datasets per run: the text log, the zipped result directory and the HTML report; the command section writes to each of them by name -->
+    <data format="txt" name="log" label="TAREAN log file"/>
+    <data format="zip" name="ReportArchive" label="TAREAN Archive with HTML report from data ${FastaFile.hid}"/>
+    <data format="html" name="ReportFile" label="TAREAN HTML report from data ${FastaFile.hid}"/>
+  </outputs>
+
+  <help>
+    **HELP**
+    
+    TAREAN - TAndem REpeat ANalyzer is a computational pipeline for
+    **unsupervised identification of satellite repeats** from unassembled
+    sequence reads. The pipeline uses low-pass paired-end whole genome
+    sequence reads and performs graph-based clustering. The resulting
+    clusters, representing all types of repeats present in the genome, are
+    then examined to identify those containing circular structures indicative
+    of tandem repeats. A poster summarizing TAREAN principles and
+    implementation can be found `here.`__
+
+
+    .. __: http://w3lamc.umbr.cas.cz/lamc/?page_id=312
+
+    **Input data**
+    
+ 
+    The analysis requires **paired-end reads** generated by whole genome
+    shotgun sequencing. The data should be provided as a single input file in
+    fasta format with the reads interlaced (see example below). All the pairs
+    must be complete, i.e. both "forward" and "reverse" sequence reads must be
+    present. The reads should all be trimmed to the same length. The optimal
+    size range is between 100 and 200 nucleotides. The number of reads to be
+    analyzed should not exceed 1x coverage of the genome. Genome coverage
+    between 0.01 and 0.5x is recommended. The reads should be filtered for
+    quality. The recommended quality filtering is as follows: each read should
+    have a quality score >=10 for 95% of the bases, i.e. if your reads are 100
+    base pairs long, then a read only passes this quality threshold if 95
+    bases have a quality of 10 or higher. Additionally, any reads containing
+    indeterminate base pairs (indicated as N in the reads) should be removed.
+    Finally, if either one of the reads in a pair fails to meet the
+    aforementioned thresholds, **both** sequences should be removed.
+    Example of interlaced input format::
+    
+      >0001_f
+      CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
+      >0001_r
+      GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
+      >0002_f
+      ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
+      >0002_r
+      TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
+      >0003_f
+      TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
+      >0003_r
+      TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
+      ...
+
+
+    To perform the quality filtering on your fastQ formatted data as described
+    above, and to interlace your paired-end sequence reads,
+    please use the `Preprocessing of paired-reads`__  tool.
+
+    .. __: tool_runner?tool_id=paired_fastq_filtering
+
+
+    **Additional parameters**
+
+    **Sample size** defines how many reads will be used during the computation.
+    The default setting of 500,000 reads will enable detection of high copy
+    number satellites within several hours. For higher
+    sensitivity the sample size can be increased. Since the sample size affects
+    memory usage, this parameter may be automatically adjusted to a lower value
+    during the run. The maximum sample size which can be processed depends on the
+    repetitiveness of the analyzed genome. This significantly limits the number of reads
+    that can be analyzed with the TAREAN pipeline.
+
+    **Perform cluster merging**. Families of repetitive elements are
+    frequently split into multiple clusters rather than being represented as a
+    single one. If you do not want to merge clusters based on the presence
+    of broken read pairs, disable this option. 
+    
+    **Use custom repeat database**. This option allows users to perform similarity
+    comparison of identified repeats to their custom databases. The repeat class should
+    be encoded in FASTA headers of database entries in order to allow correct 
+    parsing of similarity hits.
+
+    **Similarity search options** By default sequence reads are compared using
+    mgblast program. Default threshold is explicitly set to 90% sequence
+    similarity spanning at least 55% of the read length (in the case of reads
+    differing in length it applies to the longer one). Additionally, sequence
+    overlap must be at least 55 nt. If you select option for shorter reads
+    than 100 nt,  minimum overlap 55 nt is not required.
+    
+    By default,
+    the mgblast search uses the DUST program to filter out
+    low-complexity sequences. If you want
+    to increase the sensitivity of detection of satellites with shorter monomers,
+    use the option with '*no masking of low complexity repeats*'. Note that omitting
+    DUST filtering will significantly increase running times.
+    
+    **Output**
+
+    A list of clusters identified as putative satellite repeats, their genomic
+    abundance and various cluster characteristics are provided. Length and
+    consensus sequences of reconstructed monomers are also shown and
+    accompanied by a detailed output from kmer-based reconstruction including
+    sequences and sequence logos of alternative variants of monomer sequences.
+
+    The output includes an **HTML summary** with a table listing all analyzed
+    clusters. More detailed information about clusters is provided in
+    additional files and directories. All results are also provided as a
+    downloadable **zip archive**. Since read clustering results in
+    thousands of clusters, the search for satellite repeats is limited to
+    a subset of the largest ones corresponding to the most abundant genomic
+    repeats. The default setting of the pipeline is to analyze all clusters containing at least
+    0.01% of the input reads. Besides the satellite repeats, three other
+    groups of clusters are reported in the output (1) LTR-retrotransposons,
+    (2) 45S and 5S rDNA and (3) all remaining clusters passing the size
+    threshold. As (1) and (2) contain sequences with circular
+    graphs, their consensus is calculated in the same way as for satellite
+    repeats. Additionally a **log file** reporting the progress of the
+    computational pipeline is provided.
+
+    
+  </help>
+
+</tool>