view repex_tarean.xml @ 2:968f0867acc5 draft

documentation and help update
author petr-novak
date Mon, 03 Feb 2020 02:34:46 -0500
parents e2b8e71b85b9
children 67964b619af8
line wrap: on
line source

<tool id="tarean" name="Tandem Repeat Analyzer"  version="2.3.7" >
    <!-- Rules that classify the job from its stderr text and its exit code. -->
    <stdio>
      <!-- The text "Traceback" in stderr is reported at level fatal. -->
      <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
      <!-- The text "error" in stderr is reported at level fatal. -->
      <regex match="error" source="stderr" level="fatal" description="Unknown error" />
      <!-- The text "warning" in stderr is reported at level warning only. -->
      <regex match="warning" source="stderr" level="warning" description="Unknown warning" />
      <!-- Any exit code of 1 or higher is reported at level fatal. -->
      <exit_code range="1:" level="fatal" description="Error" />
    </stdio>
    <description>Identification of genomic tandem repeats from NGS data</description>
    <!-- Software dependencies of the tool. Only repex_tarean and pyrserve are
         pinned to a version; all other packages are left unpinned. -->
    <requirements>
      <!-- Command-line programs (image conversion, alignment, similarity search). -->
      <requirement type="package">imagemagick</requirement>
      <requirement type="package">mafft</requirement>
      <requirement type="package">blast</requirement>
      <requirement type="package">diamond</requirement>
      <requirement type="package">blast-legacy</requirement>
      <!-- Packages with the "r-" / "bioconductor-" prefix; presumably the R
           libraries used by the pipeline (confirm against the pipeline sources). -->
      <requirement type="package">r-igraph</requirement>
      <requirement type="package">r-data.tree</requirement>
      <requirement type="package">r-stringr</requirement>
      <requirement type="package">r-r2html</requirement>
      <requirement type="package">r-hwriter</requirement>
      <requirement type="package">r-dt</requirement>
      <requirement type="package">r-scales</requirement>
      <requirement type="package">r-plotrix</requirement>
      <requirement type="package">r-png</requirement>
      <requirement type="package">r-plyr</requirement>
      <requirement type="package">r-dplyr</requirement>
      <requirement type="package">r-optparse</requirement>
      <requirement type="package">r-dbi</requirement>
      <requirement type="package">r-rsqlite</requirement>
      <requirement type="package">r-rserve</requirement>
      <requirement type="package">bioconductor-biostrings</requirement>
      <!-- The pipeline package itself; its version matches the tool version attribute. -->
      <requirement type="package" version="2.3.7">repex_tarean</requirement>
      <!-- REPEX is dereferenced by the command template to locate seqclust and
           stderr_filter.py. REPEX_VERSION is not referenced elsewhere in this file. -->
      <requirement type="set_environment">REPEX</requirement>
      <requirement type="set_environment">REPEX_VERSION</requirement>
      <requirement type="package" version="0.9.1">pyrserve</requirement>
    </requirements>
  <!-- Runs seqclust in TAREAN mode on the interlaced FASTA input, appends the
       captured stdout and stderr to the log dataset, runs stderr_filter.py on
       the captured stderr, then packages the tarean_output directory into the
       zip archive and into the extra-files directory of the HTML report.

       Error handling as written: the exit status of seqclust is discarded (the
       call is terminated by ';'), and the trailing '|| :' forces the
       post-processing chain to finish with status 0. Failures are therefore
       presumably surfaced through what stderr_filter.py prints (TODO: confirm
       against that script).

       Dataset paths are single-quoted so that unusual characters in a path
       cannot split shell words, and the extra-files directory is created with
       "mkdir -p" so that an already existing directory does not abort the
       copy steps. -->
  <command detect_errors="exit_code"><![CDATA[
    export PYTHONHASHSEED=0;
    \${REPEX}/seqclust --paired --sample ${sample} --output_dir=tarean_output --logfile='${log}' --cleanup --tarean_mode
    #if $advanced_options.advanced:
      --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering -M $advanced_options.merging
      #if $advanced_options.custom_library.options_custom_library:
        -d '$advanced_options.custom_library.library' extra_database
      #end if
      #if $advanced_options.options.options:
        -opt $advanced_options.options.options
      #end if
    #else:
      -M 0.2
    #end if
    '${FastaFile}' > stdout.log 2> stderr.log ;
    echo "STDOUT CONTENT:" >> '${log}' ;
    cat stdout.log >> '${log}' ;
    echo "STDERR CONTENT:" >> '${log}' ;
    cat stderr.log >> '${log}' &&
    \${REPEX}/stderr_filter.py stderr.log &&
    cd tarean_output &&
    zip -r '${ReportArchive}.zip' * &&
    mv '${ReportArchive}.zip' '${ReportArchive}' &&
    cp index.html '${ReportFile}' &&
    mkdir -p '${ReportFile.files_path}' &&
    cp -r --parents libdir '${ReportFile.files_path}' &&
    cp -r --parents seqclust/clustering/superclusters '${ReportFile.files_path}' &&
    cp -r --parents seqclust/clustering/clusters '${ReportFile.files_path}' &&
    cp seqclust/clustering/hitsort.cls '${ReportFile.files_path}/seqclust/clustering/hitsort.cls' &&
    cp *.png '${ReportFile.files_path}/' &&
    cp *.csv '${ReportFile.files_path}/' &&
    cp *.html '${ReportFile.files_path}/' &&
    cp *.css '${ReportFile.files_path}/' &&
    cp *.fasta '${ReportFile.files_path}/' 2>> '${log}' && rm -r ../tarean_output || :
  ]]></command>

  <!-- User-facing parameters of the tool.
       FastaFile and sample are always passed to seqclust. The parameters
       nested in advanced_options are passed only when "Advanced options" is
       switched on; otherwise the command template uses "-M 0.2".
       queue_definition is not referenced by the command template in this
       file; presumably it is consumed by the server's job configuration
       (TODO: confirm). -->
  <inputs>
    <param name="FastaFile" label="Paired-end Illumina reads" type="data" format="fasta"
           help="Input file must contain FASTA-formatted interlaced read pairs from paired-end sequencing. All pairs must be complete. Example of the input data format is provided in the help below."/>
    <param name="sample" label="Sample size" type="integer" value="500000" min="10000"/>

    <conditional name="advanced_options">
      <param name="advanced" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Advanced options"/>
      <when value="false">
        <!-- no additional parameters -->
      </when>
      <when value="true">
        <!-- truevalue/falsevalue are inserted verbatim after "-M" on the command line -->
        <param name="merging" type="boolean" truevalue="0.2" falsevalue="0" checked="true" label="Perform cluster merging" help="By default, clusters connected through paired-end reads are merged"/>
        <conditional name="custom_library">
          <param name="options_custom_library" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Use custom repeat database"/>
          <when value="false">
            <!-- no additional parameters -->
          </when>
          <when value="true">
            <param name="library" format="fasta" type="data" label="Use custom repeat database" help="Perform additional similarity search to user-provided repeat database. The database should contain FASTA-formatted DNA sequences with headers (sequence names) in the format: '&gt;repeatname#class/subclass'"/>
          </when>
        </conditional>
        <param name="size_threshold" label="Cluster size threshold for detailed analysis" type="float" value="0.01" min="0.0001" max="100" help="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with fewer than 20 reads are not considered."/>
        <!-- The two flags below expand to their option string when checked and to nothing otherwise -->
        <param name="automatic_filtering" label="Perform automatic filtering of abundant satellite repeats" type="boolean" truevalue="--automatic_filtering" falsevalue="" checked="false"/>
        <param name="keep_names" label="Keep original read names" type="boolean" truevalue="--keep_names" falsevalue="" checked="false" help="By default, reads are renamed using integers. Use this option if you want to keep original names."/>
        <conditional name="options">
          <param name="options" type="select" label="Similarity search options" help="Different similarity search parameters are used depending on the input data to adjust search to differences in read length and error rate">
            <option value="ILLUMINA" selected="true">Illumina reads, read length 100nt or more </option>
            <option value="ILLUMINA_SHORT" selected="false">Illumina reads, shorter than 100nt (Do not use reads shorter than 50nt!) </option>
            <option value="ILLUMINA_DUST_OFF" selected="false">Illumina reads, no masking of low complexity repeats  </option>
          </param>
          <!-- Explicit empty cases: none of the choices adds further parameters -->
          <when value="ILLUMINA"/>
          <when value="ILLUMINA_SHORT"/>
          <when value="ILLUMINA_DUST_OFF"/>
        </conditional>
      </when>
    </conditional>

    <!-- Each queue choice exposes an editable text field pre-filled with a
         scheduler resource specification for that queue. -->
    <conditional name="queue_definition">
      <param name="queue_select" type="select" label="Select queue">
        <option value="basic_fast_queue">basic (max runtime 2 days, 4 GB RAM)</option>
        <option value="long_slow_queue">long (max runtime 2 weeks, 64 GB RAM)</option>
        <option value="extra_long_slow_queue">extra long (max runtime 4 weeks, 64 GB RAM)</option>
      </param>
      <when value="basic_fast_queue">
        <param name="queue_specification" type="text" label="Modify parameters (optional)"
               value="-l select=1:ncpus=10:mem=32gb:scratch_local=50gb -l walltime=48:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=4000000,TAREAN_CPU=4"/>
      </when>
      <when value="long_slow_queue">
        <param name="queue_specification" type="text" label="Modify parameters (optional)"
               value="-l select=1:ncpus=16:mem=112gb:scratch_local=50gb -l walltime=336:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=64000000,TAREAN_CPU=15"/>
      </when>
      <when value="extra_long_slow_queue">
        <param name="queue_specification" type="text" label="Modify parameters (optional)"
               value="-l select=1:ncpus=16:mem=112gb:scratch_local=50gb -l walltime=720:00:00 -q elixirre@pbs.elixir-czech.cz -v TAREAN_MAX_MEM=64000000,TAREAN_CPU=15"/>
      </when>
    </conditional>
  </inputs>
  <!-- Datasets produced by one run: the plain-text log written by the command,
       the zip archive of the tarean_output directory, and the HTML report
       (index.html plus its extra-files directory). -->
  <outputs>
    <data name="log"
          format="txt"
          label="TAREAN log file"/>
    <data name="ReportArchive"
          format="zip"
          label="TAREAN Archive with HTML report from data ${FastaFile.hid}"/>
    <data name="ReportFile"
          format="html"
          label="TAREAN HTML report from data ${FastaFile.hid}"/>
  </outputs>

  <help>
    **HELP**
    
    TAREAN - TAndem REpeat ANalyzer is a computational pipeline for
    **unsupervised identification of satellite repeats** from unassembled
    sequence reads. The pipeline uses low-pass paired-end whole genome
    sequence reads and performs graph-based clustering. The resulting
    clusters, representing all types of repeats present in the genome, are
    then examined to identify those containing circular structures indicative
    of tandem repeats. A poster summarizing TAREAN principles and
    implementation can be found `here`__.


    .. __: http://w3lamc.umbr.cas.cz/lamc/?page_id=312

    **Input data**
    
 
    The analysis requires **paired-end reads** generated by whole genome
    shotgun sequencing. The data should be provided as a single input file in
    fasta format with the reads interlaced (see example below). All the pairs
    must be complete, i.e. both "forward" and "reverse" sequence reads must be
    present. The reads should all be trimmed to the same length. The optimal
    size range is between 100 and 200 nucleotides. The number of reads to be
    analyzed should not exceed 1x coverage of the genome. Genome coverage
    between 0.01 and 0.5x is recommended. The reads should be filtered for
    quality. The recommended quality filtering is as follows: each read should
    have a quality score >=10 for 95% of the bases, i.e. if your reads are 100
    base pairs long, then a read only passes this quality threshold if 95
    bases have a quality of 10 or higher. Additionally, any reads containing
    indeterminate base pairs (indicated as N in the reads) should be removed.
    Finally, if either one of the reads in a pair fails to meet the
    aforementioned thresholds, **both** sequences should be removed.

    Example of interlaced input format::
    
      >0001_f
      CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
      >0001_r
      GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
      >0002_f
      ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
      >0002_r
      TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
      >0003_f
      TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
      >0003_r
      TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
      ...


    To perform the quality filtering on your fastQ formatted data as described
    above, and to interlace your paired-end sequence reads,
    please use the `Preprocessing of paired-reads`__  tool.

    .. __: tool_runner?tool_id=paired_fastq_filtering


    **Additional parameters**

    **Sample size** defines how many reads will be used during the computation.
    The default setting of 500,000 reads will enable detection of high copy
    number satellites within several hours. For higher
    sensitivity the sample size can be increased. Since the sample size affects
    memory usage, this parameter may be automatically adjusted to a lower value
    during the run. The maximum sample size which can be processed depends on the
    repetitiveness of the analyzed genome. This significantly limits the number of reads
    that can be analyzed with the TAREAN pipeline.

    **Perform cluster merging**. Families of repetitive elements are
    frequently split into multiple clusters rather than being represented as a
    single one. If you do not want to merge clusters based on the presence
    of broken read pairs, disable this option. 
    
    **Use custom repeat database**. This option allows users to perform similarity
    comparison of identified repeats to their custom databases. The repeat class should
    be encoded in FASTA headers of database entries in order to allow correct 
    parsing of similarity hits.

    **Similarity search options**. By default, sequence reads are compared using
    the mgblast program. The default threshold is explicitly set to 90% sequence
    similarity spanning at least 55% of the read length (in the case of reads
    differing in length, it applies to the longer one). Additionally, the sequence
    overlap must be at least 55 nt. If you select the option for reads shorter
    than 100 nt, the minimum overlap of 55 nt is not required.
    
    By default, the mgblast search uses the DUST program to filter out
    low-complexity sequences. If you want to increase the sensitivity of
    detection of satellites with shorter monomers, use the option
    '*no masking of low complexity repeats*'. Note that omitting
    DUST filtering will significantly increase running times.
    
    **Output**

    A list of clusters identified as putative satellite repeats, their genomic
    abundance and various cluster characteristics are provided. Length and
    consensus sequences of reconstructed monomers are also shown and
    accompanied by a detailed output from kmer-based reconstruction including
    sequences and sequence logos of alternative variants of monomer sequences.

    The output includes an **HTML summary** with a table listing all analyzed
    clusters. More detailed information about clusters is provided in
    additional files and directories. All results are also provided as a
    downloadable **zip archive**. Since read clustering results in
    thousands of clusters, the search for satellite repeats is limited to
    a subset of the largest ones corresponding to the most abundant genomic
    repeats. The default setting of the pipeline is to analyze all clusters containing at least
    0.01% of the input reads. Besides the satellite repeats, three other
    groups of clusters are reported in the output (1) LTR-retrotransposons,
    (2) 45S and 5S rDNA and (3) all remaining clusters passing the size
    threshold. As (1) and (2) contain sequences with circular
    graphs, their consensus is calculated in the same way as for satellite
    repeats. Additionally a **log file** reporting the progress of the
    computational pipeline is provided.

    
  </help>

</tool>