view Tasmanian.xml @ 1:b15fbf90db53 draft default tip

"planemo upload for repository https://github.com/nebiolabs/tasmanian-mismatch commit d67025e9c7764d77f3f622b4d1ac0535b06c63de"
author iuc
date Sat, 24 Jul 2021 17:47:35 +0000
parents bc0b40dec7d2
children
line wrap: on
line source

<tool id="tasmanian_mismatch" name="Analysis of artifacts with Tasmanian" version="@TOOL_VERSION@" profile="20.05">
  <description>Quantify, visualize and summarize mismatches in deep sequencing data</description>
  <!-- Version tokens: every @TOOL_VERSION@ / @SAMTOOLS_VERSION@ placeholder in this file expands to the value set here. -->
  <macros>
    <token name="@TOOL_VERSION@">1.0.7</token>
    <token name="@SAMTOOLS_VERSION@">1.13</token>
  </macros>
  <!-- Packages the command section depends on; both versions are pinned through the tokens declared in <macros>. -->
  <requirements>
    <requirement version="@TOOL_VERSION@" type="package">tasmanian-mismatch</requirement>
    <requirement version="@SAMTOOLS_VERSION@" type="package">samtools</requirement>
  </requirements>
  <command detect_errors="exit_code">
    <![CDATA[

     ## Without pipefail the exit status of the pipeline below is that of its last
     ## stage only, so a failing samtools or run_intersections would go unnoticed
     ## even though detect_errors="exit_code" is set.
     set -o pipefail &&

     ## Name of the symlink created when the reference is taken from the history.
     #set $reference_fasta_filename = "localref.fa"

     #if str( $reference_source.reference_source_selector ) == "history":
        ln -s '${reference_source.ref_file}' '${reference_fasta_filename}' &&
     #else:
        ## Cached reference: use the path field of the selected data table entry directly.
        #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
     #end if

     ## Stream the alignments from the BAM input into the rest of the pipeline.
     samtools view '${bam_input}' |

     ## The bed file is optional; the intersection stage is only inserted when one was supplied.
     #if $bed_filename:
        run_intersections -b '$bed_filename' |
     #end if

     ## Standard output of run_tasmanian becomes the output_table dataset.
     run_tasmanian
        -q '${basequality}'
        -s '${softclips}'
        -m '${mapquality}'
        -c '${confidence}'
        -r '${reference_fasta_filename}' > '${output_table}'

  ]]></command>
  <inputs>
    <!-- Alignment file; the command reads it with `samtools view` -->
    <param type="data" name="bam_input" label="Bam/Sam alignment file" format="bam" help="Specify BAM/SAM dataset. If not using a bed file, this file MUST BE SORTED"/>
    <!-- Reference genome: either an entry of the all_fasta data table ("cached") or a FASTA dataset from the history -->
    <conditional name="reference_source">
      <param name="reference_source_selector" type="select" label="Reference genome" help="You can select a reference genome from your history or use a built-in index (Local cache)">
        <option value="cached">Local cache</option>
        <option value="history">History</option>
      </param>
      <when value="cached">
        <param name="ref_file" type="select" label="Select the reference genome from the list">
          <options from_data_table="all_fasta">
            <filter type="sort_by" column="2" />
            <validator type="no_options" message="No indexes are available" />
          </options>
        </param>
      </when>
      <when value="history">
        <param name="ref_file" type="data" format="fasta" label="Use reference genome from history" help="You can first upload a FASTA sequence to the history" />
      </when>
    </conditional>

    <!-- Optional bed file; when given, the command pipes the reads through run_intersections -->
    <param name="bed_filename" type="data" format="bed" optional="true" label="Select a bed file" help="The bed file should contain at least: &quot;chrN&quot;, &quot;start&quot; and &quot;stop&quot;, and is tab separated."/>

    <!-- Additional parameters, passed to run_tasmanian as -c, -s, -m and -q respectively -->
    <param name="confidence" label="Boundary" type="integer" value="20" min="0" max="100"
        help="Number of bases in boundary region, from 0 to length of the read (read help below). Default=20"/>
    <param name="softclips" label="Choose an action with softclips" type="select" display="radio"
        help="How softclips should be treated. Values include 0,1 or 2 (read the help below). Default=0">
      <option value="1">Never use softclipped bases</option>
      <option value="2">Always use softclipped bases</option>
      <option value="0" selected="True">Automatic decision (Default)</option>
    </param>
    <param name="mapquality" label="Map quality" type="integer" min="0" max="70" value="20" help="Exclude reads with lower mapQ than this number. Default=20"/>
    <param name="basequality" label="Base quality" type="integer" min="0" max="70" value="20" help="Exclude bases with lower Base quality than this number. Default=20"/>
    <!-- Not used on the command line; it only drives the filter on the html_file output -->
    <param name="keepHTML_conditional" type="select" label="keep HTML output file?">
      <option value="yes">Yes</option>
      <option value="no">No</option>
    </param>
  </inputs>

  <!-- output_table receives the redirected stdout of the command. html_file is collected from the
       working directory and is only kept when keepHTML_conditional is "yes". -->
  <outputs>
    <data name="output_table" format="txt"/>
    <data name="html_file" format="html" label="tasmanian-mismatch results table" from_work_dir="Tasmanian_artifact_report.html">
      <filter>keepHTML_conditional == "yes"</filter>
    </data>
  </outputs>

  <!-- Members of the reference_source conditional are nested in a <conditional> element so that
       ref_file, which exists in both "when" branches, is addressed unambiguously. -->
  <tests>
    <!-- Reference from the history, with a bed file -->
    <test>
      <param name="bam_input" value="test2.bam" ftype="bam"/>
      <conditional name="reference_source">
        <param name="reference_source_selector" value="history"/>
        <param name="ref_file" value="small_region.fa"/>
      </conditional>
      <param name="bed_filename" value="test2.bed" ftype="bed"/>
      <output name="output_table" file="test2-bed.output" lines_diff="4"/>
    </test>
    <!-- Reference from the history, without a bed file -->
    <test>
      <param name="bam_input" value="test2.bam" ftype="bam"/>
      <conditional name="reference_source">
        <param name="reference_source_selector" value="history"/>
        <param name="ref_file" value="small_region.fa"/>
      </conditional>
      <output name="output_table" file="test2-nobed.output" lines_diff="4"/>
    </test>
    <!-- Cached reference (hg38 entry of the all_fasta data table), without a bed file -->
    <test>
      <param name="bam_input" value="test2.bam" ftype="bam" dbkey="hg38"/>
      <conditional name="reference_source">
        <param name="reference_source_selector" value="cached"/>
        <param name="ref_file" value="hg38"/>
      </conditional>
      <output name="output_table" file="test2-nobed.output" lines_diff="4"/>
    </test>
  </tests>

  <help>
    <![CDATA[

**What it does**

This tool counts the number/proportion of mismatches per position along the read, 
for each read (see figure below).

.. image:: ${static_path}/images/snapshot_good.jpg
    :height: 350                                                            
    :width: 650   

-----

**What is special**

By providing a bed file, tasmanian-mismatch will count mismatches from all regions depicted in the figure below,
and will report them separately. Also, a parameter defined as *"confidence"* allows including reads with >= *confidence*
bases in the boundary region in a separate group. This is useful when the bed refers to repeat regions. Since these
regions might not have been correctly placed in the assembly or are not the same in different individuals, we can
include these *confidence* repeat regions where we have high confidence in the reference genome to which we mapped the reads.

.. image:: ${static_path}/images/intersections_tasmanian.jpg                
    :height: 150                                                            
    :width: 650 

Softclips are critical in FFPE (Formalin-fixed paraffin-embedded) experiments as mismatches tend to accumulate at the ends of the reads. Most often, softclips 
are all accepted during the analysis and many real mismatches are indirectly excluded from the analysis. Hence, this tool 
provides different ways to deal with this: 

The *softclips* field allows for 3 different ways of treating softclips:

0) Exclude these regions if there is less than 2/3 identity with the reference genome
1) Exclude all softclipped bases
2) Include all softclipped bases

.. class:: warningmark   

BAM/SAM file must be **sorted** if not using a bed file.
  
    ]]>
  </help>
  <!-- BibTeX entry for the tool's GitHub repository. Authors are separated with " and " as BibTeX
       requires, and "others" comes last so it is treated as "et al." -->
  <citations>
    <citation type="bibtex">
      @misc{githubtasmanian,
        author = {Langhorst, B. and Erijman, A. and others},
        year = {2020},
        title = {TBD},
        publisher = {GitHub},
        journal = {GitHub repository},
        url = {https://github.com/nebiolabs/tasmanian-mismatch},
      }
    </citation>
  </citations>
</tool>