Mercurial > repos > artbio > lumpy_smoove

<tool id="lumpy_smoove" name="lumpy_smoove" version="0.2.5+galaxy8">
    <description>find structural variants using the smoove workflow</description>
    <!-- Shared macro definitions are imported from macro_lumpy_smoove.xml.
         Presumably this file provides the @pipefail@ / @set_fasta_index@ tokens and the
         reference_source_conditional macro used below; verify against the macro file. -->
    <macros>
        <import>macro_lumpy_smoove.xml</import>
    </macros>
    <!-- Runtime dependency: the smoove package, pinned to version 0.2.5. -->
    <requirements>
        <requirement type="package" version="0.2.5">smoove</requirement>
    </requirements>
    <!-- Declares every exit code of 1 or above as a fatal "Tool exception".
         NOTE(review): the command tag also sets detect_errors="exit_code"; confirm which of
         the two declarations Galaxy actually honours and whether both are needed. -->
    <stdio>
        <exit_code range="1:" level="fatal" description="Tool exception" />
    </stdio>
    <command detect_errors="exit_code"><![CDATA[
    ## Stage the inputs under fixed names in the job working directory, run
    ## smoove call on every staged BAM, then decompress the genotyped VCF.
    #import re
    @pipefail@
    @set_fasta_index@
    ## Galaxy dataset paths are single-quoted so the shell never word-splits them.
    #if $set_plan.plan_choice=='pair':
        ln -s '$set_plan.normal_bam' normal.bam &&
        ln -f -s '$set_plan.normal_bam.metadata.bam_index' normal.bam.bai &&
        ln -s '$set_plan.tumor_bam' tumor.bam &&
        ln -f -s '$set_plan.tumor_bam.metadata.bam_index' tumor.bam.bai &&
    #elif $set_plan.plan_choice=='single':
        ln -s '$set_plan.single_bam' single.bam &&
        ln -f -s '$set_plan.single_bam.metadata.bam_index' single.bam.bai &&
    #else:
        #for $sample in $set_plan.cohort:
            ## Element identifiers are user supplied: keep only characters that are
            ## safe in a file name before they reach the command line.
            #set $sample_name = re.sub('[^A-Za-z0-9_.-]', '_', str($sample.element_identifier))
            ln -s '$sample' '${sample_name}.bam' &&
            ln -f -s '$sample.metadata.bam_index' '${sample_name}.bam.bai' &&
        #end for
    #end if

    ## The exclusion BED is addressed through its enclosing set_exclusion conditional.
    smoove call --name output
        #if $set_exclusion.choices=="yes":
            --exclude '$set_exclusion.bedmask'
        #end if
            --fasta reference.fa
            --processes \${GALAXY_SLOTS:-4}
            --genotype
       #if $prpos=="no":
            --removepr
       #end if
           *.bam &&
    gunzip -c output-smoove.genotyped.vcf.gz > '$vcf_call'

    ]]></command>
    <inputs>
        <!-- Reference genome selection; the markup is expanded from the imported macro file. -->
        <expand macro="reference_source_conditional" />
        <!-- Analysis plan: tumor/normal pair, single sample, or a small cohort. -->
        <conditional name="set_plan">
            <param name="plan_choice" type="select" label="Analyse a single Bam or a pair of Bam (eg normal/tumor)" display="radio">
                <option value="pair" selected="true">A pair of Bam files</option>
                <option value="single">A single Bam</option>
                <option value="cohort">a small cohort of Bam files (less than ~40)</option>
            </param>
            <when value="pair">
                <param format="bam" name="normal_bam" type="data" label="BAM alignment from the normal sample"/>
                <param format="bam" name="tumor_bam" type="data" label="BAM alignment from the tumor sample"/>
            </when>
            <when value="single">
                <param format="bam" name="single_bam" type="data" label="BAM alignment from a single sample"/>
            </when>
            <when value="cohort">
                <!-- The command template iterates over the elements as plain datasets,
                     so only flat list collections are accepted. -->
                <param name="cohort" type="data_collection" collection_type="list" format="bam" label="A collection of bam files"/>
            </when>
        </conditional>

        <!-- Optional BED mask of regions that smoove should ignore. -->
        <conditional name="set_exclusion">
            <param name="choices" type="select" label="exclude regions with a bed file" display="radio">
                <option value="no" selected="true">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="yes">
                <param format="bed" name="bedmask" type="data" label="BED regions to be excluded for the analysis"/>
            </when>
            <when value="no">
            </when>
        </conditional>
        <!-- When set to "no", the command template adds the removepr flag to smoove call. -->
        <param name="prpos" type="select" label="include the PRPOS probabilities in INFO tags" display="radio">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
    </inputs>

    <outputs>
        <!-- Uncompressed VCF written by the final gunzip step of the command. -->
        <data name="vcf_call" format="vcf" label="lumpy-smoove Variant Calling"/>
    </outputs>

    <tests>
        <!-- Cohort plan: two-element list collection, exclusion mask enabled. -->
        <test>
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="chrI-ce11.fa"/>
            <conditional name="set_plan">
                <param name="plan_choice" value="cohort"/>
                <param name="cohort">
                    <collection type="list">
                        <element name="1" ftype="bam" value="celegans_RG_1.bam"/>
                        <element name="2" ftype="bam" value="celegans_RG_2.bam"/>
                    </collection>
                </param>
            </conditional>
            <conditional name="set_exclusion">
                <param name="choices" value="yes"/>
                <param name="bedmask" value="exclude.bed"/>
            </conditional>
            <param name="prpos" value="no"/>
            <output name="vcf_call" ftype="vcf" file="result-6.vcf" lines_diff="12"/>
        </test>
        <!-- Pair plan with the exclusion mask enabled. -->
        <test>
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="chrI-ce11.fa"/>
            <conditional name="set_plan">
                <param name="plan_choice" value="pair"/>
                <param name="normal_bam" value="celegans_RG_1.bam"/>
                <param name="tumor_bam" value="celegans_RG_2.bam"/>
            </conditional>
            <conditional name="set_exclusion">
                <param name="choices" value="yes"/>
                <param name="bedmask" value="exclude.bed"/>
            </conditional>
            <param name="prpos" value="no"/>
            <output name="vcf_call" ftype="vcf" file="result-1.vcf" lines_diff="12"/>
        </test>
        <!-- Pair plan without an exclusion mask. -->
        <test>
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="chrI-ce11.fa"/>
            <conditional name="set_plan">
                <param name="plan_choice" value="pair"/>
                <param name="normal_bam" value="celegans_RG_1.bam"/>
                <param name="tumor_bam" value="celegans_RG_2.bam"/>
            </conditional>
            <conditional name="set_exclusion">
                <param name="choices" value="no"/>
            </conditional>
            <param name="prpos" value="no"/>
            <output name="vcf_call" ftype="vcf" file="result-2.vcf" lines_diff="12"/>
        </test>
        <!-- Pair plan with the normal and tumor BAM files swapped. -->
        <test>
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="chrI-ce11.fa"/>
            <conditional name="set_plan">
                <param name="plan_choice" value="pair"/>
                <param name="normal_bam" value="celegans_RG_2.bam"/>
                <param name="tumor_bam" value="celegans_RG_1.bam"/>
            </conditional>
            <conditional name="set_exclusion">
                <param name="choices" value="no"/>
            </conditional>
            <param name="prpos" value="no"/>
            <output name="vcf_call" ftype="vcf" file="result-3.vcf" lines_diff="12"/>
        </test>
        <!-- Pair plan with prpos set to yes. -->
        <test>
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="chrI-ce11.fa"/>
            <conditional name="set_plan">
                <param name="plan_choice" value="pair"/>
                <param name="normal_bam" value="celegans_RG_1.bam"/>
                <param name="tumor_bam" value="celegans_RG_2.bam"/>
            </conditional>
            <conditional name="set_exclusion">
                <param name="choices" value="no"/>
            </conditional>
            <param name="prpos" value="yes"/>
            <output name="vcf_call" ftype="vcf" file="result-4.vcf" lines_diff="12"/>
        </test>
        <!-- Single-sample plan. -->
        <test>
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="chrI-ce11.fa"/>
            <conditional name="set_plan">
                <param name="plan_choice" value="single"/>
                <param name="single_bam" value="celegans_RG_1.bam"/>
            </conditional>
            <conditional name="set_exclusion">
                <param name="choices" value="no"/>
            </conditional>
            <param name="prpos" value="no"/>
            <output name="vcf_call" ftype="vcf" file="result-5.vcf" lines_diff="12"/>
        </test>
    </tests>

    <help>
**smoove** simplifies and speeds calling and genotyping SVs for short reads. It also improves
specificity by removing many spurious alignment signals that are indicative of low-level
noise and often contribute to spurious calls.

There is a blog-post describing smoove in more detail
here: https://brentp.github.io/post/smoove/

Currently, this Galaxy tool only wraps smoove for a single BAM file, a pair of BAM files
(normal and tumor inputs) or a small collection of samples (&lt;40),
which translates into the command line::

    <![CDATA[smoove call --name my-cohort --exclude $bed --fasta $fasta -p $threads --genotype [--removepr] /path/to/*.bam]]>


The --exclude $bed option is highly recommended, as it makes smoove ignore reads that
overlap problematic regions.

A good set of regions for GRCh37 can be found here_

.. _here: https://github.com/hall-lab/speedseq/blob/master/annotations/ceph18.b37.lumpy.exclude.2014-01-15.bed


And a good set for GRCh38 can be found there_

.. _there: https://github.com/hall-lab/speedseq/blob/master/annotations/exclude.cnvnator_100bp.GRCh38.20170403.bed


smoove will::

    1. parallelize calls to lumpy_filter to extract split and discordant reads required by lumpy

    2. further filter lumpy_filter calls to remove high-coverage, spurious regions and user-specified chroms like 'hs37d5';
    it will also remove reads that we've found are likely spurious signals. after this, it will
    remove singleton reads (where the mate was removed by one of the previous filters)
    from the discordant bams. This makes lumpy much faster and less memory-hungry.

    3. calculate per-sample metrics for mean, standard deviation, and distribution of insert
    size as required by lumpy.

    4. stream output of lumpy directly into multiple svtyper processes for parallel-by-region
    genotyping while lumpy is still running.

    5. sort, compress, and index the final VCF (this Galaxy wrapper then decompresses the gzipped VCF output)

**Input(s)**

* BAM files: either a pair of BAM files (e.g. normal vs tumor sample), a single BAM file,
  or a small collection of BAM files.
  Only BAM alignments produced by BWA-mem have been tested with this tool.

    .. class:: warningmark

    For svtyper to run properly, it is mandatory that **BAM files contain read group information**,
    i.e. the @RG tag is present and filled in for each BAM file.


* A bed file describing the regions to exclude from the analysis
* Additional options: refer to the smoove GitHub repository_ and the lumpy publication (doi 10.1186/gb-2014-15-6-r84)

.. _repository: https://github.com/brentp/smoove


Options::

    <![CDATA[

    smoove calls several programs. Those with 'Y' are found on your $PATH. Only those with '*' are required.

  [Y] bgzip [ sort   -> (compress) ->   index ]
  [Y] gsort [(sort)  ->  compress   ->  index ]
  [Y] tabix [ sort   ->  compress   -> (index)]
  [Y] lumpy
  [Y] lumpy_filter
  [Y] samtools
  [Y] svtyper
  [Y] mosdepth [extra filtering of split and discordant files for better scaling]

  [Y] duphold [(optional) annotate calls with depth changes]
  [Y] svtools [only needed for large cohorts].

    Available sub-commands are below. Each can be run with -h for additional help.

 call        : call lumpy (and optionally svtyper)
 merge       : merge and sort (using svtools) calls from multiple samples
 genotype    : parallelize svtyper on an input VCF
 paste       : square final calls from multiple samples (each with same number of variants)
 plot-counts : plot counts of split, discordant reads before, after smoove filtering
 annotate    : annotate a VCF with gene and quality of SV call
 hipstr      : run hipSTR in parallel
 cnvnator    : run cnvnator in parallel
 duphold     : run duphold in parallel (this can be done by adding a flag to call or genotype)
    ]]>
    </help>

    <citations>
        <!-- lumpy publication, also referenced in the help text -->
        <citation type="doi">10.1186/gb-2014-15-6-r84</citation>
    </citations>
</tool>
author	artbio
date	Sun, 09 May 2021 17:09:33 +0000
parents	8b0005fdfd04
children	7dcf61950215