<!-- Mercurial > repos > artbio > sigmut -->

<tool id="SigProfiler" name="SigProfiler" version="@VERSION@">
    <description>performs mutational signature characterization from variant files</description>

    <!-- Pull in the macro definitions from sigmut_macros.xml. -->
    <macros>
        <import>sigmut_macros.xml</import>
    </macros>
    <!-- Expand the "requirements" and "stdio" macros; both are presumably defined in the imported file (not visible here). -->
    <expand macro="requirements"/>
    <expand macro="stdio"/>
    <command detect_errors="exit_code"><![CDATA[
        @pipefail@
        ## Resolve the directory that holds the sigprofiler executable.
        ## dirname removes only the last path component; a global sed
        ## substitution would also strip any parent directory that happens
        ## to be called sigprofiler.
        BIN=\$(dirname "\$(which sigprofiler)") &&
        echo "\$BIN" &&
        ## NOTE(review): the whole directory is made world-writable, presumably
        ## so that genome installation can write next to the package. Confirm
        ## that this is really required.
        chmod -R 777 "\$BIN" &&
        mkdir run_dir &&
        #if str( $set_analysis.choices ) == "get_sigmut":
            ## Link the selected dataset into run_dir under a file name whose
            ## extension matches the chosen input format.
            #if str( $set_analysis.vcfile_input.vcfile ) == "maf":
                #set $infile = 'run_dir/snps.maf'
                ln -s -f '$set_analysis.vcfile_input.maf_file' '$infile' &&
            #else if str( $set_analysis.vcfile_input.vcfile ) == "icgc":
                #set $infile = 'run_dir/snps.txt'
                ln -s -f '$set_analysis.vcfile_input.icgc_file' '$infile' &&
            #else if str( $set_analysis.vcfile_input.vcfile ) == "vcf":
                #set $infile = 'run_dir/snps.vcf'
                ln -s -f '$set_analysis.vcfile_input.vcf_file' '$infile' &&
            #end if
        #end if

        sigprofiler

        #if str( $set_analysis.choices ) == "install_genome":
            ## Installation mode: stdout is captured as the job log.
            -ig '$set_analysis.refgendwn' > install.log
        #else if str( $set_analysis.choices ) == "get_sigmut":
            ## Signature mode: every file in run_dir is analysed and plots are
            ## always requested.
            -g '$set_analysis.refgendat'
            -f 'run_dir'
            -n "project"
            -p
## ! implement exome functionality when good test available
##            #if str( $set_analysis.exome ) == "true":
##                -e
##            #end if
## ! implement per chromosome functionality when good test available
##            #if str( $set_analysis.chrom_based ) == "true":
##                -c
##            #end if
            #if str( $set_analysis.tsb_stat ) == "true":
                -t
            #end if
            #if str( $set_analysis.gs ) == "true":
                -s
            #end if
            ##-b $set_analysis.bed ### to be done
            ## Merge the individual plot files into a single PDF.
            && pdfcombine -f -s -o blinder.pdf run_dir/output/plots/*.pdf
            ## Listing the log directory only records its content in stdout.
            && ls run_dir/logs/
            #if str( $set_analysis.tsb_stat ) == "true":
                ## tail -n +1 on several files prints each file in full,
                ## preceded by a header line naming the file.
                && tail -n +1 run_dir/output/TSB/*.txt > transcriptional_strand_biases.txt
            #end if
            #if str( $set_analysis.seqInfo ) == "true":
                && tail -n +1 run_dir/output/*/*.all > information.txt
            #end if
        #end if
        ]]></command>

    <inputs>
        <!-- Top-level switch between installing a reference genome and running the signature analysis. -->
        <conditional name="set_analysis">
            <param name="choices" type="select" label="Which of the following jobs do you want to perform?">
                <option value="install_genome">Install 'de novo' a reference genome </option>
                <option value="get_sigmut">Obtain the mutational signatures from VCF files</option>
            </param>
            <when value="install_genome">
                <!-- Genome passed to "sigprofiler -ig". -->
                <param name="refgendwn" type="select" label="Reference genome" help="Get data from any of the following reference genomes:">
                    <option value="GRCh37">Homo sapiens, GRCh37.p13 [GCA_000001405.14] </option>
                    <option value="GRCh38">Homo sapiens, GRCh38.p12 [GCA_000001405.27] </option>
                    <option value="mm9">Mus musculus, GRCm37 [GCA_000001635.18]</option>
                    <option value="mm10">Mus musculus, GRCm38.p6 [GCA_000001635.8]</option>
                    <option value="rn6">Rattus norvegicus, Rnor_6.0 [GCA_000001895.4]</option>
                    <option value="c_elegans">Caenorhabditis elegans</option>
                    <option value="dog">Dog</option>
                </param>
            </when>

            <when value="get_sigmut">
                <!-- The selected format decides which dataset parameter is shown and which file extension the command uses for the linked input. -->
                <conditional name="vcfile_input">
                    <param name="vcfile" type="select" label="VC file" help="Select the format of your input data">
                        <option value="maf">Mutation Annotation Format</option>
                        <option value="icgc">Tab-separated file</option>
                        <option value="vcf">Variant Call Format</option>
                    </param>
                    <when value="maf">
                        <!-- NOTE(review): Galaxy's built-in "maf" datatype normally denotes Multiple Alignment Format; confirm that Mutation Annotation Format datasets can be selected here. -->
                        <param name="maf_file" type="data" format="maf" label="select VC file" help="Select the input file in MAF format." />
                    </when>
                    <when value="icgc">
                        <param name="icgc_file" type="data" format="txt" label="select VC file" help="Select the input file in ICGC format." />
                    </when>
                    <when value="vcf">
                        <param name="vcf_file" type="data" format="vcf" label="select VC file" help="Select the input file in VCF format." />
                    </when>
                </conditional>

                <!-- Genome passed to "sigprofiler -g"; the option list mirrors the one offered for installation. -->
                <param name="refgendat" type="select" label="Reference genome to be analyzed" help="Use the following reference genome:">
                    <option value="GRCh37">Homo sapiens, GRCh37.p13 [GCA_000001405.14] </option>
                    <option value="GRCh38">Homo sapiens, GRCh38.p12 [GCA_000001405.27] </option>
                    <option value="mm9">Mus musculus, GRCm37 [GCA_000001635.18]</option>
                    <option value="mm10">Mus musculus, GRCm38.p6 [GCA_000001635.8]</option>
                    <option value="rn6">Rattus norvegicus, Rnor_6.0 [GCA_000001895.4]</option>
                    <option value="c_elegans">Caenorhabditis elegans</option>
                    <option value="dog">Dog</option>
                </param>

<!-- implement bed when test available -->
<!--                <conditional name="bed_input">
                    <param name="bedfile" type="select" label="BED file" help="Input a BED file">
                        <option value="yes">Yes</option>
                        <option value="no" selected="true">No</option>
                    </param>
                    <when value='yes'>
                        <param name="bed_file" format="bed" type="data" label="Use a BED file containing the set of regions" help="Provide a BED file"/>
                    </when>
                    <when value='no'>
                    </when>
                </conditional> -->
                <!-- implement exome functionality when test available -->
                <!-- <param name="exome" type="boolean" label="Use only the exome?" checked="False" help="Use exome"/> -->
                <!-- implement chrom_based functionality when test available -->
                <!--<param name="chrom_based" type="boolean" label="Create the matrices on a per chromosome basis?" checked="False" help="Show snvs"/> -->
                <!-- The command template compares each boolean against the string "true", so truevalue/falsevalue are spelled out on all three. -->
                <param name="tsb_stat" type="boolean" truevalue="true" falsevalue="false" label="Perform a transcriptional strand bias test?" checked="False" help="Performs a transcriptional strand bias test for the 24, 384, and 6144 contexts (-t)"/>
                <param name="seqInfo" type="boolean" truevalue="true" falsevalue="false" label="Export sequence information?" checked="False" help="Show sequence information"/>
                <param name="gs" type="boolean" truevalue="true" falsevalue="false" label="Perform a gene strand bias test?" checked="False" help="Performs a gene strand bias test (-s)"/>
            </when>
        </conditional>
    </inputs>

    <outputs>
        <!-- Installation mode: the command redirects sigprofiler's stdout to install.log. -->
        <data format="txt" name="logref" label="Log file: Install a Reference Genome"
              from_work_dir="./install.log">
            <filter>set_analysis['choices'] == 'install_genome'</filter>
        </data>
        <!-- NOTE(review): from_work_dir contains a shell-style wildcard; it is normally treated as a literal relative path, so confirm that Galaxy really picks up the log file. -->
        <data format="txt" name="logsmt" label="Log file: Calculate Mutational Signatures"
              from_work_dir="run_dir/logs/SigProfilerMatrixGenerator*.out">
            <filter>set_analysis['choices'] == 'get_sigmut'</filter>
        </data>

        <!-- The command always passes -p and always builds blinder.pdf in signature mode.
             No "plot" input parameter exists, so the filter must test the analysis choice only;
             a lookup of a missing key would raise instead of filtering. -->
        <data format="pdf" name="blinder" label="SBS Mutational Signatures plots (pdf)"
              from_work_dir="./blinder.pdf" >
            <filter>set_analysis['choices'] == 'get_sigmut'</filter>
        </data>

        <!-- implement exome outputs when test available -->
        <!--
        <data format="txt" name="dbs_exome" label="DBS_exome.vcf">
            <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter>
        </data>
        <data format="txt" name="snv_exome" label="SNV_exome.vcf">
            <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter>
        </data>

        <data format="txt" name="sig_exome" label="DBS 78 and so on Sig. Mut. EXOME">
            <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter>
        </data>
        -->
        <!-- Concatenated TSB text files; only written by the command when tsb_stat is enabled. -->
        <data format="txt" name="tsb" label="Transcriptional Strand Biases"
              from_work_dir="./transcriptional_strand_biases.txt" >
            <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['tsb_stat'] is True</filter>
        </data>

        <!-- Concatenated *.all files; only written by the command when seqInfo is enabled. -->
        <data format="txt" name="seqinfo" label="Mutational Signature detailed infos"
              from_work_dir="./information.txt" >
            <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['seqInfo'] is True</filter>
        </data>

    </outputs>
    <tests>
        <!-- Genome installation: compare the captured installation log. -->
        <test>
            <param name="choices" value="install_genome"/>
            <param name="refgendwn" value="GRCh38"/>
            <output name="logref" file="hg38_install.log" lines_diff="5"/>
        </test>
        <!-- Signature extraction from a VCF input. No "plot" parameter is set because the
             tool declares none; plots are always produced in this mode. -->
        <test>
            <param name="choices" value="get_sigmut"/>
            <param name="refgendat" value="GRCh38"/>
            <param name="vcfile" value="vcf"/>
            <param name="vcf_file" ftype="vcf" value="hg38.vcf"/>
            <output name="logsmt" ftype="txt" file="sigmut.log" lines_diff="5" />
            <output name="blinder" file="hg38_blinder.pdf"  lines_diff="5" />
        </test>
    </tests>

    <help><![CDATA[

        **SigProfiler**

        Background:

        Cancer genomes evince somatic mutations, which are imprinted by
        different mutational processes, that give rise to diverse
        mutational signatures. Their analysis from single base
        substitutions and their immediate sequencing context, allows the
        classification of small mutational events (including
        substitutions, insertions, deletions, and doublet substitutions)
        for better understanding the mutational processes that have
        shaped a cancer genome.

        In this sense, SigProfiler is a Galaxy-based wrapper of
        a computational method developed by Ludmil B. Alexandrov that
        allows the exploration and visualization of mutational patterns
        for all types of small mutational events. Specifically, the
        following actions can be performed using the SigProfiler wrapper:

        1. Identify and categorize the mutations as
        single nucleotide variants (SNVs), doublet base substitutions
        (DBS), or insertions/deletions, and provide further
        transcriptional strand bias categorization. Afterwards, the
        classifications of these mutations are integrated into distinct
        matrices.
        SigProfiler provides matrix generation support for SBS-6,
        SBS-96, SBS-1536, DBS-78 and DBS-1248. In addition,
        mutational matrices for indels, including
        ID-28 and ID-83, are generated, as well as an ID-8628 matrix that
        extends the ID-83 classification.
        SigProfiler examines transcriptional strand bias for single base
        substitutions, doublet base substitutions, and small indels. It
        is evaluated whether a mutation occurs on the transcribed or the
        non-transcribed strand of well-annotated protein coding genes of
        a reference genome. Mutations found in the transcribed regions
        of the genome are further subclassified as: (i) transcribed,
        (ii) un-transcribed, (iii) bi-directional, or (iv) unknown.

        2. Generation of plots of all types of mutational signatures as
        well as all types of mutational patterns in cancer genomes.

        Additional Information:

        Classification of Single Base substitutions (SBSs):
        Single base substitutions (SBSs) are single DNA base-pairs
        substituted with another single DNA base-pairs. The most
        basic classification catalogues SBSs into six distinct
        categories, including: C:G > A:T, C:G > G:C, C:G > T:A,
        T:A > A:T, T:A > C:G, and T:A > G:C. In practice, a C:G > A:T
        substitution is denoted as either a C > A mutation using the
        pyrimidine base or as a G > T mutation using the purine base.
        In consequence, the most commonly used SBS-6 classification of
        single base substitutions can be written as: C > A, C > G,
        C > T, T > A, T > C, and T > G.
        Additionally, the SBS-6 classification can be further
        expanded by considering the base-pairs immediately
        adjacent 5′ and 3′ to the somatic mutation. Therefore, an
        extended classification for analysis of mutational signatures is
        SBS-96, where each of the classes in SBS-6 is further elaborated
        using one base adjacent at the 5′ of the mutation and one base
        adjacent at the 3′ of the mutation.
        Logically, SBS-96 can be further elaborated by including
        additional 5′ and 3′ adjacent context. Each of the six single
        base substitutions in SBS-6 has 256 possible pentanucleotides
        resulting in a classification with 1536 possible channels.

        Classification of Doublet Base substitutions (DBSs):
        Doublet base substitutions (DBSs) are somatic mutations in which
        a set of two adjacent DNA base-pairs is simultaneously
        substituted with another set of two adjacent DNA base-pairs. An
        example of a DBS is a set of CT:GA base-pairs mutating to a set
        of AA:TT base-pairs, which is usually denoted as CT:GA > AA:TT.
        It should be noted that a CT:GA > AA:TT mutation can be
        equivalently written as either a CT > AA mutation or an
        AG > TT mutation. Overall, the
        basic classification catalogues DBSs into 78 distinct categories
        denoted as the DBS-78 matrix.
        Similarly, we can expand the characterization of DBS mutations
        by considering the 5′ and 3′ adjacent contexts. With
        seventy-eight possible DBS mutations having sixteen possible
        tetranucleotides each, this context expansion results in 1248
        possible channels denoted as the DBS-1248 context.

        Classification of small insertions and deletions (IDs):
        A somatic insertion is the incorporation of a set of base-pairs
        that lengthens a chromosome, while a somatic deletion is the
        removing of a set of existing base-pairs from a given location
        of a chromosome.
        Unfortunately, indel classification cannot be performed
        analogously to SBS or DBS classifications, where the immediate
        sequencing context flanking each mutation was
        utilized to subclassify these mutational events.
        Consequently, indels (IDs) are classified as single base-pair
        or longer events. They can be further subclassified as either a
        C:G or a T:A indel, while longer indels can also be
        subclassified based on their lengths: 2 bp, 3 bp, 4 bp, and
        5 + bp.

        Incorporation of transcription Strand Bias (TSB):
        The mutational classifications described above allow the
        characterization of mutational patterns of single base
        substitutions, doublet base substitutions, and small insertions
        and deletions. Nevertheless, these classifications can be
        further elaborated by incorporating strand bias. Mutations
        of the same type are expected to be equally distributed across the two
        DNA strands. However, in many cases an asymmetric number of mutations is
        observed due to either one of the strands being preferentially
        repaired or one of the strands having a higher propensity for
        being damaged. To sub-classify mutations based on their
        transcriptional strand bias, the pyrimidine orientation with
        respect to the locations of well-annotated protein coding genes
        on a genome is considered.

        Running SigProfiler:

        1. Reference Genomes:
        Before using SigProfiler, a reference genome must be
        installed. By default, the tool supports the following
        reference genomes:

                Human: GRCh37 & GRCh38

                Mouse: mm9 & mm10

                Rat: rn6

                Nematode: c_elegans

                Dog: dog

                A valid command line looks like:

                sigprofiler -ig GRCh37

        2. Mutational signatures calculation:

        After successful installation of a reference genome, SigProfiler
        can be applied to files containing somatic mutations in multiple
        formats, for transforming these mutational catalogues into mutational
        matrices. Specifically, the tool can read data formats such as
        Variant Calling Format (VCF) and Mutation Annotation Format
        (MAF) and the following parameters should be provided for
        generating the diverse matrices and plots:

        --name | -n = Project name
        --genome | -g = Reference Genome
        --files | -f = Absolute path where the input mutation files are located

        A valid command line looks like:

        sigprofiler -n MYPROJECT -g GRCh37 -f /path_to_folder_with_VCF_files/ -p

        **Options**
        --version               show program's version number and exit

        -h, --help              show this help message and exit

        --install_genome    Install de novo any of the following reference
                    genomes: 'GRCh37', 'GRCh38', 'mm9' or 'mm10'.

        --name=APPENDIX     Provide a project name

        --genome=NAME       Provide a reference genome (ex: GRCh37, GRCh38,
                    mm9 or mm10).

        --files=Abs_path    Path where the input vcf files are located

        --exome         Use only the exome or not

        --bed=FILE      BED file containing the set of regions to be used
                    in generating the matrices

        --chrom         Create the matrices on a per chromosome basis

        --plot          Generate the plots for each context

        --tsb           Performs a transcriptional strand bias test for the
                    24, 384, and 6144 contexts

        --gs            Performs a gene strand bias test

        For further info see: https://github.com/AlexandrovLab/SigProfilerMatrixGenerator

        ]]></help>

    <!-- Publication to cite for this tool, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1186/s12864-019-6041-2</citation>
    </citations>

</tool>
<!--
author	artbio
date	Sun, 14 Jun 2020 20:27:29 -0400
parents	9f48c5d97be8
children
-->