Mercurial > repos > artbio > cnv_facets

<tool id="facets_analysis" name="FACETS Analysis" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Performs allele-specific copy number analysis from a pileup file</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <expand macro="stdio"/>
    <command detect_errors="exit_code"><![CDATA[
        ## Dataset names are free text chosen by the user. Reduce the identifier
        ## to a conservative character set before it is placed inside single
        ## quotes on the command line, so that a name containing a quote or other
        ## shell metacharacter cannot break out of the argument.
        ## element_identifier is used instead of name so that collection
        ## elements are labelled by their element name.
        #import re
        #set $sample_id = re.sub('[^A-Za-z0-9_.-]', '_', str($pileup.element_identifier))

        Rscript '${__tool_directory__}/facets_analysis.R'
            --pileup '$pileup'
            --sample_id '$sample_id'
            --output_seg '$output_seg'
            --output_summary '$output_summary'
            --output_spider '$output_spider'
            --output_plots '$output_plots'
            --output_vcf '$output_vcf'
            --cval $cval
            --min_nhet $min_nhet
            --snp_nbhd $snp_nbhd
            --gbuild '$gbuild'
            ## Merging options are only forwarded when the user enabled merging.
            #if $merging.merge_select == "yes":
                --enable_merging
                --merge_gap_abs $merging.max_gap_abs
                --merge_gap_rel $merging.max_gap_rel
            #end if
            ## The VCF post-filter thresholds are always forwarded.
            --vcf_min_nhet $filtering.vcf_min_nhet
            --vcf_min_num_mark $filtering.vcf_min_num_mark
    ]]></command>
    <inputs>
        <!-- Primary input, forwarded to facets_analysis.R as the pileup argument. -->
        <param name="pileup" type="data" format="tabular.gz" label="FACETS Pileup File" help="Output from the 'SNP Pileup for FACETS' tool."/>

        <!-- Segmentation parameters. Every numeric parameter carries min="0" so
             that negative values are rejected by the Galaxy form instead of
             being handed to the R script. -->
        <param name="cval" type="float" value="150" min="0" label="Critical value for segmentation (cval)"
               help="Higher values lead to fewer segments (less sensitive). Lower values are more sensitive. For dense data (e.g., from WGS), higher values like 400-800 are recommended."/>
        <param name="min_nhet" type="integer" value="25" min="0" label="Minimum number of heterozygous SNPs per segment" help="Ensures that segments are supported by sufficient allelic information."/>

        <param name="gbuild" type="select" label="Genome Build">
            <option value="hg38" selected="true">Human (hg38)</option>
            <option value="hg19">Human (hg19)</option>
            <option value="hg18">Human (hg18)</option>
            <option value="mm10">Mouse (mm10)</option>
            <option value="mm9">Mouse (mm9)</option>
        </param>
        <param name="snp_nbhd" type="integer" value="300" min="0" label="SNP neighborhood size (snp.nbhd)" help="Should match the --pseudo-snps distance used to generate the pileup file. Default is 300."/>

        <!-- Optional merging of adjacent calls; the gap limits are only shown
             (and only passed on the command line) when merging is enabled. -->
        <conditional name="merging">
            <param name="merge_select" type="select" label="Post-process VCF to merge adjacent segments?" help="Optional step to merge adjacent CNV calls that likely represent a single biological event.">
                <option value="no" selected="true">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="no"/>
            <when value="yes">
                <param name="max_gap_abs" type="integer" value="1000000" min="0" label="Absolute maximum gap to merge (bp)" help="Maximum distance in base pairs allowed between two segments to consider them for merging."/>
                <param name="max_gap_rel" type="float" value="0.5" min="0" label="Relative maximum gap to merge (fraction)" help="Maximum relative distance, as a fraction of the average size of the two segments."/>
            </when>
        </conditional>

        <!-- Post-filter thresholds; these are always passed to the script. -->
        <section name="filtering" title="VCF Output Filtering" expanded="false">
            <param name="vcf_min_nhet" type="integer" value="2" min="0" label="Minimum heterozygous SNPs for VCF output" help="Post-filter to remove final segments with fewer than this many heterozygous SNPs."/>
            <param name="vcf_min_num_mark" type="integer" value="3" min="0" label="Minimum total markers for VCF output" help="Post-filter to remove final segments with fewer than this many total markers (SNPs). Helps remove SVLEN=0 artifacts."/>
        </section>
    </inputs>
    <outputs>
        <!-- Each dataset below is handed to facets_analysis.R through the
             command-line option that carries the same name. -->

        <!-- Tabular results -->
        <data name="output_seg"
              format="tsv"
              label="FACETS Segmentation on ${on_string}"/>
        <data name="output_summary"
              format="tabular"
              label="FACETS Summary on ${on_string}"/>

        <!-- Plots -->
        <data name="output_plots"
              format="png"
              label="FACETS Plots on ${on_string}"/>
        <data name="output_spider"
              format="png"
              label="FACETS Spider Plot on ${on_string}"/>

        <!-- CNV calls -->
        <data name="output_vcf"
              format="vcf"
              label="FACETS CNV calls (VCF) on ${on_string}"/>
    </outputs>
    <tests>
        <!-- Single functional test. Only the pileup input is set, so every
             other parameter runs with its declared default (merging disabled,
             default filtering thresholds). -->
        <test>
            <param name="pileup" value="Pileup.input_test_facets.csv.gz" ftype="tabular.gz"/>
            <!-- Tabular outputs are compared against their reference files. -->
            <output name="output_seg" file="test_sample_01.seg.tsv" ftype="tsv"/>
            <output name="output_summary" file="test_sample_01.summary.txt" ftype="tabular"/>
            <!-- PNG outputs are compared by file size only (sim_size), within
                 the tolerance given by delta. -->
            <output name="output_plots" file="test_sample_01.plots.png" ftype="png" compare="sim_size" delta="20000"/>
            <output name="output_spider" file="test_sample_01.spider.png" ftype="png" compare="sim_size" delta="10000"/>
            <!-- Up to two differing lines are tolerated in the VCF; presumably
                 run-dependent header lines (TODO: confirm against the
                 reference file). -->
            <output name="output_vcf" file="test_sample_01.cnv.vcf" ftype="vcf" lines_diff="2" />
        </test>
    </tests>
    <help><![CDATA[
            **What it does**

            This tool runs the `FACETS` R package to perform allele-specific copy number
            and clonal heterogeneity analysis. It takes the compressed pileup file
            generated by the "SNP Pileup for FACETS" tool as its primary input and
            produces a set of standard FACETS outputs.

            ---

            **Primary Parameters**

            These parameters control the core of the FACETS segmentation algorithm.

            - **Critical value for segmentation (cval):** This is the most important
              parameter for controlling the sensitivity. A *higher* value (e.g., 200-800)
              results in fewer segments (less sensitive) and is recommended for
              high-density data (WGS). A *lower* value (e.g., 50-150) increases
              sensitivity and is more suitable for sparser data (WES).

            - **Minimum number of heterozygous SNPs (min.nhet):** This is a quality
              filter. Segments supported by fewer heterozygous SNPs than this
              threshold will be discarded during the initial segmentation pass.

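            - **SNP neighborhood size (snp.nbhd):** Window size (in bp) used to thin
              closely spaced SNPs: at most one SNP is retained per window, which
              limits serial correlation between the read counts of neighboring
              SNPs. It should match the ``--pseudo-snps`` distance used to generate
              the pileup file.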

            ---

            **Advanced VCF Post-processing**

            You can optionally enable post-processing steps to refine the final VCF.

            - **Merging Segments:** This option merges adjacent CNV segments that likely
              represent a single biological event, providing a cleaner and more
              biologically accurate output.

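            - **Filtering Segments:** Removes low-quality or artefactual segments
              based on the number of SNPs supporting them. Unlike merging, this
              filter is always applied; its thresholds (defaults: 2 heterozygous
              SNPs and 3 total markers) are set in the *VCF Output Filtering*
              section. FACETS can sometimes report micro-segments that are not
              biologically relevant.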

            ---

            **Outputs**

            - **Segmentation file (TSV):** The raw segment data with genomic coordinates
              and their associated copy number (TCN, LCN).
            - **Summary file:** The main estimated parameters like purity, ploidy, etc.
            - **Plots file (PNG):** A genome-wide visualization of the copy number and
              allelic imbalance results across all chromosomes.
            - **Spider Plot (PNG):** The most important **diagnostic plot** for assessing
              the quality of the FACETS fit. See detailed explanation below.
            - **CNV calls file (VCF):** A summary of the detected copy number events in
              a standard VCF format for structural variants. The `ALT` column contains
              symbolic alleles (`<DEL>`, `<DUP>`). All FACETS-specific details are in
              the `INFO` field:

              ``SVTYPE``
                Type of variant (e.g., DEL, DUP).
              ``EVENT``
                FACETS classification (e.g., HOMOZYG_DEL, CN_LOH).
              ``TCN``
                Total Copy Number.
              ``LCN``
                Lesser Copy Number.
              ``NUM_MARK``
                Total number of SNPs in the segment.
              ``NHET``
                Number of heterozygous SNPs in the segment.

            **Interpreting the Spider Plot**

            On this plot (generated by the `logRlogORspider` function), each
            **circle** is a genomic segment from your data. The **curves** (labeled
            `2-1`, `1-0`, etc.) represent the theoretical positions for integer copy
            number states. A high-confidence result is achieved when your data (the
            circles) align closely with these curves. For details, refer to the
            original FACETS publication: Shen and Seshan, *NAR*, 2016.

    ]]></help>
    <expand macro="citations"/>
</tool>
author	artbio
date	Wed, 08 Oct 2025 17:41:18 +0000
parents	625038b7d764
children