diff facets_analysis.xml @ 6:625038b7d764 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/facets commit 8cced47697e5777fd60dacc60300e770bd409e9d
author artbio
date Mon, 06 Oct 2025 15:50:12 +0000
parents 1d56a6b5739f
children 86bcdc94b008
line wrap: on
line diff
--- a/facets_analysis.xml	Mon Oct 06 13:39:01 2025 +0000
+++ b/facets_analysis.xml	Mon Oct 06 15:50:12 2025 +0000
@@ -18,6 +18,11 @@
             --min_nhet $min_nhet
             --snp_nbhd $snp_nbhd
             --gbuild '$gbuild'
+            #if $merging.merge_select == "yes":
+                --enable_merging
+                --merge_gap_abs $merging.max_gap_abs
+                --merge_gap_rel $merging.max_gap_rel
+            #end if
     ]]></command>
     <inputs>
         <param name="pileup" type="data" format="tabular.gz" label="FACETS Pileup File" help="Output from the 'SNP Pileup for FACETS' tool."/>
@@ -33,8 +38,18 @@
             <option value="mm10">Mouse (mm10)</option>
             <option value="mm9">Mouse (mm9)</option>
         </param>
-        
         <param name="snp_nbhd" type="integer" value="300" label="SNP neighborhood size (snp.nbhd)" help="Should match the --pseudo-snps distance used to generate the pileup file. Default is 300."/>
+        <conditional name="merging"> <!-- optional VCF post-processing; the "yes" branch adds the merge flags to the command line -->
+            <param name="merge_select" type="select" label="Post-process VCF to merge adjacent segments?" help="Optional step to merge adjacent CNV calls that likely represent a single biological event.">
+                <option value="no" selected="true">No</option>
+                <option value="yes">Yes</option>
+            </param>
+            <when value="no"/>
+            <when value="yes"> <!-- both gaps are distances: min="0" stops negative values from reaching the command line -->
+                <param name="max_gap_abs" type="integer" value="1000000" min="0" label="Absolute maximum gap to merge (bp)" help="Maximum distance in base pairs allowed between two segments to consider them for merging."/>
+                <param name="max_gap_rel" type="float" value="0.5" min="0" label="Relative maximum gap to merge (fraction)" help="Maximum relative distance, as a fraction of the average size of the two segments."/>
+            </when>
+        </conditional>
     </inputs>
     <outputs>
         <data name="output_seg" format="tsv" label="FACETS Segmentation on ${on_string}"/>
@@ -54,18 +69,81 @@
             <output name="output_vcf" file="test_sample_01.cnv.vcf" ftype="vcf" lines_diff="2" />
         </test>
     </tests>
-    <help><![CDATA[
-        **What it does**
+<help><![CDATA[
+            **What it does**
+
+            This tool runs the `FACETS` R package to perform allele-specific copy number
+            and clonal heterogeneity analysis. It takes the compressed pileup file
+            generated by the "SNP Pileup for FACETS" tool as its primary input and
+            produces a set of standard FACETS outputs, including segmentation calls,
+            purity/ploidy estimates, plots, and a VCF file summarizing the CNV events.
+
+            ----
+
+            **Primary Parameters**
+
+            These parameters control the core of the FACETS segmentation algorithm.
 
-        This tool runs the `FACETS` R package to perform allele-specific copy number and clonal heterogeneity analysis. It takes the compressed pileup file generated by the "SNP Pileup for FACETS" tool as its primary input.
+            - **Critical value for segmentation (cval):** This is the most important
+              parameter for controlling the sensitivity of the segmentation. A *higher*
+              value (e.g., 200-800) will result in fewer segments and is generally
+              recommended for high-density data like Whole Genome Sequencing (WGS).
+              A *lower* value (e.g., 50-150) increases sensitivity, resulting in more
+              segments, and is more suitable for sparser data like Whole Exome
+              Sequencing (WES).
 
-        **Outputs**
+            - **Minimum number of heterozygous SNPs (min.nhet):** This is a quality
+              filter. After segmentation, any segment that is supported by fewer
+              heterozygous SNPs than this threshold will be discarded. This helps
+              to remove unreliable, small segments.
+
+            - **SNP neighborhood size (snp.nbhd):** The genomic window (in bp) used to
+              thin closely spaced, serially correlated SNPs. It should match the
+              ``--pseudo-snps`` distance of the pileup step; the default is usually fine.
+
+            ----
+
+            **Advanced VCF Post-processing: Merging Segments**
 
-        - A **Segmentation file (TSV)** with the genomic coordinates of each segment and their associated copy number (TCN, LCN).
-        - A **Summary file** with the main estimated parameters (purity, ploidy, etc.).
-        - A **CNV calls file (VCF)** listing the detected copy number events in a standard VCF format.
-        - A **Plots file (PNG)** with an enhanced visualization of the genome-wide results, including a legend for copy number states.
-        - A **Spider Plot (PNG)** for diagnosing the quality of the purity/ploidy model fit.
-    ]]></help>
+            You can optionally enable a post-processing step to merge adjacent CNV
+            segments in the output VCF.
+
+            *Why is this useful?*
+            Segmentation algorithms can sometimes split a single, large biological event
+            (e.g., a 10 Mb deletion) into several smaller, adjacent segments with the
+            same copy number state. This feature attempts to correct this by merging
+            these segments back together, providing a cleaner and more biologically
+            accurate representation of the CNV landscape.
+
+            The merging is controlled by an algorithm using two thresholds:
+
+            - **Absolute maximum gap:** The maximum distance in base pairs allowed
+              between two segments to even consider them for merging. This acts as a
+              safeguard.
+            - **Relative maximum gap:** The maximum distance allowed, expressed as a
+              *fraction* of the average size of the two segments. This allows large
+              gaps between large segments, but not between small ones, trying to mimic
+              how a human expert would interpret the data.
+
+            ----
+
+            **Outputs**
+
+            - **Segmentation file (TSV):** The raw segment data with genomic coordinates
+              and their associated copy number (TCN, LCN).
+            - **Summary file:** The main estimated parameters like purity, ploidy, etc.
+            - **CNV calls file (VCF):** A summary of the detected copy number events in
+              a standard VCF format, suitable for downstream analysis.
+            - **Plots file (PNG):** An enhanced visualization of the genome-wide results.
+            - **Spider Plot (PNG):** This is the most important **diagnostic plot** for
+              assessing the quality of the FACETS fit.
+              On this plot (generated by the `logRlogORspider` function), each
+              **circle** is a genomic segment from your data. The **curves** (labeled
+              `2-1`, `1-0`, etc.) represent the theoretical positions for integer copy
+              number states given the estimated purity and ploidy. A high-confidence
+              result is achieved when your data (the circles) align closely with these
+              theoretical curves. For a detailed interpretation, please refer to the
+              original FACETS publication: Shen and Seshan, *NAR*, 2016.
+        ]]></help>
     <expand macro="citations"/>
 </tool>