Mercurial > repos > iuc > cnvkit_segment

--- a/macros.xml	Mon Jan 20 16:37:32 2025 +0000
+++ b/macros.xml	Sat Mar 01 12:06:19 2025 +0000
@@ -1,10 +1,10 @@
 <macros>
-    <token name="@VERSION_SUFFIX@">1</token>
-    <token name="@TOOL_VERSION@">0.9.11</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@TOOL_VERSION@">0.9.12</token>
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="@TOOL_VERSION@">cnvkit</requirement>
-            <requirement type="package" version="1.4.2">scikit-learn</requirement>
+            <requirement type="package" version="1.21">samtools</requirement>
         </requirements>
     </xml>
     <xml name="reference_interface">
@@ -56,13 +56,13 @@
                 <option value="wgs">whole genome sequencing </option>
             </param>
             <param argument="--segment-method" type="select" label="Method used in the 'segment' step" help="">
-                <option value="cbs" selected="True">Circular Binary Segmentation CBS</option>
-                <option value="flasso">Fused lasso, hybrid flasso</option>
-                <option value="haar">a pure-Python implementation of HaarSeg, a wavelet-based method. Very fast and performs reasonably well on small panels, but tends to over-segment large datasets., hybrid haar</option>
-                <option value="none">simply calculate the weighted mean log2 value of each chromosome arm. Useful for testing or debugging, or as a baseline for benchmarking other methods., hybrid none</option>
-                <option value="hmm">experimental – a 3-state Hidden Markov Model suitable for most samples. Faster than CBS, and slower but more accurate than Haar. Requires the Python package pomegranate, as do the next two thods., hybrid hmm</option>
-                <option value="hmm-tumor">experimental – a 5-state HMM suitable for finer-grained segmentation of good-quality tumor samples. In particular, this method can detect focal amplifications within a larger-scale, smaller-amplitude copy number gain, or focal deep deletions within a larger-scale hemizygous loss. Training this model takes a bit more CPU time than the simpler hmm method., hybrid hmm-tumor</option>
-                <option value="hmm-germline">experimental – a 3-state HMM with fixed amplitude for the loss, neutral, and gain states corresponding to absolute copy numbers of 1, 2, and 3. Suitable for germline samples and single-cell sequencing of samples with mostly-diploid genomes that are not overly aneuploid., hybrid hmm-germline</option>
+                <option value="cbs" selected="True">CBS: Circular Binary Segmentation (default, precise)</option>
+                <option value="flasso">Flasso: Fused Lasso; smoother segments, fewer breakpoints</option>
+                <option value="haar">Haar: Haar wavelet transform; detects abrupt changes</option>
+                <option value="none">None: No segmentation; weighted mean log2 value of each chromosome arm (testing/baseline)</option>
+                <option value="hmm">Hmm: Basic Hidden Markov Model (generic use)</option>
+                <option value="hmm-tumor">Hmm-tumor: HMM tailored for tumor samples (somatic CNVs)</option>
+                <option value="hmm-germline">Hmm-germline: HMM for germline (inherited) variants (diploid assumption)</option>
             </param>
             <param argument="--male-reference" type="boolean" checked="false" truevalue="--male-reference" falsevalue="" label="Use or assume a male reference" help="female samples will have +1 log-CNR of chrX; otherwise male samples would have -1 chrX" />
             <param argument="--countreads" type="boolean" checked="false" truevalue="--countreads" falsevalue="" label="Get read depths by counting read midpoints within each bin" help="" />
@@ -70,7 +70,7 @@
     </xml>
     <xml name="create_CNV_reference_file">
         <param name="input_sample_file" type="data" format="bam" label="Sample BAM file" help="" />
-        <param argument="--normal" type="data" format="bam" label="Control BAM file" help="" />
+        <param argument="--normal" optional="true" type="data" format="bam" label="Control BAM file" help="Optional only if you have a single sample or are working with WGS data with no normal samples" />
         <param argument="--targets" type="data" format="bed" label="Capture BED regions" help="" />
     </xml>
     <xml name="advanced_no_reference">
@@ -85,7 +85,7 @@
     </xml>
     <xml name="reuse_an_existing_cnv_reference_file">
         <param name="input_sample_file" type="data" format="bam" label="Sample file" help="" />
-        <param argument="--reference" type="data" format="tabular" label="CNV reference CNN File" help="" />
+        <param argument="--reference" type="data" format="cnn" label="CNV reference CNN File" help="" />
     </xml>
     <xml name="output_section">
         <section name="output_section" title="Outputs" expanded="false">
@@ -137,7 +137,7 @@
                 <option value="biweight">biweight</option>
             </param>
             <param argument="--center-at" optional="true" type="float" label="Subtract a constant number from all log2 ratios" value="" help="For manual re-centering, in case the --center option gives unsatisfactory results" />
-            <param argument="--thresholds" optional="true" type="text" label="Hard thresholds for calling each integer copy number, separated by commas" value="=-1.1,-0.25,0.2,0.7" help="Apply cutoffs to either original or rescaled log2 values" />
+            <param argument="--thresholds" type="text" label="Hard thresholds for calling each integer copy number, separated by commas" value="=-1.1,-0.25,0.2,0.7" help="Apply cutoffs to either original or rescaled log2 values" />
             <param argument="--ploidy" optional="true" type="integer" label="Ploidy of the sample cells" min="1" max="2" value="2" help="" />
             <param argument="--purity" optional="true" type="float" label="Estimated tumor cell fraction, a.k.a. purity or cellularity" min="0" max="1" value="" help="" />
             <param argument="--drop-low-coverage" type="boolean" checked="false" truevalue="--drop-low-coverage" falsevalue="" label="Drop very-low-coverage bins before segmentation" help="To avoid false-positive deletions in poor-quality tumor samples" />
@@ -145,32 +145,32 @@
     </xml>
     <xml name="additionally_SNP_process">
         <param argument="--vcf" optional="true" type="data" format="vcf" label="VCF file" help="VCF file name containing variants for calculation of b-allele frequencies" />
-        <param argument="--sample-id" optional="true" type="text" label="Name of the sample in the VCF to use for b-allele frequency extraction" value="" help="" />
-        <param argument="--normal-id" optional="true" type="text" label="Corresponding normal sample ID in the input VCF" value="" help="This sample is used to select only germline SNVs to calculate b-allele frequencies" />
+        <param argument="--sample-id" type="text" label="Name of the sample in the VCF to use for b-allele frequency extraction" help="" />
+        <param argument="--normal-id" type="text" label="Corresponding normal sample ID in the input VCF" help="This sample is used to select only germline SNVs to calculate b-allele frequencies" />
         <param argument="--min-variant-depth" type="integer" min="1" value="20" optional="true" label="Minimum read depth for a SNV to be used in the b-allele frequency calculation" help="" />
         <param argument="--zygosity-freq" type="float" min="0" value="0.25" optional="true" label="Ignore VCF's genotypes and instead infer zygosity from allele frequencies" help="" />
     </xml>
     <xml name="diagram_optional">
-            <param argument="--segment" optional="true" type="data" format="tabular" label="Segment" help="Segmentation calls cns, the output of the 'segment' command" />
+            <param argument="--segment" optional="true" type="data" format="cns,cnr" label="Segment" help="Segmentation calls cns, the output of the 'segment' command" />
             <param argument="--threshold" optional="true" type="float" label="Threshold" min="0" value="0.5" help="Copy number change threshold to label genes" />
             <param argument="--min-probes" optional="true" type="integer" label="Minimum propes" min="1" value="3" help="Minimum number of covered probes to label a gene" />
             <param argument="--male-reference" type="boolean" checked="false" truevalue="--male-reference" falsevalue="" label="MALE REFERENCE" help="Assume inputs were normalized to a male reference" />
             <param argument="--no-shift-xy" type="boolean" checked="false" truevalue="--no-shift-xy" falsevalue="" label="Don't adjust the X and Y chromosomes according to sample sex" help="" />
-            <param argument="--chromosome" optional="true" type="text" label="Chromosome to display" value="" help="e.g. 'chr1' no chromosomal range allowed" />
+            <param argument="--chromosome" type="text" label="Chromosome to display" help="e.g. 'chr1' no chromosomal range allowed" />
     </xml>
     <xml name="diagram_plot">
-        <param argument="--title" optional="true" type="text" label="Plot title" value="" help="" />
+        <param argument="--title" type="text" label="Plot title" help="" />
         <param argument="--no-gene-labels" type="boolean" checked="false" truevalue="--no-gene-labels" falsevalue="" label="Disable gene_name labels on plot useful when a lot of CNV were called" help="" />
     </xml>
     <xml name="heatmap_optional">
             <param argument="--by-bin" type="boolean" checked="false" truevalue="--by-bin" falsevalue="" label="Plot data x-coordinates by bin indices instead of genomic coordinates" help="" />
-            <param argument="--chromosome" optional="true" type="text" label="Chromosome range" value="" help="Chromosome or chromosomal range, e.g. 'chr1' or 'chr1:2333000-2444000'" />
+            <param argument="--chromosome" type="text" label="Chromosome range" help="Chromosome or chromosomal range, e.g. 'chr1' or 'chr1:2333000-2444000'" />
             <param argument="--desaturate" type="boolean" checked="false" truevalue="--desaturate" falsevalue="" label="Tweak color saturation to focus on significant changes" help="" />
             <param argument="--male-reference" type="boolean" checked="false" truevalue="--male-reference" falsevalue="" label="MALE REFERENCE" help="Assume inputs were normalized to a male reference" />
             <param argument="--no-shift-xy" type="boolean" checked="false" truevalue="--no-shift-xy" falsevalue="" label="Don't adjust the X and Y chromosomes according to sample sex" help="" />
             <param argument="--vertical" type="boolean" checked="false" truevalue="--vertical" falsevalue="" label="Plot heatmap with samples as X-axis instead of Y-axis" help="" />
             <param argument="--delimit-samples" type="boolean" checked="false" truevalue="--delimit-samples" falsevalue="" label="Add an horizontal delimitation line between each sample" help="" />
-            <param argument="--title" optional="true" type="text" label="Plot title" value="" help="" />
+            <param argument="--title" type="text" label="Plot title" help="" />
     </xml>
     <xml name="reference_optional">
             <param argument="--cluster" type="boolean" checked="false" truevalue="--cluster" falsevalue="" label="Calculate and store summary stats for clustered subsets of the normal samples with similar coverage profiles" help="" />
@@ -187,32 +187,32 @@
         <param argument="--no-rmask" type="boolean" checked="false" truevalue="--no-rmask" falsevalue="" label="skip repeat master correction" help="" />
     </xml>
     <xml name="scatter_optional">
-            <param argument="--segment" optional="true" type="data" format="tabular" label="Segment" help="Segmentation calls cns, the output of the 'segment' command" />
-            <param argument="--chromosome" optional="true" type="text" label="Chromosome range" value="" help="Chromosome or chromosomal range, e.g. 'chr1' or 'chr1:2333000-2444000'" />
-            <param argument="--gene" optional="true" type="text" label="Name of gene or genes comma-separated to display" value="" help="" />
+            <param argument="--segment" optional="true" type="data" format="cns,cnr" label="Segment" help="Segmentation calls cns, the output of the 'segment' command" />
+            <param argument="--chromosome" type="text" label="Chromosome range" help="Chromosome or chromosomal range, e.g. 'chr1' or 'chr1:2333000-2444000'" />
+            <param argument="--gene" type="text" label="Name of gene or genes comma-separated to display" help="" />
             <param argument="--range-list" optional="true" type="data" format="bed" label="Range list" help="File listing the chromosomal ranges to display, as BED"/>
             <param argument="--width" optional="true" type="integer" label="Width" min="1" value="1000000" help="Width of margin to show around the selected genes or small chromosomal region" />
     </xml>
     <xml name="scatter_plot">
-        <param argument="--antitarget-marker" optional="true" type="text" label="Antitarget marker" value="same as targets" help="Plot antitargets using this symbol when plotting in a selected chromosomal region"/>
+        <param argument="--antitarget-marker" type="text" label="Antitarget marker" value="same as targets" help="Plot antitargets using this symbol when plotting in a selected chromosomal region"/>
         <param argument="--by-bin" type="boolean" checked="false" truevalue="--by-bin" falsevalue="" label="Plot data x-coordinates by bin indices instead of genomic coordinates" help=""/>
-        <param argument="--segment-color" optional="true" type="text" label="Segment color" value="red" help=""/>
-        <param argument="--title" optional="true" type="text" label="Plot title" value="" help=""/>
+        <param argument="--segment-color" type="text" label="Segment color" value="red" help=""/>
+        <param argument="--title" type="text" label="Plot title" help=""/>
         <param argument="--trend" type="boolean" checked="false" truevalue="--trend" falsevalue="" label="Draw a smoothed local trendline on the scatter plot" help=""/>
         <param argument="--y-max" optional="true" type="integer" label="y-axis upper limit" min="1" value="" help=""/>
         <param argument="--y-min" optional="true" type="integer" label="y-axis lower limit" min="1" value="" help=""/>
         <param argument="--fig-size" optional="true" type="float" label="Width and height of the plot in inches" value="" help="Example 6.4 4.8, the space between the two inputs is important"/>
     </xml>
     <xml name="segment_optional">
-            <param argument="--dataframe" type="text" optional="true" label="Data frame" value="" help="File name to save the raw R dataframe emitted by CBS or Fused Lasso, example dataframe.r"/>
+            <param argument="--dataframe" type="text" label="Data frame" help="File name to save the raw R dataframe emitted by CBS or Fused Lasso, example dataframe.r"/>
             <param argument="--method" type="select" label="Segmentation method" help="">
-                <option value="cbs" selected="True">Circular Binary Segmentation CBS method,hybrid CBS</option>
-                <option value="flasso">Fused lasso, hybrid flasso</option>
-                <option value="haar">A pure-Python implementation of HaarSeg, a wavelet-based method. Very fast and performs reasonably well on small panels, but tends to over-segment large datasets., hybrid haar</option>
-                <option value="none">simply calculate the weighted mean log2 value of each chromosome arm. Useful for testing or debugging, or as a baseline for benchmarking other methods., hybrid none</option>
-                <option value="hmm">experimental – a 3-state Hidden Markov Model suitable for most samples. Faster than CBS, and slower but more accurate than Haar. Requires the Python package pomegranate, as do the next two methods., hybrid hmm</option>
-                <option value="hmm-tumor">experimental – a 5-state HMM suitable for finer-grained segmentation of good-quality tumor samples. In particular, this method can detect focal amplifications within a larger-scale, smaller-amplitude copy number gain, or focal deep deletions within a larger-scale hemizygous loss. Training this model takes a bit more CPU time than the simpler hmm method., hybrid hmm-tumor</option>
-                <option value="hmm-germline">experimental – a 3-state HMM with fixed amplitude for the loss, neutral, and gain states corresponding to absolute copy numbers of 1, 2, and 3. Suitable for germline samples and single-cell sequencing of samples with mostly-diploid genomes that are not overly aneuploid., hybrid hmm-germline</option>
+                <option value="cbs" selected="True">CBS: Circular Binary Segmentation (default, precise)</option>
+                <option value="flasso">Flasso: Fused Lasso; smoother segments, fewer breakpoints</option>
+                <option value="haar">Haar: Haar wavelet transform; detects abrupt changes</option>
+                <option value="none">None: No segmentation; weighted mean log2 value of each chromosome arm (testing/baseline)</option>
+                <option value="hmm">Hmm: Basic Hidden Markov Model (generic use)</option>
+                <option value="hmm-tumor">Hmm-tumor: HMM tailored for tumor samples (somatic CNVs)</option>
+                <option value="hmm-germline">Hmm-germline: HMM for germline (inherited) variants (diploid assumption)</option>
             </param>
             <param argument="--threshold" optional="true" type="integer" label="Significance threshold" min="1" help="To accept breakpoints during segmentation. For HMM methods, this is the smoothing window size"/>
             <param argument="--drop-low-coverage" type="boolean" checked="false" truevalue="--drop-low-coverage" falsevalue="" label="Drop very-low-coverage bins before segmentation" help="To avoid false-positive deletions in poor-quality tumor samples"/>
--- a/segment.xml	Mon Jan 20 16:37:32 2025 +0000
+++ b/segment.xml	Sat Mar 01 12:06:19 2025 +0000
@@ -50,7 +50,7 @@
             #end if
     ]]></command>
     <inputs>
-        <param name="filename" type="data" format="tabular" label="Bin-Level log2 Ratios/Coverages cnr file" help="Use the output of the CNVkit fix" />
+        <param name="filename" type="data" format="cnr" label="Bin-Level log2 Ratios/Coverages cnr file" help="Use the output of the CNVkit fix" />
         <section name="additional_SNP_allelic_process" title="additional process for SNP b_allele frequencies" expanded="false">
             <expand macro="additionally_SNP_process" />
         </section>
@@ -59,11 +59,11 @@
         </section>
     </inputs>
     <outputs>
-        <data name="out_sample_segment" format="tabular" label="${tool.name} on ${on_string}: Sample segment" from_work_dir="sample.cns" />
+        <data name="out_sample_segment" format="cns" label="${tool.name} on ${on_string}: Sample segment" from_work_dir="sample.cns" />
     </outputs>
     <tests>
         <test expect_num_outputs="1">
-            <param name="filename" ftype="tabular" value="tumor.cnr" />
+            <param name="filename" ftype="cnr" value="tumor.cnr" />
             <section name="advanced_settings">
                 <param name="method" value="hmm" />
                 <param name="threshold" value="2" />
@@ -81,6 +81,41 @@

          Segmented log2 ratios (.cns) output file contains those columns
           chromosome, Start, end, gene, log2, depth, weight and number of bins covered by the segment (probes)
+
+-----
+
+**Bin-level log2 ratios (.cnr)**
+
+Tabular file containing normalized log2 ratios for small genomic bins (divided regions of the genome). Used to detect raw copy number variations (CNVs) before segmentation.
+
+.. csv-table::
+   :header-rows: 0
+
+    "chromosome","Genomic chromosome (e.g., chr1, chrX)"
+    "start","Start position of the bin."
+    "end","End position of the bin."
+    "gene","Gene name(s) overlapping the bin (if applicable)."
+    "log2","Normalized log2 ratio (sample coverage / reference coverage)."
+    "depth","Average read depth in the bin."
+    "weight","Reliability weight of the bin (higher = more reliable)."
+
+-----
+
+**Segmented log2 ratios (.cns)**
+
+Tabular file with smoothed, merged segments of stable copy number, derived from the .cnr file. Represents final CNV calls.
+
+.. csv-table::
+   :header-rows: 0
+
+    "chromosome, start, end","Genomic coordinates of the segment."
+    "gene","Gene(s) overlapping the segment."
+    "log2","Mean log2 ratio of the segment."
+    "probes","Number of bins covered by the segment."
+    "depth","Average read depth."
+    "weight","Reliability weight."
+    "p_value","Statistical confidence (lower = more significant)."
+
     ]]></help>
     <expand macro="citations" />
 </tool>
--- a/test-data/sample.cnv.vcf	Mon Jan 20 16:37:32 2025 +0000
+++ b/test-data/sample.cnv.vcf	Sat Mar 01 12:06:19 2025 +0000
@@ -1,6 +1,6 @@
 ##fileformat=VCFv4.2
-##fileDate=20250120
-##source=CNVkit v0.9.11
+##fileDate=20250203
+##source=CNVkit v0.9.12
 ##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
 ##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
 ##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">