diff facets_analysis.R @ 7:86bcdc94b008 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/facets commit 2da49e9385ddce5c74e077c81a52ff1ea4131b81
author artbio
date Wed, 08 Oct 2025 17:41:18 +0000
parents 625038b7d764
children
line wrap: on
line diff
--- a/facets_analysis.R	Mon Oct 06 15:50:12 2025 +0000
+++ b/facets_analysis.R	Wed Oct 08 17:41:18 2025 +0000
@@ -81,7 +81,14 @@
     type = "double", default = 0.5,
     help = "Relative maximum gap (fraction of avg. segment length) to merge segments."
 )
-
+# VCF post-filter thresholds: segments below either minimum are removed
+# from cnv_calls before the VCF body is built.
+parser$add_argument("--vcf_min_nhet", type = "integer", default = 2,
+    help = "VCF Post-Filter: Minimum number of heterozygous SNPs for a segment to be kept."
+)
+parser$add_argument("--vcf_min_num_mark", type = "integer", default = 3,
+    help = "VCF Post-Filter: Minimum number of total markers for a segment to be kept."
+)
 #' Classify CNV segments based on TCN/LCN
 classify_cnv <- function(cncf_df) {
     cncf_df$sv_type <- NA_character_
@@ -283,6 +290,16 @@
                 max_gap_rel = args$merge_gap_rel
             )
         }
+        # Apply VCF post-filters to remove low-quality/artefactual segments:
+        # FACETS' EM step can create micro-segments that bypass the min.nhet
+        # segmentation parameter. which() drops rows whose counts are NA
+        # instead of letting logical indexing emit all-NA rows.
+        original_rows <- nrow(cnv_calls)
+        keep <- which(cnv_calls$nhet >= args$vcf_min_nhet &
+            cnv_calls$num.mark >= args$vcf_min_num_mark)
+        cnv_calls <- cnv_calls[keep, ]
+        cat(paste("Applied VCF post-filters: kept", nrow(cnv_calls), "of", original_rows, "segments.\n"))
+
         vcf_header <- create_vcf_header(args$sample_id, fit$purity, fit$ploidy)
 
         vcf_body <- apply(cnv_calls, 1, function(seg) {