diff facets_analysis.R @ 4:3f62267c4be7 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/facets commit e47e0ed100904318ef4aae661b763e049c358edf
author artbio
date Sun, 05 Oct 2025 18:42:30 +0000
parents d1914f4d9daf
children 625038b7d764
line wrap: on
line diff
--- a/facets_analysis.R	Sun Oct 05 00:55:34 2025 +0000
+++ b/facets_analysis.R	Sun Oct 05 18:42:30 2025 +0000
@@ -48,6 +48,10 @@
     type = "character", required = TRUE,
     help = "Path for the diagnostic spider plot file (PNG)."
 )
+parser$add_argument("--output_vcf",
+    type = "character", required = TRUE,
+    help = "Path for the output VCF file with CNV calls."
+)
 parser$add_argument("--cval",
     type = "double", default = 150,
     help = "Critical value for segmentation."
@@ -66,6 +70,44 @@
     help = "Genome build used for alignment."
 )
 
+#' Classify CNV segments based on TCN/LCN
+classify_cnv <- function(cncf_df) {
+    cncf_df$sv_type <- NA_character_
+    cncf_df$sv_type[cncf_df$tcn.em == 2 & (cncf_df$lcn.em == 1 | is.na(cncf_df$lcn.em))] <- "NEUTR"
+    cncf_df$sv_type[is.na(cncf_df$sv_type) & cncf_df$tcn.em > 2] <- "DUP"
+    cncf_df$sv_type[is.na(cncf_df$sv_type) & cncf_df$tcn.em < 2 & !is.na(cncf_df$lcn.em) & cncf_df$lcn.em > 0] <- "HEMIZYG_DEL"
+    cncf_df$sv_type[is.na(cncf_df$sv_type) & cncf_df$tcn.em < 2 & !is.na(cncf_df$lcn.em) & cncf_df$lcn.em == 0] <- "HOMOZYG_DEL"
+    cncf_df$sv_type[is.na(cncf_df$sv_type) & cncf_df$tcn.em == 2 & !is.na(cncf_df$lcn.em) & cncf_df$lcn.em == 0] <- "CN_LOH"
+
+    # Remplacer les NA restants (si tcn.em < 2 mais lcn.em est NA) par un type général
+    cncf_df$sv_type[is.na(cncf_df$sv_type) & cncf_df$tcn.em < 2] <- "DEL"
+
+    return(cncf_df)
+}
+
+#' Create a VCF header (explicit version)
+create_vcf_header <- function(sample_id, purity, ploidy) {
+    header <- c(
+        "##fileformat=VCFv4.2",
+        paste0("##fileDate=", format(Sys.Date(), "%Y%m%d")),
+        paste0("##source=FACETS_v", packageVersion("facets")),
+        "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant\">",
+        "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant (standard VCF tags: DEL, DUP, CNV)\">",
+        "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Length of the SV\">",
+        # --- MODIFICATION ---
+        "##INFO=<ID=EVENT,Number=1,Type=String,Description=\"FACETS event classification. Possible values: DUP, HEMIZYG_DEL, HOMOZYG_DEL, CN_LOH\">",
+        # --- FIN MODIFICATION ---
+        "##INFO=<ID=TCN,Number=1,Type=Integer,Description=\"Total Copy Number (EM fit)\">",
+        "##INFO=<ID=LCN,Number=1,Type=Integer,Description=\"Lesser Copy Number (EM fit)\">",
+        "##INFO=<ID=NUM_MARK,Number=1,Type=Integer,Description=\"Number of SNPs in the segment\">",
+        "##INFO=<ID=NHET,Number=1,Type=Integer,Description=\"Number of heterozygous SNPs in the segment\">",
+        paste0("##FACETS_PURITY=", round(purity, 4)),
+        paste0("##FACETS_PLOIDY=", round(ploidy, 4)),
+        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"
+    )
+    return(header)
+}
+
 # --- Main Analysis Function ---
 main <- function(args) {
     # Set seed for reproducibility
@@ -122,6 +164,58 @@
     png(file = args$output_spider, width = 8, height = 8, units = "in", res = 300)
     logRlogORspider(oo$out, oo$dipLogR)
     dev.off()
+
+    # --- Generate VCF file ---
+
+    # Classify segments and define standard SVTYPEs + detailed EVENTs
+    cncf_for_vcf <- fit$cncf
+    if (nrow(cncf_for_vcf) > 0) {
+        cncf_for_vcf$svtype <- NA_character_
+        cncf_for_vcf$event <- NA_character_
+
+        # Duplications
+        cncf_for_vcf[cncf_for_vcf$tcn.em > 2, c("svtype", "event")] <- c("DUP", "DUP")
+
+        # Deletions
+        cncf_for_vcf[cncf_for_vcf$tcn.em < 2, c("svtype")] <- "DEL"
+        cncf_for_vcf[cncf_for_vcf$tcn.em == 1, c("event")] <- "HEMIZYG_DEL"
+        cncf_for_vcf[cncf_for_vcf$tcn.em == 0, c("event")] <- "HOMOZYG_DEL"
+
+        # Copy-Neutral LOH
+        cncf_for_vcf[cncf_for_vcf$tcn.em == 2 & !is.na(cncf_for_vcf$lcn.em) & cncf_for_vcf$lcn.em == 0, c("svtype", "event")] <- c("CNV", "CN_LOH")
+
+        # Filtrer les segments normaux (ceux où 'svtype' est encore NA)
+        cnv_calls <- cncf_for_vcf[!is.na(cncf_for_vcf$svtype), ]
+    } else {
+        cnv_calls <- data.frame()
+    }
+
+
+    if (nrow(cnv_calls) > 0) {
+        vcf_header <- create_vcf_header(args$sample_id, fit$purity, fit$ploidy)
+
+        vcf_body <- apply(cnv_calls, 1, function(seg) {
+            alt_allele <- paste0("<", seg["svtype"], ">")
+
+            info <- paste0(
+                "END=", seg["end"],
+                ";SVTYPE=", seg["svtype"],
+                ";SVLEN=", as.integer(seg["end"]) - as.integer(seg["start"]),
+                ";TCN=", seg["tcn.em"],
+                ";LCN=", ifelse(is.na(seg["lcn.em"]), ".", seg["lcn.em"]),
+                ";EVENT=", seg["event"],
+                ";NUM_MARK=", seg["num.mark"],
+                ";NHET=", seg["nhet"]
+            )
+
+            paste(seg["chrom"], seg["start"], ".", "N", alt_allele, ".", "PASS", info, sep = "\t")
+        })
+
+        writeLines(c(vcf_header, vcf_body), con = args$output_vcf)
+    } else {
+        vcf_header <- create_vcf_header(args$sample_id, fit$purity, fit$ploidy)
+        writeLines(vcf_header, con = args$output_vcf)
+    }
 }
 
 # --- Execution Block ---