Mercurial > repos > artbio > cnv_facets
diff facets_analysis.R @ 6:625038b7d764 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/facets commit 8cced47697e5777fd60dacc60300e770bd409e9d
| author | artbio |
|---|---|
| date | Mon, 06 Oct 2025 15:50:12 +0000 |
| parents | 3f62267c4be7 |
| children | 86bcdc94b008 |
line wrap: on
line diff
--- a/facets_analysis.R Mon Oct 06 13:39:01 2025 +0000 +++ b/facets_analysis.R Mon Oct 06 15:50:12 2025 +0000 @@ -69,6 +69,18 @@ choices = c("hg19", "hg38", "hg18", "mm9", "mm10"), help = "Genome build used for alignment." ) +parser$add_argument("--enable_merging", + action = "store_true", default = FALSE, + help = "If specified, enables the post-processing step to merge adjacent and similar CNV segments." +) +parser$add_argument("--merge_gap_abs", + type = "integer", default = 1000000, + help = "Absolute maximum gap in bp to merge adjacent CNV segments." +) +parser$add_argument("--merge_gap_rel", + type = "double", default = 0.5, + help = "Relative maximum gap (fraction of avg. segment length) to merge segments." +) #' Classify CNV segments based on TCN/LCN classify_cnv <- function(cncf_df) { @@ -108,6 +120,78 @@ return(header) } +# ============================================================================== +# Function to merge adjacent and similar CNV segments +# +# This function implements a merging algorithm that reflects human +# by using a hybrid proximity condition. Two segments are merged +# if they have the same CNV state and are close to each other, both in absolute +# and relative terms. +# +# @param cnv_df A data frame of CNV calls, expected to have columns like +# 'chrom', 'start', 'end', 'svtype', 'tcn.em', 'lcn.em', etc. +# @param max_gap_abs An integer. The absolute maximum distance (in bp) allowed +# between two segments to consider them for merging. Acts as a safeguard. +# @param max_gap_rel A numeric value (0 to 1). The maximum relative distance, +# expressed as a fraction of the average length of the two adjacent segments. +# @return A new data frame with the similar adjacent segments merged. +# ============================================================================== +merge_segments <- function(cnv_df, max_gap_abs = 1000000, max_gap_rel = 0.5) { + # If there's nothing to merge, return the original data frame + if (nrow(cnv_df) < 2) { + return(cnv_df) + } + + # Ensure the data frame is sorted by genomic position + cnv_df <- cnv_df[order(cnv_df$chrom, cnv_df$start), ] + + merged_rows <- list() + current_row <- cnv_df[1, ] + + for (i in 2:nrow(cnv_df)) { + next_row <- cnv_df[i, ] + + # Basic criteria: segments must be of the same type and CN state + same_chrom <- current_row$chrom == next_row$chrom + same_svtype <- current_row$svtype == next_row$svtype + same_event <- current_row$event == next_row$event + same_tcn <- current_row$tcn.em == next_row$tcn.em + same_lcn <- identical(current_row$lcn.em, next_row$lcn.em) # Handles NA safely + + # If basic criteria are met, evaluate proximity + if (same_chrom && same_svtype && same_event && same_tcn && same_lcn) { + gap <- next_row$start - current_row$end + + # Calculate the relative threshold based on the average size of the two segments + len_a <- current_row$end - current_row$start + len_b <- next_row$end - next_row$start + relative_threshold <- ((len_a + len_b) / 2) * max_gap_rel + + # Hybrid merge condition: gap must be below BOTH absolute and relative thresholds + if (gap >= 0 && gap <= max_gap_abs && gap <= relative_threshold) { + # Merge: update the end of the current segment and aggregate numeric fields + current_row$end <- next_row$end + current_row$num.mark <- current_row$num.mark + next_row$num.mark + current_row$nhet <- current_row$nhet + next_row$nhet + + # Skip to the next iteration to try merging the newly expanded segment + # with the one that follows. + next + } + } + + # If no merge occurred, the current segment is final. Save it. + merged_rows <- append(merged_rows, list(current_row)) + # The next segment becomes the new current segment. + current_row <- next_row + } + + # Append the very last segment (which is either a standalone or the result of the last merge) + merged_rows <- append(merged_rows, list(current_row)) + + # Reconstruct a single data frame from the list of merged rows + do.call(rbind, merged_rows) +} # --- Main Analysis Function --- main <- function(args) { # Set seed for reproducibility @@ -184,7 +268,7 @@ # Copy-Neutral LOH cncf_for_vcf[cncf_for_vcf$tcn.em == 2 & !is.na(cncf_for_vcf$lcn.em) & cncf_for_vcf$lcn.em == 0, c("svtype", "event")] <- c("CNV", "CN_LOH") - # Filtrer les segments normaux (ceux où 'svtype' est encore NA) + # Filter normal segments (where'svtype' is still NA) cnv_calls <- cncf_for_vcf[!is.na(cncf_for_vcf$svtype), ] } else { cnv_calls <- data.frame() @@ -192,11 +276,18 @@ if (nrow(cnv_calls) > 0) { + if (args$enable_merging) { + cnv_calls <- merge_segments( + cnv_df = cnv_calls, + max_gap_abs = args$merge_gap_abs, + max_gap_rel = args$merge_gap_rel + ) + } vcf_header <- create_vcf_header(args$sample_id, fit$purity, fit$ploidy) vcf_body <- apply(cnv_calls, 1, function(seg) { + cnv_calls <- merge_segments(cnv_calls) alt_allele <- paste0("<", seg["svtype"], ">") - info <- paste0( "END=", seg["end"], ";SVTYPE=", seg["svtype"], @@ -207,6 +298,8 @@ ";NUM_MARK=", seg["num.mark"], ";NHET=", seg["nhet"] ) + # Remove any space(s) immediately following an '=' sign in the INFO string. + info <- gsub("=\\s+", "=", info) paste(seg["chrom"], seg["start"], ".", "N", alt_allele, ".", "PASS", info, sep = "\t") })
