changeset 8:e8a8a4910e32 draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/facets commit 2a0f9aee1c61e12ab9f0e25a6ba7db5c08b67fe6
author artbio
date Thu, 09 Oct 2025 17:14:30 +0000
parents 86bcdc94b008
children
files macros.xml snp_pileup_for_facets_wrapper.sh
diffstat 2 files changed, 118 insertions(+), 86 deletions(-)
--- a/macros.xml	Wed Oct 08 17:41:18 2025 +0000
+++ b/macros.xml	Thu Oct 09 17:14:30 2025 +0000
@@ -1,7 +1,7 @@
 <macros>
     <token name="@pipefail@"><![CDATA[set -o | grep -q pipefail && set -o pipefail;]]></token>
     <token name="@TOOL_VERSION@">0.6.2</token>
-    <token name="@VERSION_SUFFIX@">7</token>
+    <token name="@VERSION_SUFFIX@">8</token>
     <token name="@PROFILE@">23.0</token>
     <xml name="requirements">
         <requirements>
--- a/snp_pileup_for_facets_wrapper.sh	Wed Oct 08 17:41:18 2025 +0000
+++ b/snp_pileup_for_facets_wrapper.sh	Thu Oct 09 17:14:30 2025 +0000
@@ -2,25 +2,26 @@
 
 # ==============================================================================
 #
-# snp_pileup_for_facets_wrapper.sh (v3.3 - Final Robust Parallelism)
+# snp_pileup_for_facets_wrapper.sh (v4.0 - Optimal Parallelization)
 #
 # Description:
-#   A robust wrapper script for running snp-pileup in parallel. It works around
-#   snp-pileup's inability to filter by region by creating temporary per-chromosome
-#   input files for each parallel job.
+#   A wrapper that parallelizes snp-pileup by dividing the genome into many
+#   equally sized chunks, keeping the workload balanced across all allocated
+#   threads regardless of chromosome length.
 #
 # Author:
 #   drosofff (ARTbio) & Gemini (Google)
 #
 # ==============================================================================
 
+# --- Robustness settings ---
+set -euo pipefail
+
 # --- Help Function ---
 usage() {
     cat << EOF
 Usage: $0 -n <normal.bam> -t <tumor.bam> -v <snps.vcf.gz> -o <output.csv.gz> [OPTIONS]
 
-This script runs snp-pileup in parallel across all chromosomes.
-
 REQUIRED ARGUMENTS:
   -n    Path to the normal BAM file (must be indexed).
   -t    Path to the tumor BAM file (must be indexed).
@@ -28,32 +29,23 @@
   -o    Path for the final output pileup file (will be bgzip compressed).
 
 OPTIONS:
-  -N    Number of parallel processes to use (default: 1).
+  -N    Number of parallel processes to use (default: \${GALAXY_SLOTS}, or 4 if unset).
   -q    Minimum mapping quality for reads (default: 15).
   -Q    Minimum base quality for bases (default: 20).
-  -p    Pseudo-SNP spacing in bp passed to snp-pileup (--pseudo-snps). Range: 150–5000.
-        Default: 300 (≈ 3.3 pseudo-SNPs/kb; robust with the controlled 1 SNP/kb VCF).
+  -p    Pseudo-SNP spacing in bp (default: 300).
   -A    Include anomalous read pairs (flag, default: not set).
   -h    Display this help message and exit.
 EOF
     exit 0
 }
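+# Example invocation (hypothetical file names; both BAMs must be indexed):
+#   ./snp_pileup_for_facets_wrapper.sh -n normal.bam -t tumor.bam \
+#       -v snps.vcf.gz -o pileup.csv.gz -N 8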
 
-# --- Robustness settings ---
-set -e -u -o pipefail
-
-# --- Argument parsing ---
-# (This part is correct and remains unchanged)
-normal_bam=""
-tumor_bam=""
-snp_vcf=""
-output_pileup=""
-nprocs=1
+# --- Argument Parsing ---
+nprocs=${GALAXY_SLOTS:-4} # Default to Galaxy's variable, with a fallback
 mapq=15
 baseq=20
+pseudo_snps=300
 count_orphans=""
 
-pseudo_snps=300   # default, was hard-coded previously
 while getopts ":hn:t:v:o:N:q:Q:p:A" opt; do
     case ${opt} in
         h ) usage ;;
@@ -71,93 +63,133 @@
     esac
 done
 
-# --- Main logic ---
+# --- Tool paths ---
+# (Executables are assumed to be on PATH and are resolved at call time.)
+SNP_PILEUP_EXE="snp-pileup"
+SAMTOOLS_EXE="samtools"
+BCFTOOLS_EXE="bcftools"
 
-# Find full paths to executables
-SNP_PILEUP_EXE=$(command -v snp-pileup)
-SAMTOOLS_EXE=$(command -v samtools)
-BCFTOOLS_EXE=$(command -v bcftools)
+# --- Temp directory setup and cleanup trap ---
+TMPDIR=$(mktemp -d)
+trap 'rm -rf -- "$TMPDIR"' EXIT
+
+# ==============================================================================
+# --- Phase 1: Parallelization Strategy using Balanced Regions ---
+# ==============================================================================
+
+echo "Generating balanced genomic chunks for parallel processing..."
 
-for exe in SNP_PILEUP_EXE SAMTOOLS_EXE BCFTOOLS_EXE; do
-    if [ -z "${!exe}" ]; then
-        echo "Error: '${exe%%_*}' executable not found in PATH." >&2
-        exit 1
-    fi
-done
-echo "Found snp-pileup executable at: ${SNP_PILEUP_EXE}"
-# --- Input validation for pseudo_snps guardrails ---
-if ! [[ "${pseudo_snps}" =~ ^[0-9]+$ ]]; then
-    echo "Error: -p/--pseudo-snps must be an integer (bp)." >&2
-    exit 1
-fi
-if [ "${pseudo_snps}" -lt 150 ] || [ "${pseudo_snps}" -gt 5000 ]; then
-    echo "Error: -p/--pseudo-snps value (${pseudo_snps}) out of safe range [150..5000] bp." >&2
-    echo "Rationale: too small → excessive density/CPU; too large → pseudo-SNPs too sparse and FACETS may fail." >&2
+# Use many more chunks than threads (10x) so that threads finishing early can
+# pick up remaining chunks: dynamic load balancing via GNU Parallel.
+NUM_CHUNKS=$((nprocs * 10))
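+# Worked example: with 8 threads this yields 80 chunks; for a ~3.1 Gb human
+# genome, that is roughly 39 Mb per chunk.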
+
+# Step 1: Get the list of chromosomes to process directly from the VCF data body
+# (more reliable than the header, which may list contigs that carry no records),
+# then apply a positive filter to keep only autosomes and sex chromosomes.
+VCF_PRIMARY_CHROMS=$( \
+    zcat "${snp_vcf}" \
+    | grep -v "^#" \
+    | cut -f 1 \
+    | sort -u \
+    | grep -E '^(chr)?([0-9]+|X|Y)$' \
+    || true \
+)
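+# The filter accepts both prefixed and bare names, e.g. "chr1".."chr22",
+# "chrX", "chrY" as well as "1".."22", "X", "Y".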
+
+if [ -z "$VCF_PRIMARY_CHROMS" ]; then
+    echo "Error: No primary autosomes or sex chromosomes (1-22, X, Y) were found in the VCF file body." >&2
     exit 1
 fi
-echo "Using pseudo-SNP spacing: ${pseudo_snps} bp"
-
-
-echo "Starting SNP pileup process with ${nprocs} parallel jobs..."
+echo "Found the following chromosomes to process:"
+echo "$VCF_PRIMARY_CHROMS"
 
-TMPDIR=$(mktemp -d)
-trap "rm -rf '${TMPDIR}'" EXIT
-echo "Temporary directory created at: ${TMPDIR}"
+# Step 2: Get the geometry (lengths) for these specific chromosomes from the BAM header.
+GENOME_GEOMETRY=$( \
+    "${SAMTOOLS_EXE}" view -H "$normal_bam" \
+    | awk -F'\t' '/^@SQ/ {print $2"\t"$3}' \
+    | sed 's/SN://' | sed 's/LN://' \
+    | grep -wFf <(echo "$VCF_PRIMARY_CHROMS") \
+)
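+# Each GENOME_GEOMETRY line is "<name><TAB><length>", e.g. "chr1\t248956422"
+# (GRCh38 chr1).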
 
-CHROMS=$("${SAMTOOLS_EXE}" view -H "${normal_bam}" | grep "^@SQ" | cut -f 2 | sed 's/SN://' | grep -Fwf - <(zcat "${snp_vcf}" | grep -v "^#" | cut -f 1 | sort -u) )
-echo "Found the following chromosomes to process in both BAM and VCF:"
-echo "${CHROMS}"
-
-# --- CORRECT PARALLEL EXECUTION LOGIC ---
+# Step 3: Calculate the total genome size to process and the "ideal" chunk size.
+TOTAL_SIZE=$(echo "$GENOME_GEOMETRY" | awk '{sum+=$2} END {print sum}')
+if [[ -z "$TOTAL_SIZE" || "$TOTAL_SIZE" -eq 0 ]]; then
+    echo "Error: Could not determine genome size. Make sure chromosome names in BAM and VCF match (e.g., 'chr1' vs '1')." >&2
+    exit 1
+fi
+CHUNK_SIZE=$((TOTAL_SIZE / NUM_CHUNKS))
+# Guard against a zero chunk size (e.g. tiny test genomes), which would make
+# the region loop below spin forever.
+if [ "$CHUNK_SIZE" -lt 1 ]; then CHUNK_SIZE=1; fi
+echo "Number of threads: ${nprocs}"
+echo "Target chunk size: ${CHUNK_SIZE} bp for ${NUM_CHUNKS} total chunks."
 
-# Define the function that will be run in parallel for each chromosome.
-# It will inherit the exported variables from the main script.
-scatter_and_run() {
-    local chrom=$1
-    echo "Scattering data for chromosome ${chrom}..."
+# Step 4: Iterate over each chromosome to generate the list of regions.
+REGIONS=""
+while read -r chrom chrom_len; do
+    current_pos=1
+    while [[ "$current_pos" -lt "$chrom_len" ]]; do
+        end_pos=$((current_pos + CHUNK_SIZE - 1))
+        # Ensure the chunk does not extend beyond the end of the chromosome.
+        if [[ "$end_pos" -gt "$chrom_len" ]]; then
+            end_pos=$chrom_len
+        fi
+        # Add the region (format chr:start-end) to our list of tasks.
+        REGIONS="${REGIONS} ${chrom}:${current_pos}-${end_pos}"
+        current_pos=$((end_pos + 1))
+    done
+done <<< "$GENOME_GEOMETRY"
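+# Illustrative result: with a 39 Mb chunk size, chr1 is emitted as
+# chr1:1-39000000 chr1:39000001-78000000 ... (the last chunk is clipped to
+# the chromosome end).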
+
+echo "Generated $(echo "$REGIONS" | wc -w | xargs) tasks for GNU Parallel."
 
-    local temp_vcf="${TMPDIR}/${chrom}.vcf.gz"
-    local temp_nbam="${TMPDIR}/${chrom}.normal.bam"
-    local temp_tbam="${TMPDIR}/${chrom}.tumor.bam"
-    local temp_output="${TMPDIR}/${chrom}.csv"
+# ==============================================================================
+# --- Phase 2: Parallel Execution (Map) ---
+# ==============================================================================
 
-    "${BCFTOOLS_EXE}" view -r "${chrom}" "${snp_vcf}" -Oz -o "${temp_vcf}"
-    "${SAMTOOLS_EXE}" view -b "${normal_bam}" "${chrom}" > "${temp_nbam}"
-    "${SAMTOOLS_EXE}" view -b "${tumor_bam}" "${chrom}" > "${temp_tbam}"
+# Worker function: runs snp-pileup on a single genomic region.
+scatter_and_run() {
+    local region="$1"
+    local region_safe_name
+    region_safe_name=$(echo "$region" | tr ':-' '_')
+    
+    local temp_output="${TMPDIR}/pileup_${region_safe_name}.csv"
+    local temp_nbam="${TMPDIR}/normal_${region_safe_name}.bam"
+    local temp_tbam="${TMPDIR}/tumor_${region_safe_name}.bam"
+    local temp_vcf="${TMPDIR}/snps_${region_safe_name}.vcf.gz"
 
+    # Extract data for THIS SPECIFIC REGION
+    "${SAMTOOLS_EXE}" view -b "${normal_bam}" "${region}" -o "${temp_nbam}"
+    "${SAMTOOLS_EXE}" view -b "${tumor_bam}" "${region}" -o "${temp_tbam}"
+    "${BCFTOOLS_EXE}" view --regions "${region}" "${snp_vcf}" -Oz -o "${temp_vcf}"
+
+    # Index the temporary files
     "${SAMTOOLS_EXE}" index "${temp_nbam}"
     "${SAMTOOLS_EXE}" index "${temp_tbam}"
+    "${BCFTOOLS_EXE}" index "${temp_vcf}"
 
-    echo "Running snp-pileup on chromosome ${chrom}..."
+    # Run snp-pileup
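+    # Output is CSV; the header typically reads
+    # Chromosome,Position,Ref,Alt,File1R,File1A,File1E,File1D,File2R,...
+    # (File1 = normal, File2 = tumor, in command-line order).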
     "${SNP_PILEUP_EXE}" --pseudo-snps="${pseudo_snps}" -q "${mapq}" -Q "${baseq}" ${count_orphans} "${temp_vcf}" "${temp_output}" "${temp_nbam}" "${temp_tbam}"
 }
 
-# Export all necessary variables AND the function so they are available to the sub-shells created by GNU Parallel.
 export -f scatter_and_run
-export snp_vcf normal_bam tumor_bam TMPDIR SNP_PILEUP_EXE SAMTOOLS_EXE BCFTOOLS_EXE mapq baseq count_orphans pseudo_snps
+export normal_bam tumor_bam snp_vcf TMPDIR SNP_PILEUP_EXE SAMTOOLS_EXE BCFTOOLS_EXE pseudo_snps mapq baseq count_orphans
+
+echo "Starting parallel processing with ${nprocs} threads..."
+parallel --jobs "${nprocs}" scatter_and_run ::: ${REGIONS}
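+# Note: under 'set -euo pipefail', a non-zero exit from GNU Parallel (any
+# failed job) aborts the script before the gather phase runs.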
+echo "Parallel processing finished."
 
-# Run the function in parallel, feeding it the list of chromosomes.
-parallel --jobs "${nprocs}" scatter_and_run ::: ${CHROMS}
+# ==============================================================================
+# --- Phase 3: Gathering Results (Reduce) ---
+# ==============================================================================
 
-echo "Parallel processing finished. Concatenating results..."
-
-# gather job outputs and sort chromosome remains the same
-FIRST_FILE=$(ls -1v "${TMPDIR}"/*.csv 2>/dev/null | head -n 1)
-if [ -z "${FIRST_FILE}" ]; then
+echo "Concatenating, sorting, and compressing results..."
+FIRST_FILE=$(find "$TMPDIR" -name '*.csv' -print -quit)
+if [ -z "$FIRST_FILE" ]; then
     echo "Error: No pileup files were generated." >&2
     exit 1
 fi
 
-# Use command grouping { ...; } to pipe the combined output of head and tail.
-# This entire pipeline writes the final, sorted, compressed file.
+# Gather the per-chunk results into a single sorted, compressed pileup.
 {
-    # 1. Print the header once.
-    head -n 1 "${FIRST_FILE}";
+    # Print the header from the first file; it must not pass through sort.
+    head -n 1 "$FIRST_FILE";
+    # Concatenate the content of all other files (skipping their headers) and
+    # sort on the comma-separated chromosome and position columns.
+    tail -q -n +2 "${TMPDIR}"/*.csv | sort -t, -k1,1V -k2,2n;
+} | bgzip > "$output_pileup"
 
-    # 2. Concatenate all files (skipping their headers).
-    tail -q -n +2 "${TMPDIR}"/*.csv;
-
-} | sort -t, -k1,1V -k2,2n | bgzip > "${output_pileup}"
-echo "Concatenation, sorting and compression complete."
-echo "Final output is in ${output_pileup}"
-echo "Script finished successfully. The temporary directory will be removed by the trap."
\ No newline at end of file
+echo "Script finished successfully. Final output is in ${output_pileup}"