Mercurial > repos > azomics > meta_autocluster
changeset 0:c744db871f90 draft default tip
"planemo upload for repository https://github.com/AstraZeneca-Omics/immport-galaxy-tools/tree/master/flowtools/metacyto_autocluster commit 3cc1083d473530ed4f7439d590568baa51a46857"
author | azomics |
---|---|
date | Tue, 27 Jul 2021 23:00:49 +0000 |
parents | |
children | |
files | metacyto_autocluster.R metacyto_autocluster.xml test-data/Group1.fcs test-data/Group2.fcs test-data/preprocess.metacyto_summary.txt |
diffstat | 5 files changed, 413 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metacyto_autocluster.R Tue Jul 27 23:00:49 2021 +0000 @@ -0,0 +1,247 @@ +#!/usr/bin/env Rscript +###################################################################### +# Copyright (c) 2018 Northrop Grumman. +# All rights reserved. +###################################################################### +# +# Version 1 - January 2018 +# Author: Cristel Thomas +# +# + +library(flowCore) +library(MetaCyto) + +check_cluster_def <- function(cl_def) { + if (cl_def == "" || cl_def == "None") { + quit(save = "no", status = 14, runLast = FALSE) + } else { + tmp <- gsub(" ", "", cl_def, fixed = TRUE) + clean_def <- gsub(",", "|", tmp, fixed = TRUE) + return(toupper(clean_def)) + } +} + +path_to_group_file <- function(path_to_result) { + grp <- basename(dirname(path_to_result)) + return(paste(grp, "fcs", sep = ".", collapse = NULL)) +} + +group_file_to_group_name <- function(result_file) { + return(strsplit(result_file, ".", fixed = TRUE)[[1]][1]) +} + +auto_cluster_panels <- function(params, df, fcspaths, fcsnames, quant=0.95, + events=0.05, cluster_algorithm="FlowSOM", + clusters=vector(), outdir="", list_clust="", + metacluster=40, xdim=10, ydim=10, seed=42, uc="") { + + working_dir <- "tmp_metacyto" + working_out <- "tmp_metacyto_out" + dir.create(working_dir) + dir.create(outdir) + + # get nb of groups + nb_groups <- length(fcsnames) + + # reformat summary -- expects csv + 'fcs_names' && 'fcs_files' + new_df <- file.path(working_dir, "processed_sample_summary.csv") + df$fcs_names <- df$filenames + df$fcs_files <- df$filenames + write.csv(df, file = new_df, row.names = F) + + # move && rename FCS files to same directory + for (i in seq_len(length(fcspaths))) { + new_file <- file.path(working_dir, fcsnames[[i]]) + if (!grepl(".fcs$", new_file)) { + new_file <- paste0(new_file, ".fcs") + } + file.copy(fcspaths[[i]], new_file) + } + +#### will need to add other parameters when Zicheng has them working. + if (cluster_algorithm == "FlowSOM") { + cluster_label <- autoCluster.batch(preprocessOutputFolder = working_dir, + excludeClusterParameters = params, + labelQuantile = quant, + clusterFunction = flowSOM.MC, + minPercent = events, + k = metacluster, + xdim = xdim, + ydim = ydim, + seed = seed) + + } else { + cluster_label <- autoCluster.batch(preprocessOutputFolder = working_dir, + excludeClusterParameters = params, + labelQuantile = quant, + clusterFunction = flowHC, + minPercent = events) + } + + + # Add potential user-defined label to cluster definitions + if (length(clusters) > 1) { + cluster_label <- c(cluster_label, clusters) + } + write.table(cluster_label, list_clust, quote = F, row.names = F, col.names = F) + + # Derive summary statistics for the clusters + # Result will be written out to the directory speficied by the "outpath" argument + searchCluster.batch(preprocessOutputFolder = working_dir, + outpath = working_out, + clusterLabel = cluster_label) + + result_files <- list.files(working_out, + pattern = "cluster_stats_in_each_sample", + recursive = T, + full.names = T) + no_results <- vector() + if (length(result_files) != nb_groups) { + groups_with_results <- sapply(result_files, path_to_group_file) + ## one or more groups with no results, figure out which + no_results <- setdiff(fcsnames, groups_with_results) + } + + if (length(no_results) == nb_groups) { + sink(uc) + cat("No clusters were found in none of the groups.") + sink() + } else { + unused_clrs <- list() + if (length(no_results > 0)) { + grp_no_results <- sapply(no_results, group_file_to_group_name) + unused_clrs <- data.frame("cluster_label" = "any", "not_found_in" = grp_no_results) + } + for (result in result_files) { + group_name <- strsplit(result, .Platform$file.sep)[[1]][2] + new_filename <- paste(c(group_name, "cluster_stats.txt"), collapse = "_") + new_path <- file.path(outdir, new_filename) + tmp_df <- read.csv(result) + + used_clr <- as.character(unique(tmp_df$label)) + if (length(used_clr) != length(cluster_label)) { + unused <- setdiff(cluster_label, used_clr) + tmp_udf <- data.frame("cluster_label" = unused, "not_found_in" = group_name) + unused_clrs <- rbind(unused_clrs, tmp_udf) + } + colnames(tmp_df)[[1]] <- "group_name" + write.table(tmp_df, new_path, quote = F, row.names = F, col.names = T, sep = "\t") + } + + if (is.null(dim(unused_clrs))) { + sink(uc) + cat("All provided cluster definition were found in provided FCS files.") + sink() + } else { + write.table(unused_clrs, uc, quote = F, row.names = F, col.names = T, sep = "\t") + } + } +} + +check_input <- function(params = vector(), report = "", fcs_files = list(), + grp_names = list(), quant = 0.95, events = 0.05, + cluster_algorithm = "FlowSOM", clusters = vector(), outdir = "", + list_clust = "", metacluster = 40, xdim = 10, ydim = 10, + seed = 42, unused = "") { + # check FCS files + fcspaths <- unlist(fcs_files) + fcsnames <- unlist(grp_names) + ct_files <- 0 + some_pb <- FALSE + for (i in seq_len(length(fcspaths))) { + is_file_valid <- FALSE + tryCatch({ + fcs <- read.FCS(fcspaths[[i]], transformation = FALSE) + is_file_valid <- TRUE + }, error = function(ex) { + print(paste("File is not a valid FCS file:", fcsnames[[i]], ex)) + }) + if (is_file_valid) { + metacyto_pp_check <- if ("sample_id" %in% colnames(fcs)) TRUE else FALSE + if (metacyto_pp_check) { + idx <- length(colnames(fcs)) + ct_files <- ct_files + max(fcs@exprs[, idx]) + } else { + quit(save = "no", status = 11, runLast = FALSE) + } + } else { + some_pb <- TRUE + } + } + # check summary file format + df <- read.table(report, sep = "\t", header = T, colClasses = "character") + nm <- colnames(df) + check_ab <- if ("antibodies" %in% nm) TRUE else FALSE + check_sdy <- if ("study_id" %in% nm) TRUE else FALSE + + if (check_sdy && check_ab) { + # check that summary index compatible with FCSs in collection - by number of files == index nb? + if (ct_files != length(df$antibodies)) { + quit(save = "no", status = 12, runLast = FALSE) + } + } else { + quit(save = "no", status = 13, runLast = FALSE) + } + + if (some_pb) { + quit(save = "no", status = 10, runLast = FALSE) + } else { + auto_cluster_panels(params, df, fcspaths, fcsnames, quant, events, + cluster_algorithm, clusters, outdir, list_clust, + metacluster, xdim, ydim, seed, unused) + } +} + +################################################################################ +################################################################################ +args <- commandArgs(trailingOnly = TRUE) + +ex_param <- c("FSC-A", "FSC-W", "FSC-H", "FSC", "SSC", "SSC-A", "SSC-W", + "SSC-H", "Time", "Cell_length", "cell_length", "CELL_LENGTH") + +if (args[5] != "" && args[5] != "None") { + tmp <- gsub(" ", "", args[5], fixed = TRUE) + eparam <- unlist(strsplit(tmp, ",")) + ex_param <- toupper(eparam) +} + +i <- grep(args, pattern = "PARAM") +ii <- grep(args, pattern = "FCS_FILES") + +cluster_def <- vector() +if (i > 8) { + id <- i - 1 + cl_df <- args[8:id] + cluster_def <- sapply(cl_df, check_cluster_def) +} + +metacluster <- 40 +xdim <- 10 +ydim <- 10 +seed <- 42 +if (i + 1 != ii) { + metacluster <- as.numeric(args[i + 1]) + xdim <- as.numeric(args[i + 2]) + ydim <- as.numeric(args[i + 3]) + seed <- as.numeric(args[i + 4]) +} + +fcs_files <- list() +fcs_names <- list() +j <- 1 +m <- ii + 1 +n <- length(args) - 1 +tmp_fcs <- args[m:n] + +for (k in seq_len(length(tmp_fcs))) { + if (k %% 2) { + fcs_files[[j]] <- tmp_fcs[[k]] + fcs_names[[j]] <- tmp_fcs[[k + 1]] + j <- j + 1 + } +} + +check_input(ex_param, args[1], fcs_files, fcs_names, as.numeric(args[3]), + as.numeric(args[4]), args[2], cluster_def, args[6], args[7], + metacluster, xdim, ydim, seed, args[length(args)])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metacyto_autocluster.xml Tue Jul 27 23:00:49 2021 +0000 @@ -0,0 +1,155 @@ +<tool id="metacyto_autocluster" name="Autoclustering analysis" version="1.0+galaxy0" profile="18.01"> + <description>using MetaCyto</description> + <requirements> + <!-- <requirement type="package" version="1.42.0">bioconductor-flowcore</requirement> --> + <!-- flowcore is already a dependency of MetaCyto --> + <requirement type="package" version="1.4.0">bioconductor-metacyto</requirement> + </requirements> + <stdio> + <exit_code range="1:9" /> + <exit_code range="10" level="fatal" description="Please provide valid input FCS files." /> + <exit_code range="11" level="fatal" description="Please provide FCS files pre-processed for MetaCyto." /> + <exit_code range="12" level="fatal" description="Pre-processing summary doesn't match the set of FCS files." /> + <exit_code range="13" level="fatal" description="The pre-processing summary is in the wrong format." /> + <exit_code range="14" level="fatal" description="Please provide a cluster definition" /> + <exit_code range="15:" /> + </stdio> + <command><![CDATA[ + Rscript --slave --vanilla '$__tool_directory__/metacyto_autocluster.R' '${summary}' '${clr_option.clustering}' '${quantile}' '${min_event}' '${ex_param}' 'fcs_stats' '${cluster_list}' + #if $more_cluster_def.more_def == "TRUE" + '${more_cluster_def.first_def}' + #for $r in $more_cluster_def.cl_df + '${r.cluster_def}' + #end for + #end if + 'PARAM' + #if $clr_option.clustering == "FlowSOM" + '${clr_option.mcluster}' '${clr_option.xdim}' '${clr_option.ydim}' '${clr_option.seed}' + #end if + 'FCS_FILES' + #for $f in $group + '${f}' '${f.name}' + #end for + '${unused}' + ]]> + </command> + <inputs> + <param format="metacyto_summary.txt" name="summary" type="data" label="MetaCyto preprocessing summary"/> + <param format="fcs" name="group" type="data_collection" collection_type="list" label="FCS files Collection pre-processed for MetaCyto"/> + <conditional name="clr_option"> + <param name="clustering" type="select" label="Clustering algorithm to use:"> + <option value="FlowSOM">FlowSOM</option> + <option value="flowHC">flowHC</option> + </param> + <when value="FlowSOM"> + <param name="mcluster" type="integer" value="40" label="Number of expected metaclusters" help="MetaCyto authors recommend using 40, FlowSOM default is 10." /> + <param name="xdim" type="integer" value="10" label="Grid size, width"/> + <param name="ydim" type="integer" value="10" label="Grid size, height" help="By default, the grid size is 10x10. The grid size specifies the number of clusters." /> + <param name="seed" type="integer" value="42" label="Seed" help="Let's be geeks, default is set to 42." /> + </when> + </conditional> + <param name="quantile" type="float" value="0.95" min="0.5" max="1" label="Quantile threshold" help="Minimum percent of cells in a cluster expressing more or less than the cutoff value of a marker. The default value is 0.95"/> + <param name="min_event" type="float" value="0.05" min="0" max="0.5" label="Minimum percent of cells in the positive and negative region after bisection" help="The default value is 0.05. Keep this factor small to avoid bisecting uni-mode distributions."/> + <param name="ex_param" type="text" label="Markers to exclude from clustering analysis:" help="By default FSC, SSC, Time and Cell Length channels are excluded. Providing markers to exclude overrides the default setting. i.e.:FSC,SSC,CD88"/> + <conditional name="more_cluster_def"> + <param name="more_def" type="boolean" label="Add cluster definitions?" checked="false" truevalue="TRUE" falsevalue="FALSE" /> + <when value="TRUE"> + <param name="first_def" type="text" label="Additional cluster definition:" help="For example: CD3+,CD4-,CD8+,CCR7+"/> + <repeat name="cl_df" title="Cluster:"> + <param name="cluster_def" type="text" label="Additional cluster definition:" help="For example: CD3+,CD4-,CD8+,CCR7+"/> + </repeat> + </when> + </conditional> + </inputs> + <outputs> + <collection type="list" label="${clr_option.clustering} autoClustering analysis on ${on_string}" name="output"> + <discover_datasets pattern="(?P<name>.*)" directory="fcs_stats" format="metacyto_stats.txt" /> + </collection> + <data format="tabular" name="unused" label="List of clusters not found in all files from ${on_string}"/> + <data format="metacyto_clr.txt" name="cluster_list" label="List of clusters from ${clr_option.clustering} autoClustering analysis of ${group.name}"/> + </outputs> + <tests> + <test> + <param name="summary" value="preprocess.metacyto_summary.txt"/> + <param name="group"> + <collection type="list"> + <element name="Group1" value="Group1.fcs"/> + <element name="Group2" value="Group2.fcs"/> + </collection> + </param> + <output name="cluster_list" ftype="metacyto_clr.txt"> + <assert_contents> + <has_n_lines n="80" /> + <has_text text="CD16-" /> + </assert_contents> + </output> + <output name="unused" ftype="tabular"> + <assert_contents> + <has_n_lines n="81" /> + <has_text text="CD19-" /> + </assert_contents> + </output> + </test> + </tests> + <help><![CDATA[ +This tool uses MetaCyto to cluster events automatically from several sets of FCS files. +--------------------------------------------------------------------------------------- + +**Input files** + +This tool requires the pre-processing summary generated for MetaCyto as well as the pre-processed FCS files. + +**Parameters** + +*Quantile threshold* + +This value represents the minimum proportion of events in a cluster that should express more (or less) than the cutoff value for a given marker. With a value of 0.8, a cluster will be labeled Marker1+ if over 80% of cells in the cluster express Marker1 at a higher level than a cutoff value determined by the clustering analysis. + +*Minimum Number of events* + +This value represents the minimum proportion of cells in each region after bisection. Keep this factor small to avoid bisecting uni-mode distributions. + +*Markers to exclude* + +Please provide a comma-separated list of the markers that should be excluded from the clustering analysis. By default, FSC, SSC, Time and Cell Length channels are excluded, but providing a list of markers overrides the default settings. + +*Additional cluster definitions* + +Please provide additional gate definitions as comma-separated lists of marker names, for instance "CD3+, CD4+, CD25+, Foxp3+". + +*FlowSOM - Number of meta-clusters* + +Please indicate the exact number of meta-clusters expected. + +*FlowSOM - Grid dimension* + +Please indicate the dimension of the grid for establishing the SOM. The dimension of the grid defines the number of clusters (height x width = cluster) + +*FlowSOM - Seed* + +Please indicate a random number to use as seed. To make your analysis reproducible, use the same seed. + +**Output** + +This tool generates a list of clusters identified, including optionally provided cluster definitions, as well as a table of the MFI for each marker in each cluster in each file, and proportion of each cluster in each file. A list of unused cluster definitions, if any, is also generated. +----- + +**Example** + +*Input* - Pre-Processing Summary Table:: + study_id antibodies filenames + group1 Marker1|Marker2|Marker3|... file1.fcs + group2 Marker1|Marker2|Marker3|... file2.fcs + ... ... ... +*Output* - Clustering Summary Tables:: + group_name fcs_files cluster_id label fcs_names Marker1 Marker2 ... fraction + group1 file1_group1.fcs cluster1 Marker1-|Marker2+|Marker3+ file1_group1.fcs 1.9815 0.2024 ... 0.373 + group1 file2_group1.fcs cluster1 Marker1-|Marker2+|Marker3+ file2_group1.fcs 2.3739 0.3377 ... 0.26 + ... ... ... ... ... ... ... ... ... +*Output* - Cluster List:: + Marker1+|Marker3- + Marker1-|Marker2+|Marker3- + ... + ]]> + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/preprocess.metacyto_summary.txt Tue Jul 27 23:00:49 2021 +0000 @@ -0,0 +1,11 @@ +study_id antibodies filenames +Group1 FSC-A|SSC-A|CD11A|CD4|CD16|CD28|CD95|CD3|CD62L|CD8B|CCR7|TIME|SAMPLE_ID inputflow6.fcs +Group1 FSC-A|SSC-A|CD11A|CD4|CD16|CD28|CD95|CD3|CD62L|CD8B|CCR7|TIME|SAMPLE_ID inputflow5.fcs +Group1 FSC-A|SSC-A|CD11A|CD4|CD16|CD28|CD95|CD3|CD62L|CD8B|CCR7|TIME|SAMPLE_ID inputflow4.fcs +Group1 FSC-A|SSC-A|CD11A|CD4|CD16|CD28|CD95|CD3|CD62L|CD8B|CCR7|TIME|SAMPLE_ID inputflow3.fcs +Group1 FSC-A|SSC-A|CD11A|CD4|CD16|CD28|CD95|CD3|CD62L|CD8B|CCR7|TIME|SAMPLE_ID inputflow2.fcs +Group1 FSC-A|SSC-A|CD11A|CD4|CD16|CD28|CD95|CD3|CD62L|CD8B|CCR7|TIME|SAMPLE_ID inputflow1.fcs +Group2 TIME|CELL_LENGTH|DEAD|CD19|CD4|CD8|IGD|CD85J|CD16|CD3|CD38|CD27|CD14|CD94|CCR7|CD45RA|CD20|CD127|CD33|CD28|CD24|CD161|TCRGD|CD56|HLADR|CD25|DNA1|DNA2|SAMPLE_ID inputcytof4.fcs +Group2 TIME|CELL_LENGTH|DEAD|CD19|CD4|CD8|IGD|CD85J|CD16|CD3|CD38|CD27|CD14|CD94|CCR7|CD45RA|CD20|CD127|CD33|CD28|CD24|CD161|TCRGD|CD56|HLADR|CD25|DNA1|DNA2|SAMPLE_ID inputcytof3.fcs +Group2 TIME|CELL_LENGTH|DEAD|CD19|CD4|CD8|IGD|CD85J|CD16|CD3|CD38|CD27|CD14|CD94|CCR7|CD45RA|CD20|CD127|CD33|CD28|CD24|CD161|TCRGD|CD56|HLADR|CD25|DNA1|DNA2|SAMPLE_ID inputcytof2.fcs +Group2 TIME|CELL_LENGTH|DEAD|CD19|CD4|CD8|IGD|CD85J|CD16|CD3|CD38|CD27|CD14|CD94|CCR7|CD45RA|CD20|CD127|CD33|CD28|CD24|CD161|TCRGD|CD56|HLADR|CD25|DNA1|DNA2|SAMPLE_ID inputcytof1.fcs