Mercurial > repos > azomics > meta_autocluster

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metacyto_autocluster.R	Tue Jul 27 23:00:49 2021 +0000
@@ -0,0 +1,247 @@
+#!/usr/bin/env Rscript
+######################################################################
+#                  Copyright (c) 2018 Northrop Grumman.
+#                          All rights reserved.
+######################################################################
+#
+# Version 1 - January 2018
+# Author: Cristel Thomas
+#
+#
+
+library(flowCore)
+library(MetaCyto)
+
+check_cluster_def <- function(cl_def) {
+  if (cl_def == "" || cl_def == "None") {
+    quit(save = "no", status = 14, runLast = FALSE)
+  } else {
+    tmp <- gsub(" ", "", cl_def, fixed = TRUE)
+    clean_def <- gsub(",", "|", tmp, fixed = TRUE)
+    return(toupper(clean_def))
+  }
+}
+
+path_to_group_file <- function(path_to_result) {
+  grp <- basename(dirname(path_to_result))
+  return(paste(grp, "fcs", sep = ".", collapse = NULL))
+}
+
+group_file_to_group_name <- function(result_file) {
+  return(strsplit(result_file, ".", fixed = TRUE)[[1]][1])
+}
+
+auto_cluster_panels <- function(params, df, fcspaths, fcsnames, quant=0.95,
+                              events=0.05, cluster_algorithm="FlowSOM",
+                              clusters=vector(), outdir="", list_clust="",
+                              metacluster=40, xdim=10, ydim=10, seed=42, uc="") {
+
+  working_dir <- "tmp_metacyto"
+  working_out <- "tmp_metacyto_out"
+  dir.create(working_dir)
+  dir.create(outdir)
+
+  # get nb of groups
+  nb_groups <- length(fcsnames)
+
+  # reformat summary -- expects csv + 'fcs_names' && 'fcs_files'
+  new_df <- file.path(working_dir, "processed_sample_summary.csv")
+  df$fcs_names <- df$filenames
+  df$fcs_files <- df$filenames
+  write.csv(df, file = new_df, row.names = F)
+
+  # move && rename FCS files to same directory
+  for (i in seq_len(length(fcspaths))) {
+    new_file <- file.path(working_dir, fcsnames[[i]])
+    if (!grepl(".fcs$", new_file)) {
+        new_file <- paste0(new_file, ".fcs")
+    }
+    file.copy(fcspaths[[i]], new_file)
+  }
+
+#### will need to add other parameters when Zicheng has them working.
+  if (cluster_algorithm == "FlowSOM") {
+    cluster_label <- autoCluster.batch(preprocessOutputFolder = working_dir,
+                                        excludeClusterParameters = params,
+                                        labelQuantile = quant,
+                                        clusterFunction = flowSOM.MC,
+                                        minPercent = events,
+                                        k = metacluster,
+                                        xdim = xdim,
+                                        ydim = ydim,
+                                        seed = seed)
+
+  } else {
+    cluster_label <- autoCluster.batch(preprocessOutputFolder = working_dir,
+                                        excludeClusterParameters = params,
+                                        labelQuantile = quant,
+                                        clusterFunction = flowHC,
+                                        minPercent = events)
+  }
+
+
+  # Add potential user-defined label to cluster definitions
+  if (length(clusters) > 1) {
+    cluster_label <- c(cluster_label, clusters)
+  }
+  write.table(cluster_label, list_clust, quote = F, row.names = F, col.names = F)
+
+  # Derive summary statistics for the clusters
+  # Result will be written out to the directory speficied by the "outpath" argument
+  searchCluster.batch(preprocessOutputFolder = working_dir,
+                      outpath = working_out,
+                      clusterLabel = cluster_label)
+
+  result_files <- list.files(working_out,
+                             pattern = "cluster_stats_in_each_sample",
+                             recursive = T,
+                             full.names = T)
+  no_results <- vector()
+  if (length(result_files) != nb_groups) {
+    groups_with_results <- sapply(result_files, path_to_group_file)
+    ## one or more groups with no results, figure out which
+    no_results <- setdiff(fcsnames, groups_with_results)
+  }
+
+  if (length(no_results) == nb_groups) {
+    sink(uc)
+    cat("No clusters were found in none of the groups.")
+    sink()
+  } else {
+    unused_clrs <- list()
+    if (length(no_results > 0)) {
+      grp_no_results <- sapply(no_results, group_file_to_group_name)
+      unused_clrs <- data.frame("cluster_label" = "any", "not_found_in" = grp_no_results)
+    }
+    for (result in result_files) {
+      group_name <- strsplit(result, .Platform$file.sep)[[1]][2]
+      new_filename <- paste(c(group_name, "cluster_stats.txt"), collapse = "_")
+      new_path <- file.path(outdir, new_filename)
+      tmp_df <- read.csv(result)
+
+      used_clr <- as.character(unique(tmp_df$label))
+      if (length(used_clr) != length(cluster_label)) {
+        unused <- setdiff(cluster_label, used_clr)
+        tmp_udf <- data.frame("cluster_label" = unused, "not_found_in" = group_name)
+        unused_clrs <- rbind(unused_clrs, tmp_udf)
+      }
+      colnames(tmp_df)[[1]] <- "group_name"
+      write.table(tmp_df, new_path, quote = F, row.names = F, col.names = T, sep = "\t")
+    }
+
+    if (is.null(dim(unused_clrs))) {
+      sink(uc)
+      cat("All provided cluster definition were found in provided FCS files.")
+      sink()
+    } else {
+      write.table(unused_clrs, uc, quote = F, row.names = F, col.names = T, sep = "\t")
+    }
+  }
+}
+
+check_input <- function(params = vector(), report = "", fcs_files = list(),
+                       grp_names = list(), quant = 0.95, events = 0.05,
+                       cluster_algorithm = "FlowSOM", clusters = vector(), outdir = "",
+                       list_clust = "", metacluster = 40, xdim = 10, ydim = 10,
+                       seed = 42, unused = "") {
+  # check FCS files
+  fcspaths <- unlist(fcs_files)
+  fcsnames <- unlist(grp_names)
+  ct_files <- 0
+  some_pb <- FALSE
+  for (i in seq_len(length(fcspaths))) {
+    is_file_valid <- FALSE
+    tryCatch({
+      fcs <- read.FCS(fcspaths[[i]], transformation = FALSE)
+      is_file_valid <- TRUE
+    }, error = function(ex) {
+      print(paste("File is not a valid FCS file:", fcsnames[[i]], ex))
+    })
+    if (is_file_valid) {
+      metacyto_pp_check <- if ("sample_id" %in% colnames(fcs)) TRUE else FALSE
+      if (metacyto_pp_check) {
+        idx <- length(colnames(fcs))
+        ct_files <- ct_files + max(fcs@exprs[, idx])
+      } else {
+        quit(save = "no", status = 11, runLast = FALSE)
+      }
+    } else {
+      some_pb <- TRUE
+    }
+  }
+  # check summary file format
+  df <- read.table(report, sep = "\t", header = T, colClasses = "character")
+  nm <- colnames(df)
+  check_ab <- if ("antibodies" %in% nm) TRUE else FALSE
+  check_sdy <- if ("study_id" %in% nm) TRUE else FALSE
+
+  if (check_sdy && check_ab) {
+    # check that summary index compatible with FCSs in collection - by number of files == index nb?
+    if (ct_files != length(df$antibodies)) {
+      quit(save = "no", status = 12, runLast = FALSE)
+    }
+  } else {
+    quit(save = "no", status = 13, runLast = FALSE)
+  }
+
+  if (some_pb) {
+    quit(save = "no", status = 10, runLast = FALSE)
+  } else {
+    auto_cluster_panels(params, df, fcspaths, fcsnames, quant, events,
+                      cluster_algorithm, clusters, outdir, list_clust,
+                      metacluster, xdim, ydim, seed, unused)
+  }
+}
+
+################################################################################
+################################################################################
+args <- commandArgs(trailingOnly = TRUE)
+
+ex_param <- c("FSC-A", "FSC-W", "FSC-H", "FSC", "SSC", "SSC-A", "SSC-W",
+              "SSC-H", "Time", "Cell_length", "cell_length", "CELL_LENGTH")
+
+if (args[5] != "" && args[5] != "None") {
+  tmp <- gsub(" ", "", args[5], fixed = TRUE)
+  eparam <- unlist(strsplit(tmp, ","))
+  ex_param <- toupper(eparam)
+}
+
+i <- grep(args, pattern = "PARAM")
+ii <- grep(args, pattern = "FCS_FILES")
+
+cluster_def <- vector()
+if (i > 8) {
+  id <- i - 1
+  cl_df <- args[8:id]
+  cluster_def <- sapply(cl_df, check_cluster_def)
+}
+
+metacluster <- 40
+xdim <- 10
+ydim <- 10
+seed <- 42
+if (i + 1 != ii) {
+  metacluster <- as.numeric(args[i + 1])
+  xdim <- as.numeric(args[i + 2])
+  ydim <- as.numeric(args[i + 3])
+  seed <- as.numeric(args[i + 4])
+}
+
+fcs_files <- list()
+fcs_names <- list()
+j <- 1
+m <- ii + 1
+n <- length(args) - 1
+tmp_fcs <- args[m:n]
+
+for (k in seq_len(length(tmp_fcs))) {
+  if (k %% 2) {
+    fcs_files[[j]] <- tmp_fcs[[k]]
+    fcs_names[[j]] <- tmp_fcs[[k + 1]]
+    j <- j + 1
+  }
+}
+
+check_input(ex_param, args[1], fcs_files, fcs_names, as.numeric(args[3]),
+           as.numeric(args[4]), args[2], cluster_def, args[6], args[7],
+           metacluster, xdim, ydim, seed, args[length(args)])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metacyto_autocluster.xml	Tue Jul 27 23:00:49 2021 +0000
@@ -0,0 +1,155 @@
+<tool id="metacyto_autocluster" name="Autoclustering analysis" version="1.0+galaxy0" profile="18.01">
+  <description>using MetaCyto</description>
+  <requirements>
+    <!-- <requirement type="package" version="1.42.0">bioconductor-flowcore</requirement> -->
+    <!-- flowcore is already a dependency of MetaCyto -->
+    <requirement type="package" version="1.4.0">bioconductor-metacyto</requirement>
+  </requirements>
+  <stdio>
+    <exit_code range="1:9" />
+    <exit_code range="10" level="fatal" description="Please provide valid input FCS files." />
+    <exit_code range="11" level="fatal" description="Please provide FCS files pre-processed for MetaCyto." />
+    <exit_code range="12" level="fatal" description="Pre-processing summary doesn't match the set of FCS files." />
+    <exit_code range="13" level="fatal" description="The pre-processing summary is in the wrong format." />
+    <exit_code range="14" level="fatal" description="Please provide a cluster definition" />
+    <exit_code range="15:" />
+  </stdio>
+  <command><![CDATA[
+    Rscript --slave --vanilla '$__tool_directory__/metacyto_autocluster.R' '${summary}' '${clr_option.clustering}' '${quantile}' '${min_event}' '${ex_param}' 'fcs_stats' '${cluster_list}'
+  #if $more_cluster_def.more_def == "TRUE"
+    '${more_cluster_def.first_def}'
+    #for $r in $more_cluster_def.cl_df
+      '${r.cluster_def}'
+    #end for
+  #end if
+  'PARAM'
+  #if $clr_option.clustering == "FlowSOM"
+    '${clr_option.mcluster}' '${clr_option.xdim}' '${clr_option.ydim}' '${clr_option.seed}'
+  #end if
+  'FCS_FILES'
+  #for $f in $group
+    '${f}' '${f.name}'
+  #end for
+  '${unused}'
+  ]]>
+  </command>
+  <inputs>
+    <param format="metacyto_summary.txt" name="summary" type="data" label="MetaCyto preprocessing summary"/>
+    <param format="fcs" name="group" type="data_collection" collection_type="list" label="FCS files Collection pre-processed for MetaCyto"/>
+    <conditional name="clr_option">
+      <param name="clustering" type="select" label="Clustering algorithm to use:">
+        <option value="FlowSOM">FlowSOM</option>
+        <option value="flowHC">flowHC</option>
+      </param>
+      <when value="FlowSOM">
+        <param name="mcluster" type="integer" value="40" label="Number of expected metaclusters" help="MetaCyto authors recommend using 40, FlowSOM default is 10." />
+        <param name="xdim" type="integer" value="10" label="Grid size, width"/>
+        <param name="ydim" type="integer" value="10" label="Grid size, height"  help="By default, the grid size is 10x10. The grid size specifies the number of clusters." />
+        <param name="seed" type="integer" value="42" label="Seed"  help="Let's be geeks, default is set to 42." />
+      </when>
+    </conditional>
+    <param name="quantile" type="float" value="0.95" min="0.5" max="1" label="Quantile threshold" help="Minimum percent of cells in a cluster expressing more or less than the cutoff value of a marker. The default value is 0.95"/>
+    <param name="min_event" type="float" value="0.05" min="0" max="0.5" label="Minimum percent of cells in the positive and negative region after bisection" help="The default value is 0.05. Keep this factor small to avoid bisecting uni-mode distributions."/>
+    <param name="ex_param" type="text" label="Markers to exclude from clustering analysis:" help="By default FSC, SSC, Time and Cell Length channels are excluded. Providing markers to exclude overrides the default setting. i.e.:FSC,SSC,CD88"/>
+    <conditional name="more_cluster_def">
+      <param name="more_def" type="boolean" label="Add cluster definitions?" checked="false" truevalue="TRUE" falsevalue="FALSE" />
+      <when value="TRUE">
+        <param name="first_def" type="text" label="Additional cluster definition:" help="For example: CD3+,CD4-,CD8+,CCR7+"/>
+        <repeat name="cl_df" title="Cluster:">
+          <param name="cluster_def" type="text" label="Additional cluster definition:" help="For example: CD3+,CD4-,CD8+,CCR7+"/>
+        </repeat>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <collection type="list" label="${clr_option.clustering} autoClustering analysis on ${on_string}" name="output">
+      <discover_datasets pattern="(?P&lt;name&gt;.*)" directory="fcs_stats" format="metacyto_stats.txt" />
+    </collection>
+    <data format="tabular" name="unused" label="List of clusters not found in all files from ${on_string}"/>
+    <data format="metacyto_clr.txt" name="cluster_list" label="List of clusters from ${clr_option.clustering} autoClustering analysis of ${group.name}"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="summary" value="preprocess.metacyto_summary.txt"/>
+      <param name="group">
+        <collection type="list">
+          <element name="Group1" value="Group1.fcs"/>
+          <element name="Group2" value="Group2.fcs"/>
+        </collection>
+      </param>
+      <output name="cluster_list" ftype="metacyto_clr.txt">
+        <assert_contents>
+          <has_n_lines n="80" />
+          <has_text text="CD16-" />
+        </assert_contents>
+      </output>
+      <output name="unused" ftype="tabular">
+        <assert_contents>
+          <has_n_lines n="81" />
+          <has_text text="CD19-" />
+        </assert_contents>
+      </output>
+    </test>
+  </tests>
+  <help><![CDATA[
+This tool uses MetaCyto to cluster events automatically from several sets of FCS files.
+---------------------------------------------------------------------------------------
+
+**Input files**
+
+This tool requires the pre-processing summary generated for MetaCyto as well as the pre-processed FCS files.
+
+**Parameters**
+
+*Quantile threshold*
+
+This value represents the minimum proportion of events in a cluster that should express more (or less) than the cutoff value for a given marker. With a value of 0.8, a cluster will be labeled Marker1+ if over 80% of cells in the cluster express Marker1 at a higher level than a cutoff value determined by the clustering analysis.
+
+*Minimum Number of events*
+
+This value represents the minimum proportion of cells in each region after bisection. Keep this factor small to avoid bisecting uni-mode distributions.
+
+*Markers to exclude*
+
+Please provide a comma-separated list of the markers that should be excluded from the clustering analysis. By default, FSC, SSC, Time and Cell Length channels are excluded, but providing a list of markers overrides the default settings.
+
+*Additional cluster definitions*
+
+Please provide additional gate definitions as comma-separated lists of marker names, for instance "CD3+, CD4+, CD25+, Foxp3+".
+
+*FlowSOM - Number of meta-clusters*
+
+Please indicate the exact number of meta-clusters expected.
+
+*FlowSOM - Grid dimension*
+
+Please indicate the dimension of the grid for establishing the SOM. The dimension of the grid defines the number of clusters (height x width = cluster)
+
+*FlowSOM - Seed*
+
+Please indicate a random number to use as seed. To make your analysis reproducible, use the same seed.
+
+**Output**
+
+This tool generates a list of clusters identified, including optionally provided cluster definitions, as well as a table of the MFI for each marker in each cluster in each file, and proportion of each cluster in each file. A list of unused cluster definitions, if any, is also generated.
+-----
+
+**Example**
+
+*Input* - Pre-Processing Summary Table::
+   study_id antibodies                  filenames
+   group1   Marker1|Marker2|Marker3|... file1.fcs
+   group2   Marker1|Marker2|Marker3|... file2.fcs
+   ...      ...                         ...
+*Output* - Clustering Summary Tables::
+   group_name fcs_files        cluster_id label                      fcs_names        Marker1 Marker2 ... fraction
+   group1     file1_group1.fcs cluster1   Marker1-|Marker2+|Marker3+ file1_group1.fcs 1.9815  0.2024  ... 0.373
+   group1     file2_group1.fcs cluster1   Marker1-|Marker2+|Marker3+ file2_group1.fcs 2.3739  0.3377  ... 0.26
+   ...        ...              ...        ...                        ...              ...     ...     ... ...
+*Output* - Cluster List::
+   Marker1+|Marker3-
+   Marker1-|Marker2+|Marker3-
+   ...
+  ]]>
+  </help>
+</tool>
Binary file test-data/Group1.fcs has changed
Binary file test-data/Group2.fcs has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/preprocess.metacyto_summary.txt	Tue Jul 27 23:00:49 2021 +0000
@@ -0,0 +1,11 @@
+study_id	antibodies	filenames
+Group1	FSC-A|SSC-A|CD11A|CD4|CD16|CD28|CD95|CD3|CD62L|CD8B|CCR7|TIME|SAMPLE_ID	inputflow6.fcs
+Group1	FSC-A|SSC-A|CD11A|CD4|CD16|CD28|CD95|CD3|CD62L|CD8B|CCR7|TIME|SAMPLE_ID	inputflow5.fcs
+Group1	FSC-A|SSC-A|CD11A|CD4|CD16|CD28|CD95|CD3|CD62L|CD8B|CCR7|TIME|SAMPLE_ID	inputflow4.fcs
+Group1	FSC-A|SSC-A|CD11A|CD4|CD16|CD28|CD95|CD3|CD62L|CD8B|CCR7|TIME|SAMPLE_ID	inputflow3.fcs
+Group1	FSC-A|SSC-A|CD11A|CD4|CD16|CD28|CD95|CD3|CD62L|CD8B|CCR7|TIME|SAMPLE_ID	inputflow2.fcs
+Group1	FSC-A|SSC-A|CD11A|CD4|CD16|CD28|CD95|CD3|CD62L|CD8B|CCR7|TIME|SAMPLE_ID	inputflow1.fcs
+Group2	TIME|CELL_LENGTH|DEAD|CD19|CD4|CD8|IGD|CD85J|CD16|CD3|CD38|CD27|CD14|CD94|CCR7|CD45RA|CD20|CD127|CD33|CD28|CD24|CD161|TCRGD|CD56|HLADR|CD25|DNA1|DNA2|SAMPLE_ID	inputcytof4.fcs
+Group2	TIME|CELL_LENGTH|DEAD|CD19|CD4|CD8|IGD|CD85J|CD16|CD3|CD38|CD27|CD14|CD94|CCR7|CD45RA|CD20|CD127|CD33|CD28|CD24|CD161|TCRGD|CD56|HLADR|CD25|DNA1|DNA2|SAMPLE_ID	inputcytof3.fcs
+Group2	TIME|CELL_LENGTH|DEAD|CD19|CD4|CD8|IGD|CD85J|CD16|CD3|CD38|CD27|CD14|CD94|CCR7|CD45RA|CD20|CD127|CD33|CD28|CD24|CD161|TCRGD|CD56|HLADR|CD25|DNA1|DNA2|SAMPLE_ID	inputcytof2.fcs
+Group2	TIME|CELL_LENGTH|DEAD|CD19|CD4|CD8|IGD|CD85J|CD16|CD3|CD38|CD27|CD14|CD94|CCR7|CD45RA|CD20|CD127|CD33|CD28|CD24|CD161|TCRGD|CD56|HLADR|CD25|DNA1|DNA2|SAMPLE_ID	inputcytof1.fcs