changeset 0:bf6470882a15 draft default tip

"planemo upload for repository https://github.com/AstraZeneca-Omics/immport-galaxy-tools/tree/master/flowtools/metacyto_preprocess commit c3d761b4fca140636c3f22ef0fdbb855f3ecbdb8"
author azomics
date Sun, 25 Jul 2021 10:36:03 +0000
parents
children
files metacyto_preprocess.R metacyto_preprocess.xml test-data/inputcytof1.fcs test-data/inputcytof2.fcs test-data/inputcytof3.fcs test-data/inputcytof4.fcs test-data/inputflow1.fcs test-data/inputflow2.fcs test-data/inputflow3.fcs test-data/inputflow4.fcs test-data/inputflow5.fcs test-data/inputflow6.fcs
diffstat 12 files changed, 397 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metacyto_preprocess.R	Sun Jul 25 10:36:03 2021 +0000
@@ -0,0 +1,227 @@
+#!/usr/bin/env Rscript
+######################################################################
+#                  Copyright (c) 2018 Northrop Grumman.
+#                          All rights reserved.
+######################################################################
+#
+# Version 1 - January 2018
+# Author: Cristel Thomas
+#
+#
+
+library(flowCore)
+library(MetaCyto)
+
+compare_lists <- function(m1, m2) {
+  list_check <- T
+  if (is.na(all(m1 == m2))) {
+    mm1 <- is.na(m1)
+    mm2 <- is.na(m2)
+    if (all(mm1 == mm2)) {
+      if (!all(m1 == m2, na.rm = TRUE)) {
+        list_check <- F
+      }
+    } else {
+      list_check <- F
+    }
+  } else if (!all(m1 == m2)) {
+    list_check <- F
+  }
+  return(list_check)
+}
+
+
+run_batch_processing <- function(sampling_size = 5000, flag_default = T,
+                               to_exclude, outdir = "", outfile = "",
+                               labels, assays, factors, fcspaths, fcsnames) {
+  # Create meta_data object
+  fp <- unlist(fcspaths)
+  file_counts <- lengths(fcspaths)
+  group_names <- rep(labels, times = file_counts)
+  group_bs <- rep(factors, times = file_counts)
+  group_types <- rep(assays, times = file_counts)
+
+  meta_data <- data.frame(fcs_files = fp, study_id = group_names)
+
+  # excluded_parameters
+  default_param <- c("FSC-A", "FSC-H", "FSC-W", "FSC", "SSC-A", "SSC-H",
+                     "SSC-W", "SSC", "Time", "Cell_length", "cell_length",
+                     "CELL_LENGTH")
+  excluded_parameters <- if (flag_default) default_param else to_exclude
+  # Run preprocessing.batch
+  preprocessing.batch(inputMeta = meta_data,
+                      assay = group_types,
+                      b = group_bs,
+                      fileSampleSize = sampling_size,
+                      outpath = outdir,
+                      excludeTransformParameters = excluded_parameters)
+
+  # deal with outputs
+  # output[2]: a csv file summarizing the pre-processing result.
+  ## -> open file to pull info and print out filenames rather than path.
+  tmp_csv <- file.path(outdir, "processed_sample_summary.csv")
+  tmp <- read.csv(tmp_csv)
+  tmp$old_index <- seq(1, length(tmp$fcs_names))
+
+  fn <- unlist(fcsnames)
+  df <- data.frame(fcs_files = fp, filenames = fn)
+
+  # merge two data frames by ID
+  total <- merge(tmp, df, by = "fcs_files")
+  total2 <- total[order(total$old_index), ]
+  to_drop <- c("fcs_names", "fcs_files", "old_index")
+  newdf <- total2[, !(names(total2) %in% to_drop)]
+  write.table(newdf, file = outfile, quote = F, row.names = F, col.names = T, sep = "\t")
+
+  file.remove(tmp_csv)
+}
+
+check_fcs <- function(sampling = 5000, flag_default = TRUE, to_exclude,
+                     outdir = "", outfile = "", labels, assays, factors,
+                     fcspaths, fcsnames) {
+
+  if (length(labels) > length(unique(labels))) {
+    # we have repeated group names, all group names need to be different
+    print("ERROR: repeated labels among groups, make sure that labels are all different for groups.")
+    print("The following labels are repeated")
+    table(labels)[table(labels) > 1]
+    quit(save = "no", status = 13, runLast = FALSE)
+  }
+
+  marker_pb <- FALSE
+  for (i in seq_len(length(fcspaths))) {
+    for (n in seq_len(length(fcspaths[[i]]))) {
+      marker_check <- FALSE
+      marker_channel <- FALSE
+      tryCatch({
+        fcs <- read.FCS(fcspaths[[i]][[n]], transformation = FALSE)
+      }, error = function(ex) {
+        print(paste("File is not a valid FCS file:", fnames[[i]][[n]], ex))
+        quit(save = "no", status = 10, runLast = FALSE)
+      })
+
+      if (n == 1) {
+        m1 <- as.vector(pData(parameters(fcs))$desc)
+        c1 <- colnames(fcs)
+      } else {
+        m2 <- as.vector(pData(parameters(fcs))$desc)
+        c2 <- colnames(fcs)
+        marker_check <- compare_lists(m1, m2)
+        marker_channel <- compare_lists(c1, c2)
+      }
+      if (n > 1 && marker_check == F) {
+        marker_pb <- TRUE
+        print(paste("Marker discrepancy detected in markers -- group", labels[[i]]))
+      } else if (n > 1 && marker_channel == F) {
+        marker_pb <- TRUE
+        print(paste("Marker discrepancy detected in channels -- group", labels[[i]]))
+      }
+    }
+  }
+
+  if (marker_pb) {
+    quit(save = "no", status = 12, runLast = FALSE)
+  } else {
+    run_batch_processing(sampling, flag_default, to_exclude, outdir, outfile,
+                       labels, assays, factors, fcspaths, fcsnames)
+  }
+}
+
+################################################################################
+################################################################################
+args <- commandArgs(trailingOnly = TRUE)
+
+# Arg 1: sub_sampling number
+# Arg 2: output dir for processed FCS files for check_fcs and run_batch_processing
+# Arg 3: Main output file (text file)
+# Arg 4: excluded params
+# Arg 5: Group 1 Name
+# Arg 6: Group 1 format
+# Arg 7: Group 1 Scaling factor
+# Cycle through files in group 1
+# Arg  : file path in Galaxy
+# Arg  : desired real file name
+# Cycle through at at least one additional group
+# Arg : 'new_panel' - used as some sort of delimiter
+# Arg : Group n+1 Name
+# Arg : Group n+1 format
+# Arg : Group n+1 Scaling factor
+## Cycle through files in that group
+## Arg : file path in Galaxy
+## Arg : desired real file path
+
+sub_sampling <- NULL
+if (as.numeric(args[1]) > 0) {
+  sub_sampling <- as.numeric(args[1])
+}
+
+# parameters to exclude => args[4]
+to_exclude <- vector()
+flag_default <- FALSE
+i <- 1
+if (args[4] == "None" || args[4] == "") {
+  flag_default <- TRUE
+} else {
+  excluded <- unlist(strsplit(args[4], ","))
+  for (channel in excluded) {
+    stripped_chan <- gsub(" ", "", channel, fixed = TRUE)
+    if (!is.na(stripped_chan)) {
+      to_exclude[[i]] <- stripped_chan
+    }
+    i <- i + 1
+  }
+}
+
+# handle group cycle in arguments to produce iterable panels
+tot_args <- length(args)
+tmpargs <- paste(args[5:tot_args], collapse = "=%=")
+tmppanels <- strsplit(tmpargs, "=%=new_panel=%=")
+nb_panel <- length(tmppanels[[1]])
+
+labels <- vector(mode = "character", length = nb_panel)
+assay_types <- vector(mode = "character", length = nb_panel)
+scaling_factors <- vector(mode = "numeric", length = nb_panel)
+filepaths <- list()
+filenames <- list()
+
+# iterate over panels (groups of fcs files)
+j <- 1
+for (pnl in tmppanels[[1]]) {
+  tmppanel <- strsplit(pnl, "=%=")
+  # number of FCS files
+  nb_files <- (length(tmppanel[[1]]) - 3) / 2
+  tmplist <- character(nb_files)
+  tmpnames <- character(nb_files)
+  if (tmppanel[[1]][[1]] == "None" || tmppanel[[1]][[1]] == "") {
+    print(paste("ERROR: Empty group name/label for group ", j))
+    quit(save = "no", status = 11, runLast = FALSE)
+  } else {
+    labels[[j]] <- tmppanel[[1]][[1]]
+  }
+  # assay type
+  assay_types[[j]] <- tmppanel[[1]][[2]]
+
+  scaling_factors[[j]] <- 0
+  if (as.numeric(tmppanel[[1]][[3]]) > 0) {
+    scaling_factors[[j]] <- 1 / as.numeric(tmppanel[[1]][[3]])
+  }
+
+  k <- 1
+  for (m in 4:length(tmppanel[[1]])) {
+    if (!m %% 2) {
+      tmplist[[k]] <- tmppanel[[1]][[m]]
+      tmpnames[[k]] <- tmppanel[[1]][[m + 1]]
+      k <- k + 1
+    }
+  }
+  filepaths[[tmppanel[[1]][1]]] <- tmplist
+  filenames[[tmppanel[[1]][1]]] <- tmpnames
+  j <- j + 1
+}
+
+check_fcs(sub_sampling, flag_default, to_exclude, args[2], args[3], labels,
+         assay_types, scaling_factors, filepaths, filenames)
+
+# check_fcs <- function(sampling = 5000, flag_default = TRUE, to_exclude,
+#                       outdir = "", outfile = "", labels, assays, factors,
+#                       fcspaths, fcsnames)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metacyto_preprocess.xml	Sun Jul 25 10:36:03 2021 +0000
@@ -0,0 +1,170 @@
+<tool id="metacyto_preprocess" name="Pre-process samples" version="1.0+galaxy0" profile="18.01">
+  <description>for MetaCyto</description>
+  <requirements>
+    <requirement type="package" version="1.4.0">bioconductor-metacyto</requirement>
+  </requirements>
+  <stdio>
+    <exit_code range="1:9" />
+    <exit_code range="10" level="fatal" description="Please provide valid input FCS files." />
+    <exit_code range="11" level="fatal" description="Please provide a label for FCS files sets." />
+    <exit_code range="12" level="fatal" description="FCS files in a same group MUST have the same set of markers." />
+    <exit_code range="13" level="fatal" description="All groups needs to have different labels"/>
+    <exit_code range="14:" />
+  </stdio>
+  <command><![CDATA[
+    Rscript --slave --vanilla '$__tool_directory__/metacyto_preprocess.R'
+      '${sampling}'
+      'preprocessed_fcs'
+      '${output_file}'
+      '${excluded_param}'
+      '${g1_name}'
+      '${g1_format}'
+      '${g1_scaling_factor}'
+    #for $f in $group1
+    '${f}' '${f.name}'
+    #end for
+    #for $panel in $fcs_set
+      'new_panel'
+      '${panel.gp_name}'
+      '${panel.gp_format}'
+      '${panel.gp_scaling_factor}'
+      #for $ff in $panel.group
+        '${ff}' '${ff.name}'
+      #end for
+    #end for
+    ]]>
+  </command>
+  <inputs>
+    <param name="sampling" type="integer" label="Number of events to sample FCS files to." help="0 will use all events from input files, default value is 5000." value="5000"/>
+    <param name="excluded_param" type="text" label="Parameters to exclude from the transformation." help="By default FSC, SSC, Time and Cell Length channels are excluded. Providing markers to exclude overrides the default setting. i.e.:FSC, SSC, CD88"/>
+    <param name="g1_name" type="text" label="Label for the first set of FCS files." value="group 1"/>
+    <param format="fcs" name="group1" type="data_collection" collection_type="list" label="FCS files Collection."/>
+    <param name="g1_format" type="select" label="Assay type for the first set of FCS files." help="If files are compensated already, please select CyTOF.">
+      <option value="FCM" selected="true">Standard Flow Cytometry data</option>
+      <option value="CyTOF">CyTOF data</option>
+    </param>
+    <param name="g1_scaling_factor" type="integer" min="0" max="200" value="150" label="Scaling factor b for arcsinh transform for the first set of files." help="The default value is 150 for standard FCM data. The recommended value for CyTOF data is 5. If data is transformed already, please select 0."/>
+    <repeat name="fcs_set" title="Set of FCS files">
+      <param name="gp_name" type="text" label="Label for this set of FCS files." help="For example: group 2"/>
+      <param format="fcs" name="group" type="data_collection" collection_type="list" label="FCS files Collection."/>
+      <param name="gp_format" type="select" label="Assay type for the first set of FCS files." help="If files are compensated already, please select CyTOF.">
+        <option value="FCM" selected="true">Standard Flow Cytometry data</option>
+        <option value="CyTOF">CyTOF data</option>
+      </param>
+      <param name="gp_scaling_factor" type="integer" min="1" max="200" value="150" label="Scaling factor b for arcsinh transform for the first set of files." help="The default value is 150 for standard FCM data. The recommended value for cyTOF data is 5. If data is transformed already, please select 0."/>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="metacyto_summary.txt" name="output_file" label="${tool.name} on ${on_string}: samples summary"/>
+    <collection type="list" label="${tool.name} on ${on_string}: processed samples" name="preprocessed">
+      <discover_datasets directory="preprocessed_fcs" pattern="__name_and_ext__" ext="fcs" />
+    </collection>
+  </outputs>
+  <tests>
+    <test>
+      <param name="sampling" value="1000"/>
+      <param name="excluded_param" value="FSC-A,FSC-W,FSC-H,Time,Cell_length"/>
+      <param name="g1_name" value="SDY376"/>
+      <param name="group1">
+        <collection type="list">
+          <element name="inputflow1" value="inputflow1.fcs"/>
+          <element name="inputflow2" value="inputflow2.fcs"/>
+          <element name="inputflow3" value="inputflow3.fcs"/>
+          <element name="inputflow4" value="inputflow4.fcs"/>
+          <element name="inputflow5" value="inputflow5.fcs"/>
+          <element name="inputflow6" value="inputflow6.fcs"/>
+        </collection>
+      </param>
+      <param name="g1_format" value="FCM"/>
+      <param name="g1_scaling_factor" value="150"/>
+      <repeat name="fcs_set">
+        <param name="gp_name" value="SDY376-2"/>
+        <param name="group">
+          <collection type="list">
+            <element name="inputcytof1" value="inputcytof1.fcs"/>
+            <element name="inputcytof2" value="inputcytof2.fcs"/>
+            <element name="inputcytof3" value="inputcytof3.fcs"/>
+            <element name="inputcytof4" value="inputcytof4.fcs"/>
+          </collection>
+        </param>
+        <param name="gp_format" value="CyTOF"/>
+        <param name="gp_scaling_factor" value="8"/>
+      </repeat>
+      <output name="output_file">
+        <assert_contents>
+          <has_n_lines n="11" />
+        </assert_contents>
+      </output>
+      <output_collection name="preprocessed" type="list">
+        <element name="SDY376" ftype="fcs">
+          <assert_contents>
+            <has_text_matching expression="^FCS3.0" />
+          </assert_contents>
+        </element>
+        <element name="SDY376-2" ftype="fcs">
+          <assert_contents>
+            <has_text_matching expression="^FCS3.0" />
+          </assert_contents>
+        </element>
+      </output_collection>
+    </test>
+  </tests>
+  <help><![CDATA[
+    Pre-process samples
+    -------------------
+
+    This tool uses MetaCyto's preprocessing function to prepare sets of FCS files for a MetaCyto analysis.
+
+    **Input**
+    This tool requires one or more sets of FCS files.
+    .. class:: infomark
+    The number provided for sub-sampling corresponds to the number of events randomly sampled from each FCS files.
+    **Output**
+    This tool generates one or more FCS files containing optionally sub-sampled data from the input FCS data sets. The FCS data can optionally be compensated and/or transformed. A summary of the operations is also generated.
+    .. class:: infomark
+    This tool uses the arcsinh transformation. If you would like to use another transformation algorithm, sets of files can be prepared independantly by using the following tools:
+    - Merge and downsample FCS files with FlowSOM
+    - Transform FCS data with optional compensation and automated gating with flowDensity.
+    .. class:: warningmark
+    The workflow to use MetaCyto in R vs. ImmPort Galaxy are slightly different - please use the following tool in FCS File Tools to harmonize FCS files before MetaCyto pre-processing:
+    - Edit markers or channels in FCS files
+
+    **Example**
+
+    *File1*: 20K events::
+
+    Marker1 Marker2 Marker3 ...
+    34      45      12      ...
+    33      65      10      ...
+    87      26      76      ...
+    24      56      32      ...
+    95      83      53      ...
+    ...     ...     ...     ...
+
+    *File2*: 20K events::
+
+    Marker1 Marker2 Marker3 ...
+    19      62      98      ...
+    12      36      58      ...
+    41      42      68      ...
+    76      74      53      ...
+    62      34      45      ...
+    ...     ...     ...     ...
+
+    *Output*: 5K events::
+
+    Marker1 Marker2 Marker3 ...
+    34      45      12      ...
+    87      26      76      ...
+    12      36      58      ...
+    62      34      45      ...
+    ...     ...     ...     ...
+
+    *Output* - Summary Table::
+
+    study_id antibodies                  filenames
+    group1   Marker1|Marker2|Marker3|... file1.fcs
+    group2   Marker1|Marker2|Marker3|... file2.fcs
+    ]]>
+  </help>
+</tool>
Binary file test-data/inputcytof1.fcs has changed
Binary file test-data/inputcytof2.fcs has changed
Binary file test-data/inputcytof3.fcs has changed
Binary file test-data/inputcytof4.fcs has changed
Binary file test-data/inputflow1.fcs has changed
Binary file test-data/inputflow2.fcs has changed
Binary file test-data/inputflow3.fcs has changed
Binary file test-data/inputflow4.fcs has changed
Binary file test-data/inputflow5.fcs has changed
Binary file test-data/inputflow6.fcs has changed