Mercurial > repos > azomics > metacyto_preprocess
changeset 0:bf6470882a15 draft default tip
"planemo upload for repository https://github.com/AstraZeneca-Omics/immport-galaxy-tools/tree/master/flowtools/metacyto_preprocess commit c3d761b4fca140636c3f22ef0fdbb855f3ecbdb8"
author | azomics |
---|---|
date | Sun, 25 Jul 2021 10:36:03 +0000 |
parents | |
children | |
files | metacyto_preprocess.R metacyto_preprocess.xml test-data/inputcytof1.fcs test-data/inputcytof2.fcs test-data/inputcytof3.fcs test-data/inputcytof4.fcs test-data/inputflow1.fcs test-data/inputflow2.fcs test-data/inputflow3.fcs test-data/inputflow4.fcs test-data/inputflow5.fcs test-data/inputflow6.fcs |
diffstat | 12 files changed, 397 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metacyto_preprocess.R Sun Jul 25 10:36:03 2021 +0000 @@ -0,0 +1,227 @@ +#!/usr/bin/env Rscript +###################################################################### +# Copyright (c) 2018 Northrop Grumman. +# All rights reserved. +###################################################################### +# +# Version 1 - January 2018 +# Author: Cristel Thomas +# +# + +library(flowCore) +library(MetaCyto) + +compare_lists <- function(m1, m2) { + list_check <- T + if (is.na(all(m1 == m2))) { + mm1 <- is.na(m1) + mm2 <- is.na(m2) + if (all(mm1 == mm2)) { + if (!all(m1 == m2, na.rm = TRUE)) { + list_check <- F + } + } else { + list_check <- F + } + } else if (!all(m1 == m2)) { + list_check <- F + } + return(list_check) +} + + +run_batch_processing <- function(sampling_size = 5000, flag_default = T, + to_exclude, outdir = "", outfile = "", + labels, assays, factors, fcspaths, fcsnames) { + # Create meta_data object + fp <- unlist(fcspaths) + file_counts <- lengths(fcspaths) + group_names <- rep(labels, times = file_counts) + group_bs <- rep(factors, times = file_counts) + group_types <- rep(assays, times = file_counts) + + meta_data <- data.frame(fcs_files = fp, study_id = group_names) + + # excluded_parameters + default_param <- c("FSC-A", "FSC-H", "FSC-W", "FSC", "SSC-A", "SSC-H", + "SSC-W", "SSC", "Time", "Cell_length", "cell_length", + "CELL_LENGTH") + excluded_parameters <- if (flag_default) default_param else to_exclude + # Run preprocessing.batch + preprocessing.batch(inputMeta = meta_data, + assay = group_types, + b = group_bs, + fileSampleSize = sampling_size, + outpath = outdir, + excludeTransformParameters = excluded_parameters) + + # deal with outputs + # output[2]: a csv file summarizing the pre-processing result. + ## -> open file to pull info and print out filenames rather than path. + tmp_csv <- file.path(outdir, "processed_sample_summary.csv") + tmp <- read.csv(tmp_csv) + tmp$old_index <- seq(1, length(tmp$fcs_names)) + + fn <- unlist(fcsnames) + df <- data.frame(fcs_files = fp, filenames = fn) + + # merge two data frames by ID + total <- merge(tmp, df, by = "fcs_files") + total2 <- total[order(total$old_index), ] + to_drop <- c("fcs_names", "fcs_files", "old_index") + newdf <- total2[, !(names(total2) %in% to_drop)] + write.table(newdf, file = outfile, quote = F, row.names = F, col.names = T, sep = "\t") + + file.remove(tmp_csv) +} + +check_fcs <- function(sampling = 5000, flag_default = TRUE, to_exclude, + outdir = "", outfile = "", labels, assays, factors, + fcspaths, fcsnames) { + + if (length(labels) > length(unique(labels))) { + # we have repeated group names, all group names need to be different + print("ERROR: repeated labels among groups, make sure that labels are all different for groups.") + print("The following labels are repeated") + table(labels)[table(labels) > 1] + quit(save = "no", status = 13, runLast = FALSE) + } + + marker_pb <- FALSE + for (i in seq_len(length(fcspaths))) { + for (n in seq_len(length(fcspaths[[i]]))) { + marker_check <- FALSE + marker_channel <- FALSE + tryCatch({ + fcs <- read.FCS(fcspaths[[i]][[n]], transformation = FALSE) + }, error = function(ex) { + print(paste("File is not a valid FCS file:", fnames[[i]][[n]], ex)) + quit(save = "no", status = 10, runLast = FALSE) + }) + + if (n == 1) { + m1 <- as.vector(pData(parameters(fcs))$desc) + c1 <- colnames(fcs) + } else { + m2 <- as.vector(pData(parameters(fcs))$desc) + c2 <- colnames(fcs) + marker_check <- compare_lists(m1, m2) + marker_channel <- compare_lists(c1, c2) + } + if (n > 1 && marker_check == F) { + marker_pb <- TRUE + print(paste("Marker discrepancy detected in markers -- group", labels[[i]])) + } else if (n > 1 && marker_channel == F) { + marker_pb <- TRUE + print(paste("Marker discrepancy detected in channels -- group", labels[[i]])) + } + } + } + + if (marker_pb) { + quit(save = "no", status = 12, runLast = FALSE) + } else { + run_batch_processing(sampling, flag_default, to_exclude, outdir, outfile, + labels, assays, factors, fcspaths, fcsnames) + } +} + +################################################################################ +################################################################################ +args <- commandArgs(trailingOnly = TRUE) + +# Arg 1: sub_sampling number +# Arg 2: output dir for processed FCS files for check_fcs and run_batch_processing +# Arg 3: Main output file (text file) +# Arg 4: excluded params +# Arg 5: Group 1 Name +# Arg 6: Group 1 format +# Arg 7: Group 1 Scaling factor +# Cycle through files in group 1 +# Arg : file path in Galaxy +# Arg : desired real file name +# Cycle through at at least one additional group +# Arg : 'new_panel' - used as some sort of delimiter +# Arg : Group n+1 Name +# Arg : Group n+1 format +# Arg : Group n+1 Scaling factor +## Cycle through files in that group +## Arg : file path in Galaxy +## Arg : desired real file path + +sub_sampling <- NULL +if (as.numeric(args[1]) > 0) { + sub_sampling <- as.numeric(args[1]) +} + +# parameters to exclude => args[4] +to_exclude <- vector() +flag_default <- FALSE +i <- 1 +if (args[4] == "None" || args[4] == "") { + flag_default <- TRUE +} else { + excluded <- unlist(strsplit(args[4], ",")) + for (channel in excluded) { + stripped_chan <- gsub(" ", "", channel, fixed = TRUE) + if (!is.na(stripped_chan)) { + to_exclude[[i]] <- stripped_chan + } + i <- i + 1 + } +} + +# handle group cycle in arguments to produce iterable panels +tot_args <- length(args) +tmpargs <- paste(args[5:tot_args], collapse = "=%=") +tmppanels <- strsplit(tmpargs, "=%=new_panel=%=") +nb_panel <- length(tmppanels[[1]]) + +labels <- vector(mode = "character", length = nb_panel) +assay_types <- vector(mode = "character", length = nb_panel) +scaling_factors <- vector(mode = "numeric", length = nb_panel) +filepaths <- list() +filenames <- list() + +# iterate over panels (groups of fcs files) +j <- 1 +for (pnl in tmppanels[[1]]) { + tmppanel <- strsplit(pnl, "=%=") + # number of FCS files + nb_files <- (length(tmppanel[[1]]) - 3) / 2 + tmplist <- character(nb_files) + tmpnames <- character(nb_files) + if (tmppanel[[1]][[1]] == "None" || tmppanel[[1]][[1]] == "") { + print(paste("ERROR: Empty group name/label for group ", j)) + quit(save = "no", status = 11, runLast = FALSE) + } else { + labels[[j]] <- tmppanel[[1]][[1]] + } + # assay type + assay_types[[j]] <- tmppanel[[1]][[2]] + + scaling_factors[[j]] <- 0 + if (as.numeric(tmppanel[[1]][[3]]) > 0) { + scaling_factors[[j]] <- 1 / as.numeric(tmppanel[[1]][[3]]) + } + + k <- 1 + for (m in 4:length(tmppanel[[1]])) { + if (!m %% 2) { + tmplist[[k]] <- tmppanel[[1]][[m]] + tmpnames[[k]] <- tmppanel[[1]][[m + 1]] + k <- k + 1 + } + } + filepaths[[tmppanel[[1]][1]]] <- tmplist + filenames[[tmppanel[[1]][1]]] <- tmpnames + j <- j + 1 +} + +check_fcs(sub_sampling, flag_default, to_exclude, args[2], args[3], labels, + assay_types, scaling_factors, filepaths, filenames) + +# check_fcs <- function(sampling = 5000, flag_default = TRUE, to_exclude, +# outdir = "", outfile = "", labels, assays, factors, +# fcspaths, fcsnames)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metacyto_preprocess.xml Sun Jul 25 10:36:03 2021 +0000 @@ -0,0 +1,170 @@ +<tool id="metacyto_preprocess" name="Pre-process samples" version="1.0+galaxy0" profile="18.01"> + <description>for MetaCyto</description> + <requirements> + <requirement type="package" version="1.4.0">bioconductor-metacyto</requirement> + </requirements> + <stdio> + <exit_code range="1:9" /> + <exit_code range="10" level="fatal" description="Please provide valid input FCS files." /> + <exit_code range="11" level="fatal" description="Please provide a label for FCS files sets." /> + <exit_code range="12" level="fatal" description="FCS files in a same group MUST have the same set of markers." /> + <exit_code range="13" level="fatal" description="All groups needs to have different labels"/> + <exit_code range="14:" /> + </stdio> + <command><![CDATA[ + Rscript --slave --vanilla '$__tool_directory__/metacyto_preprocess.R' + '${sampling}' + 'preprocessed_fcs' + '${output_file}' + '${excluded_param}' + '${g1_name}' + '${g1_format}' + '${g1_scaling_factor}' + #for $f in $group1 + '${f}' '${f.name}' + #end for + #for $panel in $fcs_set + 'new_panel' + '${panel.gp_name}' + '${panel.gp_format}' + '${panel.gp_scaling_factor}' + #for $ff in $panel.group + '${ff}' '${ff.name}' + #end for + #end for + ]]> + </command> + <inputs> + <param name="sampling" type="integer" label="Number of events to sample FCS files to." help="0 will use all events from input files, default value is 5000." value="5000"/> + <param name="excluded_param" type="text" label="Parameters to exclude from the transformation." help="By default FSC, SSC, Time and Cell Length channels are excluded. Providing markers to exclude overrides the default setting. i.e.:FSC, SSC, CD88"/> + <param name="g1_name" type="text" label="Label for the first set of FCS files." value="group 1"/> + <param format="fcs" name="group1" type="data_collection" collection_type="list" label="FCS files Collection."/> + <param name="g1_format" type="select" label="Assay type for the first set of FCS files." help="If files are compensated already, please select CyTOF."> + <option value="FCM" selected="true">Standard Flow Cytometry data</option> + <option value="CyTOF">CyTOF data</option> + </param> + <param name="g1_scaling_factor" type="integer" min="0" max="200" value="150" label="Scaling factor b for arcsinh transform for the first set of files." help="The default value is 150 for standard FCM data. The recommended value for CyTOF data is 5. If data is transformed already, please select 0."/> + <repeat name="fcs_set" title="Set of FCS files"> + <param name="gp_name" type="text" label="Label for this set of FCS files." help="For example: group 2"/> + <param format="fcs" name="group" type="data_collection" collection_type="list" label="FCS files Collection."/> + <param name="gp_format" type="select" label="Assay type for the first set of FCS files." help="If files are compensated already, please select CyTOF."> + <option value="FCM" selected="true">Standard Flow Cytometry data</option> + <option value="CyTOF">CyTOF data</option> + </param> + <param name="gp_scaling_factor" type="integer" min="1" max="200" value="150" label="Scaling factor b for arcsinh transform for the first set of files." help="The default value is 150 for standard FCM data. The recommended value for cyTOF data is 5. If data is transformed already, please select 0."/> + </repeat> + </inputs> + <outputs> + <data format="metacyto_summary.txt" name="output_file" label="${tool.name} on ${on_string}: samples summary"/> + <collection type="list" label="${tool.name} on ${on_string}: processed samples" name="preprocessed"> + <discover_datasets directory="preprocessed_fcs" pattern="__name_and_ext__" ext="fcs" /> + </collection> + </outputs> + <tests> + <test> + <param name="sampling" value="1000"/> + <param name="excluded_param" value="FSC-A,FSC-W,FSC-H,Time,Cell_length"/> + <param name="g1_name" value="SDY376"/> + <param name="group1"> + <collection type="list"> + <element name="inputflow1" value="inputflow1.fcs"/> + <element name="inputflow2" value="inputflow2.fcs"/> + <element name="inputflow3" value="inputflow3.fcs"/> + <element name="inputflow4" value="inputflow4.fcs"/> + <element name="inputflow5" value="inputflow5.fcs"/> + <element name="inputflow6" value="inputflow6.fcs"/> + </collection> + </param> + <param name="g1_format" value="FCM"/> + <param name="g1_scaling_factor" value="150"/> + <repeat name="fcs_set"> + <param name="gp_name" value="SDY376-2"/> + <param name="group"> + <collection type="list"> + <element name="inputcytof1" value="inputcytof1.fcs"/> + <element name="inputcytof2" value="inputcytof2.fcs"/> + <element name="inputcytof3" value="inputcytof3.fcs"/> + <element name="inputcytof4" value="inputcytof4.fcs"/> + </collection> + </param> + <param name="gp_format" value="CyTOF"/> + <param name="gp_scaling_factor" value="8"/> + </repeat> + <output name="output_file"> + <assert_contents> + <has_n_lines n="11" /> + </assert_contents> + </output> + <output_collection name="preprocessed" type="list"> + <element name="SDY376" ftype="fcs"> + <assert_contents> + <has_text_matching expression="^FCS3.0" /> + </assert_contents> + </element> + <element name="SDY376-2" ftype="fcs"> + <assert_contents> + <has_text_matching expression="^FCS3.0" /> + </assert_contents> + </element> + </output_collection> + </test> + </tests> + <help><![CDATA[ + Pre-process samples + ------------------- + + This tool uses MetaCyto's preprocessing function to prepare sets of FCS files for a MetaCyto analysis. + + **Input** + This tool requires one or more sets of FCS files. + .. class:: infomark + The number provided for sub-sampling corresponds to the number of events randomly sampled from each FCS files. + **Output** + This tool generates one or more FCS files containing optionally sub-sampled data from the input FCS data sets. The FCS data can optionally be compensated and/or transformed. A summary of the operations is also generated. + .. class:: infomark + This tool uses the arcsinh transformation. If you would like to use another transformation algorithm, sets of files can be prepared independantly by using the following tools: + - Merge and downsample FCS files with FlowSOM + - Transform FCS data with optional compensation and automated gating with flowDensity. + .. class:: warningmark + The workflow to use MetaCyto in R vs. ImmPort Galaxy are slightly different - please use the following tool in FCS File Tools to harmonize FCS files before MetaCyto pre-processing: + - Edit markers or channels in FCS files + + **Example** + + *File1*: 20K events:: + + Marker1 Marker2 Marker3 ... + 34 45 12 ... + 33 65 10 ... + 87 26 76 ... + 24 56 32 ... + 95 83 53 ... + ... ... ... ... + + *File2*: 20K events:: + + Marker1 Marker2 Marker3 ... + 19 62 98 ... + 12 36 58 ... + 41 42 68 ... + 76 74 53 ... + 62 34 45 ... + ... ... ... ... + + *Output*: 5K events:: + + Marker1 Marker2 Marker3 ... + 34 45 12 ... + 87 26 76 ... + 12 36 58 ... + 62 34 45 ... + ... ... ... ... + + *Output* - Summary Table:: + + study_id antibodies filenames + group1 Marker1|Marker2|Marker3|... file1.fcs + group2 Marker1|Marker2|Marker3|... file2.fcs + ]]> + </help> +</tool>