Mercurial > repos > azomics > metacyto_preprocess
diff metacyto_preprocess.R @ 0:bf6470882a15 draft default tip
"planemo upload for repository https://github.com/AstraZeneca-Omics/immport-galaxy-tools/tree/master/flowtools/metacyto_preprocess commit c3d761b4fca140636c3f22ef0fdbb855f3ecbdb8"
author | azomics |
---|---|
date | Sun, 25 Jul 2021 10:36:03 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metacyto_preprocess.R Sun Jul 25 10:36:03 2021 +0000 @@ -0,0 +1,227 @@ +#!/usr/bin/env Rscript +###################################################################### +# Copyright (c) 2018 Northrop Grumman. +# All rights reserved. +###################################################################### +# +# Version 1 - January 2018 +# Author: Cristel Thomas +# +# + +library(flowCore) +library(MetaCyto) + +compare_lists <- function(m1, m2) { + list_check <- T + if (is.na(all(m1 == m2))) { + mm1 <- is.na(m1) + mm2 <- is.na(m2) + if (all(mm1 == mm2)) { + if (!all(m1 == m2, na.rm = TRUE)) { + list_check <- F + } + } else { + list_check <- F + } + } else if (!all(m1 == m2)) { + list_check <- F + } + return(list_check) +} + + +run_batch_processing <- function(sampling_size = 5000, flag_default = T, + to_exclude, outdir = "", outfile = "", + labels, assays, factors, fcspaths, fcsnames) { + # Create meta_data object + fp <- unlist(fcspaths) + file_counts <- lengths(fcspaths) + group_names <- rep(labels, times = file_counts) + group_bs <- rep(factors, times = file_counts) + group_types <- rep(assays, times = file_counts) + + meta_data <- data.frame(fcs_files = fp, study_id = group_names) + + # excluded_parameters + default_param <- c("FSC-A", "FSC-H", "FSC-W", "FSC", "SSC-A", "SSC-H", + "SSC-W", "SSC", "Time", "Cell_length", "cell_length", + "CELL_LENGTH") + excluded_parameters <- if (flag_default) default_param else to_exclude + # Run preprocessing.batch + preprocessing.batch(inputMeta = meta_data, + assay = group_types, + b = group_bs, + fileSampleSize = sampling_size, + outpath = outdir, + excludeTransformParameters = excluded_parameters) + + # deal with outputs + # output[2]: a csv file summarizing the pre-processing result. + ## -> open file to pull info and print out filenames rather than path. + tmp_csv <- file.path(outdir, "processed_sample_summary.csv") + tmp <- read.csv(tmp_csv) + tmp$old_index <- seq(1, length(tmp$fcs_names)) + + fn <- unlist(fcsnames) + df <- data.frame(fcs_files = fp, filenames = fn) + + # merge two data frames by ID + total <- merge(tmp, df, by = "fcs_files") + total2 <- total[order(total$old_index), ] + to_drop <- c("fcs_names", "fcs_files", "old_index") + newdf <- total2[, !(names(total2) %in% to_drop)] + write.table(newdf, file = outfile, quote = F, row.names = F, col.names = T, sep = "\t") + + file.remove(tmp_csv) +} + +check_fcs <- function(sampling = 5000, flag_default = TRUE, to_exclude, + outdir = "", outfile = "", labels, assays, factors, + fcspaths, fcsnames) { + + if (length(labels) > length(unique(labels))) { + # we have repeated group names, all group names need to be different + print("ERROR: repeated labels among groups, make sure that labels are all different for groups.") + print("The following labels are repeated") + table(labels)[table(labels) > 1] + quit(save = "no", status = 13, runLast = FALSE) + } + + marker_pb <- FALSE + for (i in seq_len(length(fcspaths))) { + for (n in seq_len(length(fcspaths[[i]]))) { + marker_check <- FALSE + marker_channel <- FALSE + tryCatch({ + fcs <- read.FCS(fcspaths[[i]][[n]], transformation = FALSE) + }, error = function(ex) { + print(paste("File is not a valid FCS file:", fnames[[i]][[n]], ex)) + quit(save = "no", status = 10, runLast = FALSE) + }) + + if (n == 1) { + m1 <- as.vector(pData(parameters(fcs))$desc) + c1 <- colnames(fcs) + } else { + m2 <- as.vector(pData(parameters(fcs))$desc) + c2 <- colnames(fcs) + marker_check <- compare_lists(m1, m2) + marker_channel <- compare_lists(c1, c2) + } + if (n > 1 && marker_check == F) { + marker_pb <- TRUE + print(paste("Marker discrepancy detected in markers -- group", labels[[i]])) + } else if (n > 1 && marker_channel == F) { + marker_pb <- TRUE + print(paste("Marker discrepancy detected in channels -- group", labels[[i]])) + } + } + } + + if (marker_pb) { + quit(save = "no", status = 12, runLast = FALSE) + } else { + run_batch_processing(sampling, flag_default, to_exclude, outdir, outfile, + labels, assays, factors, fcspaths, fcsnames) + } +} + +################################################################################ +################################################################################ +args <- commandArgs(trailingOnly = TRUE) + +# Arg 1: sub_sampling number +# Arg 2: output dir for processed FCS files for check_fcs and run_batch_processing +# Arg 3: Main output file (text file) +# Arg 4: excluded params +# Arg 5: Group 1 Name +# Arg 6: Group 1 format +# Arg 7: Group 1 Scaling factor +# Cycle through files in group 1 +# Arg : file path in Galaxy +# Arg : desired real file name +# Cycle through at at least one additional group +# Arg : 'new_panel' - used as some sort of delimiter +# Arg : Group n+1 Name +# Arg : Group n+1 format +# Arg : Group n+1 Scaling factor +## Cycle through files in that group +## Arg : file path in Galaxy +## Arg : desired real file path + +sub_sampling <- NULL +if (as.numeric(args[1]) > 0) { + sub_sampling <- as.numeric(args[1]) +} + +# parameters to exclude => args[4] +to_exclude <- vector() +flag_default <- FALSE +i <- 1 +if (args[4] == "None" || args[4] == "") { + flag_default <- TRUE +} else { + excluded <- unlist(strsplit(args[4], ",")) + for (channel in excluded) { + stripped_chan <- gsub(" ", "", channel, fixed = TRUE) + if (!is.na(stripped_chan)) { + to_exclude[[i]] <- stripped_chan + } + i <- i + 1 + } +} + +# handle group cycle in arguments to produce iterable panels +tot_args <- length(args) +tmpargs <- paste(args[5:tot_args], collapse = "=%=") +tmppanels <- strsplit(tmpargs, "=%=new_panel=%=") +nb_panel <- length(tmppanels[[1]]) + +labels <- vector(mode = "character", length = nb_panel) +assay_types <- vector(mode = "character", length = nb_panel) +scaling_factors <- vector(mode = "numeric", length = nb_panel) +filepaths <- list() +filenames <- list() + +# iterate over panels (groups of fcs files) +j <- 1 +for (pnl in tmppanels[[1]]) { + tmppanel <- strsplit(pnl, "=%=") + # number of FCS files + nb_files <- (length(tmppanel[[1]]) - 3) / 2 + tmplist <- character(nb_files) + tmpnames <- character(nb_files) + if (tmppanel[[1]][[1]] == "None" || tmppanel[[1]][[1]] == "") { + print(paste("ERROR: Empty group name/label for group ", j)) + quit(save = "no", status = 11, runLast = FALSE) + } else { + labels[[j]] <- tmppanel[[1]][[1]] + } + # assay type + assay_types[[j]] <- tmppanel[[1]][[2]] + + scaling_factors[[j]] <- 0 + if (as.numeric(tmppanel[[1]][[3]]) > 0) { + scaling_factors[[j]] <- 1 / as.numeric(tmppanel[[1]][[3]]) + } + + k <- 1 + for (m in 4:length(tmppanel[[1]])) { + if (!m %% 2) { + tmplist[[k]] <- tmppanel[[1]][[m]] + tmpnames[[k]] <- tmppanel[[1]][[m + 1]] + k <- k + 1 + } + } + filepaths[[tmppanel[[1]][1]]] <- tmplist + filenames[[tmppanel[[1]][1]]] <- tmpnames + j <- j + 1 +} + +check_fcs(sub_sampling, flag_default, to_exclude, args[2], args[3], labels, + assay_types, scaling_factors, filepaths, filenames) + +# check_fcs <- function(sampling = 5000, flag_default = TRUE, to_exclude, +# outdir = "", outfile = "", labels, assays, factors, +# fcspaths, fcsnames)