Mercurial > repos > azomics > metacyto_preprocess
view metacyto_preprocess.R @ 0:bf6470882a15 draft default tip
"planemo upload for repository https://github.com/AstraZeneca-Omics/immport-galaxy-tools/tree/master/flowtools/metacyto_preprocess commit c3d761b4fca140636c3f22ef0fdbb855f3ecbdb8"
author | azomics |
---|---|
date | Sun, 25 Jul 2021 10:36:03 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env Rscript ###################################################################### # Copyright (c) 2018 Northrop Grumman. # All rights reserved. ###################################################################### # # Version 1 - January 2018 # Author: Cristel Thomas # # library(flowCore) library(MetaCyto) compare_lists <- function(m1, m2) { list_check <- T if (is.na(all(m1 == m2))) { mm1 <- is.na(m1) mm2 <- is.na(m2) if (all(mm1 == mm2)) { if (!all(m1 == m2, na.rm = TRUE)) { list_check <- F } } else { list_check <- F } } else if (!all(m1 == m2)) { list_check <- F } return(list_check) } run_batch_processing <- function(sampling_size = 5000, flag_default = T, to_exclude, outdir = "", outfile = "", labels, assays, factors, fcspaths, fcsnames) { # Create meta_data object fp <- unlist(fcspaths) file_counts <- lengths(fcspaths) group_names <- rep(labels, times = file_counts) group_bs <- rep(factors, times = file_counts) group_types <- rep(assays, times = file_counts) meta_data <- data.frame(fcs_files = fp, study_id = group_names) # excluded_parameters default_param <- c("FSC-A", "FSC-H", "FSC-W", "FSC", "SSC-A", "SSC-H", "SSC-W", "SSC", "Time", "Cell_length", "cell_length", "CELL_LENGTH") excluded_parameters <- if (flag_default) default_param else to_exclude # Run preprocessing.batch preprocessing.batch(inputMeta = meta_data, assay = group_types, b = group_bs, fileSampleSize = sampling_size, outpath = outdir, excludeTransformParameters = excluded_parameters) # deal with outputs # output[2]: a csv file summarizing the pre-processing result. ## -> open file to pull info and print out filenames rather than path. tmp_csv <- file.path(outdir, "processed_sample_summary.csv") tmp <- read.csv(tmp_csv) tmp$old_index <- seq(1, length(tmp$fcs_names)) fn <- unlist(fcsnames) df <- data.frame(fcs_files = fp, filenames = fn) # merge two data frames by ID total <- merge(tmp, df, by = "fcs_files") total2 <- total[order(total$old_index), ] to_drop <- c("fcs_names", "fcs_files", "old_index") newdf <- total2[, !(names(total2) %in% to_drop)] write.table(newdf, file = outfile, quote = F, row.names = F, col.names = T, sep = "\t") file.remove(tmp_csv) } check_fcs <- function(sampling = 5000, flag_default = TRUE, to_exclude, outdir = "", outfile = "", labels, assays, factors, fcspaths, fcsnames) { if (length(labels) > length(unique(labels))) { # we have repeated group names, all group names need to be different print("ERROR: repeated labels among groups, make sure that labels are all different for groups.") print("The following labels are repeated") table(labels)[table(labels) > 1] quit(save = "no", status = 13, runLast = FALSE) } marker_pb <- FALSE for (i in seq_len(length(fcspaths))) { for (n in seq_len(length(fcspaths[[i]]))) { marker_check <- FALSE marker_channel <- FALSE tryCatch({ fcs <- read.FCS(fcspaths[[i]][[n]], transformation = FALSE) }, error = function(ex) { print(paste("File is not a valid FCS file:", fnames[[i]][[n]], ex)) quit(save = "no", status = 10, runLast = FALSE) }) if (n == 1) { m1 <- as.vector(pData(parameters(fcs))$desc) c1 <- colnames(fcs) } else { m2 <- as.vector(pData(parameters(fcs))$desc) c2 <- colnames(fcs) marker_check <- compare_lists(m1, m2) marker_channel <- compare_lists(c1, c2) } if (n > 1 && marker_check == F) { marker_pb <- TRUE print(paste("Marker discrepancy detected in markers -- group", labels[[i]])) } else if (n > 1 && marker_channel == F) { marker_pb <- TRUE print(paste("Marker discrepancy detected in channels -- group", labels[[i]])) } } } if (marker_pb) { quit(save = "no", status = 12, runLast = FALSE) } else { run_batch_processing(sampling, flag_default, to_exclude, outdir, outfile, labels, assays, factors, fcspaths, fcsnames) } } ################################################################################ ################################################################################ args <- commandArgs(trailingOnly = TRUE) # Arg 1: sub_sampling number # Arg 2: output dir for processed FCS files for check_fcs and run_batch_processing # Arg 3: Main output file (text file) # Arg 4: excluded params # Arg 5: Group 1 Name # Arg 6: Group 1 format # Arg 7: Group 1 Scaling factor # Cycle through files in group 1 # Arg : file path in Galaxy # Arg : desired real file name # Cycle through at at least one additional group # Arg : 'new_panel' - used as some sort of delimiter # Arg : Group n+1 Name # Arg : Group n+1 format # Arg : Group n+1 Scaling factor ## Cycle through files in that group ## Arg : file path in Galaxy ## Arg : desired real file path sub_sampling <- NULL if (as.numeric(args[1]) > 0) { sub_sampling <- as.numeric(args[1]) } # parameters to exclude => args[4] to_exclude <- vector() flag_default <- FALSE i <- 1 if (args[4] == "None" || args[4] == "") { flag_default <- TRUE } else { excluded <- unlist(strsplit(args[4], ",")) for (channel in excluded) { stripped_chan <- gsub(" ", "", channel, fixed = TRUE) if (!is.na(stripped_chan)) { to_exclude[[i]] <- stripped_chan } i <- i + 1 } } # handle group cycle in arguments to produce iterable panels tot_args <- length(args) tmpargs <- paste(args[5:tot_args], collapse = "=%=") tmppanels <- strsplit(tmpargs, "=%=new_panel=%=") nb_panel <- length(tmppanels[[1]]) labels <- vector(mode = "character", length = nb_panel) assay_types <- vector(mode = "character", length = nb_panel) scaling_factors <- vector(mode = "numeric", length = nb_panel) filepaths <- list() filenames <- list() # iterate over panels (groups of fcs files) j <- 1 for (pnl in tmppanels[[1]]) { tmppanel <- strsplit(pnl, "=%=") # number of FCS files nb_files <- (length(tmppanel[[1]]) - 3) / 2 tmplist <- character(nb_files) tmpnames <- character(nb_files) if (tmppanel[[1]][[1]] == "None" || tmppanel[[1]][[1]] == "") { print(paste("ERROR: Empty group name/label for group ", j)) quit(save = "no", status = 11, runLast = FALSE) } else { labels[[j]] <- tmppanel[[1]][[1]] } # assay type assay_types[[j]] <- tmppanel[[1]][[2]] scaling_factors[[j]] <- 0 if (as.numeric(tmppanel[[1]][[3]]) > 0) { scaling_factors[[j]] <- 1 / as.numeric(tmppanel[[1]][[3]]) } k <- 1 for (m in 4:length(tmppanel[[1]])) { if (!m %% 2) { tmplist[[k]] <- tmppanel[[1]][[m]] tmpnames[[k]] <- tmppanel[[1]][[m + 1]] k <- k + 1 } } filepaths[[tmppanel[[1]][1]]] <- tmplist filenames[[tmppanel[[1]][1]]] <- tmpnames j <- j + 1 } check_fcs(sub_sampling, flag_default, to_exclude, args[2], args[3], labels, assay_types, scaling_factors, filepaths, filenames) # check_fcs <- function(sampling = 5000, flag_default = TRUE, to_exclude, # outdir = "", outfile = "", labels, assays, factors, # fcspaths, fcsnames)