comparison metacyto_preprocess.R @ 0:bf6470882a15 draft default tip

"planemo upload for repository https://github.com/AstraZeneca-Omics/immport-galaxy-tools/tree/master/flowtools/metacyto_preprocess commit c3d761b4fca140636c3f22ef0fdbb855f3ecbdb8"
author azomics
date Sun, 25 Jul 2021 10:36:03 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:bf6470882a15
1 #!/usr/bin/env Rscript
2 ######################################################################
3 # Copyright (c) 2018 Northrop Grumman.
4 # All rights reserved.
5 ######################################################################
6 #
7 # Version 1 - January 2018
8 # Author: Cristel Thomas
9 #
10 #
11
12 library(flowCore)
13 library(MetaCyto)
14
# Compare two marker/channel vectors position by position, treating NA as
# a legitimate value.
#
# Args:
#   m1, m2: atomic vectors to compare (check_fcs passes the parameter
#           descriptions and the channel names of two FCS files).
#
# Returns:
#   TRUE when both vectors have the same length, have NA in exactly the
#   same positions and hold equal values everywhere else; FALSE otherwise.
compare_lists <- function(m1, m2) {
  # `==` silently recycles the shorter operand, so without this guard two
  # vectors of different lengths could be reported as identical.
  if (length(m1) != length(m2)) {
    return(FALSE)
  }
  # NA == NA evaluates to NA, so missing entries are matched by position
  # first and then ignored in the value comparison.
  if (!all(is.na(m1) == is.na(m2))) {
    return(FALSE)
  }
  all(m1 == m2, na.rm = TRUE)
}
32
33
# Run MetaCyto's preprocessing.batch on every group of FCS files and write a
# tab-separated summary that lists the user-facing file names.
#
# Args:
#   sampling_size: value forwarded as `fileSampleSize`.
#   flag_default:  when TRUE the built-in list of scatter/time/length
#                  parameters is excluded from transformation; otherwise
#                  `to_exclude` is used.
#   to_exclude:    character vector of parameter names to exclude (only
#                  evaluated when `flag_default` is FALSE).
#   outdir:        directory handed to preprocessing.batch as `outpath`; the
#                  intermediate "processed_sample_summary.csv" is read from
#                  it and then deleted.
#   outfile:       path of the tab-separated summary written by this function.
#   labels, assays, factors: one entry per group (study id, assay type and
#                  `b` value); each is repeated once per file in the group.
#   fcspaths:      list with one vector of FCS file paths per group.
#   fcsnames:      list with one vector of display names per group, parallel
#                  to `fcspaths`.
#
# Returns:
#   The result of file.remove() on the intermediate CSV.
run_batch_processing <- function(sampling_size = 5000, flag_default = TRUE,
                                 to_exclude, outdir = "", outfile = "",
                                 labels, assays, factors, fcspaths, fcsnames) {
  # Create meta_data object: expand the per-group settings to one entry
  # per FCS file.
  fp <- unlist(fcspaths)
  file_counts <- lengths(fcspaths)
  group_names <- rep(labels, times = file_counts)
  group_bs <- rep(factors, times = file_counts)
  group_types <- rep(assays, times = file_counts)

  meta_data <- data.frame(fcs_files = fp, study_id = group_names)

  # excluded_parameters
  default_param <- c("FSC-A", "FSC-H", "FSC-W", "FSC", "SSC-A", "SSC-H",
                     "SSC-W", "SSC", "Time", "Cell_length", "cell_length",
                     "CELL_LENGTH")
  excluded_parameters <- if (flag_default) default_param else to_exclude

  # Run preprocessing.batch
  preprocessing.batch(inputMeta = meta_data,
                      assay = group_types,
                      b = group_bs,
                      fileSampleSize = sampling_size,
                      outpath = outdir,
                      excludeTransformParameters = excluded_parameters)

  # deal with outputs
  # output[2]: a csv file summarizing the pre-processing result.
  ## -> open file to pull info and print out filenames rather than path.
  tmp_csv <- file.path(outdir, "processed_sample_summary.csv")
  tmp <- read.csv(tmp_csv)
  # Remember the original row order: merge() sorts by the key column.
  # seq_len() stays empty for an empty summary, where seq(1, 0) would
  # count down and break the assignment.
  tmp$old_index <- seq_len(nrow(tmp))

  fn <- unlist(fcsnames)
  df <- data.frame(fcs_files = fp, filenames = fn)

  # merge two data frames by ID, then restore the original row order
  total <- merge(tmp, df, by = "fcs_files")
  total2 <- total[order(total$old_index), , drop = FALSE]
  to_drop <- c("fcs_names", "fcs_files", "old_index")
  # drop = FALSE keeps a data frame (and therefore the real column header)
  # even when a single column is left.
  newdf <- total2[, !(names(total2) %in% to_drop), drop = FALSE]
  write.table(newdf, file = outfile, quote = FALSE, row.names = FALSE,
              col.names = TRUE, sep = "\t")

  file.remove(tmp_csv)
}
78
# Validate the groups of FCS files and, when they are consistent, hand them
# over to run_batch_processing.
#
# Args:
#   sampling, flag_default, to_exclude, outdir, outfile, labels, assays,
#   factors, fcspaths, fcsnames: forwarded unchanged to
#     run_batch_processing. `labels` holds one label per group; `fcspaths`
#     and `fcsnames` are parallel lists with one vector of file paths /
#     display names per group.
#
# Exits the R session with status
#   13 when group labels are not unique,
#   10 when a file cannot be read by read.FCS,
#   12 when files of one group disagree on marker descriptions or channels.
check_fcs <- function(sampling = 5000, flag_default = TRUE, to_exclude,
                      outdir = "", outfile = "", labels, assays, factors,
                      fcspaths, fcsnames) {

  if (length(labels) > length(unique(labels))) {
    # we have repeated group names, all group names need to be different
    print("ERROR: repeated labels among groups, make sure that labels are all different for groups.")
    print("The following labels are repeated")
    label_counts <- table(labels)
    # A bare expression is not auto-printed inside a function, so the
    # repeated labels have to be printed explicitly.
    print(label_counts[label_counts > 1])
    quit(save = "no", status = 13, runLast = FALSE)
  }

  marker_pb <- FALSE
  for (i in seq_along(fcspaths)) {
    for (n in seq_along(fcspaths[[i]])) {
      fcs <- tryCatch({
        read.FCS(fcspaths[[i]][[n]], transformation = FALSE)
      }, error = function(ex) {
        # Report the display name (fcsnames), not the internal path.
        print(paste("File is not a valid FCS file:", fcsnames[[i]][[n]],
                    conditionMessage(ex)))
        quit(save = "no", status = 10, runLast = FALSE)
      })

      markers <- as.vector(pData(parameters(fcs))$desc)
      channels <- colnames(fcs)

      if (n == 1) {
        # The first file of a group is the reference for the others.
        ref_markers <- markers
        ref_channels <- channels
      } else if (!compare_lists(ref_markers, markers)) {
        marker_pb <- TRUE
        print(paste("Marker discrepancy detected in markers -- group", labels[[i]]))
      } else if (!compare_lists(ref_channels, channels)) {
        marker_pb <- TRUE
        print(paste("Marker discrepancy detected in channels -- group", labels[[i]]))
      }
    }
  }

  if (marker_pb) {
    quit(save = "no", status = 12, runLast = FALSE)
  } else {
    run_batch_processing(sampling, flag_default, to_exclude, outdir, outfile,
                         labels, assays, factors, fcspaths, fcsnames)
  }
}
129
130 ################################################################################
131 ################################################################################
args <- commandArgs(trailingOnly = TRUE)

# Arg 1: sub_sampling number
# Arg 2: output dir for processed FCS files for check_fcs and run_batch_processing
# Arg 3: Main output file (text file)
# Arg 4: excluded params
# Arg 5: Group 1 Name
# Arg 6: Group 1 format
# Arg 7: Group 1 Scaling factor
# Cycle through files in group 1
# Arg : file path in Galaxy
# Arg : desired real file name
# Cycle through at least one additional group
# Arg : 'new_panel' - used as a delimiter between groups
# Arg : Group n+1 Name
# Arg : Group n+1 format
# Arg : Group n+1 Scaling factor
## Cycle through files in that group
## Arg : file path in Galaxy
## Arg : desired real file name

# A non-positive sub-sampling number leaves sub_sampling as NULL.
sub_sampling <- NULL
if (as.numeric(args[1]) > 0) {
  sub_sampling <- as.numeric(args[1])
}

# parameters to exclude => args[4]
# "None" or an empty string selects the default exclusion list; otherwise
# the comma-separated names are used with all spaces removed.
to_exclude <- vector()
flag_default <- FALSE
if (args[4] == "None" || args[4] == "") {
  flag_default <- TRUE
} else {
  excluded <- unlist(strsplit(args[4], ","))
  to_exclude <- gsub(" ", "", excluded, fixed = TRUE)
}

# handle group cycle in arguments to produce iterable panels:
# the remaining arguments are joined with "=%=" and split again on the
# 'new_panel' delimiter so that each element describes one group.
tot_args <- length(args)
tmpargs <- paste(args[5:tot_args], collapse = "=%=")
tmppanels <- strsplit(tmpargs, "=%=new_panel=%=")
nb_panel <- length(tmppanels[[1]])

labels <- vector(mode = "character", length = nb_panel)
assay_types <- vector(mode = "character", length = nb_panel)
scaling_factors <- vector(mode = "numeric", length = nb_panel)
filepaths <- list()
filenames <- list()

# iterate over panels (groups of fcs files)
j <- 1
for (pnl in tmppanels[[1]]) {
  # fields: name, format, scaling factor, then (path, name) pairs
  fields <- strsplit(pnl, "=%=")[[1]]
  n_fields <- length(fields)

  if (n_fields == 0 || fields[[1]] == "None" || fields[[1]] == "") {
    print(paste("ERROR: Empty group name/label for group ", j))
    quit(save = "no", status = 11, runLast = FALSE)
  } else {
    labels[[j]] <- fields[[1]]
  }

  # A group needs its three settings plus at least one complete
  # (path, name) pair. Without this check 4:length(fields) would count
  # down for a group without files and fail with an opaque subscript error.
  if (n_fields < 5 || (n_fields - 3) %% 2 != 0) {
    stop("Malformed arguments for group ", j,
         ": expected name, format, scaling factor and path/name pairs.",
         call. = FALSE)
  }

  # assay type
  assay_types[[j]] <- fields[[2]]

  # The scaling factor is passed on as its reciprocal; non-positive
  # input is stored as 0.
  scaling_factors[[j]] <- 0
  scale_arg <- as.numeric(fields[[3]])
  if (scale_arg > 0) {
    scaling_factors[[j]] <- 1 / scale_arg
  }

  # Even positions (4, 6, ...) hold paths, the following odd positions
  # hold the matching display names.
  path_idx <- seq(4, n_fields, by = 2)
  filepaths[[fields[1]]] <- fields[path_idx]
  filenames[[fields[1]]] <- fields[path_idx + 1]
  j <- j + 1
}

check_fcs(sub_sampling, flag_default, to_exclude, args[2], args[3], labels,
          assay_types, scaling_factors, filepaths, filenames)
224
225 # check_fcs <- function(sampling = 5000, flag_default = TRUE, to_exclude,
226 # outdir = "", outfile = "", labels, assays, factors,
227 # fcspaths, fcsnames)