comparison concatenation.R @ 0:c577f13705f2 draft default tip

planemo upload for repository https://github.com/workflow4metabolomics/tools-metabolomics commit eba1150dad55ad9eca5e93358d9f75e6b6dba2ec
author workflow4metabolomics
date Wed, 10 Jul 2024 15:20:11 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c577f13705f2
1 if (FALSE) {
2 rm(list = ls())
3
4 DM1 <- data.frame(data = c("5d_-kkcùf", "npèt", "5PY4(*3"),
5 `j 785` = c(0.356426723610756, 0.801750949101246, 0.875199970420953),
6 `y54j 68y4j6` = c(0.380152310071702, 0.535593104115636, 0.0825428101366147),
7 `5-6 4` = c(0.0306944207412024, 0.258351312473067, 0.253659010703906),
8 hrrrrh = c(0.334137638848017, 0.599475573145688, 0.507762246807195),
9 `5h -` = c(0.298147485608469, 0.0763319665667417, 0.856444177031262))
10
11 DM2 <- data.frame(data = c("5d_-kkcùf", "npèt", "5PY4(*3"),
12 `j 785` = c(0.356426723610756, 0.801750949101246, 0.875199970420953),
13 `y54j 68y4j6` = c(0.380152310071702, 0.535593104115636, 0.0825428101366147),
14 `5-6 4` = c(0.0306944207412024, 0.258351312473067, 0.253659010703906),
15 hrrrrh = c(0.334137638848017, 0.599475573145688, 0.507762246807195),
16 `5h -` = c(0.298147485608469, 0.0763319665667417, 0.856444177031262))
17
18 M1 <- data.frame(samplename = c("j 785", "y54j 68y4j6", "5-6 4", "hrrrrh", "5h -"),
19 ABD = c(19, 24, 2, 3, "y"), E = c(9, "p0", 45, 24, 29),
20 AAA = c("r", "bg", "il", "d", "b"),
21 fp = c("pj", "z", "e", "r", "t"),
22 uv = c("s", "d", "f", "s", "d"))
23
24 M2 <- data.frame(samplename = c("j 785", "y54j 68y4j6", "5-6 4", "hrrrrh", "5h -"),
25 ABD = c(19, 24, 2, 3, "y"), E = c(9, "ici", 45, 24, 29),
26 AAA = c("r", "bg", "il", "d", "b"),
27 fp = c("pj", "z", "e", "r", "t"),
28 uv = c("s", "d", "f", "s", "d"))
29 type <- "sample"
30 concatenation <- "unique"
31 tab1 <- "tab1"
32 tab2 <- "tab2"
33 choice_keep <- "oui"
34 keep <- 0
35 concat(DM1, M1, DM2, M2, type, tab1, tab2, concatenation, choice_keep, keep)
36 }
37
38 #################################################################################################################
39
40
41 concat <- function(DM1, M1, DM2, M2, type, tab1, tab2, concatenation, choice_keep, keep) {
42
43
44
45 #DM1/DM2 = data.frame containing data Matrix
46 #M1/M2 = data.frame containing sample Metadata or variable Metadata
47 #type = "sample" or "variable" depending on Metadata content
48 #tab1/tab2 = Suffix for Metadata 1/2
49 #concatenation = type of concatenation
50 #choice_keep = choice of keeping columns with the same no or keeping just one
51 #keep = keep the column in M1 or M2
52 #returns the concatenated metadata and the two Data Matrix
53
54 identifiers_1 <- colnames(M1)[1]
55 identifiers_2 <- colnames(M2)[1]
56
57 err.stock <- NULL
58
59 #Concatenation------------------------------------------------------------------
60
61 #If Metadatas is Sample_Metadata we transpose
62 if (type == "sample") {
63
64 rownames(DM1) <- DM1[, 1]
65 corner_DM1 <- colnames(DM1)[1]
66 DM1 <- DM1[, -1, drop = FALSE]
67 DM1 <- t(DM1)
68 DM1 <- data.frame(sample = row.names(DM1), DM1, check.names = FALSE)
69 rownames(DM1) <- NULL
70
71 rownames(DM2) <- DM2[, 1]
72 corner_DM2 <- colnames(DM2)[1]
73 DM2 <- DM2[, -1, drop = FALSE]
74 DM2 <- t(DM2)
75 DM2 <- data.frame(sample = row.names(DM2), DM2, check.names = FALSE)
76 rownames(DM2) <- NULL
77 }
78
79 #Add order of sample and Sort by order
80
81
82 M1$order1 <- seq(1, nrow(M1))
83 M2$order2 <- seq(nrow(M1) + 1, nrow(M2) + nrow(M1))
84
85 M1_bf <- M1[order(M1[, 1]), ]
86 M2_bf <- M2[order(M2[, 1]), ]
87
88
89 #Check the variables in common and extract them.
90
91
92 same <- check_features(M1_bf, M2_bf)
93 same <- same[- which(same == identifiers_1)]
94
95 #Check that shared variables have the same values.
96 #If not, they are renamed or deleted according to the parameters chosen by the user.
97 result2 <- compare_same_columns(M1_bf, M2_bf, same, choice_keep, keep, tab1, tab2)
98 M1 <- result2$M1
99 M2 <- result2$M2
100
101 #Unique--------------------------------------------------------------------------
102 if (concatenation == "unique") {
103 #Table match check
104 #We verify that the individuals are all the same
105 err.stock <- match2_bis(M1, M2, type)
106 check_err(err.stock)
107 M_merge <- merge(M1, M2, by = 1)
108 }
109
110
111 #Intersection--------------------------------------------------------------------
112
113 if (concatenation == "intersection") {
114
115 #select individuals in common
116 sample_common <- intersect(M1[, 1], M2[, 1])
117
118 #if the list of individuals in common is null, an error message is sent
119 if (length(sample_common) == 0) {
120 err.stock <- c(err.stock, "\nThere are no individuals in common \n")
121 check_err(err.stock)
122 }
123 #if the list of individuals in common is less than 5, then a Warning message is sent
124 if (length(sample_common) < 5) {
125 cat("\nWarning: Less than 5 individuals in common\n")
126 }
127 M_merge <- merge(M1, M2, by = 1)
128 }
129
130 #Union --------------------------------------------------------------------------
131 if (concatenation == "union") {
132
133 #select common ids
134 id_common <- intersect(M1[, 1], M2[, 1])
135
136 if (is.null(id_common)) {
137 cat("\nT Warning : there are no individuals in common\n")
138 }
139
140 M2_common <- M2[M2[, 1] %in% id_common, ]
141 #Store rows with individuals belonging only to M2
142 M2_specifique <- M2[! M2[, 1] %in% id_common, ]
143 #Merge the two tables only with the samples not in common
144 M_merge <- bind_rows(M1, M2_specifique)
145 col_names <- colnames(M2_common)
146 col_names <- col_names[- which(col_names == identifiers_2)]
147 feature_common <- check_features(M_merge, M2_bf)
148 #Check if M_merge and M2_bf have columns in common. If so, complete the table with the values not taken.
149 if (!is.null(feature_common)) {
150
151 identifiers_3 <- M2_specifique[, 1]
152 #We select the value in M2_bf, the M2 table before undergoing any changes, then insert it in the M_merge table.
153 for (feature in feature_common) {
154 for (id in identifiers_3) {
155
156 index_row <- which(M2_bf[, 1] == id)
157 index_col <- which(colnames(M2_bf) == feature)
158 new_value <- M2_bf[index_row, index_col]
159 index_row <- which(M_merge[, 1] == id)
160 index_col <- which(colnames(M_merge) == feature)
161 M_merge[index_row, index_col] <- new_value
162
163 }
164 }
165 }
166 #Fill in the table with common values
167 for (col in col_names) {
168 for (id in id_common) {
169 index_row <- which(M2_common[, 1] == id)
170 index_col <- which(colnames(M2_common) == col)
171 new_value <- M2_common[index_row, index_col]
172 index_row <- which(M_merge[, 1] == id)
173 index_col <- which(colnames(M_merge) == col)
174 M_merge[index_row, index_col] <- new_value
175
176 }
177 }
178 }
179 M_merge_sort <- M_merge[order(M_merge$order1, M_merge$order2), ]
180 M_merge_sort <- M_merge_sort[, - which(colnames(M_merge_sort) == "order1")]
181 M_merge_sort <- M_merge_sort[, - which(colnames(M_merge_sort) == "order2")]
182 #DataMatrix ---------------------------------------------------------------------
183
184 colnames_1 <- colnames(DM1)
185 colnames_2 <- colnames(DM2)
186 #Unique -------------------------------------------------------------------------
187
188 if (concatenation == "unique") {
189
190 if (type == "sample") {
191
192 rownames(DM1) <- DM1[, 1]
193 DM1 <- DM1[, -1]
194 DM1 <- t(DM1)
195 DM1 <- data.frame(sample = row.names(DM1), DM1, check.names = FALSE)
196 colnames(DM1)[1] <- corner_DM1
197 rownames(DM1) <- NULL
198
199 rownames(DM2) <- DM2[, 1]
200 DM2 <- DM2[, -1, drop = FALSE]
201 DM2 <- t(DM2)
202 DM2 <- data.frame(sample = row.names(DM2), DM2, check.names = FALSE)
203 colnames(DM2)[1] <- corner_DM2
204 rownames(DM2) <- NULL
205 }
206 result <- list(M_merge_sort = M_merge_sort, DM1 = DM1, DM2 = DM2)
207 return(result)
208 }
209
210 #Intersection--------------------------------------------------------------------
211
212 if (concatenation == "intersection") {
213
214 id_in_common <- intersect(DM1[, 1], DM2[, 1])
215
216 DM1_filter <- subset(DM1, DM1[, 1] %in% id_in_common)
217 DM2_filter <- subset(DM2, DM2[, 1] %in% id_in_common)
218
219 if (type == "sample") {
220
221 rownames(DM1_filter) <- DM1_filter[, 1]
222 DM1_filter <- DM1_filter[, -1]
223 DM1_filter <- t(DM1_filter)
224 DM1_filter <- data.frame(sample = row.names(DM1_filter), DM1_filter, check.names = FALSE)
225 colnames(DM1_filter)[1] <- corner_DM1
226 rownames(DM1_filter) <- NULL
227
228 rownames(DM2_filter) <- DM2_filter[, 1]
229 DM2_filter <- DM2_filter[, -1, drop = FALSE]
230 DM2_filter <- t(DM2_filter)
231 DM2_filter <- data.frame(sample = row.names(DM2_filter), DM2_filter, check.names = FALSE)
232 colnames(DM2_filter)[1] <- corner_DM2
233 rownames(DM2_filter) <- NULL
234 }
235 result <- list(M_merge_sort = M_merge_sort, DM1 = DM1_filter, DM2 = DM2_filter)
236 return(result)
237 }
238
239 #Union --------------------------------------------------------------------------
240
241 if (concatenation == "union") {
242
243 common_individuals <- intersect(DM1[, 1], DM2[, 1])
244 common_columns <- intersect(colnames_1, colnames_2)
245 #check whether there are individuals or variables in common
246 if (is.null(common_individuals) || is.null(common_columns)) {
247
248 comparison_result <- FALSE
249 #If the individuals in common take the same values for all variables, then comparison_result=TRUE
250 } else {
251
252 DM1_common <- subset(DM1, DM1[, 1] %in% common_individuals)
253 DM2_common <- subset(DM2, DM2[, 1] %in% common_individuals)
254 DM1_common <- DM1_common[, common_columns, drop = FALSE]
255 DM2_common <- DM2_common[, common_columns, drop = FALSE]
256
257 for (col in common_columns) {
258 comparison_result <- identical(DM1_common$col, DM2_common$col)
259 }
260
261 }
262
263 if (comparison_result) {
264
265
266 DM1$order1 <- seq(1, nrow(DM1))
267 DM2$order2 <- seq(nrow(DM1) + 1, nrow(DM2) + nrow(DM1))
268 DM1_sort <- DM1[order(DM1[, 1]), ]
269 DM2_sort <- DM2[order(DM2[, 1]), ]
270 id_in_common <- intersect(DM1[, 1], DM2[, 1])
271 DM1_filter <- subset(DM1, DM1[, 1] %in% id_in_common)
272 DM2_filter <- subset(DM2, DM2[, 1] %in% id_in_common)
273 different_DM2 <- colnames_2[! colnames_2 %in% colnames_1]
274 DM2_specifique <- DM2[! DM2[, 1] %in% id_in_common, ]
275 #Merge the two tables only with the samples not in common
276 DM1_merge <- bind_rows(DM1, DM2_specifique)
277
278
279 #Deletion of columns present only in DM2
280 DM1_merge <- DM1_merge[, ! names(DM1_merge) %in% different_DM2]
281 different_DM1 <- colnames_1[! colnames_1 %in% colnames_2]
282 DM1_specifique <- DM1[! DM1[, 1] %in% id_in_common, ]
283 #Merge the two tables only with the samples not in common
284 DM2_merge <- bind_rows(DM2, DM1_specifique)
285 #Deletion of columns present only in DM2
286 DM2_merge <- DM2_merge[, ! names(DM2_merge) %in% different_DM1]
287 #DM2_merge
288
289
290 DM1_merge_sort <- DM1_merge[order(DM1_merge$order1, DM1_merge$order2), ]
291 DM1_merge_sort <- DM1_merge_sort[, - which(colnames(DM1_merge_sort) == "order1")]
292 DM1_merge_sort <- DM1_merge_sort[, - which(colnames(DM1_merge_sort) == "order2")]
293
294 DM2_merge_sort <- DM2_merge[order(DM2_merge$order1, DM2_merge$order2), ]
295 DM2_merge_sort <- DM2_merge_sort[, - which(colnames(DM2_merge_sort) == "order1")]
296 DM2_merge_sort <- DM2_merge_sort[, - which(colnames(DM2_merge_sort) == "order2")]
297
298
299 if (type == "sample") {
300
301 rownames(DM1_merge_sort) <- DM1_merge_sort[, 1]
302 DM1_merge_sort <- DM1_merge_sort[, -1]
303 DM1_merge_sort <- t(DM1_merge_sort)
304 DM1_merge_sort <- data.frame(sample = row.names(DM1_merge_sort), DM1_merge_sort, check.names = FALSE)
305 colnames(DM1_merge_sort)[1] <- corner_DM1
306 rownames(DM1_merge_sort) <- NULL
307
308 rownames(DM2_merge_sort) <- DM2_merge_sort[, 1]
309 DM2_merge_sort <- DM2_merge_sort[, -1, drop = FALSE]
310 DM2_merge_sort <- t(DM2_merge_sort)
311 DM2_merge_sort <- data.frame(sample = row.names(DM2_merge_sort), DM2_merge_sort, check.names = FALSE)
312 colnames(DM2_merge_sort)[1] <- corner_DM2
313 rownames(DM2_merge_sort) <- NULL
314 }
315
316 result <- list(M_merge_sort = M_merge_sort, DM1 = DM1_merge_sort, DM2 = DM2_merge_sort)
317 return(result)
318
319 } else {
320 #selects line ids that are in DM2 and not in DM1
321 id_diff_1 <- setdiff(DM2[, 1], DM1[, 1])
322 #we store them in a dataframe
323 row_add_1 <- data.frame(id = id_diff_1)
324 #renames columns with their names in DM1
325 colnames(row_add_1)[1] <- colnames(DM1)[1]
326 #Merge
327 DM1_add <- bind_rows(DM1, row_add_1)
328 id_diff_2 <- setdiff(DM1[, 1], DM2[, 1])
329 row_add_2 <- data.frame(id = id_diff_2)
330 colnames(row_add_2)[1] <- colnames(DM2)[1]
331 DM2_add <- bind_rows(DM2, row_add_2)
332
333 if (type == "sample") {
334 rownames(DM1_add) <- DM1_add[, 1]
335 DM1_add <- DM1_add[, -1]
336 DM1_add <- t(DM1_add)
337 DM1_add <- data.frame(sample = row.names(DM1_add), DM1_add, check.names = FALSE)
338 colnames(DM1_add)[1] <- corner_DM1
339 rownames(DM1_add) <- NULL
340 rownames(DM2_add) <- DM2_add[, 1]
341 DM2_add <- DM2_add[, -1, drop = FALSE]
342 DM2_add <- t(DM2_add)
343 DM2_add <- data.frame(sample = row.names(DM2_add), DM2_add, check.names = FALSE)
344 colnames(DM2_add)[1] <- corner_DM2
345 rownames(DM2_add) <- NULL
346 }
347 result <- list(M_merge_sort = M_merge_sort, DM1 = DM1_add, DM2 = DM2_add)
348 return(result)
349
350 }
351 }
352 }