Mercurial > repos > iuc > join_files_by_id
comparison join_files.R @ 0:d212d12aee5e draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
| author | iuc |
|---|---|
| date | Wed, 09 Aug 2017 17:44:54 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d212d12aee5e |
|---|---|
| 1 # libraries | |
| 2 library(data.table) | |
| 3 library(parallel) | |
| 4 | |
| 5 # inputs | |
| 6 args <- commandArgs() | |
| 7 | |
| 8 input <- gsub("--in=", "", args[grepl("--in=", args)]) | |
| 9 header <- as.integer(gsub("--he=", "", args[grepl("--he=", args)])) | |
| 10 join_col <- gsub("--jc=", "", args[grepl("--jc=", args)]) | |
| 11 separator <- gsub("--sep=", "", args[grepl("--sep=", args)]) | |
| 12 null_char <- gsub("--nc=", "", args[grepl("--nc=", args)]) | |
| 13 output <- gsub("--out=", "", args[grepl("--out=", args)]) | |
| 14 | |
| 15 # test VARS | |
| 16 # input <- list("test-data/df1.txt", "test-data/df2.txt", "test-data/df3.txt") | |
| 17 # | |
| 18 # input <- list("test-data/df_big_1.txt", "test-data/df_big_2.txt", "test-data/df_big_3.txt", | |
| 19 # "test-data/df_big_4.txt", "test-data/df_big_5.txt", "test-data/df_big_6.txt", | |
| 20 # "test-data/df_big_7.txt", "test-data/df_big_8.txt", "test-data/df_big_9.txt", | |
| 21 # "test-data/df_big_10.txt") | |
| 22 # header <- 1 | |
| 23 # join_col <- 1 | |
| 24 # separator <- "ta" | |
| 25 # null_char <- 0 | |
| 26 # output <- "out" | |
| 27 | |
| 28 if(header > 0){ | |
| 29 header <- TRUE | |
| 30 }else{ | |
| 31 header <- FALSE | |
| 32 } | |
| 33 join_col <- as.integer(join_col) | |
| 34 | |
| 35 # read files into list | |
| 36 df_list <- lapply(input, function(x){as.data.frame(fread(x))}) | |
| 37 | |
| 38 | |
| 39 #### fix the ids name for all read in tables | |
| 40 df_list <- lapply(df_list, function(x){ | |
| 41 names_x <- names(x) | |
| 42 names_x[names_x == "ids"] <- "id" # to join correctly | |
| 43 names_x[join_col] <- "ids" | |
| 44 names(x) <- names_x | |
| 45 x | |
| 46 }) | |
| 47 | |
| 48 # generate unique ids string | |
| 49 df0 <- lapply(df_list, function(x){ | |
| 50 x[join_col] | |
| 51 }) | |
| 52 | |
| 53 df0 <- data.frame(ids=unique(do.call(rbind, df0))) | |
| 54 df_list <- append(df0, df_list) | |
| 55 df_list[[1]] <- data.frame(ids=df_list[[1]]) | |
| 56 | |
| 57 | |
| 58 | |
| 59 ids <- df_list[[1]] | |
| 60 ids <- data.frame(ids = ids[order(ids$ids), "ids" ]) | |
| 61 merged_df <- mclapply(2:length(df_list), function(x){ | |
| 62 merged_sub <- merge(x = ids, y = df_list[[x]], by = "ids", all.x = T, sort = F) | |
| 63 merged_sub <- merged_sub[order(merged_sub$ids), ] | |
| 64 merged_sub[-1] | |
| 65 }, mc.cores=4) | |
| 66 | |
| 67 df <- cbind(ids, do.call(cbind, merged_df)) | |
| 68 | |
| 69 # benchmarking | |
| 70 # library(microbenchmark) | |
| 71 # microbenchmark( | |
| 72 # df1 <- lapply(2:length(df_list), function(x){ | |
| 73 # merge(df_list[[1]], df_list[[x]], by = "ids", all.x = T)[-1] | |
| 74 # }), | |
| 75 # merged_df <- mclapply(2:length(df_list), function(x){ | |
| 76 # merge(x = ids, y = df_list[[x]], by = "ids", all.x = T, sort = F)[-1]}, mc.cores=7) | |
| 77 # ,times = 4 | |
| 78 # ) | |
| 79 | |
| 80 # change null_char | |
| 81 df[is.na(df)] <- null_char | |
| 82 # separator <- "ta" | |
| 83 # print(separator) | |
| 84 delim <- list(ta = "\t", do = ".", co = ",", un = "_", da = "-", sp = " ") | |
| 85 # print(separator) | |
| 86 separator <- delim[[separator]] | |
| 87 # write data.frame to file | |
| 88 write.table(df, file = output, sep = separator, row.names = F, quote = F, col.names = header) |
