diff join_files.R @ 0:d212d12aee5e draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
author iuc
date Wed, 09 Aug 2017 17:44:54 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/join_files.R	Wed Aug 09 17:44:54 2017 -0400
@@ -0,0 +1,88 @@
+# libraries
+library(data.table)
+library(parallel)
+
+# inputs: every option arrives as "--flag=value". Match the flag only at the
+# start of an argument and strip just that prefix, so values stay verbatim.
+args <- commandArgs()
+input <- sub("^--in=", "", args[grepl("^--in=", args)])
+header <- as.integer(sub("^--he=", "", args[grepl("^--he=", args)]))
+join_col <- sub("^--jc=", "", args[grepl("^--jc=", args)])
+separator <- sub("^--sep=", "", args[grepl("^--sep=", args)])
+null_char <- sub("^--nc=", "", args[grepl("^--nc=", args)])
+output <- sub("^--out=", "", args[grepl("^--out=", args)])
+
+# test VARS
+# input <- list("test-data/df1.txt", "test-data/df2.txt", "test-data/df3.txt")
+# 
+# input <- list("test-data/df_big_1.txt", "test-data/df_big_2.txt", "test-data/df_big_3.txt",
+#               "test-data/df_big_4.txt", "test-data/df_big_5.txt", "test-data/df_big_6.txt",
+#               "test-data/df_big_7.txt", "test-data/df_big_8.txt", "test-data/df_big_9.txt",
+#               "test-data/df_big_10.txt")
+# header <- 1
+# join_col <- 1
+# separator <- "ta"
+# null_char <- 0
+# output <- "out"
+
+# Fail early, with a clear message, when the header flag or the join column
+# is missing or not a number (instead of a cryptic error further down).
+stopifnot(length(header) == 1L, !is.na(header))
+header <- header > 0 # TRUE: column names are written to the output file
+join_col <- as.integer(join_col) # 1-based position of the join column
+stopifnot(length(join_col) == 1L, !is.na(join_col), join_col >= 1L)
+
+# read every input file and name its join column "ids"
+# NOTE(review): fread() is called with defaults, so it detects the header row
+# and separator itself; `header`/`separator` are not forwarded -- confirm.
+df_list <- lapply(input, function(path){
+  x <- as.data.frame(fread(path))
+  if(join_col > ncol(x)){
+    stop("join column ", join_col, " not found in ", path, call. = FALSE)
+  }
+  names(x)[names(x) == "ids"] <- "id" # free the name "ids" for the join key
+  names(x)[join_col] <- "ids"
+  x
+})
+
+# Build the table of unique ids: take the join column ("ids") of every input,
+# stack them, and keep the distinct values.
+id_cols <- lapply(df_list, function(tbl) tbl[join_col])
+df0 <- data.frame(ids = unique(do.call(rbind, id_cols)))
+# Put the ids in front of the input tables. c() splices the single column of
+# df0 in as a bare vector, so element 1 is wrapped as a data.frame again.
+df_list <- c(df0, df_list)
+df_list[[1]] <- data.frame(ids = df_list[[1]])
+
+
+
+ids <- df_list[[1]]
+ids <- data.frame(ids = ids[order(ids$ids), "ids"])
+# match() gives exactly one row per sorted id (first hit; NA row if absent), so
+# repeated ids in a file cannot produce blocks of differing height for cbind()
+merged_df <- mclapply(seq_along(df_list)[-1], function(x){
+  tab <- df_list[[x]]
+  tab[match(ids$ids, tab$ids), names(tab) != "ids", drop = FALSE]
+}, mc.cores=4)
+df <- cbind(ids, do.call(cbind, merged_df))
+
+# benchmarking
+# library(microbenchmark)
+# microbenchmark(
+#   df1 <- lapply(2:length(df_list), function(x){
+#     merge(df_list[[1]], df_list[[x]], by = "ids", all.x = T)[-1]
+#   }),
+#   merged_df <- mclapply(2:length(df_list), function(x){
+#     merge(x = ids, y = df_list[[x]], by = "ids", all.x = T, sort = F)[-1]}, mc.cores=7)
+#   ,times = 4
+# )
+
+# change null_char: every NA (e.g. an id missing from a file) becomes null_char
+df[is.na(df)] <- null_char
+# translate the two-letter separator code into the delimiter character and
+# reject unknown codes here rather than deep inside write.table()
+delim <- list(ta = "\t", do = ".", co = ",", un = "_", da = "-", sp = " ")
+stopifnot(length(separator) == 1L, separator %in% names(delim))
+separator <- delim[[separator]]
+# write data.frame to file
+write.table(df, file = output, sep = separator, row.names = FALSE, quote = FALSE, col.names = header)