Mercurial > repos > iuc > join_files_by_id
view join_files.R @ 0:d212d12aee5e draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
author | iuc |
---|---|
date | Wed, 09 Aug 2017 17:44:54 -0400 |
parents | |
children |
line wrap: on
line source
# Join several delimited text files on a shared identifier column and write
# the combined table to one output file.
#
# Command-line arguments (each given as --flag=value):
#   --in=PATH   input file; repeat the flag once per file to join
#   --he=INT    > 0 writes the column names as a header line, otherwise not
#   --jc=INT    1-based position of the identifier column in every input
#   --sep=CODE  output delimiter code: ta (tab), do ".", co ",", un "_",
#               da "-", sp " "
#   --nc=TEXT   text written in place of missing (NA) cells
#   --out=PATH  output file
#
# Every identifier found in any input gets exactly one output row; cells of
# files that do not contain that identifier are filled with the --nc text.

# libraries
library(data.table)
library(parallel)

# inputs
args <- commandArgs()

# Return the value of every "--<flag>=<value>" argument. The pattern is
# anchored at the start so that flag text occurring inside another argument's
# value (e.g. a path containing "--in=") is neither matched nor stripped.
arg_values <- function(flag) {
  prefix <- paste0("^--", flag, "=")
  sub(prefix, "", args[grepl(prefix, args)])
}

# Like arg_values(), but abort with a readable message unless the flag was
# supplied exactly once.
single_arg <- function(flag) {
  value <- arg_values(flag)
  if (length(value) != 1L) {
    stop("expected exactly one --", flag, "= argument, got ", length(value),
         call. = FALSE)
  }
  value
}

input <- arg_values("in")
header <- as.integer(single_arg("he"))
join_col <- as.integer(single_arg("jc"))
separator <- single_arg("sep")
null_char <- single_arg("nc")
output <- single_arg("out")

# test VARS
# input <- list("test-data/df1.txt", "test-data/df2.txt", "test-data/df3.txt")
#
# input <- list("test-data/df_big_1.txt", "test-data/df_big_2.txt",
#               "test-data/df_big_3.txt", "test-data/df_big_4.txt",
#               "test-data/df_big_5.txt", "test-data/df_big_6.txt",
#               "test-data/df_big_7.txt", "test-data/df_big_8.txt",
#               "test-data/df_big_9.txt", "test-data/df_big_10.txt")
# header <- 1
# join_col <- 1
# separator <- "ta"
# null_char <- 0
# output <- "out"

# Validate everything up front so a bad argument fails before any file is
# read, with a message that names the offending flag.
if (length(input) == 0L) {
  stop("no input files given (--in=)", call. = FALSE)
}
if (is.na(header)) {
  stop("--he= must be an integer", call. = FALSE)
}
if (is.na(join_col) || join_col < 1L) {
  stop("--jc= must be a positive integer", call. = FALSE)
}

# Map of two-letter separator codes to the actual output delimiter.
delim <- list(ta = "\t", do = ".", co = ",", un = "_", da = "-", sp = " ")
if (!separator %in% names(delim)) {
  stop("unknown --sep= code '", separator, "'; expected one of: ",
       paste(names(delim), collapse = ", "), call. = FALSE)
}
separator <- delim[[separator]]

header <- header > 0

# read files into list (fread defaults; converted to plain data.frames)
df_list <- lapply(input, function(path) {
  as.data.frame(fread(path))
})

#### fix the ids name for all read in tables
# The join column is renamed to "ids" in every table; a pre-existing column
# called "ids" is first renamed to "id" so the join key stays unambiguous.
df_list <- lapply(seq_along(df_list), function(i) {
  tbl <- df_list[[i]]
  if (join_col > ncol(tbl)) {
    stop("join column ", join_col, " does not exist in '", input[[i]],
         "' (", ncol(tbl), " columns)", call. = FALSE)
  }
  # Repeated ids would make merge() return more rows than there are unique
  # ids, and the final cbind() would then fail or silently recycle rows into
  # misaligned output.
  if (anyDuplicated(tbl[[join_col]]) > 0L) {
    stop("duplicated identifiers in join column of '", input[[i]], "'",
         call. = FALSE)
  }
  col_names <- names(tbl)
  col_names[col_names == "ids"] <- "id" # to join correctly
  col_names[join_col] <- "ids"
  names(tbl) <- col_names
  tbl
})

# generate the sorted, unique ids over all tables (NA, if any, sorts last)
id_columns <- lapply(df_list, function(tbl) {
  tbl[join_col]
})
all_ids <- unique(do.call(rbind, id_columns))
# stringsAsFactors = FALSE keeps ids as plain values on every R version, so
# the NA replacement further down can also fill an NA id.
ids <- data.frame(ids = all_ids[order(all_ids$ids), "ids"],
                  stringsAsFactors = FALSE)

# mclapply() relies on forking; more than one core is not supported on
# Windows, so fall back to serial execution there.
n_cores <- if (.Platform$OS.type == "windows") 1L else 4L

# Left-join every table onto the full id list and keep only its non-id
# columns, ordered like `ids`, so the pieces can be bound column-wise.
merged_df <- mclapply(df_list, function(tbl) {
  merged_sub <- merge(x = ids, y = tbl, by = "ids", all.x = TRUE,
                      sort = FALSE)
  # drop = FALSE: a table consisting only of the id column must stay a
  # data.frame here, otherwise `[-1]` would drop a row instead of a column.
  merged_sub <- merged_sub[order(merged_sub$ids), , drop = FALSE]
  merged_sub[-1]
}, mc.cores = n_cores)

# mclapply() reports a failed worker as a "try-error" element instead of
# raising; surface that here rather than binding garbage into the result.
failed <- vapply(merged_df, function(res) inherits(res, "try-error"),
                 logical(1))
if (any(failed)) {
  stop("merging failed for: ", paste(input[failed], collapse = ", "),
       call. = FALSE)
}

df <- cbind(ids, do.call(cbind, merged_df))

# benchmarking
# library(microbenchmark)
# microbenchmark(
#   df1 <- lapply(2:length(df_list), function(x){
#     merge(df_list[[1]], df_list[[x]], by = "ids", all.x = T)[-1]
#   }),
#   merged_df <- mclapply(2:length(df_list), function(x){
#     merge(x = ids, y = df_list[[x]], by = "ids", all.x = T,
#           sort = F)[-1]}, mc.cores=7)
#   ,times = 4
# )

# change null_char
df[is.na(df)] <- null_char

# write data.frame to file
write.table(df, file = output, sep = separator, row.names = FALSE,
            quote = FALSE, col.names = header)