annotate join_files.R @ 0:d212d12aee5e draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
author iuc
date Wed, 09 Aug 2017 17:44:54 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
1 # libraries
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
2 library(data.table)
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
3 library(parallel)
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
4
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
# inputs
# Options arrive as `--key=value` tokens after the script name. Each variable
# below holds the value(s) of one option; an option that was not supplied
# yields a zero-length vector.
args <- commandArgs(trailingOnly = TRUE)

# Return the value part of every element of `args` that starts with `prefix`.
# Matching is anchored at the start of the argument and only that leading
# prefix is removed, so a value that itself contains the prefix text (for
# example a path such as "run--in=1.tsv") is returned intact.
get_arg_values <- function(prefix, args) {
  hits <- args[startsWith(args, prefix)]
  substring(hits, nchar(prefix) + 1L)
}

input <- get_arg_values("--in=", args)
header <- as.integer(get_arg_values("--he=", args))
join_col <- get_arg_values("--jc=", args)
separator <- get_arg_values("--sep=", args)
null_char <- get_arg_values("--nc=", args)
output <- get_arg_values("--out=", args)
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
14
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
15 # test VARS
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
16 # input <- list("test-data/df1.txt", "test-data/df2.txt", "test-data/df3.txt")
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
17 #
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
18 # input <- list("test-data/df_big_1.txt", "test-data/df_big_2.txt", "test-data/df_big_3.txt",
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
19 # "test-data/df_big_4.txt", "test-data/df_big_5.txt", "test-data/df_big_6.txt",
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
20 # "test-data/df_big_7.txt", "test-data/df_big_8.txt", "test-data/df_big_9.txt",
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
21 # "test-data/df_big_10.txt")
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
22 # header <- 1
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
23 # join_col <- 1
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
24 # separator <- "ta"
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
25 # null_char <- 0
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
26 # output <- "out"
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
27
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
# Turn the numeric header flag into a logical: any value above zero means
# TRUE. Fail with a readable message instead of the generic
# "argument is of length zero" when the flag is missing or not a number.
if (length(header) != 1L || is.na(header)) {
  stop("exactly one integer --he=<n> argument is required", call. = FALSE)
}
header <- header > 0

# The join column is used further down as a positional column index, so it
# must be a single positive integer.
join_col <- as.integer(join_col)
if (length(join_col) != 1L || is.na(join_col) || join_col < 1L) {
  stop(
    "exactly one positive integer --jc=<n> argument is required",
    call. = FALSE
  )
}
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
34
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
# read files into list
# Every path in `input` is parsed with fread() and converted to a plain
# data.frame. The header flag is passed explicitly: left to its default,
# fread() guesses per file whether the first row holds column names, which
# can disagree with what the user asked for.
if (length(input) == 0L) {
  stop("at least one --in=<file> argument is required", call. = FALSE)
}
df_list <- lapply(input, function(path) {
  as.data.frame(fread(path, header = header))
})
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
37
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
38
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
#### give the join column the same name ("ids") in every table
# Any existing column already called "ids" is renamed to "id" first, so that
# the column at position `join_col` is the one named "ids" used for joining.
df_list <- lapply(df_list, function(tbl) {
  col_names <- replace(names(tbl), names(tbl) == "ids", "id")
  col_names[join_col] <- "ids"
  setNames(tbl, col_names)
})
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
47
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
# collect the unique ids over all tables
# Take the join column of each table as a one-column data.frame, stack them,
# and keep the distinct rows.
id_columns <- lapply(df_list, function(tbl) {
  tbl[join_col]
})
df0 <- data.frame(ids = unique(do.call(rbind, id_columns)))

# Put a one-column ids table in front of the input tables, so that element 1
# of df_list holds the ids and elements 2..n hold the inputs.
df_list <- c(list(ids = data.frame(ids = df0[[1]])), df_list)
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
56
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
57
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
58
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
# Sorted one-column table of ids; it is the left-hand side of every join and
# the first column of the final result.
ids <- df_list[[1]]
ids <- data.frame(ids = ids[order(ids$ids), "ids"])

# mclapply() does not support more than one core on Windows.
n_cores <- if (.Platform$OS.type == "windows") 1L else 4L

# Left-join every input table (elements 2..n of df_list) onto the ids.
# seq_along(...)[-1] is empty when there is nothing to merge, whereas
# 2:length(df_list) would count backwards.
merged_df <- mclapply(seq_along(df_list)[-1], function(i) {
  merged_sub <- merge(
    x = ids, y = df_list[[i]], by = "ids", all.x = TRUE, sort = FALSE
  )
  # A duplicated id in the input adds rows to the join; the column-wise
  # binding below would then fail or be recycled into misaligned rows.
  if (nrow(merged_sub) != nrow(ids)) {
    stop(
      "input table ", i - 1L, " has duplicated ids in the join column",
      call. = FALSE
    )
  }
  # drop = FALSE keeps a data.frame even when the table only has the id column
  merged_sub <- merged_sub[order(merged_sub$ids), , drop = FALSE]
  # the first column is the join key, which is already present in `ids`
  merged_sub[-1]
}, mc.cores = n_cores)

# mclapply() hands back failed jobs as "try-error" objects instead of raising;
# surface the first failure rather than binding it into the result.
failed <- vapply(merged_df, inherits, logical(1), what = "try-error")
if (any(failed)) {
  first_error <- merged_df[[which(failed)[1]]]
  stop(conditionMessage(attr(first_error, "condition")), call. = FALSE)
}

df <- cbind(ids, do.call(cbind, merged_df))
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
68
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
69 # benchmarking
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
70 # library(microbenchmark)
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
71 # microbenchmark(
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
72 # df1 <- lapply(2:length(df_list), function(x){
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
73 # merge(df_list[[1]], df_list[[x]], by = "ids", all.x = T)[-1]
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
74 # }),
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
75 # merged_df <- mclapply(2:length(df_list), function(x){
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
76 # merge(x = ids, y = df_list[[x]], by = "ids", all.x = T, sort = F)[-1]}, mc.cores=7)
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
77 # ,times = 4
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
78 # )
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
79
d212d12aee5e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
iuc
parents:
diff changeset
# change null_char
# Replace every NA cell of the joined table with the placeholder string.
df[is.na(df)] <- null_char

# Translate the two-letter separator code into the actual delimiter.
# An unknown or missing code is rejected here; otherwise the lookup would
# return NULL and the failure would only show up inside write.table().
delim <- list(ta = "\t", do = ".", co = ",", un = "_", da = "-", sp = " ")
if (length(separator) != 1L || !separator %in% names(delim)) {
  stop(
    "--sep must be one of: ", paste(names(delim), collapse = ", "),
    call. = FALSE
  )
}
separator <- delim[[separator]]

# write data.frame to file
write.table(
  df,
  file = output,
  sep = separator,
  row.names = FALSE,
  quote = FALSE,
  col.names = header
)