Mercurial > repos > iuc > join_files_by_id
changeset 0:d212d12aee5e draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/join_files_by_id commit a835a0b20127f485a6af4b466bb29d58d63541ea
author | iuc |
---|---|
date | Wed, 09 Aug 2017 17:44:54 -0400 |
parents | |
children | |
files | join_files.R join_files_by_id.xml test-data/df.txt test-data/df1.txt test-data/df2.txt test-data/df3.txt |
diffstat | 6 files changed, 227 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# join_files.R
#
# Join several delimited tables on a shared identifier column.
#
# Command-line arguments (each given as --key=value):
#   --in=PATH   input table; repeat once per table, order is preserved
#   --he=0|1    greater than 0 if the tables have a header row
#   --jc=N      1-based index of the identifier column in every table
#   --sep=CODE  output delimiter code: ta, do, co, un, da or sp
#   --nc=STR    string written wherever the joined table holds NA
#   --out=PATH  output file
#
# The output has one row per distinct identifier (sorted), the identifier in
# the first column (named "ids"), followed by the remaining columns of each
# input table in the order the tables were given.

# libraries
library(data.table)
library(parallel)

# Number of worker processes used for the per-table merges.
n_cores <- 4

# Output delimiter codes accepted by --sep and the character each stands for.
delim <- list(ta = "\t", do = ".", co = ",", un = "_", da = "-", sp = " ")

# Return the values of every "--<key>=<value>" argument, in order.
# Only arguments that START with the prefix match, so a value that happens to
# contain e.g. "--in=" is neither picked up nor mangled.
arg_values <- function(args, key) {
  prefix <- paste0("--", key, "=")
  hits <- args[startsWith(args, prefix)]
  substring(hits, nchar(prefix) + 1L)
}

# Return the value of an argument that must be given exactly once.
arg_value <- function(args, key) {
  values <- arg_values(args, key)
  if (length(values) != 1L) {
    stop("expected exactly one --", key, "= argument, got ", length(values),
         call. = FALSE)
  }
  values
}

# inputs
args <- commandArgs(trailingOnly = TRUE)

input <- arg_values(args, "in")
header_flag <- suppressWarnings(as.integer(arg_value(args, "he")))
join_col <- suppressWarnings(as.integer(arg_value(args, "jc")))
separator <- arg_value(args, "sep")
null_char <- arg_value(args, "nc")
output <- arg_value(args, "out")

if (length(input) == 0L) {
  stop("at least one --in= argument is required", call. = FALSE)
}
if (is.na(header_flag)) {
  stop("--he= must be an integer (0 or 1)", call. = FALSE)
}
if (is.na(join_col) || join_col < 1L) {
  stop("--jc= must be a positive column index", call. = FALSE)
}
if (!separator %in% names(delim)) {
  stop("unknown --sep= code '", separator, "'; expected one of: ",
       paste(names(delim), collapse = ", "), call. = FALSE)
}

header <- header_flag > 0
separator <- delim[[separator]]

# read files into list
# Pass the header flag explicitly: fread's auto-detection would otherwise
# treat an all-character first row as column names even when the caller said
# there is no header, and that row would be lost from the output.
df_list <- lapply(input, function(path) {
  as.data.frame(fread(path, header = header))
})

#### fix the ids name for all read in tables
df_list <- lapply(seq_along(df_list), function(i) {
  x <- df_list[[i]]
  if (join_col > ncol(x)) {
    stop("input ", input[[i]], " has only ", ncol(x),
         " column(s); cannot join on column ", join_col, call. = FALSE)
  }
  names_x <- names(x)
  names_x[names_x == "ids"] <- "id" # to join correctly
  names_x[join_col] <- "ids"
  names(x) <- names_x
  # A repeated identifier would make merge() return more rows than there are
  # identifiers, and cbind() could then recycle columns into misaligned output.
  if (anyDuplicated(x[[join_col]]) > 0L) {
    stop("input ", input[[i]], " has duplicated identifiers in column ",
         join_col, call. = FALSE)
  }
  x
})

# generate the sorted set of unique ids found across all tables
id_frames <- lapply(df_list, function(x) {
  x[join_col]
})
ids <- unique(do.call(rbind, id_frames))
ids <- data.frame(ids = ids[order(ids$ids), "ids"], stringsAsFactors = FALSE)

# left-join every table onto the id set; each result keeps only the table's
# non-id columns, with rows in the same order as `ids`
merged_df <- mclapply(df_list, function(tbl) {
  merged_sub <- merge(x = ids, y = tbl, by = "ids", all.x = TRUE, sort = FALSE)
  # drop = FALSE keeps a data.frame even when the table holds only the id column
  merged_sub <- merged_sub[order(merged_sub$ids), , drop = FALSE]
  merged_sub[-1]
}, mc.cores = n_cores)

# mclapply reports a failed job as a "try-error" (or NULL) element instead of
# raising an error; fail loudly rather than binding such an element into the
# result.
failed <- vapply(merged_df, function(m) !is.data.frame(m), logical(1))
if (any(failed)) {
  stop("joining failed for: ", paste(input[failed], collapse = ", "),
       call. = FALSE)
}

df <- cbind(ids, do.call(cbind, merged_df))

# change null_char
df[is.na(df)] <- null_char

# write data.frame to file
write.table(df, file = output, sep = separator, row.names = FALSE,
            quote = FALSE, col.names = header)
<!-- join_files_by_id.xml: Galaxy tool wrapper that runs join_files.R on the selected datasets. -->
<tool id="join_files_by_id" name="Join datasets by identifier column" version="1.0">
    <requirements>
        <requirement type="package" version="1.10.4">r-data.table</requirement>
    </requirements>
    <!-- input1 is passed first, followed by each repeated query dataset, so the
         column blocks in the output follow the order chosen in the form. -->
    <command detect_errors="exit_code"><![CDATA[
Rscript '$__tool_directory__/join_files.R'
--out='$out_file1'
--he='$header'
--jc='$field'
--sep='$delimiter'
--nc='$null_str'
--in='$input1'
#for $q in $queries
    --in='${q.input}'
#end for
    ]]></command>
    <inputs>
        <param name="header" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Header" help="If header is true, the column name used for joining needs to be the same in all datasets"/>
        <!-- The two-letter values are codes that join_files.R maps to the actual delimiter character. -->
        <param name="delimiter" type="select" label="Delimited by">
            <option value="ta">Tab</option>
            <option value="do">Dot</option>
            <option value="co">Comma</option>
            <option value="un">Underscore</option>
            <option value="da">Dash</option>
            <option value="sp">Space</option>
        </param>
        <param name="null_str" type="select" label="Select mismatch character">
            <option value="0">0</option>
            <option value="-">-</option>
            <option value="">"empty"</option>
            <option value="NA">NA</option>
        </param>
        <param name="input1" type="data" format="tabular" label="Select table" help="Single dataset selection only to preserve the order"/>
        <repeat name="queries" min="1" title="Select table">
            <param name="input" type="data" format="tabular" label="Select table" help="Single dataset selection only to preserve the order"/>
        </repeat>
        <param name="field" type="data_column" data_ref="input1" label="Column used for joining" />
    </inputs>
    <outputs>
        <data name="out_file1" format="tabular" />
    </outputs>
    <tests>
        <test>
            <param name="header" value="1"/>
            <param name="null_str" value="0"/>
            <param name="delimiter" value="ta"/>
            <param name="input1" value="df1.txt"/>
            <repeat name="queries">
                <param name="input" value="df2.txt"/>
            </repeat>
            <repeat name="queries">
                <param name="input" value="df3.txt"/>
            </repeat>
            <param name="field" value="1"/>
            <output name="out_file1" file="df.txt"/>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

Joins datasets by identifier column

-----

**Example**

Dataset 1:

+--------+--------+---------+
| name   | counts | percent |
+========+========+=========+
| gene 1 | 10     | 0.3     |
+--------+--------+---------+
| gene 2 | 20     | 0.2     |
+--------+--------+---------+
| gene 3 | 20     | 0.2     |
+--------+--------+---------+


and Dataset 2:

+--------+--------+---------+
| name   | counts | percent |
+========+========+=========+
| gene 1 | 13     | 0.5     |
+--------+--------+---------+
| gene 4 | 4      | 0.4     |
+--------+--------+---------+


and Dataset 3:

+--------+--------+---------+
| name   | counts | percent |
+========+========+=========+
| gene 1 | 10     | 0.3     |
+--------+--------+---------+
| gene 3 | 12     | 0.2     |
+--------+--------+---------+
| gene 4 | 4      | 0.4     |
+--------+--------+---------+
| gene 5 | 19     | 0.6     |
+--------+--------+---------+


will result in the following:

+--------+--------+---------+--------+---------+--------+---------+
| name   | counts | percent | counts | percent | counts | percent |
+========+========+=========+========+=========+========+=========+
| gene 1 | 10     | 0.3     | 13     | 0.5     | 10     | 0.3     |
+--------+--------+---------+--------+---------+--------+---------+
| gene 2 | 20     | 0.2     | 0      | 0       | 0      | 0       |
+--------+--------+---------+--------+---------+--------+---------+
| gene 3 | 20     | 0.2     | 0      | 0       | 12     | 0.2     |
+--------+--------+---------+--------+---------+--------+---------+
| gene 4 | 0      | 0       | 4      | 0.4     | 4      | 0.4     |
+--------+--------+---------+--------+---------+--------+---------+
| gene 5 | 0      | 0       | 0      | 0       | 19     | 0.6     |
+--------+--------+---------+--------+---------+--------+---------+

For help contact: jochen.bick@usys.ethz.ch
    ]]></help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/df.txt Wed Aug 09 17:44:54 2017 -0400 @@ -0,0 +1,5 @@ +ids col2 col3 col2 col3 col2 col3 +test 2 5 2 6 0 0 +test1 3 6 0 0 0 0 +test2 4 7 3 7 2 6 +test3 0 0 0 0 3 7
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/df1.txt Wed Aug 09 17:44:54 2017 -0400 @@ -0,0 +1,4 @@ +ids col2 col3 +test 2 5 +test1 3 6 +test2 4 7