Mercurial > repos > iuc > join_files_by_id

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/join_files.R	Wed Aug 09 17:44:54 2017 -0400
@@ -0,0 +1,88 @@
+# libraries
+library(data.table)
+library(parallel)
+
+# inputs: every option is passed as --key=value; --in= may occur several times
+args <- commandArgs()
+arg_value <- function(key) sub(key, "", args[substr(args, 1, nchar(key)) == key], fixed = TRUE)
+input <- arg_value("--in=")
+header <- as.integer(arg_value("--he="))
+join_col <- arg_value("--jc=")
+separator <- arg_value("--sep=")
+null_char <- arg_value("--nc=")
+output <- arg_value("--out=")
+
+# test VARS
+# input <- list("test-data/df1.txt", "test-data/df2.txt", "test-data/df3.txt")
+#
+# input <- list("test-data/df_big_1.txt", "test-data/df_big_2.txt", "test-data/df_big_3.txt",
+#               "test-data/df_big_4.txt", "test-data/df_big_5.txt", "test-data/df_big_6.txt",
+#               "test-data/df_big_7.txt", "test-data/df_big_8.txt", "test-data/df_big_9.txt",
+#               "test-data/df_big_10.txt")
+# header <- 1
+# join_col <- 1
+# separator <- "ta"
+# null_char <- 0
+# output <- "out"
+
+# collapse the numeric --he flag into a logical: any positive value means TRUE
+header <- if (header > 0) TRUE else FALSE
+
+# the join column is used as a positional column index further down,
+# so it has to be an integer
+join_col <- as.integer(join_col)
+
+# read files into list; pass the header flag so fread does not guess it per file
+df_list <- lapply(input, function(x){as.data.frame(fread(x, header = header))})
+
+
+#### make "ids" the name of the join column in every table
+df_list <- lapply(df_list, function(tbl){
+  # a different column already called "ids" is renamed to "id" first
+  cols <- replace(names(tbl), names(tbl) == "ids", "id")
+  cols[join_col] <- "ids"
+  names(tbl) <- cols
+  tbl
+})
+
+# collect the join column of every table and reduce it to the distinct ids
+id_cols <- lapply(df_list, function(tbl){
+  tbl[join_col]
+})
+id_tab <- data.frame(ids = unique(do.call(rbind, id_cols)))
+
+# slot 1 of df_list holds the id table; the input tables follow from slot 2
+df_list <- c(list(ids = data.frame(ids = id_tab[[1]])), df_list)
+
+
+
+ids <- df_list[[1]]
+ids <- data.frame(ids = ids[order(ids$ids), "ids"])
+# left-join each table onto the sorted ids; rows stay aligned only if ids are unique per table
+merged_df <- mclapply(2:length(df_list), function(x){
+  merged_sub <- merge(x = ids, y = df_list[[x]], by = "ids", all.x = TRUE, sort = FALSE)
+  # drop = FALSE keeps a data.frame even when the table has no column besides the ids
+  merged_sub[order(merged_sub$ids), -1, drop = FALSE]
+}, mc.cores = 4)
+df <- cbind(ids, do.call(cbind, merged_df))
+
+# benchmarking
+# library(microbenchmark)
+# microbenchmark(
+#   df1 <- lapply(2:length(df_list), function(x){
+#     merge(df_list[[1]], df_list[[x]], by = "ids", all.x = T)[-1]
+#   }),
+#   merged_df <- mclapply(2:length(df_list), function(x){
+#     merge(x = ids, y = df_list[[x]], by = "ids", all.x = T, sort = F)[-1]}, mc.cores=7)
+#   ,times = 4
+# )
+
+# fill the cells of ids that are missing from a table with the chosen placeholder
+df[is.na(df)] <- null_char
+# translate the two-letter code passed via --sep into the actual delimiter
+delim <- list(ta = "\t", do = ".", co = ",", un = "_", da = "-", sp = " ")
+if (length(separator) != 1 || !(separator %in% names(delim))) {
+  stop("unknown separator code: ", separator, call. = FALSE)
+}
+separator <- delim[[separator]]
+write.table(df, file = output, sep = separator, row.names = FALSE, quote = FALSE, col.names = header)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/join_files_by_id.xml	Wed Aug 09 17:44:54 2017 -0400
@@ -0,0 +1,124 @@
+<tool id="join_files_by_id" name="Join datasets by identifier column" version="1.0">
+    <requirements>
+        <requirement type="package" version="1.10.4">r-data.table</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+Rscript '$__tool_directory__/join_files.R'
+--out='$out_file1'
+--he='$header'
+--jc='$field'
+--sep='$delimiter'
+--nc='$null_str'
+--in='$input1'
+#for $q in $queries
+    --in='${q.input}'
+#end for
+    ]]></command>
+    <inputs>
+        <param name="header" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Header" help="If header is true, the column name used for joining needs to be the same in all datasets"/>
+        <param name="delimiter" type="select" label="Delimited by">
+            <option value="ta">Tab</option>
+            <option value="do">Dot</option>
+            <option value="co">Comma</option>
+            <option value="un">Underscore</option>
+            <option value="da">Dash</option>
+            <option value="sp">Space</option>
+        </param>
+        <param name="null_str" type="select" label="Select mismatch character">
+            <option value="0">0</option>
+            <option value="-">-</option>
+            <option value="">"empty"</option>
+            <option value="NA">NA</option>
+        </param>
+        <param name="input1" type="data" format="tabular" label="Select table" help="Single dataset selection only to preserve the order"/>
+        <repeat name="queries" min="1" title="Select table">
+            <param name="input" type="data" format="tabular" label="Select table" help="Single dataset selection only to preserve the order"/>
+        </repeat>
+        <param name="field" type="data_column" data_ref="input1" label="Column used for joining" />
+    </inputs>
+    <outputs>
+        <data name="out_file1" format="tabular" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="header" value="1"/>
+            <param name="null_str" value="0"/>
+            <param name="delimiter" value="ta"/>
+            <param name="input1" value="df1.txt"/>
+            <repeat name="queries">
+                <param name="input" value="df2.txt"/>
+            </repeat>
+            <repeat name="queries">
+                <param name="input" value="df3.txt"/>
+            </repeat>
+            <param name="field" value="1"/>
+            <output name="out_file1" file="df.txt"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+Joins datasets by identifier column
+
+-----
+
+**Example**
+
+Dataset 1:
+
++--------+--------+---------+
+| name   | counts | percent |
++========+========+=========+
+| gene 1 |   10   |   0.3   |
++--------+--------+---------+
+| gene 2 |   20   |   0.2   |
++--------+--------+---------+
+| gene 3 |   20   |   0.2   |
++--------+--------+---------+
+
+
+and Dataset 2:
+
++--------+--------+---------+
+| name   | counts | percent |
++========+========+=========+
+| gene 1 |   13   |   0.5   |
++--------+--------+---------+
+| gene 4 |    4   |   0.4   |
++--------+--------+---------+
+
+
+and Dataset 3:
+
++--------+--------+---------+
+| name   | counts | percent |
++========+========+=========+
+| gene 1 |   10   |   0.3   |
++--------+--------+---------+
+| gene 3 |   12   |   0.2   |
++--------+--------+---------+
+| gene 4 |    4   |   0.4   |
++--------+--------+---------+
+| gene 5 |   19   |   0.6   |
++--------+--------+---------+
+
+
+will result in the following:
+
++--------+--------+---------+--------+---------+--------+---------+
+| name   | counts | percent | counts | percent | counts | percent |
++========+========+=========+========+=========+========+=========+
+| gene 1 |   10   |   0.3   |   13   |   0.5   |   10   |   0.3   |
++--------+--------+---------+--------+---------+--------+---------+
+| gene 2 |   20   |   0.2   |    0   |    0    |    0   |    0    |
++--------+--------+---------+--------+---------+--------+---------+
+| gene 3 |   20   |   0.2   |    0   |    0    |   12   |   0.2   |
++--------+--------+---------+--------+---------+--------+---------+
+| gene 4 |    0   |    0    |    4   |   0.4   |    4   |   0.4   |
++--------+--------+---------+--------+---------+--------+---------+
+| gene 5 |    0   |    0    |    0   |    0    |   19   |   0.6   |
++--------+--------+---------+--------+---------+--------+---------+
+
+For help contact: jochen.bick@usys.ethz.ch
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/df.txt	Wed Aug 09 17:44:54 2017 -0400
@@ -0,0 +1,5 @@
+ids	col2	col3	col2	col3	col2	col3
+test	2	5	2	6	0	0
+test1	3	6	0	0	0	0
+test2	4	7	3	7	2	6
+test3	0	0	0	0	3	7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/df1.txt	Wed Aug 09 17:44:54 2017 -0400
@@ -0,0 +1,4 @@
+ids	col2	col3
+test	2	5
+test1	3	6
+test2	4	7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/df2.txt	Wed Aug 09 17:44:54 2017 -0400
@@ -0,0 +1,3 @@
+ids	col2	col3
+test	2	6
+test2	3	7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/df3.txt	Wed Aug 09 17:44:54 2017 -0400
@@ -0,0 +1,3 @@
+ids	col2	col3
+test2	2	6
+test3	3	7