comparison Rscripts/filter-RIDB.R @ 0:dffc38727496

initial commit
author pieter.lukasse@wur.nl
date Sat, 07 Feb 2015 22:02:00 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dffc38727496
1 ##
2 #
3 # Removes duplicates from an RI-database: each group of duplicate rows is replaced by one row holding the median RI
4 #
5 # Usage:
6 # Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt
7 #
8 ##
9
# Positional command-line arguments (see the usage note in the file header):
#   1. path of the RI database to read
#   2. path of the filtered file to write
# A missing argument yields NA here rather than an error.
args <- commandArgs(trailingOnly = TRUE)
out_file <- args[2]
ridb <- args[1]
14
# Map every duplicated row to the index of its first occurrence.
#
# Args:
#   dat: a data frame (or anything as.data.frame() accepts); rows are
#        compared across all of its columns.
#
# Returns:
#   An integer vector with one element per row of `dat`, in the original row
#   order: NA for the first row of each group of identical rows, and for every
#   other row the index of that first row.
duplicates <- function(dat) {
  dat <- as.data.frame(dat)
  # Sort so identical rows become adjacent. The list is unnamed so that a
  # column called e.g. "decreasing" or "method" is treated as a sort key
  # instead of being matched to one of order()'s own arguments.
  s <- do.call("order", unname(as.list(dat)))
  # TRUE for the first row of each run of identical rows (in sorted order);
  # drop = FALSE keeps a single-column input as a data frame.
  non.dup <- !duplicated(dat[s, , drop = FALSE])
  # Original index of each run's first row, repeated across the whole run.
  orig.ind <- s[non.dup]
  first.occ <- orig.ind[cumsum(non.dup)]
  # The first rows themselves are not duplicates of anything.
  first.occ[non.dup] <- NA
  # Undo the sort so the result lines up with the rows of `dat`.
  first.occ[order(s)]
}
24
# Main script: load the RI database, merge each group of duplicate rows into
# one row carrying the median RI, and write the result to `out_file`.

# Fail fast with a usage hint instead of an obscure connection error (or,
# for a missing output path, an error only after all the work is done).
if (anyNA(c(ridb, out_file))) {
  stop(
    "Usage: Rscript filter-RIDB.R /path/to/retention_db.txt ",
    "output_RIDB_file.txt",
    call. = FALSE
  )
}

# Load the tab-separated database; `ridb` is rebound from path to data frame
ridb <- read.csv(ridb, header = TRUE, sep = "\t")
# Rows are duplicates when they agree on these columns (by position):
# CAS, FORMULA, Column type, Column phase type, Column name
filter_cols <- c(1, 3, 5, 6, 7)
cat("RIDB dimensions: ")
print(dim(ridb))
cat("Checking for duplicates...")
dups <- duplicates(ridb[, filter_cols])
cat("\t[DONE]\nRemoving duplicates...")
newridb <- ridb
newridb["min"] <- NA
newridb["max"] <- NA
newridb["orig.columns"] <- NA
# Group the duplicate rows by the index of the row they duplicate; split()
# drops the NA entries, i.e. the rows that are not duplicates.
dup_groups <- split(seq_along(dups), dups)
for (key in names(dup_groups)) {
  i <- as.integer(key)
  rows <- dup_groups[[key]]
  # RI values of the kept row together with all of its duplicates
  ri_values <- ridb$RI[c(i, rows)]
  # Replace duplicate rows with one row containing the median value
  newridb$RI[i] <- median(ri_values)
  newridb$min[i] <- min(ri_values)
  newridb$max[i] <- max(ri_values)
  # Row numbers (in the input) of the duplicates merged into row i
  newridb$orig.columns[i] <- paste(rows, collapse = ",")
}
# Every row that points at an earlier occurrence is dropped
deleted <- which(!is.na(dups))
cat("\t\t[DONE]\nCreating new dataset...")
# Select the rows to keep with positive indices: a zero-length negative
# index would select no rows at all, emptying the output whenever the
# database contains no duplicates.
out_ridb <- newridb[setdiff(seq_len(nrow(newridb)), deleted), ]
cat("\t\t[DONE]\nWriting new dataset...")
write.table(
  out_ridb,
  na = "",
  file = out_file,
  quote = TRUE,
  sep = "\t",
  row.names = FALSE
)
cat("\t\t[DONE]\n")