annotate Rscripts/filter-RIDB.R @ 0:dffc38727496

initial commit
author pieter.lukasse@wur.nl
date Sat, 07 Feb 2015 22:02:00 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
1 ##
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
2 #
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
3 # Removes duplicates from a RI-database
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
4 #
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
5 # Usage:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
6 # Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
7 #
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
8 ##
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
9
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
10 # Commandline arguments
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
11 args <- commandArgs(TRUE)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
12 ridb <- args[1]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
13 out_file <- args[2]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
14
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
15 # Function to check duplicates
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
16 duplicates <- function(dat) {
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
17 s <- do.call("order", as.data.frame(dat))
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
18 non.dup <- !duplicated(dat[s, ])
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
19 orig.ind <- s[non.dup]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
20 first.occ <- orig.ind[cumsum(non.dup)]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
21 first.occ[non.dup] <- NA
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
22 first.occ[order(s)]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
23 }
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
24
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
25 # Load CSV file
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
26 ridb <- read.csv(ridb,header=TRUE, sep="\t")
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
27 ## Filters on: CAS FORMULA Column type Column phase type Column name
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
28 filter_cols <- c(1, 3, 5, 6, 7)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
29 cat("RIDB dimensions: ")
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
30 print(dim(ridb))
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
31 deleted <- NULL
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
32 cat("Checking for duplicates...")
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
33 dups <- duplicates(ridb[,filter_cols])
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
34 cat("\t[DONE]\nRemoving duplicates...")
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
35 newridb <- ridb
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
36 newridb["min"] <- NA
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
37 newridb["max"] <- NA
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
38 newridb["orig.columns"] <- NA
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
39 for (i in unique(dups)) {
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
40 if (!is.na(i)) {
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
41 rows <- which(dups == i)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
42 duprows <- ridb[c(i, rows),]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
43 # Replace duplicate rows with one row containing the median value
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
44 new_RI <- median(duprows$RI)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
45 newridb$RI[i] <- median(duprows$RI)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
46 newridb$min[i] <- min(duprows$RI)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
47 newridb$max[i] <- max(duprows$RI)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
48 newridb$orig.columns[i] <- paste(rows, collapse=",")
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
49 deleted <- c(deleted, rows)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
50 }
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
51 }
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
52 cat("\t\t[DONE]\nCreating new dataset...")
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
53 out_ridb <- newridb[-deleted,]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
54 cat("\t\t[DONE]\nWriting new dataset...")
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
55 write.table(out_ridb, na='', file=out_file, quote=T, sep="\t", row.names=F)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
56 cat("\t\t[DONE]\n")