Mercurial > repos > pieterlukasse > prims_metabolomics
diff Rscripts/filter-RIDB.R @ 0:9d5f4f5f764b
Initial commit to toolshed
author | pieter.lukasse@wur.nl |
---|---|
date | Thu, 16 Jan 2014 13:10:00 +0100 |
parents | |
children |
line wrap: on
line diff
##
#
# Removes duplicates from a RI-database
#
# Usage:
# Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt
#
# Rows that agree on every key column (see `filter_cols`) are collapsed into
# the first such row: its RI becomes the median RI of the group, and three
# columns are appended to the table:
#   min          - smallest RI in the group
#   max          - largest RI in the group
#   orig.columns - comma-separated row numbers of the removed duplicates
# Rows without duplicates are kept unchanged, with those columns left empty.
#
##

# Positions of the key columns:
# CAS, FORMULA, Column type, Column phase type, Column name
filter_cols <- c(1, 3, 5, 6, 7)

# Locate duplicated rows.
#
# dat: data frame (or matrix) whose rows are compared on all columns.
#
# Returns an integer vector with one element per row of `dat`: NA for the
# first occurrence of a row, otherwise the row index of that first occurrence.
duplicates <- function(dat) {
  dat <- as.data.frame(dat)
  # Columns are passed unnamed so that a column called e.g. "decreasing" or
  # "method" cannot be mistaken for an argument of order().
  sorted_idx <- do.call(order, unname(as.list(dat)))
  # After sorting, identical rows are adjacent; order() is stable, so the
  # first row of each run is also the earliest one in the original table.
  is_first <- !duplicated(dat[sorted_idx, , drop = FALSE])
  first_idx <- sorted_idx[is_first]
  # cumsum() numbers the runs, mapping every sorted row to its run's first row.
  first_occ <- first_idx[cumsum(is_first)]
  first_occ[is_first] <- NA
  # Undo the sort so the result lines up with the rows of `dat`.
  first_occ[order(sorted_idx)]
}

# Fold the RI values of duplicated rows into their first occurrence.
#
# ridb: data frame with an `RI` column.
# dups: result of duplicates() for the rows of `ridb`.
#
# Returns `ridb` (all rows still present) with `RI` of each first occurrence
# replaced by the group median and the columns `min`, `max` and
# `orig.columns` appended.
merge_duplicates <- function(ridb, dups) {
  if (is.null(ridb$RI)) {
    stop("Input table has no 'RI' column", call. = FALSE)
  }
  n_rows <- nrow(ridb)
  ri <- ridb$RI
  ri_min <- rep(NA_real_, n_rows)
  ri_max <- rep(NA_real_, n_rows)
  orig_rows <- rep(NA_character_, n_rows)

  # One pass over the groups instead of scanning `dups` once per group.
  dup_rows <- which(!is.na(dups))
  for (rows in split(dup_rows, dups[dup_rows])) {
    first <- dups[rows[1L]]
    values <- ridb$RI[c(first, rows)]
    ri[first] <- median(values)
    ri_min[first] <- min(values)
    ri_max[first] <- max(values)
    orig_rows[first] <- paste(rows, collapse = ",")
  }

  merged <- ridb
  merged$RI <- ri
  merged["min"] <- ri_min
  merged["max"] <- ri_max
  merged["orig.columns"] <- orig_rows
  merged
}

# Script entry point.
#
# args: character vector; args[1] is the tab-separated RI database to read,
#       args[2] the file to write the filtered database to.
main <- function(args = commandArgs(TRUE)) {
  if (length(args) < 2L) {
    stop(
      "Usage: Rscript filter-RIDB.R ",
      "/path/to/retention_db.txt output_RIDB_file.txt",
      call. = FALSE
    )
  }
  ridb_file <- args[1]
  out_file <- args[2]

  # Load CSV file
  ridb <- read.csv(ridb_file, header = TRUE, sep = "\t")
  cat("RIDB dimensions: ")
  print(dim(ridb))

  cat("Checking for duplicates...")
  dups <- duplicates(ridb[, filter_cols, drop = FALSE])

  cat("\t[DONE]\nRemoving duplicates...")
  newridb <- merge_duplicates(ridb, dups)

  cat("\t\t[DONE]\nCreating new dataset...")
  # Keep by mask rather than dropping by negative index: a negative index
  # built from an empty set of duplicates does not mean "drop nothing".
  out_ridb <- newridb[is.na(dups), , drop = FALSE]

  cat("\t\t[DONE]\nWriting new dataset...")
  write.table(out_ridb, na = "", file = out_file, quote = TRUE, sep = "\t",
              row.names = FALSE)
  cat("\t\t[DONE]\n")
  invisible(out_ridb)
}

# Run only when executed directly (e.g. via Rscript); source()-ing the file
# just defines the functions above.
if (sys.nframe() == 0L) {
  main(commandArgs(TRUE))
}