Mercurial > repos > pieterlukasse > prims_metabolomics
view Rscripts/filter-RIDB.R @ 34:e79a7c32b011
version using less memory, but more CPU
author | pieter.lukasse@wur.nl |
---|---|
date | Wed, 17 Sep 2014 11:31:34 +0200 |
parents | 9d5f4f5f764b |
children |
line wrap: on
line source
##
#
# Removes duplicates from a RI-database
#
# Rows that share the same values in the filter columns are collapsed into the
# first row of the group. That row gets the median RI of the group, and three
# extra columns are added to the output: "min" and "max" (smallest and largest
# RI in the group) and "orig.columns" (comma separated indices of the rows
# that were merged into it).
#
# Usage:
#     Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt
#
##

## Filters on: CAS FORMULA Column type Column phase type Column name
filter_cols <- c(1, 3, 5, 6, 7)

# Function to check duplicates
#
# dat: data frame (or something as.data.frame() accepts) whose rows are
#      compared on all of their columns.
#
# Returns an integer vector with one element per row of dat: the index of the
# first row holding the same values, or NA when the row is itself that first
# occurrence (which includes rows that are unique).
duplicates <- function(dat) {
  dat <- as.data.frame(dat)
  # Sort so identical rows become adjacent. Columns are passed unnamed so that
  # a column called e.g. "decreasing" cannot be taken for an order() argument.
  s <- do.call("order", unname(as.list(dat)))
  non.dup <- !duplicated(dat[s, , drop = FALSE])
  orig.ind <- s[non.dup]
  # cumsum() numbers the groups in sorted order, so this maps every sorted row
  # to the original index of the row that opens its group.
  first.occ <- orig.ind[cumsum(non.dup)]
  first.occ[non.dup] <- NA
  # Back to the original row order.
  first.occ[order(s)]
}

# Stores the RI statistics of every duplicate group in the group's first row.
#
# ridb: data frame with an "RI" column.
# dups: result of duplicates() for the rows of ridb.
#
# Returns ridb with all of its rows, plus the columns "min", "max" and
# "orig.columns". For the first row of each duplicate group RI is replaced by
# the group median; the new columns stay NA for every other row.
summarise_duplicates <- function(ridb, dups) {
  n <- nrow(ridb)
  stopifnot(length(dups) == n)
  if (!("RI" %in% names(ridb))) {
    stop("the RI database has no 'RI' column", call. = FALSE)
  }

  original_ri <- ridb$RI
  new_ri <- original_ri
  ri_min <- rep(NA, n)
  ri_max <- rep(NA, n)
  merged_rows <- rep(NA, n)

  # One pass over the data: split() drops the NA entries, leaving one element
  # per duplicate group, named after the index of the group's first row.
  groups <- split(seq_len(n), dups)
  for (key in names(groups)) {
    first <- as.integer(key)
    rows <- groups[[key]]
    values <- original_ri[c(first, rows)]
    # Replace duplicate rows with one row containing the median value
    new_ri[first] <- median(values)
    ri_min[first] <- min(values)
    ri_max[first] <- max(values)
    merged_rows[first] <- paste(rows, collapse = ",")
  }

  ridb$RI <- new_ri
  ridb$min <- ri_min
  ridb$max <- ri_max
  ridb$orig.columns <- merged_rows
  ridb
}

# Removes the rows that duplicates() flagged as a repeat of an earlier row.
#
# A logical mask is used on purpose: negative indexing with an empty index
# vector (no duplicates found) would select zero rows instead of all of them.
drop_duplicates <- function(ridb, dups) {
  stopifnot(length(dups) == nrow(ridb))
  ridb[is.na(dups), , drop = FALSE]
}

# Script entry point: reads the RI database, merges duplicates and writes the
# result.
#
# args: character vector holding the input path and the output path.
main <- function(args = commandArgs(TRUE)) {
  # Commandline arguments
  if (length(args) < 2) {
    stop("Usage: Rscript filter-RIDB.R /path/to/retention_db.txt ",
         "output_RIDB_file.txt", call. = FALSE)
  }
  ridb_file <- args[1]
  out_file <- args[2]

  # Load CSV file
  ridb <- read.csv(ridb_file, header = TRUE, sep = "\t")
  if (ncol(ridb) < max(filter_cols)) {
    stop("the RI database needs at least ", max(filter_cols), " columns",
         call. = FALSE)
  }

  cat("RIDB dimensions: ")
  print(dim(ridb))

  cat("Checking for duplicates...")
  dups <- duplicates(ridb[, filter_cols])

  cat("\t[DONE]\nRemoving duplicates...")
  newridb <- summarise_duplicates(ridb, dups)

  cat("\t\t[DONE]\nCreating new dataset...")
  out_ridb <- drop_duplicates(newridb, dups)

  cat("\t\t[DONE]\nWriting new dataset...")
  write.table(out_ridb, na = "", file = out_file, quote = TRUE, sep = "\t",
              row.names = FALSE)
  cat("\t\t[DONE]\n")
  invisible(out_ridb)
}

# Run only when executed as a script (Rscript / R -f); when the file is
# source()d the functions above are defined without touching any files.
if (sys.nframe() == 0L) {
  main()
}