Mercurial > repos > pieterlukasse > prims_metabolomics2
comparison Rscripts/filter-RIDB.R @ 0:dffc38727496
initial commit
author | pieter.lukasse@wur.nl |
---|---|
date | Sat, 07 Feb 2015 22:02:00 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dffc38727496 |
---|---|
1 ## | |
2 # | |
3 # Removes duplicates from an RI database | |
4 # | |
5 # Usage: | |
6 # Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt | |
7 # | |
8 ## | |
9 | |
# Commandline arguments: <input RI database> <output file>.
args <- commandArgs(TRUE)
# Fail early with a usage message instead of letting a missing argument turn
# into NA and surface later as an obscure read/write error.
if (length(args) < 2) {
  stop(
    "Usage: Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt",
    call. = FALSE
  )
}
# `ridb` holds the input path here; the script later rebinds it to the data.
ridb <- args[1]
out_file <- args[2]
14 | |
# Locate duplicated rows.
#
# Args:
#   dat: data frame (a matrix or plain vector is coerced with as.data.frame)
#        whose rows are compared on all columns.
#
# Returns:
#   Integer vector with one element per row of `dat`: NA when the row is the
#   first occurrence of its value combination, otherwise the row index of
#   that first occurrence.
duplicates <- function(dat) {
  dat <- as.data.frame(dat)
  # Pass the columns as unnamed sort keys: a column called e.g. "decreasing"
  # or "method" must not be interpreted as a named argument of order().
  s <- do.call(order, unname(as.list(dat)))
  # drop = FALSE keeps single-column input a data frame after row indexing.
  non.dup <- !duplicated(dat[s, , drop = FALSE])
  # Original row index of the head of each group of identical rows.
  orig.ind <- s[non.dup]
  # Broadcast every group head to all members of its group (sorted order).
  first.occ <- orig.ind[cumsum(non.dup)]
  # The heads themselves are not duplicates.
  first.occ[non.dup] <- NA
  # Undo the sort so the result lines up with the rows of `dat`.
  first.occ[order(s)]
}
24 | |
# Load the tab-separated RI database; `ridb` holds the input path up to here
# and the loaded data frame afterwards.
ridb <- read.csv(ridb, header = TRUE, sep = "\t")
## Filters on: CAS FORMULA Column type Column phase type Column name
filter_cols <- c(1, 3, 5, 6, 7)
cat("RIDB dimensions: ")
print(dim(ridb))
cat("Checking for duplicates...")
# For every row: NA if it is a first occurrence, else the index of the row it
# duplicates (compared on the filter columns only).
dups <- duplicates(ridb[, filter_cols])
cat("\t[DONE]\nRemoving duplicates...")
newridb <- ridb
newridb["min"] <- NA
newridb["max"] <- NA
newridb["orig.columns"] <- NA
# Group the duplicate rows by the first occurrence they point to; split()
# drops the NA entries, so only real duplicate groups remain.
dup_groups <- split(seq_along(dups), dups)
for (rows in dup_groups) {
  i <- dups[rows[1]]
  ri_values <- ridb$RI[c(i, rows)]
  # Replace duplicate rows with one row containing the median value
  newridb$RI[i] <- median(ri_values)
  newridb$min[i] <- min(ri_values)
  newridb$max[i] <- max(ri_values)
  newridb$orig.columns[i] <- paste(rows, collapse = ",")
}
cat("\t\t[DONE]\nCreating new dataset...")
# Keep first occurrences only. A logical mask is used instead of negative
# indexing because `x[-NULL, ]` selects zero rows, which would empty the
# whole database whenever no duplicates were found.
out_ridb <- newridb[is.na(dups), ]
cat("\t\t[DONE]\nWriting new dataset...")
write.table(out_ridb, na = "", file = out_file, quote = TRUE, sep = "\t",
            row.names = FALSE)
cat("\t\t[DONE]\n")