comparison dimsPredictPuritySingle.R @ 6:18c0038dde9c draft

"planemo upload for repository https://github.com/computational-metabolomics/mspurity-galaxy commit 2579c8746819670348c378f86116f83703c493eb"
author computational-metabolomics
date Thu, 04 Mar 2021 12:33:09 +0000
parents 7f3183988100
children 94a2b6571758
comparison
equal deleted inserted replaced
5:03b45dceb134 6:18c0038dde9c
1 library(msPurity) 1 library(msPurity)
2 library(optparse) 2 library(optparse)
3 print(sessionInfo()) 3 print(sessionInfo())
4 4
5 option_list <- list( 5 option_list <- list(
6 make_option(c("--mzML_file"), type="character"), 6 make_option(c("--mzML_file"), type = "character"),
7 make_option(c("--mzML_files"), type="character"), 7 make_option(c("--mzML_files"), type = "character"),
8 make_option(c("--mzML_filename"), type="character", default=''), 8 make_option(c("--mzML_filename"), type = "character", default = ""),
9 make_option(c("--mzML_galaxy_names"), type="character", default=''), 9 make_option(c("--mzML_galaxy_names"), type = "character", default = ""),
10 make_option(c("--peaks_file"), type="character"), 10 make_option(c("--peaks_file"), type = "character"),
11 make_option(c("-o", "--out_dir"), type="character"), 11 make_option(c("-o", "--out_dir"), type = "character"),
12 make_option("--minoffset", default=0.5), 12 make_option("--minoffset", default = 0.5),
13 make_option("--maxoffset", default=0.5), 13 make_option("--maxoffset", default = 0.5),
14 make_option("--ilim", default=0.05), 14 make_option("--ilim", default = 0.05),
15 make_option("--ppm", default=4), 15 make_option("--ppm", default = 4),
16 make_option("--dimspy", action="store_true"), 16 make_option("--dimspy", action = "store_true"),
17 make_option("--sim", action="store_true"), 17 make_option("--sim", action = "store_true"),
18 make_option("--remove_nas", action="store_true"), 18 make_option("--remove_nas", action = "store_true"),
19 make_option("--iwNorm", default="none", type="character"), 19 make_option("--iwNorm", default = "none", type = "character"),
20 make_option("--file_num_dimspy", default=1), 20 make_option("--file_num_dimspy", default = 1),
21 make_option("--exclude_isotopes", action="store_true"), 21 make_option("--exclude_isotopes", action = "store_true"),
22 make_option("--isotope_matrix", type="character") 22 make_option("--isotope_matrix", type = "character")
23 ) 23 )
24 24
25 # store options 25 # store options
26 opt<- parse_args(OptionParser(option_list=option_list)) 26 opt <- parse_args(OptionParser(option_list = option_list))
27 27
28 print(sessionInfo()) 28 print(sessionInfo())
29 print(opt) 29 print(opt)
30 30
31 print(opt$mzML_files) 31 print(opt$mzML_files)
32 print(opt$mzML_galaxy_names) 32 print(opt$mzML_galaxy_names)
33 33
34 str_to_vec <- function(x){ 34 str_to_vec <- function(x) {
35 print(x) 35 print(x)
36 x <- trimws(strsplit(x, ',')[[1]]) 36 x <- trimws(strsplit(x, ",")[[1]])
37 return(x[x != ""]) 37 return(x[x != ""])
38 } 38 }
39 39
40 find_mzml_file <- function(mzML_files, galaxy_names, mzML_filename){ 40 find_mzml_file <- function(mzML_files, galaxy_names, mzML_filename) {
41 mzML_filename <- trimws(mzML_filename) 41 mzML_filename <- trimws(mzML_filename)
42 mzML_files <- str_to_vec(mzML_files) 42 mzML_files <- str_to_vec(mzML_files)
43 galaxy_names <- str_to_vec(galaxy_names) 43 galaxy_names <- str_to_vec(galaxy_names)
44 if (mzML_filename %in% galaxy_names){ 44 if (mzML_filename %in% galaxy_names) {
45 return(mzML_files[galaxy_names==mzML_filename]) 45 return(mzML_files[galaxy_names == mzML_filename])
46 }else{ 46 }else{
47 stop(paste("mzML file not found - ", mzML_filename)) 47 stop(paste("mzML file not found - ", mzML_filename))
48 } 48 }
49 } 49 }
50 50
51 51
52 if (is.null(opt$dimspy)){ 52 if (is.null(opt$dimspy)) {
53 df <- read.table(opt$peaks_file, header = TRUE, sep='\t') 53 df <- read.table(opt$peaks_file, header = TRUE, sep = "\t")
54 if (file.exists(opt$mzML_file)){ 54 if (file.exists(opt$mzML_file)) {
55 mzML_file <- opt$mzML_file 55 mzML_file <- opt$mzML_file
56 }else if (!is.null(opt$mzML_files)){ 56 }else if (!is.null(opt$mzML_files)) {
57 mzML_file <- find_mzml_file(opt$mzML_files, opt$mzML_galaxy_names, 57 mzML_file <- find_mzml_file(opt$mzML_files, opt$mzML_galaxy_names,
58 opt$mzML_filename) 58 opt$mzML_filename)
59 }else{ 59 }else{
60 mzML_file <- file.path(opt$mzML_file, filename) 60 mzML_file <- file.path(opt$mzML_file, filename)
61 } 61 }
62 }else{ 62 }else{
63 indf <- read.table(opt$peaks_file, 63 indf <- read.table(opt$peaks_file,
64 header = TRUE, sep='\t', stringsAsFactors = FALSE) 64 header = TRUE, sep = "\t", stringsAsFactors = FALSE)
65 65
66 filename <- colnames(indf)[8:ncol(indf)][opt$file_num_dimspy] 66 filename <- colnames(indf)[8:ncol(indf)][opt$file_num_dimspy]
67 print(filename) 67 print(filename)
68 # check if the data file is mzML or RAW (can only use mzML currently) so 68 # check if the data file is mzML or RAW (can only use mzML currently) so
69 # we expect an mzML file of the same name in the same folder 69 # we expect an mzML file of the same name in the same folder
70 indf$i <- indf[,colnames(indf)==filename] 70 indf$i <- indf[, colnames(indf) == filename]
71 indf[,colnames(indf)==filename] <- as.numeric(indf[,colnames(indf)==filename]) 71 indf[, colnames(indf) == filename] <- as.numeric(indf[, colnames(indf) == filename])
72 72
73 filename = sub("raw", "mzML", filename, ignore.case = TRUE) 73 filename <- sub("raw", "mzML", filename, ignore.case = TRUE)
74 print(filename) 74 print(filename)
75 75
76 76 if (file.exists(opt$mzML_file)) {
77 if (file.exists(opt$mzML_file)){
78 mzML_file <- opt$mzML_file 77 mzML_file <- opt$mzML_file
79 }else if (!is.null(opt$mzML_files)){ 78 }else if (!is.null(opt$mzML_files)) {
80 mzML_file <- find_mzml_file(opt$mzML_files, opt$mzML_galaxy_names, filename) 79 mzML_file <- find_mzml_file(opt$mzML_files, opt$mzML_galaxy_names, filename)
81 }else{ 80 }else{
82 mzML_file <- file.path(opt$mzML_file, filename) 81 mzML_file <- file.path(opt$mzML_file, filename)
83 }
84
85 # Update the dimspy output with the correct information
86 df <- indf[4:nrow(indf),]
87 if ('blank_flag' %in% colnames(df)){
88 df <- df[df$blank_flag==1,]
89 } 82 }
90 colnames(df)[colnames(df)=='m.z'] <- 'mz' 83
91 84 # Update the dimspy output with the correct information
92 if ('nan' %in% df$mz){ 85 df <- indf[4:nrow(indf), ]
93 df[df$mz=='nan',]$mz <- NA 86 if ("blank_flag" %in% colnames(df)) {
87 df <- df[df$blank_flag == 1, ]
88 }
89 colnames(df)[colnames(df) == "m.z"] <- "mz"
90
91 if ("nan" %in% df$mz) {
92 df[df$mz == "nan", ]$mz <- NA
94 } 93 }
95 df$mz <- as.numeric(df$mz) 94 df$mz <- as.numeric(df$mz)
96 } 95 }
97 96
98 if (!is.null(opt$remove_nas)){ 97 if (!is.null(opt$remove_nas)) {
99 df <- df[!is.na(df$mz),] 98 df <- df[!is.na(df$mz), ]
100 } 99 }
101 100
102 if (is.null(opt$isotope_matrix)){ 101 if (is.null(opt$isotope_matrix)) {
103 im <- NULL 102 im <- NULL
104 }else{ 103 }else{
105 im <- read.table(opt$isotope_matrix, 104 im <- read.table(opt$isotope_matrix,
106 header = TRUE, sep='\t', stringsAsFactors = FALSE) 105 header = TRUE, sep = "\t", stringsAsFactors = FALSE)
107 } 106 }
108 107
109 if (is.null(opt$exclude_isotopes)){ 108 if (is.null(opt$exclude_isotopes)) {
110 isotopes <- FALSE 109 isotopes <- FALSE
111 }else{ 110 }else{
112 isotopes <- TRUE 111 isotopes <- TRUE
113 } 112 }
114 113
115 if (is.null(opt$sim)){ 114 if (is.null(opt$sim)) {
116 sim=FALSE 115 sim <- FALSE
117 }else{ 116 }else{
118 sim=TRUE 117 sim <- TRUE
119 } 118 }
120 119
121 minOffset = as.numeric(opt$minoffset) 120 minOffset <- as.numeric(opt$minoffset)
122 maxOffset = as.numeric(opt$maxoffset) 121 maxOffset <- as.numeric(opt$maxoffset)
123 122
124 if (opt$iwNorm=='none'){ 123 if (opt$iwNorm == "none") {
125 iwNorm = FALSE 124 iwNorm <- FALSE
126 iwNormFun = NULL 125 iwNormFun <- NULL
127 }else if (opt$iwNorm=='gauss'){ 126 }else if (opt$iwNorm == "gauss") {
128 iwNorm = TRUE 127 iwNorm <- TRUE
129 iwNormFun = msPurity::iwNormGauss(minOff=-minOffset, maxOff=maxOffset) 128 iwNormFun <- msPurity::iwNormGauss(minOff = -minOffset, maxOff = maxOffset)
130 }else if (opt$iwNorm=='rcosine'){ 129 }else if (opt$iwNorm == "rcosine") {
131 iwNorm = TRUE 130 iwNorm <- TRUE
132 iwNormFun = msPurity::iwNormRcosine(minOff=-minOffset, maxOff=maxOffset) 131 iwNormFun <- msPurity::iwNormRcosine(minOff = -minOffset, maxOff = maxOffset)
133 }else if (opt$iwNorm=='QE5'){ 132 }else if (opt$iwNorm == "QE5") {
134 iwNorm = TRUE 133 iwNorm <- TRUE
135 iwNormFun = msPurity::iwNormQE.5() 134 iwNormFun <- msPurity::iwNormQE.5()
136 } 135 }
137 136
138 print('FIRST ROWS OF PEAK FILE') 137 print("FIRST ROWS OF PEAK FILE")
139 print(head(df)) 138 print(head(df))
140 print(mzML_file) 139 print(mzML_file)
141 predicted <- msPurity::dimsPredictPuritySingle(df$mz, 140 predicted <- msPurity::dimsPredictPuritySingle(df$mz,
142 filepth=mzML_file, 141 filepth = mzML_file,
143 minOffset=minOffset, 142 minOffset = minOffset,
144 maxOffset=maxOffset, 143 maxOffset = maxOffset,
145 ppm=opt$ppm, 144 ppm = opt$ppm,
146 mzML=TRUE, 145 mzML = TRUE,
147 sim = sim, 146 sim = sim,
148 ilim = opt$ilim, 147 ilim = opt$ilim,
149 isotopes = isotopes, 148 isotopes = isotopes,
150 im = im, 149 im = im,
151 iwNorm = iwNorm, 150 iwNorm = iwNorm,
152 iwNormFun = iwNormFun 151 iwNormFun = iwNormFun
153 ) 152 )
154 predicted <- cbind(df, predicted) 153 predicted <- cbind(df, predicted)
155 154
156 print(head(predicted)) 155 print(head(predicted))
157 print(file.path(opt$out_dir, 'dimsPredictPuritySingle_output.tsv')) 156 print(file.path(opt$out_dir, "dimsPredictPuritySingle_output.tsv"))
158 157
159 write.table(predicted, 158 write.table(predicted,
160 file.path(opt$out_dir, 'dimsPredictPuritySingle_output.tsv'), 159 file.path(opt$out_dir, "dimsPredictPuritySingle_output.tsv"),
161 row.names=FALSE, sep='\t') 160 row.names = FALSE, sep = "\t")
162
163