Mercurial > repos > fgiacomoni > xseekerpreparator
comparison XSeekerPreparator.R @ 0:1660665c081e draft
#issue9 branch Updating - - Fxx
author | fgiacomoni |
---|---|
date | Thu, 19 Nov 2020 14:28:46 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1660665c081e |
---|---|
1 | |
2 | |
3 TOOL_NAME <- "XSeekerPreparator" | |
4 VERSION <- "1.1.2" | |
5 | |
6 OUTPUT_SPECIFIC_TOOL <- "XSeeker_Galaxy" | |
7 | |
8 ENRICHED_RDATA_VERSION <- paste("1.1.2", OUTPUT_SPECIFIC_TOOL, sep="-") | |
9 ENRICHED_RDATA_DOC <- sprintf(" | |
10 Welcome to the enriched <Version %s> of the output of CAMERA/xcms. | |
11 This doc was generated by the tool: %s - Version %s | |
12 To show the different variables contained in this rdata, type: | |
13 - `load('this_rdata.rdata', rdata_env <- new.env())` | |
14 - `names(rdata_env)` | |
15 | |
16 Sections | |
17 ###### | |
18 | |
19 | |
20 This tools helpers | |
21 ------ | |
22 The version number is somewhat special because the evolution of the | |
23 rdata's format is non-linear. | |
24 There may be different branches, each evolving separatly. | |
25 To reflect these branches's diversions, there may be a prepended | |
26 branch name following this format: | |
27 major.minor.patch-branch_name | |
28 Like this, we can process rdata with the same tool, and output | |
29 rdata formated differently, for each tool. | |
30 | |
31 | |
32 - enriched_rdata: | |
33 - Description: flag created by that tool to tell it was enriched. | |
34 - Retrieval method: enriched_rdata <- TRUE | |
35 | |
36 - enriched_rdata_version: | |
37 - Description: A flag created by that tool to tell which version of | |
38 this tool has enriched the rdata. | |
39 - Retrieval method: enriched_rdata_version <- sprintf(\"%s\", ENRICHED_RDATA_VERSION) | |
40 | |
41 - enriched_rdata_doc: | |
42 - Description: Contains the documentation string. | |
43 | |
44 Data from original mzxml file | |
45 ------ | |
46 - tic: | |
47 - Description: Those are the tic values from the original mzxml | |
48 file, extracted using xcms 2. | |
49 - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@tic | |
50 - xcms version: 2.0 | |
51 | |
52 - mz: | |
53 - Description: Those are the m/z values from the original mzxml | |
54 file, extracted using xcms 2. | |
55 - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@env$mz | |
56 - xcms version: 2.0 | |
57 | |
58 - scanindex: | |
59 - Description: Those are the scanindex values from the original mzxml | |
60 file, extracted using xcms 2. | |
61 - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@scanindex | |
62 - xcms version: 2.0 | |
63 | |
64 - scantime: | |
65 - Description: Those are the scantime values from the original mzxml | |
66 file, extracted using xcms 2. | |
67 - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@scantime | |
68 - xcms version: 2.0 | |
69 | |
70 - intensity: | |
71 - Description: Those are the intensity values from the original mzxml | |
72 file, extracted using xcms 2. | |
73 - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@env$intensity | |
74 - xcms version: 2.0 | |
75 | |
76 - polarity: | |
77 - Description: Those are the polarity values from the original mzxml | |
78 file, extracted using xcms 2. | |
79 - Retrieval method: as.character(xcms::xcmsRaw('original_file.mzxml')@polarity[[1]]) | |
80 - xcms version: 2.0 | |
81 | |
82 Data taken from incoming rdata | |
83 ------ | |
84 - variableMetadata: | |
85 - Description: Unmodified copy of variableMetadata from incoming rdata. | |
86 - Retrieval method: rdata_file$variableMetadata | |
87 | |
88 - process_params: | |
89 - Description: Those are the processing parameters values from the | |
90 curent rdata. They have been simplified to allow easy access like: | |
91 for (params in process_params) { | |
92 if (params[[\"xfunction\"]] == \"annotatediff\") { | |
93 process_peak_picking_params(params) | |
94 } | |
95 } | |
96 - Retrieval method: | |
97 ## just he same list, but simplified | |
98 process_params <- list() | |
99 for (list_name in names(rdata_file$listOFlistArguments)) { | |
100 param_list <- list() | |
101 for (param_name in names(rdata_file$listOFlistArguments[[list_name]])) { | |
102 param_list[[param_name]] <- rdata_file$listOFlistArguments[[list_name]][[param_name]] | |
103 } | |
104 process_params[[length(process_params)+1]] <- param_list | |
105 } | |
106 ", ENRICHED_RDATA_VERSION, TOOL_NAME, VERSION, ENRICHED_RDATA_VERSION) | |
107 | |
108 | |
109 | |
110 get_models <- function(path) { | |
111 if (is.null(path)) { | |
112 stop("No models to define the database schema") | |
113 } else { | |
114 message(sprintf("Loading models from %s", path)) | |
115 } | |
116 ## galaxy mangles the "@" to a "__at__" | |
117 if (substr(path, 1, 9) == "git__at__") { | |
118 path <- sub("^git__at__", "git@", path, perl=TRUE) | |
119 } | |
120 if ( | |
121 substr(path, 1, 4) == "git@" | |
122 || substr(path, length(path)-4, 4) == ".git" | |
123 ) { | |
124 return (get_models_from_git(path)) | |
125 } | |
126 if (substr(path, 1, 4) == "http") { | |
127 return (get_models_from_url(path)) | |
128 } | |
129 return (source(path)$value) | |
130 } | |
131 | |
132 get_models_from_git <- function (url, target_file="models.R", rm=TRUE) { | |
133 tmp <- tempdir() | |
134 message(sprintf("Cloning %s", url)) | |
135 system2("git", c("clone", url, tmp)) | |
136 result <- search_tree(file.path(tmp, dir), target_file) | |
137 if (!is.null(result)) { | |
138 models <- source(result)$value | |
139 if (rm) { | |
140 unlink(tmp, recursive=TRUE) | |
141 } | |
142 return (models) | |
143 } | |
144 if (rm) { | |
145 unlink(tmp, recursive=TRUE) | |
146 } | |
147 stop(sprintf( | |
148 "Could not find any file named \"%s\" in this repo", | |
149 target_file | |
150 )) | |
151 } | |
152 | |
153 get_models_from_url <- function (url, target_file="models.R", rm=TRUE) { | |
154 tmp <- tempdir() | |
155 message(sprintf("Downloading %s", url)) | |
156 result <- file.path(tmp, target_file) | |
157 if (download.file(url, destfile=result) == 0) { | |
158 models <- source(result)$value | |
159 if (rm) { | |
160 unlink(tmp, recursive=TRUE) | |
161 } | |
162 return (models) | |
163 } | |
164 if (rm) { | |
165 unlink(tmp, recursive=TRUE) | |
166 } | |
167 stop("Could not download any file at this adress.") | |
168 } | |
169 | |
170 search_tree <- function(path, target) { | |
171 target <- tolower(target) | |
172 for (file in list.files(path)) { | |
173 if (is.dir(file)) { | |
174 result <- search_tree(file.path(path, file), target) | |
175 if (!is.null(result)) { | |
176 return (result) | |
177 } | |
178 } else if (tolower(file) == target) { | |
179 return (file.path(path, file)) | |
180 } | |
181 } | |
182 return (NULL) | |
183 } | |
184 | |
185 create_database <- function(orm) { | |
186 orm$recreate_database(no_exists=FALSE) | |
187 set_database_version(orm, "created") | |
188 } | |
189 | |
190 insert_adducts <- function(orm) { | |
191 message("Creating adducts...") | |
192 adducts <- list( | |
193 list("[M-H2O-H]-",1,-1,-48.992020312000001069,1,0,0.5,"H0","H1O3"), | |
194 list("[M-H-Cl+O]-",1,-1,-19.981214542000000022,2,0,0.5,"O1","H1Cl1"), | |
195 list("[M-Cl+O]-",1,-1,-18.973389510000000512,3,0,0.5,"O1","Cl1"), | |
196 list("[M-3H]3-",1,-3,-3.0218293560000000219,4,0,1.0,"H0","H3"), | |
197 list("[2M-3H]3-",2,-3,-3.0218293560000000219,4,0,0.5,"H0","H3"), | |
198 list("[3M-3H]3-",3,-3,-3.0218293560000000219,4,0,0.5,"H0","H3"), | |
199 list("[M-2H]2-",1,-2,-2.0145529039999998666,5,0,1.0,"H0","H2"), | |
200 list("[2M-2H]2-",2,-2,-2.0145529039999998666,5,0,0.5,"H0","H2"), | |
201 list("[3M-2H]2-",3,-2,-2.0145529039999998666,5,0,0.5,"H0","H2"), | |
202 list("[M-H]-",1,-1,-1.0072764519999999333,6,1,1.0,"H0","H1"), | |
203 list("[2M-H]-",2,-1,-1.0072764519999999333,6,0,0.5,"H0","H1"), | |
204 list("[3M-H]-",3,-1,-1.0072764519999999333,6,0,0.5,"H0","H1"), | |
205 list("[M]+",1,1,-0.00054858000000000000945,7,1,1.0,"H0","H0"), | |
206 list("[M]-",1,-1,0.00054858000000000000945,8,1,1.0,"H0","H0"), | |
207 list("[M+H]+",1,1,1.0072764519999999333,9,1,1.0,"H1","H0"), | |
208 list("[2M+H]+",2,1,1.0072764519999999333,9,0,0.5,"H1","H0"), | |
209 list("[3M+H]+",3,1,1.0072764519999999333,9,0,0.25,"H1","H0"), | |
210 list("[M+2H]2+",1,2,2.0145529039999998666,10,0,0.75,"H2","H0"), | |
211 list("[2M+2H]2+",2,2,2.0145529039999998666,10,0,0.5,"H2","H0"), | |
212 list("[3M+2H]2+",3,2,2.0145529039999998666,10,0,0.25,"H2","H0"), | |
213 list("[M+3H]3+",1,3,3.0218293560000000219,11,0,0.75,"H3","H0"), | |
214 list("[2M+3H]3+",2,3,3.0218293560000000219,11,0,0.5,"H3","H0"), | |
215 list("[3M+3H]3+",3,3,3.0218293560000000219,11,0,0.25,"H3","H0"), | |
216 list("[M-2H+NH4]-",1,-1,16.019272654000001665,12,0,0.25,"N1H4","H2"), | |
217 list("[2M-2H+NH4]-",2,-1,16.019272654000001665,12,0,0.0,"N1H4","H2"), | |
218 list("[3M-2H+NH4]-",3,-1,16.019272654000001665,12,0,0.25,"N1H4","H2"), | |
219 list("[M+NH4]+",1,1,18.033825558000000199,13,1,1.0,"N1H4","H0"), | |
220 list("[2M+NH4]+",2,1,18.033825558000000199,13,0,0.5,"N1H4","H0"), | |
221 list("[3M+NH4]+",3,1,18.033825558000000199,13,0,0.25,"N1H4","H0"), | |
222 list("[M+H+NH4]2+",1,2,19.041102009999999467,14,0,0.5,"N1H5","H0"), | |
223 list("[2M+H+NH4]2+",2,2,19.041102009999999467,14,0,0.5,"N1H5","H0"), | |
224 list("[3M+H+NH4]2+",3,2,19.041102009999999467,14,0,0.25,"N1H5","H0"), | |
225 list("[M+Na-2H]-",1,-1,20.974668176000001551,15,0,0.75,"Na1","H2"), | |
226 list("[2M-2H+Na]-",2,-1,20.974668176000001551,15,0,0.25,"Na1","H2"), | |
227 list("[3M-2H+Na]-",3,-1,20.974668176000001551,15,0,0.25,"Na1","H2"), | |
228 list("[M+Na]+",1,1,22.989221080000000086,16,1,1.0,"Na1","H0"), | |
229 list("[2M+Na]+",2,1,22.989221080000000086,16,0,0.5,"Na1","H0"), | |
230 list("[3M+Na]+",3,1,22.989221080000000086,16,0,0.25,"Na1","H0"), | |
231 list("[M+H+Na]2+",1,2,23.996497531999999353,17,0,0.5,"Na1H1","H0"), | |
232 list("[2M+H+Na]2+",2,2,23.996497531999999353,17,0,0.5,"Na1H1","H0"), | |
233 list("[3M+H+Na]2+",3,2,23.996497531999999353,17,0,0.25,"Na1H1","H0"), | |
234 list("[M+2H+Na]3+",1,3,25.003773983999998619,18,0,0.25,"H2Na1","H0"), | |
235 list("[M+CH3OH+H]+",1,1,33.033491200000000276,19,0,0.25,"C1O1H5","H0"), | |
236 list("[M-H+Cl]2-",1,-2,33.962124838000001148,20,0,1.0,"Cl1","H1"), | |
237 list("[2M-H+Cl]2-",2,-2,33.962124838000001148,20,0,0.5,"Cl1","H1"), | |
238 list("[3M-H+Cl]2-",3,-2,33.962124838000001148,20,0,0.5,"Cl1","H1"), | |
239 list("[M+Cl]-",1,-1,34.969401290000000416,21,1,1.0,"Cl1","H0"), | |
240 list("[2M+Cl]-",2,-1,34.969401290000000416,21,0,0.5,"Cl1","H0"), | |
241 list("[3M+Cl]-",3,-1,34.969401290000000416,21,0,0.5,"Cl1","H0"), | |
242 list("[M+K-2H]-",1,-1,36.948605415999999479,22,0,0.5,"K1","H2"), | |
243 list("[2M-2H+K]-",2,-1,36.948605415999999479,22,0,0.0,"K1","H2"), | |
244 list("[3M-2H+K]-",3,-1,36.948605415999999479,22,0,0.0,"K1","H2"), | |
245 list("[M+K]+",1,1,38.963158319999998013,23,1,1.0,"K1","H0"), | |
246 list("[2M+K]+",2,1,38.963158319999998013,23,0,0.5,"K1","H0"), | |
247 list("[3M+K]+",3,1,38.963158319999998013,23,0,0.25,"K1","H0"), | |
248 list("[M+H+K]2+",1,2,39.970434771999997281,24,0,0.5,"K1H1","H0"), | |
249 list("[2M+H+K]2+",2,2,39.970434771999997281,24,0,0.5,"K1H1","H0"), | |
250 list("[3M+H+K]2+",3,2,39.970434771999997281,24,0,0.25,"K1H1","H0"), | |
251 list("[M+ACN+H]+",1,1,42.033825557999996646,25,0,0.25,"C2H4N1","H0"), | |
252 list("[2M+ACN+H]+",2,1,42.033825557999996646,25,0,0.25,"C2H4N1","H0"), | |
253 list("[M+2Na-H]+",1,1,44.971165708000000902,26,0,0.5,"Na2","H1"), | |
254 list("[2M+2Na-H]+",2,1,44.971165708000000902,26,0,0.25,"Na2","H1"), | |
255 list("[3M+2Na-H]+",3,1,44.971165708000000902,26,0,0.25,"Na2","H1"), | |
256 list("[2M+FA-H]-",2,-1,44.998202851999998586,27,0,0.25,"C1O2H2","H1"), | |
257 list("[M+FA-H]-",1,-1,44.998202851999998586,27,0,0.5,"C1O2H2","H1"), | |
258 list("[M+2Na]2+",1,2,45.978442160000000172,28,0,0.5,"Na2","H0"), | |
259 list("[2M+2Na]2+",2,2,45.978442160000000172,28,0,0.5,"Na2","H0"), | |
260 list("[3M+2Na]2+",3,2,45.978442160000000172,28,0,0.25,"Na2","H0"), | |
261 list("[M+H+2Na]3+",1,3,46.985718611999999438,29,0,0.25,"H1Na2","H0"), | |
262 list("[M+H+FA]+",1,1,47.012755755999997122,30,0,0.25,"C1O2H3","H0"), | |
263 list("[M+Hac-H]-",1,-1,59.013852915999997607,31,0,0.25,"C2O2H4","H1"), | |
264 list("[2M+Hac-H]-",2,-1,59.013852915999997607,31,0,0.25,"C2O2H4","H1"), | |
265 list("[M+IsoProp+H]+",1,1,61.064791327999998317,32,0,0.25,"C3H9O1","H0"), | |
266 list("[M+Na+K]2+",1,2,61.9523793999999981,33,0,0.5,"Na1K1","H0"), | |
267 list("[2M+Na+K]2+",2,2,61.9523793999999981,33,0,0.5,"Na1K1","H0"), | |
268 list("[3M+Na+K]2+",3,2,61.9523793999999981,33,0,0.25,"Na1K1","H0"), | |
269 list("[M+NO3]-",1,-1,61.988366450000000895,34,0,0.5,"N1O3","H0"), | |
270 list("[M+ACN+Na]+",1,1,64.015770185999997464,35,0,0.25,"C2H3N1Na1","H0"), | |
271 list("[2M+ACN+Na]+",2,1,64.015770185999997464,35,0,0.25,"C2H3N1Na1","H0"), | |
272 list("[M+NH4+FA]+",1,1,64.039304861999994502,36,0,0.25,"N1C1O2H6","H0"), | |
273 list("[M-2H+Na+FA]-",1,-1,66.980147479999999405,37,0,0.5,"NaC1O2H2","H2"), | |
274 list("[M+3Na]3+",1,3,68.967663239999993153,38,0,0.25,"Na3","H0"), | |
275 list("[M+Na+FA]+",1,1,68.99470038399999794,39,0,0.25,"Na1C1O2H2","H0"), | |
276 list("[M+2Cl]2-",1,-2,69.938802580000000832,40,0,1.0,"Cl2","H0"), | |
277 list("[2M+2Cl]2-",2,-2,69.938802580000000832,40,0,0.5,"Cl2","H0"), | |
278 list("[3M+2Cl]2-",3,-2,69.938802580000000832,40,0,0.5,"Cl2","H0"), | |
279 list("[M+2K-H]+",1,1,76.919040187999996758,41,0,0.5,"K2","H1"), | |
280 list("[2M+2K-H]+",2,1,76.919040187999996758,41,0,0.25,"K2","H1"), | |
281 list("[3M+2K-H]+",3,1,76.919040187999996758,41,0,0.25,"K2","H1"), | |
282 list("[M+2K]2+",1,2,77.926316639999996028,42,0,0.5,"K2","H0"), | |
283 list("[2M+2K]2+",2,2,77.926316639999996028,42,0,0.5,"K2","H0"), | |
284 list("[3M+2K]2+",3,2,77.926316639999996028,42,0,0.25,"K2","H0"), | |
285 list("[M+Br]-",1,-1,78.918886479999997619,43,1,1.0,"Br1","H0"), | |
286 list("[M+Cl+FA]-",1,-1,80.974880593999998268,44,0,0.5,"Cl1C1O2H2","H0"), | |
287 list("[M+AcNa-H]-",1,-1,80.995797543999998426,45,0,0.25,"C2H3Na1O2","H1"), | |
288 list("[M+2ACN+2H]2+",1,2,84.067651115999993292,46,0,0.25,"C4H8N2","H0"), | |
289 list("[M+K+FA]+",1,1,84.968637623999995868,47,0,0.25,"K1C1O2H2","H0"), | |
290 list("[M+Cl+Na+FA-H]-",1,-1,102.95682522200000619,48,0,0.5,"Cl1Na1C1O2H2","H1"), | |
291 list("[2M+3H2O+2H]+",2,1,104.03153939599999944,49,0,0.25,"H8O6","H0"), | |
292 list("[M+TFA-H]-",1,-1,112.98558742000000165,50,0,0.5,"C2F3O2H1","H1"), | |
293 list("[M+H+TFA]+",1,1,115.00014032400000019,51,0,0.25,"C2F3O2H2","H0"), | |
294 list("[M+3ACN+2H]2+",1,2,125.09420022199999778,52,0,0.25,"C6H11N3","H0"), | |
295 list("[M+NH4+TFA]+",1,1,132.02668943000000468,53,0,0.25,"N1C2F3O2H5","H0"), | |
296 list("[M+Na+TFA]+",1,1,136.98208495200000811,54,0,0.25,"Na1C2F3O2H1","H0"), | |
297 list("[M+Cl+TFA]-",1,-1,148.96226516199999423,55,0,0.5,"Cl1C2F3O2H1","H0"), | |
298 list("[M+K+TFA]+",1,1,152.95602219200000604,56,0,0.25,"K1C2F3O2H1","H0") | |
299 ) | |
300 dummy_adduct <- orm$adduct() | |
301 for (adduct in adducts) { | |
302 i <- 0 | |
303 dummy_adduct$set_name(adduct[[i <- i+1]]) | |
304 dummy_adduct$set_multi(adduct[[i <- i+1]]) | |
305 dummy_adduct$set_charge(adduct[[i <- i+1]]) | |
306 dummy_adduct$set_mass(adduct[[i <- i+1]]) | |
307 dummy_adduct$set_oidscore(adduct[[i <- i+1]]) | |
308 dummy_adduct$set_quasi(adduct[[i <- i+1]]) | |
309 dummy_adduct$set_ips(adduct[[i <- i+1]]) | |
310 dummy_adduct$set_formula_add(adduct[[i <- i+1]]) | |
311 dummy_adduct$set_formula_ded(adduct[[i <- i+1]]) | |
312 dummy_adduct$save() | |
313 dummy_adduct$clear(unset_id=TRUE) | |
314 } | |
315 message("Adducts created") | |
316 } | |
317 | |
318 insert_base_data <- function(orm, path, archetype=FALSE) { | |
319 if (archetype) { | |
320 ## not implemented yet | |
321 return () | |
322 } | |
323 base_data <- readLines(path) | |
324 for (sql in strsplit(paste(base_data, collapse=" "), ";")[[1]]) { | |
325 orm$execute(sql) | |
326 } | |
327 set_database_version(orm, "enriched") | |
328 } | |
329 | |
330 insert_compounds <- function(orm, compounds_path) { | |
331 compounds <- read.csv(file=compounds_path, sep="\t") | |
332 if (is.null(compounds <- translate_compounds(compounds))) { | |
333 stop("Could not find asked compound's attributes in csv file.") | |
334 } | |
335 dummy_compound <- orm$compound() | |
336 compound_list <- list() | |
337 for (i in seq_len(nrow(compounds))) { | |
338 dummy_compound$set_mz(compounds[i, "mz"]) | |
339 dummy_compound$set_name(compounds[i, "name"]) | |
340 dummy_compound$set_common_name(compounds[i, "common_name"]) | |
341 dummy_compound$set_formula(compounds[i, "formula"]) | |
342 # dummy_compound$set_mz(compounds[i, "mz"]) | |
343 # dummy_compound$set_mz(compounds[i, "mz"]) | |
344 compound_list[[length(compound_list)+1]] <- as.list( | |
345 dummy_compound, | |
346 c("mz", "name", "common_name", "formula") | |
347 ) | |
348 dummy_compound$clear(unset_id=TRUE) | |
349 } | |
350 dummy_compound$save(bulk=compound_list) | |
351 } | |
352 | |
353 translate_compounds <- function(compounds) { | |
354 recognized_headers <- list( | |
355 c("HMDB_ID", "MzBank", "X.M.H..", "X.M.H...1", "MetName", "ChemFormula", "INChIkey") | |
356 ) | |
357 header_translators <- list( | |
358 hmdb_header_translator | |
359 ) | |
360 for (index in seq_along(recognized_headers)) { | |
361 headers <- recognized_headers[[index]] | |
362 if (identical(colnames(compounds), headers)) { | |
363 return (header_translators[[index]](compounds)) | |
364 } | |
365 } | |
366 if (is.null(translator <- guess_translator(colnames(compounds)))) { | |
367 return (NULL) | |
368 } | |
369 return (csv_header_translator(translator, compounds)) | |
370 } | |
371 | |
372 guess_translator <- function(header) { | |
373 result <- list( | |
374 # HMDB_ID=NULL,< | |
375 mz=NULL, | |
376 name=NULL, | |
377 common_name=NULL, | |
378 formula=NULL, | |
379 # inchi_key=NULL | |
380 ) | |
381 asked_cols <- names(result) | |
382 for (asked_col in asked_cols) { | |
383 for (col in header) { | |
384 if ((twisted <- tolower(col)) == asked_col | |
385 || gsub("-", "_", twisted) == asked_col | |
386 || gsub(" ", "_", twisted) == asked_col | |
387 || tolower(gsub("(.)([A-Z])", "\\1_\\2", col)) == asked_col | |
388 ) { | |
389 result[[asked_col]] <- col | |
390 next | |
391 } | |
392 } | |
393 } | |
394 if (any(mapply(is.null, result))) { | |
395 return (NULL) | |
396 } | |
397 return (result) | |
398 } | |
399 | |
400 hmdb_header_translator <- function(compounds) { | |
401 return (csv_header_translator( | |
402 list( | |
403 HMDB_ID="HMDB_ID", | |
404 mz="MzBank", | |
405 name="MetName", | |
406 common_name="MetName", | |
407 formula="ChemFormula", | |
408 inchi_key="INChIkey" | |
409 ), compounds | |
410 )) | |
411 } | |
412 | |
413 csv_header_translator <- function(translation_table, csv) { | |
414 header_names <- names(translation_table) | |
415 result <- data.frame(1:nrow(csv)) | |
416 # colnames(result) <- header_names | |
417 for (i in seq_along(header_names)) { | |
418 result[, header_names[[i]]] <- csv[, translation_table[[i]]] | |
419 } | |
420 print(result[, "mz"]) | |
421 result[, "mz"] <- as.numeric(result[, "mz"]) | |
422 print(result[, "mz"]) | |
423 return (result) | |
424 } | |
425 | |
426 set_database_version <- function(orm, version) { | |
427 orm$set_tag( | |
428 version, | |
429 tag_name="database_version", | |
430 tag_table_name="XSeeker_tagging_table" | |
431 ) | |
432 } | |
433 | |
434 process_rdata <- function(orm, rdata, options) { | |
435 mzml_tmp_dir <- gather_mzml_files(rdata) | |
436 samples <- names(rdata$singlefile) | |
437 if (!is.null(options$samples)) { | |
438 samples <- samples[options$samples %in% samples] | |
439 } | |
440 show_percent <- ( | |
441 is.null(options$`not-show-percent`) | |
442 || options$`not-show-percent` == FALSE | |
443 ) | |
444 error <- tryCatch({ | |
445 process_sample_list( | |
446 orm, rdata, samples, | |
447 show_percent=show_percent | |
448 ) | |
449 NULL | |
450 }, error=function(e) { | |
451 message(e) | |
452 e | |
453 }) | |
454 if (!is.null(mzml_tmp_dir)) { | |
455 unlink(mzml_tmp_dir, recursive=TRUE) | |
456 } | |
457 if (!is.null(error)) { | |
458 stop(error) | |
459 } | |
460 } | |
461 | |
462 gather_mzml_files <- function(rdata) { | |
463 if (is.null(rdata$singlefile)) { | |
464 message("Extracting mxml files") | |
465 tmp <- tempdir() | |
466 rdata$singlefile <- utils::unzip(rdata$zipfile, exdir=tmp) | |
467 names(rdata$singlefile) <- tools::file_path_sans_ext(basename(rdata$singlefile)) | |
468 message("Extracted") | |
469 return (tmp) | |
470 } | |
471 return (NULL) | |
472 } | |
473 | |
474 process_sample_list <- function(orm, radta, sample_names, show_percent) { | |
475 file_grouping_var <- find_grouping_var(rdata$variableMetadata) | |
476 message("Processing samples.") | |
477 message(sprintf("File grouping variable: %s", file_grouping_var)) | |
478 if(is.null(file_grouping_var)) { | |
479 stop("Malformed variableMetada.") | |
480 } | |
481 | |
482 process_arg_list <- rdata$listOFlistArguments | |
483 process_params <- list() | |
484 for (list_name in names(process_arg_list)) { | |
485 param_list <- list() | |
486 for (param_name in names(process_arg_list[[list_name]])) { | |
487 param_list[[param_name]] <- process_arg_list[[list_name]][[param_name]] | |
488 } | |
489 process_params[[length(process_params)+1]] <- param_list | |
490 } | |
491 message("Parameters from previous processes extracted.") | |
492 | |
493 var_meta <- rdata$variableMetadata | |
494 align_group <- rep(0, nrow(var_meta)) | |
495 var_meta <- cbind(var_meta, align_group) | |
496 context <- new.env() | |
497 context$clusters <- list() | |
498 context$groupidx <- rdata$xa@xcmsSet@groupidx | |
499 context$peaks <- rdata$xa@xcmsSet@peaks | |
500 context$show_percent <- show_percent | |
501 | |
502 indices <- as.numeric(unique(var_meta[, file_grouping_var])) | |
503 smol_xcms_set <- orm$smol_xcms_set() | |
504 mz_tab_info <- new.env() | |
505 xcms_set <- rdata$xa@xcmsSet | |
506 g <- xcms::groups(xcms_set) | |
507 mz_tab_info$group_length <- nrow(g) | |
508 mz_tab_info$dataset_path <- xcms::filepaths(xcms_set) | |
509 mz_tab_info$sampnames <- xcms::sampnames(xcms_set) | |
510 mz_tab_info$sampclass <- xcms::sampclass(xcms_set) | |
511 mz_tab_info$rtmed <- g[,"rtmed"] | |
512 mz_tab_info$mzmed <- g[,"mzmed"] | |
513 mz_tab_info$smallmolecule_abundance_assay <- xcms::groupval(xcms_set, value="into") | |
514 blogified <- blob::blob(fst::compress_fst(serialize(mz_tab_info, NULL), compression=100)) | |
515 smol_xcms_set$set_raw(blogified)$save() | |
516 for (no in indices) { | |
517 sample_name <- names(rdata$singlefile)[[no]] | |
518 sample_path <- rdata$singlefile[[no]] | |
519 if ( | |
520 is.na(no) | |
521 || is.null(sample_path) | |
522 || !(sample_name %in% sample_names) | |
523 ) { | |
524 next | |
525 } | |
526 ms_file=xcms::xcmsRaw(sample_path) | |
527 env <- new.env() | |
528 env$variableMetadata <- var_meta[var_meta[, file_grouping_var]==no,] | |
529 env$tic <- ms_file@tic | |
530 env$mz <- ms_file@env$mz | |
531 env$scanindex <- ms_file@scanindex | |
532 env$scantime <- ms_file@scantime | |
533 env$intensity <- ms_file@env$intensity | |
534 env$polarity <- as.character(ms_file@polarity[[1]]) | |
535 env$sample_name <- sample_name | |
536 env$dataset_path <- sample_path | |
537 env$process_params <- process_params | |
538 env$enriched_rdata <- TRUE | |
539 env$enriched_rdata_version <- ENRICHED_RDATA_VERSION | |
540 env$tool_name <- TOOL_NAME | |
541 env$enriched_rdata_doc <- ENRICHED_RDATA_DOC | |
542 context$sample_no <- no | |
543 add_sample_to_database(orm, env, context, smol_xcms_set) | |
544 } | |
545 message("Features enrichment") | |
546 complete_features(orm, context) | |
547 message("Features enrichment done.") | |
548 return (NULL) | |
549 } | |
550 | |
551 find_grouping_var <- function(var_meta) { | |
552 for (grouping_var in c(".", "Bio")) { | |
553 if (!is.null(rdata$variableMetadata[[grouping_var]])) { | |
554 return (grouping_var) | |
555 } | |
556 } | |
557 return (NULL) | |
558 } | |
559 | |
560 add_sample_to_database <- function(orm, env, context, smol_xcms_set) { | |
561 message(sprintf("Processing sample %s", env$sample_name)) | |
562 sample <- ( | |
563 orm$sample() | |
564 $set_name(env$sample_name) | |
565 $set_path(env$dataset_path) | |
566 $set_kind("enriched_rdata") | |
567 $set_polarity( | |
568 if (is.null(env$polarity) || identical(env$polarity, character(0))) "" | |
569 else env$polarity | |
570 ) | |
571 $set_smol_xcms_set(smol_xcms_set) | |
572 $set_raw(blob::blob(fst::compress_fst( | |
573 serialize(env, NULL), | |
574 compression=100 | |
575 ))) | |
576 $save() | |
577 ) | |
578 load_variable_metadata(orm, sample, env$variableMetadata, context) | |
579 load_process_params(orm, sample, env$process_params) | |
580 message(sprintf("Sample %s inserted.", env$sample_name)) | |
581 return (sample) | |
582 } | |
583 | |
584 | |
585 load_variable_metadata <- function(orm, sample, var_meta, context) { | |
586 all_clusters <- orm$cluster()$all() | |
587 | |
588 next_feature_id <- get_next_id(orm$feature()$all(), "featureID") | |
589 next_cluster_id <- get_next_id(all_clusters, "clusterID") | |
590 next_pc_group <- get_next_id(all_clusters, "pc_group") | |
591 next_align_group <- get_next_id(all_clusters, "align_group") | |
592 message("Extracting features") | |
593 invisible(create_features( | |
594 orm, sample, var_meta, context, | |
595 next_feature_id, next_cluster_id, | |
596 next_pc_group, next_align_group | |
597 )) | |
598 message("Extracting features done.") | |
599 return (NULL) | |
600 } | |
601 | |
602 get_next_id <- function(models, attribute) { | |
603 if ((id <- models$max(attribute)) == Inf || id == -Inf) { | |
604 return (1) | |
605 } | |
606 return (id + 1) | |
607 } | |
608 | |
609 create_features <- function( | |
610 orm, sample, var_meta, context, | |
611 next_feature_id, next_cluster_id, | |
612 next_pc_group, next_align_group | |
613 ) { | |
614 field_names <- as.list(names(orm$feature()$fields__)) | |
615 field_names[field_names=="id"] <- NULL | |
616 | |
617 features <- list() | |
618 dummy_feature <- orm$feature() | |
619 | |
620 if (show_percent <- context$show_percent) { | |
621 percent <- -1 | |
622 total <- nrow(var_meta) | |
623 } | |
624 for (row in seq_len(nrow(var_meta))) { | |
625 if (show_percent && (row / total) * 100 > percent) { | |
626 percent <- percent + 1 | |
627 message("\r", sprintf("\r%d %%", percent), appendLF=FALSE) | |
628 } | |
629 | |
630 curent_var_meta <- var_meta[row, ] | |
631 | |
632 peak_list <- context$peaks[context$groupidx[[row]], ] | |
633 sample_peak_list <- peak_list[peak_list[, "sample"] == context$sample_no, , drop=FALSE] | |
634 if (!identical(sample_peak_list, numeric(0)) && !is.null(nrow(sample_peak_list)) && nrow(sample_peak_list) != 0) { | |
635 if (!is.na(int_o <- extract_peak_var(sample_peak_list, "into"))) { | |
636 dummy_feature$set_int_o(int_o) | |
637 } | |
638 if (!is.na(int_b <- extract_peak_var(sample_peak_list, "intb"))) { | |
639 dummy_feature$set_int_b(int_b) | |
640 } | |
641 if (!is.na(max_o <- extract_peak_var(sample_peak_list, "maxo"))) { | |
642 dummy_feature$set_max_o(max_o) | |
643 } | |
644 } | |
645 | |
646 set_feature_fields_from_var_meta(dummy_feature, curent_var_meta) | |
647 | |
648 dummy_feature$set_featureID(next_feature_id) | |
649 next_feature_id <- next_feature_id + 1 | |
650 fake_iso <- dummy_feature$get_iso() | |
651 iso <- extract_iso(fake_iso) | |
652 clusterID <- extract_clusterID(fake_iso, next_cluster_id) | |
653 context$clusterID <- clusterID | |
654 dummy_feature$set_iso(iso) | |
655 create_associated_cluster( | |
656 sample, dummy_feature, clusterID, | |
657 context, curent_var_meta, next_pc_group, | |
658 next_align_group | |
659 ) | |
660 next_align_group <- next_align_group + 1 | |
661 features[[length(features)+1]] <- as.list(dummy_feature, field_names) | |
662 dummy_feature$clear() | |
663 } | |
664 message("")## +\n for previous message | |
665 message("Saving features") | |
666 dummy_feature$save(bulk=features) | |
667 message("Saved.") | |
668 return (context$clusters) | |
669 } | |
670 | |
671 extract_peak_var <- function(peak_list, var_name, selector=max) { | |
672 value <- peak_list[, var_name] | |
673 names(value) <- NULL | |
674 return (selector(value)) | |
675 } | |
676 | |
677 set_feature_fields_from_var_meta <- function(feature, var_meta) { | |
678 if (!is.null(mz <- var_meta[["mz"]]) && !is.na(mz)) { | |
679 feature$set_mz(mz) | |
680 } | |
681 if (!is.null(mzmin <- var_meta[["mzmin"]]) && !is.na(mzmin)) { | |
682 feature$set_mz_min(mzmin) | |
683 } | |
684 if (!is.null(mzmax <- var_meta[["mzmax"]]) && !is.na(mzmax)) { | |
685 feature$set_mz_max(mzmax) | |
686 } | |
687 if (!is.null(rt <- var_meta[["rt"]]) && !is.na(rt)) { | |
688 feature$set_rt(rt) | |
689 } | |
690 if (!is.null(rtmin <- var_meta[["rtmin"]]) && !is.na(rtmin)) { | |
691 feature$set_rt_min(rtmin) | |
692 } | |
693 if (!is.null(rtmax <- var_meta[["rtmax"]]) && !is.na(rtmax)) { | |
694 feature$set_rt_max(rtmax) | |
695 } | |
696 if (!is.null(isotopes <- var_meta[["isotopes"]]) && !is.na(isotopes)) { | |
697 feature$set_iso(isotopes) | |
698 } | |
699 return (feature) | |
700 } | |
701 | |
702 extract_iso <- function(weird_data) { | |
703 if (grepl("^\\[\\d+\\]", weird_data)[[1]]) { | |
704 return (sub("^\\[\\d+\\]", "", weird_data, perl=TRUE)) | |
705 } | |
706 return (weird_data) | |
707 } | |
708 | |
709 extract_clusterID <- function(weird_data, next_cluster_id){ | |
710 if (grepl("^\\[\\d+\\]", weird_data)[[1]]) { | |
711 clusterID <- stringr::str_extract(weird_data, "^\\[\\d+\\]") | |
712 clusterID <- as.numeric(stringr::str_extract(clusterID, "\\d+")) | |
713 } else { | |
714 clusterID <- 0 | |
715 } | |
716 return (clusterID + next_cluster_id) | |
717 } | |
718 | |
719 create_associated_cluster <- function( | |
720 sample, feature, grouping_variable, | |
721 context, curent_var_meta, next_pc_group, next_align_group | |
722 ) { | |
723 pcgroup <- as.numeric(curent_var_meta[["pcgroup"]]) | |
724 adduct <- as.character(curent_var_meta[["adduct"]]) | |
725 annotation <- curent_var_meta[["isotopes"]] | |
726 grouping_variable <- as.character(grouping_variable) | |
727 if (is.null(cluster <- context$clusters[[grouping_variable]])) { | |
728 cluster <- context$clusters[[grouping_variable]] <- orm$cluster( | |
729 pc_group=pcgroup + next_pc_group, | |
730 adduct=adduct, | |
731 align_group=next_align_group, | |
732 # curent_group=curent_group, | |
733 clusterID=context$clusterID, | |
734 annotation=annotation | |
735 )$set_sample(sample) | |
736 } else { | |
737 if (context$clusterID != 0 && cluster$get_clusterID() == 0) { | |
738 cluster$set_clusterID(context$clusterID) | |
739 } | |
740 } | |
741 cluster$save() | |
742 feature$set_cluster(cluster) | |
743 return (feature) | |
744 } | |
745 | |
746 complete_features <- function(orm, context) { | |
747 for (cluster in context$clusters) { | |
748 features <- orm$feature()$load_by(cluster_id=cluster$get_id()) | |
749 if (features$any()) { | |
750 if (!is.null(rt <- features$mean("rt"))) { | |
751 cluster$set_mean_rt(rt)$save() | |
752 } | |
753 features_df <- as.data.frame(features) | |
754 central_feature <- features_df[grepl("^\\[M\\]", features_df[, "iso"]), ] | |
755 central_feature_into <- central_feature[["int_o"]] | |
756 if (!identical(central_feature_into, numeric(0)) && central_feature_into != 0) { | |
757 for (feature in as.vector(features)) { | |
758 feature$set_abundance( | |
759 feature$get_int_o() / central_feature_into * 100 | |
760 )$save() | |
761 } | |
762 } | |
763 } | |
764 } | |
765 return (NULL) | |
766 } | |
767 | |
768 load_process_params <- function(orm, sample, params) { | |
769 for (param_list in params) { | |
770 if (is.null(param_list[["xfunction"]])) { | |
771 next | |
772 } | |
773 if (param_list[["xfunction"]] == "annotatediff") { | |
774 load_process_params_peak_picking(orm, sample, param_list) | |
775 } | |
776 } | |
777 return (sample) | |
778 } | |
779 | |
780 load_process_params_peak_picking <- function(orm, sample, peak_picking_params) { | |
781 return (add_sample_process_parameters( | |
782 params=peak_picking_params, | |
783 params_translation=list( | |
784 ppm="ppm", | |
785 maxcharge="maxCharge", | |
786 maxiso="maxIso" | |
787 ), | |
788 param_model_generator=orm$peak_picking_parameters, | |
789 sample_param_setter=sample$set_peak_picking_parameters | |
790 )) | |
791 } | |
792 | |
793 add_sample_process_parameters <- function( | |
794 params, | |
795 params_translation, | |
796 param_model_generator, | |
797 sample_param_setter | |
798 ) { | |
799 model_params <- list() | |
800 for (rdata_param_name in names(params_translation)) { | |
801 database_param_name <- params_translation[[rdata_param_name]] | |
802 if (is.null(rdata_param <- params[[rdata_param_name]])) { | |
803 next | |
804 } | |
805 model_params[[database_param_name]] <- rdata_param | |
806 } | |
807 params_models <- do.call(param_model_generator()$load_by, model_params) | |
808 if (params_models$any()) { | |
809 params_model <- params_models$first() | |
810 } else { | |
811 params_model <- do.call(param_model_generator, model_params) | |
812 params_model$save() | |
813 } | |
814 return (sample_param_setter(params_model)$save()) | |
815 } | |
816 | |
817 | |
818 library(optparse) | |
819 | |
820 option_list <- list( | |
821 optparse::make_option( | |
822 c("-v", "--version"), | |
823 action="store_true", | |
824 help="Display this tool's version and exits" | |
825 ), | |
826 optparse::make_option( | |
827 c("-i", "--input"), | |
828 type="character", | |
829 help="The rdata path to import in XSeeker" | |
830 ), | |
831 optparse::make_option( | |
832 c("-s", "--samples"), | |
833 type="character", | |
834 help="Samples to visualise in XSeeker" | |
835 ), | |
836 optparse::make_option( | |
837 c("-B", "--archetype"), | |
838 type="character", | |
839 help="The name of the base database" | |
840 ), | |
841 optparse::make_option( | |
842 c("-b", "--database"), | |
843 type="character", | |
844 help="The base database's path" | |
845 ), | |
846 optparse::make_option( | |
847 c("-c", "--compounds-csv"), | |
848 type="character", | |
849 help="The csv containing compounds" | |
850 ), | |
851 optparse::make_option( | |
852 c("-m", "--models"), | |
853 type="character", | |
854 help="The path or url (must begin with http[s]:// or git@) to the database's models" | |
855 ), | |
856 optparse::make_option( | |
857 c("-o", "--output"), | |
858 type="character", | |
859 help="The path where to output sqlite" | |
860 ), | |
861 optparse::make_option( | |
862 c("-P", "--not-show-percent"), | |
863 action="store_true", | |
864 help="Flag not to show the percents", | |
865 default=FALSE | |
866 ) | |
867 ) | |
868 | |
869 options(error=function(){traceback(3)}) | |
870 | |
871 parser <- OptionParser(usage="%prog [options] file", option_list=option_list) | |
872 args <- parse_args(parser, positional_arguments=0) | |
873 | |
874 err_code <- 0 | |
875 | |
876 if (!is.null(args$options$version)) { | |
877 message(sprintf("%s %s", TOOL_NAME, VERSION)) | |
878 quit() | |
879 } | |
880 | |
881 models <- get_models(args$options$models) | |
882 orm <- DBModelR::ORM( | |
883 connection_params=list(dbname=args$options$output), | |
884 dbms="SQLite" | |
885 ) | |
886 | |
887 invisible(orm$models(models)) | |
888 invisible(create_database(orm)) | |
889 | |
890 message("Database model created") | |
891 | |
892 insert_adducts(orm) | |
893 | |
894 if (!is.null(args$options$database)) { | |
895 insert_base_data(orm, args$options$database) | |
896 } | |
897 message(sprintf("Base data inserted using %s.", args$options$database)) | |
898 | |
899 if (!is.null(args$options$archetype)) { | |
900 insert_base_data(orm, args$options$archetype, archetype=TRUE) | |
901 } | |
902 if (!is.null(args$options$`compounds-csv`)) { | |
903 insert_compounds(orm, args$options$`compounds-csv`) | |
904 } | |
905 | |
906 # if (!is.null(args$options$rdata)) { | |
907 # load_rdata_in_base(args$options$rdata, args$options$samples, args$options$`not-show-percent`) | |
908 # } | |
909 | |
910 | |
911 load(args$options$input, rdata <- new.env()) | |
912 | |
913 process_rdata(orm, rdata, args$options) | |
914 | |
915 quit(status=err_code) | |
916 | |
917 |