comparison filtering.xml @ 0:a2988d8d4b77 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/cardinal commit 0825a4ccd3ebf4ca8a298326d14f3e7b25ae8415
author galaxyp
date Mon, 01 Oct 2018 01:04:17 -0400
parents
children aac805a9d2ae
comparison
equal deleted inserted replaced
-1:000000000000 0:a2988d8d4b77
1 <tool id="cardinal_filtering" name="MSI filtering" version="@VERSION@.0">
2 <description>tool for filtering mass spectrometry imaging data</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements">
7 <requirement type="package" version="2.2.1">r-gridextra</requirement>
8 <requirement type="package" version="2.2.1">r-ggplot2</requirement>
9 </expand>
10 <command detect_errors="exit_code">
11 <![CDATA[
12
13 @INPUT_LINKING@
14 cat '${MSI_subsetting}' &&
15 Rscript '${MSI_subsetting}'
16
17 ]]>
18 </command>
19 <configfiles>
20 <configfile name="MSI_subsetting"><![CDATA[
21
22
23 ################################# load libraries and read file #################
24
25
26 library(Cardinal)
27 library(ggplot2)
28 library(gridExtra)
29
30 @READING_MSIDATA@
31
32
33 ########################### QC numbers ########################
34 ## Summary statistics of the unfiltered input; they fill the "before" column of the QC table
35 ## Number of features (m/z)
36 maxfeatures = length(features(msidata))
37 ## Range m/z
38 minmz = round(min(mz(msidata)), digits=2)
39 maxmz = round(max(mz(msidata)), digits=2)
40 ## Number of spectra (pixels)
41 pixelcount = length(pixels(msidata))
42 ## Range x coordinates
43 minimumx = min(coord(msidata)[,1])
44 maximumx = max(coord(msidata)[,1])
45 ## Range y coordinates
46 minimumy = min(coord(msidata)[,2])
47 maximumy = max(coord(msidata)[,2])
48 ## Number of intensities > 0; counted per pixel with colSums (returns doubles) so the total cannot overflow the integer range
49 npeaks= sum(colSums(spectra(msidata)[]>0, na.rm=TRUE))
50 ## Spectra multiplied with m/z (potential number of peaks); as.numeric prevents integer overflow (NA) on large files
51 numpeaks = as.numeric(ncol(spectra(msidata)[]))*nrow(spectra(msidata)[])
52 ## Percentage of intensities > 0
53 percpeaks = round(npeaks/numpeaks*100, digits=2)
54 ## Number of empty TICs
55 TICs = colSums(spectra(msidata)[], na.rm=TRUE)
56 NumemptyTIC = sum(TICs == 0)
57 ## median TIC
58 medint = round(median(TICs), digits=2)
59 ## Store features for QC plot
60 featuresinfile = mz(msidata)
61
62 ## Next steps will only run if there are more than 0 intensities/pixels/features in the file
63
64 ## run pixel filtering only if any intensity is > 0; any() avoids the integer overflow that sum() can hit on large files
65 if (any(spectra(msidata)[]>0, na.rm=TRUE))
66 {
67 ## prepare dataframe for QC of pixel distribution (will be overwritten in filtering of pixels condition)
68 position_df = cbind(coord(msidata)[,1:2], rep("$infile.element_identifier", times=ncol(msidata)))
69 colnames(position_df)[3] = "annotation"
70
71 ###################################### Filtering of pixels #####################
72 ################################################################################
73
74 ############ Pixels in two columns format: x and y in different columns #############
75
76 #if str($pixels_cond.pixel_filtering) == "two_columns":
77 print("two columns")
78
79 ## read tabular file
80 ## NOTE(review): annotation_file, tabular_header, column_x, column_y and column_names are not declared in the inputs visible in this file; presumably the reading_pixel_annotations macro provides them - confirm
81 input_list = read.delim("$pixels_cond.annotation_file", header = $pixels_cond.tabular_header,
82 stringsAsFactors = FALSE)
83 numberpixels = nrow(input_list)
84 inputpixels = input_list[,c($pixels_cond.column_x, $pixels_cond.column_y, $pixels_cond.column_names)]
85
86 ## rewrite into x = 1, y = 1 format and filter msidata, count validpixels
87 ## (one vectorised paste0 call; the former 1:nrow() loop also iterated over an empty table)
88 pixelvector = paste0("x = ", inputpixels[,1], ", ", "y = ", inputpixels[,2])
89 pixelsofinterest= pixels(msidata)[names(pixels(msidata)) %in% pixelvector]
90 msidata = msidata[,pixelsofinterest]
91 validpixels=ncol(msidata)
92
93 ## in case some pixels are left print annotation plot
94 colnames(inputpixels) = c("x", "y", "annotation")
95 position_df = merge(coord(msidata)[,1:2], inputpixels, by=c("x", "y"), all.x=TRUE)
96 colnames(position_df)[3] = "annotation"
97 position_df\$annotation = factor(position_df\$annotation)
98
99 ########### Pixels within x and y minima and maxima are kept ###################
100
101 #elif str($pixels_cond.pixel_filtering) == "pixel_range":
102 print("pixel range")
103
104 numberpixels = "range"
105 validpixels = "range"
106
107 ## only filter pixels if at least one pixel will be left: a pixel must lie inside the x range AND the y range,
108 ## so both conditions are combined per pixel before checking (separate x and y counts can pass without a common pixel)
109 keep_x = coord(msidata)\$x <= $pixels_cond.max_x_range & coord(msidata)\$x >= $pixels_cond.min_x_range
110 keep_y = coord(msidata)\$y <= $pixels_cond.max_y_range & coord(msidata)\$y >= $pixels_cond.min_y_range
111 if (any(keep_x & keep_y)){
112 msidata = msidata[, keep_x & keep_y]
113 }else{
114 msidata = msidata[,0]
115 print("no valid pixel found")}
116
117 ## update position_df for filtered pixels
118 position_df = cbind(coord(msidata)[,1:2], rep("$infile.element_identifier", times=ncol(msidata)))
119 colnames(position_df)[3] = "annotation"
120 position_df\$annotation = factor(position_df\$annotation)
121
122 #elif str($pixels_cond.pixel_filtering) == "none":
123 print("no pixel filtering")
124 numberpixels = 0
125 validpixels = 0
126
127 #end if
128 }else{
129 print("Inputfile has no intensities > 0")
130 ## the QC table is built in any case, so the pixel counts must be defined on this path as well
131 numberpixels = NA
132 validpixels = NA
133 }
134
135 ################################# filtering of features ######################
136 ##############################################################################
137
138 ####################### Keep m/z from tabular file #########################
139
140 ## feature filtering only when pixels/features/intensities are left
141 ## Feature (m/z) filtering: keep listed m/z, keep an m/z range, or remove windows around listed m/z; the result is saved afterwards
142 npeaks_before_filtering= sum(spectra(msidata)[]>0, na.rm=TRUE)
143
144 if (npeaks_before_filtering > 0)
145
146 {
147
148 #if str($features_cond.features_filtering) == "features_list":
149 print("feature list")
150
151 ## read tabular file, define starting row, extract and count valid features
152 input_features = read.delim("$mz_tabular", header = $features_cond.feature_header, stringsAsFactors = FALSE)
153 extracted_features = input_features[,$features_cond.feature_column]
154 numberfeatures = length(extracted_features)
155 ## is.numeric() also accepts integer columns, which the former class(x) == "numeric" test rejected
156 if (is.numeric(extracted_features)){
157 ### max digits given in the input file will be used to match m/z but the maximum is 4;
158 ### values without a decimal point count as 0 digits (splitting on "." into a 2-column matrix misaligned on them)
159 decimal_part = sub("^[^.]*\\.?", "", as.character(extracted_features))
160 max_digits = min(max(nchar(decimal_part), na.rm=TRUE), 4)
161
162 validfeatures = round(extracted_features, max_digits) %in% round(mz(msidata),max_digits)
163 featuresofinterest = features(msidata)[round(mz(msidata), digits = max_digits) %in% round(extracted_features[validfeatures], max_digits)]
164 validmz = length(unique(featuresofinterest))
165 }else{
166 validmz = 0
167 featuresofinterest = 0}
168
169 ### filter msidata for valid features
170 msidata = msidata[featuresofinterest,]
171
172 ############### features within a given range are kept #####################
173
174 #elif str($features_cond.features_filtering) == "features_range":
175 print("feature range")
176
177 numberfeatures = "range"
178 validmz = "range"
179
180 if (sum(mz(msidata) >= $features_cond.min_mz & mz(msidata) <= $features_cond.max_mz)> 0){
181 msidata = msidata[mz(msidata) >= $features_cond.min_mz & mz(msidata) <= $features_cond.max_mz,]
182 }else{
183 msidata = msidata[0,]
184 print("no valid mz range")}
185
186 ############### Remove m/z from tabular file #########################
187
188 #elif str($features_cond.features_filtering) == "remove_features":
189 print("remove features")
190
191 ## read tabular file, define starting row, extract and count valid features
192 ## NOTE(review): this branch reads removal_header / removal_column while the keep-list branch reads feature_header / feature_column, although both when-sections expand the same reading_1_column_mz_tabular macro - confirm the macro defines the removal_ names
193 input_features = read.delim("$mz_tabular", header = $features_cond.removal_header, stringsAsFactors = FALSE)
194 extracted_features = input_features[,$features_cond.removal_column]
195 numberfeatures = length(extracted_features)
196 if (is.numeric(extracted_features)){
197 print("input is numeric")
198 featuresofinterest = extracted_features
199 validmz = sum(featuresofinterest <= max(mz(msidata))& featuresofinterest >= min(mz(msidata)))
200 }else{featuresofinterest = 0
201 validmz = 0}
202
203 ### Here starts removal of features:
204 plusminus = $features_cond.removal_plusminus
205
206 mass_to_remove = numeric()
207 if (sum(featuresofinterest) > 0){
208 for (masses in featuresofinterest){
209 #if str($features_cond.units_removal) == "ppm":
210 plusminus = masses * $features_cond.removal_plusminus/1000000
211 #end if
212 current_mass = which(c(mz(msidata) <= masses + plusminus & mz(msidata) >= masses - plusminus))
213 mass_to_remove = append(mass_to_remove, current_mass)}}
214 ## an empty negative index selects nothing and would drop every feature, so only subset when a window matched
215 if (length(mass_to_remove) > 0){
216 msidata= msidata[-mass_to_remove, ]
217 }else{print("No features were removed as they were not fitting to m/z values and/or range")}
218
219 #elif str($features_cond.features_filtering) == "none":
220 print("no feature filtering")
221 validmz = 0
222 numberfeatures = 0
223
224 #end if
225
226 ## save msidata as Rfile
227 save(msidata, file="$msidata_filtered")
228 ## Number of empty TICs
229 TICs2 = colSums(spectra(msidata)[], na.rm=TRUE)
230 }else{
231 print("Inputfile or file filtered for pixels has no intensities > 0")
232 numberfeatures = NA
233 validmz = NA
234 ## Number of empty TICs
235 TICs2 = NA
236 }
237
238 #################### QC numbers #######################
239 ## Same summary statistics as before filtering, now for the filtered data; they fill the "filtered" column of the QC table
240
241 ## Number of features (m/z)
242 maxfeatures2 = length(features(msidata))
243 ## Range m/z
244 minmz2 = round(min(mz(msidata)), digits=2)
245 maxmz2 = round(max(mz(msidata)), digits=2)
246 ## Number of spectra (pixels)
247 pixelcount2 = length(pixels(msidata))
248 ## Range x coordinates
249 minimumx2 = min(coord(msidata)[,1])
250 maximumx2 = max(coord(msidata)[,1])
251 ## Range y coordinates
252 minimumy2 = min(coord(msidata)[,2])
253 maximumy2 = max(coord(msidata)[,2])
254 ## Number of intensities > 0
255 npeaks2= sum(spectra(msidata)[]>0, na.rm=TRUE)
256 ## Spectra multiplied with m/z (potential number of peaks); as.numeric prevents integer overflow (NA) on large files
257 numpeaks2 = as.numeric(ncol(spectra(msidata)[]))*nrow(spectra(msidata)[])
258 ## Percentage of intensities > 0
259 percpeaks2 = round(npeaks2/numpeaks2*100, digits=2)
260 ## Number of empty TICs
261 NumemptyTIC2 = sum(TICs2 == 0)
262 ## median TIC
263 medint2 = round(median(TICs2), digits=2)
264 ## Assemble the QC table: one row per property with its value before and after filtering
265 properties = c("Number of m/z features",
266 "Range of m/z values",
267 "Number of pixels",
268 "Range of x coordinates",
269 "Range of y coordinates",
270 "Intensities > 0",
271 "Median TIC per pixel",
272 "Number of empty spectra",
273 "pixel overview",
274 "feature overview")
275
276 before = c(paste0(maxfeatures),
277 paste0(minmz, " - ", maxmz),
278 paste0(pixelcount),
279 paste0(minimumx, " - ", maximumx),
280 paste0(minimumy, " - ", maximumy),
281 paste0(percpeaks, " %"),
282 paste0(medint),
283 paste0(NumemptyTIC),
284 paste0("input pixels: ", numberpixels),
285 paste0("input mz: ", numberfeatures))
286
287 filtered = c(paste0(maxfeatures2),
288 paste0(minmz2, " - ", maxmz2),
289 paste0(pixelcount2),
290 paste0(minimumx2, " - ", maximumx2),
291 paste0(minimumy2, " - ", maximumy2),
292 paste0(percpeaks2, " %"),
293 paste0(medint2),
294 paste0(NumemptyTIC2),
295 paste0("valid pixels: ", validpixels),
296 paste0("valid mz: ", validmz))
297
298 property_df = data.frame(properties, before, filtered)
299
300 ############################### PDF QC ################################
301 ## Writes filtertool_QC.pdf: a title with the input file name and the table of QC numbers; the pixel map and
302 ## the m/z histograms are only added when the filtered data still contains intensities > 0
303 pdf("filtertool_QC.pdf", fonts = "Times", pointsize = 12)
304 plot(0,type='n',axes=FALSE,ann=FALSE)
305 title(main=paste0("Qualitycontrol of filtering tool for file: \n\n", "$infile.display_name"))
306 grid.table(property_df, rows= NULL)
307
308 ## QC report with more than value-table: only when pixels/features/intensities are left
309 if (npeaks2 > 0)
310 {
311 ### visual pixel control
312 ## prefix every annotation level with a running number (1_, 2_, ...)
313 levels(position_df\$annotation) = factor(paste(1:length(levels(position_df\$annotation)), levels(position_df\$annotation), sep="_"))
314 ## tile plot of the remaining pixels, filled by annotation
315 pixel_image = ggplot(position_df, aes(x=x, y=y, fill=annotation))+
316 geom_tile(height = 1, width=1)+
317 coord_fixed()+
318 ggtitle("Spatial orientation of filtered pixels")+
319 theme_bw()+
320 theme(plot.title = element_text(hjust = 0.5))+
321 theme(text=element_text(family="ArialMT", face="bold", size=12))+
322 theme(legend.position="bottom",legend.direction="vertical")+
323 theme(legend.key.size = unit(0.2, "line"), legend.text = element_text(size = 6))+
324 guides(fill=guide_legend(ncol=4,byrow=TRUE))
325 ## mean x/y position per annotation; the running number is drawn there as a text label
326 coord_labels = aggregate(cbind(x,y)~annotation, data=position_df, mean, na.rm=TRUE, na.action="na.pass")
327 coord_labels\$file_number = 1:length(levels(position_df\$annotation))
328
329 for(file_count in 1:nrow(coord_labels))
330 {pixel_image = pixel_image + annotate("text",x=coord_labels[file_count,"x"],
331 y=coord_labels[file_count,"y"],label=toString(coord_labels[file_count,4]))}
332
333 print(pixel_image)
334
335 ### control features which are removed
336 hist(mz(msidata), xlab="m/z", main="Kept m/z values")
337 #if str($features_cond.features_filtering) == "none":
338 print("no difference histogram as no m/z filtering took place")
339 #else:
340 ## compare the m/z values before and after filtering; the removed m/z are only drawn if they differ
341 if (isTRUE(all.equal(featuresinfile, mz(msidata)))){
342 print("No difference in m/z values before and after filtering, no histogram drawn")
343 }else{
344 hist(setdiff(featuresinfile, mz(msidata)), xlab="m/z", main="Removed m/z values")}
345 #end if
346
347 dev.off()
348
349 }else{
350 print("Inputfile or filtered file has no intensities > 0")
351 dev.off()
352 }
353 ]]></configfile>
354 </configfiles>
355 <inputs>
356 <expand macro="reading_msidata"/>
357 <conditional name="pixels_cond">
358 <param name="pixel_filtering" type="select" label="Select pixel filtering option">
359 <option value="none" selected="True">none</option>
360 <option value="two_columns">list of pixel coordinates (tabular file)</option>
361 <option value="pixel_range">ranges for x and y (manually)</option>
362 </param>
363 <when value="none"/>
364 <when value="two_columns">
365 <expand macro="reading_pixel_annotations"/>
366
367
368 <param name="two_columns_pixel" type="data" format="tabular" label="Tabular file with pixel coordinates"
369 help="Column with x values, another with y values, another with pixel annotations"/>
370 <param name="pixel_column_x" data_ref="two_columns_pixel" label="Column with x values" type="data_column"/>
371 <param name="pixel_column_y" data_ref="two_columns_pixel" label="Column with y values" type="data_column"/>
372 <param name="annotation_column_xy" data_ref="two_columns_pixel" label="Column with annotations" type="data_column"/>
373 <param name="pixel_header" type="boolean" label="Tabular file contains a header line" truevalue="TRUE" falsevalue="FALSE"/>
374
375
376
377
378 </when>
379 <when value="pixel_range">
380 <param name="min_x_range" type="integer" value="0" label="Minimum value for x"/>
381 <param name="max_x_range" type="integer" value="100" label="Maximum value for x"/>
382 <param name="min_y_range" type="integer" value="0" label="Minimum value for y"/>
383 <param name="max_y_range" type="integer" value="100" label="Maximum value for y"/>
384 </when>
385 </conditional>
386
387 <conditional name="features_cond">
388 <param name="features_filtering" type="select" label="Select m/z feature filtering option">
389 <option value="none" selected="True">none</option>
390 <option value="features_list">keep a list of m/z (tabular file)</option>
391 <option value="features_range">m/z range (manually)</option>
392 <option value="remove_features">remove a list of m/z (tabular file)</option>
393 </param>
394 <when value="none"/>
395 <when value="features_list">
396 <expand macro="reading_1_column_mz_tabular" label="Tabular file with m/z features to keep"/>
397 </when>
398 <when value="features_range">
399 <param name="min_mz" type="float" value="1" label="Minimum value for m/z"/>
400 <param name="max_mz" type="float" value="100" label="Maximum value for m/z"/>
401 </when>
402 <when value="remove_features">
403 <expand macro="reading_1_column_mz_tabular" label="Tabular file with m/z features to remove"/>
404 <param name="removal_plusminus" type="float" value="20" label="Window in which all m/z will be removed" help="This value is the half window size, it will be added to and subtracted from the given input value"/>
405 <param name="units_removal" type="select" display="radio" optional ="False" label="units">
406 <option value="ppm" selected="True">ppm</option>
407 <option value="Da">Da</option>
408 </param>
409 </when>
410 </conditional>
411 </inputs>
412
413 <outputs>
414 <data format="rdata" name="msidata_filtered" label="${tool.name} on ${on_string}"/>
415 <data format="pdf" name="QC_overview" from_work_dir="filtertool_QC.pdf" label = "${tool.name} on ${on_string}: QC"/>
416 </outputs>
417 <tests>
418 <test>
419 <expand macro="infile_imzml"/>
420 <param name="pixel_filtering" value="pixel_range"/>
421 <param name="min_x_range" value="10"/>
422 <param name="max_x_range" value="20"/>
423 <param name="min_y_range" value="2"/>
424 <param name="max_y_range" value="2"/>
425 <output name="QC_overview" file="imzml_filtered2.pdf" compare="sim_size"/>
426 <output name="msidata_filtered" file="imzml_filtered2.RData" compare="sim_size"/>
427 </test>
428 <test>
429 <expand macro="infile_imzml"/>
430 <param name="pixel_filtering" value="pixel_range"/>
431 <param name="min_x_range" value="1"/>
432 <param name="max_x_range" value="20"/>
433 <param name="min_y_range" value="2"/>
434 <param name="max_y_range" value="2"/>
435 <param name="features_filtering" value="features_range"/>
436 <param name="min_mz" value="350" />
437 <param name="max_mz" value="500"/>
438 <output name="QC_overview" file="imzml_filtered3.pdf" compare="sim_size"/>
439 <output name="msidata_filtered" file="imzml_filtered3.RData" compare="sim_size"/>
440 </test>
441 <test>
442 <expand macro="infile_imzml"/>
443 <param name="pixel_filtering" value="two_columns"/>
444 <param name="annotation_file" ftype="tabular" value = "inputpixels_2column.tabular"/>
445 <param name="column_x" value="1"/>
446 <param name="column_y" value="3"/>
447 <param name="column_names" value="2"/>
448 <output name="QC_overview" file="imzml_filtered4.pdf" compare="sim_size"/>
449 <output name="msidata_filtered" file="imzml_filtered4.RData" compare="sim_size"/>
450 </test>
451 <test>
452 <expand macro="infile_imzml"/>
453 <param name="pixel_filtering" value="pixel_range"/>
454 <param name="min_x_range" value="0"/>
455 <param name="max_x_range" value="10"/>
456 <param name="min_y_range" value="2"/>
457 <param name="max_y_range" value="20"/>
458 <param name="features_filtering" value="features_list"/>
459 <param name="mz_tabular" ftype="tabular" value = "featuresofinterest5.tabular"/>
460 <param name="feature_column" value="1"/>
461 <param name="feature_header" value="0"/>
462 <output name="QC_overview" file="imzml_filtered5.pdf" compare="sim_size"/>
463 <output name="msidata_filtered" file="imzml_filtered5.RData" compare="sim_size" />
464 </test>
465 <test>
466 <expand macro="infile_analyze75"/>
467 <output name="QC_overview" file="analyze75_filtered2.pdf" compare="sim_size"/>
468 <output name="msidata_filtered" file="analyze_filteredoutside.RData" compare="sim_size" />
469 </test>
470 <test>
471 <param name="infile" value="preprocessed.RData" ftype="rdata"/>
472 <conditional name="outputs">
473 <param name="outputs_select" value="no_quality_control"/>
474 </conditional>
475 <output name="msidata_filtered" file="rdata_notfiltered.RData" compare="sim_size"/>
476 <output name="QC_overview" file="rdata_notfiltered.pdf" compare="sim_size" />
477 </test>
478 </tests>
479 <help>
480 <![CDATA[
481
482 @CARDINAL_DESCRIPTION@
483
484 -----
485
486 This tool provides options to filter (subset) pixels and m/z features of mass spectrometry imaging data.
487
488 @MSIDATA_INPUT_DESCRIPTION@
489
490 @SPECTRA_TABULAR_INPUT_DESCRIPTION@
491
492 @MZ_TABULAR_INPUT_DESCRIPTION@
493
494 **Options**
495
496 - pixel filtering/annotation: either with a tabular file containing x and y coordinates and pixel annotations or by defining a range for x and y by hand (for the latter no annotation is possible). Pixels that are not present in the dataset are ignored. In case none of the given pixels is present in the dataset, the output file will be empty and no further m/z filtering will be performed.
497 - m/z feature filtering: m/z values for filtering should be either imported as a tabular file containing the m/z of interest or by defining a range for the m/z values. m/z that are not present in the dataset are ignored. If all given m/z values or the m/z range is outside the dataset, the output file will be empty.
498 - m/z feature removing: perturbing m/z features such as matrix contaminants can be removed by specifying their m/z in a tabular file, optionally with a half window size in ppm or m/z for the window in which peaks should be removed.
499
500
501 **Tips**
502
503 - Numeric m/z features imported via a tabular file and m/z features of the dataset are rounded to 4 decimal places (or the maximum number of decimal places of the input m/z, if lower) and then matched. Therefore, it is recommended to use the filtering tool only for m/z which have been extracted from the same dataset. If the m/z values are from a different dataset, the tool "Join two files on column allowing a small difference" should be used to find corresponding m/z values, which can then be used for filtering.
504 - In case tabular file cannot be selected in drop-down menu: Datatype in Galaxy must be tabular otherwise file will not appear in selection window (if Galaxy auto-detection was wrong, datatype can be changed by pressing the pen button (edit attributes))
505
506
507 **Output**
508
509 - RData file with the MSI data filtered for pixels and/or m/z
510 - pdf with heatmap showing the pixels that are left after filtering and histograms of kept and removed m/z
511
512
513 ]]>
514 </help>
515 <expand macro="citations"/>
516 </tool>