comparison spatial_DGMM.xml @ 0:4cb6c83d3777 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/cardinal commit badc51fcd74ba0c14cd1ae64d5f524291fa11441"
author galaxyp
date Tue, 22 Feb 2022 20:51:09 +0000
parents
children db423b7bce78
comparison
equal deleted inserted replaced
-1:000000000000 0:4cb6c83d3777
1 <tool id="cardinal_single_ion_segmentation" name="MSI single ion segmentation" version="@VERSION@.0">
2 <description>mass spectrometry imaging spatial DGMM</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements">
7 <requirement type="package" version="2.3">r-gridextra</requirement>
8 </expand>
9 <command detect_errors="exit_code">
10 <![CDATA[
11
12 @INPUT_LINKING@
13 cat '${MSI_spatial_DGMM}' &&
14 Rscript '${MSI_spatial_DGMM}'
15
16 ]]>
17 </command>
18 <configfiles>
19 <configfile name="MSI_spatial_DGMM"><![CDATA[
20
21 ################################# load libraries and read file #################
22
23 library(Cardinal)
24 library(gridExtra)
25
26 @READING_MSIDATA_FULLY_COMPATIBLE@
27
28 #if str($sample_groups.group) == "multiple_groups":
29 ## Only rendered when the user selected multiple groups: read the annotation tabular and keep the x, y and annotation columns chosen by column index in the tool form
30 input_tabular <- read.delim("$sample_groups.annotation_file", header = $sample_groups.tabular_header, stringsAsFactors = FALSE)
31 annotation_input <- input_tabular[,c($sample_groups.column_x, $sample_groups.column_y, $sample_groups.column_names)]
32 annotation_name <- colnames(annotation_input)[3] ## keep the original annotation header; it is not referenced again in the part of this configfile shown here
33 colnames(annotation_input) <- c("x", "y", "annotation") ## rename annotations header to default name "annotation"
34
35 ## merge with coordinate information of msidata; pixel_index records the original pixel order so it can be restored after merge()
36 msidata_coordinates <- data.frame(coord(msidata)\$x, coord(msidata)\$y, c(1:ncol(msidata)))
37 colnames(msidata_coordinates) <- c("x", "y", "pixel_index")
38 merged_annotation <- merge(msidata_coordinates, annotation_input, by=c("x", "y"), all.x=TRUE) ## all.x=TRUE keeps every msidata pixel; pixels without a matching annotation row get NA
39 merged_annotation[is.na(merged_annotation)] <- "NA" ## every missing value in the merged table becomes the string "NA"
40 merged_annotation <- merged_annotation[order(merged_annotation\$pixel_index),] ## back to the pixel order of msidata
41 msidata\$annotation <- as.character(merged_annotation[,4]) ## column 4 is "annotation" (after x, y, pixel_index)
42 #end if
43
44 ## The next line is a macro token that is replaced with code from macros.xml; presumably it defines npeaks and property_df, which are used further down - TODO confirm
45 @DATA_PROPERTIES_INRAM@
46
47
48 ## remove duplicated coordinates: duplicated() flags the second and later pixels sharing a coordinate, so the first one is kept
49 msidata <- msidata[,!duplicated(coord(msidata))]
50
51
52 ######################################## PDF ###################################
53 ################################################################################
54 ################################################################################
55
56
57 pdf("single_ion_segmentation.pdf", fonts = "Times", pointsize = 12)
58 plot(0,type='n',axes=FALSE,ann=FALSE)
59 ## empty plot that only carries the heading with the name of the input file
60 title(main=paste0("Single ion segmentation for file: \n\n", "$infile.display_name"))
61 ## property_df and npeaks are not defined in this configfile; presumably they come from the DATA_PROPERTIES_INRAM macro in macros.xml - TODO confirm
62 grid.table(property_df, rows= NULL)
63
64 if (npeaks > 0)
65 {
66
67 ## set seed to make analysis reproducible
68 set.seed($setseed)
69
70 ## groups: a single group named after the input file, or the per-pixel annotation when multiple groups were selected
71 ## single ion segmentation
72 dgmm <- spatialDGMM(msidata,
73 r = c($r),
74 k = c($k),
75 #if str($sample_groups.group) == 'single_group':
76 groups = as.factor(rep("$infile.display_name", ncol(msidata))),
77 #else
78 groups = msidata\$annotation,
79 #end if
80 method = "$method",
81 dist = "$dist",
82 annealing = $annealing,
83 init = "$init",
84 p0 = $p0,
85 iter.max = $iter_max,
86 tol = $tol)
87
88 ## Summary results table
89 dgmm_summary <- as.data.frame(summary(dgmm))
90 colnames(dgmm_summary) <- c('r', 'k', 'Feature', 'Classes/group')
91 dgmm_summary\$'m/z' <- mz(msidata)
92 feature_n <- dgmm_summary\$Feature
93 dgmm_summary\$Feature <- NULL
94 dgmm_summary\$Feature <- feature_n ## dropped and re-added so that Feature ends up as the last column
95 write.table(dgmm_summary, file="$dgmm_summary", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
96
97 ## Results images: one class image per row of the summary table; seq_len() gives an empty loop instead of c(1, 0) if the table has no rows
98 for (dgmm_repeat in seq_len(nrow(dgmm_summary))){
99 print(image(dgmm, values="class", model=dgmm_repeat))}
100 dev.off() ## closes pdf file
101
102 ## optional outputs
103 pixel_names <- paste0("xy_", coord(dgmm)\$x, "_", coord(dgmm)\$y)
104 ## one table per model with x, y, pixel name, class and the probability columns
105 #if $output_probability:
106 dir.create("DGMM_probability")
107 for (dgmm_repeat in seq_len(nrow(dgmm_summary))){
108 name_repeat <- file.path("DGMM_probability", paste0("probability_r", dgmm_summary\$r[dgmm_repeat], "_k", dgmm_summary\$k[dgmm_repeat], "_mz", dgmm_summary\$`m/z`[dgmm_repeat], ".tabular"))
109 prob_df <- data.frame(coord(dgmm)\$x, coord(dgmm)\$y, pixel_names, resultData(dgmm, dgmm_repeat, "class"), resultData(dgmm, dgmm_repeat, "probability"))
110 colnames(prob_df)[1:4] <- c("x", "y", "pixel_names", "class")
111 write.table(prob_df, file=name_repeat, quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
112 }
113 #end if
114 ## one table per model with the "estimates" result data of that model
115 #if $output_estimates:
116 dir.create("DGMM_estimates")
117 for (dgmm_repeat in seq_len(nrow(dgmm_summary))){
118 name_repeat <- file.path("DGMM_estimates", paste0("estimates_r", dgmm_summary\$r[dgmm_repeat], "_k", dgmm_summary\$k[dgmm_repeat], "_mz", dgmm_summary\$`m/z`[dgmm_repeat], ".tabular"))
119 est_df <- resultData(dgmm, dgmm_repeat, "estimates")
120 write.table(est_df, file=name_repeat, quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
121 }
122 #end if
123 ## one PNG per model; each png device is opened and closed inside the loop
124 #if $output_plots:
125 dir.create("DGMM_plots")
126 for (dgmm_repeat in seq_len(nrow(dgmm_summary))){
127 name_repeat <- file.path("DGMM_plots", paste0("plot_r", dgmm_summary\$r[dgmm_repeat], "_k", dgmm_summary\$k[dgmm_repeat], "_mz", dgmm_summary\$`m/z`[dgmm_repeat], ".png"))
128 png(file=name_repeat)
129 print(plot(dgmm, model=dgmm_repeat, lwd=2))
130 dev.off()
131 }
132 #end if
133
134 ## optional output as .RData
135 #if $output_rdata:
136 ## save as (.RData)
137 save(dgmm, file="$dgmm_rdata")
138 #end if
139
140 }else{
141 print("Inputfile has no intensities > 0")
142 dev.off() ## closes pdf file here as well, so the device opened before the if is closed on both branches
143 }
144 ]]></configfile>
145 </configfiles>
146 <inputs>
147 <expand macro="reading_msidata"/>
148 <conditional name="sample_groups">
149 <param name="group" type="select" label="Dataset groups" help="Pixels from different groups will be segmented separately. For the validity of
150 downstream statistical analysis, it is important that each distinct observational unit (e.g., tissue sample) is assigned to a unique group">
151 <option value="single_group" selected="True">Dataset is a single group</option>
152 <option value="multiple_groups">Dataset contains multiple groups</option>
153 </param>
154 <when value="single_group"/>
155 <when value="multiple_groups">
156 <expand macro="reading_pixel_annotations"/>
157 </when>
158 </conditional>
159 <param name="r" type="text" value="2"
160 label="r" help="The spatial neighborhood radius of nearby pixels to consider. Only a single value is allowed">
161 <expand macro="sanitizer_multiple_digits"/>
162 </param>
163 <param name="k" type="text" value="5"
164 label="k" help="The maximum number of segments (clusters). The final number of segments may differ. Only a single value is allowed.">
165 <expand macro="sanitizer_multiple_digits"/>
166 </param>
167 <param name="method" type="select" display="radio"
168 label="weights method" help="The method to use to calculate the spatial smoothing weights. The 'gaussian' method refers to spatially-aware (SA) weights, and 'adaptive' refers to spatially-aware structurally-adaptive (SASA) weights">
169 <option value="gaussian" selected="True">gaussian</option>
170 <option value="adaptive">adaptive</option>
171 </param>
172 <param name="dist" type="select" display="radio"
173 label="distance metric" help="The type of distance metric to use when calculating neighboring pixels based on r. The options are ‘radial’, ‘manhattan’, ‘minkowski’, and ‘chebyshev’ (the
174 default).">
175 <option value="chebyshev" selected="True">chebyshev</option>
176 <option value="manhattan">manhattan</option>
177 <option value="radial">radial</option>
178 <option value="minkowski">minkowski</option>
179 </param>
180 <param name="annealing" type="boolean" label="annealing" help="Should simulated annealing be used during the optimization process to improve parameter estimates?" truevalue="TRUE" falsevalue="FALSE" />
181 <param name="init" type="select" display="radio"
182 label="init" help="Should the parameter estimates be initialized using k-means (’kmeans’) or Gaussian mixture model (’gmm’)?">
183 <option value="kmeans" selected="True">kmeans</option>
184 <option value="gmm">gmm</option>
185 </param>
186 <param name="p0" type="float" value="0.05" label="p0" help="A regularization parameter applied to estimated posterior class probabilities to avoid singularities. Must be positive for successful gradient descent optimization. Changing this value (within reason) should have only minimal impact on values of parameter estimates, but may greatly affect the algorithm’s speed and stability." />
187 <param name="iter_max" type="integer" value="100" label="iter.max" help="The maximum number of EM-algorithm iterations." />
188 <param name="tol" type="float" value="0.05" label="tolerance" help="The tolerance convergence criterion for the EM-algorithm. Corresponds to the
189 change in log-likelihood."/>
190 <param name="setseed" type="integer" value="1" label="set seed" help="Use same value to reproduce previous results"/>
191 <param name="output_estimates" type="boolean" label="Generate estimates results"/>
192 <param name="output_probability" type="boolean" label="Generate probability and class results"/>
193 <param name="output_plots" type="boolean" label="Generate plots results"/>
194 <param name="output_rdata" type="boolean" label="Results as .RData output"/>
195 </inputs>
196 <outputs>
197 <data format="tabular" name="dgmm_summary" label="${tool.name} on ${on_string}: summary"/>
198 <data format="pdf" name="file_info" from_work_dir="single_ion_segmentation.pdf" label = "${tool.name} on ${on_string}: file_info"/>
199 <data format="rdata" name="dgmm_rdata" label="${tool.name} on ${on_string}: dgmm.RData">
200 <filter>output_rdata</filter>
201 </data>
202 <collection name="estimates_output" type="list" label="${tool.name} logs: ${on_string}: estimates">
203 <filter>output_estimates</filter>
204 <discover_datasets pattern="__designation_and_ext__" directory="DGMM_estimates" format="tabular"/>
205 </collection>
206 <collection name="probability_output" type="list" label="${tool.name} logs: ${on_string}: probability">
207 <filter>output_probability</filter>
208 <discover_datasets pattern="__designation_and_ext__" directory="DGMM_probability" format="tabular"/>
209 </collection>
210 <collection name="plots_output" type="list" label="${tool.name} logs: ${on_string}: plots">
211 <filter>output_plots</filter>
212 <discover_datasets pattern="__designation_and_ext__" directory="DGMM_plots" format="png"/> <!-- the script writes .png files into DGMM_plots, so the declared format is png rather than tabular -->
213 </collection>
214 </outputs>
215 <tests>
216 <test>
217 <param name="infile" value="" ftype="imzml">
218 <composite_data value="spatial_DGMM_input.imzML"/>
219 <composite_data value="spatial_DGMM_input.ibd"/>
220 </param>
221 <param name="r" value="1"/>
222 <param name="k" value="6"/>
223 <param name="method" value="adaptive"/>
224 <param name="dist" value="radial"/>
225 <param name="annealing" value="TRUE"/>
226 <param name="output_estimates" value="True"/>
227 <param name="output_probability" value="True"/>
228 <output name="file_info" file="dgmm_test1.pdf" compare="sim_size"/>
229 <output name="dgmm_summary" file="dgmm_summary1.tabular"/>
230 <output_collection name="estimates_output" type="list" count="10">
231 <element name="estimates_r1_k6_mz1135.93347167969" file="estimates_r1_k6_mz1135.93347167969.tabular"/>
232 </output_collection>
233 <output_collection name="probability_output" type="list" count="10">
234 <element name="probability_r1_k6_mz1023.70806884766" file="probability_r1_k6_mz1023.70806884766.tabular"/>
235 </output_collection>
236 </test>
237 <test>
238 <param name="infile" value="" ftype="imzml">
239 <composite_data value="spatial_DGMM_input.imzML"/>
240 <composite_data value="spatial_DGMM_input.ibd"/>
241 </param>
242 <conditional name="sample_groups">
243 <param name="group" value="multiple_groups"/>
244 <param name="annotation_file" value="DGMM_annotations.tabular"/>
245 <param name="column_x" value="1"/>
246 <param name="column_y" value="2"/>
247 <param name="column_names" value="3"/>
248 <param name="tabular_header" value="True"/>
249 </conditional>
250 <param name="r" value="2"/>
251 <param name="k" value="10"/>
252 <param name="annealing" value="TRUE"/>
253 <param name="output_estimates" value="True"/>
254 <param name="output_probability" value="True"/>
255 <param name="output_plots" value="True"/>
256 <param name="output_rdata" value="True"/>
257 <output name="file_info" file="dgmm_test2.pdf" compare="sim_size"/>
258 <output name="dgmm_summary" file="dgmm_summary2.tabular"/>
259 <output name="dgmm_rdata" file="dgmm_test2.RData" compare="sim_size"/>
260 <output_collection name="estimates_output" type="list" count="10">
261 <element name="estimates_r2_k10_mz1200.46533203125" file="estimates_r2_k10_mz1200.46533203125.tabular"/>
262 </output_collection>
263 <output_collection name="probability_output" type="list" count="10">
264 <element name="probability_r2_k10_mz1135.93347167969" file="probability_r2_k10_mz1135.93347167969.tabular"/>
265 </output_collection>
266 </test>
267 </tests>
268 <help>
269 <![CDATA[
270
271 @CARDINAL_DESCRIPTION@
272
273 -----
274
275 This tool fits spatially-aware Dirichlet Gaussian mixture models (DGMM) to each feature and each run in a mass spectrometry imaging
276 experiment. Each image is segmented and the means and variances of all Gaussian components are estimated. A linear filter with a spatial kernel is applied to the component probabilities to achieve
277 spatial smoothing. Simulated annealing is used in the EM-algorithm to avoid local optima and achieve more accurate parameter estimates.
278
279 @MSIDATA_INPUT_DESCRIPTION@
280 - NA intensities are not allowed
281 - duplicated coordinates will be removed
282 - It is highly recommended to use a dataset that is reduced for the number of m/z features e.g. pre-processed, binned, filtered for m/z of interest in order to keep computational times reasonable. In addition, it is beneficial to run the tool first without generating all possible results data and upon inspection of the summary of the results decide on the best tool parameters and m/z features (which can be filtered in the MSI filtering tool).
283
284 @SPECTRA_TABULAR_INPUT_DESCRIPTION@
285
286 **Tips**
287
288 - The input dataset should contain as few m/z features as possible to keep computational times reasonable. In addition, it is beneficial to run the tool first without generating all possible results data and upon inspection of the summary of the results decide on the best tool parameters and m/z features (which can be filtered in the MSI filtering tool).
289 - Pixels from distinct observational units (e.g. sample, patient) should be assigned to a unique group via the annotation file and segmented separately for the validity of downstream statistical analysis.
290
291 **Output**
292
293 - Pdf with file info and an image of the clusters for each m/z feature
294 - Tabular file summarizing spatial DGMM performance for each feature
295 - (optional) Tabular files for each spatial DGMM run and feature with
296
297 - probabilities (The probability of class membership for each Gaussian component) and classes (The predicted Gaussian component)
298 - estimates (A list giving the parameter estimates for the means and variances for each Gaussian component)
299 - (optional) Visualization of features density plots
300 - (optional) .RData file which contains the segmentation results and can be used for further exploration in R using the Cardinal package
301
302 ]]>
303 </help>
304 <citations>
305 <citation type="doi">10.1093/bioinformatics/btv146</citation>
306 <citation type="doi">10.1093/bioinformatics/btz345</citation>
307 </citations>
308
309 </tool>