comparison volcanoplot.xml @ 9:ab01e379d29e draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/volcanoplot commit 060f6734402e7d2eced6227663cc61d2cb865ae5
author iuc
date Mon, 09 Dec 2024 16:04:24 +0000
parents 2f557f6abbfb
children 99ace6c1ff57
comparison
equal deleted inserted replaced
8:2f557f6abbfb 9:ab01e379d29e
62 } 62 }
63 #end if 63 #end if
64 64
65 # Format data ------------------------------------------------------------ 65 # Format data ------------------------------------------------------------
66 66
67 # Create columns from the column numbers specified 67 # Create columns from the column numbers specified (the optional category_symbol column is added below when shape_col is set)
68 results <- results %>% mutate(fdr = .[[$fdr_col]], 68 results <- results %>% mutate(
69 pvalue = .[[$pval_col]], 69 fdr = .[[$fdr_col]],
70 logfc = .[[$lfc_col]], 70 pvalue = .[[$pval_col]],
71 labels = .[[$label_col]]) 71 logfc = .[[$lfc_col]],
72 labels = .[[$label_col]],
73 )
74
75 # Check if shape_col is provided
76 #if $shape_col:
77 results <- results %>% mutate(category_symbol = .[[$shape_col]]) # Use the shape column if it exists
78 #end if
72 79
73 # Get names for legend 80 # Get names for legend
74 down <- unlist(strsplit('$plot_options.legend_labs', split = ","))[1] 81 down <- unlist(strsplit('$plot_options.legend_labs', split = ","))[1]
75 notsig <- unlist(strsplit('$plot_options.legend_labs', split = ","))[2] 82 notsig <- unlist(strsplit('$plot_options.legend_labs', split = ","))[2]
76 up <- unlist(strsplit('$plot_options.legend_labs', split = ","))[3] 83 up <- unlist(strsplit('$plot_options.legend_labs', split = ","))[3]
118 # Create plot ------------------------------------------------------------- 125 # Create plot -------------------------------------------------------------
119 126
120 # Open file to save plot as PDF 127 # Open file to save plot as PDF
121 pdf("volcano_plot.pdf") 128 pdf("volcano_plot.pdf")
122 129
123 # Set up base plot 130 # Set up base plot (point, shape and facet layers are added below)
124 p <- ggplot(data = results, aes(x = logfc, y = -log10(pvalue))) + 131 p <- ggplot(data = results, aes(x = logfc, y = -log10(pvalue))) +
125 geom_point(aes(colour = sig)) +
126 scale_color_manual(values = colours) + 132 scale_color_manual(values = colours) +
127 theme(panel.grid.major = element_blank(), 133 theme(panel.grid.major = element_blank(),
128 panel.grid.minor = element_blank(), 134 panel.grid.minor = element_blank(),
129 panel.background = element_blank(), 135 panel.background = element_blank(),
130 axis.line = element_line(colour = "black"), 136 axis.line = element_line(colour = "black"),
131 legend.key = element_blank()) 137 legend.key = element_blank())
138
139 # Add the point layer; when a category column is given, show categories either as facets or as point shapes
140 #if $shape_col:
141 if ('$shape_or_facet' == 'facet') {
142 p <- p + geom_point(aes(colour = sig)) + facet_wrap(~ category_symbol) # Points must be added here too: the base plot has no geom_point layer, so faceting alone would draw empty panels
143 } else {
144 p <- p + geom_point(aes(colour = sig, shape = factor(category_symbol))) # One panel; categories distinguished by point shape
145 }
146 #else:
147 p <- p + geom_point(aes(colour = sig)) # No category column: colour points by significance only
148 #end if
132 149
133 #if $labels.label_select != "none" 150 #if $labels.label_select != "none"
134 # Add gene labels 151 # Add gene labels
135 #if $plot_options.boxes 152 #if $plot_options.boxes
136 p <- p + geom_label_repel(data = filter(results, labels != ""), aes(label = labels), 153 p <- p + geom_label_repel(data = filter(results, labels != ""), aes(label = labels),
193 </param> 210 </param>
194 <param name="fdr_col" type="data_column" data_ref="input" label="FDR (adjusted P value) column number" /> 211 <param name="fdr_col" type="data_column" data_ref="input" label="FDR (adjusted P value) column number" />
195 <param name="pval_col" type="data_column" data_ref="input" label="P value (raw) column number" /> 212 <param name="pval_col" type="data_column" data_ref="input" label="P value (raw) column number" />
196 <param name="lfc_col" type="data_column" data_ref="input" label="Log Fold Change column number" /> 213 <param name="lfc_col" type="data_column" data_ref="input" label="Log Fold Change column number" />
197 <param name="label_col" type="data_column" data_ref="input" label="Labels column number" /> 214 <param name="label_col" type="data_column" data_ref="input" label="Labels column number" />
215 <param name="shape_col" type="data_column" data_ref="input" label="Categories that can be used to plot different shapes or facet (useful if multivariable associations are investigated)" optional="true" />
216 <param name="shape_or_facet" type="select" label="Display categories by:" help="Choose whether to display categories by faceting the plot or using shape." optional="true">
217 <option value="facet">Facet</option>
218 <option value="shape">Shape</option>
219 </param>
198 <param name="signif_thresh" type="float" max="1" value="0.05" label="Significance threshold" help="Default: 0.05"/> 220 <param name="signif_thresh" type="float" max="1" value="0.05" label="Significance threshold" help="Default: 0.05"/>
199 <param name="lfc_thresh" type="float" value="0" label="LogFC threshold to colour" help="Default: 0"/> 221 <param name="lfc_thresh" type="float" value="0" label="LogFC threshold to colour" help="Default: 0"/>
200 <conditional name="labels"> 222 <conditional name="labels">
201 <param name="label_select" type="select" label="Points to label" help="Select to label significant points or input labels from file. Default: None"> 223 <param name="label_select" type="select" label="Points to label" help="Select to label significant points or input labels from file. Default: None">
202 <option value="none" selected="True">None</option> 224 <option value="none" selected="True">None</option>
246 <assert_contents> 268 <assert_contents>
247 <has_size value= "933451" delta="1000" /> 269 <has_size value= "933451" delta="1000" />
248 </assert_contents> 270 </assert_contents>
249 </output> 271 </output>
250 </test> 272 </test>
273
251 <test expect_num_outputs="1"> 274 <test expect_num_outputs="1">
252 <!-- Ensure input labels and plot options work --> 275 <!-- Ensure input labels and plot options work -->
253 <param name="input" ftype="tabular" value="input.tab"/> 276 <param name="input" ftype="tabular" value="input.tab"/>
254 <param name="fdr_col" value="4" /> 277 <param name="fdr_col" value="4" />
255 <param name="pval_col" value="3" /> 278 <param name="pval_col" value="3" />
281 <has_size value= "933832" delta="1000" /> 304 <has_size value= "933832" delta="1000" />
282 </assert_contents> 305 </assert_contents>
283 </output> 306 </output>
284 <output name="rscript" value= "out.rscript" lines_diff="4"/> 307 <output name="rscript" value= "out.rscript" lines_diff="4"/>
285 </test> 308 </test>
309
310 <test expect_num_outputs="1">
311 <!-- Ensure input labels and plot options work with faceting -->
312 <param name="input" ftype="tabular" value="category.tab"/>
313 <param name="fdr_col" value="4" />
314 <param name="pval_col" value="3" />
315 <param name="lfc_col" value="2" />
316 <param name="label_col" value="1" />
317 <param name="shape_col" value="5" /> <!-- Assuming the shape is in column 5 -->
318 <param name="lfc_thresh" value="0" />
319 <param name="label_select" value="file"/>
320 <param name="label_file" ftype="tabular" value="labels.tab" />
321 <param name="shape_or_facet" value="facet" /> <!-- Testing the facet option -->
322 <output name="plot">
323 <assert_contents>
324 <has_size value="5007" delta="1000" />
325 </assert_contents>
326 </output>
327 </test>
328
329 <test expect_num_outputs="1">
330 <!-- Ensure input labels and plot options work with shape option -->
331 <param name="input" ftype="tabular" value="category.tab"/>
332 <param name="fdr_col" value="4" />
333 <param name="pval_col" value="3" />
334 <param name="lfc_col" value="2" />
335 <param name="label_col" value="1" />
336 <param name="shape_col" value="5" /> <!-- Assuming the shape is in column 5 -->
337 <param name="lfc_thresh" value="0" />
338 <param name="label_select" value="file"/>
339 <param name="label_file" ftype="tabular" value="labels.tab" />
340 <param name="shape_or_facet" value="shape" /> <!-- Testing the shape option -->
341 <output name="plot">
342 <assert_contents>
343 <has_size value="5533" delta="1000" />
344 </assert_contents>
345 </output>
346 </test>
347
286 </tests> 348 </tests>
287 <help><![CDATA[ 349 <help><![CDATA[
288 .. class:: infomark 350 .. class:: infomark
289 351
290 **What it does** 352 **What it does**
291 353
292 This tool creates a Volcano plot using ggplot2. Points can be labelled via ggrepel. It was inspired by this Getting Genetics Done `blog post`_. 354 This tool creates a Volcano plot using ggplot2. Points can be labelled via ggrepel. It was inspired by this Getting Genetics Done `blog post`_.
293 355
294 In statistics, a `Volcano plot`_ is a type of scatter-plot that is used to quickly identify changes in large data sets composed of replicate data. It plots significance versus fold-change on the y and x axes, respectively. These plots are increasingly common in omic experiments such as genomics, proteomics, and metabolomics where one often has a list of many thousands of replicate data points between two conditions and one wishes to quickly identify the most meaningful changes. A volcano plot combines a measure of statistical significance from a statistical test (e.g., a p value from an ANOVA model) with the magnitude of the change, enabling quick visual identification of those data-points (genes, etc.) that display large magnitude changes that are also statistically significant. 356 In statistics, a `Volcano plot`_ is a type of scatter plot that is used to quickly identify changes in large data sets composed of replicate data. It plots significance versus fold-change on the y and x axes, respectively. These plots are increasingly common in omic experiments such as genomics, proteomics, and metabolomics where one often has a list of many thousands of replicate data points between two conditions and one wishes to quickly identify the most meaningful changes. A volcano plot combines a measure of statistical significance from a statistical test (e.g., a p-value from an ANOVA model) with the magnitude of the change, enabling quick visual identification of those data points (genes, etc.) that display large magnitude changes that are also statistically significant.
295 357
296 A volcano plot is constructed by plotting the negative log of the p value on the y axis (usually base 10). This results in data points with low p values (highly significant) appearing toward the top of the plot. The x axis is the log of the fold change between the two conditions. The log of the fold change is used so that changes in both directions appear equidistant from the center. Plotting points in this way results in two regions of interest in the plot: those points that are found toward the top of the plot that are far to either the left- or right-hand sides. These represent values that display large magnitude fold changes (hence being left or right of center) as well as high statistical significance (hence being toward the top). 358 A volcano plot is constructed by plotting the negative log of the p-value on the y-axis (usually base 10). This results in data points with low p-values (highly significant) appearing toward the top of the plot. The x-axis is the log of the fold change between the two conditions. The log of the fold change is used so that changes in both directions appear equidistant from the center. Plotting points in this way results in two regions of interest in the plot: those points that are found toward the top of the plot that are far to either the left or right-hand sides. These represent values that display large magnitude fold changes (hence being left or right of center) as well as high statistical significance (hence being toward the top).
359
360 Additionally, users can specify a `shape_col` containing a categorical variable. The categories can be displayed either as distinct point shapes within a single plot or as separate facets (one panel per category), providing another layer of visual information. This feature is particularly useful when comparing multiple groups or conditions in the same plot.
297 361
298 Source: Wikipedia 362 Source: Wikipedia
299 363
300 ----- 364 -----
301 365
302 **Inputs** 366 **Inputs**
303 367
304 A tabular file containing the columns below (additional columns may be present): 368 A tabular file containing the columns below (additional columns may be present):
305 369
306 * P value 370 * P value
307 * FDR / adjusted P value 371 * FDR / adjusted P value
308 * Log fold change 372 * Log fold change
309 * Labels (e.g. Gene symbols or IDs) 373 * Labels (e.g. Gene symbols or IDs)
310 374 * Shape (optional; categorical data used for point shapes or facets)
311 All significant points, those meeting the specified FDR and Log Fold Change thresholds, will be coloured, red for upregulated, blue for downregulated. Users can choose to apply labels to the points (such as gene symbols) from the Labels column. To label all significant points, select "Significant" for the **Points to label** option, or to only label the top most significant specify a number under "Only label top most significant". Users can label any points of interest through selecting **Points to label** "Input from file" and providing a tabular labels file. The labels file must contain a header row and have the labels in the first column. These labels must match the labels in the main input file. 375
376 All significant points, those meeting the specified FDR and Log Fold Change thresholds, will be coloured: red for upregulated, blue for downregulated. Users can choose to apply labels to the points (such as gene symbols) from the Labels column. To label all significant points, select "Significant" for the **Points to label** option, or to only label the top most significant, specify a number under "Only label top most significant". Users can label any points of interest through selecting **Points to label** "Input from file" and providing a tabular labels file. The labels file must contain a header row and have the labels in the first column. These labels must match the labels in the main input file.
312 377
313 **Outputs** 378 **Outputs**
314 379
315 A PDF containing a Volcano plot like below. The R code can be output through *Output Options* in the tool form. 380 A PDF containing a Volcano plot like below. The R code can be output through *Output Options* in the tool form.
316 381
317 .. image:: $PATH_TO_IMAGES/volcano_plot.png 382 .. image:: $PATH_TO_IMAGES/volcano_plot.png
318 383
319 .. _Volcano plot: https://en.wikipedia.org/wiki/Volcano_plot_(statistics) 384 .. _Volcano plot: https://en.wikipedia.org/wiki/Volcano_plot_(statistics)
320 .. _blog post: https://gettinggeneticsdone.blogspot.com/2016/01/ 385 .. _blog post: https://gettinggeneticsdone.blogspot.com/2016/01/
386
321 387
322 ]]></help> 388 ]]></help>
323 <citations> 389 <citations>
324 <citation type="doi">10.1007/978-3-319-24277-4</citation> 390 <citation type="doi">10.1007/978-3-319-24277-4</citation>
325 </citations> 391 </citations>