Mercurial > repos > galaxyp > cardinal_classification

diff classification.xml @ 14:ece627528a78 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/cardinal commit 888b3e991d0752b694bf480531ce0e5318c2f337-dirty"
author: galaxyp
date: Fri, 07 May 2021 10:10:35 +0000
parents: 24c000517173
children: f28ad96b76dc
--- a/classification.xml	Wed Dec 23 21:59:28 2020 +0000
+++ b/classification.xml	Fri May 07 10:10:35 2021 +0000
@@ -1,4 +1,4 @@
-<tool id="cardinal_classification" name="MSI classification" version="@VERSION@.0">
+<tool id="cardinal_classification" name="MSI classification" version="@VERSION@.1">
     <description>spatial classification of mass spectrometry imaging data</description>
     <macros>
         <import>macros.xml</import>
@@ -25,7 +25,7 @@
 library(Cardinal)
 library(gridExtra)
 library(ggplot2)
-
+library(scales)
 
 @READING_MSIDATA@
 
@@ -57,7 +57,7 @@
 
 
 ##################### I) numbers and control plots #############################
-###############################################################################
+################################################################################
 
 ## table with values
 grid.table(property_df, rows= NULL)
@@ -67,7 +67,7 @@
 
     opar <- par()
 
-    ######################## II) Training #############################
+    ######################## II) Training #######################################
     #############################################################################
     #if str( $type_cond.type_method) == "training":
         print("training")
@@ -90,21 +90,26 @@
             merged_response = merge(msidata_coordinates, y_input, by=c("x", "y"), all.x=TRUE)
             merged_response[is.na(merged_response)] = "NA"
             merged_response = merged_response[order(merged_response\$pixel_index),]
-            y_vector = as.factor(merged_response[,4])
+            conditions = as.factor(merged_response[,4])
+            y_vector = conditions
 
     ## plot of y vector
 
-    position_df = cbind(coord(msidata)[,1:2], y_vector)
-    y_plot = ggplot(position_df, aes(x=x, y=y, fill=y_vector))+
+    position_df = cbind(coord(msidata)[,1:2], conditions)
+    y_plot = ggplot(position_df, aes(x=x, y=y, fill=conditions))+
            geom_tile() +
            coord_fixed()+
-           ggtitle("Distribution of the response variable y")+
-           theme_bw()+
+           ggtitle("Distribution of the conditions")+
+		theme_bw()+
+                theme(
+	       plot.background = element_blank(),
+	       panel.grid.major = element_blank(),
+	       panel.grid.minor = element_blank())+
            theme(text=element_text(family="ArialMT", face="bold", size=15))+
            theme(legend.position="bottom",legend.direction="vertical")+
            guides(fill=guide_legend(ncol=4,byrow=TRUE))
-    coord_labels = aggregate(cbind(x,y)~y_vector, data=position_df, mean, na.rm=TRUE, na.action="na.pass")
-    coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$y_vector)
+    coord_labels = aggregate(cbind(x,y)~conditions, data=position_df, mean, na.rm=TRUE, na.action="na.pass")
+    coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$conditions)
     print(y_plot)
 
 
@@ -119,7 +124,11 @@
                geom_tile() +
                coord_fixed()+
                ggtitle("Distribution of the fold variable")+
-               theme_bw()+
+	       theme_bw()+
+               theme(
+	       plot.background = element_blank(),
+	       panel.grid.major = element_blank(),
+	       panel.grid.minor = element_blank())+
                theme(text=element_text(family="ArialMT", face="bold", size=15))+
                theme(legend.position="bottom",legend.direction="vertical")+
                guides(fill=guide_legend(ncol=4,byrow=TRUE))
@@ -276,7 +285,11 @@
                        geom_tile() +
                        coord_fixed()+
                        ggtitle("Predicted condition for each pixel")+
-                       theme_bw()+
+			theme_bw()+
+		        theme(
+		       plot.background = element_blank(),
+		       panel.grid.major = element_blank(),
+		       panel.grid.minor = element_blank())+
                        theme(text=element_text(family="ArialMT", face="bold", size=15))+
                        theme(legend.position="bottom",legend.direction="vertical")+
                        guides(fill=guide_legend(ncol=4,byrow=TRUE))
@@ -443,7 +456,11 @@
                        geom_tile() +
                        coord_fixed()+
                        ggtitle("Predicted condition for each pixel")+
-                       theme_bw()+
+			theme_bw()+
+			theme(
+		       plot.background = element_blank(),
+		       panel.grid.major = element_blank(),
+		       panel.grid.minor = element_blank())+
                        theme(text=element_text(family="ArialMT", face="bold", size=15))+
                        theme(legend.position="bottom",legend.direction="vertical")+
                        guides(fill=guide_legend(ncol=4,byrow=TRUE))
@@ -507,6 +524,11 @@
                 maximumy = max(coord(msidata.cv.ssc)[,2])
                 image(msidata.cv.ssc, model = list( r = r_value, s = s_value ), ylim= c(maximumy+0.2*maximumy,minimumy-0.2*minimumy),layout=c(1,1))
 
+		#if $type_cond.method_cond.ssc_analysis_cond.write_best_params:
+                	write.table(r_value, file="$best_r", quote = FALSE, row.names = FALSE, col.names=FALSE, sep = "\t")
+                	write.table(s_value, file="$best_s", quote = FALSE, row.names = FALSE, col.names=FALSE, sep = "\t")
+                #end if
+                
                 ## print table with summary in pdf
                 par(opar)
                 plot(0,type='n',axes=FALSE,ann=FALSE)
@@ -542,9 +564,12 @@
                 number_groups = length(levels(y_vector))
 
                 ## SSC analysis and plot
-                msidata.ssc <- spatialShrunkenCentroids(msidata, y = y_vector, .fold = fold_vector, 
+                msidata.ssc <- spatialShrunkenCentroids(msidata, y = y_vector,
 r = c($type_cond.method_cond.ssc_r), s = c($type_cond.method_cond.ssc_s), method = "$type_cond.method_cond.ssc_kernel_method")
-                plot(msidata.ssc, mode = "tstatistics", model = list("r" = c($type_cond.method_cond.ssc_r), "s" = c($type_cond.method_cond.ssc_s)))
+                plot(msidata.ssc, mode = "tstatistics", model = list("r" = c($type_cond.method_cond.ssc_r), "s" = c($type_cond.method_cond.ssc_s)), 
+			col=hue_pal()(length(levels(msidata.ssc\$classes[[1]]))), lwd=2)
+                
+
 
                 ### summary table SSC
                 ##############summary_table = summary(msidata.ssc)
@@ -582,6 +607,7 @@
 
                 ## m/z and pixel information output
                 ssc_classes = data.frame(msidata.ssc\$classes[[1]])
+                ssc_probabilities = data.frame(msidata.ssc\$probabilities[[1]])
 
                 ## pixel names and coordinates
                 ## to remove potential sample names and z dimension, split at comma and take only x and y 
@@ -596,39 +622,36 @@
                 rm(msidata)
                 gc()
 
-                ssc_classes2 = data.frame(pixel_names, x_coordinates, y_coordinates, ssc_classes)
-                colnames(ssc_classes2) = c("pixel names", "x", "y","predicted condition")
+                ssc_classes2 = data.frame(pixel_names, x_coordinates, y_coordinates, ssc_classes, ssc_probabilities)
+                colnames(ssc_classes2) = c("pixel names", "x", "y","predicted condition", levels(msidata.ssc\$classes[[1]]))
                 ssc_toplabels = topFeatures(msidata.ssc, n=Inf)
                 ssc_toplabels[,6:9] <-round(ssc_toplabels[,6:9],6)
                 write.table(ssc_toplabels, file="$mzfeatures", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
                 write.table(ssc_classes2, file="$pixeloutput", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
-
-                ## image with predicted classes
-                prediction_df = cbind(coord(msidata.ssc)[,1:2], ssc_classes)
-                colnames(prediction_df) = c("x", "y", "predicted_classes")
+     
+                image(msidata.ssc, model=list(r = c($type_cond.method_cond.ssc_r), s = c($type_cond.method_cond.ssc_s)), 
+			col=hue_pal()(length(levels(msidata.ssc\$classes[[1]]))), mode="classes", layout=c(1,1), main="Class Prediction")
+                image(msidata.ssc, model=list(r = c($type_cond.method_cond.ssc_r), s = c($type_cond.method_cond.ssc_s)), 
+			col=hue_pal()(length(levels(msidata.ssc\$classes[[1]]))), mode="probabilities", layout=c(1,1), main="Class probabilities")
 
-                prediction_plot = ggplot(prediction_df, aes(x=x, y=y, fill=predicted_classes))+
-                       geom_tile() +
-                       coord_fixed()+
-                       ggtitle("Predicted condition for each pixel")+
-                       theme_bw()+
-                       theme(text=element_text(family="ArialMT", face="bold", size=15))+
-                       theme(legend.position="bottom",legend.direction="vertical")+
-                       guides(fill=guide_legend(ncol=4,byrow=TRUE))
-                coord_labels = aggregate(cbind(x,y)~predicted_classes, data=prediction_df, mean, na.rm=TRUE, na.action="na.pass")
-                coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$predicted_classes)
-                print(prediction_plot)
-                
                 
                ## image with right and wrong classes: 
+               prediction_df = cbind(coord(msidata.ssc)[,1:2], ssc_classes)
+               colnames(prediction_df) = c("x", "y", "predicted_classes")
                comparison_df = cbind(prediction_df, y_vector)
                comparison_df\$correct<- ifelse(comparison_df\$predicted_classes==comparison_df\$y_vector, T, F)
+	       correctness = round(sum(comparison_df\$correct)/length(comparison_df\$correct)*100,2)
 
                correctness_plot = ggplot(comparison_df, aes(x=x, y=y, fill=correct))+
                        geom_tile() +
                        coord_fixed()+
-                       ggtitle("Correctness of classification")+
-                       theme_bw()+
+                       ggtitle(paste0("Correctness of classification: ",correctness, "%"))+
+                       scale_fill_manual(values = c("TRUE" = "orange","FALSE" = "darkblue"))+
+		       theme_bw()+
+                       theme(
+    		       plot.background = element_blank(),
+   		       panel.grid.major = element_blank(),
+  		       panel.grid.minor = element_blank())+
                        theme(text=element_text(family="ArialMT", face="bold", size=15))+
                        theme(legend.position="bottom",legend.direction="vertical")+
                        guides(fill=guide_legend(ncol=2,byrow=TRUE))
@@ -668,6 +691,16 @@
             merged_response = merged_response[order(merged_response\$pixel_index),]
             new_y_vector = as.factor(merged_response[,4])
             prediction = predict(training_data,msidata, newy = new_y_vector)
+            
+	    ## Summary table prediction
+	    summary_table = summary(prediction)\$accuracy[[names(prediction@resultData)]]
+	    summary_table2 = round(as.numeric(summary_table), digits=2)
+	    summary_matrix = matrix(summary_table2, nrow=4, ncol=ncol(summary_table))
+	    summary_table3 = cbind(rownames(summary_table), summary_matrix) ## include rownames in table
+	    summary_table4 = t(summary_table3)
+	    summary_table5 = cbind(c(names(prediction@resultData),colnames(summary_table)), summary_table4)
+	    plot(0,type='n',axes=FALSE,ann=FALSE)
+	   grid.table(summary_table5, rows= NULL)
 
         #else
             prediction = predict(training_data,msidata)
@@ -684,57 +717,65 @@
         predicted_toplabels = topFeatures(prediction, n=Inf)
         if (colnames(predicted_toplabels)[4] == "coefficients"){
             predicted_toplabels[,4:6] <-round(predicted_toplabels[,4:6],5)
-
         }else{
             predicted_toplabels[,6:9] <-round(predicted_toplabels[,6:9],5)}
+            
+        ##predicted classes
+        prediction_df = cbind(coord(prediction)[,1:2], predicted_classes)
+        colnames(prediction_df) = c("x", "y", "predicted_classes")
+            
+        #if str($type_cond.classification_type) == "SSC_classifier":
+            ## this seems to work only for SSC, therefore overwrite tables
+            predicted_probabilities = data.frame(prediction\$probabilities[[1]])
+            predicted_classes2 = data.frame(pixel_names, x_coordinates, y_coordinates, predicted_classes, predicted_probabilities)
+            colnames(predicted_classes2) = c("pixel names", "x", "y","predicted condition", levels(prediction\$classes[[1]]))
+            ## also image modes are specific to SSC
+            image(prediction, mode="classes", layout=c(1,1), main="Class", col=hue_pal()(length(unique(prediction\$classes[[1]]))))
+            image(prediction, mode="probabilities", layout=c(1,1), main="Class probabilities", col=hue_pal()(length(unique(prediction\$classes[[1]]))))
+
+	#else
+        
+                prediction_plot = ggplot(prediction_df, aes(x=x, y=y, fill=predicted_classes))+
+        	geom_tile()+
+        	coord_fixed()+
+        	ggtitle("Predicted condition for each spectrum")+
+        	theme_bw()+
+        	theme(
+        	plot.background = element_blank(),
+        	panel.grid.major = element_blank(),
+        	panel.grid.minor = element_blank())+
+        	theme(text=element_text(family="ArialMT", face="bold", size=15))+
+        	theme(legend.position="bottom", legend.direction="vertical")+
+        	guides(fill=guide_legend(ncol=4, byrow=TRUE))
+        	coord_labels = aggregate(cbind(x,y)~predicted_classes, data=prediction_df, mean, na.rm=TRUE, na.action="na.pass")
+        	coord_labels\$file_number = gsub( "_.*§", "", coord_labels\$predicted_classes)
+        	print(prediction_plot)
+        #end if
+        
         write.table(predicted_toplabels, file="$mzfeatures", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
         write.table(predicted_classes2, file="$pixeloutput", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
 
-        ## image with predicted classes
 
-        prediction_df = cbind(coord(prediction)[,1:2], predicted_classes)
-        colnames(prediction_df) = c("x", "y", "predicted_classes")
 
-        prediction_plot = ggplot(prediction_df, aes(x=x, y=y, fill=predicted_classes))+
-               geom_tile() +
-               coord_fixed()+
-               ggtitle("Predicted condition for each pixel")+
-               theme_bw()+
-               theme(text=element_text(family="ArialMT", face="bold", size=15))+
-               theme(legend.position="bottom",legend.direction="vertical")+
-               guides(fill=guide_legend(ncol=4,byrow=TRUE))
-        coord_labels = aggregate(cbind(x,y)~predicted_classes, data=prediction_df, mean, na.rm=TRUE, na.action="na.pass")
-        coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$predicted_classes)
-        print(prediction_plot)
         
+        #if str($type_cond.new_y_values_cond.new_y_values) == "new_response":
         ## image with right and wrong classes:     
 
 	comparison_df = cbind(prediction_df, new_y_vector)
-	comparison_df\$correct<- as.factor(ifelse(comparison_df\$predicted_classes==comparison_df\$new_y_vector, T, F))
+	comparison_df\$correct<- ifelse(comparison_df\$predicted_classes==comparison_df\$new_y_vector, T, F)
+        correctness = round(sum(comparison_df\$correct)/length(comparison_df\$correct)*100,2)
 
 	correctness_plot = ggplot(comparison_df, aes(x=x, y=y, fill=correct))+
                        geom_tile()+
                        scale_fill_manual(values = c("TRUE" = "orange","FALSE" = "darkblue"))+
                        coord_fixed()+
-                       ggtitle("Correctness of classification")+
+                       ggtitle(paste0("Correctness of classification: ",correctness, "%"))+
                        theme_bw()+
                        theme(text=element_text(family="ArialMT", face="bold", size=15))+
                        theme(legend.position="bottom",legend.direction="vertical")+
                        guides(fill=guide_legend(ncol=2,byrow=TRUE))
-               ## coord_labels = aggregate(cbind(x,y)~correct, data=comparison_df, mean, na.rm=TRUE, na.action="na.pass")
-                ##coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$predicted_classes)
         print(correctness_plot)
-                
-
-        ## Summary table prediction
-        summary_table = summary(prediction)\$accuracy[[names(prediction@resultData)]]
-        summary_table2 = round(as.numeric(summary_table), digits=2)
-        summary_matrix = matrix(summary_table2, nrow=4, ncol=ncol(summary_table))
-        summary_table3 = cbind(rownames(summary_table), summary_matrix) ## include rownames in table
-        summary_table4 = t(summary_table3)
-        summary_table5 = cbind(c(names(prediction@resultData),colnames(summary_table)), summary_table4)
-        plot(0,type='n',axes=FALSE,ann=FALSE)
-        grid.table(summary_table5, rows= NULL)
+        #end if      
 
         ## optional output as .RData
         #if $output_rdata:
@@ -833,8 +874,9 @@
                                 <option value="ssc_cvapply" selected="True">cvApply</option>
                                 <option value="ssc_analysis">spatial shrunken centroids analysis</option>
                             </param>
-                            <when value="ssc_cvapply"/>
-
+                            <when value="ssc_cvapply">
+                                  <param name="write_best_params" type="boolean" label="Write out best r and s values" help="Can be used to generate automatic classification workflow"/>
+                            </when>
                             <when value="ssc_analysis">
                                 <!--param name="ssc_toplabels" type="integer" value="100"
                                    label="Number of toplabels (m/z features) which should be written in tabular output"/-->
@@ -851,8 +893,7 @@
                         <param name="ssc_kernel_method" type="select" display="radio" label = "The method to use to calculate the spatial smoothing kernels for the embedding. The 'gaussian' method refers to spatially-aware (SA) weights, and 'adaptive' refers to spatially-aware structurally-adaptive (SASA) weights">
                             <option value="gaussian">gaussian</option>
                             <option value="adaptive" selected="True">adaptive</option>
-                        </param>
-
+                        </param>                        
                     </when>
                 </conditional>
 
@@ -862,10 +903,15 @@
                 <param name="training_result" type="data" format="rdata" label="Result from previous classification training"/>
                 <!--param name="predicted_toplabels" type="integer" value="100"
                                    label="Number of toplabels (m/z features) which should be written in tabular output"/-->
+                <param name="classification_type" type="select" display="radio" optional="False" label="Which classification method was used">
+                	<option value="PLS_classifier" selected="True" >PLS classifier</option>
+                	<option value="OPLS_classifier">OPLS classifier</option>
+                	<option value="SSC_classifier">SSC_classifier</option>
+        	</param>
                 <conditional name="new_y_values_cond">
-                    <param name="new_y_values" type="select" label="Should new response values be used">
-                        <option value="no_new_response" selected="True">old response should be used</option>
-                        <option value="new_response">load new response from tabular file</option>
+                    <param name="new_y_values" type="select" label="Load annotations (optional, but allows accuracy calculations)">
+                        <option value="no_new_response" selected="True">no</option>
+                        <option value="new_response">use annotations</option>
                     </param>
                     <when value="no_new_response"/>
                     <when value="new_response">
@@ -884,6 +930,12 @@
         <data format="pdf" name="classification_images" from_work_dir="classificationpdf.pdf" label = "${tool.name} on ${on_string}: results"/>
         <data format="tabular" name="mzfeatures" label="${tool.name} on ${on_string}: features"/>
         <data format="tabular" name="pixeloutput" label="${tool.name} on ${on_string}: pixels"/>
+        <data format="txt" name="best_r" label="${tool.name} on ${on_string}:best r">
+        <filter>type_cond['type_method'] == 'training' and type_cond['method_cond']['class_method'] == 'spatialShrunkenCentroids' and type_cond['method_cond']['ssc_analysis_cond']['ssc_method'] == 'ssc_cvapply' and type_cond['method_cond']['ssc_analysis_cond']['write_best_params']</filter>
+        </data>
+        <data format="txt" name="best_s" label="${tool.name} on ${on_string}:best s">
+        <filter>type_cond['type_method'] == 'training' and type_cond['method_cond']['class_method'] == 'spatialShrunkenCentroids' and type_cond['method_cond']['ssc_analysis_cond']['ssc_method'] == 'ssc_cvapply' and type_cond['method_cond']['ssc_analysis_cond']['write_best_params']</filter>
+        </data>                               
         <data format="rdata" name="classification_rdata" label="${tool.name} on ${on_string}: results.RData">
             <filter>output_rdata</filter>
         </data>
@@ -1101,8 +1153,9 @@
 - training and prediction
 
     - training can be done with cvapply that uses cross validation to find the best value for s, this requires not only a condition for each spectrum but also a fold (each fold should contain spectra of all conditions)
-    - training with the best value for s gives the top m/z features for each condition and the predicted classification group for each spectrum
+    - training with the best value for r and s gives the top m/z features for each condition and the predicted classification group for each spectrum
     - training result can be saved as RData file that can be reused for prediction of further samples
+    - prediction can calculate accuracies when the annotations are known and provided
 
 
 .. image:: $PATH_TO_IMAGES/classification_overview.png
author	galaxyp
date	Fri, 07 May 2021 10:10:35 +0000
parents	24c000517173
children	f28ad96b76dc