diff preprocessing.xml @ 2:1b875f0b8024 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/cardinal commit f127be2141cf22e269c85282d226eb16fe14a9c1
author galaxyp
date Fri, 15 Feb 2019 10:22:14 -0500
parents 1b22c1e7bfe7
children f172efe92629
line wrap: on
line diff
--- a/preprocessing.xml	Thu Oct 25 07:29:29 2018 -0400
+++ b/preprocessing.xml	Fri Feb 15 10:22:14 2019 -0500
@@ -1,4 +1,4 @@
-<tool id="cardinal_preprocessing" name="MSI preprocessing" version="@VERSION@.1">
+<tool id="cardinal_preprocessing" name="MSI preprocessing" version="@VERSION@.2">
     <description>
         mass spectrometry imaging preprocessing
     </description>
@@ -17,15 +17,13 @@
         cat '${cardinal_preprocessing}' &&
         Rscript '${cardinal_preprocessing}' &&
 
-        #if $imzml_output:
+        #if str($imzml_output) == "imzml_format":
         mkdir $outfile_imzml.files_path &&
-        ls -l &&
             mv ./out.imzML "${os.path.join($outfile_imzml.files_path, 'imzml')}" | true &&
             mv ./out.ibd "${os.path.join($outfile_imzml.files_path, 'ibd')}" | true &&
         #end if
-            echo "imzML file:" > $outfile_imzml &&
-            ls -l "$outfile_imzml.files_path" >> $outfile_imzml
-
+        echo "imzML file:" > $outfile_imzml &&
+        ls -l "$outfile_imzml.files_path" >> $outfile_imzml
 
     ]]>
     </command>
@@ -41,24 +39,28 @@
 
 @READING_MSIDATA@
 
-
 ## remove duplicated coordinates, otherwise peak picking and log2 transformation will fail
-print(paste0(sum(duplicated(coord(msidata))), " duplicated coordinates were removed"))
-msidata <- msidata[,!duplicated(coord(msidata))]
-
-print(paste0("Number of NA in input file: ",sum(is.na(spectra(msidata)[]))))
+print(paste0(sum(duplicated(coord(msidata)[,1:2])), " duplicated coordinates were removed"))
+msidata <- msidata[,!duplicated(coord(msidata)[,1:2])]
 
 
-if (sum(spectra(msidata)[]>0, na.rm=TRUE)> 0){
+if (ncol(msidata)>0 & nrow(msidata) >0){                                         
+
+    ## start QC report
+
+    pdf("Preprocessing.pdf", fonts = "Times", pointsize = 12)
+    plot(0,type='n',axes=FALSE,ann=FALSE)
+    title(main=paste("Quality control during preprocessing \n", "Filename:", "$infile.display_name"))
+
     ######################### preparations for QC report #################
 
-        maxfeatures = length(features(msidata))
-        medianpeaks = median(colSums(spectra(msidata)[]>0, na.rm=TRUE))
-        medint = round(median(spectra(msidata)[],na.rm=TRUE), digits=2)
-        minmz = round(min(mz(msidata)), digits=2)
-        maxmz = round(max(mz(msidata)), digits=2)
-        QC_numbers= data.frame(inputdata = c(minmz, maxmz,maxfeatures, medianpeaks, medint))
-        vectorofactions = "inputdata"
+    maxfeatures =nrow(msidata)
+    pixelcount = ncol(msidata)
+    minmz = round(min(mz(msidata)), digits=2)
+    maxmz = round(max(mz(msidata)), digits=2)
+    QC_numbers= data.frame(inputdata = c(minmz, maxmz,maxfeatures, pixelcount))
+    vectorofactions = "inputdata"
+    plot(msidata, pixel = 1:pixelcount, main="Average spectrum of input file")
 
     ############################### Preprocessing steps ###########################
     ###############################################################################
@@ -75,14 +77,14 @@
 
             ############################### QC ###########################
 
-                maxfeatures = length(features(msidata))
-                medianpeaks = median(colSums(spectra(msidata)[]>0, na.rm=TRUE),)
-                medint = round(median(spectra(msidata)[], na.rm=TRUE), digits=2)
-                minmz = round(min(mz(msidata)), digits=2)
-                maxmz = round(max(mz(msidata)), digits=2)
-                normalized = c(minmz, maxmz,maxfeatures, medianpeaks, medint)
-                QC_numbers= cbind(QC_numbers, normalized)
-                vectorofactions = append(vectorofactions, "normalized")
+            maxfeatures =nrow(msidata)
+            pixelcount = ncol(msidata)
+            minmz = round(min(mz(msidata)), digits=2)
+            maxmz = round(max(mz(msidata)), digits=2)
+            normalized = c(minmz, maxmz,maxfeatures, pixelcount)
+            QC_numbers= cbind(QC_numbers, normalized)
+            vectorofactions = append(vectorofactions, "normalized")
+            plot(msidata, pixel = 1:pixelcount, main="Average spectrum after normalization")
 
     ############################### Baseline reduction ###########################
 
@@ -94,14 +96,14 @@
 
             ############################### QC ###########################
 
-                maxfeatures = length(features(msidata))
-                medianpeaks = median(colSums(spectra(msidata)[]>0, na.rm=TRUE))
-                medint = round(median(spectra(msidata)[], na.rm=TRUE), digits=2)
-                minmz = round(min(mz(msidata)), digits=2)
-                maxmz = round(max(mz(msidata)), digits=2)
-                baseline = c(minmz, maxmz,maxfeatures, medianpeaks, medint)
-                QC_numbers= cbind(QC_numbers, baseline)
-                vectorofactions = append(vectorofactions, "baseline red.")
+            maxfeatures =nrow(msidata)
+            pixelcount = ncol(msidata)
+            minmz = round(min(mz(msidata)), digits=2)
+            maxmz = round(max(mz(msidata)), digits=2)
+            baseline = c(minmz, maxmz,maxfeatures, pixelcount)
+            QC_numbers= cbind(QC_numbers, baseline)
+            vectorofactions = append(vectorofactions, "baseline red.")
+            plot(msidata, pixel = 1:pixelcount, main="Average spectrum after baseline reduction")
 
     ############################### Smoothing ###########################
 
@@ -127,14 +129,14 @@
 
             ############################### QC ###########################
 
-                maxfeatures = length(features(msidata))
-                medianpeaks = median(colSums(spectra(msidata)[]>0, na.rm=TRUE))
-                medint = round(median(spectra(msidata)[], na.rm=TRUE), digits=2)
-                minmz = round(min(mz(msidata)), digits=2)
-                maxmz = round(max(mz(msidata)), digits=2)
-                smoothed = c(minmz, maxmz,maxfeatures, medianpeaks, medint)
-                QC_numbers= cbind(QC_numbers, smoothed)
-                vectorofactions = append(vectorofactions, "smoothed")
+            maxfeatures =nrow(msidata)
+            pixelcount = ncol(msidata)
+            minmz = round(min(mz(msidata)), digits=2)
+            maxmz = round(max(mz(msidata)), digits=2)
+            smoothed = c(minmz, maxmz,maxfeatures, pixelcount)
+            QC_numbers= cbind(QC_numbers, smoothed)
+            vectorofactions = append(vectorofactions, "smoothed")
+            plot(msidata, pixel = 1:pixelcount, main="Average spectrum after smoothing")
 
     ############################### Peak picking ###########################
 
@@ -161,14 +163,14 @@
 
             ############################### QC ###########################
 
-                maxfeatures = length(features(msidata))
-                medianpeaks = median(colSums(spectra(msidata)[]>0, na.rm=TRUE))
-                medint = round(median(spectra(msidata)[], na.rm=TRUE), digits=2)
-                minmz = round(min(mz(msidata)), digits=2)
-                maxmz = round(max(mz(msidata)), digits=2)
-                picked = c(minmz, maxmz,maxfeatures, medianpeaks, medint)
-                QC_numbers= cbind(QC_numbers, picked)
-                vectorofactions = append(vectorofactions, "picked")
+            maxfeatures =nrow(msidata)
+            pixelcount = ncol(msidata)
+            minmz = round(min(mz(msidata)), digits=2)
+            maxmz = round(max(mz(msidata)), digits=2)
+            picked = c(minmz, maxmz,maxfeatures, pixelcount)
+            QC_numbers= cbind(QC_numbers, picked)
+            vectorofactions = append(vectorofactions, "picked")
+            plot(msidata, pixel = 1:pixelcount, main="Average spectrum after peak picking")
 
     ############################### Peak alignment ###########################
 
@@ -208,14 +210,14 @@
 
             ############################### QC ###########################
 
-                maxfeatures = length(features(msidata))
-                medianpeaks = median(colSums(spectra(msidata)[]>0, na.rm=TRUE))
-                medint = round(median(spectra(msidata)[], na.rm=TRUE), digits=2)
-                minmz = round(min(mz(msidata)), digits=2)
-                maxmz = round(max(mz(msidata)), digits=2)
-                aligned = c(minmz, maxmz,maxfeatures, medianpeaks, medint)
-                QC_numbers= cbind(QC_numbers, aligned)
-                vectorofactions = append(vectorofactions, "aligned")
+            maxfeatures =nrow(msidata)
+            pixelcount = ncol(msidata)
+            minmz = round(min(mz(msidata)), digits=2)
+            maxmz = round(max(mz(msidata)), digits=2)
+            aligned = c(minmz, maxmz,maxfeatures, pixelcount)
+            QC_numbers= cbind(QC_numbers, aligned)
+            vectorofactions = append(vectorofactions, "aligned")
+            plot(msidata, pixel = 1:pixelcount, main="Average spectrum after alignment")
 
     ############################### Peak filtering ###########################
 
@@ -226,14 +228,14 @@
 
             ############################### QC ###########################
 
-                maxfeatures = length(features(msidata))
-                medianpeaks = median(colSums(spectra(msidata)[]>0, na.rm=TRUE))
-                medint = round(median(spectra(msidata)[], na.rm=TRUE), digits=2)
-                minmz = round(min(mz(msidata)), digits=2)
-                maxmz = round(max(mz(msidata)), digits=2)
-                filtered = c(minmz, maxmz,maxfeatures, medianpeaks, medint)
-                QC_numbers= cbind(QC_numbers, filtered)
-                vectorofactions = append(vectorofactions, "filtered")
+            maxfeatures =nrow(msidata)
+            pixelcount = ncol(msidata)
+            minmz = round(min(mz(msidata)), digits=2)
+            maxmz = round(max(mz(msidata)), digits=2)
+            filtered = c(minmz, maxmz,maxfeatures, pixelcount)
+            QC_numbers= cbind(QC_numbers, filtered)
+            vectorofactions = append(vectorofactions, "filtered")
+            plot(msidata, pixel = 1:pixelcount, main="Average spectrum after filtering")
 
     ############################### Data reduction ###########################
 
@@ -247,8 +249,11 @@
 
                 ## optional: replace NA with 0
                 #if $method.methods_conditional.methods_for_reduction.replace_NA_bin:
+                    ## binning seems to create a normal R matrix, but convert explicitly to be sure:
+                    iData(msidata) <- iData(msidata)[]
+                    ## count and replace NAs
                     print(paste0("Number of NA that were set to zero after binning:",sum(is.na(spectra(msidata)[]))))
-                    spectra(msidata)[][is.na(spectra(msidata)[])] = 0
+                    spectra(msidata)[is.na(spectra(msidata))] = 0 
                 #end if
 
             #elif str( $method.methods_conditional.methods_for_reduction.reduction_method) == 'resample':
@@ -275,24 +280,27 @@
             #end if
             ############################### QC ###########################
 
-                maxfeatures = length(features(msidata))
-                medianpeaks = median(colSums(spectra(msidata)[]>0, na.rm=TRUE))
-                medint = round(median(spectra(msidata)[], na.rm=TRUE), digits=2)
-                minmz = round(min(mz(msidata)), digits=2)
-                maxmz = round(max(mz(msidata)), digits=2)
-                reduced = c(minmz, maxmz,maxfeatures, medianpeaks, medint)
-                QC_numbers= cbind(QC_numbers, reduced)
-                vectorofactions = append(vectorofactions, "reduced")
+            maxfeatures =nrow(msidata)
+            pixelcount = ncol(msidata)
+            minmz = round(min(mz(msidata)), digits=2)
+            maxmz = round(max(mz(msidata)), digits=2)
+            reduced = c(minmz, maxmz,maxfeatures, pixelcount)
+            QC_numbers= cbind(QC_numbers, reduced)
+            vectorofactions = append(vectorofactions, "reduced")
+            plot(msidata, pixel = 1:pixelcount, main="Average spectrum after data reduction")
 
         ############################### Transformation ###########################
 
         #elif str( $method.methods_conditional.preprocessing_method) == 'Transformation':
             print('Transformation')
 
+            ## convert data into an R matrix; this loads it into memory and can take some time, but the next steps need an R matrix
+            iData(msidata) <- iData(msidata)[]
+
             #if str( $method.methods_conditional.transf_conditional.trans_type) == 'log2':
                 print('log2 transformation')
 
-                ## replace 0 with NA
+                ## replace 0 with NA to prevent Inf
                 spectra_df = spectra(msidata)[]
                 spectra_df[spectra_df ==0] = NA
                 print(paste0("Number of 0 which were converted into NA:",sum(is.na(spectra_df))))
@@ -313,14 +321,14 @@
 
             ############################### QC ###########################
 
-                maxfeatures = length(features(msidata))
-                medianpeaks = median(colSums(spectra(msidata)[]>0, na.rm=TRUE))
-                medint = round(median(spectra(msidata)[], na.rm=TRUE), digits=2)
-                minmz = round(min(mz(msidata)), digits=2)
-                maxmz = round(max(mz(msidata)), digits=2)
-                transformed = c(minmz, maxmz,maxfeatures, medianpeaks, medint)
-                QC_numbers= cbind(QC_numbers, transformed)
-                vectorofactions = append(vectorofactions, "transformed")
+            maxfeatures =nrow(msidata)
+            pixelcount = ncol(msidata)
+            minmz = round(min(mz(msidata)), digits=2)
+            maxmz = round(max(mz(msidata)), digits=2)
+            transformed = c(minmz, maxmz,maxfeatures, pixelcount)
+            QC_numbers= cbind(QC_numbers, transformed)
+            vectorofactions = append(vectorofactions, "transformed")
+            plot(msidata, pixel = 1:pixelcount, main="Average spectrum after transformation")
 
             #end if
     #end for
@@ -328,24 +336,19 @@
     ############# Outputs: RData, imzml and QC report #############
     ################################################################################
 
-    print(paste0("Number of NA in output file: ",sum(is.na(spectra(msidata)[]))))
-
-    ## save as (.RData)
-    save(msidata, file="$msidata_preprocessed")
+    ## save msidata as imzML file, will only work if there is at least 1 m/z left
 
-    ## save msidata as imzML file, will only work if there is at least 1 m/z left
-    #if $imzml_output:
+    #if str($imzml_output) == "imzml_format":
         if (nrow(msidata) > 0){
-print("write outputfile")
             writeImzML(msidata, "out")}
+    #elif str($imzml_output) == "rdata_format":
+        ## save as (.RData)
+        iData(msidata) = iData(msidata)[]
+        save(msidata, file="$outfile_rdata")
     #end if
 
-    ## save QC report
-
-    pdf("Preprocessing.pdf", fonts = "Times", pointsize = 12)
     plot(0,type='n',axes=FALSE,ann=FALSE)
-    title(main=paste("Quality control during preprocessing \n", "Filename:", "$infile.display_name"))
-    rownames(QC_numbers) = c("min m/z", "max mz", "# features", "median\n# peaks", "median\nintensity")
+    rownames(QC_numbers) = c("min m/z", "max mz", "# features", "# spectra")
     grid.table(t(QC_numbers))
 
     dev.off()
@@ -361,7 +364,7 @@
         <repeat name="methods" title="Preprocessing" min="1" max="50">
             <conditional name="methods_conditional">
                 <param name="preprocessing_method" type="select" label="Preprocessing methods">
-                    <option value="Normalization" selected="True">Normalization</option>
+                    <option value="Normalization" selected="True">Intensity Normalization (TIC)</option>
                     <option value="Baseline_reduction">Baseline Reduction</option>
                     <option value="Smoothing">Peak smoothing</option>
                     <option value="Peak_picking">Peak picking</option>
@@ -528,14 +531,20 @@
                 </when>
             </conditional>
         </repeat>
-        <param name="imzml_output" type="boolean" label="Output of imzML file" truevalue="TRUE" falsevalue="FALSE"/>
+        <param name="imzml_output" type="select" display = "radio" optional = "False"
+               label="Output format" help= "Choose the output format">
+                <option value="imzml_format" selected="True">imzML</option>
+                <option value="rdata_format">RData</option>
+        </param>
     </inputs>
     <outputs>
-        <data format="rdata" name="msidata_preprocessed" label="${tool.name} on ${on_string}"/>
+        <data format="imzml" name="outfile_imzml" label="${tool.name} on ${on_string}: imzML">
+            <filter>imzml_output=='imzml_format'</filter>
+        </data>
+        <data format="rdata" name="outfile_rdata" label="${tool.name} on ${on_string}: RData">
+            <filter>imzml_output == 'rdata_format'</filter>
+        </data>
         <data format="pdf" name="QC_overview" from_work_dir="Preprocessing.pdf" label = "${tool.name} on ${on_string}: QC"/>
-        <data format="imzml" name="outfile_imzml" label="${tool.name} on ${on_string}: imzML">
-            <filter>imzml_output</filter>
-       </data>
     </outputs>
     <tests>
         <test>
@@ -591,8 +600,12 @@
                         </conditional>
                 </conditional>
             </repeat>
-            <output name="msidata_preprocessed" file="preprocessing_results1.RData" compare="sim_size"/>
+            <param name="imzml_output" value="imzml_format"/>
             <output name="QC_overview" file="preprocessing_results1.pdf" compare="sim_size"/>
+            <output name="outfile_imzml" ftype="imzml" file="preprocessing_results1.imzml.txt" compare="sim_size">
+                <extra_files type="file" file="preprocessing_results1.imzml" name="imzml" lines_diff="4"/>
+                <extra_files type="file" file="preprocessing_results1.ibd" name="ibd" compare="sim_size"/>
+            </output>
         </test>
         <test>
             <param name="infile" value="3_files_combined.RData" ftype="rdata"/>
@@ -615,8 +628,12 @@
                     </conditional>
                 </conditional>
             </repeat>
-            <output name="msidata_preprocessed" file="preprocessing_results2.RData" compare="sim_size"/>
+            <param name="imzml_output" value="imzml_format"/>
             <output name="QC_overview" file="preprocessing_results2.pdf" compare="sim_size"/>
+            <output name="outfile_imzml" ftype="imzml" file="preprocessing_results2.imzml.txt" compare="sim_size">
+                <extra_files type="file" file="preprocessing_results2.imzml" name="imzml" lines_diff="4"/>
+                <extra_files type="file" file="preprocessing_results2.ibd" name="ibd" compare="sim_size"/>
+            </output>
         </test>
         <test>
             <expand macro="infile_analyze75"/>
@@ -645,8 +662,12 @@
                     </conditional>
                 </conditional>
             </repeat>
-            <output name="msidata_preprocessed" file="preprocessing_results3.RData" compare="sim_size"/>
+            <param name="imzml_output" value="imzml_format"/>
             <output name="QC_overview" file="preprocessing_results3.pdf" compare="sim_size"/>
+            <output name="outfile_imzml" ftype="imzml" file="preprocessing_results3.imzml.txt" compare="sim_size">
+                <extra_files type="file" file="preprocessing_results3.imzml" name="imzml" lines_diff="4"/>
+                <extra_files type="file" file="preprocessing_results3.ibd" name="ibd" compare="sim_size"/>
+            </output>
         </test>
         <test>
             <expand macro="infile_analyze75"/>
@@ -661,8 +682,12 @@
                     <param name="bin_width" value="0.1"/>
                 </conditional>
             </repeat>
-            <output name="msidata_preprocessed" file="preprocessing_results4.RData" compare="sim_size"/>
+            <param name="imzml_output" value="imzml_format"/>
             <output name="QC_overview" file="preprocessing_results4.pdf" compare="sim_size"/>
+            <output name="outfile_imzml" ftype="imzml" file="preprocessing_results4.imzml.txt" compare="sim_size">
+                <extra_files type="file" file="preprocessing_results4.imzml" name="imzml" lines_diff="4"/>
+                <extra_files type="file" file="preprocessing_results4.ibd" name="ibd" compare="sim_size"/>
+            </output>
         </test>
         <test>
             <expand macro="infile_imzml"/>
@@ -675,7 +700,8 @@
                         </conditional>
                 </conditional>
             </repeat>
-             <output name="msidata_preprocessed" file="preprocessing_results5.RData" compare="sim_size"/>
+            <param name="imzml_output" value="rdata_format"/>
+            <output name="outfile_rdata" file="preprocessing_results5.RData" compare="sim_size"/>
             <output name="QC_overview" file="preprocessing_results5.pdf" compare="sim_size"/>
         </test>
     </tests>
@@ -710,11 +736,11 @@
 
 **Output**
 
-- MSI data as .RData output (can be read with the Cardinal package in R)
-- optional: MSI data as imzML file
-- pdf with key values after each processing step
+- MSI data as imzML file or .RData (can be read with the Cardinal package in R)
+- pdf with key values and average mass spectra after each processing step
 
         ]]>
     </help>
     <expand macro="citations"/>
 </tool>
+