diff maldi_quant_preprocessing.xml @ 2:e754c2b545a9 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/MALDIquant commit d2f311f7fff24e54c565127c40414de708e31b3c
author galaxyp
date Thu, 25 Oct 2018 07:31:55 -0400
parents 0892a051eb17
children 71411ac28268
line wrap: on
line diff
--- a/maldi_quant_preprocessing.xml	Mon Oct 01 01:09:28 2018 -0400
+++ b/maldi_quant_preprocessing.xml	Thu Oct 25 07:31:55 2018 -0400
@@ -1,4 +1,4 @@
-<tool id="maldi_quant_preprocessing" name="MALDIquant preprocessing" version="@VERSION@.1">
+<tool id="maldi_quant_preprocessing" name="MALDIquant preprocessing" version="@VERSION@.2">
     <description>
         Preprocessing of mass-spectrometry imaging data
     </description>
@@ -8,6 +8,7 @@
     <expand macro="requirements"/>
     <command detect_errors="exit_code">
     <![CDATA[
+        cat '${maldi_quant_preprocessing}' &&
         #if $infile.ext == 'imzml'
             cp '${infile.extra_files_path}/imzml' infile.imzML &&
             cp '${infile.extra_files_path}/ibd' infile.ibd &&
@@ -22,6 +23,7 @@
             ln -s $infile infile.RData &&
         #end if
         Rscript "${maldi_quant_preprocessing}" &&
+
         mkdir $outfile_imzml.files_path &&
         mv ./out.imzMl "${os.path.join($outfile_imzml.files_path, 'imzml')}" | true &&
         mv ./out.ibd "${os.path.join($outfile_imzml.files_path, 'ibd')}" | true &&
@@ -54,7 +56,7 @@
         coordinates_info = cbind(coordinates(maldi_data)[,1:2], c(1:length(maldi_data)))
     #elif $infile.ext == 'analyze75'
         ## Import analyze7.5 file
-        maldi_data = import( 'infile.hdr' )
+        maldi_data = importAnalyze( 'infile.hdr' )
         coordinates_info = cbind(coordinates(maldi_data)[,1:2], c(1:length(maldi_data)))
     #else
         loadRData <- function(fileName){
@@ -142,10 +144,11 @@
 pixel_number = length(maldi_data)
 minmz = round(min(unlist(lapply(maldi_data,mass))), digits=4)
 maxmz = round(max(unlist(lapply(maldi_data,mass))), digits=4)
-maxfeatures = round(length(unlist(lapply(maldi_data,mass)))/length(maldi_data), digits=2)
+mean_features = round(length(unlist(lapply(maldi_data,mass)))/length(maldi_data), digits=2)
+number_features = length(unique(unlist(lapply(maldi_data,mass))))
 medint = round(median(unlist(lapply(maldi_data,intensity))), digits=2)
-inputdata = c(minmz, maxmz,maxfeatures,  medint)
-QC_numbers= data.frame(inputdata = c(minmz, maxmz,maxfeatures, medint))
+inputdata = c(minmz, maxmz,number_features,mean_features,  medint)
+QC_numbers= data.frame(inputdata = c(minmz, maxmz,number_features, mean_features, medint))
 vectorofactions = "inputdata"
 
 
@@ -162,9 +165,10 @@
         pixel_number = length(maldi_data)
         minmz = round(min(unlist(lapply(maldi_data,mass))), digits=4)
         maxmz = round(max(unlist(lapply(maldi_data,mass))), digits=4)
-        maxfeatures = round(length(unlist(lapply(maldi_data,mass)))/length(maldi_data), digits=2)
+        mean_features = round(length(unlist(lapply(maldi_data,mass)))/length(maldi_data), digits=2)
         medint = round(median(unlist(lapply(maldi_data,intensity))), digits=2)
-        transformed = c(minmz, maxmz,maxfeatures, medint)
+        number_features = length(unique(unlist(lapply(maldi_data,mass))))
+        transformed = c(minmz, maxmz,number_features,mean_features,  medint)
         QC_numbers= cbind(QC_numbers, transformed)
         vectorofactions = append(vectorofactions, "transformed")
 
@@ -196,9 +200,10 @@
         pixel_number = length(maldi_data)
         minmz = round(min(unlist(lapply(maldi_data,mass))), digits=4)
         maxmz = round(max(unlist(lapply(maldi_data,mass))), digits=4)
-        maxfeatures = round(length(unlist(lapply(maldi_data,mass)))/length(maldi_data), digits=2)
+        mean_features = round(length(unlist(lapply(maldi_data,mass)))/length(maldi_data), digits=2)
         medint = round(median(unlist(lapply(maldi_data,intensity))), digits=2)
-        smoothed = c(minmz, maxmz,maxfeatures, medint)
+        number_features = length(unique(unlist(lapply(maldi_data,mass))))
+        smoothed = c(minmz, maxmz,number_features,mean_features,  medint)
         QC_numbers= cbind(QC_numbers, smoothed)
         vectorofactions = append(vectorofactions, "smoothed")
 
@@ -251,9 +256,10 @@
         pixel_number = length(maldi_data)
         minmz = round(min(unlist(lapply(maldi_data,mass))), digits=4)
         maxmz = round(max(unlist(lapply(maldi_data,mass))), digits=4)
-        maxfeatures = round(length(unlist(lapply(maldi_data,mass)))/length(maldi_data), digits=2)
+        mean_features = round(length(unlist(lapply(maldi_data,mass)))/length(maldi_data), digits=2)
         medint = round(median(unlist(lapply(maldi_data,intensity))), digits=2)
-        baseline_removed = c(minmz, maxmz,maxfeatures, medint)
+        number_features = length(unique(unlist(lapply(maldi_data,mass))))
+        baseline_removed = c(minmz, maxmz,number_features,mean_features,  medint)
         QC_numbers= cbind(QC_numbers, baseline_removed)
         vectorofactions = append(vectorofactions, "baseline_removed")
 
@@ -263,13 +269,13 @@
         print('calibrate')
         ##calibrate
 
-        #if $method.methods_conditional.mass_start != 0 and $method.methods_conditional.mass_end != 0:
-        ## calibrate only given m/z range
-        maldi_data = calibrateIntensity(maldi_data,
-            method="$method.methods_conditional.calibrate_method",
-            range=c($method.methods_conditional.mass_start, $method.methods_conditional.mass_end))
+        #if str($method.methods_conditional.cond_calibration_range) == "yes":
+            ## calibrate only given m/z range
+            maldi_data = calibrateIntensity(maldi_data,
+                method="$method.methods_conditional.calibrate_method",
+                range=c($method.methods_conditional.cond_calibration_range.mass_start, $method.methods_conditional.cond_calibration_range.mass_end))
         #else:
-        maldi_data = calibrateIntensity(maldi_data,
+            maldi_data = calibrateIntensity(maldi_data,
             method="$method.methods_conditional.calibrate_method")
         #end if
         ## QC plot and numbers
@@ -278,10 +284,11 @@
         pixel_number = length(maldi_data)
         minmz = round(min(unlist(lapply(maldi_data,mass))), digits=4)
         maxmz = round(max(unlist(lapply(maldi_data,mass))), digits=4)
-        maxfeatures = round(length(unlist(lapply(maldi_data,mass)))/length(maldi_data), digits=2)
+        mean_features = round(length(unlist(lapply(maldi_data,mass)))/length(maldi_data), digits=2)
         medint = round(median(unlist(lapply(maldi_data,intensity))), digits=2)
-        intensity_calibrated = c(minmz, maxmz,maxfeatures, medint)
-        QC_numbers= cbind(QC_numbers, intensity_calibrated )
+        number_features = length(unique(unlist(lapply(maldi_data,mass))))
+        intensity_calibrated = c(minmz, maxmz,number_features,mean_features,  medint)
+        QC_numbers= cbind(QC_numbers, intensity_calibrated)
         vectorofactions = append(vectorofactions, "intensity_calibrated ")
 
 
@@ -333,16 +340,17 @@
         pixel_number = length(maldi_data)
         minmz = round(min(unlist(lapply(maldi_data,mass))), digits=4)
         maxmz = round(max(unlist(lapply(maldi_data,mass))), digits=4)
-        maxfeatures = round(length(unlist(lapply(maldi_data,mass)))/length(maldi_data), digits=2)
+        mean_features = round(length(unlist(lapply(maldi_data,mass)))/length(maldi_data), digits=2)
         medint = round(median(unlist(lapply(maldi_data,intensity))), digits=2)
-        spectra_aligned = c(minmz, maxmz,maxfeatures, medint)
-        QC_numbers= cbind(QC_numbers, spectra_aligned )
+        number_features = length(unique(unlist(lapply(maldi_data,mass))))
+        spectra_aligned = c(minmz, maxmz,number_features,mean_features, medint)
+        QC_numbers= cbind(QC_numbers, spectra_aligned)
         vectorofactions = append(vectorofactions, "spectra_aligned")
     #end if
 
 #end for
 
-rownames(QC_numbers) = c("min m/z", "max mz", "# features", "median\nintensity")
+rownames(QC_numbers) = c("min m/z", "max mz", "# features", "median \n# features", "median\nintensity")
 plot(0,type='n',axes=FALSE,ann=FALSE)
 grid.table(t(QC_numbers))
 
@@ -356,17 +364,13 @@
         MALDIquantForeign::exportImzMl(maldi_data, file="out.imzMl", processed=$export_processed)
     #end if
 
-    ## export annotation tabular file
-    #if str($tabular_annotation.load_annotation) == 'yes_annotation':
-        write.table(merged_annotation, file="$annotation_output", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
-    #end if
 }else{"All spectra are empty, outputfiles will be empty,too."}
 
     ]]>
         </configfile>
     </configfiles>
     <inputs>
-        <param name="infile" type="data" format="imzml,rdata" label="Inputfile as imzML or Cardinal MSImageSet saved as RData" help="This file is in imzML format or Cardinal MSImageSet saved as RData. The file must be in profile mode, not centroided"/>
+        <param name="infile" type="data" format="imzml,rdata,analyze75" label="Inputfile as imzML or Cardinal MSImageSet saved as RData" help="This file is in imzML format or Cardinal MSImageSet saved as RData. The file must be in profile mode, not centroided"/>
         <conditional name="restriction_conditional">
             <param name="restriction" type="select" label="Read in only spectra of interest" help="This option only works for imzML files">
                 <option value="no_restriction" selected="True">Calculate on entire file</option>
@@ -374,7 +378,7 @@
             </param>
             <when value="restrict">
                 <param name="coordinates_file" type="data" format="tabular" label="Tabular file with coordinates" help="x-values in first column, y-values in second column"/>
-                <param name="coordinates_header" type="boolean" label="Tabular file contains a header line" truevalue="TRUE" falsevalue="FALSE"/>
+                <param name="coordinates_header" type="boolean" label="File contains a header line" truevalue="TRUE" falsevalue="FALSE"/>
             </when>              
             <when value="no_restriction"/>
         </conditional>
@@ -389,7 +393,7 @@
                         <param name="column_x" data_ref="annotation_file" label="Column with x values" type="data_column"/>
                         <param name="column_y" data_ref="annotation_file" label="Column with y values" type="data_column"/>
                         <param name="column_names" data_ref="annotation_file" label="Column with pixel annotations" type="data_column"/>
-                        <param name="tabular_header" type="boolean" label="Tabular file contains a header line" truevalue="TRUE" falsevalue="FALSE"/>
+                        <param name="tabular_header" type="boolean" label="File contains a header line" truevalue="TRUE" falsevalue="FALSE"/>
                 </when>
                 <when value="no_annotation"/>
         </conditional>
@@ -404,7 +408,7 @@
                     <validator type="empty_field" />
                 </param>
                 <when value="Transformation">
-                    <param name="transform_method" type="select" label="Select the transfprormation method">
+                    <param name="transform_method" type="select" label="Select a transformation method">
                         <option value="sqrt" selected="True">sqrt</option>
                         <option value="log">log</option>
                         <option value="log2">log2</option>
@@ -419,18 +423,18 @@
                             <option value="MovingAverage">MovingAverage</option>
                         </param>
                         <when value="SavitzkyGolay">
-                            <param name="polynomial" value="3" type="text" label="PolynomialOrder argument to control the order of the filter"/>
+                            <param name="polynomial" value="3" type="text" label="PolynomialOrder argument to control the order of the filter"
+                                    help="should be smaller than the resulting window"/>
                         </when>
                         <when value="MovingAverage">
-                            <param name="weighted" type="boolean" label="Weighted average" help = "indicates if the average should be equal weight or if it should have weights depending on the distance from the center as calculated as 1/2^abs(-halfWindowSize:halfWindowSize) with the sum of all weigths normalized to 1" truevalue="TRUE" falsevalue="FALSE"/>
+                            <param name="weighted" type="boolean" label="Weighted average" help = "Indicates if the average should be equal weight or if it should have weights depending on the distance from the center as calculated as 1/2^abs(-halfWindowSize:halfWindowSize) with the sum of all weights normalized to 1" truevalue="TRUE" falsevalue="FALSE"/>
                         </when>
                     </conditional>
                     <param name="halfWindowSize" type="integer" value="10"
-                        label="Half window size"
+                        label="Half window size (number of data points)"
                         help="The resulting window reaches from 
                             mass[currentIndex-halfWindowSize] to mass[currentIndex+halfWindowSize]
-                            (window size is 2*halfWindowSize+1).
-                            The best size differs depending on the selected smoothing method."/>
+                            (window size is 2*halfWindowSize+1)."/>
                 </when>
                 <when value="Baseline">
                     <conditional name="methods_for_baseline">
@@ -447,29 +451,40 @@
                         </when>
                         <when value="TopHat">
                             <param name="tophat_halfWindowSize" type="integer" value="10"
-                        label="Half window size" help="The resulting window reaches from 
+                        label="Half window size (number of data points)"
+                            help="The resulting window reaches from 
                             mass[currentIndex-halfWindowSize] to mass[currentIndex+halfWindowSize]"/>
                         </when>
                         <when value="ConvexHull"/>
                         <when value="median">
                             <param name="median_halfWindowSize" type="integer" value="10"
-                        label="Half window size" help="The resulting window reaches from 
+                        label="Half window size (number of data points)"
+                            help="The resulting window reaches from 
                             mass[currentIndex-halfWindowSize] to mass[currentIndex+halfWindowSize]"/>
                         </when>
                     </conditional>
                 </when>
                 <when value="Calibrate">
-                    <param name="calibrate_method" type="select" label="Calibration method">
+                    <param name="calibrate_method" type="select" label="Intensity calibration (normalization) method">
                         <option value="TIC" selected="True">TIC</option>
                         <option value="PQN">PQN</option>
                         <option value="median">median</option>
                         <validator type="empty_field" />
                     </param>
-                    <param name="mass_start" type="integer" value="0"
-                        label="Start of m/z range, has to be inside m/z range" 
-                        help="Scaling factor is calculated on the mass range and applied to the whole spectrum. Start and end are not allowed to be 0"/>
-                    <param name="mass_end" type="integer" value="0"
-                        label="End of m/z range, has to be inside m/z range"/>
+                    <conditional name="cond_calibration_range">
+                        <param name="calibration_range" type="select" label="Instead of the whole m/z range, a specified m/z range can be used to calculate the scaling factor">
+                            <option value="no" selected="True">complete m/z range</option>
+                            <option value="yes">specify a m/z range</option>
+                        </param>
+                        <when value="no"/>
+                        <when value="yes">
+                            <param name="mass_start" type="integer" value="800"
+                                label="Start of m/z range, has to be inside m/z range" 
+                                help="Scaling factor is calculated on the mass range and applied to the whole spectrum."/>
+                            <param name="mass_end" type="integer" value="3000"
+                                label="End of m/z range, has to be inside m/z range"/>
+                        </when>
+                    </conditional>
                 </when>
                 <when value="Align">
                     <param name="warping_method" type="select" label="Warping methods">
@@ -479,12 +494,12 @@
                         <option value="cubic">Cubic</option>
                     </param>
 
-                    <param name="tolerance" type="float" value="0.002"
-                        label="Tolerance"
-                        help="Double, maximal relative deviation of a peak position (m/z) to be considered as identical" />
+                    <param name="tolerance" type="float" value="0.00005"
+                        label="Tolerance = abs(mz1 - mz2)/mz2"
+                        help="Maximal relative deviation of a peak position (m/z) to be considered as identical. For 50ppm use 0.00005 or 50e-6" />
 
                     <param name="halfWindowSize" type="integer" value="20"
-                        label="Half window size"
+                        label="Half window size (number of data points)"
                         help="The resulting window reaches from 
                             mass[currentIndex-halfWindowSize] to mass[currentIndex+halfWindowSize]
                             (window size is 2*halfWindowSize+1).
@@ -492,7 +507,7 @@
 
                     <param name="snr" type="integer" value="2" label="Signal-to-noise-ratio"/>
                     <param name="allow_nomatch" type="boolean" label="Don't throw an error when less than 2 reference m/z were found in a spectrum" truevalue="TRUE" falsevalue="FALSE"/>
-                    <param name="empty_nomatch" type="boolean" label="logical, if TRUE the intensity values of MassSpectrum or MassPeaks objects with missing (NA) warping functions are set to zero" truevalue="TRUE" falsevalue="FALSE"/>
+                    <param name="empty_nomatch" type="boolean" label="If TRUE the intensity values of MassSpectrum or MassPeaks objects with missing (NA) warping functions are set to zero" truevalue="TRUE" falsevalue="FALSE"/>
                     <param name="remove_empty" type="boolean" label="Should empty spectra be removed" truevalue="TRUE" falsevalue="FALSE" help="For Cardinal RData files this step can only be performed if pixel annotations were provided"/>
 
                     <conditional name="reference_for_alignment">
@@ -503,9 +518,9 @@
                         <when value="no_reference"/>
                         <when value="yes_reference">
                             <param name="reference_file" type="data" format="tabular"
-                                label="Tabular file with m/z of internal calibrants (MassPeaks) which should be used for spectra alignment"
-                                help="calibration of m/z values to internal calibrants, at least 2 m/z per spectrum are needed"/>
-                            <param name="reference_header" type="boolean" label="Tabular file contains a header line" truevalue="TRUE" falsevalue="FALSE"/>
+                                label="Tabular file with m/z (MassPeaks) which should be used for spectra alignment"
+                                help="At least 2 reference m/z per spectrum are needed"/>
+                            <param name="reference_header" type="boolean" label="File contains a header line" truevalue="TRUE" falsevalue="FALSE"/>
                         </when>
                     </conditional>
                 </when>
@@ -514,11 +529,8 @@
         <param name="export_processed" type="boolean" label="Export file as processed imzML" help="otherwise continuous imzML will be exported" truevalue="TRUE" falsevalue="FALSE"/>
     </inputs>
     <outputs>
-        <data format="imzml" name="outfile_imzml" label="$infile.display_name preprocessed" />
-        <data format="pdf" name="plots" from_work_dir="prepro_qc_plot.pdf" label="$infile.display_name preprocessed QC"/>
-        <data format="tabular" name="annotation_output" label="$infile.display_name annotations">
-            <filter>tabular_annotation["load_annotation"] == 'yes_annotation'</filter>
-        </data>
+        <data format="imzml" name="outfile_imzml" label="${tool.name} on ${on_string}" />
+        <data format="pdf" name="plots" from_work_dir="prepro_qc_plot.pdf" label="${tool.name} on ${on_string}: QC"/>
     </outputs>
     <tests>
         <test>
@@ -569,6 +581,7 @@
                 <param name="method" value="Align"/>
                 <param name="warping_method" value="linear"/>
                 <param name="halfWindowSize" value="1"/>
+                <param name="tolerance" value="0.002"/>
                 <param name="allow_nomatch" value="TRUE"/>
                 <param name="remove_empty" value="TRUE"/>
                 <param name="empty_nomatch" value="TRUE"/>
@@ -580,7 +593,6 @@
             <output name="outfile_imzml" file="outfile3.imzML" compare="sim_size"/>
             <output name="outfile_imzml" file="outfile3.ibd" compare="sim_size"/>
             <output name="plots" file="Preprocessing3_QC.pdf" compare="sim_size"/>
-            <output name="annotation_output" file="annotations_output3.tabular"/>
         </test>
     </tests>
     <help><![CDATA[
@@ -623,17 +635,26 @@
 
 **Options**
 
-- Transformation: transformation of intensities with log, log2, log10 and squareroot
-- Smoothing: Smoothing of the peaks reduces noise and improves peak detection. Available smoothing methods are SavitzkyGolay and Moving Average
+- Transformation: Variance stabilization through intensity transformation: 'log', 'log2', 'log10' and 'squareroot' (sqrt) are available
+- Smoothing: Smoothing of the peaks reduces noise and improves peak detection. Available smoothing methods are 'SavitzkyGolay' and 'Moving Average'
+
+    - For all smoothing methods: The larger the 'Half window size', the stronger the smoothing. The resulting window should be smaller than the FWHM (full width at half maximum) of the typical peaks. Moving average needs smaller window size than SavitzkyGolay.
+    - Moving average: Recommended for broader peaks/high m/z range spectra. Weighted moving average: Points in the center get larger weight factors than points away from the center.  
+    - SavitzkyGolay: Recommended for sharp peaks/low m/z range, preserves the shape of the local maxima. The PolynomialOrder should be smaller than the resulting window. Negative values will be replaced with 0. 
+
 - Baseline reduction: Baseline reduction removes background intensity generated by chemical noise (common in MALDI datasets). 
 
     - Available methods are SNIP, TopHat,ConvexHull and median:
     - SNIP is the default baseline reduction method in MALDIquant. 
-    - ConvexHull cannot be used for MALDI-TOF baseline removal. 
+    - ConvexHull is not appropriate for MALDI-TOF baseline removal. 
     - The moving median may generate negative intensities. 
     - Except for the ConvexHull all methods have a parameter for the 'Half window size' (in SNIP it is called 'iterations'). The smaller the window the more baseline will be removed but also parts of the peaks. Wider windows preserve the peak height better and produce a smoother baseline, but some local background variation will remain. 
 
 - Intensity calibration (normalization): Normalization of intensities to Total Ion Current (TIC), median spectrum, Probabilistic Quotient Normalization (PQN)
+
+    - TIC and median are local calibration methods: each spectrum is normalized on its own (each peak is divided by the TIC or median of the spectrum)
+    - PQN is a global calibration method: In PQN all spectra are calibrated using the TIC calibration first. Subsequently, a median reference spectrum is created and the intensities in all spectra are standardized using the reference spectrum and a spectrum-specific median is calculated for each spectrum. Finally, each spectrum is rescaled by the median of the ratios of its intensity values and that of the reference spectrum
+
 - Spectra alignment (warping): alignment for (re)calibration of m/z values, at least two m/z per spectrum are needed for the alignment. This requirement can be skipped by setting "Don't throw an error when less than 2 reference m/z were found in a spectrum" to yes. To set the intensities of spectra that could not be aligned to zero, select yes in "If TRUE the intensity values of MassSpectrum or MassPeaks objects with missing (NA) warping functions are set to zero". In order to remove such empty spectra set "Should empty spectra be removed" to yes.