view filtering.xml @ 13:b8c79e70516d draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/cardinal commit 180894cfabbee2d308be140a0f0b4dba119e88d4-dirty"
author galaxyp
date Wed, 23 Dec 2020 22:21:13 +0000
parents a4475567dd22
children 105056844497
line wrap: on
line source

<tool id="cardinal_filtering" name="MSI filtering" version="@VERSION@.0">
    <description>tool for filtering mass spectrometry imaging data</description>
    <!-- macros.xml is the only import; it provides the macros expanded in this tool (requirements, print_version, input reading, citations) -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- the two packages listed here are added to the shared requirements macro; the QC script loads them via library(gridExtra) and library(ggplot2) -->
    <expand macro="requirements">
        <requirement type="package" version="2.3">r-gridextra</requirement>
        <requirement type="package" version="3.3.2">r-ggplot2</requirement>
    </expand>
    <expand macro="print_version"/>
    <command detect_errors="exit_code">
    <![CDATA[

        ## Runs the generated R script (its code is echoed to the job log first) and
        ## collects the written imzML/ibd pair in the extra files directory of the
        ## imzML output dataset. The main output file only lists that directory.
        ## "mv ... | true" is deliberate: the pipeline always succeeds, so a missing
        ## out.imzML/out.ibd does not fail the job, while a failure of any earlier
        ## step still aborts the chain (a plain "|| true" would hide those failures).
        ## All dataset paths are quoted so that unusual characters cannot split them.
        @INPUT_LINKING@
        cat '${MSI_subsetting}' &&
        Rscript '${MSI_subsetting}' &&
        mkdir '$outfile_imzml.files_path' &&
        mv ./out.imzML "${os.path.join($outfile_imzml.files_path, 'imzml')}" | true &&
        mv ./out.ibd "${os.path.join($outfile_imzml.files_path, 'ibd')}" | true &&
        echo "imzML file:" > '$outfile_imzml' &&
        ls -l '$outfile_imzml.files_path' >> '$outfile_imzml'
    ]]>
    </command>


    <configfiles>
        <configfile name="MSI_subsetting"><![CDATA[


######################## load libraries and read input file ####################

library(Cardinal)
library(ggplot2)
library(gridExtra)

@READING_MSIDATA_FULLY_COMPATIBLE@

################## QC numbers of the data before filtering #####################

## m/z dimension: number of features, all m/z values (kept for the QC
## histogram) and the m/z range rounded to two digits
maxfeatures <- nrow(msidata)
featuresinfile <- mz(msidata)
minmz <- round(min(featuresinfile), digits = 2)
maxmz <- round(max(featuresinfile), digits = 2)

## pixel dimension: number of spectra and the x/y coordinate ranges
pixelcount <- ncol(msidata)
coord_before <- coord(msidata)
minimumx <- min(coord_before\$x)
maximumx <- max(coord_before\$x)
minimumy <- min(coord_before\$y)
maximumy <- max(coord_before\$y)

## every input pixel starts out labelled as removed; the pixels that survive
## the filtering are appended with their own label further below
all_df <- data.frame(x = coord_before\$x,
                     y = coord_before\$y,
                     annotation = rep("removed pixels", times = pixelcount))

## Pixel filtering. Depending on the selected option the template keeps the
## pixels listed in a tabular file, the pixels inside an x/y range, or all pixels.
## Sets numberpixels/validpixels for the QC table and position_df for the pixel plot.
## Next steps will only run if there are more than 0 pixels/features in the file

if (ncol(msidata) > 0 && nrow(msidata) > 0)
{
    ###################################### Filtering of pixels #####################
    ################################################################################

    ############ Pixels in two columns format: x and y in different columns #############

    #if str($pixels_cond.pixel_filtering) == "two_columns":
        print("two columns")

        ## read tabular file
        input_list = read.delim("$pixels_cond.annotation_file", header = $pixels_cond.tabular_header, 
        stringsAsFactors = FALSE)
        inputpixels = input_list[,c($pixels_cond.column_x, $pixels_cond.column_y)]
        ## pixels are matched via keys of the form x_y built from both coordinates
        input_pixels = paste(inputpixels[,1], inputpixels[,2], sep="_")
        dataset_pixels = paste(coord(msidata)\$x, coord(msidata)\$y, sep="_")
        pixelsofinterest = dataset_pixels %in% input_pixels

        tryCatch(
                {
                msidata = msidata[,pixelsofinterest]
                ## an empty result raises no error by itself, so one is raised here to reach the handler below
                if (ncol(msidata) == 0)
                    {
                    stop(call.=FALSE)
                    }
                },
                error=function(cond) {
                ## in case all coordinates were outside the dataset leading to zero pixels, tool is stopped to avoid continuing with wrong data
                    message("Error during pixel filtering")
                    message("Possible problems: Forgot to set 'Tabular file contains a header line' = Yes, wrong columns selected, columns with coordinates contain empty fields or letters, all coordinates were outside the range of the dataset - this can be checked with the 'MSI qualitycontrol' tool")
                    stop(call.=FALSE)
                }
            )

        ## QC values:
        numberpixels = nrow(input_list)
        validpixels=ncol(msidata)

    ########### Pixels within x and y minima and maxima are kept #################

    #elif str($pixels_cond.pixel_filtering) == "pixel_range":
        print("pixel range")

        ## QC values:
        numberpixels = "range"
        validpixels = "range"

        tryCatch(
                {
                    ## x and y limits are inclusive and applied one after the other
                    msidata = msidata[, coord(msidata)\$x <= $pixels_cond.max_x_range & coord(msidata)\$x >= $pixels_cond.min_x_range]
                    msidata = msidata[, coord(msidata)\$y <= $pixels_cond.max_y_range & coord(msidata)\$y >= $pixels_cond.min_y_range]
                    if (ncol(msidata) == 0)
                        {
                        stop(call.=FALSE)
                        }
                },
                error=function(cond) {
                ## in case one of the ranges was outside the dataset leading to zero pixels, tool is stopped to avoid continuing with wrong data
                    message("Error during pixel filtering")
                    message("Check that both x and y ranges were inside the dataset coordinates (can be checked with the 'MSI qualitycontrol' tool) or if any not numeric character was entered into the input fields")
                    stop(call.=FALSE)
                }
            )

    ######################## no pixel filtering ################################

    #elif str($pixels_cond.pixel_filtering) == "none":
        print("no pixel filtering")

        ## QC values:
        numberpixels = 0
        validpixels = 0

    #end if

    ############################# QC data #####################################

    ## dataframe for QC of pixel distribution: all input pixels (labelled removed)
    ## followed by the pixels that are still present (labelled remaining)

    remaining_df = data.frame(as.numeric(coord(msidata)\$x), as.numeric(coord(msidata)\$y), rep("remaining pixels", times=ncol(msidata)))
    colnames(remaining_df) = c("x", "y", "annotation")
    position_df = rbind(all_df, remaining_df)
    ## a statement that built a de-duplicated copy of position_df without
    ## assigning it used to follow here; it had no effect and was dropped
    position_df\$annotation = factor(position_df\$annotation)
    gc()

}else{
    ## define the pixel QC values for empty input as well, otherwise building
    ## the QC table further below stops because the objects do not exist
    numberpixels = NA
    validpixels = NA
    print("Inputfile has no intensities > 0")
}

################################# filtering of features ######################
##############################################################################

## Feature (m/z) filtering. Depending on the selected option the template keeps
## the m/z values listed in a tabular file, keeps an m/z range, removes windows
## around listed m/z values, or leaves the features untouched.
## Sets numberfeatures/validmz for the QC table.

####################### Keep m/z from tabular file #########################

## feature filtering only when pixels/features/intensities are left


if (ncol(msidata) > 0){
    if (nrow(msidata) > 0)
    {

        #if str($features_cond.features_filtering) == "features_list":
            print("feature list")

            ## read tabular file, define starting row, extract and count valid features
            input_features = read.delim("$mz_tabular", header = $features_cond.feature_header, stringsAsFactors = FALSE)
            extracted_features = input_features[,$features_cond.feature_column]
            numberfeatures = length(extracted_features)

            ## is.numeric accepts integer as well as double columns; comparing the
            ## class with "numeric" rejected files that contain only whole-number m/z
            if (is.numeric(extracted_features)){
                ### max digits given in the input file will be used to match m/z but the maximum is 4
                decimal_part = vapply(strsplit(as.character(extracted_features), "\\."), function(parts) parts[2], character(1))
                decimal_digits = nchar(decimal_part)

                if (all(is.na(decimal_digits)))
                {
                    ## no value has a decimal part: match on whole numbers
                    max_digits = 0
                }else{
                    max_digits = min(max(decimal_digits, na.rm=TRUE), 4)
                }

                validfeatures = round(extracted_features, max_digits) %in% round(mz(msidata),max_digits)
                featuresofinterest = features(msidata)[round(mz(msidata), digits = max_digits) %in% round(extracted_features[validfeatures], max_digits)]
                validmz = length(unique(featuresofinterest))

            }else{
                ## non numeric column: index 0 selects no feature, which triggers the error below
                validmz = 0
                featuresofinterest = 0
            }

            ### filter msidata for valid features

                tryCatch(
                        {
                        msidata = msidata[featuresofinterest,]
                        ## does not throw error when processed file has no features left, therefore create error to avoid continuing with wrong data
                        if (nrow(msidata) == 0)
                            {
                            stop(call.=FALSE)
                            }
                        },
                        error=function(cond) {
                        ## in case all provided m/z values were outside the m/z range
                        ## tool is stopped to avoid continuing with wrong data
                            message("Error during m/z filtering")
                            message("Possible problems: Forgot to set 'Tabular file contains a header line' = Yes, wrong columns selected, column with m/z features contains empty fields or letters, all m/z features s were outside the range of the dataset  (this can be checked with the 'MSI qualitycontrol' tool) or did not match any m/z feature of the dataset (see help section for more information on that)")
                            stop(call.=FALSE)
                        }
                    )


        ############### features within a given range are kept #####################

        #elif str($features_cond.features_filtering) == "features_range":
            print("feature range")

            numberfeatures = "range"
            validmz = "range"

            tryCatch(
                    {
                    ## both limits of the m/z range are inclusive
                    msidata = msidata[mz(msidata) >= $features_cond.min_mz & mz(msidata) <= $features_cond.max_mz,]
                    ## does not throw error when processed file has no features left, therefore create error to avoid continuing with wrong data
                    if (nrow(msidata) == 0)
                        {
                        stop(call.=FALSE)
                        }
                    },
                    error=function(cond) {
                    ## in case all m/z features were outside the dataset leading to zero m/z features, tool is stopped to avoid continuing with wrong data
                        message("Error during m/z filtering")
                        message("Check that the entered m/z range is inside the dataset coordinates (can be checked with the 'MSI qualitycontrol' tool) or if any not numeric character was entered into the input fields")
                        stop(call.=FALSE)
                    }
                )

        ############### Remove m/z from tabular file #########################

        #elif str($features_cond.features_filtering) == "remove_features":
            print("remove features")

            ## read tabular file, define starting row, extract and count valid features
            input_features = read.delim("$mz_tabular", header = $features_cond.feature_header, stringsAsFactors = FALSE)
            extracted_features = input_features[,$features_cond.feature_column]
            numberfeatures = length(extracted_features)
            ## integer columns count as numeric too; with the former class comparison
            ## such files fell into the fallback below and nothing was removed
            if (is.numeric(extracted_features)){
                print("input is numeric")
                featuresofinterest = extracted_features
            }else{featuresofinterest = 0}

        ### Here starts removal of features:
            ## half window size; taken as is for Da, recomputed per m/z for ppm
            plusminus = $features_cond.removal_plusminus

            tryCatch(
                    {
                    mass_to_remove = numeric()
                    for (masses in featuresofinterest){
                        #if str($features_cond.units_removal) == "ppm":
                            plusminus = masses * $features_cond.removal_plusminus/1000000
                        #end if
                        ## indices of all dataset m/z inside the window around the current m/z
                        current_mass = which(c(mz(msidata) <= masses + plusminus & mz(msidata) >= masses - plusminus))
                        mass_to_remove = append(mass_to_remove, current_mass)
                    }
                    ## seq_len stays empty for zero rows, unlike 1:nrow
                    mass_to_keep = setdiff(seq_len(nrow(msidata)), mass_to_remove)

                    msidata= msidata[mass_to_keep, ]
                    ## for this option validmz reports the number of removed m/z features
                    validmz = maxfeatures - nrow(msidata)

                    ## does not throw error when processed file has no features left, therefore create error to avoid continuing with wrong data
                    if (nrow(msidata) == 0)
                        {
                        stop(call.=FALSE)
                        }
                    },
                    error=function(cond) {
                        message("Error during removal of m/z features")
                        stop(call.=FALSE)
                    }
                )



        ######################## No m/z filtering ##############################

        #elif str($features_cond.features_filtering) == "none":

            print("no feature filtering")
            validmz = 0
            numberfeatures = 0

        #end if

    }else{
        print("Inputfile has no m/z features")
        numberfeatures = NA
        validmz = NA
    }
}else{
        print("Inputfile or file filtered for pixels has no pixels")
        numberfeatures = NA
        validmz = NA
}
gc()

############## QC numbers of the data after filtering ##################

## m/z dimension: number of features and m/z range rounded to two digits
maxfeatures2 <- nrow(msidata)
mz_filtered <- mz(msidata)
minmz2 <- round(min(mz_filtered), digits = 2)
maxmz2 <- round(max(mz_filtered), digits = 2)

## pixel dimension: number of spectra and the x/y coordinate ranges
pixelcount2 <- ncol(msidata)
coord_filtered <- coord(msidata)
minimumx2 <- min(coord_filtered\$x)
maximumx2 <- max(coord_filtered\$x)
minimumy2 <- min(coord_filtered\$y)
maximumy2 <- max(coord_filtered\$y)

## formats a minimum/maximum pair as one table cell
range_label <- function(lower, upper) {
    paste0(lower, " - ", upper)
}

## one row per property, one column each for the input and the filtered data
properties <- c(
    "Number of m/z features",
    "Range of m/z values",
    "Number of pixels",
    "Range of x coordinates",
    "Range of y coordinates",
    "pixel overview",
    "feature overview"
)

before <- c(
    paste0(maxfeatures),
    range_label(minmz, maxmz),
    paste0(pixelcount),
    range_label(minimumx, maximumx),
    range_label(minimumy, maximumy),
    paste0("input pixels: ", numberpixels),
    paste0("input mz: ", numberfeatures)
)

filtered <- c(
    paste0(maxfeatures2),
    range_label(minmz2, maxmz2),
    paste0(pixelcount2),
    range_label(minimumx2, maximumx2),
    range_label(minimumy2, maximumy2),
    paste0("valid pixels: ", validpixels),
    paste0("valid mz: ", validmz)
)

property_df <- data.frame(properties, before, filtered)

########################### PDF QC and MSI output ###########################

## Writes the QC report filtertool_QC.pdf (property table, pixel map, m/z
## histogram) and exports the filtered data as imzML under the base name out.

pdf("filtertool_QC.pdf", fonts = "Times", pointsize = 12)
## empty plot that only serves as a canvas for the title and the table
plot(0,type='n',axes=FALSE,ann=FALSE)
## NOTE(review): the dataset name is pasted into an R string literal by the
## template; a name containing a double quote would presumably break the
## script - TODO confirm whether the name is sanitized before it gets here
title(main=paste0("Qualitycontrol of filtering tool for file: \n\n", "$infile.element_identifier"))
grid.table(property_df, rows= NULL)

## QC report only when pixels/features are left
## (scalar condition, therefore && instead of the elementwise &)
if (ncol(msidata) > 0 && nrow(msidata) > 0)
{

    ### visual pixel control

    pixel_image = ggplot(position_df, aes(x=x, y=y, fill=annotation))+
           geom_tile(height = 1, width=1)+
           coord_fixed()+
           ggtitle("Spatial orientation of filtered pixels")+
           theme_bw()+
           theme(plot.title = element_text(hjust = 0.5))+
           theme(legend.position="bottom",legend.direction="vertical")+
           guides(fill=guide_legend(ncol=5,byrow=TRUE))
    print(pixel_image + scale_fill_manual(values=c("#00BFC4", "#F8766D")))


    ### visual mz feature control

    kept_df = data.frame(mz(msidata), rep("remaining m/z", nrow(msidata)))
    colnames(kept_df) = c("mz", "legend")

    ## m/z values of the input that are no longer present after filtering
    mz_removed = setdiff(featuresinfile, mz(msidata))
    removed_df = data.frame(mz_removed, rep("removed m/z", length(mz_removed)))
    colnames(removed_df) = c("mz", "legend")
    histogram_df = rbind(removed_df,kept_df)

    histogram_mz= ggplot(histogram_df, aes(x=mz, fill=legend)) +
        geom_histogram()+ theme_bw()+
        theme(plot.title = element_text(hjust = 0.5))+
        theme(legend.position="bottom",legend.direction="vertical")+
        labs(title="Overview of filtered m/z", x="m/z", y = "count") +
        guides(fill=guide_legend(ncol=5,byrow=TRUE))
    print(histogram_mz + scale_fill_manual(values=c("#00BFC4", "#F8766D")))

    dev.off()

    ## save msidata as imzML file, will only work if there is at least 1 m/z left

    if (maxfeatures2 > 0){
        ## make sure that coordinates are integers
        coord(msidata)\$y = as.integer(coord(msidata)\$y)
        coord(msidata)\$x = as.integer(coord(msidata)\$x)
        msidata = as(msidata, "MSContinuousImagingExperiment")
        writeImzML(msidata, "out")
    }


}else{
    ## the PDF then only contains the property table; no imzML file is written
    print("Inputfile or filtered file has no intensities > 0")
    dev.off()
}

    ]]></configfile>
    </configfiles>
    <inputs>
        <expand macro="reading_msidata"/>
        <!-- pixel filtering: the selected value decides which branch of the pixel filtering section in the R configfile is rendered -->
        <conditional name="pixels_cond">
            <param name="pixel_filtering" type="select" label="Select pixel filtering option">
                <option value="none" selected="True">none</option>
                <option value="two_columns">coordinates from tabular file</option>
                <option value="pixel_range">ranges for x and y (manually)</option>
            </param>
            <when value="none"/>
            <!-- keep the pixels whose x/y pair is listed in the tabular file; tabular_header is inserted as TRUE/FALSE into read.delim(header = ...) -->
            <when value="two_columns">
                <param name="annotation_file" type="data" format="tabular" label="Tabular file with pixel coordinates"
                help="Tabular file with two columns: x values and y values"/>
                <param name="column_x" data_ref="annotation_file" label="Column with x values" type="data_column"/>
                <param name="column_y" data_ref="annotation_file" label="Column with y values" type="data_column"/>
                <param name="tabular_header" type="boolean" label="Tabular file contains a header line" truevalue="TRUE" falsevalue="FALSE"/>
            </when> 
            <!-- keep the pixels inside the given x and y limits; the script compares with <= and >=, so the limits are inclusive -->
            <when value="pixel_range">
                <param name="min_x_range" type="integer" value="0" label="Minimum value for x"/>
                <param name="max_x_range" type="integer" value="100" label="Maximum value for x"/>
                <param name="min_y_range" type="integer" value="0" label="Minimum value for y"/>
                <param name="max_y_range" type="integer" value="100" label="Maximum value for y"/>
            </when> 
        </conditional>

        <!-- m/z feature filtering: the selected value decides which branch of the feature filtering section in the R configfile is rendered -->
        <conditional name="features_cond">
            <param name="features_filtering" type="select" label="Select m/z feature filtering option">
                <option value="none" selected="True">none</option>
                <option value="features_list">keep m/z (tabular file)</option>
                <option value="features_range">m/z range (manually)</option>
                <option value="remove_features">remove m/z (tabular file)</option>
            </param>
            <when value="none"/>
            <!-- keep only the m/z values listed in the tabular file -->
            <when value="features_list">
                <expand macro="reading_1_column_mz_tabular" label="Tabular file with m/z features to keep"/>
            </when>
            <!-- keep the m/z values inside the given range; both limits are inclusive in the script -->
            <when value="features_range">
                <param name="min_mz" type="float" value="1" label="Minimum value for m/z"/>
                <param name="max_mz" type="float" value="100" label="Maximum value for m/z"/>
            </when>
            <!-- remove every m/z inside a window of +/- removal_plusminus (ppm or Da) around each listed m/z -->
            <when value="remove_features">
                <expand macro="reading_1_column_mz_tabular" label="Tabular file with m/z features to remove"/>
                <param name="removal_plusminus" type="float" value="20" label="Window in which all m/z will be removed" help="This value is the half window size, it will be added and subtracted from the given input value"/>
                <param name="units_removal" type="select" display="radio" optional="False" label="units">
                        <option value="ppm" selected="True">ppm</option>
                        <option value="Da">Da</option>
                </param>
            </when>
        </conditional>
    </inputs>

    <!-- outfile_imzml: text listing written by the command section; the imzML/ibd pair is moved into its extra files directory.
         QC_overview: the PDF report that the R script writes as filtertool_QC.pdf in the working directory -->
    <outputs>
        <data format="imzml" name="outfile_imzml" label="${tool.name} on ${on_string}: imzML"/>
        <data format="pdf" name="QC_overview" from_work_dir="filtertool_QC.pdf" label = "${tool.name} on ${on_string}: QC"/>
    </outputs>
    <!-- Every test checks the QC PDF and the imzML listing by similar size, the imzml extra file with up to 6 differing lines and the ibd extra file by similar size -->
    <tests>
        <!-- pixel range (x 1-20, y 2-2) combined with an m/z range (350-500) -->
        <test>
            <expand macro="infile_imzml"/>
            <param name="pixel_filtering" value="pixel_range"/>
            <param name="min_x_range" value="1"/>
            <param name="max_x_range" value="20"/>
            <param name="min_y_range" value="2"/>
            <param name="max_y_range" value="2"/>
            <param name="features_filtering" value="features_range"/>
            <param name="min_mz" value="350"/>
            <param name="max_mz" value="500"/>
            <output name="QC_overview" file="imzml_filtered3.pdf" compare="sim_size"/>
            <output name="outfile_imzml" ftype="imzml" file="out3.imzml.txt" compare="sim_size">
                <extra_files type="file" file="out3.imzml" name="imzml" lines_diff="6"/>
                <extra_files type="file" file="out3.ibd" name="ibd" compare="sim_size"/>
            </output>
        </test>
        <!-- pixel coordinates from a tabular file (x in column 1, y in column 3); feature filtering is not set -->
        <test>
            <expand macro="infile_imzml"/>
            <param name="pixel_filtering" value="two_columns"/>
            <param name="annotation_file" ftype="tabular" value="inputpixels_2column.tabular"/>
            <param name="column_x" value="1"/>
            <param name="column_y" value="3"/>
            <output name="QC_overview" file="imzml_filtered4.pdf" compare="sim_size"/>
            <output name="outfile_imzml" ftype="imzml" file="out4.imzml.txt" compare="sim_size">
                <extra_files type="file" file="out4.imzml" name="imzml" lines_diff="6"/>
                <extra_files type="file" file="out4.ibd" name="ibd" compare="sim_size"/>
            </output>
        </test>
        <!-- pixel range (x 0-10, y 2-20) combined with a list of m/z features to keep -->
        <test>
            <expand macro="infile_imzml"/>
            <param name="pixel_filtering" value="pixel_range"/>
            <param name="min_x_range" value="0"/>
            <param name="max_x_range" value="10"/>
            <param name="min_y_range" value="2"/>
            <param name="max_y_range" value="20"/>
            <param name="features_filtering" value="features_list"/>
            <param name="mz_tabular" ftype="tabular" value = "featuresofinterest5.tabular"/>
            <param name="feature_column" value="1"/>
            <param name="feature_header" value="0"/>
            <output name="QC_overview" file="imzml_filtered5.pdf" compare="sim_size"/>
            <output name="outfile_imzml" ftype="imzml" file="out5.imzml.txt" compare="sim_size">
                <extra_files type="file" file="out5.imzml" name="imzml" lines_diff="6"/>
                <extra_files type="file" file="out5.ibd" name="ibd" compare="sim_size"/>
            </output>
        </test>
        <!-- input provided by the infile_analyze75 macro; no filtering parameters are set -->
        <test>
            <expand macro="infile_analyze75"/>
            <output name="QC_overview" file="analyze75_filtered2.pdf" compare="sim_size"/>
            <output name="outfile_imzml" ftype="imzml" file="out6.imzml.txt" compare="sim_size">
                <extra_files type="file" file="out6.imzml" name="imzml" lines_diff="6"/>
                <extra_files type="file" file="out6.ibd" name="ibd" compare="sim_size"/>
            </output>
        </test>
        <!-- RData input; no filtering parameters are set -->
        <test>
            <param name="infile" value="preprocessed.RData" ftype="rdata"/>
            <output name="QC_overview" file="rdata_notfiltered.pdf" compare="sim_size" />
            <output name="outfile_imzml" ftype="imzml" file="out7.imzml.txt" compare="sim_size">
                <extra_files type="file" file="out7.imzml" name="imzml" lines_diff="6"/>
                <extra_files type="file" file="out7.ibd" name="ibd" compare="sim_size"/>
            </output>
        </test>
        <!-- processed imzML input (accuracy 200 ppm): pixel coordinates from a tabular file plus removal of listed m/z with a +/- 100 ppm window -->
        <test>
            <expand macro="processed_infile_imzml"/>
            <conditional name="processed_cond">
                <param name="processed_file" value="processed"/>
                <param name="accuracy" value="200"/>
                <param name="units" value="ppm"/>
            </conditional>
            <param name="pixel_filtering" value="two_columns"/>
            <param name="annotation_file" ftype="tabular" value="inputpixels_2column.tabular"/>
            <param name="column_x" value="1"/>
            <param name="column_y" value="3"/>
            <param name="features_filtering" value="remove_features"/>
            <param name="mz_tabular" ftype="tabular" value = "featuresofinterest5.tabular"/>
            <param name="feature_column" value="1"/>
            <param name="feature_header" value="0"/>
            <param name="removal_plusminus" value="100"/>
            <param name="units_removal" value="ppm"/>
            <output name="QC_overview" file="imzml_filtered8.pdf" compare="sim_size"/>
            <output name="outfile_imzml" ftype="imzml" file="out8.imzml.txt" compare="sim_size">
                <extra_files type="file" file="out8.imzml" name="imzml" lines_diff="6"/>
                <extra_files type="file" file="out8.ibd" name="ibd" compare="sim_size"/>
            </output>
        </test>
    </tests>
    <help>
        <![CDATA[

@CARDINAL_DESCRIPTION@

-----

This tool provides options to filter (subset) pixels and m/z features of mass spectrometry imaging data.

@MSIDATA_INPUT_DESCRIPTION@

        - Optional file with pixel coordinates and annotation: 

            - Tabular file: One column with x values, one column with y values
            - The file is allowed to have any column names as header (in this case set "Tabular file contains a header line" to yes)
            - Pixels with coordinates outside the coordinate range of the input file are ignored

                ::

                      x_coord     y_coord
                        1            1   
                        2            1   
                        3            1   
                        ...
                        ...

@MZ_TABULAR_INPUT_DESCRIPTION@

**Options**

- Pixel filtering/annotation: 

    - Either with a tabular file containing x and y coordinates or by entering x-min, x-max, y-min, y-max manually
    - Pixels that are not present in the dataset are ignored
    - An error occurs if the input for filtering (tabular file, x-range or y-range) does not contain a single coordinate that occurs in the input dataset


- m/z feature filtering: 

    - Either with a tabular file containing m/z values or by entering m/z-min and m/z-max manually
    - m/z values that are not present in the dataset are ignored
    - An error occurs if the input for filtering (tabular file or m/z range) does not contain a single m/z feature that occurs in the dataset


- m/z feature removing: 

    - Perturbing m/z features such as matrix contaminants, tryptic peptides and internal calibrants can be removed by specifying their m/z value in a tabular file, optionally with a half window size in ppm or m/z for the window in which peaks should be removed


**Tips**

- m/z feature filtering with a tabular file: 

    - For matching, the m/z features of the input dataset are rounded to the number of decimal places of the m/z values in the tabular file. If the tabular file contains m/z values with more than 4 decimal places, the m/z values of both the dataset and the tabular file are rounded to 4 decimal places.
    - Therefore, it is recommended to use the filtering tool only for m/z features which have been extracted from the same dataset. If the m/z values are from a different dataset, the tool 'Join two files on column allowing a small difference' should be used to find corresponding m/z values, which can then be used for filtering. 

- Problems selecting the tabular file: 

    - In case tabular file cannot be selected in drop-down menu: Datatype in Galaxy might be different from 'tabular' - datatype can be changed by pressing the pen button of the dataset (edit attributes)


**Output**

- MSI data as (continuous) imzML file
- pdf with heatmap showing the pixels that are removed and kept as well as histograms of kept and removed m/z


        ]]>
    </help>
    <expand macro="citations"/>
</tool>