view replicate_filter.xml @ 2:6cb796aa12c8 draft default tip

"planemo upload for repository https://github.com/computational-metabolomics/dimspy-galaxy commit 80069808371b58f45da0c8133c27d67ac1a5b448"
author computational-metabolomics
date Wed, 17 Feb 2021 10:54:32 +0000
parents 0cdf340364ed
children
line wrap: on
line source

<tool id="dimspy_replicate_filter" name="Replicate Filter" version="@TOOL_VERSION@+galaxy1">
    <!-- One-line summary of what the tool does. -->
    <description> - Remove peaks that fail to appear in at least x-out-of-n (technical) replicates</description>  
    <!-- Pulls in shared macro definitions from macros.xml (that file is not visible here). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Expands the macro named "requirements"; presumably defined in macros.xml (to be confirmed). -->
    <expand macro="requirements" />
    <!--
        Runs three dimspy sub-commands chained with the shell AND operator, so a
        failing step stops the chain (errors are detected through the exit code):
          1. replicate-filter   : reads the input HDF5 peaklists, writes the output HDF5 file and the report.
          2. create-sample-list : writes a tab-delimited sample list from the output HDF5 file.
          3. hdf5-pls-to-txt    : writes files into the working directory; the *.txt files
                                  found there are collected into the "peaklists_txt" collection.
        Free-text values are single-quoted so the shell never word-splits or
        interprets them; typed numeric parameters are passed as-is.
    -->
    <command detect_errors="exit_code">
    <![CDATA[
        ## Step 1: replicate filtering
        dimspy replicate-filter
        --input '$hdf5_file_in'
        --output '$hdf5_file_out'
        ## The filelist is optional; only pass it when a dataset was selected.
        #if $filelist
            --filelist '$filelist'
        #end if
        --ppm $ppm
        --replicates $replicates
        --min-peak-present $min_peaks
        ## The RSD threshold is a free-text field: skip it when empty or
        ## whitespace-only, and quote it so it reaches dimspy as one argument.
        #if str($rsd_threshold).strip()
            --rsd-threshold '$rsd_threshold'
        #end if
        --report '$report'
        --ncpu \${GALAXY_SLOTS:-1}
        &&
        ## Step 2: sample list derived from the filtered HDF5 file
        dimspy create-sample-list
        --input '$hdf5_file_out'
        --output '$samplelist'
        --delimiter tab
        &&
        ## Step 3: text export of the filtered peaklists into the working directory
        dimspy hdf5-pls-to-txt
        --input '$hdf5_file_out'
        --output .
        --delimiter '$delimiter'
    ]]>
    </command>
    <inputs>
        <!-- Input peaklists; passed to dimspy replicate-filter through its input option. -->
        <param name="hdf5_file_in" argument="--input" type="data" format="h5" label="Peaklists (HDF5 file)" help="Peaklists generated by Process Scans (SIM-Stitch)." />
        <!-- Optional; the command only adds the filelist option when a dataset is selected. -->
        <param name="filelist" argument="--filelist" type="data" format="tsv,tabular" optional="true" label="Filelist / Samplelist" help="Only provide a filelist if you like to exclude Peaklists, update the metadata (e.g. classLabel), or if you have not provided a filelist for Process Scans." />
        <!-- Replicate counts below one are meaningless, so they are rejected in the form. -->
        <param name="replicates" argument="--replicates" type="integer" value="3" min="1" label="Number of technical replicates for each sample" help="" />
        <!-- The argument attribute mirrors the flag actually used on the command line. -->
        <param name="min_peaks" argument="--min-peak-present" type="integer" value="2" min="1" label="Minimum number of technical replicates a peak has to be present in" help="" />
        <!-- A tolerance cannot be negative. -->
        <param name="ppm" argument="--ppm" type="float" value="2.0" min="0" label="Ppm error tolerance" help="Maximum tolerated m/z deviation across technical replicates in parts per million (ppm)." />
        <!-- Kept as a text field so that it can be left empty; the validator restricts it to an empty value or a plain non-negative decimal number. -->
        <param name="rsd_threshold" argument="--rsd-threshold" type="text" value="" label="Relative standard deviation threshold" help="Maximum tolerated relative standard deviation (RSD) of the peak intensities across technical replicates. Leave empty to skip this filter step.">
            <validator type="regex" message="Enter a non-negative number (e.g. 30 or 12.5) or leave the field empty.">^([0-9]+(\.[0-9]*)?|\.[0-9]+)?$</validator>
        </param>
        <!-- Fixed, non-editable delimiter used for the text export of the peaklists. -->
        <param name="delimiter" type="hidden" value="tab" argument="--delimiter" label="" help=""/>
    </inputs>
    <outputs>
        <!-- HDF5 file written by the replicate-filter step. -->
        <data name="hdf5_file_out"
              format="h5"
              label="${tool.name} on ${on_string}: Peaklists (HDF5 file)" />

        <!-- Text report written by the replicate-filter step. -->
        <data name="report"
              format="txt"
              label="${tool.name} on ${on_string}: Report" />

        <!-- Sample list written by the create-sample-list step. -->
        <data name="samplelist"
              format="tsv"
              label="${tool.name} on ${on_string}: Sample Metadata (updated)" />

        <!-- Every *.txt file found in the working directory becomes one hidden
             element of this list; the element name is the file name without
             its .txt suffix. -->
        <collection name="peaklists_txt"
                    type="list"
                    label="${tool.name} on ${on_string}: Peaklists">
            <discover_datasets directory="."
                               pattern="(?P&lt;designation&gt;.+)\.txt"
                               format="tsv"
                               visible="false" />
        </collection>
    </outputs>
    <tests>
        <!-- Case 1: default settings on pls.h5; checks every output, including
             one element of the discovered peaklist collection. -->
        <test>
            <param name="hdf5_file_in" ftype="h5" value="pls.h5" />
            <param name="replicates" value="3" />
            <param name="min_peaks" value="2" />
            <param name="ppm" value="2.0" />
            <param name="rsd_threshold" value="" />
            <!-- The HDF5 output is compared by size only. -->
            <output name="hdf5_file_out" ftype="h5" value="pls_rf.h5" compare="sim_size" />
            <output name="report" ftype="txt" value="report_pls_rf_01.txt" />
            <output name="samplelist" ftype="tsv" value="samplelist_1.txt" />
            <output_collection name="peaklists_txt" type="list">
                <element name="batch04_QC17_rep01_262_2_263_3_264"
                         ftype="tsv"
                         file="batch04_QC17_rep01_262_2_263_3_264.txt" />
            </output_collection>
        </test>

        <!-- Case 2: same settings on pls_scan5.h5; only the report is compared. -->
        <test>
            <param name="hdf5_file_in" ftype="h5" value="pls_scan5.h5" />
            <param name="replicates" value="3" />
            <param name="min_peaks" value="2" />
            <param name="ppm" value="2.0" />
            <param name="rsd_threshold" value="" />
            <output name="report" ftype="txt" value="report_pls_rf_02.txt" />
        </test>
    </tests>
    <help>
----------------
Replicate filter
----------------

..

--------------------

Description
-----------

Standard DIMS processing workflow: Process Scans -> **[Replicate Filter]** -> Align Samples -> Blank Filter -> Sample Filter -> [Missing value sample filter] -> Pre-processing -> Statistics

|

To draw robust conclusions from DIMS-based metabolomics datasets, the data itself must be collected and processed in a robust and reproducible way. To support this aim, study samples are often divided into a set of equivalent aliquots, each of which is analysed under defined and consistent analytical conditions. As aliquots of the same sample are assumed to comprise identical biological material, differences in their resulting spectra are assumed to arise due to technical variability. Removing artifacts associated with this technical variability, and removing mass spectral peaks that are detected irreproducibly, is possible by filtering across these technical replicate spectra. The Replicate Filter tool facilitates this process, combining the peaklists extracted (using the Process Scans tool) from each technical replicate of a given study into a single merged peaklist, before applying a series of user-defined filters to yield a replicate-filtered peaklist.

|

**Replicate filter process**: peaks from each technical replicate (for a given study sample) are aligned using a one-dimensional hierarchical clustering procedure (applied on the mass-to-charge scale). Peaks are aligned only if the difference in their mass-to-charge ratios, when divided by the average of their mass-to-charge ratios and multiplied by 1 × 10\ :sup:`6` \ (i.e. when measured in units of parts-per-million, ppm), is less-than or equal-to the user-defined ‘ppm error tolerance’. After alignment, a set of user-defined filters is applied to retain only those peaks that:

* occur in equal-to or more-than the user-defined 'Number of technical replicates a peak has to be present in', i.e. if set to 2, then a peak must be detected in at least two of the replicate analyses, **and/or**

* have relative standard deviation (measured in %; may otherwise be referred to as the percent coefficient of variation) of intensity values, across technical replicates, that is equal-to or less-than the user-defined ‘relative standard deviation threshold’ (if defined, otherwise ignored). 

**IMPORTANT**:

* When a user sets the parameter “number of technical replicates for each sample” to a value less-than the total number of technical replicates actually acquired for each study sample, this tool will automatically determine which combination of technical replicates to combine. See the parameter description (below) for further details.

* If a specific scan event has been replicated within a given file, then users should not use this tool for filtering and aligning peaks. In this case, users should instead FIRST use the Process Scans (and SIM stitch) tool - see the documentation for the Process Scans tool for a description of how to perform this.

|

**Output** - a replicate-filtered peaklist per input study sample. In each, rows correspond to replicate-filtered mass spectral peaks, while columns provide a range of metrics for each of the detected peaks (see “Output file(s)” section, below). 

------------------

Parameters
----------

* **Number of technical replicates for each sample** (REQUIRED; default = 3) - the total number of technical replicates acquired for each study sample. This value must be set to the lowest number of technical replicates acquired for ANY of the study samples, or alternatively, may be set to the minimum number of replicates the user would like to select from the total number of technical replicates for a biological sample.

* **Minimum number of technical replicates a peak has to be present in** (REQUIRED; default = 2) - For a given biological sample, the minimum number of technical replicates in which a peak must be detected for it to be retained in the replicate-filtered peaklist (the x in x-out-of-n). Replicate selection: if the parameter 'Number of technical replicates for each sample' is set to a value less-than the total number of technical replicates acquired for each biological sample, then the tool automatically determines which combination of technical replicates yields the best overall rank. Otherwise, all technical replicates are used. Ranking of the combinations of technical replicates is based on the average of the following three scores:

    # score 1: peak count / peak count present in n-out-of-n (e.g. 3-out-of-3)

    # score 2: peak count present in x-out-of-n (e.g. 3-out-of-3) / MAX peak count present in x-out-of-n across sets of replicates

    # score 3: RSD categories (0-5 (score=1.0), 5-10 (score=0.9), 10-15 (score=0.8), etc)

* **ppm error tolerance** (REQUIRED; default = 2) - this parameter will influence the alignment of peaks across technical replicates. Peaks from distinct technical replicates of a given study sample are aligned if the difference between their mass-to-charge ratios, when divided by the average of their mass-to-charge ratios and multiplied by 1 × 10\ :sup:`6`\ , is equal-to or less-than this parameter value (i.e. the difference between the mass-to-charge ratios, measured on the ppm scale, is less-than or equal-to the user-defined threshold).

* **Relative standard deviation threshold** (OPTIONAL) - a numerical value from 0 upwards that defines the acceptable percentage relative standard deviation (otherwise termed the percent coefficient of variation) of a peak’s intensity across technical replicates. Peaks are removed from the output ‘replicate-filtered’ peaklist if this condition is not met.

-------------------

Output file(s)
--------------

|

The Replicate Filter tool will output three file types:

1) **A HDF5 file containing the replicate-filtered Peaklists**

2) **A replicate-filtered peaklist**, in .tsv format, for each biological sample defined in the filelist/samplelist. Tab-delimited text file containing a numeric data matrix, with . as decimal, and NA for missing values. Each row corresponds to a mass spectral peak. Columns provide metrics associated with each mass spectral peak. Metrics include:

@help_columns_peaklist@

@example_peaklist@

|

3) **A tabular “report” file** that details, for each biological sample included in the filelist/samplelist:
    
   - **Set** - a numeric index value that is unique to a specific biological sample

   - **Rank** - a positive integer indicating which combination of technical replicates offered the best-ranked replicate-filtered peaklist.

   - **Name** - a string indicating which of the technical replicates for a given biological sample were combined together.

   - **Peaks_[XooY]** - a numeric value indicating the total number of peaks that were present in X-out-of-Y technical replicates.

   - **Median_rsd_[XooY]** - the median of the percent relative standard deviations of all peaks in the replicate-filtered peaklist that were present in X-out-of-Y technical replicates.

   - **Score** - a numeric value between 0 and 1 that serves to indicate the best combination of technical replicates for a given biological sample (as defined by the ‘name’ column)

--------------------------

@github_developers_contributors@
@license@
    </help>
    <!-- Expands the macro named "citations"; presumably defined in the imported macros.xml (to be confirmed). -->
    <expand macro="citations" />
</tool>