Mercurial > repos > galaxyp > proteomics_moff

<tool id="proteomics_moff" name="moFF" version="@VERSION@.1">
    <description>extracts MS1 intensities from spectrum files</description>
    <macros>
        <!-- Version of the wrapped moff package; reused by the tool version attribute, the package requirement and the version command. -->
        <token name="@VERSION@">1.2.1</token>
        <!-- xml macros, used for shared Galaxy parameter inputs -->
        <!--
            Shared identification-file input, expanded once in every task branch.
            token_input_type   : Galaxy type of the file parameter, "data" by default; the
                                 multi-file tasks pass "data_collection" instead.
            token_allow_multiple : forwarded to the file parameter's "multiple" attribute.
            The user picks either a PeptideShaker PSM report or a generic tabular file; for the
            generic format the seven columns needed downstream are chosen with data_column
            parameters that all reference the selected file.
        -->
        <xml name="ident_input_macro" token_allow_multiple="true" token_input_type="data">
            <conditional name="ident_input">
                <param name="input_type_selector" type="select" label="Choose the format for the identification file:">
                    <option value="ps">Peptide Shaker PSM report (standard, not extended)</option>
                    <option value="generic">Another tabular identification file</option>
                </param>
                <when value="ps">
                    <param name="ident_input_file" type="@INPUT_TYPE@" format="tabular" label="Peptide Shaker PSM report" multiple="@ALLOW_MULTIPLE@"/>
                </when>
                <when value="generic">
                    <param name="ident_input_file" type="@INPUT_TYPE@" format="tabular" label="A general tabular format" multiple="@ALLOW_MULTIPLE@"
                        help="Must have specific columns; see below to select these columns from your file. The file should have at most one header line. "/>
                    <param name="remove_header" type="boolean" value="false" label="Remove the header line?" help="This is necessary if the file has a line with column headers"/>
                    <!-- column selectors; each one is resolved against the file chosen above -->
                    <param name="peptide"
                        type="data_column"
                        data_ref="ident_input_file"
                        label="Column with peptide-spectrum-match sequence"/>
                    <param name="prot"
                        type="data_column"
                        data_ref="ident_input_file"
                        label="Column with protein ID"/>
                    <param name="mod_peptide"
                        type="data_column"
                        data_ref="ident_input_file"
                        label="Column with peptide-spectrum-match sequence that contains possible modifications"/>
                    <param name="rt"
                        type="data_column"
                        data_ref="ident_input_file"
                        label="Column with PSM retention time (in seconds)"/>
                    <param name="mz"
                        type="data_column"
                        data_ref="ident_input_file"
                        label="Column with m/z (mass over charge)"/>
                    <param name="mass"
                        type="data_column"
                        data_ref="ident_input_file"
                        label="Column with mass of the peptide"/>
                    <param name="charge"
                        type="data_column"
                        data_ref="ident_input_file"
                        label="Column with charge of ionized peptide"/>
                </when>
            </conditional>
        </xml>
        <!-- Shared spectrum-file input: a format selector followed by the matching Thermo RAW or
             mzML file parameter. token_input_type sets the parameter type and
             token_allow_multiple its "multiple" attribute. -->
        <xml name="raw_input_macro" token_allow_multiple="true" token_input_type="data">
            <conditional name="msms_input">
                <param name="input_type_selector"
                       type="select"
                       label="Choose the format for the MS/MS file">
                    <option value="raw">Thermo RAW file</option>
                    <option value="mzml">mzML</option>
                </param>
                <when value="raw">
                    <param argument="--inputraw"
                           type="@INPUT_TYPE@"
                           format="raw"
                           multiple="@ALLOW_MULTIPLE@"
                           label="RAW file(s)"/>
                </when>
                <when value="mzml">
                    <param argument="--inputraw"
                           type="@INPUT_TYPE@"
                           format="mzml"
                           multiple="@ALLOW_MULTIPLE@"
                           label="mzML file(s)"/>
                </when>
            </conditional>
        </xml>
        <!-- tokens (code snippets used in <command>) -->
        <!-- Command fragment for collection input: emits the inputtsv option followed by one quoted
             ./ident_inputs/[display name] path per element of the identification collection. -->
        <token name="@IDENT_INPUT_ARG_MULTIPLE@"><![CDATA[
            ## this is where the ident input gets passed to moff/moff_all/moff_mbr
            --inputtsv
            #for $key in $task.ident_input.ident_input_file.keys():
                './ident_inputs/${task.ident_input.ident_input_file[$key].display_name}'
            #end for
        ]]></token>
        <!-- Command fragment for a single dataset: emits the inputtsv option with the quoted
             ./ident_inputs/[display name] path of the identification file. -->
        <token name="@IDENT_INPUT_ARG_SINGLE@"><![CDATA[
            ## this is where the ident input gets passed to moff/moff_all/moff_mbr
            --inputtsv './ident_inputs/${task.ident_input.ident_input_file.display_name}'
        ]]></token>
        <!-- Stages a single identification dataset in ./ident_inputs under its display name.
             PeptideShaker reports are copied unchanged. Generic tabular files are copied to a
             temporary file, optionally lose their first line, and are rewritten by awk so that only
             the seven user-selected columns remain beneath a fixed header row.
             Every emitted command ends with "&&", so another command must follow this fragment. -->
        <token name="@WRANGLE_IDENT_INPUT_SINGLE@"><![CDATA[
            mkdir ./ident_inputs &&
            #if $task.ident_input.input_type_selector == "ps":
                cp '$task.ident_input.ident_input_file' './ident_inputs/$task.ident_input.ident_input_file.display_name' &&
            #else
                cp '$task.ident_input.ident_input_file' ./tempfile1.tab &&
                ## optionally remove first line
                #if $task.ident_input.remove_header:
                    sed -i '1d' ./tempfile1.tab &&
                #end if
                ## header row with correct names: "peptide", "prot", "mod_peptide", "rt", "mz", "mass", and "charge"
                echo -e "peptide\tprot\tmod_peptide\trt\tmz\tmass\tcharge" > ./tempfile2.tab &&
                awk 'BEGIN{OFS="\t"; FS="\t"}{print \$pep,\$prot,\$mod,\$rt,\$mz,\$mass,\$charge}' pep="${task.ident_input.peptide}" prot="$task.ident_input.prot" mod="$task.ident_input.mod_peptide" rt="$task.ident_input.rt" mz="$task.ident_input.mz" mass="$task.ident_input.mass" charge="$task.ident_input.charge" ./tempfile1.tab >> ./tempfile2.tab &&
                mv ./tempfile2.tab './ident_inputs/$task.ident_input.ident_input_file.display_name' &&
            #end if
        ]]></token>
        <!-- Collection counterpart of the single-file staging fragment: loops over the keys of the
             identification collection and stages every element in ./ident_inputs under its display
             name. Generic tabular elements go through per-element temporary files (named after the
             key), optional first-line removal, and the awk rewrite to the seven selected columns.
             Every emitted command ends with "&&", so another command must follow this fragment. -->
        <token name="@WRANGLE_IDENT_INPUT_MULTIPLE@"><![CDATA[
            mkdir ./ident_inputs &&
            #if $task.ident_input.input_type_selector == "ps":
                #for $key in $task.ident_input.ident_input_file.keys():
                    cp '${task.ident_input.ident_input_file[$key]}' './ident_inputs/${task.ident_input.ident_input_file[$key].display_name}' &&
                #end for
            #else
                #for $key in $task.ident_input.ident_input_file.keys():
                    cp '${task.ident_input.ident_input_file[$key]}' './tempfile${key}_1.tab' &&
                    ## optionally remove first line
                    #if $task.ident_input.remove_header:
                        sed -i '1d' './tempfile${key}_1.tab' &&
                    #end if
                    ## header row with correct names: "peptide", "prot", "mod_peptide", "rt", "mz", "mass", and "charge"
                    echo -e "peptide\tprot\tmod_peptide\trt\tmz\tmass\tcharge" > './tempfile${key}_2.tab' &&
                    awk 'BEGIN{OFS="\t"; FS="\t"}{print \$pep,\$prot,\$mod,\$rt,\$mz,\$mass,\$charge}' pep="${task.ident_input.peptide}" prot="$task.ident_input.prot" mod="$task.ident_input.mod_peptide" rt="$task.ident_input.rt" mz="$task.ident_input.mz" mass="$task.ident_input.mass" charge="$task.ident_input.charge" './tempfile${key}_1.tab' >> './tempfile${key}_2.tab' &&
                    mv './tempfile${key}_2.tab' './ident_inputs/$task.ident_input.ident_input_file[$key].display_name' &&
                #end for
            #end if
        ]]></token>
        <!-- Command fragment for a single dataset: emits the inputraw option with the quoted
             ./raws/[display name] path of the spectrum file. -->
        <token name="@RAW_INPUT_ARG_SINGLE@"><![CDATA[
            --inputraw './raws/$task.msms_input.inputraw.display_name'
        ]]></token>
        <!-- Command fragment for collection input: emits the inputraw option followed by one quoted
             ./raws/[display name] path per element of the spectrum collection. -->
        <token name="@RAW_INPUT_ARG_MULTIPLE@"><![CDATA[
            --inputraw
            #for $key in $task.msms_input.inputraw.keys():
                './raws/$task.msms_input.inputraw[$key].display_name'
            #end for
        ]]></token>
        <!-- Creates ./raws and symlinks the single spectrum dataset into it under its display name.
             The fragment ends with "&&", so another command must follow it. -->
        <token name="@WRANGLE_RAW_INPUT_SINGLE@"><![CDATA[
            mkdir ./raws &&
            ## for files, need to softlink the display name to the history item
            ln -s '$task.msms_input.inputraw' './raws/$task.msms_input.inputraw.display_name' &&
        ]]></token>
        <!-- Creates ./raws and symlinks every element of the spectrum collection into it under the
             element's display name. Each command ends with "&&", so another command must follow. -->
        <token name="@WRANGLE_RAW_INPUT_MULTIPLE@"><![CDATA[
            mkdir ./raws &&
            ## for files, need to softlink the display name to the history item
            #for $key in $task.msms_input.inputraw.keys():
                ln -s '$task.msms_input.inputraw[$key]' './raws/$task.msms_input.inputraw[$key].display_name' &&
            #end for
        ]]></token>
    </macros>
    <!-- Runtime dependency: the moff package, pinned through the shared version token. -->
    <requirements>
        <requirement type="package"
                     version="@VERSION@">moff</requirement>
    </requirements>
    <!-- Reports the pinned version token by echoing it; no moFF executable is invoked here. -->
    <version_command>echo @VERSION@</version_command>
    <!--
        Command template (Cheetah). The selected task decides which moFF entry point runs:
          moff : moff.py      (one identification file and one spectrum file)
          mbr  : moff_mbr.py  (identification collection only)
          all  : moff_all.py  (identification and spectrum collections)
        The WRANGLE tokens stage inputs under ./ident_inputs and ./raws first. Results land in
        ./out, from where they are moved onto fixed outputs or discovered as collections.
    -->
    <command detect_errors="aggressive"><![CDATA[
        mkdir ./out &&
        #if $task.task_selector == "moff":
            @WRANGLE_IDENT_INPUT_SINGLE@
            @WRANGLE_RAW_INPUT_SINGLE@
            moff.py
                @IDENT_INPUT_ARG_SINGLE@
                @RAW_INPUT_ARG_SINGLE@
                --tol $task.tol
                --rt_w $task.rt_w
                --rt_p $task.rt_p
                --output_folder ./out
                #if $task.peptide_summary:
                    --peptide_summary 1
                #end if
            &&
            #if $task.peptide_summary:
                mv ./out/peptide_summary_intensity_moFF_run.tab '$output_peptide_summary' &&
            #end if
            mv ./out/*moff_result.txt '$output_table'
            &&
            mv ./out/*.log '$output_logs'
        #else if $task.task_selector == "mbr":
            @WRANGLE_IDENT_INPUT_MULTIPLE@
            moff_mbr.py
                --inputF ./ident_inputs
                ## free-text parameter: quoted so the shell always receives exactly one word
                --ext '$task.ext'
            &&
            mv ./ident_inputs/mbr_output/* ./out
        #else:
            ## moff_all (mbr followed by apex)
            @WRANGLE_IDENT_INPUT_MULTIPLE@
            @WRANGLE_RAW_INPUT_MULTIPLE@
            moff_all.py
                @IDENT_INPUT_ARG_MULTIPLE@
                @RAW_INPUT_ARG_MULTIPLE@
                --tol $task.tol
                --rt_w $task.rt_w
                --rt_p $task.rt_p
                --rt_p_match $task.rt_p_match
                --output_folder ./out
                #if $task.peptide_summary:
                    --peptide_summary 1
                #end if
            ## the "&&" sits inside the conditional so that the command line never ends with a
            ## dangling operator when the peptide summary is switched off
            #if $task.peptide_summary:
                && mv ./out/peptide_summary_intensity_moFF_run.tab '$output_peptide_summary'
            #end if
        #end if
    ]]></command>
    <!-- User-facing parameters. The outer "task" conditional selects the moFF module; each branch
         expands the shared input macros it needs and adds the options that module accepts. -->
    <inputs>
        <conditional name="task">
            <param name="task_selector"
                   type="select"
                   label="Choose which module to run">
                <option value="moff">Apex intensity</option>
                <option value="mbr">Match between runs</option>
                <option value="all">All (match-between-runs followed by quantitation)</option>
            </param>
            <!-- Apex intensity: plain dataset inputs -->
            <when value="moff">
                <expand macro="ident_input_macro" allow_multiple="false"/>
                <expand macro="raw_input_macro" allow_multiple="false"/>
                <param argument="--tol"
                       type="float"
                       value="10"
                       label="Tolerance parameter"
                       help="Specify the tolerance parameter in ppm."/>
                <param argument="--rt_w"
                       type="float"
                       value="3.0"
                       label="Retention time window"
                       help="Specify rt window for xic in minutes."/>
                <param argument="--rt_p"
                       type="float"
                       value="1"
                       label="Time window for the peak"
                       help="Specify the time windows for the peak in minutes."/>
                <param argument="--peptide_summary"
                       type="boolean"
                       value="true"
                       label="Output the peptide summary?"/>
            </when>
            <!-- Match between runs: identification collection only -->
            <when value="mbr">
                <expand macro="ident_input_macro" allow_multiple="false" input_type="data_collection"/>
                <param argument="--ext"
                       type="text"
                       value="tab"
                       label="Provide the extension used in the display file name (without the period)"/>
            </when>
            <!-- All: identification and spectrum collections -->
            <when value="all">
                <expand macro="ident_input_macro" allow_multiple="false" input_type="data_collection"/>
                <expand macro="raw_input_macro" allow_multiple="false" input_type="data_collection"/>
                <param argument="--tol"
                       type="float"
                       value="10"
                       label="Tolerance parameter"
                       help="Specify the tolerance parameter in ppm."/>
                <param argument="--rt_w"
                       type="float"
                       value="3.0"
                       label="Retention time window"
                       help="Specify rt window for xic in minutes."/>
                <param argument="--rt_p"
                       type="float"
                       value="1"
                       label="Time window for the peak"
                       help="Specify the time windows for the peak in minutes."/>
                <param argument="--rt_p_match"
                       type="float"
                       value="1.2"
                       label="Time window for the matched peak"
                       help="Specify the time windows for the matched peak in minutes."/>
                <param argument="--peptide_summary"
                       type="boolean"
                       value="true"
                       label="Output the peptide summary?"/>
            </when>
        </conditional>
    </inputs>
    <!--
        Outputs, filtered by the selected task:
          moff : quantification table, log file and (optionally) the peptide summary
          mbr  : collection of matched tables and collection of logs
          all  : collection of quantification tables, collection of logs and (optionally)
                 the peptide summary
        Collections are discovered in ./out by file extension (.txt for tables, .log for logs).
    -->
    <outputs>
        <data format="tabular" name="output_table" label="${tool.name} on ${on_string}: quantification">
            <filter>task['task_selector']=='moff'</filter>
        </data>
        <!-- label now follows the same "tool on inputs: kind" scheme as every other output -->
        <data format="txt" name="output_logs" label="${tool.name} on ${on_string}: log">
            <filter>task['task_selector']=='moff'</filter>
        </data>
        <collection name="ident_output" type="list" label="${tool.name} on ${on_string}: quantification">
            <filter>task['task_selector']=='all'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.txt" directory="out" format="tabular"/>
        </collection>
        <collection name="ident_output_mbr" type="list" label="${tool.name} on ${on_string}: matched">
            <filter>task['task_selector']=='mbr'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.txt" directory="out" format="tabular"/>
        </collection>
        <collection name="log_output" type="list" label="${tool.name} on ${on_string}: logs">
            <filter>task['task_selector']=='all' or task['task_selector']=='mbr'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.log" directory="out" format="txt"/>
        </collection>
        <data format="tabular" name="output_peptide_summary" label="${tool.name} on ${on_string}: peptide summary">
            <filter>task['peptide_summary'] and (task['task_selector']=='all' or task['task_selector']=='moff')</filter>
        </data>
    </outputs>
    <tests>
        <!-- "all" task with PeptideShaker-format identification files and mzML spectra, both given
             as two-element list collections. Checks the peptide summary columns, one peptide per
             quantification element and one exact line per log element. -->
        <test>
            <param name="task|task_selector" value="all"/>
            <param name="ident_input|input_type_selector" value="ps"/>
            <param name="ident_input_file">
                <collection type="list">
                    <element name="mbr_test1" value="input/mbr_test1.tabular"/>
                    <element name="mbr_test2" value="input/mbr_test2.tabular"/>
                </collection>
            </param>
            <param name="msms_input|input_type_selector" value="mzml"/>
            <param name="inputraw">
                <collection type="list">
                    <element name="mbr_test1" value="input/mbr_test1.mzml"/>
                    <element name="mbr_test2" value="input/mbr_test2.mzml"/>
                </collection>
            </param>
            <param name="peptide_summary" value="true"/>
            <output name="output_peptide_summary" ftype="tabular">
                <assert_contents>
                    <has_text text="sumIntensity_mbr_test1"/>
                    <has_text text="sumIntensity_mbr_test2"/>
                </assert_contents>
            </output>
            <output_collection name="ident_output" type="list">
                <element name="mbr_test1_match_moff_result">
                    <assert_contents>
                        <has_text text="NH2-QVEEAVQSDDK-COOH"/>
                    </assert_contents>
                </element>
                <element name="mbr_test2_match_moff_result">
                    <assert_contents>
                        <has_text text="NH2-RDVGINNTVK-COOH"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="log_output" type="list">
                <element name="mbr_test1_match__moff">
                    <assert_contents>
                        <has_line line="peptide at line 200 -->  MZ: 783.4200 RT: 134.6997 matched (yes=1/no=0): 0"/>
                    </assert_contents>
                </element>
                <element name="mbr_test2_match__moff">
                    <assert_contents>
                        <has_line line="peptide at line 132 -->  MZ: 767.8700 RT: 98.1975 matched (yes=1/no=0): 0"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- "moff" task alone: one PeptideShaker-format identification file and one mzML file.
             Compares the peptide summary with a stored file and checks one exact log line. -->
        <test>
            <param name="task|task_selector" value="moff"/>
            <param name="ident_input|input_type_selector" value="ps"/>
            <param name="ident_input_file" value="input/test.tabular" ftype="tabular"/>
            <param name="msms_input|input_type_selector" value="mzml"/>
            <param name="inputraw" value="input/test.mzml" ftype="mzml"/>
            <param name="peptide_summary" value="true"/>
            <output name="output_peptide_summary" ftype="tabular" file="output2/moff_test_pepsum.tab"/>
            <output name="output_logs">
                <assert_contents>
                    <has_line line="peptide at line 294 -->  MZ: 677.3300 RT: 60.6078"/>
                </assert_contents>
            </output>
        </test>
        <!-- "moff" task with the generic tabular input: the header line is removed and the seven
             columns are selected by index from input/test.tabular. Expects the same peptide summary
             file and log line as the PeptideShaker-format moff test. -->
        <test>
            <param name="task|task_selector" value="moff"/>
            <param name="ident_input|input_type_selector" value="generic"/>
            <param name="ident_input_file" value="input/test.tabular" ftype="tabular"/>
            <param name="remove_header" value="true"/>
            <param name="msms_input|input_type_selector" value="mzml"/>
            <param name="inputraw" value="input/test.mzml" ftype="mzml"/>
            <param name="peptide" value="3"/>
            <param name="prot" value="2"/>
            <param name="mod_peptide" value="7"/>
            <param name="rt" value="13"/>
            <param name="mz" value="14"/>
            <param name="mass" value="17"/>
            <param name="charge" value="15"/>
            <param name="peptide_summary" value="true"/>
            <output name="output_peptide_summary" ftype="tabular" file="output2/moff_test_pepsum.tab"/>
            <output name="output_logs">
                <assert_contents>
                    <has_line line="peptide at line 294 -->  MZ: 677.3300 RT: 60.6078"/>
                </assert_contents>
            </output>
        </test>
        <!-- "all" task with the generic tabular input on list collections: the header line is
             removed and the seven columns are selected by index. Checks the peptide summary
             columns, one peptide per quantification element and one exact line per log element. -->
        <test>
            <param name="task|task_selector" value="all"/>
            <param name="ident_input|input_type_selector" value="generic"/>
            <param name="ident_input_file">
                <collection type="list">
                    <element name="mbr_test1" value="input/mbr_test1.tabular"/>
                    <element name="mbr_test2" value="input/mbr_test2.tabular"/>
                </collection>
            </param>
            <param name="remove_header" value="true"/>
            <param name="peptide" value="3"/>
            <param name="prot" value="2"/>
            <param name="mod_peptide" value="7"/>
            <param name="rt" value="13"/>
            <param name="mz" value="14"/>
            <param name="mass" value="17"/>
            <param name="charge" value="15"/>
            <param name="msms_input|input_type_selector" value="mzml"/>
            <param name="inputraw">
                <collection type="list">
                    <element name="mbr_test1" value="input/mbr_test1.mzml"/>
                    <element name="mbr_test2" value="input/mbr_test2.mzml"/>
                </collection>
            </param>
            <param name="peptide_summary" value="true"/>
            <output name="output_peptide_summary" ftype="tabular">
                <assert_contents>
                    <has_text text="sumIntensity_mbr_test1"/>
                    <has_text text="sumIntensity_mbr_test2"/>
                </assert_contents>
            </output>
            <output_collection name="ident_output" type="list">
                <element name="mbr_test1_match_moff_result">
                    <assert_contents>
                        <has_text text="NH2-QVEEAVQSDDK-COOH"/>
                    </assert_contents>
                </element>
                <element name="mbr_test2_match_moff_result">
                    <assert_contents>
                        <has_text text="NH2-RDVGINNTVK-COOH"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="log_output" type="list">
                <element name="mbr_test1_match__moff">
                    <assert_contents>
                        <has_line line="peptide at line 200 -->  MZ: 783.4200 RT: 134.6997 matched (yes=1/no=0): 0"/>
                    </assert_contents>
                </element>
                <element name="mbr_test2_match__moff">
                    <assert_contents>
                        <has_line line="peptide at line 132 -->  MZ: 767.8700 RT: 98.1975 matched (yes=1/no=0): 0"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- "mbr" task: a two-element list collection of PeptideShaker-format identification files
             with the extension set to "tabular". Expects exactly two matched tables, each
             containing a given peptide. -->
        <test>
            <param name="task|task_selector" value="mbr"/>
            <param name="ident_input|input_type_selector" value="ps"/>
            <param name="ident_input_file">
                <collection type="list">
                    <element name="mbr_test1" value="input/mbr_test1.tabular"/>
                    <element name="mbr_test2" value="input/mbr_test2.tabular"/>
                </collection>
            </param>
            <param name="ext" value="tabular"/>
            <output_collection name="ident_output_mbr" type="list" count="2">
                <element name="mbr_test1_match">
                    <assert_contents>
                        <has_text text="NH2-QVEEAVQSDDK-COOH"/>
                    </assert_contents>
                </element>
                <element name="mbr_test2_match">
                    <assert_contents>
                        <has_text text="NH2-RDVGINNTVK-COOH"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>
    <help>
    <![CDATA[
**Description**

moFF (a Modest Feature Finder) is an OS independent tool designed to extract
apex MS1 intensity using a set of identified MS2 peptides.
It currently uses a Go library to directly extract data from Thermo Raw spectrum files,
eliminating the need for conversions from other formats.
Moreover, moFF also allows one to work directly with mzML files.

**Usage**

*Modules:*

1. Apex Intensity: this is used for a single pair of files, one identification and one spectrum file.
2. Match between runs (MBR): for multiple identification files, share MS2 identified peptides between runs and predict the retention time.
3. All (match between runs followed by apex intensity): this is used for more than one pair of identification and spectrum files.

If both match between runs and apex intensity are desired, it is best to run them both at once (i.e., run the 'All' module).
The MBR module is mainly useful for observing the intermediate steps of the algorithm - its outputs cannot be used as inputs to moFF or to other tools.

If quantification of multiple files without MBR is desired, the apex intensity module may be run with multiple files or a dataset collection in batch mode.
In either case, moFF must be given the paired files at the same time - thus the best method is to construct a dataset collection in which the raw and identification files are in the same order.


*Inputs:*

- Identification file: this can either be a generic tabular file or the standard PSM report from PeptideShaker.
  If it is a generic tabular file, please select the columns corresponding to the required information.

- MS/MS file: this can either be a Thermo raw file or an mzML file.

A given pair of files must have the *exact* same display name, not including the extension;
e.g. ``example1.tabular`` and ``example1.mzml``.
If the display names are different, simply change them in the history menu.

For multiple files (the MBR or All modules), the identification and spectrum files must be provided as dataset collections.
This allows for usage of the output dataset collections in workflows.

*Parameters:*

All the parameters related to the time windows (``rt_w``, ``rt_p``, ``rt_p_match``) are half-widths:
each is half of the full time window in which the apex peak is searched or the XIC is retrieved.
For correct rt windows, we suggest you set the ``rt_p`` value equal to or slightly greater than the
dynamic exclusion duration set on your instrument. We also suggest always setting
``rt_p_match`` slightly larger than the value used for ``rt_p``.

*Outputs:*

When used in the single file mode ("Apex intensity" module), the outputs are 2 (or 3) files: a log file, a quantitation file,
and (optionally) a peptide summary, with intensities aggregated across peptides. When used in the multiple file mode ("All"),
the outputs are a dataset collection of log files (one per identification file), a dataset collection of quantification files, and (optionally) a peptide summary.

If used with a generic tabular format, the only columns in the output file are the 7 columns selected while using moFF plus the columns that moFF adds. Other columns are discarded.

**More Information**

See the moFF Github site at https://github.com/compomics/moFF,
and the publication at https://dx.doi.org/10.1038/nmeth.4075

    ]]>
    </help>
    <!-- DOI of the moFF publication that the help text also links to. -->
    <citations>
        <citation type="doi">10.1038/nmeth.4075</citation>
    </citations>
</tool>
author	galaxyp
date	Wed, 14 Feb 2018 07:19:47 -0500
parents	8f0e76ad46ef
children	226287d75d96