view spectralMatching.xml @ 3:15e3df50be3c draft

"planemo upload for repository https://github.com/computational-metabolomics/mspurity-galaxy commit 89bf304d12233004031de3c3e703a472b0ba5459"
author computational-metabolomics
date Thu, 09 Apr 2020 14:11:20 -0400
parents 5ff9d40c7a42
children aee10d29e82c
line wrap: on
line source

<tool id="mspurity_spectralmatching" name="msPurity.spectralMatching" version="@TOOL_VERSION@+galaxy@GALAXY_TOOL_VERSION@">
    <!-- Short description shown next to the tool name. -->
    <description>
        Perform spectral matching to MS/MS spectral libraries
    </description>
    <!-- macros.xml supplies the tokens and macros referenced in this file
         (@TOOL_VERSION@, @GALAXY_TOOL_VERSION@, requirements, sm_input, filters, citations). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
        ## Builds the command line for spectralMatching.R from the Query, Library and
        ## General sections. Every substituted value is single-quoted so that text,
        ## select and dataset values cannot be split or interpreted by the shell.
        Rscript '$__tool_directory__/spectralMatching.R'
            --outDir=.
            --cores=\${GALAXY_SLOTS:-4}

            ## Database sources. Any selection other than the bundled msPurityData
            ## library passes the user supplied SQLite file. The previous checks
            ## compared against 'sqlite' / 'userdb_sqlite', which do not match the
            ## 'userdb' value used by this tool's own tests, so no path was passed.
            ## NOTE(review): assumes every non-msPurityData option of the sm_input
            ## macro defines the *_dbPth parameter - confirm in macros.xml.
            #if $Query.q_dbPth_con.q_dbPth_select == 'msPurityData'
              --q_defaultDb
            #else
              --q_dbPth='$Query.q_dbPth_con.q_dbPth'
            #end if

            #if $Library.l_dbPth_con.l_dbPth_select == 'msPurityData'
              --l_defaultDb
            #else
              --l_dbPth='$Library.l_dbPth_con.l_dbPth'
            #end if

            --l_dbType='$Library.l_dbPth_con.l_dbPth_select'
            --q_dbType='$Query.q_dbPth_con.q_dbPth_select'

            ## Precursor and product ion ppm tolerances (always passed).
            --q_ppmPrec='$Query.q_filters.q_ppmPrec'
            --l_ppmPrec='$Library.l_filters.l_ppmPrec'

            --q_ppmProd='$Query.q_filters.q_ppmProd'
            --l_ppmProd='$Library.l_filters.l_ppmProd'

            ## Optional filters: each one is only passed when its checkbox is enabled.
            #if $Query.q_filters.q_raThres_cond.q_raThres_bool
              --q_raThres='$Query.q_filters.q_raThres_cond.q_raThres'
            #end if

            #if $Library.l_filters.l_raThres_cond.l_raThres_bool
              --l_raThres='$Library.l_filters.l_raThres_cond.l_raThres'
            #end if

            #if $Query.q_filters.q_polarity_cond.q_polarity_bool
              --q_polarity='$Query.q_filters.q_polarity_cond.q_polarity'
            #end if

            #if $Library.l_filters.l_polarity_cond.l_polarity_bool
              --l_polarity='$Library.l_filters.l_polarity_cond.l_polarity'
            #end if

            #if $Query.q_filters.q_purity_cond.q_purity_bool
              --q_purity='$Query.q_filters.q_purity_cond.q_purity'
            #end if

            #if $Library.l_filters.l_purity_cond.l_purity_bool
              --l_purity='$Library.l_filters.l_purity_cond.l_purity'
            #end if

            #if $Query.q_filters.q_xcmsGroups_cond.q_xcmsGroups_bool
              --q_xcmsGroups='$Query.q_filters.q_xcmsGroups_cond.q_xcmsGroups'
            #end if

            #if $Library.l_filters.l_xcmsGroups_cond.l_xcmsGroups_bool
              --l_xcmsGroups='$Library.l_filters.l_xcmsGroups_cond.l_xcmsGroups'
            #end if

            #if $Query.q_filters.q_pids_cond.q_pids_bool
              --q_pids='$Query.q_filters.q_pids_cond.q_pids'
            #end if

            #if $Library.l_filters.l_pids_cond.l_pids_bool
              --l_pids='$Library.l_filters.l_pids_cond.l_pids'
            #end if

            #if $Query.q_filters.q_rtrange_cond.q_rtrange_bool
              --q_rtrangeMin='$Query.q_filters.q_rtrange_cond.q_rtrangeMin'
              --q_rtrangeMax='$Query.q_filters.q_rtrange_cond.q_rtrangeMax'
            #end if

            #if $Library.l_filters.l_rtrange_cond.l_rtrange_bool
              --l_rtrangeMin='$Library.l_filters.l_rtrange_cond.l_rtrangeMin'
              --l_rtrangeMax='$Library.l_filters.l_rtrange_cond.l_rtrangeMax'
            #end if

            #if $Query.q_filters.q_accessions_cond.q_accessions_bool
              --q_accessions='$Query.q_filters.q_accessions_cond.q_accessions'
            #end if

            #if $Library.l_filters.l_accessions_cond.l_accessions_bool
              --l_accessions='$Library.l_filters.l_accessions_cond.l_accessions'
            #end if

            #if $Query.q_filters.q_sources_cond.q_sources_bool
              --q_sources='$Query.q_filters.q_sources_cond.q_sources'
              --q_sourcesUser='$Query.q_filters.q_sources_cond.q_sourcesUser'
            #end if

            #if $Library.l_filters.l_sources_cond.l_sources_bool
              --l_sources='$Library.l_filters.l_sources_cond.l_sources'
              --l_sourcesUser='$Library.l_filters.l_sources_cond.l_sourcesUser'
            #end if

            #if $Query.q_filters.q_instrumentTypes_cond.q_instrumentTypes_bool
              --q_instrumentTypes='$Query.q_filters.q_instrumentTypes_cond.q_instrumentTypes'
              --q_instrumentTypesUser='$Query.q_filters.q_instrumentTypes_cond.q_instrumentTypesUser'
            #end if

            #if $Library.l_filters.l_instrumentTypes_cond.l_instrumentTypes_bool
              --l_instrumentTypes='$Library.l_filters.l_instrumentTypes_cond.l_instrumentTypes'
              --l_instrumentTypesUser='$Library.l_filters.l_instrumentTypes_cond.l_instrumentTypesUser'
            #end if

            #if $Query.q_filters.q_instruments_cond.q_instruments_bool
              --q_instruments='$Query.q_filters.q_instruments_cond.q_instruments'
            #end if

            #if $Library.l_filters.l_instruments_cond.l_instruments_bool
              --l_instruments='$Library.l_filters.l_instruments_cond.l_instruments'
            #end if

            #if $Query.q_filters.q_spectraTypes_cond.q_spectraTypes_bool
              --q_spectraTypes='$Query.q_filters.q_spectraTypes_cond.q_spectraTypes'
            #end if

            #if $Library.l_filters.l_spectraTypes_cond.l_spectraTypes_bool
              --l_spectraTypes='$Library.l_filters.l_spectraTypes_cond.l_spectraTypes'
            #end if

            #if $Query.q_filters.q_spectraFilter
              --q_spectraFilter
            #end if

            #if $Library.l_filters.l_spectraFilter
              --l_spectraFilter
            #end if

            ## General matching arguments.
            #if $General.rttol_cond.rttol_bool
              --rttol='$General.rttol_cond.rttol'
            #end if

            --raW='$General.raW'
            --mzW='$General.mzW'

            ## copyDb only exists inside the updateDb "true" branch of the conditional,
            ## so it must be tested inside this block.
            #if $General.updateDb_cond.updateDb
              --updateDb
              #if $General.updateDb_cond.copyDb
                 --copyDb
              #end if
            #end if

            #if $General.usePrecursors
                 --usePrecursors
            #end if

    ]]></command>
    <inputs>
        <!-- Query spectra: database selector (sm_input macro) plus the shared filter set,
             both expanded with the "q" prefix used by the command template. -->
        <section name="Query" title="Query spectra input and filters" expanded="True">
            <expand macro="sm_input" ql="Query" ql_shrt="q" user="True" mspuritydatalib="False" msp="False"
                help="Query SQLite database - in the standard XCMS msPurity workflow - the output
                    of msPurity.createDatabase should be used here. However any SQLite database
                    following the schema described at https://bioconductor.org/packages/release/bioc/vignettes/msPurity/inst/doc/msPurity-spectral-database-vignette.html can be used as input"/>
            <expand macro="filters" ql="Query" ql_shrt="q"/>
        </section>
        <!-- Library spectra: same macros expanded with the "l" prefix. -->
        <section name="Library" title="Library spectra input and filters" expanded="True">
            <expand macro="sm_input" ql="Library" ql_shrt="l" user="False" mspuritydatalib="True" msp="False"
                help="Library SQLite database - in the standard XCMS msPurity workflow - a default
                    database of MassBank, HMDB, LipidBlast and GNPS is used. However any SQLite
                    database following the schema of https://bioconductor.org/packages/release/bioc/vignettes/msPurity/inst/doc/msPurity-spectral-database-vignette.html can be used as input"/>
            <expand macro="filters" ql="Library" ql_shrt="l"/>
        </section>
        <!-- Arguments that apply to the matching itself rather than to one side. -->
        <section name="General" title="General arguments" expanded="False">
            <!-- rttol is only exposed (and only passed to the script) when the checkbox is set. -->
            <conditional name="rttol_cond">
                <param name="rttol_bool" type="boolean" label="Filter on retention time match?"
                    help="" />
                <when value="true">
                    <param name="rttol" type="float" value="30" min="0"
                        label="Retention time tolerance (seconds)"
                        help="Retention time tolerance in seconds to match precursors"/>
                </when>
                <when value="false"/>
            </conditional>
            <param name="usePrecursors" type="boolean" checked="true" label="Filter on matching precursors?"
                help="If True, spectra will be filtered by similarity of precursors based on
                    the library and query ppm defined tolerance" />
            <param name="raW" label="Weighting for relative abundance" type="float" value="0.5"
                help="Relative abundance weight for spectra (default to 0.5 as determined by
                    massbank for ESI data)"/>
            <param name="mzW" label="Weighting for mz" type="float" min="0" value="2"
                help="mz weight for spectra (default to 2 as determined by massbank for ESI data)"/>
            <!-- copyDb is nested under updateDb: it is only meaningful when results are written back. -->
            <conditional name="updateDb_cond">
                <param name="updateDb" type="boolean" checked="true"
                    label="Update database with results?" help="" />
                <when value="true">
                    <param name="copyDb" type="boolean" checked="true"
                        label="Make a copy of the database?"
                        help="A copy will be made of the input SQLite target database and the
                            results will be added to this copy.  When False, the input SQLite
                            database will be updated  with the matching results. Use False if
                            you want to reduce storage space being used."/>
                </when>
                <when value="false"/>
            </conditional>
        </section>
    </inputs>

    <outputs>
        <!-- Query database carrying the matching results.
             The previous filter referenced "create_new_database", which is not a
             parameter of this tool, so it could never evaluate. It now uses the real
             updateDb parameter of the General section.
             NOTE(review): copyDb is deliberately not part of the filter because it is
             not visible here whether the script still writes this file when copyDb is
             off - confirm against spectralMatching.R. -->
        <data name="sqlite_results" format="sqlite" label="${tool.name} on ${on_string}: SQLite results"
            from_work_dir="db_with_spectral_matching.sqlite" >
            <filter>General['updateDb_cond']['updateDb']</filter>
        </data>
        <!-- All query-to-library matches.
             The previous filter referenced "spectra_type_q", which is not a parameter of
             this tool; it has been removed so the output is declared unconditionally. -->
        <data name="matches" format="tsv" label="${tool.name} on ${on_string}: matches"
            from_work_dir="matched_results.tsv" />
        <!-- Matches summarised per XCMS grouped feature. -->
        <data name="xcms_matches" format="tsv" label="${tool.name} on ${on_string}: XCMS matches"
            from_work_dir="xcms_matched_results.tsv" />
    </outputs>
    <tests>
        <!-- Test 1: user supplied library database, restricted to XCMS group 14 and
             library accession PR100037.
             Each output is compared with the expected file named after its work-dir
             file (matched_results / xcms_matched_results); the two were previously
             swapped. -->
        <test>
            <param name="q_dbPth" value="createDatabase_output.sqlite" />
            <param name="l_dbPth_select" value="userdb" />
            <param name="l_dbPth" value="PR100037.sqlite" />
            <param name="q_xcmsGroups_bool" value="true" />
            <param name="l_accessions_bool" value="true" />
            <param name="q_xcmsGroups" value="14" />
            <param name="l_accessions" value="PR100037" />
            <output name="matches" file="spectralMatching_matched_results.tsv" />
            <output name="xcms_matches" file="spectralMatching_xcms_matched_results.tsv" />
            <output name="sqlite_results" file="spectralMatching_db_with_spectral_matching.sqlite" ftype="sqlite" compare="sim_size"/>
        </test>
        <!-- Test 2: same as test 1 with the library instrument type filter enabled. -->
        <test>
            <param name="l_instrumentTypes_bool" value="true" />
            <param name="q_dbPth" value="createDatabase_output.sqlite" />
            <param name="l_dbPth_select" value="userdb" />
            <param name="l_dbPth" value="PR100037.sqlite" />
            <param name="q_xcmsGroups_bool" value="true" />
            <param name="l_accessions_bool" value="true" />
            <param name="q_xcmsGroups" value="14" />
            <param name="l_accessions" value="PR100037" />
            <output name="matches" file="spectralMatching_matched_results_instrumentTypes.tsv" />
            <output name="xcms_matches" file="spectralMatching_xcms_matched_results_instrumentTypes.tsv" />
            <output name="sqlite_results" file="spectralMatching_db_with_spectral_matching_instrumentTypes.sqlite" ftype="sqlite" compare="sim_size"/>
        </test>
    </tests>

    <help><![CDATA[

=============================================================
Spectral matching
=============================================================
-----------
General
-----------


Perform spectral matching to spectral libraries for an LC-MS/MS dataset.

The spectral matching is performed from a **Query** SQLite spectral-database against a **Library** SQLite spectral-database.

The SQLite schema of the spectral database is described here: spectral_database_schema_


The query spectral-database in most cases should be the "unknown" spectra database generated by the msPurity
function createDatabase as part of a msPurity-XCMS data processing workflow.

The library spectral-database in most cases should contain the "known" spectra from either public or user generated resources.
The library SQLite database by default contains data from MoNA including Massbank, HMDB, LipidBlast and GNPS.
A larger_database_ can be downloaded from the msp2db github repository and used instead.

To create a user generated library SQLite database the following tool can be used to generate a SQLite database
from a collection of MSP files: msp2db_.

It should be noted though, that as long as the schema of the spectral-database is as described here, then any database can be used
for either the library or query -  even allowing for the same database to be used.

The spectral matching functionality has four main components, spectral filtering, spectral alignment, spectral matching,
and summarising the results.

Spectral filtering is simply filtering both the library and query spectra to be searched against (e.g. choosing
the library source, instrument, retention time, precursor PPM tolerance etc).

The spectral alignment stage involves aligning the query peaks to the library peaks. The approach used is similar
to the modified pMatch algorithm described in Zhou et al 2015.

The spectral matching of the aligned spectra is performed against a combined intensity and m/z weighted vector - created for both
the query and library spectra (wq and wl). See below:

.. math::

    w=intensity^x \cdot mz^y


Where x and y represent weight factors and can be adjusted with the parameters raW and mzW.
Defaults to x=0.5 and y=2 as per MassBank for ESI based mass spectrometry data.

The aligned weighted vectors are then matched using dot product cosine, reverse dot product cosine and the composite dot product.
See below for dot product cosine equation.

.. math::

    dpc =  \frac{ w_q \cdot w_l } { \sqrt{\sum{w_{q}^2} } \cdot \sqrt{\sum{w_{l}^2}}}


Full details of the matching approaches are described in the msPurity_spectral_matching_vignette_

--------------------------------------------
Example LC-MS/MS processing workflow
--------------------------------------------

* Purity assessments
   +  (mzML files) -> purityA -> (pa)
* XCMS processing
   +  (mzML files) -> xcms.xcmsSet -> xcms.merge -> xcms.group -> xcms.retcor -> xcms.group -> (xset)
* Fragmentation processing
   + (xset, pa) -> frag4feature -> filterFragSpectra -> averageAllFragSpectra -> createDatabase -> **spectralMatching** -> (sqlite spectral database)

-----------
Output
-----------

**Database**

The updated query database (this will have been updated with the annotation results if updateDb argument used)


**xcmsMatchedResults**

If the query spectra had XCMS based chromatographic peaks tables (e.g. c_peak_groups, c_peaks) in the sqlite database - it will
be possible to summarise the matches for each XCMS grouped feature. The dataframe contains the following columns

* lpid - id in database of library spectra
* qpid - id in database of query spectra
* dpc - dot product cosine of the match
* rdpc - reverse dot product cosine of the match
* cdpc - composite dot product cosine of the match
* mcount - number of matching peaks
* allcount - total number of peaks across both query and library spectra
* mpercent - percentage of matching peaks across both query and library spectra
* library_rt - retention time of library spectra
* query_rt - retention time of query spectra
* rtdiff - difference between library and query retention time
* library_precursor_mz - library precursor mz
* query_precursor_mz - query precursor mz
* library_precursor_ion_purity - library precursor ion purity
* query_precursor_ion_purity - query precursor ion purity
* library_accession -  library accession value (unique string or number given to either MoNA or Massbank data entries)
* library_precursor_type - library precursor type (i.e. adduct)
* library_entry_name - Name given to the library spectra
* inchikey - inchikey of the matched library spectra
* library_source_name - source of the spectra (e.g. massbank, gnps)
* library_compound_name - name of compound spectra was obtained from

**matchedResults**

All matched results from the query spectra to the library spectra. Contains the same as above
without the XCMS details. This table is useful to observe spectral matching results
for all MS/MS spectra irrespective of if they are linked to XCMS MS1 features.


.. _spectral_database_schema: https://bioconductor.org/packages/release/bioc/vignettes/msPurity/inst/doc/msPurity-spectral-datatabase-schema.html
.. _larger_database: https://github.com/computational-metabolomics/msp2db/releases
.. _msp2db: https://github.com/computational-metabolomics/msp2db/releases
.. _msPurity_spectral_matching_vignette: https://bioconductor.org/packages/release/bioc/vignettes/msPurity/inst/doc/msPurity-lcmsms-data-processing-and-spectral-matching-vignette.html

    ]]></help>

	<!-- Citation entries come from the "citations" macro in macros.xml. -->
	<expand macro="citations"/>
</tool>