view pyprophet_export.xml @ 4:3cf580bf28e2 draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pyprophet commit 8b9f6963836c6ccb227343ce952e7b9a015d0483"
author galaxyp
date Fri, 05 Jun 2020 12:38:25 -0400
parents 102d940d365c
children
line wrap: on
line source

<tool id="pyprophet_export" name="PyProphet export" version="@VERSION@.1">
    <description>
    Export tabular files, optional swath2stats export
    </description>
    <!-- Shared macros are imported from macros.xml (not visible here); the
         "requirements" and "citations" macros expanded in this file are
         presumably defined there. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- The extra packages below are the R libraries loaded by the swath2stats
         configfile script (SWATH2stats, dplyr, data.table, gridExtra). -->
    <expand macro="requirements">
        <requirement type="package" version="1.16.0">bioconductor-swath2stats</requirement>
        <requirement type="package" version="0.8.4">r-dplyr</requirement>
        <requirement type="package" version="1.12.8">r-data.table</requirement>
        <requirement type="package" version="2.3">r-gridextra</requirement>
    </expand>
    <command detect_errors="aggressive">
    <![CDATA[
        ## Link the OSW input to a fixed file name in the working directory.
        ln -s '$input' ./input.osw &&

        #set $export_format = str($conditional_output.format)

        pyprophet export
        --in=./input.osw
        --format=$export_format

        ## Transition-level options are only offered for the two legacy formats.
        #if $export_format in ('legacy_split', 'legacy_merged'):
            $conditional_output.transition_quant
            --max_transition_pep=$conditional_output.max_transition_pep
        #end if

        ## IPF and q-value filters are shared by all tabular formats (score_plots has none).
        #if $export_format in ('legacy_split', 'legacy_merged', 'matrix'):
            --ipf=$conditional_output.ipf
            --ipf_max_peptidoform_pep=$conditional_output.ipf_max_peptidoform_pep
            --max_rs_peakgroup_qvalue=$conditional_output.max_rs_peakgroup_qvalue
            --max_global_peptide_qvalue=$conditional_output.max_global_peptide_qvalue
            --max_global_protein_qvalue=$conditional_output.max_global_protein_qvalue
        #end if

        $peptide_error
        $protein_error
        --out=./output.tsv

        ## Optionally print and then run the generated SWATH2stats R script.
        #if $conditional_swath2stats.swath2stats == 'yes_swath2stats':
            && cat '${swath2stats}'
            && Rscript '${swath2stats}'
        #end if

        ## Move the pyprophet result to the output dataset matching the chosen format.
        #if $export_format == 'score_plots':
            && mv *score_plots.pdf '$score_plots'
        #else:
            && mv output.tsv '$export_file'
        #end if
    ]]>
    </command>
    <configfiles>
        <configfile name="swath2stats"><![CDATA[

#if $conditional_swath2stats.swath2stats == 'yes_swath2stats':

library("SWATH2stats")
library("data.table")
library("dplyr")
library("gridExtra")

########################### Input ##############################################

## Read the pyprophet export file written by the command section
data_me <- data.frame(fread('output.tsv', sep='\t', header=TRUE))

## Read the study design file supplied by the user
study_design <- data.frame(fread('$conditional_swath2stats.study_design', sep='\t', header=TRUE))

## Annotate the export with the study design, matching on the filename column
data.annotated <- sample_annotation(data_me, study_design, column.file = "filename")


########################### QC plots and tabular files #########################

## Decoys are removed before generating plots and count tables
data.annotated.nodecoy <- subset(data.annotated, decoy==FALSE)

pdf("summary.pdf", fonts = "Times", pointsize = 8)
plot(0,type='n',axes=FALSE,ann=FALSE)
title(main="Summarized plots and tables from pyprophet export file")

## Numbers of peptides and proteins per run.
## The table is computed once and reused for every page; for many runs it is
## split over several pages.
analyte_counts <- count_analytes(data.annotated.nodecoy)
number_samples <- nrow(analyte_counts)
rows_per_page <- 20

if (number_samples <= rows_per_page){
    grid.table(analyte_counts, rows = NULL)
}else{
    ## No new page is opened before the first table; every following page
    ## starts with a blank plot.
    grid.table(analyte_counts[1:rows_per_page,], rows = NULL)
    for (first_row in seq(rows_per_page + 1, number_samples, by = rows_per_page)){
        plot(0,type='n',axes=FALSE,ann=FALSE)
        ## End the last page at the last sample, otherwise NA rows show up in the table
        last_row <- min(first_row + rows_per_page - 1, number_samples)
        grid.table(analyte_counts[first_row:last_row,], rows = NULL)
    }
}


## Correlation of the intensities
correlation_int <- plot_correlation_between_samples(data.annotated.nodecoy, column.values = 'Intensity')

## Correlation of delta_rt, the deviation of the retention time from the expected retention time
correlation_rt <- plot_correlation_between_samples(data.annotated.nodecoy, column.values = 'delta_rt')

## Variation of the signal across replicates; the second list element is printed as a table
variation <- plot_variation(data.annotated.nodecoy)
plot(0,type='n',axes=FALSE,ann=FALSE)
grid.table(variation[[2]])

## Total variation versus variation within replicates
variation_total <- plot_variation_vs_total(data.annotated.nodecoy)

## Summed signal per peptide and protein across samples
peptide_signal <- write_matrix_peptides(data.annotated.nodecoy)
protein_signal <- write_matrix_proteins(data.annotated.nodecoy)


#if str($conditional_swath2stats.conditional_fdr_replica.calc_fdr_replica) == "calc_fdr_replica_yes":
    ## Estimate the overall FDR across runs using a target decoy strategy
    fdr_target_decoy <- assess_fdr_overall(data.annotated, n.range = $conditional_swath2stats.conditional_fdr_replica.n_range, FFT = $conditional_swath2stats.conditional_fdr_replica.fft, output = 'Rconsole')
    print(fdr_target_decoy)
#end if

## Close summary.pdf (needed whether or not the FDR estimation ran)
dev.off()

############################# Filtering ########################################

data.filtered <- data.annotated

#if str($conditional_swath2stats.conditional_fdr_replica.calc_fdr_replica) == "calc_fdr_replica_yes":
    ## Determine the m-score cutoff required to reach the user-selected protein FDR target
    cutoff_mscore <- mscore4protfdr(data_me, FFT = $conditional_swath2stats.conditional_fdr_replica.fft, fdr_target = $conditional_swath2stats.conditional_fdr_replica.fdr_target)
    print(cutoff_mscore)
    ## Keep values passing that cutoff in at least n.replica replicates of one condition
    data.filtered <- filter_mscore_condition(data.filtered, cutoff_mscore, n.replica = $conditional_swath2stats.conditional_fdr_replica.n_replica)
#end if

#if str($conditional_swath2stats.conditional_max_pep.filter_max_pep) == "filter_max_pep_yes":
    ## Keep only the user-selected number of strongest-signal peptides per protein
    data.filtered <- filter_on_max_peptides(data.filtered, n_peptides = $conditional_swath2stats.conditional_max_pep.n_peptides_max)
#end if

#if str($conditional_swath2stats.conditional_min_pep.filter_min_pep) == "filter_min_pep_yes":
    ## Keep only proteins supported by at least the user-selected number of peptides
    data.filtered <- filter_on_min_peptides(data.filtered, n_peptides = $conditional_swath2stats.conditional_min_pep.n_peptides_min)
#end if

########################### Output ############################################

## Convert the data into a transition-level format (one row per transition measured)
data.transition <- disaggregate(data.filtered)

## Convert the data into the format required by MSstats
MSstats.input <- convert4MSstats(data.transition)

## Transitions found at different RT / in multiple scans are combined by summing their intensities
msstats_table <- MSstats.input %>%
    group_by(ProteinName, PeptideSequence, PrecursorCharge, FragmentIon, ProductCharge, IsotopeLabelType, BioReplicate, Condition, Run) %>%
    summarise(Intensity = sum(Intensity))

## Restore the column order used for the MSstats input table
msstats_table <- msstats_table[, c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge", "IsotopeLabelType", "Intensity", "BioReplicate", "Condition", "Run")]

write.table(msstats_table, file="$msstats_input", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
write.table(peptide_signal, file="$peptide_signal", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
write.table(protein_signal, file="$protein_signal", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")

#end if

    ]]></configfile>
    </configfiles>
    <inputs>
        <param name="input" type="data" format="osw" label="Input file" help="This file needs to be in OSW format (--in)" />
        <!-- Export format; each tabular format exposes its own filter options, score_plots has none -->
        <conditional name="conditional_output">
            <param argument="format" type="select" label="Export format, either matrix, legacy_split, legacy_merged (mProphet/PyProphet) or score_plots format" >
                <option value="legacy_split" selected="True">legacy_split</option>
                <option value="legacy_merged">legacy_merged</option>
                <option value="matrix">matrix</option>
                <option value="score_plots">score_plots</option>
            </param>
            <when value="legacy_split">
                <param name="transition_quant" type="boolean" truevalue="--transition_quantification" falsevalue="--no-transition_quantification" checked="True" label="Report aggregated transition-level quantification" help="(--transition_quantification / --no-transition_quantification)" />
                <param argument="max_transition_pep" type="float" value="0.7" label="Maximum PEP to retain scored transitions for quantification (requires transition-level scoring)" />
                <param argument="ipf" type="select" display="radio" label="Should IPF results be reported if present? 'peptidoform': Report results on peptidoform-level, 'augmented': Augment OpenSWATH results with IPF scores, 'disable': Ignore IPF results" >
                    <option value="peptidoform" selected="True">peptidoform</option>
                    <option value="augmented">augmented</option>
                    <option value="disable">disable</option>
                </param>
                <param argument="ipf_max_peptidoform_pep" type="float" value="0.4" label="IPF: Filter results to maximum run-specific peptidoform-level PEP" />
                <param argument="max_rs_peakgroup_qvalue" type="float" value="0.05" label="Filter results to maximum run-specific peak group-level q-value" />
                <param argument="max_global_peptide_qvalue" type="float" value="0.01" label="Filter results to maximum global peptide-level q-value" />
                <param argument="max_global_protein_qvalue" type="float" value="0.01" label="Filter results to maximum global protein-level q-value" />
            </when>
            <when value="legacy_merged">
                <param name="transition_quant" type="boolean" truevalue="--transition_quantification" falsevalue="--no-transition_quantification" checked="True" label="Report aggregated transition-level quantification" help="(--transition_quantification / --no-transition_quantification)" />
                <param argument="max_transition_pep" type="float" value="0.7" label="Maximum PEP to retain scored transitions for quantification (requires transition-level scoring)" />
                <param argument="ipf" type="select" display="radio" label="Should IPF results be reported if present? 'peptidoform': Report results on peptidoform-level, 'augmented': Augment OpenSWATH results with IPF scores, 'disable': Ignore IPF results" >
                    <option value="peptidoform" selected="True">peptidoform</option>
                    <option value="augmented">augmented</option>
                    <option value="disable">disable</option>
                </param>
                <param argument="ipf_max_peptidoform_pep" type="float" value="0.4" label="IPF: Filter results to maximum run-specific peptidoform-level PEP" />
                <param argument="max_rs_peakgroup_qvalue" type="float" value="0.05" label="Filter results to maximum run-specific peak group-level q-value" />
                <param argument="max_global_peptide_qvalue" type="float" value="0.01" label="Filter results to maximum global peptide-level q-value" />
                <param argument="max_global_protein_qvalue" type="float" value="0.01" label="Filter results to maximum global protein-level q-value" />
            </when>
            <when value="matrix">
                <param argument="ipf" type="select" display="radio" label="Should IPF results be reported if present? 'peptidoform': Report results on peptidoform-level, 'augmented': Augment OpenSWATH results with IPF scores, 'disable': Ignore IPF results" >
                    <option value="peptidoform" selected="True">peptidoform</option>
                    <option value="augmented">augmented</option>
                    <option value="disable">disable</option>
                </param>
                <param argument="ipf_max_peptidoform_pep" type="float" value="0.4" label="IPF: Filter results to maximum run-specific peptidoform-level PEP" />
                <param argument="max_rs_peakgroup_qvalue" type="float" value="0.05" label="Filter results to maximum run-specific peak group-level q-value" />
                <param argument="max_global_peptide_qvalue" type="float" value="0.01" label="Filter results to maximum global peptide-level q-value" />
                <param argument="max_global_protein_qvalue" type="float" value="0.01" label="Filter results to maximum global protein-level q-value" />
            </when>
            <when value="score_plots"/>
        </conditional>
        <param name="peptide_error" type="boolean" truevalue="--peptide" falsevalue="--no-peptide" checked="True" label="Append peptide-level error-rate estimates if available" help="(--peptide / --no-peptide)" />
        <param name="protein_error" type="boolean" truevalue="--protein" falsevalue="--no-protein" checked="True" label="Append protein-level error-rate estimates if available" help="(--protein / --no-protein)" />
        <!-- Optional SWATH2stats post-processing; its parameters are consumed by the swath2stats configfile -->
        <conditional name="conditional_swath2stats">
            <param name="swath2stats" type="select" label="Use swath2stats to export file for statistics" >
                <option value="yes_swath2stats" selected="True">yes</option>
                <option value="no_swath2stats">no</option>
            </param>
            <when value="yes_swath2stats">
                <param name="study_design" type="data" format="tabular" label="Study design tabular file" help="Needs to have columns with Filename, Condition, BioReplicate, Run" />
                <conditional name="conditional_fdr_replica">
                    <param name="calc_fdr_replica" type="select" label="Filter for fdr and number of replicates" >
                        <option value="calc_fdr_replica_yes" selected="True">Yes</option>
                        <option value="calc_fdr_replica_no">No</option>
                    </param>
                    <when value="calc_fdr_replica_yes">
                        <param name="fft" type="float" value="0.5" label="FFT. Ratio of false positives to true negatives, q-values from pyProphet stats output" help="As an approximation, the q-values of multiple runs are averaged and supplied as argument FFT. Numeric from 0 to 1."/>
                        <param name="n_range" type="float" value="10" label="Option to set the number of magnitude for which the m_score threshold is decreased" />
                        <param name="fdr_target" type="float" value="0.05" label="FDR target." help="An m_score cutoff achieving an FDR smaller than fdr_target will be selected. Calculated as FDR = decoys*FFT/targets" />
                        <param name="n_replica" type="integer" value="2" label="Number Replicates." help="Number of measurements within at least one condition that have to pass the mscore threshold for this transition." />
                    </when>
                    <when value="calc_fdr_replica_no"/>
                </conditional>
                <conditional name="conditional_max_pep">
                    <param name="filter_max_pep" type="select" label="Filter for a maximum number of peptides per protein" >
                        <option value="filter_max_pep_yes" selected="True">Yes</option>
                        <option value="filter_max_pep_no">No</option>
                    </param>
                    <when value="filter_max_pep_yes">
                        <param name="n_peptides_max" type="integer" value="10" label="Maximum number of peptides per protein." help="Maximum number of highest intense peptides to filter the data on." />
                    </when>
                    <when value="filter_max_pep_no"/>
                </conditional>
                <conditional name="conditional_min_pep">
                    <param name="filter_min_pep" type="select" label="Filter for proteins that are supported by a minimum number of peptides" >
                        <option value="filter_min_pep_yes" selected="True">Yes</option>
                        <option value="filter_min_pep_no">No</option>
                    </param>
                    <when value="filter_min_pep_yes">
                        <param name="n_peptides_min" type="integer" value="2" label="Minimum number of peptides per protein" help="Number of minimal number of peptide IDs associated with a protein ID in order to be kept in the dataset." />
                    </when>
                    <when value="filter_min_pep_no"/>
                </conditional>
            </when>
            <when value="no_swath2stats"/>
        </conditional>
    </inputs>
    <outputs>
        <!-- Main pyprophet export: a tabular file for every format except score_plots, which yields a PDF instead -->
        <data name="export_file" format="tabular" label="${tool.name} on ${on_string}: export.tabular" >
            <filter>conditional_output['format'] != 'score_plots'</filter>
        </data>
        <data name="score_plots" format="pdf" label="${tool.name} on ${on_string}: score_plots.pdf" >
            <filter>conditional_output['format'] == 'score_plots'</filter>
        </data>
        <!-- SWATH2stats outputs, only created when the swath2stats step is enabled.
             summary.pdf is created in the working directory by the R script, so it is collected via from_work_dir. -->
        <data name="summary" format="pdf" from_work_dir="summary.pdf" label="${tool.name} on ${on_string}: summary.pdf">
            <filter>conditional_swath2stats['swath2stats'] == 'yes_swath2stats'</filter>
        </data>
        <!-- The three tables below are written by the R script directly to the output dataset paths,
             so no from_work_dir is declared for them. -->
        <data name="peptide_signal" format="tabular" label="${tool.name} on ${on_string}: peptide_signal.tabular" >
            <filter>conditional_swath2stats['swath2stats'] == 'yes_swath2stats'</filter>
        </data>
        <data name="protein_signal" format="tabular" label="${tool.name} on ${on_string}: protein_signal.tabular" >
            <filter>conditional_swath2stats['swath2stats'] == 'yes_swath2stats'</filter>
        </data>
        <data name="msstats_input" format="tabular" label="${tool.name} on ${on_string}: msstats_input.tabular" >
            <filter>conditional_swath2stats['swath2stats'] == 'yes_swath2stats'</filter>
        </data>
    </outputs>
    <tests>
        <!-- legacy_merged export with a relaxed global peptide q-value and SWATH2stats disabled;
             the exported table is compared against output.tabular -->
        <test expect_num_outputs="1">
            <param name="input" value="protein2.osw" ftype="osw" />
            <param name="format" value="legacy_merged" />
            <param name="max_global_peptide_qvalue" value="0.2" />
            <conditional name="conditional_swath2stats">
                <param name="swath2stats" value="no_swath2stats"/>
            </conditional>
            <output name="export_file" file="output.tabular" />
        </test>
        <!-- score_plots export with SWATH2stats disabled; the PDF is only compared by size -->
        <test expect_num_outputs="1">
            <param name="input" value="protein2.osw" ftype="osw" />
            <param name="format" value="score_plots" />
            <conditional name="conditional_swath2stats">
                <param name="swath2stats" value="no_swath2stats"/>
            </conditional>
            <output name="score_plots" file="score_plots.pdf" compare="sim_size" />
        </test>
        <!-- SWATH2stats path (swath2stats is not set here, so its default option applies) with all
             optional filters switched off; the job is expected to fail with the stderr text asserted below -->
        <test expect_failure="true">
            <param name="input" value="protein2.osw" ftype="osw" />
            <param name="format" value="legacy_merged" />
            <conditional name="conditional_swath2stats">
                <param name="study_design" value="study_design.tabular" ftype="tabular" />
                <conditional name="conditional_fdr_replica">
                    <param name="calc_fdr_replica" value="calc_fdr_replica_no"/>
                </conditional>
                <conditional name="conditional_max_pep">
                    <param name="filter_max_pep" value="filter_max_pep_no" />
                </conditional>
                <conditional name="conditional_min_pep">
                    <param name="filter_min_pep" value="filter_min_pep_no" />
                </conditional>
            </conditional>
            <assert_stderr>
                <has_text text="replacement has 1 row, data has 0" />
            </assert_stderr>
        </test>
    </tests>
    <help>
<![CDATA[
**What it does**

PyProphet: Semi-supervised learning and scoring of OpenSWATH results.

Export tabular (tsv) tables. By default, both peptide- and transition-level quantifications are reported, which is necessary for requantification or SWATH2stats. If peptide and protein inference in the global context was conducted, the results will be filtered to 1% FDR by default.

Optional SWATH2stats output. SWATH2stats is intended to transform SWATH data from the OpenSWATH software into a format readable by other statistics packages while performing filtering, annotation and FDR estimation.

**Study design file for SWATH2stats**

    - Tabular file with columns that are named: Filename, Condition, BioReplicate, Run. 
    - The Filename should be part of, or identical to, the original filenames used in the OpenSWATH workflow
    - The Condition will be used for statistical analysis. In case multiple conditions are of interest for statistical analysis (e.g. diagnosis and age), this tool has to be run multiple times as SWATH2stats can only handle one condition at a time
    - The BioReplicate corresponds to the biological replicate
    - The Run is the number of the MS run in which the sample was measured

    - **Example for one replicate per patient**

        ::

                      Filename       Condition    BioReplicate     Run
                    healthy1.mzml     healthy         1             1
                    healthy2.mzml     healthy         2             2
                    diseased1.mzml    diseased        3             3
                    diseased2.mzml    diseased        4             4
                        ...
                        ...


    - **Example for two replicates per patient**

        ::

                      Filename       Condition    BioReplicate     Run
                    healthy1.mzml     healthy         1             1
                    healthy2.mzml     healthy         1             2
                    diseased1.mzml    diseased        2             3
                    diseased2.mzml    diseased        2             4
                        ...
                        ...

PyProphet is a Python re-implementation of the mProphet algorithm (Reiter 2010 Nature Methods) optimized for SWATH-MS data acquired by data-independent acquisition (DIA). The algorithm was originally published in (Telemann 2014 Bioinformatics) and has since been extended to support new data types and analysis modes (Rosenberger 2017, Nature biotechnology and Nature methods).

For more information, visit @link@

]]>
    </help>
    <!-- Expands the "citations" macro (presumably defined in the imported macros.xml)
         and adds one further DOI citation to it. -->
    <expand macro="citations">
        <citation type="doi">10.1371/journal.pone.0153160</citation>
    </expand>
</tool>