Mercurial > repos > galaxyp > calisp
diff calisp.xml @ 1:867f17ede7f3 draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 42e5dfeaa309e6ac17b4616314498a3b628272d2
author | galaxyp |
---|---|
date | Thu, 14 Sep 2023 12:49:19 +0000 |
parents | 6d93529d19d4 |
children |
line wrap: on
line diff
--- a/calisp.xml Thu Jun 01 08:34:14 2023 +0000 +++ b/calisp.xml Thu Sep 14 12:49:19 2023 +0000 @@ -1,7 +1,7 @@ <tool id="calisp" name="calisp" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05"> <description>Estimate isotopic composition of peptides from proteomics mass spectrometry data</description> <macros> - <token name="@TOOL_VERSION@">3.0.10</token> + <token name="@TOOL_VERSION@">3.0.13</token> <token name="@VERSION_SUFFIX@">0</token> <token name="@CALISP_REPO@">https://raw.githubusercontent.com/kinestetika/Calisp/208d495674e2b52fe56cf23457c833d1c2527242</token> <xml name="input_macro" tokens="multiple"> @@ -30,8 +30,28 @@ --bin_delimiter '$bin_delimiter' --threads "\${GALAXY_SLOTS:-1}" --isotope $isotope - $compute_clumps && -'$__tool_directory__/feather2tsv.py' --calisp_output calisp-output/ + $compute_clumps +#if $isotope_abundance_matrix + --isotope_abundance_matrix '$isotope_abundance_matrix' +#end if + +#if $isotope_abundance_matrix + && ISOTOPE_ABUNDANCE_MATRIX="$isotope_abundance_matrix" +#else + && ISOTOPE_ABUNDANCE_MATRIX="\$(python -c 'import site; print(f"{site.getsitepackages()[0]}/calisp/isotope_matrix.txt")')" +#end if + + && python '$__tool_directory__/benchmarking.py' + --input calisp-output/ + --isotope_abundance_matrix "\$ISOTOPE_ABUNDANCE_MATRIX" + --isotope $isotope +#if $benchmark_cond.benchmark == 'yes' + --out_filtered '$filtered' + --out_summary '$summary' + #if $benchmark_cond.nominal_values + --nominal_values '$benchmark_cond.nominal_values' + #end if +#end if ]]></command> <inputs> <param argument="--spectrum_file" type="data" multiple="false" format="mzml" label="Spectrum file"/> @@ -59,18 +79,36 @@ <option value="36S">36S</option> </param> <param argument="--compute_clumps" type="boolean" truevalue="--compute_clumps" falsevalue="" checked="false" label="Compute clumps" help="To compute clumpiness of carbon assimilation. Only use when samples are labeled tosaturation. 
Estimation of clumpiness takes much additional time." /> + <param argument="--isotope_abundance_matrix" type="data" format="tabular" optional="true" label="Custom isotope abundance matrix" help="If not given the built in matrix will be used" /> + <conditional name="benchmark_cond"> + <param name="benchmark" type="select" label="Run benchmarking"> + <option value="yes">Yes</option> + <option value="no">No</option> + </param> + <when value="yes"> + <param name="nominal_values" type="data" format="tabular" optional="true" label="Nominal values" help="A table containing ms_run and their nominal value (1, 5, or 10)"/> + </when> + <when value="no"/> + </conditional> </inputs> <outputs> <collection name="output" type="list"> <discover_datasets pattern="(?P<designation>.*)\.tsv" format="tabular" directory="calisp-output"/> </collection> + <data name="filtered" format="tabular" label="${tool.name} on ${on_string}: filtered"> + <filter>benchmark_cond['benchmark'] == 'yes'</filter> + </data> + <data name="summary" format="tabular" label="${tool.name} on ${on_string}: peptide summary"> + <filter>benchmark_cond['benchmark'] == 'yes'</filter> + </data> </outputs> <tests> <!-- TODO test data too large, available from here: https://github.com/kinestetika/Calisp/tree/master/test - if possible include via location in the future - <test expect_num_outputs="1"> - <param name="spectrum_file" value="calisp_test_data.mzML" ftype="mzml"/> - <param name="peptide_file" value="calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/> + if possible include via location in the future --> + <!-- <test expect_num_outputs="3"> + <param name="spectrum_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data.mzML" ftype="mzml"/> + <param name="peptide_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/> + <param name="benchmark" 
value="true"/> <output_collection name="output" count="1"> <element name="calisp_test_data"> <assert_contents> @@ -80,13 +118,74 @@ <has_text text="P13645"/> <has_text text="NHEEEMKDLR"/> <has_text text="Oxidation"/> - <has_n_columns n="85"/> + <has_n_columns n="84"/> <has_n_lines n="24"/> </assert_contents> </element> </output_collection> - </test> - --> + <output name="filtered" ftype="tabular"> + <assert_contents> + <has_n_lines n="24"/> + <has_n_columns n="87"/> + </assert_contents> + </output> + <output name="summary" value="summary.tsv" ftype="tabular"/> + </test> --> + + <!-- same test, but with isotope abundance matrix supplied by the user + (using the same as the built in => same results) + + TODO: test will only work with 23.1 tool-utils package available --> + <!-- <test expect_num_outputs="3"> + <param name="spectrum_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data.mzML" ftype="mzml"/> + <param name="peptide_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/> + <param name="isotope_abundance_matrix" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/src/calisp/isotope_matrix.txt" ftype="tabular"/> + <param name="benchmark" value="true"/> + <output_collection name="output" count="1"> + <element name="calisp_test_data"> + <assert_contents> + <has_text text="experiment"/> + <has_text text="MKH_260min_1800ng"/> + <has_text text="HOMO"/> + <has_text text="P13645"/> + <has_text text="NHEEEMKDLR"/> + <has_text text="Oxidation"/> + <has_n_columns n="84"/> + <has_n_lines n="24"/> + </assert_contents> + </element> + </output_collection> + <output name="filtered" ftype="tabular"> + <assert_contents> + <has_n_lines n="24"/> + <has_n_columns n="87"/> + </assert_contents> + </output> + <output name="summary" value="summary.tsv" ftype="tabular"/> + </test> --> + + <!-- test output 
filters for no benchmarking --> + <!-- <test expect_num_outputs="1"> + <param name="spectrum_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data.mzML" ftype="mzml"/> + <param name="peptide_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/> + <conditional name="benchmark_cond"> + <param name="benchmark" value="no"/> + </conditional> + <output_collection name="output" count="1"> + <element name="calisp_test_data"> + <assert_contents> + <has_text text="experiment"/> + <has_text text="MKH_260min_1800ng"/> + <has_text text="HOMO"/> + <has_text text="P13645"/> + <has_text text="NHEEEMKDLR"/> + <has_text text="Oxidation"/> + <has_n_columns n="84"/> + <has_n_lines n="24"/> + </assert_contents> + </element> + </output_collection> + </test> --> </tests> <help><![CDATA[ Calisp (Calgary approach to isotopes in proteomics) is a program that estimates @@ -165,8 +264,8 @@ pattern_peak_count # of peaks in the pattern pattern_median_peak_spacing medium mass difference between a pattern's peaks spectrum_mass_irregularity a measure for the standard deviation in the mass difference between a pattern's peaks -ratio_na the estimated isotope ratio inferred from neutron abundance (sip experiments) -ratio_fft the estimated isotope ratio inferred by the fft method (natural isotope abundances) +ratio_na the estimated isotope ratio (in percent) inferred from neutron abundance (sip experiments) +ratio_fft the estimated isotope ratio (in percent) inferred by the fft method (natural isotope abundances) error_fft the remaining error after fitting the pattern with fft error_clumpy the remaining error after fitting the pattern with the clumpy carbon method flag_peptide_contains_sulfur true if peptide contains sulfur @@ -183,11 +282,123 @@ m0 - m19 the masses of the first 20 peaks of the pattern c1 - c6 contributions of clumps of 1-6 carbon 
to ratio_na. These are the outcomes of the clumpy carbon model. These results are only meaningful if the biomass was labeled to saturation. ========================================== =================== + +Benchmarking +============ + +If the user chooses to run the additional benchmarking script two additional +outputs are created as follows. + +Load data: +---------- + +- Concatenate calisp result tables +- add column ``delta_na`` = 1000 * ``ratio_na`` / (1/factor-2) +- add column ``delta_fft`` = 1000 * ``ratio_fft`` / (1/factor-2) + +Filter data: +------------ + +Rows are removed for which any of the following criteria applies + +- flag_peak_at_minus_one_pos +- flag_pattern_is_wobbly +- flag_psm_has_low_confidence +- flag_psm_is_ambiguous +- flag_pattern_is_contaminated +- flag_peptide_assigned_to_multiple_bins + +Furthermore in the ``peptide`` column the strings ``"Oxidation"``, ``"Carbamidomethyl"``, +and text in brackets (i.e. ``[]``) preceded by any number of spaces +is removed. + +Benchmarking: +------------- + +Iterate through all combinations of unique peptides, proteins, and samples +and output the following tabular information + +=================== =========================== +Column Content +=================== =========================== +file The name of the mzML spectrum file comprising the peptide +bin bin/mag ids, separated by commas. Calisp expects the protein ids in the psm file to consist of two parts, separated by a delimiter (_ by default). 
The first part is the bin/mag id, and the second part is the protein id +%label The label percentage (≠ 0 if labelled components were used during experiments) +ratio The natural abundance ratio of the target element (C, H, N, O, S) +peptide The labeled peptides +psm_mz psm m over z +n(patterns) The number of times the same pattern for the peptide has been repeated +mean intensity The mean of the total intensity of the pattern +ratio_NA median The median of the estimated isotope ratio inferred from neutron abundance (sip experiments) +N mean The mean of the number of neutrons inferred from custom 'neutron' modifications +ratio_NA SEM The standard error of the mean of the estimated isotope ratio inferred from neutron abundance (sip experiments) +ratio_FFT median The median of the estimated isotope ratio inferred by the fft method (natural isotope abundances) +ratio_FFT SEM The standard error of the mean of the estimated isotope ratio inferred by the fft method (natural isotope abundances) +False Positive Any false positive indications +=================== =========================== + +Mean, median, and standard error values are computed over +all entries of the same sample that have the same peptide. + +**Isotope abundance matrix**: + +The isotope abundance matrix gives the background unlabeled fraction. +The default matrix implemented in calisp is given here: +https://github.com/kinestetika/Calisp/blob/v@TOOL_VERSION@/src/calisp/isotope_matrix.txt. +Rows specify the atom of interest and the columns the isotope, i.e. +rows 1-5 correspond to C, N, O, H, S. For instance +13C is in the 2nd column of the 1st row and 14C in the 3rd column +of the same row. + +**Benchmarking without nominal values**: + +If no nominal values (i.e. percentages of labeled atoms) are given, +nominal values of 0 are assumed. 
+ +The values in the `ratio` column are computed as `background_isotope / background_unlabelled * 100` +where `background_unlabelled` is taken from the isotope abundance matrix +according to the chosen target isotope. +Then `background_isotope` is given by `1 - background_unlabelled` + +All entries of the table are considered not false positive. + +**Benchmarking with nominal values**: + +The `%label` (the nominal value) of a sample is either 0 (the default), +1, 5, or 10 and can be provided for each sample by a tabular dataset +(column 1 should give the sample names and column 2 the nominal value). + +The `ratio = I / U * 100` is given by +`U = unlabeled_fraction * background_unlabelled` and +`I = nominal_value / 100 + unlabeled_fraction * background_isotope` +where +`unlabeled_fraction = 1 - nominal_value / 100` +`background_isotope = 1 - background_unlabelled` +and `background_unlabelled` is given by the isotope abundance matrix. + +A peptide is considered false positive if it's not a contaminant (at the moment only K12) +and the median of `ratio_na` values for the same peptide and sample +is greater than a threshold depending on the nominal value: + +"For false positive discovery rates we set the threshold at the +isotope/unlabelled associated with 1/4 of a generation of labeling. The E. +coli values (1.7, 4.2 and 7.1) are for 1 generation at 1, 5 and 10% label, and +we take the background (1.07) into account as well." + +============= ========= +nominal value threshold +============= ========= +1 `1.07 + (1.7 - 1.07) / 4` +5 `1.07 + (4.2 - 1.07) / 4` +10 `1.07 + (7.1 - 1.07) / 4` +============= ========= + +File an issue at https://github.com/galaxyproteomics/tools-galaxyp/issues if +different contaminants or thresholds should be considered. 
]]></help> <citations> <citation type="doi">10.1186/s40168-022-01454-1</citation> <citation type="doi">10.1073/pnas.1722325115</citation> - <citation type="doi">10.1101/2021.03.29.437612</citation> <citation type="doi">10.1093/bioinformatics/bty046</citation> </citations> -</tool> \ No newline at end of file +</tool>