comparison calisp.xml @ 0:6d93529d19d4 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 060699366b6dd19ad6c3ef3f332f63cc55d75dce
author galaxyp
date Thu, 01 Jun 2023 08:34:14 +0000
parents
children 867f17ede7f3
comparison
equal deleted inserted replaced
-1:000000000000 0:6d93529d19d4
1 <tool id="calisp" name="calisp" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
2 <description>Estimate isotopic composition of peptides from proteomics mass spectrometry data</description>
3 <macros>
4 <token name="@TOOL_VERSION@">3.0.10</token>
5 <token name="@VERSION_SUFFIX@">0</token>
6 <token name="@CALISP_REPO@">https://raw.githubusercontent.com/kinestetika/Calisp/208d495674e2b52fe56cf23457c833d1c2527242</token>
7 <xml name="input_macro" tokens="multiple">
8 <!-- According to the README, mzid input is not yet implemented -->
9 </xml>
10 </macros>
11 <requirements>
12 <requirement type="package" version="@TOOL_VERSION@">calisp</requirement>
13 </requirements>
14 <command detect_errors="aggressive"><![CDATA[
15 #import re
16 ## Link the spectrum dataset into spectra/ under a sanitized copy of its element identifier
17 mkdir -p spectra &&
18 #set escaped_specs = re.sub('[^\w\-\.,:]', '_', str($spectrum_file.element_identifier))
19 ln -s '$spectrum_file' spectra/'$escaped_specs' &&
20 ## Link the PSM dataset into psms/ the same way
21 mkdir -p psms &&
22 #set escaped_peps = re.sub('[^\w\-\.,:]', '_', str($peptide_file.element_identifier))
23 ln -s '$peptide_file' psms/'$escaped_peps' &&
24 ## Run calisp on both directories, then run the bundled helper via python so it does not rely on the executable bit
25 calisp
26 --spectrum_file spectra/
27 --peptide_file psms/
28 --output_file calisp-output/
29 --mass_accuracy $mass_accuracy
30 --bin_delimiter '$bin_delimiter'
31 --threads "\${GALAXY_SLOTS:-1}"
32 --isotope $isotope
33 $compute_clumps &&
34 python '$__tool_directory__/feather2tsv.py' --calisp_output calisp-output/
35 ]]></command>
36 <inputs>
37 <param argument="--spectrum_file" type="data" multiple="false" format="mzml" label="Spectrum file"/>
38 <param argument="--peptide_file" type="data" multiple="false" format="tabular" label="Peptide file" help="Psm file" />
39 <param argument="--mass_accuracy" type="float" value="10" label="Mass accuracy" help="The maximum mass difference between theoretical mass and experimental mass of a peptide" />
40 <param argument="--bin_delimiter" type="text" value="_" label="Bin delimiter" help="For metagenomic data, the delimiter that separates the bin ID from the protein ID (default: &quot;_&quot;). Use &quot;-&quot; to ignore bins ID entirely.">
41 <sanitizer invalid_char="">
42 <valid initial="string.ascii_letters,string.digits">
43 <add value="_" />
44 <add value="-" />
45 <add value=":" />
46 </valid>
47 </sanitizer>
48 </param>
49 <param argument="--isotope" type="select" label="Target isotope">
50 <option value="13C" selected="true">13C</option>
51 <option value="14C">14C</option>
52 <option value="15N">15N</option>
53 <option value="17O">17O</option>
54 <option value="18O">18O</option>
55 <option value="2H">2H</option>
56 <option value="3H">3H</option>
57 <option value="33S">33S</option>
58 <option value="34S">34S</option>
59 <option value="36S">36S</option>
60 </param>
61 <param argument="--compute_clumps" type="boolean" truevalue="--compute_clumps" falsevalue="" checked="false" label="Compute clumps" help="To compute clumpiness of carbon assimilation. Only use when samples are labeled to saturation. Estimation of clumpiness takes much additional time." />
62 </inputs>
63 <outputs>
64 <collection name="output" type="list">
65 <discover_datasets pattern="(?P&lt;designation&gt;.*)\.tsv" format="tabular" directory="calisp-output"/>
66 </collection>
67 </outputs>
68 <tests>
69 <!-- TODO test data too large, available from here: https://github.com/kinestetika/Calisp/tree/master/test
70 if possible include via location in the future
71 <test expect_num_outputs="1">
72 <param name="spectrum_file" value="calisp_test_data.mzML" ftype="mzml"/>
73 <param name="peptide_file" value="calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/>
74 <output_collection name="output" count="1">
75 <element name="calisp_test_data">
76 <assert_contents>
77 <has_text text="experiment"/>
78 <has_text text="MKH_260min_1800ng"/>
79 <has_text text="HOMO"/>
80 <has_text text="P13645"/>
81 <has_text text="NHEEEMKDLR"/>
82 <has_text text="Oxidation"/>
83 <has_n_columns n="85"/>
84 <has_n_lines n="24"/>
85 </assert_contents>
86 </element>
87 </output_collection>
88 </test>
89 -->
90 </tests>
91 <help><![CDATA[
92 Calisp (Calgary approach to isotopes in proteomics) is a program that estimates
93 isotopic composition (e.g. 13C/12C, delta13C, 15N/14N etc) of peptides from
94 proteomics mass spectrometry data. Input data consist of mzML files and files
95 with peptide spectrum matches.
96
97 Calisp was originally developed in Java. This Galaxy tool uses the python
98 reimplementation https://github.com/kinestetika/Calisp.
99 Note that, in contrast to the Java version the python reimplementation does
100 not use ``mcl`` .
101 Compared to Java versions of calisp, the workflow has been simplified.
102 Calisp does not filter out any isotopic patterns, nor does it add up isotopic
103 patterns to reduce noise, as the Java version does. It simply estimates the
104 ratio for the target isotopes (e.g. 13C/12C) for every isotopic pattern it can
105 subsample. It estimates this ratio based on neutron abundance and using fast
106 fourier transforms. The former applies to stable isotope probing experiments.
107 The latter applies to natural abundances, or to isotope probing experiments with
108 very little added label (e.g. using substrates with <1% additional 13C). The
109 motivation for omitting filtering is that keeping all subsampled isotopic
110 patterns, including bad ones, will enable training of machine learning
111 classifiers. Also, because it was shown that the median provides better
112 estimates for species in microbial communities than the mean, adding up isotopic
113 patterns to improve precision has lost its purpose. There is more power (and
114 sensitivity) in numbers.
115
116 Because no data are filtered out and no isotopic patterns get added up,
117 calisp analyzes at least ten times as many isotopic patterns compared to the
118 Java version. That means calisp.py is about ten times slower; it takes about
119 5-10 min per .mzML file on a desktop computer. For natural
120 abundance data, it works well to only use those spectra that have a FFT fitting
121 error ("error_fft") of less than 0.001. Note that this threshold is less
122 stringent than the one used by the Java program.
123
124 Input
125 =====
126
127 Calisp needs two inputs: a spectrum file in mzML format and a tabular peptide file (PSM).
128 The PSM file contains a column "Spectrum File" that links the peptides to the
129 original spectra files. The mzML files are identified by the run id
130 information stored in the mzML files or the file name.
131 In order to make the association via the file name work in Galaxy one can either
132
133 - use collections where the element identifiers are equal to the data in the column
134 - make sure that dataset names are equal to the data in this column
135
136 Output table
137 ============
138
139 Each row contains one isotopic pattern, defined by the following columns:
140
141 ========================================== ===================
142 Header name Content
143 ========================================== ===================
144 experiment filename of the peptide spectrum match (psm) file
145 ms_run filename of the .mzml file
146 bins bin/mag ids, separated by commas. Calisp expects the protein ids in the psm file to consist of two parts, separated by a delimiter (_ by default). The first part is the bin/mag id, the second part the protein id
147 proteins the ids of the proteins associated with the pattern (without the bin id)
148 peptide the aminoacid sequence of the peptide
149 peptide_mass the mass of the peptide
150 C # of carbon atoms in the peptide
151 N # of nitrogen atoms in the peptide
152 O # of oxygen atoms in the peptide
153 H # of hydrogen atoms in the peptide
154 S # of sulfur atoms in the peptide
155 psm_id psm id
156 psm_mz psm m over z
157 psm_charge psm charge
158 psm_neutrons number of neutrons inferred from custom 'neutron' modifications
159 psm_rank rank of the psm
160 psm_precursor_id id of the ms1 spectrum that was the source of the psm
161 psm_precursor_mz mass over charge of the precursor of the psm
162 pattern_charge charge of the pattern
163 pattern_precursor_id id of the ms1 spectrum that was the source of the pattern
164 pattern_total_intensity total intensity of the pattern
165 pattern_peak_count # of peaks in the pattern
166 pattern_median_peak_spacing medium mass difference between a pattern's peaks
167 spectrum_mass_irregularity a measure for the standard deviation in the mass difference between a pattern's peaks
168 ratio_na the estimated isotope ratio inferred from neutron abundance (sip experiments)
169 ratio_fft the estimated isotope ratio inferred by the fft method (natural isotope abundances)
170 error_fft the remaining error after fitting the pattern with fft
171 error_clumpy the remaining error after fitting the pattern with the clumpy carbon method
172 flag_peptide_contains_sulfur true if peptide contains sulfur
173 flag_peptide_has_modifications true if peptide has modifications
174 flag_peptide_assigned_to_multiple_bins true if peptide is associated with multiple proteins from different bins/mags
175 flag_peptide_assigned_to_multiple_proteins true if peptide is associated with multiple proteins
176 flag_peptide_mass_and_elements_undefined true if peptide has unknown mass and elemental composition
177 flag_psm_has_low_confidence true if psm was flagged as having low confidence (peptide identity uncertain)
178 flag_psm_is_ambiguous true if psm could not be assigned with certainty
179 flag_pattern_is_contaminated true if multiple patterns have one or more shared peaks
180 flag_pattern_is_wobbly true if pattern_median_peak_spacing exceeds a threshold
181 flag_peak_at_minus_one_pos true if a peak was detected immediately before the monoisotopic peak, could indicate overlap with another pattern
182 i0 - i19 the intensities of the first 20 peaks of the pattern
183 m0 - m19 the masses of the first 20 peaks of the pattern
184 c1 - c6 contributions of clumps of 1-6 carbon to ratio_na. These are the outcomes of the clumpy carbon model. These results are only meaningful if the biomass was labeled to saturation.
185 ========================================== ===================
186 ]]></help>
187 <citations>
188 <citation type="doi">10.1186/s40168-022-01454-1</citation>
189 <citation type="doi">10.1073/pnas.1722325115</citation>
190 <citation type="doi">10.1101/2021.03.29.437612</citation>
191 <citation type="doi">10.1093/bioinformatics/bty046</citation>
192 </citations>
193 </tool>