Mercurial > repos > galaxyp > calisp
comparison calisp.xml @ 0:6d93529d19d4 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 060699366b6dd19ad6c3ef3f332f63cc55d75dce
author | galaxyp |
---|---|
date | Thu, 01 Jun 2023 08:34:14 +0000 |
parents | |
children | 867f17ede7f3 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6d93529d19d4 |
---|---|
1 <tool id="calisp" name="calisp" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05"> | |
2 <description>Estimate isotopic composition of peptides from proteomics mass spectrometry data</description> | |
3 <macros> | |
4 <token name="@TOOL_VERSION@">3.0.10</token> | |
5 <token name="@VERSION_SUFFIX@">0</token> | |
6 <token name="@CALISP_REPO@">https://raw.githubusercontent.com/kinestetika/Calisp/208d495674e2b52fe56cf23457c833d1c2527242</token> | |
7 <xml name="input_macro" tokens="multiple"> | |
8 <!-- According to the readme, mzid input is not yet implemented; this macro body is therefore still empty. --> | |
9 </xml> | |
10 </macros> | |
11 <requirements> <!-- The calisp package version is taken from the @TOOL_VERSION@ token. --> | |
12 <requirement type="package" version="@TOOL_VERSION@">calisp</requirement> | |
13 </requirements> | |
14 <!-- Command template: symlinks the spectrum and peptide datasets into spectra/ and psms/ under their element identifiers (every character other than word characters, "-", ".", "," and ":" is replaced by "_"), runs calisp on those two directories writing to calisp-output/, then runs feather2tsv.py on calisp-output/. --> <command detect_errors="aggressive"><![CDATA[ | |
15 #import re | |
16 | |
17 mkdir -p spectra && | |
18 #set escaped_specs = re.sub('[^\w\-\.,:]', '_', str($spectrum_file.element_identifier)) | |
19 ln -s '$spectrum_file' spectra/'$escaped_specs' && | |
20 | |
21 mkdir -p psms && | |
22 #set escaped_peps = re.sub('[^\w\-\.,:]', '_', str($peptide_file.element_identifier)) | |
23 ln -s '$peptide_file' psms/'$escaped_peps' && | |
24 | |
25 calisp | |
26 --spectrum_file spectra/ | |
27 --peptide_file psms/ | |
28 --output_file calisp-output/ | |
29 --mass_accuracy $mass_accuracy | |
30 --bin_delimiter '$bin_delimiter' | |
31 --threads "\${GALAXY_SLOTS:-1}" | |
32 --isotope $isotope | |
33 $compute_clumps && | |
34 '$__tool_directory__/feather2tsv.py' --calisp_output calisp-output/ | |
35 ]]></command> | |
36 <inputs> | |
37 <param argument="--spectrum_file" type="data" multiple="false" format="mzml" label="Spectrum file"/> | |
38 <param argument="--peptide_file" type="data" multiple="false" format="tabular" label="Peptide file" help="Psm file" /> | |
39 <param argument="--mass_accuracy" type="float" value="10" label="Mass accuracy" help="The maximum mass difference between theoretical mass and experimental mass of a peptide" /> | |
40 <param argument="--bin_delimiter" type="text" value="_" label="Bin delimiter" help="For metagenomic data, the delimiter that separates the bin ID from the protein ID (default: &quot;_&quot;). Use &quot;-&quot; to ignore bin IDs entirely."> | |
41 <!-- Whitelist sanitizer: only ASCII letters, digits, "_", "-" and ":" are kept; any other character is replaced by the empty string. --> <sanitizer invalid_char=""> | |
42 <valid initial="string.ascii_letters,string.digits"> | |
43 <add value="_" /> | |
44 <add value="-" /> | |
45 <add value=":" /> | |
46 </valid> | |
47 </sanitizer> | |
48 </param> | |
49 <param argument="--isotope" type="select" label="Target isotope"> | |
50 <option value="13C" selected="true">13C</option> | |
51 <option value="14C">14C</option> | |
52 <option value="15N">15N</option> | |
53 <option value="17O">17O</option> | |
54 <option value="18O">18O</option> | |
55 <option value="2H">2H</option> | |
56 <option value="3H">3H</option> | |
57 <option value="33S">33S</option> | |
58 <option value="34S">34S</option> | |
59 <option value="36S">36S</option> | |
60 </param> | |
61 <!-- The boolean expands to the literal flag when checked and to an empty string otherwise. --> <param argument="--compute_clumps" type="boolean" truevalue="--compute_clumps" falsevalue="" checked="false" label="Compute clumps" help="To compute clumpiness of carbon assimilation. Only use when samples are labeled to saturation. Estimation of clumpiness takes much additional time." /> | |
62 </inputs> | |
63 <outputs> | |
64 <!-- Each *.tsv file found in calisp-output/ becomes one element of this list collection; the part of the file name captured by "designation" is used to name the element. --> <collection name="output" type="list"> | |
65 <discover_datasets pattern="(?P&lt;designation&gt;.*)\.tsv" format="tabular" directory="calisp-output"/> | |
66 </collection> | |
67 </outputs> | |
68 <tests> | |
69 <!-- TODO test data too large; available from here: https://github.com/kinestetika/Calisp/tree/master/test | |
70 if possible include via location in the future | |
71 <test expect_num_outputs="1"> | |
72 <param name="spectrum_file" value="calisp_test_data.mzML" ftype="mzml"/> | |
73 <param name="peptide_file" value="calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/> | |
74 <output_collection name="output" count="1"> | |
75 <element name="calisp_test_data"> | |
76 <assert_contents> | |
77 <has_text text="experiment"/> | |
78 <has_text text="MKH_260min_1800ng"/> | |
79 <has_text text="HOMO"/> | |
80 <has_text text="P13645"/> | |
81 <has_text text="NHEEEMKDLR"/> | |
82 <has_text text="Oxidation"/> | |
83 <has_n_columns n="85"/> | |
84 <has_n_lines n="24"/> | |
85 </assert_contents> | |
86 </element> | |
87 </output_collection> | |
88 </test> | |
89 --> | |
90 </tests> | |
91 <help><![CDATA[ | |
92 Calisp (Calgary approach to isotopes in proteomics) is a program that estimates | |
93 isotopic composition (e.g. 13C/12C, delta13C, 15N/14N etc) of peptides from | |
94 proteomics mass spectrometry data. Input data consist of mzML files and files | |
95 with peptide spectrum matches. | |
96 | |
97 Calisp was originally developed in Java. This Galaxy tool uses the python | |
98 reimplementation https://github.com/kinestetika/Calisp. | |
99 Note that, in contrast to the Java version the python reimplementation does | |
100 not use ``mcl`` . | |
101 Compared to Java versions of calisp, the workflow has been simplified. | |
102 Calisp neither filters out any isotopic patterns nor adds up isotopic | |
103 patterns to reduce noise, as the Java version does. It simply estimates the | |
104 ratio for the target isotopes (e.g. 13C/12C) for every isotopic pattern it can | |
105 subsample. It estimates this ratio based on neutron abundance and using fast | |
106 fourier transforms. The former applies to stable isotope probing experiments. | |
107 The latter applies to natural abundances, or to isotope probing experiments with | |
108 very little added label (e.g. using substrates with <1% additional 13C). The | |
109 motivation for omitting filtering is that keeping all subsampled isotopic | |
110 patterns, including bad ones, will enable training of machine learning | |
111 classifiers. Also, because it was shown that the median provides better | |
112 estimates for species in microbial communities than the mean, adding up isotopic | |
113 patterns to improve precision has lost its purpose. There is more power (and | |
114 sensitivity) in numbers. | |
115 | |
116 Because no data are filtered out and no isotopic patterns get added up, | |
117 calisp analyzes at least ten times as many isotopic patterns compared to the | |
118 Java version. That means calisp.py is about ten times slower; it takes about | |
119 5-10 min per .mzML file on a Desktop computer. For natural | |
120 abundance data, it works well to only use those spectra that have a FFT fitting | |
121 error ("error_fft") of less than 0.001. Note that this threshold is less | |
122 stringent than the one used by the Java program. | |
123 | |
124 Input | |
125 ===== | |
126 | |
127 Calisp needs two inputs: a spectra file in mzML format and a tabular peptide file (PSM). | |
128 The PSM file contains a column "Spectrum File" that links the peptides to the | |
129 original spectra files. The mzML files are identified by the run id | |
130 information stored in the mzML files or the file name. | |
131 In order to make the association via the file name work in Galaxy one can either | |
132 | |
133 - use collections where the element identifiers are equal to the data in the column | |
134 - make sure that dataset names are equal to the data in this column | |
135 | |
136 Output table | |
137 ============ | |
138 | |
139 Each row contains one isotopic pattern, defined by the following columns: | |
140 | |
141 ========================================== =================== | |
142 Header name Content | |
143 ========================================== =================== | |
144 experiment filename of the peptide spectrum match (psm) file | |
145 ms_run filename of the .mzml file | |
146 bins bin/mag ids, separated by commas. Calisp expects the protein ids in the psm file to consist of two parts, separated by a delimiter (_ by default). The first part is the bin/mag id, the second part the protein id | |
147 proteins the ids of the proteins associated with the pattern (without the bin id) | |
148 peptide the aminoacid sequence of the peptide | |
149 peptide_mass the mass of the peptide | |
150 C # of carbon atoms in the peptide | |
151 N # of nitrogen atoms in the peptide | |
152 O # of oxygen atoms in the peptide | |
153 H # of hydrogen atoms in the peptide | |
154 S # of sulfur atoms in the peptide | |
155 psm_id psm id | |
156 psm_mz psm m over z | |
157 psm_charge psm charge | |
158 psm_neutrons number of neutrons inferred from custom 'neutron' modifications | |
159 psm_rank rank of the psm | |
160 psm_precursor_id id of the ms1 spectrum that was the source of the psm | |
161 psm_precursor_mz mass over charge of the precursor of the psm | |
162 pattern_charge charge of the pattern | |
163 pattern_precursor_id id of the ms1 spectrum that was the source of the pattern | |
164 pattern_total_intensity total intensity of the pattern | |
165 pattern_peak_count # of peaks in the pattern | |
166 pattern_median_peak_spacing median mass difference between a pattern's peaks | |
167 spectrum_mass_irregularity a measure for the standard deviation in the mass difference between a pattern's peaks | |
168 ratio_na the estimated isotope ratio inferred from neutron abundance (sip experiments) | |
169 ratio_fft the estimated isotope ratio inferred by the fft method (natural isotope abundances) | |
170 error_fft the remaining error after fitting the pattern with fft | |
171 error_clumpy the remaining error after fitting the pattern with the clumpy carbon method | |
172 flag_peptide_contains_sulfur true if peptide contains sulfur | |
173 flag_peptide_has_modifications true if peptide has modifications | |
174 flag_peptide_assigned_to_multiple_bins true if peptide is associated with multiple proteins from different bins/mags | |
175 flag_peptide_assigned_to_multiple_proteins true if peptide is associated with multiple proteins | |
176 flag_peptide_mass_and_elements_undefined true if peptide has unknown mass and elemental composition | |
177 flag_psm_has_low_confidence true if psm was flagged as having low confidence (peptide identity uncertain) | |
178 flag_psm_is_ambiguous true if psm could not be assigned with certainty | |
179 flag_pattern_is_contaminated true if multiple patterns have one or more shared peaks | |
180 flag_pattern_is_wobbly true if pattern_median_peak_spacing exceeds a threshold | |
181 flag_peak_at_minus_one_pos true if a peak was detected immediately before the monoisotopic peak, could indicate overlap with another pattern | |
182 i0 - i19 the intensities of the first 20 peaks of the pattern | |
183 m0 - m19 the masses of the first 20 peaks of the pattern | |
184 c1 - c6 contributions of clumps of 1-6 carbon to ratio_na. These are the outcomes of the clumpy carbon model. These results are only meaningful if the biomass was labeled to saturation. | |
185 ========================================== =================== | |
186 ]]></help> | |
187 <citations> <!-- Publications cited for this tool, each given by DOI. --> | |
188 <citation type="doi">10.1186/s40168-022-01454-1</citation> | |
189 <citation type="doi">10.1073/pnas.1722325115</citation> | |
190 <citation type="doi">10.1101/2021.03.29.437612</citation> | |
191 <citation type="doi">10.1093/bioinformatics/bty046</citation> | |
192 </citations> | |
193 </tool> |