Mercurial > repos > galaxyp > calisp
comparison benchmarking.py @ 1:867f17ede7f3 draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 42e5dfeaa309e6ac17b4616314498a3b628272d2
author | galaxyp |
---|---|
date | Thu, 14 Sep 2023 12:49:19 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:6d93529d19d4 | 1:867f17ede7f3 |
---|---|
1 import argparse | |
2 import os | |
3 | |
4 import numpy as np | |
5 import pandas as pd | |
6 | |
7 # Define the ArgumentParser | |
8 parser = argparse.ArgumentParser("List of natural abundances of the isotopes") | |
9 | |
10 parser.add_argument( | |
11 "--input", type=str, metavar="data", help="Input file/folder", required=True | |
12 ) | |
13 | |
14 parser.add_argument( | |
15 "--isotope_abundance_matrix", | |
16 type=str, | |
17 metavar="data", | |
18 help="Isotope abundance matrix", | |
19 required=True, | |
20 ) | |
21 parser.add_argument( | |
22 "--isotope", | |
23 type=str, | |
24 metavar="ISOTOPE", | |
25 help="Isotope", | |
26 required=True, | |
27 ) | |
28 parser.add_argument( | |
29 "--out_summary", | |
30 type=str, | |
31 metavar="output", | |
32 help="Peptide summary output", | |
33 required=False, | |
34 ) | |
35 parser.add_argument( | |
36 "--out_filtered", type=str, metavar="output", help="Filtered output", required=False | |
37 ) | |
38 parser.add_argument( | |
39 "--nominal_values", | |
40 type=str, | |
41 metavar="nominal_values", | |
42 help="Table giving nominal values", | |
43 default=None, | |
44 required=False, | |
45 ) | |
46 | |
47 # Indicate end of argument definitions and parse args | |
48 args = parser.parse_args() | |
49 | |
50 | |
51 def parse_nominal_values(filename): | |
52 nominal_values = {} | |
53 if not filename: | |
54 return nominal_values | |
55 with open(filename) as fh: | |
56 for line in fh: | |
57 line = line.strip() | |
58 if len(line) == 0 or line[0] == "#": | |
59 continue | |
60 line = line.split() | |
61 nominal_values[line[0]] = line[1] | |
62 return nominal_values | |
63 | |
64 | |
65 # Benchmarking section | |
66 # the functions for optimising calis-p data | |
67 | |
68 | |
69 def load_calisp_data(filename, factor): | |
70 # (1) load data | |
71 file_count = 1 | |
72 if os.path.isdir(filename): | |
73 file_data = [] | |
74 file_count = len(os.listdir(filename)) | |
75 for f in os.listdir(filename): | |
76 f = os.path.join(filename, f) | |
77 file_data.append(pd.read_feather(f)) | |
78 base, _ = os.path.splitext(f) | |
79 file_data[-1].to_csv(f"{base}.tsv", sep="\t", index=False) | |
80 data = pd.concat(file_data) | |
81 else: | |
82 data = pd.read_feather(filename) | |
83 base, _ = os.path.splitext(filename) | |
84 data.to_csv(f"{base}.tsv", sep="\t", index=False) | |
85 | |
86 file_success_count = len(data["ms_run"].unique()) | |
87 # (2) calculate deltas | |
88 # ((1-f)/f) - 1 == 1/f -2 | |
89 data["delta_na"] = data["ratio_na"] / ((1 / factor) - 2) * 1000 | |
90 data["delta_fft"] = data["ratio_fft"] / ((1 / factor) - 2) * 1000 | |
91 print( | |
92 f"Loaded {len(data.index)} isotopic patterns from {file_success_count}/{file_count} file(s)" | |
93 ) | |
94 return data | |
95 | |
96 | |
97 def filter_calisp_data(data, target): | |
98 if target.lower() == "na": | |
99 subdata = data.loc[ | |
100 lambda df: (df["flag_peak_at_minus_one_pos"] == False) # noqa: E712 | |
101 & (df["flag_pattern_is_wobbly"] == False) # noqa: E712 | |
102 & (df["flag_psm_has_low_confidence"] == False) # noqa: E712 | |
103 & (df["flag_psm_is_ambiguous"] == False) # noqa: E712 | |
104 & (df["flag_pattern_is_contaminated"] == False) # noqa: E712 | |
105 & (df["flag_peptide_assigned_to_multiple_bins"] == False), # noqa: E712 | |
106 :, | |
107 ] | |
108 elif target.lower() == "fft": | |
109 subdata = data.loc[ | |
110 lambda df: (df["error_fft"] < 0.001) | |
111 & (df["flag_peptide_assigned_to_multiple_bins"] == False), # noqa: E712 | |
112 :, | |
113 ] | |
114 elif target.lower() == "clumpy": | |
115 subdata = data.loc[ | |
116 lambda df: (df["error_clumpy"] < 0.001) | |
117 & (df["flag_peptide_assigned_to_multiple_bins"] == False), # noqa: E712 | |
118 :, | |
119 ] | |
120 | |
121 print( | |
122 f"{len(subdata.index)} ({len(subdata.index)/len(data.index)*100:.1f}%) remaining after filters." | |
123 ) | |
124 return subdata | |
125 | |
126 | |
127 def estimate_clumpiness(data): | |
128 subdata = data.loc[lambda df: df["error_clumpy"] < 0.001, :] | |
129 clumpiness = [] | |
130 for c in ["c1", "c2", "c3", "c4", "c5", "c6"]: | |
131 try: | |
132 count, division = np.histogram(subdata[c], bins=50) | |
133 count = count[1:-1] | |
134 opt = 0.02 * np.where(count == count.max())[0][0] / 0.96 | |
135 clumpiness.append(opt) | |
136 except ValueError: | |
137 pass | |
138 return clumpiness / sum(clumpiness) | |
139 | |
140 | |
141 # the function for benchmarking | |
142 def benchmark_sip_mock_community_data(data, factor, nominal_values): | |
143 background_isotope = 1 - factor | |
144 background_unlabelled = factor | |
145 | |
146 # For false positive discovery rates we set the threshold at the isotope/unlabelled associated with 1/4 of a generation | |
147 # of labeling. The E. coli values (1.7, 4.2 and 7.1) are for 1 generation at 1, 5 and 10% label, | |
148 # and we take the background (1.07) into account as well. | |
149 thresholds = { | |
150 1: 1.07 + (1.7 - 1.07) / 4, | |
151 5: 1.07 + (4.2 - 1.07) / 4, | |
152 10: 1.07 + (7.1 - 1.07) / 4, | |
153 } | |
154 | |
155 filenames = data["ms_run"].unique() | |
156 for fname in filenames: | |
157 print(f"Using nominal value {nominal_values.get(fname, 0)} for {fname}") | |
158 | |
159 bin_names = data["bins"].unique() | |
160 peptide_sequences = data["peptide"].unique() | |
161 benchmarking = pd.DataFrame( | |
162 columns=[ | |
163 "file", | |
164 "bins", | |
165 "% label", | |
166 "ratio", | |
167 "peptide", | |
168 "psm_mz", | |
169 "n(patterns)", | |
170 "mean intensity", | |
171 "ratio_NA median", | |
172 "N mean", | |
173 "ratio_NA SEM", | |
174 "ratio_FFT median", | |
175 "ratio_FFT SEM", | |
176 "False Positive", | |
177 ] | |
178 ) | |
179 false_positives = 0 | |
180 for p in peptide_sequences: | |
181 pep_data = data.loc[lambda df: df["peptide"] == p, :] | |
182 for b in bin_names: | |
183 # bindata = data.loc[lambda df: df["bins"] == b, :] | |
184 for f in filenames: | |
185 nominal_value = nominal_values.get(fname, 0) | |
186 unlabeled_fraction = 1 - nominal_value / 100 | |
187 U = unlabeled_fraction * background_unlabelled | |
188 I = nominal_value / 100 + unlabeled_fraction * background_isotope | |
189 ratio = I / U * 100 | |
190 pepfiledata = pep_data.loc[lambda df: df["ms_run"] == f, :] | |
191 is_false_positive = 0 | |
192 try: | |
193 if ( | |
194 b != "K12" | |
195 and pepfiledata["ratio_na"].median() > thresholds[nominal_value] | |
196 ): | |
197 is_false_positive = 1 | |
198 false_positives += 1 | |
199 except KeyError: | |
200 pass | |
201 benchmarking.loc[len(benchmarking)] = [ | |
202 f, | |
203 b, | |
204 nominal_value, | |
205 ratio, | |
206 p, | |
207 pepfiledata["psm_mz"].median(), | |
208 len(pepfiledata.index), | |
209 pepfiledata["pattern_total_intensity"].mean(), | |
210 pepfiledata["ratio_na"].median(), | |
211 pepfiledata["psm_neutrons"].mean(), | |
212 pepfiledata["ratio_na"].sem(), | |
213 pepfiledata["ratio_fft"].median(), | |
214 pepfiledata["ratio_fft"].sem(), | |
215 is_false_positive, | |
216 ] | |
217 | |
218 benchmarking = benchmarking.sort_values(["bins", "peptide"]) | |
219 benchmarking = benchmarking.reset_index(drop=True) | |
220 return benchmarking | |
221 | |
222 | |
223 rowcol = { | |
224 "13C": (0, 1), | |
225 "14C": (0, 2), | |
226 "15N": (1, 1), | |
227 "17O": (2, 1), | |
228 "18O": (2, 2), | |
229 "2H": (3, 1), | |
230 "3H": (3, 2), | |
231 "33S": (4, 1), | |
232 "34S": (4, 2), | |
233 "36S": (4, 3), | |
234 } | |
235 with open(args.isotope_abundance_matrix) as iamf: | |
236 matrix = [] | |
237 for line in iamf: | |
238 line = line.strip() | |
239 line = line.split("#")[0] | |
240 if line == "": | |
241 continue | |
242 matrix.append([float(x) for x in line.split()]) | |
243 factor = matrix[rowcol[args.isotope][0]][rowcol[args.isotope][1]] | |
244 print(f"Using factor {factor}") | |
245 | |
246 | |
247 # cleaning and filtering data | |
248 data = load_calisp_data(args.input, factor) | |
249 | |
250 if args.out_filtered: | |
251 data = filter_calisp_data(data, "na") | |
252 data["peptide_clean"] = data["peptide"] | |
253 data["peptide_clean"] = data["peptide_clean"].replace("'Oxidation'", "", regex=True) | |
254 data["peptide_clean"] = data["peptide_clean"].replace( | |
255 "'Carbamidomethyl'", "", regex=True | |
256 ) | |
257 data["peptide_clean"] = data["peptide_clean"].replace(r"\s*\[.*\]", "", regex=True) | |
258 | |
259 data["ratio_na"] = data["ratio_na"] * 100 | |
260 data["ratio_fft"] = data["ratio_fft"] * 100 | |
261 data.to_csv(args.out_filtered, sep="\t", index=False) | |
262 | |
263 # The column "% label" indicates the amount of label applied (percentage of label in the glucose). The amount of | |
264 # labeled E. coli cells added corresponded to 1 generation of labeling (50% of E. coli cells were labeled in | |
265 # all experiments except controls) | |
266 | |
267 if args.out_summary: | |
268 nominal_values = parse_nominal_values(args.nominal_values) | |
269 benchmarks = benchmark_sip_mock_community_data(data, factor, nominal_values) | |
270 benchmarks.to_csv(args.out_summary, sep="\t", index=False) |