Mercurial > repos > galaxyp > calisp
comparison benchmarking.py @ 1:867f17ede7f3 draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 42e5dfeaa309e6ac17b4616314498a3b628272d2
| author | galaxyp |
|---|---|
| date | Thu, 14 Sep 2023 12:49:19 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6d93529d19d4 | 1:867f17ede7f3 |
|---|---|
| 1 import argparse | |
| 2 import os | |
| 3 | |
| 4 import numpy as np | |
| 5 import pandas as pd | |
| 6 | |
| 7 # Define the ArgumentParser | |
| 8 parser = argparse.ArgumentParser("List of natural abundances of the isotopes") | |
| 9 | |
| 10 parser.add_argument( | |
| 11 "--input", type=str, metavar="data", help="Input file/folder", required=True | |
| 12 ) | |
| 13 | |
| 14 parser.add_argument( | |
| 15 "--isotope_abundance_matrix", | |
| 16 type=str, | |
| 17 metavar="data", | |
| 18 help="Isotope abundance matrix", | |
| 19 required=True, | |
| 20 ) | |
| 21 parser.add_argument( | |
| 22 "--isotope", | |
| 23 type=str, | |
| 24 metavar="ISOTOPE", | |
| 25 help="Isotope", | |
| 26 required=True, | |
| 27 ) | |
| 28 parser.add_argument( | |
| 29 "--out_summary", | |
| 30 type=str, | |
| 31 metavar="output", | |
| 32 help="Peptide summary output", | |
| 33 required=False, | |
| 34 ) | |
| 35 parser.add_argument( | |
| 36 "--out_filtered", type=str, metavar="output", help="Filtered output", required=False | |
| 37 ) | |
| 38 parser.add_argument( | |
| 39 "--nominal_values", | |
| 40 type=str, | |
| 41 metavar="nominal_values", | |
| 42 help="Table giving nominal values", | |
| 43 default=None, | |
| 44 required=False, | |
| 45 ) | |
| 46 | |
| 47 # Indicate end of argument definitions and parse args | |
| 48 args = parser.parse_args() | |
| 49 | |
| 50 | |
| 51 def parse_nominal_values(filename): | |
| 52 nominal_values = {} | |
| 53 if not filename: | |
| 54 return nominal_values | |
| 55 with open(filename) as fh: | |
| 56 for line in fh: | |
| 57 line = line.strip() | |
| 58 if len(line) == 0 or line[0] == "#": | |
| 59 continue | |
| 60 line = line.split() | |
| 61 nominal_values[line[0]] = line[1] | |
| 62 return nominal_values | |
| 63 | |
| 64 | |
| 65 # Benchmarking section | |
| 66 # the functions for optimising calis-p data | |
| 67 | |
| 68 | |
| 69 def load_calisp_data(filename, factor): | |
| 70 # (1) load data | |
| 71 file_count = 1 | |
| 72 if os.path.isdir(filename): | |
| 73 file_data = [] | |
| 74 file_count = len(os.listdir(filename)) | |
| 75 for f in os.listdir(filename): | |
| 76 f = os.path.join(filename, f) | |
| 77 file_data.append(pd.read_feather(f)) | |
| 78 base, _ = os.path.splitext(f) | |
| 79 file_data[-1].to_csv(f"{base}.tsv", sep="\t", index=False) | |
| 80 data = pd.concat(file_data) | |
| 81 else: | |
| 82 data = pd.read_feather(filename) | |
| 83 base, _ = os.path.splitext(filename) | |
| 84 data.to_csv(f"{base}.tsv", sep="\t", index=False) | |
| 85 | |
| 86 file_success_count = len(data["ms_run"].unique()) | |
| 87 # (2) calculate deltas | |
| 88 # ((1-f)/f) - 1 == 1/f -2 | |
| 89 data["delta_na"] = data["ratio_na"] / ((1 / factor) - 2) * 1000 | |
| 90 data["delta_fft"] = data["ratio_fft"] / ((1 / factor) - 2) * 1000 | |
| 91 print( | |
| 92 f"Loaded {len(data.index)} isotopic patterns from {file_success_count}/{file_count} file(s)" | |
| 93 ) | |
| 94 return data | |
| 95 | |
| 96 | |
| 97 def filter_calisp_data(data, target): | |
| 98 if target.lower() == "na": | |
| 99 subdata = data.loc[ | |
| 100 lambda df: (df["flag_peak_at_minus_one_pos"] == False) # noqa: E712 | |
| 101 & (df["flag_pattern_is_wobbly"] == False) # noqa: E712 | |
| 102 & (df["flag_psm_has_low_confidence"] == False) # noqa: E712 | |
| 103 & (df["flag_psm_is_ambiguous"] == False) # noqa: E712 | |
| 104 & (df["flag_pattern_is_contaminated"] == False) # noqa: E712 | |
| 105 & (df["flag_peptide_assigned_to_multiple_bins"] == False), # noqa: E712 | |
| 106 :, | |
| 107 ] | |
| 108 elif target.lower() == "fft": | |
| 109 subdata = data.loc[ | |
| 110 lambda df: (df["error_fft"] < 0.001) | |
| 111 & (df["flag_peptide_assigned_to_multiple_bins"] == False), # noqa: E712 | |
| 112 :, | |
| 113 ] | |
| 114 elif target.lower() == "clumpy": | |
| 115 subdata = data.loc[ | |
| 116 lambda df: (df["error_clumpy"] < 0.001) | |
| 117 & (df["flag_peptide_assigned_to_multiple_bins"] == False), # noqa: E712 | |
| 118 :, | |
| 119 ] | |
| 120 | |
| 121 print( | |
| 122 f"{len(subdata.index)} ({len(subdata.index)/len(data.index)*100:.1f}%) remaining after filters." | |
| 123 ) | |
| 124 return subdata | |
| 125 | |
| 126 | |
| 127 def estimate_clumpiness(data): | |
| 128 subdata = data.loc[lambda df: df["error_clumpy"] < 0.001, :] | |
| 129 clumpiness = [] | |
| 130 for c in ["c1", "c2", "c3", "c4", "c5", "c6"]: | |
| 131 try: | |
| 132 count, division = np.histogram(subdata[c], bins=50) | |
| 133 count = count[1:-1] | |
| 134 opt = 0.02 * np.where(count == count.max())[0][0] / 0.96 | |
| 135 clumpiness.append(opt) | |
| 136 except ValueError: | |
| 137 pass | |
| 138 return clumpiness / sum(clumpiness) | |
| 139 | |
| 140 | |
| 141 # the function for benchmarking | |
| 142 def benchmark_sip_mock_community_data(data, factor, nominal_values): | |
| 143 background_isotope = 1 - factor | |
| 144 background_unlabelled = factor | |
| 145 | |
| 146 # For false positive discovery rates we set the threshold at the isotope/unlabelled associated with 1/4 of a generation | |
| 147 # of labeling. The E. coli values (1.7, 4.2 and 7.1) are for 1 generation at 1, 5 and 10% label, | |
| 148 # and we take the background (1.07) into account as well. | |
| 149 thresholds = { | |
| 150 1: 1.07 + (1.7 - 1.07) / 4, | |
| 151 5: 1.07 + (4.2 - 1.07) / 4, | |
| 152 10: 1.07 + (7.1 - 1.07) / 4, | |
| 153 } | |
| 154 | |
| 155 filenames = data["ms_run"].unique() | |
| 156 for fname in filenames: | |
| 157 print(f"Using nominal value {nominal_values.get(fname, 0)} for {fname}") | |
| 158 | |
| 159 bin_names = data["bins"].unique() | |
| 160 peptide_sequences = data["peptide"].unique() | |
| 161 benchmarking = pd.DataFrame( | |
| 162 columns=[ | |
| 163 "file", | |
| 164 "bins", | |
| 165 "% label", | |
| 166 "ratio", | |
| 167 "peptide", | |
| 168 "psm_mz", | |
| 169 "n(patterns)", | |
| 170 "mean intensity", | |
| 171 "ratio_NA median", | |
| 172 "N mean", | |
| 173 "ratio_NA SEM", | |
| 174 "ratio_FFT median", | |
| 175 "ratio_FFT SEM", | |
| 176 "False Positive", | |
| 177 ] | |
| 178 ) | |
| 179 false_positives = 0 | |
| 180 for p in peptide_sequences: | |
| 181 pep_data = data.loc[lambda df: df["peptide"] == p, :] | |
| 182 for b in bin_names: | |
| 183 # bindata = data.loc[lambda df: df["bins"] == b, :] | |
| 184 for f in filenames: | |
| 185 nominal_value = nominal_values.get(fname, 0) | |
| 186 unlabeled_fraction = 1 - nominal_value / 100 | |
| 187 U = unlabeled_fraction * background_unlabelled | |
| 188 I = nominal_value / 100 + unlabeled_fraction * background_isotope | |
| 189 ratio = I / U * 100 | |
| 190 pepfiledata = pep_data.loc[lambda df: df["ms_run"] == f, :] | |
| 191 is_false_positive = 0 | |
| 192 try: | |
| 193 if ( | |
| 194 b != "K12" | |
| 195 and pepfiledata["ratio_na"].median() > thresholds[nominal_value] | |
| 196 ): | |
| 197 is_false_positive = 1 | |
| 198 false_positives += 1 | |
| 199 except KeyError: | |
| 200 pass | |
| 201 benchmarking.loc[len(benchmarking)] = [ | |
| 202 f, | |
| 203 b, | |
| 204 nominal_value, | |
| 205 ratio, | |
| 206 p, | |
| 207 pepfiledata["psm_mz"].median(), | |
| 208 len(pepfiledata.index), | |
| 209 pepfiledata["pattern_total_intensity"].mean(), | |
| 210 pepfiledata["ratio_na"].median(), | |
| 211 pepfiledata["psm_neutrons"].mean(), | |
| 212 pepfiledata["ratio_na"].sem(), | |
| 213 pepfiledata["ratio_fft"].median(), | |
| 214 pepfiledata["ratio_fft"].sem(), | |
| 215 is_false_positive, | |
| 216 ] | |
| 217 | |
| 218 benchmarking = benchmarking.sort_values(["bins", "peptide"]) | |
| 219 benchmarking = benchmarking.reset_index(drop=True) | |
| 220 return benchmarking | |
| 221 | |
| 222 | |
| 223 rowcol = { | |
| 224 "13C": (0, 1), | |
| 225 "14C": (0, 2), | |
| 226 "15N": (1, 1), | |
| 227 "17O": (2, 1), | |
| 228 "18O": (2, 2), | |
| 229 "2H": (3, 1), | |
| 230 "3H": (3, 2), | |
| 231 "33S": (4, 1), | |
| 232 "34S": (4, 2), | |
| 233 "36S": (4, 3), | |
| 234 } | |
| 235 with open(args.isotope_abundance_matrix) as iamf: | |
| 236 matrix = [] | |
| 237 for line in iamf: | |
| 238 line = line.strip() | |
| 239 line = line.split("#")[0] | |
| 240 if line == "": | |
| 241 continue | |
| 242 matrix.append([float(x) for x in line.split()]) | |
| 243 factor = matrix[rowcol[args.isotope][0]][rowcol[args.isotope][1]] | |
| 244 print(f"Using factor {factor}") | |
| 245 | |
| 246 | |
| 247 # cleaning and filtering data | |
| 248 data = load_calisp_data(args.input, factor) | |
| 249 | |
| 250 if args.out_filtered: | |
| 251 data = filter_calisp_data(data, "na") | |
| 252 data["peptide_clean"] = data["peptide"] | |
| 253 data["peptide_clean"] = data["peptide_clean"].replace("'Oxidation'", "", regex=True) | |
| 254 data["peptide_clean"] = data["peptide_clean"].replace( | |
| 255 "'Carbamidomethyl'", "", regex=True | |
| 256 ) | |
| 257 data["peptide_clean"] = data["peptide_clean"].replace(r"\s*\[.*\]", "", regex=True) | |
| 258 | |
| 259 data["ratio_na"] = data["ratio_na"] * 100 | |
| 260 data["ratio_fft"] = data["ratio_fft"] * 100 | |
| 261 data.to_csv(args.out_filtered, sep="\t", index=False) | |
| 262 | |
| 263 # The column "% label" indicates the amount of label applied (percentage of label in the glucose). The amount of | |
| 264 # labeled E. coli cells added corresponded to 1 generation of labeling (50% of E. coli cells were labeled in | |
| 265 # all experiments except controls) | |
| 266 | |
| 267 if args.out_summary: | |
| 268 nominal_values = parse_nominal_values(args.nominal_values) | |
| 269 benchmarks = benchmark_sip_mock_community_data(data, factor, nominal_values) | |
| 270 benchmarks.to_csv(args.out_summary, sep="\t", index=False) |
