Mercurial > repos > cpt > cpt_sar_finder
comparison SAR_functions.py @ 1:112751823323 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt | 
|---|---|
| date | Mon, 05 Jun 2023 02:52:57 +0000 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 0:9f62910edcc9 | 1:112751823323 | 
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import sys | |
| 4 import argparse | |
| 5 import os | |
| 6 import re | |
| 7 from Bio import SeqIO | |
| 8 | |
| 9 | |
| 10 class CheckSequence: | |
| 11 """ | |
| 12 SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins | |
| 13 """ | |
| 14 | |
| 15 def __init__(self, protein_name, protein_data): | |
| 16 self.name = protein_name | |
| 17 self.seq = protein_data.seq | |
| 18 self.description = protein_data.description | |
| 19 self.size = len(self.seq) | |
| 20 self.store = {} | |
| 21 | |
| 22 def check_sizes(self, min, max): | |
| 23 """check the minimum and maximum peptide lengths""" | |
| 24 if self.size < min: | |
| 25 print("too small") | |
| 26 elif self.size > max: | |
| 27 print("too large") | |
| 28 else: | |
| 29 print(f"{self.name} : {self.seq}") | |
| 30 return True | |
| 31 | |
| 32 def check_hydrophobicity_and_charge( | |
| 33 self, sar_min=15, sar_max=20, perc_residues="SGAT" | |
| 34 ): | |
| 35 """verifies the existence of a hydrophobic region within the sequence""" | |
| 36 hydrophobic_residues = "['FIWLVMYCATGSP']" # fed through regex | |
| 37 hits = self.store | |
| 38 pos_res = "RK" | |
| 39 neg_res = "DE" | |
| 40 | |
| 41 if self.size > 50: | |
| 42 seq = self.seq[0:50] | |
| 43 else: | |
| 44 seq = self.seq | |
| 45 for sar_size in range(sar_min, sar_max, 1): | |
| 46 for i in range(0, len(seq) - sar_size, 1): | |
| 47 sar_seq = str(seq[i : i + sar_size]) | |
| 48 if re.search( | |
| 49 (hydrophobic_residues + "{" + str(sar_size) + "}"), sar_seq | |
| 50 ): | |
| 51 ( | |
| 52 charge_seq, | |
| 53 charge, | |
| 54 perc_cont, | |
| 55 sar_coords, | |
| 56 nterm_coords, | |
| 57 cterm_coords, | |
| 58 sar_start, | |
| 59 sar_end, | |
| 60 ) = rep_funcs( | |
| 61 self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size | |
| 62 ) | |
| 63 storage_dict( | |
| 64 self=self, | |
| 65 sar_size=sar_size, | |
| 66 sar_seq=sar_seq, | |
| 67 hits=hits, | |
| 68 charge_seq=charge_seq, | |
| 69 charge=charge, | |
| 70 perc_cont=perc_cont, | |
| 71 nterm_coords=nterm_coords, | |
| 72 sar_coords=sar_coords, | |
| 73 cterm_coords=cterm_coords, | |
| 74 sar_start=sar_start, | |
| 75 sar_end=sar_end, | |
| 76 ) | |
| 77 # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) | |
| 78 elif "K" in sar_seq[0] and re.search( | |
| 79 (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[1:] | |
| 80 ): # check frontend snorkels | |
| 81 ( | |
| 82 charge_seq, | |
| 83 charge, | |
| 84 perc_cont, | |
| 85 sar_coords, | |
| 86 nterm_coords, | |
| 87 cterm_coords, | |
| 88 sar_start, | |
| 89 sar_end, | |
| 90 ) = rep_funcs( | |
| 91 self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size | |
| 92 ) | |
| 93 storage_dict( | |
| 94 self=self, | |
| 95 sar_size=sar_size, | |
| 96 sar_seq=sar_seq, | |
| 97 hits=hits, | |
| 98 charge_seq=charge_seq, | |
| 99 charge=charge, | |
| 100 perc_cont=perc_cont, | |
| 101 nterm_coords=nterm_coords, | |
| 102 sar_coords=sar_coords, | |
| 103 cterm_coords=cterm_coords, | |
| 104 sar_start=sar_start, | |
| 105 sar_end=sar_end, | |
| 106 ) | |
| 107 # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) | |
| 108 elif "K" in sar_seq[-1] and re.search( | |
| 109 (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[:-1] | |
| 110 ): # check backend snorkels | |
| 111 ( | |
| 112 charge_seq, | |
| 113 charge, | |
| 114 perc_cont, | |
| 115 sar_coords, | |
| 116 nterm_coords, | |
| 117 cterm_coords, | |
| 118 sar_start, | |
| 119 sar_end, | |
| 120 ) = rep_funcs( | |
| 121 self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size | |
| 122 ) | |
| 123 storage_dict( | |
| 124 self=self, | |
| 125 sar_size=sar_size, | |
| 126 sar_seq=sar_seq, | |
| 127 hits=hits, | |
| 128 charge_seq=charge_seq, | |
| 129 charge=charge, | |
| 130 perc_cont=perc_cont, | |
| 131 nterm_coords=nterm_coords, | |
| 132 sar_coords=sar_coords, | |
| 133 cterm_coords=cterm_coords, | |
| 134 sar_start=sar_start, | |
| 135 sar_end=sar_end, | |
| 136 ) | |
| 137 # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) | |
| 138 continue | |
| 139 | |
| 140 return hits | |
| 141 | |
| 142 def shrink_results(self, sar_min=15, sar_max=20, perc_residues="SGAT"): | |
| 143 """removes repetiive hits, keeps only the shortest and longest of each SAR domain""" | |
| 144 compare_candidates = {} | |
| 145 hits = self.check_hydrophobicity_and_charge(sar_min=sar_min, sar_max=sar_max) | |
| 146 for sar_name, data in hits.items(): | |
| 147 # print(sar_name) | |
| 148 compare_candidates[sar_name] = {} | |
| 149 # print("\nThese are the values: {}".format(v)) | |
| 150 # count_of_times = 0 | |
| 151 tmd_log = [] | |
| 152 for sar_size in range(sar_max, sar_min - 1, -1): | |
| 153 if "TMD_" + str(sar_size) in data: | |
| 154 tmd_log.append(sar_size) | |
| 155 # print(tmd_log) | |
| 156 for idx, the_data in enumerate(data["TMD_" + str(sar_size)]): | |
| 157 # print(the_data[7]) | |
| 158 # print(the_data) | |
| 159 # print(f"This is the index: {idx}") | |
| 160 # print(f"This is the list of data at this index: {the_data}") | |
| 161 if ( | |
| 162 the_data[7] in compare_candidates[sar_name] | |
| 163 ): # index to start | |
| 164 compare_candidates[sar_name][the_data[7]]["count"] += 1 | |
| 165 compare_candidates[sar_name][the_data[7]]["size"].append( | |
| 166 sar_size | |
| 167 ) | |
| 168 compare_candidates[sar_name][the_data[7]]["index"].append( | |
| 169 idx | |
| 170 ) | |
| 171 else: | |
| 172 compare_candidates[sar_name][the_data[7]] = {} | |
| 173 compare_candidates[sar_name][the_data[7]]["count"] = 1 | |
| 174 compare_candidates[sar_name][the_data[7]]["size"] = [ | |
| 175 sar_size | |
| 176 ] | |
| 177 compare_candidates[sar_name][the_data[7]]["index"] = [idx] | |
| 178 hits[sar_name]["biggest_sar"] = tmd_log[0] | |
| 179 for sar_name, compare_data in compare_candidates.items(): | |
| 180 for data in compare_data.values(): | |
| 181 if len(data["size"]) >= 3: | |
| 182 # print(f"{each_size} --> {data}") | |
| 183 minmax = [min(data["size"]), max(data["size"])] | |
| 184 nonminmax = [x for x in data["size"] if x not in minmax] | |
| 185 nonminmax_index = [] | |
| 186 for each_nonminmax in nonminmax: | |
| 187 v = data["size"].index(each_nonminmax) | |
| 188 x = data["index"][v] | |
| 189 nonminmax_index.append(x) | |
| 190 nons = zip(nonminmax, nonminmax_index) | |
| 191 for value in nons: | |
| 192 # hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1]) | |
| 193 hits[sar_name]["TMD_" + str(value[0])][value[1]] = [""] | |
| 194 | |
| 195 return hits | |
| 196 | |
| 197 | |
| 198 def rep_funcs(self, seq, loc, pos_res, neg_res, sar_seq, perc_residues, sar_size): | |
| 199 """run a set of functions together before sending the results to the storage dictionary""" | |
| 200 | |
| 201 charge_seq = str(seq[:loc]) | |
| 202 charge = charge_check(charge_seq, pos_res, neg_res) | |
| 203 perc_cont = percent_calc(sar_seq, perc_residues, int(sar_size)) | |
| 204 sar_start = loc | |
| 205 sar_end = loc + sar_size | |
| 206 sar_coords = "{}..{}".format(loc, loc + sar_size) | |
| 207 nterm_coords = "{}..{}".format("0", loc - 1) | |
| 208 cterm_coords = "{}..{}".format(loc + sar_size + 1, self.size) | |
| 209 | |
| 210 return ( | |
| 211 charge_seq, | |
| 212 charge, | |
| 213 perc_cont, | |
| 214 sar_coords, | |
| 215 nterm_coords, | |
| 216 cterm_coords, | |
| 217 sar_start, | |
| 218 sar_end, | |
| 219 ) | |
| 220 | |
| 221 | |
| 222 ### Extra "helper" functions | |
| 223 def storage_dict( | |
| 224 self, | |
| 225 sar_size, | |
| 226 sar_seq, | |
| 227 hits, | |
| 228 charge_seq, | |
| 229 charge, | |
| 230 perc_cont, | |
| 231 nterm_coords, | |
| 232 sar_coords, | |
| 233 cterm_coords, | |
| 234 sar_start, | |
| 235 sar_end, | |
| 236 ): # probably not good to call "self" a param here...definitley not PEP approved... | |
| 237 """organize dictionary for hydrophobicity check""" | |
| 238 if self.name not in hits: | |
| 239 hits[self.name] = {} | |
| 240 hits[self.name]["description"] = str(self.description) | |
| 241 hits[self.name]["sequence"] = str(self.seq) | |
| 242 hits[self.name]["size"] = str(self.size) | |
| 243 # GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100) | |
| 244 # hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont)) | |
| 245 if "TMD_" + str(sar_size) not in hits[self.name]: | |
| 246 hits[self.name]["TMD_" + str(sar_size)] = [] | |
| 247 hits[self.name]["TMD_" + str(sar_size)].append( | |
| 248 [ | |
| 249 sar_seq, | |
| 250 charge_seq, | |
| 251 charge, | |
| 252 perc_cont, | |
| 253 nterm_coords, | |
| 254 sar_coords, | |
| 255 cterm_coords, | |
| 256 sar_start, | |
| 257 sar_end, | |
| 258 ] | |
| 259 ) | |
| 260 else: | |
| 261 hits[self.name]["TMD_" + str(sar_size)].append( | |
| 262 [ | |
| 263 sar_seq, | |
| 264 charge_seq, | |
| 265 charge, | |
| 266 perc_cont, | |
| 267 nterm_coords, | |
| 268 sar_coords, | |
| 269 cterm_coords, | |
| 270 sar_start, | |
| 271 sar_end, | |
| 272 ] | |
| 273 ) | |
| 274 else: | |
| 275 if "TMD_" + str(sar_size) not in hits[self.name]: | |
| 276 hits[self.name]["TMD_" + str(sar_size)] = [] | |
| 277 hits[self.name]["TMD_" + str(sar_size)].append( | |
| 278 [ | |
| 279 sar_seq, | |
| 280 charge_seq, | |
| 281 charge, | |
| 282 perc_cont, | |
| 283 nterm_coords, | |
| 284 sar_coords, | |
| 285 cterm_coords, | |
| 286 sar_start, | |
| 287 sar_end, | |
| 288 ] | |
| 289 ) | |
| 290 else: | |
| 291 hits[self.name]["TMD_" + str(sar_size)].append( | |
| 292 [ | |
| 293 sar_seq, | |
| 294 charge_seq, | |
| 295 charge, | |
| 296 perc_cont, | |
| 297 nterm_coords, | |
| 298 sar_coords, | |
| 299 cterm_coords, | |
| 300 sar_start, | |
| 301 sar_end, | |
| 302 ] | |
| 303 ) | |
| 304 | |
| 305 | |
| 306 def percent_calc(sequence, residues, size): | |
| 307 """Calculate the percent of a set of residues within an input sequence""" | |
| 308 counted = {} | |
| 309 for aa in sequence: | |
| 310 # print(aa) | |
| 311 if aa in counted: | |
| 312 counted[aa] += 1 | |
| 313 else: | |
| 314 counted[aa] = 1 | |
| 315 residue_amt = 0 | |
| 316 my_ratios = [] | |
| 317 for res_of_interest in residues: | |
| 318 try: | |
| 319 residue_amt = counted[res_of_interest] | |
| 320 except KeyError: | |
| 321 residue_amt = 0 | |
| 322 ratio = residue_amt / size | |
| 323 my_ratios.append((round(ratio * 100, 2))) | |
| 324 | |
| 325 res_rat = list(zip(residues, my_ratios)) | |
| 326 | |
| 327 return res_rat | |
| 328 | |
| 329 | |
| 330 def charge_check(charge_seq, pos_res, neg_res): | |
| 331 charge = 0 | |
| 332 for aa in charge_seq: | |
| 333 if aa in pos_res: | |
| 334 charge += 1 | |
| 335 if aa in neg_res: | |
| 336 charge -= 1 | |
| 337 return charge | |
| 338 | |
| 339 | |
| 340 if __name__ == "__main__": | |
| 341 sequence = "MAGBYYYTRLCVRKLRKGGGHP" | |
| 342 residues = "YL" | |
| 343 size = len(sequence) | |
| 344 print(size) | |
| 345 v = percent_calc(sequence, residues, size) | |
| 346 print(v) | |
| 347 for i in v: | |
| 348 print(i) | 
