comparison SAR_functions.py @ 1:112751823323 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:52:57 +0000
parents
children
comparison
equal deleted inserted replaced
0:9f62910edcc9 1:112751823323
1 #!/usr/bin/env python
2
3 import sys
4 import argparse
5 import os
6 import re
7 from Bio import SeqIO
8
9
10 class CheckSequence:
11 """
12 SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins
13 """
14
15 def __init__(self, protein_name, protein_data):
16 self.name = protein_name
17 self.seq = protein_data.seq
18 self.description = protein_data.description
19 self.size = len(self.seq)
20 self.store = {}
21
22 def check_sizes(self, min, max):
23 """check the minimum and maximum peptide lengths"""
24 if self.size < min:
25 print("too small")
26 elif self.size > max:
27 print("too large")
28 else:
29 print(f"{self.name} : {self.seq}")
30 return True
31
32 def check_hydrophobicity_and_charge(
33 self, sar_min=15, sar_max=20, perc_residues="SGAT"
34 ):
35 """verifies the existence of a hydrophobic region within the sequence"""
36 hydrophobic_residues = "['FIWLVMYCATGSP']" # fed through regex
37 hits = self.store
38 pos_res = "RK"
39 neg_res = "DE"
40
41 if self.size > 50:
42 seq = self.seq[0:50]
43 else:
44 seq = self.seq
45 for sar_size in range(sar_min, sar_max, 1):
46 for i in range(0, len(seq) - sar_size, 1):
47 sar_seq = str(seq[i : i + sar_size])
48 if re.search(
49 (hydrophobic_residues + "{" + str(sar_size) + "}"), sar_seq
50 ):
51 (
52 charge_seq,
53 charge,
54 perc_cont,
55 sar_coords,
56 nterm_coords,
57 cterm_coords,
58 sar_start,
59 sar_end,
60 ) = rep_funcs(
61 self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
62 )
63 storage_dict(
64 self=self,
65 sar_size=sar_size,
66 sar_seq=sar_seq,
67 hits=hits,
68 charge_seq=charge_seq,
69 charge=charge,
70 perc_cont=perc_cont,
71 nterm_coords=nterm_coords,
72 sar_coords=sar_coords,
73 cterm_coords=cterm_coords,
74 sar_start=sar_start,
75 sar_end=sar_end,
76 )
77 # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
78 elif "K" in sar_seq[0] and re.search(
79 (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[1:]
80 ): # check frontend snorkels
81 (
82 charge_seq,
83 charge,
84 perc_cont,
85 sar_coords,
86 nterm_coords,
87 cterm_coords,
88 sar_start,
89 sar_end,
90 ) = rep_funcs(
91 self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
92 )
93 storage_dict(
94 self=self,
95 sar_size=sar_size,
96 sar_seq=sar_seq,
97 hits=hits,
98 charge_seq=charge_seq,
99 charge=charge,
100 perc_cont=perc_cont,
101 nterm_coords=nterm_coords,
102 sar_coords=sar_coords,
103 cterm_coords=cterm_coords,
104 sar_start=sar_start,
105 sar_end=sar_end,
106 )
107 # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
108 elif "K" in sar_seq[-1] and re.search(
109 (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[:-1]
110 ): # check backend snorkels
111 (
112 charge_seq,
113 charge,
114 perc_cont,
115 sar_coords,
116 nterm_coords,
117 cterm_coords,
118 sar_start,
119 sar_end,
120 ) = rep_funcs(
121 self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
122 )
123 storage_dict(
124 self=self,
125 sar_size=sar_size,
126 sar_seq=sar_seq,
127 hits=hits,
128 charge_seq=charge_seq,
129 charge=charge,
130 perc_cont=perc_cont,
131 nterm_coords=nterm_coords,
132 sar_coords=sar_coords,
133 cterm_coords=cterm_coords,
134 sar_start=sar_start,
135 sar_end=sar_end,
136 )
137 # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
138 continue
139
140 return hits
141
142 def shrink_results(self, sar_min=15, sar_max=20, perc_residues="SGAT"):
143 """removes repetiive hits, keeps only the shortest and longest of each SAR domain"""
144 compare_candidates = {}
145 hits = self.check_hydrophobicity_and_charge(sar_min=sar_min, sar_max=sar_max)
146 for sar_name, data in hits.items():
147 # print(sar_name)
148 compare_candidates[sar_name] = {}
149 # print("\nThese are the values: {}".format(v))
150 # count_of_times = 0
151 tmd_log = []
152 for sar_size in range(sar_max, sar_min - 1, -1):
153 if "TMD_" + str(sar_size) in data:
154 tmd_log.append(sar_size)
155 # print(tmd_log)
156 for idx, the_data in enumerate(data["TMD_" + str(sar_size)]):
157 # print(the_data[7])
158 # print(the_data)
159 # print(f"This is the index: {idx}")
160 # print(f"This is the list of data at this index: {the_data}")
161 if (
162 the_data[7] in compare_candidates[sar_name]
163 ): # index to start
164 compare_candidates[sar_name][the_data[7]]["count"] += 1
165 compare_candidates[sar_name][the_data[7]]["size"].append(
166 sar_size
167 )
168 compare_candidates[sar_name][the_data[7]]["index"].append(
169 idx
170 )
171 else:
172 compare_candidates[sar_name][the_data[7]] = {}
173 compare_candidates[sar_name][the_data[7]]["count"] = 1
174 compare_candidates[sar_name][the_data[7]]["size"] = [
175 sar_size
176 ]
177 compare_candidates[sar_name][the_data[7]]["index"] = [idx]
178 hits[sar_name]["biggest_sar"] = tmd_log[0]
179 for sar_name, compare_data in compare_candidates.items():
180 for data in compare_data.values():
181 if len(data["size"]) >= 3:
182 # print(f"{each_size} --> {data}")
183 minmax = [min(data["size"]), max(data["size"])]
184 nonminmax = [x for x in data["size"] if x not in minmax]
185 nonminmax_index = []
186 for each_nonminmax in nonminmax:
187 v = data["size"].index(each_nonminmax)
188 x = data["index"][v]
189 nonminmax_index.append(x)
190 nons = zip(nonminmax, nonminmax_index)
191 for value in nons:
192 # hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1])
193 hits[sar_name]["TMD_" + str(value[0])][value[1]] = [""]
194
195 return hits
196
197
198 def rep_funcs(self, seq, loc, pos_res, neg_res, sar_seq, perc_residues, sar_size):
199 """run a set of functions together before sending the results to the storage dictionary"""
200
201 charge_seq = str(seq[:loc])
202 charge = charge_check(charge_seq, pos_res, neg_res)
203 perc_cont = percent_calc(sar_seq, perc_residues, int(sar_size))
204 sar_start = loc
205 sar_end = loc + sar_size
206 sar_coords = "{}..{}".format(loc, loc + sar_size)
207 nterm_coords = "{}..{}".format("0", loc - 1)
208 cterm_coords = "{}..{}".format(loc + sar_size + 1, self.size)
209
210 return (
211 charge_seq,
212 charge,
213 perc_cont,
214 sar_coords,
215 nterm_coords,
216 cterm_coords,
217 sar_start,
218 sar_end,
219 )
220
221
222 ### Extra "helper" functions
223 def storage_dict(
224 self,
225 sar_size,
226 sar_seq,
227 hits,
228 charge_seq,
229 charge,
230 perc_cont,
231 nterm_coords,
232 sar_coords,
233 cterm_coords,
234 sar_start,
235 sar_end,
236 ): # probably not good to call "self" a param here...definitley not PEP approved...
237 """organize dictionary for hydrophobicity check"""
238 if self.name not in hits:
239 hits[self.name] = {}
240 hits[self.name]["description"] = str(self.description)
241 hits[self.name]["sequence"] = str(self.seq)
242 hits[self.name]["size"] = str(self.size)
243 # GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100)
244 # hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont))
245 if "TMD_" + str(sar_size) not in hits[self.name]:
246 hits[self.name]["TMD_" + str(sar_size)] = []
247 hits[self.name]["TMD_" + str(sar_size)].append(
248 [
249 sar_seq,
250 charge_seq,
251 charge,
252 perc_cont,
253 nterm_coords,
254 sar_coords,
255 cterm_coords,
256 sar_start,
257 sar_end,
258 ]
259 )
260 else:
261 hits[self.name]["TMD_" + str(sar_size)].append(
262 [
263 sar_seq,
264 charge_seq,
265 charge,
266 perc_cont,
267 nterm_coords,
268 sar_coords,
269 cterm_coords,
270 sar_start,
271 sar_end,
272 ]
273 )
274 else:
275 if "TMD_" + str(sar_size) not in hits[self.name]:
276 hits[self.name]["TMD_" + str(sar_size)] = []
277 hits[self.name]["TMD_" + str(sar_size)].append(
278 [
279 sar_seq,
280 charge_seq,
281 charge,
282 perc_cont,
283 nterm_coords,
284 sar_coords,
285 cterm_coords,
286 sar_start,
287 sar_end,
288 ]
289 )
290 else:
291 hits[self.name]["TMD_" + str(sar_size)].append(
292 [
293 sar_seq,
294 charge_seq,
295 charge,
296 perc_cont,
297 nterm_coords,
298 sar_coords,
299 cterm_coords,
300 sar_start,
301 sar_end,
302 ]
303 )
304
305
306 def percent_calc(sequence, residues, size):
307 """Calculate the percent of a set of residues within an input sequence"""
308 counted = {}
309 for aa in sequence:
310 # print(aa)
311 if aa in counted:
312 counted[aa] += 1
313 else:
314 counted[aa] = 1
315 residue_amt = 0
316 my_ratios = []
317 for res_of_interest in residues:
318 try:
319 residue_amt = counted[res_of_interest]
320 except KeyError:
321 residue_amt = 0
322 ratio = residue_amt / size
323 my_ratios.append((round(ratio * 100, 2)))
324
325 res_rat = list(zip(residues, my_ratios))
326
327 return res_rat
328
329
330 def charge_check(charge_seq, pos_res, neg_res):
331 charge = 0
332 for aa in charge_seq:
333 if aa in pos_res:
334 charge += 1
335 if aa in neg_res:
336 charge -= 1
337 return charge
338
339
340 if __name__ == "__main__":
341 sequence = "MAGBYYYTRLCVRKLRKGGGHP"
342 residues = "YL"
343 size = len(sequence)
344 print(size)
345 v = percent_calc(sequence, residues, size)
346 print(v)
347 for i in v:
348 print(i)