Mercurial > repos > cpt > cpt_sar_finder
comparison SAR_functions.py @ 1:112751823323 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:52:57 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:9f62910edcc9 | 1:112751823323 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import sys | |
4 import argparse | |
5 import os | |
6 import re | |
7 from Bio import SeqIO | |
8 | |
9 | |
10 class CheckSequence: | |
11 """ | |
12 SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins | |
13 """ | |
14 | |
15 def __init__(self, protein_name, protein_data): | |
16 self.name = protein_name | |
17 self.seq = protein_data.seq | |
18 self.description = protein_data.description | |
19 self.size = len(self.seq) | |
20 self.store = {} | |
21 | |
22 def check_sizes(self, min, max): | |
23 """check the minimum and maximum peptide lengths""" | |
24 if self.size < min: | |
25 print("too small") | |
26 elif self.size > max: | |
27 print("too large") | |
28 else: | |
29 print(f"{self.name} : {self.seq}") | |
30 return True | |
31 | |
32 def check_hydrophobicity_and_charge( | |
33 self, sar_min=15, sar_max=20, perc_residues="SGAT" | |
34 ): | |
35 """verifies the existence of a hydrophobic region within the sequence""" | |
36 hydrophobic_residues = "['FIWLVMYCATGSP']" # fed through regex | |
37 hits = self.store | |
38 pos_res = "RK" | |
39 neg_res = "DE" | |
40 | |
41 if self.size > 50: | |
42 seq = self.seq[0:50] | |
43 else: | |
44 seq = self.seq | |
45 for sar_size in range(sar_min, sar_max, 1): | |
46 for i in range(0, len(seq) - sar_size, 1): | |
47 sar_seq = str(seq[i : i + sar_size]) | |
48 if re.search( | |
49 (hydrophobic_residues + "{" + str(sar_size) + "}"), sar_seq | |
50 ): | |
51 ( | |
52 charge_seq, | |
53 charge, | |
54 perc_cont, | |
55 sar_coords, | |
56 nterm_coords, | |
57 cterm_coords, | |
58 sar_start, | |
59 sar_end, | |
60 ) = rep_funcs( | |
61 self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size | |
62 ) | |
63 storage_dict( | |
64 self=self, | |
65 sar_size=sar_size, | |
66 sar_seq=sar_seq, | |
67 hits=hits, | |
68 charge_seq=charge_seq, | |
69 charge=charge, | |
70 perc_cont=perc_cont, | |
71 nterm_coords=nterm_coords, | |
72 sar_coords=sar_coords, | |
73 cterm_coords=cterm_coords, | |
74 sar_start=sar_start, | |
75 sar_end=sar_end, | |
76 ) | |
77 # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) | |
78 elif "K" in sar_seq[0] and re.search( | |
79 (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[1:] | |
80 ): # check frontend snorkels | |
81 ( | |
82 charge_seq, | |
83 charge, | |
84 perc_cont, | |
85 sar_coords, | |
86 nterm_coords, | |
87 cterm_coords, | |
88 sar_start, | |
89 sar_end, | |
90 ) = rep_funcs( | |
91 self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size | |
92 ) | |
93 storage_dict( | |
94 self=self, | |
95 sar_size=sar_size, | |
96 sar_seq=sar_seq, | |
97 hits=hits, | |
98 charge_seq=charge_seq, | |
99 charge=charge, | |
100 perc_cont=perc_cont, | |
101 nterm_coords=nterm_coords, | |
102 sar_coords=sar_coords, | |
103 cterm_coords=cterm_coords, | |
104 sar_start=sar_start, | |
105 sar_end=sar_end, | |
106 ) | |
107 # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) | |
108 elif "K" in sar_seq[-1] and re.search( | |
109 (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[:-1] | |
110 ): # check backend snorkels | |
111 ( | |
112 charge_seq, | |
113 charge, | |
114 perc_cont, | |
115 sar_coords, | |
116 nterm_coords, | |
117 cterm_coords, | |
118 sar_start, | |
119 sar_end, | |
120 ) = rep_funcs( | |
121 self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size | |
122 ) | |
123 storage_dict( | |
124 self=self, | |
125 sar_size=sar_size, | |
126 sar_seq=sar_seq, | |
127 hits=hits, | |
128 charge_seq=charge_seq, | |
129 charge=charge, | |
130 perc_cont=perc_cont, | |
131 nterm_coords=nterm_coords, | |
132 sar_coords=sar_coords, | |
133 cterm_coords=cterm_coords, | |
134 sar_start=sar_start, | |
135 sar_end=sar_end, | |
136 ) | |
137 # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) | |
138 continue | |
139 | |
140 return hits | |
141 | |
142 def shrink_results(self, sar_min=15, sar_max=20, perc_residues="SGAT"): | |
143 """removes repetiive hits, keeps only the shortest and longest of each SAR domain""" | |
144 compare_candidates = {} | |
145 hits = self.check_hydrophobicity_and_charge(sar_min=sar_min, sar_max=sar_max) | |
146 for sar_name, data in hits.items(): | |
147 # print(sar_name) | |
148 compare_candidates[sar_name] = {} | |
149 # print("\nThese are the values: {}".format(v)) | |
150 # count_of_times = 0 | |
151 tmd_log = [] | |
152 for sar_size in range(sar_max, sar_min - 1, -1): | |
153 if "TMD_" + str(sar_size) in data: | |
154 tmd_log.append(sar_size) | |
155 # print(tmd_log) | |
156 for idx, the_data in enumerate(data["TMD_" + str(sar_size)]): | |
157 # print(the_data[7]) | |
158 # print(the_data) | |
159 # print(f"This is the index: {idx}") | |
160 # print(f"This is the list of data at this index: {the_data}") | |
161 if ( | |
162 the_data[7] in compare_candidates[sar_name] | |
163 ): # index to start | |
164 compare_candidates[sar_name][the_data[7]]["count"] += 1 | |
165 compare_candidates[sar_name][the_data[7]]["size"].append( | |
166 sar_size | |
167 ) | |
168 compare_candidates[sar_name][the_data[7]]["index"].append( | |
169 idx | |
170 ) | |
171 else: | |
172 compare_candidates[sar_name][the_data[7]] = {} | |
173 compare_candidates[sar_name][the_data[7]]["count"] = 1 | |
174 compare_candidates[sar_name][the_data[7]]["size"] = [ | |
175 sar_size | |
176 ] | |
177 compare_candidates[sar_name][the_data[7]]["index"] = [idx] | |
178 hits[sar_name]["biggest_sar"] = tmd_log[0] | |
179 for sar_name, compare_data in compare_candidates.items(): | |
180 for data in compare_data.values(): | |
181 if len(data["size"]) >= 3: | |
182 # print(f"{each_size} --> {data}") | |
183 minmax = [min(data["size"]), max(data["size"])] | |
184 nonminmax = [x for x in data["size"] if x not in minmax] | |
185 nonminmax_index = [] | |
186 for each_nonminmax in nonminmax: | |
187 v = data["size"].index(each_nonminmax) | |
188 x = data["index"][v] | |
189 nonminmax_index.append(x) | |
190 nons = zip(nonminmax, nonminmax_index) | |
191 for value in nons: | |
192 # hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1]) | |
193 hits[sar_name]["TMD_" + str(value[0])][value[1]] = [""] | |
194 | |
195 return hits | |
196 | |
197 | |
198 def rep_funcs(self, seq, loc, pos_res, neg_res, sar_seq, perc_residues, sar_size): | |
199 """run a set of functions together before sending the results to the storage dictionary""" | |
200 | |
201 charge_seq = str(seq[:loc]) | |
202 charge = charge_check(charge_seq, pos_res, neg_res) | |
203 perc_cont = percent_calc(sar_seq, perc_residues, int(sar_size)) | |
204 sar_start = loc | |
205 sar_end = loc + sar_size | |
206 sar_coords = "{}..{}".format(loc, loc + sar_size) | |
207 nterm_coords = "{}..{}".format("0", loc - 1) | |
208 cterm_coords = "{}..{}".format(loc + sar_size + 1, self.size) | |
209 | |
210 return ( | |
211 charge_seq, | |
212 charge, | |
213 perc_cont, | |
214 sar_coords, | |
215 nterm_coords, | |
216 cterm_coords, | |
217 sar_start, | |
218 sar_end, | |
219 ) | |
220 | |
221 | |
222 ### Extra "helper" functions | |
223 def storage_dict( | |
224 self, | |
225 sar_size, | |
226 sar_seq, | |
227 hits, | |
228 charge_seq, | |
229 charge, | |
230 perc_cont, | |
231 nterm_coords, | |
232 sar_coords, | |
233 cterm_coords, | |
234 sar_start, | |
235 sar_end, | |
236 ): # probably not good to call "self" a param here...definitley not PEP approved... | |
237 """organize dictionary for hydrophobicity check""" | |
238 if self.name not in hits: | |
239 hits[self.name] = {} | |
240 hits[self.name]["description"] = str(self.description) | |
241 hits[self.name]["sequence"] = str(self.seq) | |
242 hits[self.name]["size"] = str(self.size) | |
243 # GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100) | |
244 # hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont)) | |
245 if "TMD_" + str(sar_size) not in hits[self.name]: | |
246 hits[self.name]["TMD_" + str(sar_size)] = [] | |
247 hits[self.name]["TMD_" + str(sar_size)].append( | |
248 [ | |
249 sar_seq, | |
250 charge_seq, | |
251 charge, | |
252 perc_cont, | |
253 nterm_coords, | |
254 sar_coords, | |
255 cterm_coords, | |
256 sar_start, | |
257 sar_end, | |
258 ] | |
259 ) | |
260 else: | |
261 hits[self.name]["TMD_" + str(sar_size)].append( | |
262 [ | |
263 sar_seq, | |
264 charge_seq, | |
265 charge, | |
266 perc_cont, | |
267 nterm_coords, | |
268 sar_coords, | |
269 cterm_coords, | |
270 sar_start, | |
271 sar_end, | |
272 ] | |
273 ) | |
274 else: | |
275 if "TMD_" + str(sar_size) not in hits[self.name]: | |
276 hits[self.name]["TMD_" + str(sar_size)] = [] | |
277 hits[self.name]["TMD_" + str(sar_size)].append( | |
278 [ | |
279 sar_seq, | |
280 charge_seq, | |
281 charge, | |
282 perc_cont, | |
283 nterm_coords, | |
284 sar_coords, | |
285 cterm_coords, | |
286 sar_start, | |
287 sar_end, | |
288 ] | |
289 ) | |
290 else: | |
291 hits[self.name]["TMD_" + str(sar_size)].append( | |
292 [ | |
293 sar_seq, | |
294 charge_seq, | |
295 charge, | |
296 perc_cont, | |
297 nterm_coords, | |
298 sar_coords, | |
299 cterm_coords, | |
300 sar_start, | |
301 sar_end, | |
302 ] | |
303 ) | |
304 | |
305 | |
306 def percent_calc(sequence, residues, size): | |
307 """Calculate the percent of a set of residues within an input sequence""" | |
308 counted = {} | |
309 for aa in sequence: | |
310 # print(aa) | |
311 if aa in counted: | |
312 counted[aa] += 1 | |
313 else: | |
314 counted[aa] = 1 | |
315 residue_amt = 0 | |
316 my_ratios = [] | |
317 for res_of_interest in residues: | |
318 try: | |
319 residue_amt = counted[res_of_interest] | |
320 except KeyError: | |
321 residue_amt = 0 | |
322 ratio = residue_amt / size | |
323 my_ratios.append((round(ratio * 100, 2))) | |
324 | |
325 res_rat = list(zip(residues, my_ratios)) | |
326 | |
327 return res_rat | |
328 | |
329 | |
330 def charge_check(charge_seq, pos_res, neg_res): | |
331 charge = 0 | |
332 for aa in charge_seq: | |
333 if aa in pos_res: | |
334 charge += 1 | |
335 if aa in neg_res: | |
336 charge -= 1 | |
337 return charge | |
338 | |
339 | |
340 if __name__ == "__main__": | |
341 sequence = "MAGBYYYTRLCVRKLRKGGGHP" | |
342 residues = "YL" | |
343 size = len(sequence) | |
344 print(size) | |
345 v = percent_calc(sequence, residues, size) | |
346 print(v) | |
347 for i in v: | |
348 print(i) |