Mercurial > repos > pedro_araujo > phage_host_prediction
comparison feature_construction.py @ 0:e4b3fc88efe0 draft
Uploaded
author | pedro_araujo |
---|---|
date | Wed, 27 Jan 2021 13:50:11 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e4b3fc88efe0 |
---|---|
1 | |
class FeatureConstruction:
    """Assemble phage-host interaction features from protein sequences.

    In development. On construction the phage proteins are loaded and
    filtered down to tail/receptor-binding candidates, and a table of
    ``phage--bacterium`` pairs labelled ``'Yes'``/``'No'`` is built (or
    restored from ``files/FeatureDataset`` when that file exists).  The
    ``add_*`` methods append feature columns to ``self.features_data``; the
    ``get_*`` methods compute the same features for one phage/bacterium pair.

    All sequence features are computed on the conjoint-triad encoding, in
    which each amino acid is replaced by the digit ('0'-'6') of its class.
    """

    # Symbols produced by the conjoint-triad encoding.
    _GROUPS = '0123456'
    # Sequence segments used by the grouping features (see _SEGMENTS).
    _LETTERS = 'ABCDEFGHIJ'
    # Number of random negative draws made for every phage.
    _NEGATIVES_PER_PHAGE = 12

    # Amino acid -> conjoint-triad class.
    _CONJOINT_TRIAD = str.maketrans({
        'A': '0', 'G': '0', 'V': '0',
        'C': '1',
        'F': '2', 'I': '2', 'L': '2', 'P': '2',
        'M': '3', 'S': '3', 'T': '3', 'Y': '3',
        'H': '4', 'N': '4', 'Q': '4', 'W': '4',
        'K': '5', 'R': '5',
        'D': '6', 'E': '6',
    })
    # Ambiguous / rare residue codes are mapped to a concrete residue before
    # encoding: X->D, B->N, Z->E, U->C, J->L.
    _AMBIGUOUS_RESIDUES = str.maketrans('XBZUJ', 'DNECL')

    # Segment letter -> (start fraction, end fraction) of the sequence length.
    _SEGMENTS = {
        'A': (0.0, 0.25),
        'B': (0.25, 0.5),
        'C': (0.5, 0.75),
        'D': (0.75, 1.0),
        'E': (0.0, 0.5),
        'F': (0.5, 1.0),
        'G': (0.25, 0.75),
        'H': (0.0, 0.75),
        'I': (0.25, 1.0),
        'J': (0.125, 0.875),
    }

    # A protein is kept when its description contains at least one of these...
    _INCLUDE_KEYWORDS = (
        'fiber', 'fibre', 'spike', 'hydrolase', 'bind', 'depolymerase',
        'peptidase', 'lyase', 'sialidase', 'dextranase', 'lipase', 'adhesin',
        'baseplate', 'protein h', 'recognizing', 'protein j', 'protein g',
        'gpe', 'duf4035', 'host specifity', 'cor protein', 'specificity',
        'baseplate component', 'gp38', 'gp12 tail', 'receptor', 'recognition',
        'tail',
    )
    # ...and none of these.
    _EXCLUDE_KEYWORDS = (
        'nucle', 'dna', 'rna', 'ligase', 'transferase', 'inhibitor',
        'assembly', 'connect', 'nudix', 'atp', 'nad', 'transpos', 'ntp',
        'molybdenum', 'hns', 'gtp', 'riib', 'replicat', 'codon', 'pyruvate',
        'catalyst', 'hinge', 'sheath completion', 'head', 'capsid', 'tape',
        'tip', 'strand', 'matur', 'portal', 'terminase', 'nucl', 'promot',
        'block', 'olfact', 'wedge', 'lysozyme', 'mur', 'sheat',
    )

    # Which species' hosts are used as negatives for phages of each species.
    _NEGATIVE_SOURCE = {
        'ecoli': 'kpneumoniae',
        'kpneumoniae': 'ecoli',
        'abaumannii': 'kpneumoniae',
    }

    def __init__(self):
        """
        In development. Extract features from proteins.

        Reads ``files/NCBI_Phage_Bacteria_Data.csv`` and
        ``files/phagesProteins.json``, filters the phage proteins and then
        either restores ``files/FeatureDataset`` or builds a fresh table of
        positive and randomly drawn negative phage--bacterium pairs.
        """
        import pandas as pd
        import json
        import ast
        from pathlib import Path
        import os
        from random import randint

        data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
        with open('files/phagesProteins.json', encoding='utf-8') as F:
            self.phagesProteins = json.loads(F.read())
        self._filter_phage_domains()
        # NOTE(review): the add_* methods read self.list_prot, which was never
        # assigned.  The filtered tail proteins are the only dictionary with the
        # expected {phage: {protein: [description, sequence]}} layout that the
        # pair table is built from, so it is used here - confirm intent.
        self.list_prot = self.phageTails

        if Path("files/FeatureDataset").is_file():
            self.import_feat_data()
            return

        pairs = {}
        hosts = {'ecoli': {}, 'kpneumoniae': {}, 'abaumannii': {}}
        species_of = {}
        # Listed once instead of once per candidate host.
        genome_files = set(os.listdir('files/bacteria'))
        usable = [phage for phage in self.phageTails
                  if phage in data.index and self.phageTails[phage]]

        # Positive pairs: every recorded host whose proteome file is available.
        for phage in usable:
            species_of[phage] = self._host_species(data.loc[phage, 'Host'])
            for bact in ast.literal_eval(data.loc[phage, 'Host_ID']):
                bact = bact[:-2]  # drops the last two characters of the ID
                if bact + '.json' not in genome_files:
                    continue
                pairs[phage + '--' + bact] = 'Yes'
                if species_of[phage] is not None:
                    hosts[species_of[phage]][bact] = 0

        # Negative pairs: random hosts taken from a different species.
        pools = {species: list(found) for species, found in hosts.items()}
        for phage in usable:
            species = species_of[phage]
            if species is None:
                continue
            pool = pools[self._NEGATIVE_SOURCE[species]]
            if not pool:
                # No host of the other species is available; nothing to draw.
                continue
            for _ in range(self._NEGATIVES_PER_PHAGE):
                bact = pool[randint(0, len(pool) - 1)]
                pairs[phage + '--' + bact] = 'No'

        self.features_data = pd.DataFrame({'ID': list(pairs.keys()), 'Infects': list(pairs.values())})
        self.features_data = self.features_data.set_index('ID')

    @staticmethod
    def _host_species(name):
        """Classify a host name as 'ecoli', 'kpneumoniae', 'abaumannii' or None."""
        lowered = name.lower()
        if 'escherichia' in lowered or 'coli' in lowered:
            return 'ecoli'
        if 'klebsiella' in lowered or 'pneumoniae' in lowered:
            return 'kpneumoniae'
        if 'acinetobacter' in lowered or 'baumannii' in lowered:
            return 'abaumannii'
        return None

    @classmethod
    def _is_tail_related(cls, description):
        """Return True when a protein description passes the keyword filter."""
        lowered = description.lower()
        if not any(word in lowered for word in cls._INCLUDE_KEYWORDS):
            return False
        return not any(word in lowered for word in cls._EXCLUDE_KEYWORDS)

    def _filter_phage_domains(self):
        '''
        Filters out unwanted proteins. Domains that are unknown or are not associated with fibers, spikes, tails, enzymatic or binding are not considered.
        Still in development.

        The result is cached in ``files/phageTails.json``; when that file
        exists it is loaded instead of recomputed.  A fresh computation also
        writes ``files/tails.fasta``.

        :return: phageTails, a dictionary containing only the retained proteins of each phage
        '''
        import json
        from pathlib import Path

        if Path("files/phageTails.json").is_file():
            with open('files/phageTails.json', encoding='utf-8') as F:
                self.phageTails = json.loads(F.read())
            return self.phageTails

        self.phageTails = {}
        for phage in self.phagesProteins:
            proteins = self.phagesProteins[phage]
            # Element 0 of each protein entry is its description.
            self.phageTails[phage] = {
                protein: proteins[protein]
                for protein in proteins
                if self._is_tail_related(proteins[protein][0])
            }
        with open('files/phageTails.json', 'w') as f:
            json.dump(self.phageTails, f)
        self.__create_phage_fasta()
        return self.phageTails

    def _filter_bacteria(self):
        """
        Keep only bacterial proteins predicted to be membrane-bound or extracellular.

        Uses the localisation scores in ``files/results_psort.txt``; the
        result is cached in ``files/externalProts.json``.  Requires
        ``self.bactProteins`` to be set by the caller when no cache exists.

        :return: externalProts, {bacterium: {protein: sequence}}
        """
        import json
        from pathlib import Path
        import pandas as pd

        if Path("files/externalProts.json").is_file():
            with open('files/externalProts.json', encoding='utf-8') as F:
                self.externalProts = json.loads(F.read())
            return self.externalProts

        score_columns = ['Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score',
                         'OuterMembrane_Score', 'Extracellular_Score']
        external = ('CytoplasmicMembrane_Score', 'OuterMembrane_Score', 'Extracellular_Score')

        self.externalProts = {}
        predictions = pd.read_csv('files/results_psort.txt', sep='\t', index_col=False)
        predictions = predictions.set_index('SeqID')
        predictions = predictions.drop_duplicates()
        for bac in self.bactProteins:
            self.externalProts[bac] = {}
            for protein in self.bactProteins[bac]:
                key = protein + ' '  # the index entries carry a trailing space
                if key not in predictions.index:
                    continue
                # Reset per protein so a result never leaks from the previous one.
                best_score = 0.0
                location = None
                for loc in score_columns:
                    score = predictions.loc[key, loc]
                    if score > best_score:
                        best_score = score
                        location = loc
                if location in external:
                    self.externalProts[bac][protein] = self.bactProteins[bac][protein][1]
        if self.externalProts != {}:
            del self.bactProteins
        with open('files/externalProts.json', 'w') as f:
            json.dump(self.externalProts, f)
        return self.externalProts

    def __create_phage_fasta(self):
        """
        Creates a fasta file containing every protein sequence for every phage.
        :return:
        """
        with open('files/tails.fasta', 'w') as F:
            for phage in self.phageTails:
                for prot in self.phageTails[phage]:
                    F.write('>' + prot + '\n' + self.phageTails[phage][prot][1] + '\n')

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _sequences(proteins, trim_quote=False):
        """Yield the sequence (element 1) of every protein entry.

        :param proteins: mapping {protein: [description, sequence, ...]}
        :param trim_quote: cut the sequence at the first '"' when one is
            present (the bacterial feature code has always trimmed there).
        """
        for prot in proteins:
            seq = proteins[prot][1]
            # partition keeps the whole sequence when there is no quote,
            # whereas seq[:seq.find('"')] silently dropped the last residue.
            yield seq.partition('"')[0] if trim_quote else seq

    @staticmethod
    def _load_bacterium(bact):
        """Load the protein dictionary stored in files/bacteria/<bact>.json."""
        import json
        with open('files/bacteria/' + bact + '.json', encoding='utf-8') as F:
            return json.loads(F.read())

    def _encode(self, raw):
        """Resolve ambiguous residue codes, then apply the conjoint-triad encoding."""
        return self.__get_conjoint_triad(raw.translate(self._AMBIGUOUS_RESIDUES))

    def _per_row_features(self, summarise):
        """Apply ``summarise`` to the phage and the bacterium of every row.

        Results are memoised per phage and per bacterium so that each
        proteome is processed (and each bacterium file read) only once.

        :param summarise: callable taking an iterable of raw sequences
        :return: (phage_rows, bact_rows), one summary per row of features_data
        """
        phage_cache = {}
        bact_cache = {}
        phage_rows = []
        bact_rows = []
        for ID in self.features_data.index:
            phage, _, bact = ID.partition('--')
            if phage not in phage_cache:
                phage_cache[phage] = summarise(self._sequences(self.list_prot[phage]))
            if bact not in bact_cache:
                bact_cache[bact] = summarise(
                    self._sequences(self._load_bacterium(bact), trim_quote=True))
            phage_rows.append(phage_cache[phage])
            bact_rows.append(bact_cache[bact])
        return phage_rows, bact_rows

    def _mean_kmers(self, sequences):
        """Average the shifted relative 2-mer frequencies over ``sequences``.

        For each protein the relative frequency of every overlapping 2-mer of
        the encoded sequence is computed and (min frequency / max frequency)
        is subtracted from it; the per-protein values are then averaged.
        NOTE(review): this shift is kept exactly as originally written - it is
        not a conventional min-max normalisation.

        :return: dict with the 49 two-symbol keys ('00' ... '66')
        :raises KeyError: if an encoded sequence contains a symbol outside
            '0'-'6' (an amino-acid code the encoding does not cover)
        """
        from collections import Counter
        groups = self._GROUPS
        totals = {a + b: 0.0 for a in groups for b in groups}
        n_prots = 0
        for raw in sequences:
            n_prots += 1
            encoded = self._encode(raw)
            n_pairs = len(encoded) - 1
            if n_pairs < 1:
                continue  # too short to contain a 2-mer
            counts = Counter(encoded[pos:pos + 2] for pos in range(n_pairs))
            relative = {kmer: count / n_pairs for kmer, count in counts.items()}
            shift = min(relative.values()) / max(relative.values())
            for kmer, freq in relative.items():
                totals[kmer] += freq - shift
        if n_prots != 0:
            for kmer in totals:
                totals[kmer] = totals[kmer] / n_prots
        return totals

    def _mean_composition(self, sequences):
        """Average the shifted class counts over ``sequences`` and scale them to sum to 1.

        For each protein the count of every class has (min count / max count)
        subtracted from it.  Proteins containing none of the seven classes
        (e.g. empty sequences) contribute nothing instead of dividing by zero.

        :return: dict keyed by class symbol '0'-'6'
        """
        groups = self._GROUPS
        totals = {g: 0 for g in groups}
        n_prots = 0
        for raw in sequences:
            n_prots += 1
            encoded = self._encode(raw)
            counts = {g: encoded.count(g) for g in groups}
            highest = max(counts.values())
            if highest == 0:
                continue
            shift = min(counts.values()) / highest
            for g in groups:
                totals[g] += counts[g] - shift
        if n_prots == 0:
            return {g: 0.0 for g in groups}
        total = 0
        for g in groups:
            totals[g] = totals[g] / n_prots
            total += totals[g]
        if total == 0:
            return {g: 0.0 for g in groups}
        return {g: totals[g] / total for g in groups}

    def _mean_grouping(self, sequences):
        """Average the per-segment class fractions over ``sequences``.

        :return: dict keyed by 'group<letter>_<class>'
        """
        groups = self._GROUPS
        letters = self._LETTERS
        totals = {'group' + j + '_' + i: 0 for i in groups for j in letters}
        n_prots = 0
        for raw in sequences:
            n_prots += 1
            encoded = self._encode(raw)
            for j in letters:
                shares = self.__get_grouping(encoded, j)
                for i in groups:
                    totals['group' + j + '_' + i] += shares[i]
        if n_prots == 0:
            return {name: 0.0 for name in totals}
        return {name: value / n_prots for name, value in totals.items()}

    # ------------------------------------------------------------------
    # Feature columns / feature vectors
    # ------------------------------------------------------------------

    def add_kmers(self):
        """Add 'phage_kmer_<xy>' and 'bact_kmer_<xy>' columns to features_data."""
        phage_rows, bact_rows = self._per_row_features(self._mean_kmers)
        groups = self._GROUPS
        for kmer in [a + b for a in groups for b in groups]:
            self.features_data["phage_kmer_" + kmer] = [row[kmer] for row in phage_rows]
            self.features_data["bact_kmer_" + kmer] = [row[kmer] for row in bact_rows]

    def get_kmers(self, phage, bacteria):
        """Return the k-mer features of one pair.

        :param phage: {protein: [description, sequence]} of the phage
        :param bacteria: same layout for the bacterium
        :return: list of 98 values, phage then bacterium for each 2-mer
        """
        phage_means = self._mean_kmers(self._sequences(phage))
        bact_means = self._mean_kmers(self._sequences(bacteria, trim_quote=True))
        solution = []
        for kmer in phage_means:
            solution.append(phage_means[kmer])
            solution.append(bact_means[kmer])
        return solution

    def add_composition(self):
        """Add 'bact_comp_<class>' and 'phage_comp_<class>' columns to features_data."""
        phage_rows, bact_rows = self._per_row_features(self._mean_composition)
        for i in self._GROUPS:
            self.features_data['bact_comp_' + i] = [row[i] for row in bact_rows]
            self.features_data['phage_comp_' + i] = [row[i] for row in phage_rows]

    def get_composition(self, phage, bacteria):
        """Return the composition features of one pair.

        :param phage: {protein: [description, sequence]} of the phage
        :param bacteria: same layout for the bacterium
        :return: list of 14 values, bacterium then phage for each class
        """
        phage_comp = self._mean_composition(self._sequences(phage))
        bact_comp = self._mean_composition(self._sequences(bacteria, trim_quote=True))
        solution = []
        for i in self._GROUPS:
            solution.append(bact_comp[i])
            solution.append(phage_comp[i])
        return solution

    def add_grouping(self):
        """Add 'bact_group<letter>_<class>' and 'phage_group<letter>_<class>' columns."""
        phage_rows, bact_rows = self._per_row_features(self._mean_grouping)
        for i in self._GROUPS:
            for j in self._LETTERS:
                name = 'group' + j + '_' + i
                self.features_data['bact_' + name] = [row[name] for row in bact_rows]
                self.features_data['phage_' + name] = [row[name] for row in phage_rows]

    def get_grouping(self, phage, bacteria):
        """Return the grouping features of one pair.

        :param phage: {protein: [description, sequence]} of the phage
        :param bacteria: same layout for the bacterium
        :return: list of 140 values, bacterium then phage for each
            (class, segment letter) combination
        """
        phage_group = self._mean_grouping(self._sequences(phage))
        bact_group = self._mean_grouping(self._sequences(bacteria, trim_quote=True))
        solution = []
        for i in self._GROUPS:
            for j in self._LETTERS:
                solution.append(bact_group['group' + j + '_' + i])
                solution.append(phage_group['group' + j + '_' + i])
        return solution

    def __get_conjoint_triad(self, prot):
        """Replace every amino-acid letter of ``prot`` by its conjoint-triad class digit.

        Characters without a class are left unchanged.
        """
        return prot.translate(self._CONJOINT_TRIAD)

    def __get_grouping(self, prot, let='A'):
        """Return the fraction of each class within one segment of ``prot``.

        :param prot: encoded sequence (anything whose ``str()`` is the sequence)
        :param let: segment letter, see ``_SEGMENTS``; unknown letters give zeros
        :return: dict keyed by class symbol; all zeros when the segment is
            empty (sequence too short) rather than dividing by zero
        """
        text = str(prot)
        group = {i: 0.0 for i in self._GROUPS}
        bounds = self._SEGMENTS.get(let)
        if bounds is None:
            return group
        size = len(text)
        segment = text[int(size * bounds[0]):int(size * bounds[1])]
        if not segment:
            return group
        for i in self._GROUPS:
            group[i] += segment.count(i) / len(segment)
        return group

    def set_output(self):
        """Add a 'Bacteria' column holding the host species name of each row's phage.

        Works whether 'ID' is still a column or already the index of
        ``features_data``.  Rows whose host matches none of the three
        species get ``None`` so the column stays aligned with the table.
        """
        import pandas as pd
        output = []
        data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0, names=['Phage Name', 'Bacteria Name', 'Bacteria ID'])
        if 'ID' in self.features_data.columns:
            self.features_data = self.features_data.set_index('ID')
        for ID in self.features_data.index:
            phage = ID.partition('--')[0]
            bact = data.loc[phage, 'Bacteria Name'].lower()
            if 'escherichia' in bact:
                output.append('Escherichia coli')
            elif 'klebsiella' in bact:
                output.append('Klebsiella pneumoniae')
            elif 'acinetobacter' in bact:
                output.append('Acinetobacter baumannii')
            else:
                output.append(None)
        self.features_data['Bacteria'] = output

    def save_feat_data(self):
        """Pickle ``features_data`` to files/FeatureDataset and return it."""
        import pickle
        with open('files/FeatureDataset', 'wb') as f:
            pickle.dump(self.features_data, f)
        return self.features_data

    def import_feat_data(self):
        """Load ``features_data`` from files/FeatureDataset and return it."""
        import pickle
        # NOTE: pickle executes arbitrary code on load; only use files
        # written by save_feat_data on a trusted machine.
        with open('files/FeatureDataset', 'rb') as f:
            self.features_data = pickle.load(f)
        return self.features_data
686 | |
if __name__ == '__main__':
    # Build (or restore) the pair table, add every enabled feature family and
    # store the result in files/FeatureDataset.
    builder = FeatureConstruction()
    enabled_steps = (
        builder.add_grouping,
        builder.add_composition,
        builder.add_kmers,
    )
    for step in enabled_steps:
        step()
    builder.save_feat_data()
    # Currently disabled steps: process_net_surf, set_output, add_aa_freq,
    # add_aromaticity, add_flexibility, add_molecular_weight,
    # import_feat_data, netSurf.