comparison feature_construction.py @ 0:e4b3fc88efe0 draft

Uploaded
author pedro_araujo
date Wed, 27 Jan 2021 13:50:11 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e4b3fc88efe0
1
class FeatureConstruction:
    """
    Build a phage--bacterium feature table from protein sequences.

    Each row of ``self.features_data`` is indexed by ``'<phage>--<bacterium>'``
    and labelled ``'Yes'``/``'No'`` in the ``Infects`` column. The ``add_*``
    methods append feature columns to that table; the matching ``get_*``
    methods compute the same features for a single phage/bacterium pair and
    return them as a flat list.

    Sequences are first recoded into seven classes ``'0'``..``'6'`` (see
    ``_CTM``); every feature is computed on that recoded string.

    Protein dictionaries are expected to map a protein id to a sequence-like
    entry whose item ``[0]`` is a textual description and item ``[1]`` is the
    amino-acid sequence (this is how the code indexes them).
    """

    # The seven residue classes, and the ten sequence segments of __get_grouping.
    _GROUPS = '0123456'
    _LETTERS = 'ABCDEFGHIJ'

    # Residue -> class mapping applied by __get_conjoint_triad.
    _CTM = {'A': '0', 'G': '0', 'V': '0', 'C': '1', 'F': '2', 'I': '2', 'L': '2', 'P': '2',
            'M': '3', 'S': '3', 'T': '3', 'Y': '3', 'H': '4', 'N': '4', 'Q': '4', 'W': '4',
            'K': '5', 'R': '5', 'D': '6', 'E': '6'}
    _CTM_TABLE = str.maketrans(_CTM)

    # Non-standard residue codes are replaced before recoding. The original code
    # applied J -> L only in add_kmers; it is applied everywhere here so that a
    # 'J' can no longer produce an unknown k-mer in get_kmers.
    _AMBIGUOUS_TABLE = str.maketrans({'X': 'D', 'B': 'N', 'Z': 'E', 'U': 'C', 'J': 'L'})

    # Segment boundaries (as fractions of the sequence length) for each letter.
    _SEGMENTS = {'A': (0.0, 0.25), 'B': (0.25, 0.5), 'C': (0.5, 0.75), 'D': (0.75, 1.0),
                 'E': (0.0, 0.5), 'F': (0.5, 1.0), 'G': (0.25, 0.75), 'H': (0.0, 0.75),
                 'I': (0.25, 1.0), 'J': (0.125, 0.875)}

    # A protein is kept when its description contains one of _INCLUDE_KEYWORDS
    # and none of _EXCLUDE_KEYWORDS. The original lists were missing a comma after
    # 'recognizing' and after 'portal', which silently merged them with the next
    # keyword ('recognizingprotein j', 'portalterminase').
    _INCLUDE_KEYWORDS = (
        'fiber', 'fibre', 'spike', 'hydrolase', 'bind', 'depolymerase', 'peptidase', 'lyase',
        'sialidase', 'dextranase', 'lipase', 'adhesin', 'baseplate', 'protein h', 'recognizing',
        'protein j', 'protein g', 'gpe', 'duf4035', 'host specifity', 'cor protein', 'specificity',
        'baseplate component', 'gp38', 'gp12 tail', 'receptor', 'recognition', 'tail',
    )
    _EXCLUDE_KEYWORDS = (
        'nucle', 'dna', 'rna', 'ligase', 'transferase', 'inhibitor', 'assembly', 'connect', 'nudix',
        'atp', 'nad', 'transpos', 'ntp', 'molybdenum', 'hns', 'gtp', 'riib', 'replicat', 'codon',
        'pyruvate', 'catalyst', 'hinge', 'sheath completion', 'head', 'capsid', 'tape', 'tip',
        'strand', 'matur', 'portal', 'terminase', 'nucl', 'promot', 'block', 'olfact', 'wedge',
        'lysozyme', 'mur', 'sheat',
    )

    # Number of random draws of a non-host bacterium per phage (duplicates
    # collapse, so a phage may end up with fewer distinct negative pairs).
    _NEGATIVES_PER_PHAGE = 12

    # Which host pool negative examples are drawn from, per phage host species.
    _NEGATIVE_POOL = {'ecoli': 'kpneumoniae', 'kpneumoniae': 'ecoli', 'abaumannii': 'kpneumoniae'}

    def __init__(self):
        """
        In development. Extract features from proteins.

        Reads ``files/NCBI_Phage_Bacteria_Data.csv`` and
        ``files/phagesProteins.json``, filters the phage proteins, and then
        either reloads ``files/FeatureDataset`` (when that file exists) or
        builds the labelled pair table from scratch:

        * positive pairs: every host listed in ``Host_ID`` for which
          ``files/bacteria/<host>.json`` exists;
        * negative pairs: ``_NEGATIVES_PER_PHAGE`` random draws from the hosts
          of another species (see ``_NEGATIVE_POOL``).
        """
        import pandas as pd
        import json
        import ast
        from pathlib import Path
        import os
        from random import randint

        data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
        with open('files/phagesProteins.json', encoding='utf-8') as handle:
            self.phagesProteins = json.load(handle)
        self._filter_phage_domains()
        # NOTE: bacterial proteins are not loaded here and _filter_bacteria is not
        # called; bacterial sequences are read per bacterium by the add_* methods.

        if Path("files/FeatureDataset").is_file():
            self.import_feat_data()
            return

        pairs = {}
        hosts = {'ecoli': {}, 'kpneumoniae': {}, 'abaumannii': {}}
        # List the directory once instead of once per candidate host.
        available = set(os.listdir('files/bacteria'))
        usable = [phage for phage in self.phageTails
                  if phage in data.index and self.phageTails[phage]]

        for phage in usable:
            species = self._host_species(data.loc[phage, 'Host'])
            for bact in ast.literal_eval(data.loc[phage, 'Host_ID']):
                # Drops the last two characters of the id (presumably a version
                # suffix such as '.1' -- TODO confirm against the CSV).
                bact = bact[:-2]
                if bact + '.json' in available:
                    pairs[phage + '--' + bact] = 'Yes'
                    if species is not None:
                        hosts[species][bact] = 0

        pools = {species: list(found) for species, found in hosts.items()}
        for phage in usable:
            species = self._host_species(data.loc[phage, 'Host'])
            if species is None:
                continue
            pool = pools[self._NEGATIVE_POOL[species]]
            if not pool:
                # Nothing to sample from; randint(0, -1) would raise.
                continue
            for _ in range(self._NEGATIVES_PER_PHAGE):
                bact = pool[randint(0, len(pool) - 1)]
                pairs[phage + '--' + bact] = 'No'

        self.features_data = pd.DataFrame({'ID': list(pairs.keys()), 'Infects': list(pairs.values())})
        self.features_data = self.features_data.set_index('ID')

    @staticmethod
    def _host_species(name):
        """Map a host name to 'ecoli', 'kpneumoniae', 'abaumannii' or None (no match)."""
        lowered = str(name).lower()
        if 'escherichia' in lowered or 'coli' in lowered:
            return 'ecoli'
        if 'klebsiella' in lowered or 'pneumoniae' in lowered:
            return 'kpneumoniae'
        if 'acinetobacter' in lowered or 'baumannii' in lowered:
            return 'abaumannii'
        return None

    @classmethod
    def _is_tail_protein(cls, description):
        """Return True when a description hits an include keyword and no exclude keyword."""
        lowered = description.lower()
        if not any(word in lowered for word in cls._INCLUDE_KEYWORDS):
            return False
        return not any(word in lowered for word in cls._EXCLUDE_KEYWORDS)

    def _filter_phage_domains(self):
        """
        Filters out unwanted proteins. Domains that are unknown or are not associated with
        fibers, spikes, tails, enzymatic or binding are not considered.
        Still in development.

        When ``files/phageTails.json`` exists it is loaded as is; otherwise the
        filter is applied to ``self.phagesProteins``, the result is written to
        that file and ``files/tails.fasta`` is regenerated.

        :return: phageTails, a dictionary ``{phage: {protein: entry}}`` containing only
                 the proteins that passed the keyword filter
        """
        import json
        from pathlib import Path

        if Path("files/phageTails.json").is_file():
            with open('files/phageTails.json', encoding='utf-8') as handle:
                self.phageTails = json.load(handle)
            return self.phageTails

        self.phageTails = {}
        for phage, proteins in self.phagesProteins.items():
            self.phageTails[phage] = {
                protein: entry for protein, entry in proteins.items()
                if self._is_tail_protein(entry[0])
            }
        with open('files/phageTails.json', 'w') as handle:
            json.dump(self.phageTails, handle)
        self.__create_phage_fasta()
        return self.phageTails

    def _filter_bacteria(self):
        """
        Keep bacterial proteins whose best PSORT score is a membrane or extracellular one.

        Requires ``self.bactProteins`` (not set by ``__init__``). When
        ``files/externalProts.json`` exists it is loaded instead. Otherwise
        ``files/results_psort.txt`` is read and, for every protein found in it,
        the location column with the highest score above 0 is selected; the
        protein sequence is kept when that column is CytoplasmicMembrane,
        OuterMembrane or Extracellular.

        :return: externalProts, a dictionary ``{bacterium: {protein: sequence}}``
        """
        import json
        from pathlib import Path
        import pandas as pd

        if Path("files/externalProts.json").is_file():
            with open('files/externalProts.json', encoding='utf-8') as handle:
                self.externalProts = json.load(handle)
            return self.externalProts

        score_columns = ['Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score',
                         'OuterMembrane_Score', 'Extracellular_Score']
        external = {'CytoplasmicMembrane_Score', 'OuterMembrane_Score', 'Extracellular_Score'}

        self.externalProts = {}
        predictions = pd.read_csv('files/results_psort.txt', sep='\t', index_col=False)
        predictions = predictions.set_index('SeqID')
        predictions = predictions.drop_duplicates()
        for bac in self.bactProteins:
            self.externalProts[bac] = {}
            for protein in self.bactProteins[bac]:
                # The ids in the prediction index are looked up with a trailing space.
                key = protein + ' '
                if key not in predictions.index:
                    continue
                # Reset per protein: previously a protein with no score above 0
                # reused the location of the preceding protein (or hit an
                # unbound variable on the very first one).
                location = None
                best = 0.0
                for column in score_columns:
                    score = predictions.loc[key, column]
                    if score > best:
                        best = score
                        location = column
                if location in external:
                    self.externalProts[bac][protein] = self.bactProteins[bac][protein][1]
        if self.externalProts:
            del self.bactProteins
        with open('files/externalProts.json', 'w') as handle:
            json.dump(self.externalProts, handle)
        return self.externalProts

    def __create_phage_fasta(self):
        """
        Creates a fasta file (``files/tails.fasta``) containing every protein
        sequence stored in ``self.phageTails``, for every phage.
        :return:
        """
        with open('files/tails.fasta', 'w') as handle:
            for proteins in self.phageTails.values():
                for name, entry in proteins.items():
                    handle.write('>' + name + '\n' + entry[1] + '\n')

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    def _phage_proteins(self):
        """
        Return the ``{phage: {protein: entry}}`` mapping used by the add_* methods.

        The original methods read ``self.list_prot``, which this class never
        assigns. It is still honoured when a caller sets it; otherwise
        ``self.phageTails`` is used.
        NOTE(review): phageTails is assumed to be the intended source -- confirm.
        """
        proteins = getattr(self, 'list_prot', None)
        return self.phageTails if proteins is None else proteins

    def _phage_sequences(self, phage):
        """Return the raw sequences of every stored protein of ``phage``."""
        proteins = self._phage_proteins()[phage]
        return [proteins[name][1] for name in proteins]

    @staticmethod
    def _bacterial_sequences(proteins):
        """
        Return the sequences of a bacterial protein dictionary, each cut at the
        first double quote. ``partition`` leaves a sequence without a quote
        untouched (slicing with ``find`` dropped its last residue).
        """
        return [proteins[name][1].partition('"')[0] for name in proteins]

    def _load_bacterial_sequences(self, bact):
        """Read ``files/bacteria/<bact>.json`` and return its cleaned sequences."""
        import json
        with open('files/bacteria/' + bact + '.json', encoding='utf-8') as handle:
            return self._bacterial_sequences(json.load(handle))

    def _encode(self, sequence):
        """Replace non-standard residue codes, then recode into the seven classes."""
        return self.__get_conjoint_triad(sequence.translate(self._AMBIGUOUS_TABLE))

    @staticmethod
    def _range_offset(values):
        """
        Return min(values) / max(values), or 0.0 when there are no values or the
        maximum is 0. The feature methods subtract this ratio from every value
        of a protein (marked "para normalizar" in the original code).
        """
        values = list(values)
        if not values:
            return 0.0
        highest = max(values)
        return min(values) / highest if highest else 0.0

    @classmethod
    def _kmer_keys(cls):
        """Return the 49 two-class k-mers in column order ('00', '01', ..., '66')."""
        return [first + second for first in cls._GROUPS for second in cls._GROUPS]

    def _kmer_profile(self, sequences):
        """
        Average, over ``sequences``, the relative overlapping 2-mer frequencies
        of the recoded sequence minus that protein's min/max ratio.

        K-mers containing a symbol outside the seven classes are ignored.
        :return: dict keyed like ``_kmer_keys()``; all 0.0 when there are no sequences
        """
        from skbio import Sequence
        totals = dict.fromkeys(self._kmer_keys(), 0.0)
        count_prots = 0
        for raw in sequences:
            count_prots += 1
            freqs = Sequence(self._encode(raw)).kmer_frequencies(2, overlap=True, relative=True)
            offset = self._range_offset(freqs.values())
            for kmer, freq in freqs.items():
                if kmer in totals:
                    totals[kmer] += freq - offset
        if count_prots:
            for kmer in totals:
                totals[kmer] = totals[kmer] / count_prots
        return totals

    def _composition_profile(self, sequences):
        """
        Average, over ``sequences``, the per-class residue counts minus that
        protein's min/max ratio, then rescale the seven averages to sum to 1.

        :return: dict keyed ``'comp_<class>'``; all 0.0 when there are no
                 sequences or the averages sum to 0
        """
        totals = {group: 0.0 for group in self._GROUPS}
        count_prots = 0
        for raw in sequences:
            count_prots += 1
            encoded = self._encode(raw)
            counts = [encoded.count(group) for group in self._GROUPS]
            offset = self._range_offset(counts)
            for group, amount in zip(self._GROUPS, counts):
                totals[group] += amount - offset
        if count_prots:
            means = [totals[group] / count_prots for group in self._GROUPS]
            total = sum(means)
        else:
            means = [0.0] * len(self._GROUPS)
            total = 0
        if not total:
            return {'comp_' + group: 0.0 for group in self._GROUPS}
        return {'comp_' + group: mean / total for group, mean in zip(self._GROUPS, means)}

    def _grouping_profile(self, sequences):
        """
        Average, over ``sequences``, the class fractions of each of the ten
        segments defined in ``_SEGMENTS``.

        :return: dict keyed ``'group<letter>_<class>'``; all 0.0 when there are no sequences
        """
        totals = {'group' + letter + '_' + group: 0.0
                  for group in self._GROUPS for letter in self._LETTERS}
        count_prots = 0
        for raw in sequences:
            count_prots += 1
            encoded = self._encode(raw)
            for letter in self._LETTERS:
                fractions = self.__get_grouping(encoded, letter)
                for group in self._GROUPS:
                    totals['group' + letter + '_' + group] += fractions[group]
        if count_prots:
            for key in totals:
                totals[key] = totals[key] / count_prots
        return totals

    def _profiles_per_pair(self, profile):
        """
        Compute ``profile`` for the phage and the bacterium of every row of
        ``self.features_data``.

        Each distinct phage and bacterium is computed (and, for bacteria, read
        from disk) only once.
        :param profile: one of the ``_*_profile`` methods
        :return: ``(phage_rows, bact_rows)``, two lists of profile dicts in row order
        """
        phage_cache = {}
        bact_cache = {}
        phage_rows = []
        bact_rows = []
        for pair_id in self.features_data.index:
            phage, _, bact = pair_id.partition('--')
            if phage not in phage_cache:
                phage_cache[phage] = profile(self._phage_sequences(phage))
            if bact not in bact_cache:
                bact_cache[bact] = profile(self._load_bacterial_sequences(bact))
            phage_rows.append(phage_cache[phage])
            bact_rows.append(bact_cache[bact])
        return phage_rows, bact_rows

    # ------------------------------------------------------------------
    # Features
    # ------------------------------------------------------------------

    def add_kmers(self):
        """Add the ``phage_kmer_<kmer>`` and ``bact_kmer_<kmer>`` columns to ``features_data``."""
        phage_rows, bact_rows = self._profiles_per_pair(self._kmer_profile)
        for kmer in self._kmer_keys():
            self.features_data['phage_kmer_' + kmer] = [row[kmer] for row in phage_rows]
            self.features_data['bact_kmer_' + kmer] = [row[kmer] for row in bact_rows]

    def get_kmers(self, phage, bacteria):
        """
        Compute the k-mer features of one pair.

        :param phage: ``{protein: entry}`` dictionary of the phage
        :param bacteria: ``{protein: entry}`` dictionary of the bacterium
        :return: list of 98 values: for each k-mer, the phage value then the bacterial value
        """
        phage_profile = self._kmer_profile([phage[prot][1] for prot in phage])
        bact_profile = self._kmer_profile(self._bacterial_sequences(bacteria))
        solution = []
        for kmer in self._kmer_keys():
            solution.append(phage_profile[kmer])
            solution.append(bact_profile[kmer])
        return solution

    def add_composition(self):
        """Add the ``bact_comp_<class>`` and ``phage_comp_<class>`` columns to ``features_data``."""
        phage_rows, bact_rows = self._profiles_per_pair(self._composition_profile)
        for group in self._GROUPS:
            key = 'comp_' + group
            self.features_data['bact_' + key] = [row[key] for row in bact_rows]
            self.features_data['phage_' + key] = [row[key] for row in phage_rows]

    def get_composition(self, phage, bacteria):
        """
        Compute the composition features of one pair.

        :param phage: ``{protein: entry}`` dictionary of the phage
        :param bacteria: ``{protein: entry}`` dictionary of the bacterium
        :return: list of 14 values: for each class, the bacterial value then the phage value
        """
        phage_profile = self._composition_profile([phage[prot][1] for prot in phage])
        bact_profile = self._composition_profile(self._bacterial_sequences(bacteria))
        solution = []
        for group in self._GROUPS:
            solution.append(bact_profile['comp_' + group])
            solution.append(phage_profile['comp_' + group])
        return solution

    def add_grouping(self):
        """Add the ``bact_group<letter>_<class>`` and ``phage_group<letter>_<class>`` columns."""
        phage_rows, bact_rows = self._profiles_per_pair(self._grouping_profile)
        for group in self._GROUPS:
            for letter in self._LETTERS:
                key = 'group' + letter + '_' + group
                self.features_data['bact_' + key] = [row[key] for row in bact_rows]
                self.features_data['phage_' + key] = [row[key] for row in phage_rows]

    def get_grouping(self, phage, bacteria):
        """
        Compute the segment-composition features of one pair.

        :param phage: ``{protein: entry}`` dictionary of the phage
        :param bacteria: ``{protein: entry}`` dictionary of the bacterium
        :return: list of 140 values: for each class and each segment letter,
                 the bacterial value then the phage value
        """
        phage_profile = self._grouping_profile([phage[prot][1] for prot in phage])
        bact_profile = self._grouping_profile(self._bacterial_sequences(bacteria))
        solution = []
        for group in self._GROUPS:
            for letter in self._LETTERS:
                key = 'group' + letter + '_' + group
                solution.append(bact_profile[key])
                solution.append(phage_profile[key])
        return solution

    def __get_conjoint_triad(self, prot):
        """
        Recode an amino-acid string into the seven classes of ``_CTM``.

        Characters that are not keys of ``_CTM`` are left unchanged.
        :param prot: amino-acid sequence (str)
        :return: the recoded string
        """
        return prot.translate(self._CTM_TABLE)

    def __get_grouping(self, prot, let='A'):
        """
        Return the fraction of each class inside one segment of a recoded sequence.

        :param prot: recoded sequence; converted with ``str()`` before slicing
        :param let: segment letter, a key of ``_SEGMENTS``
        :return: dict ``{class: fraction}``; all 0.0 for an unknown letter or
                 when the segment is empty (sequences shorter than 4 residues
                 used to raise ZeroDivisionError)
        """
        group = {symbol: 0.0 for symbol in self._GROUPS}
        bounds = self._SEGMENTS.get(let)
        if bounds is None:
            return group
        text = str(prot)
        segment = text[int(len(text) * bounds[0]):int(len(text) * bounds[1])]
        if not segment:
            return group
        for symbol in self._GROUPS:
            group[symbol] += segment.count(symbol) / len(segment)
        return group

    def set_output(self):
        """
        Add a ``Bacteria`` column holding the host species of each row's phage.

        The species is read from ``files/NCBI_Phage_Bacteria_Data.csv``. Rows
        whose host matches none of the three species get ``None`` so the
        column stays aligned with the table.
        """
        import pandas as pd
        data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0,
                           names=['Phage Name', 'Bacteria Name', 'Bacteria ID'])
        # __init__ already moves 'ID' into the index; only do it when it is still a column.
        if 'ID' in self.features_data.columns:
            self.features_data = self.features_data.set_index('ID')
        species = (('escherichia', 'Escherichia coli'),
                   ('klebsiella', 'Klebsiella pneumoniae'),
                   ('acinetobacter', 'Acinetobacter baumannii'))
        output = []
        for pair_id in self.features_data.index:
            phage = pair_id.partition('--')[0]
            bact = str(data.loc[phage, 'Bacteria Name']).lower()
            output.append(next((label for word, label in species if word in bact), None))
        self.features_data['Bacteria'] = output

    def save_feat_data(self):
        """Pickle ``self.features_data`` to ``files/FeatureDataset`` and return it."""
        import pickle
        with open('files/FeatureDataset', 'wb') as handle:
            pickle.dump(self.features_data, handle)
        return self.features_data

    def import_feat_data(self):
        """
        Load ``self.features_data`` from ``files/FeatureDataset`` and return it.

        NOTE: unpickling executes code embedded in the file; only load a
        FeatureDataset that this program produced.
        """
        import pickle
        with open('files/FeatureDataset', 'rb') as handle:
            self.features_data = pickle.load(handle)
        return self.features_data
685
686
if __name__ == '__main__':
    # Script entry point: build (or reload) the pair table, append the three
    # feature families, and pickle the result to files/FeatureDataset.
    test = FeatureConstruction()
    # test.process_net_surf()
    test.add_grouping()
    test.add_composition()
    test.add_kmers()
    # test.set_output()
    test.save_feat_data()
    # The string literal below is a no-op expression statement; the methods it
    # names are not defined in the FeatureConstruction class shown in this file.
    '''
    test.process_net_surf()
    test.add_aa_freq()
    test.add_aromaticity()
    test.add_flexibility()
    test.add_molecular_weight()'''
    # test.import_feat_data()
    # test.netSurf()