annotate phage_host_prediction/feature_construction.py @ 2:3e1e8be4e65c draft default tip

Uploaded
author pedro_araujo
date Fri, 02 Apr 2021 10:11:13 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
1
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
2 class FeatureConstruction:
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
3
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __init__(self):
    """
    In development. Load the phage proteins and build the phage--bacterium table.

    Reads ``files/NCBI_Phage_Bacteria_Data.csv`` (indexed by its first column; the
    ``Host`` and ``Host_ID`` columns are used) and ``files/phagesProteins.json``,
    then calls ``_filter_phage_domains`` so that ``self.phageTails`` is available.

    If ``files/FeatureDataset`` exists, the table is loaded via ``import_feat_data``.
    Otherwise ``self.features_data`` is built as a DataFrame indexed by
    ``'<phage>--<bacterium>'`` with one column, ``Infects``:

    * ``'Yes'`` for every entry of ``Host_ID`` (last two characters removed) that has
      a ``<bacterium>.json`` file in ``files/bacteria``;
    * ``'No'`` for up to 12 bacteria drawn at random, with replacement, from the
      positive hosts of another genus: Klebsiella hosts for Escherichia and
      Acinetobacter phages, Escherichia hosts for Klebsiella phages.

    Only phages present in the CSV index and with at least one retained protein in
    ``self.phageTails`` are considered. A phage whose negative pool is empty simply
    gets no negative pairs, and an existing ``'Yes'`` pair is never relabelled ``'No'``.
    """
    import ast
    import json
    import os
    from pathlib import Path
    from random import choice

    import pandas as pd

    negatives_per_phage = 12

    data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
    with open('files/phagesProteins.json', encoding='utf-8') as F:
        self.phagesProteins = json.loads(F.read())
    self._filter_phage_domains()
    # Filtering of bacterial proteins (files/bactProteins.json + _filter_bacteria)
    # is currently disabled.

    if Path("files/FeatureDataset").is_file():
        self.import_feat_data()
        return

    def host_group(host_name):
        """Map a host name to 'ecoli', 'kpneumoniae', 'abaumannii' or None."""
        lowered = host_name.lower()
        if 'escherichia' in lowered or 'coli' in lowered:
            return 'ecoli'
        if 'klebsiella' in lowered or 'pneumoniae' in lowered:
            return 'kpneumoniae'
        if 'acinetobacter' in lowered or 'baumannii' in lowered:
            return 'abaumannii'
        return None

    all_phages = {}
    # Bacteria seen as positive hosts, per genus. Dicts keep insertion order and
    # uniqueness. Acinetobacter hosts are never used as negatives, so they are
    # not collected.
    pools = {'ecoli': {}, 'kpneumoniae': {}}
    available = None  # directory listing, read once on first use

    # Pass 1: positive pairs.
    for phage, tails in self.phageTails.items():
        if phage not in data.index or not tails:
            continue
        for bact in ast.literal_eval(data.loc[phage, 'Host_ID']):
            bact = bact[:-2]  # drop the two-character suffix of the host ID
            if available is None:
                available = set(os.listdir('files/bacteria'))
            if bact + '.json' not in available:
                continue
            all_phages[phage + '--' + bact] = 'Yes'
            # NOTE(review): only bacteria with a JSON file are registered as
            # negative candidates, since later steps open that file - confirm.
            group = host_group(data.loc[phage, 'Host'])
            if group in pools:
                pools[group][bact] = 0

    # Pass 2: negative pairs, drawn once every pool is complete.
    negative_source = {'ecoli': 'kpneumoniae', 'kpneumoniae': 'ecoli', 'abaumannii': 'kpneumoniae'}
    candidates = {group: list(members) for group, members in pools.items()}
    for phage, tails in self.phageTails.items():
        if phage not in data.index or not tails:
            continue
        source = negative_source.get(host_group(data.loc[phage, 'Host']))
        pool = candidates.get(source, [])
        if not pool:
            # No host of the other genus is known: nothing to sample from.
            continue
        for _ in range(negatives_per_phage):
            # Repeated draws collapse onto the same key, hence "up to" 12 negatives.
            all_phages.setdefault(phage + '--' + choice(pool), 'No')

    self.features_data = pd.DataFrame({'ID': list(all_phages.keys()), 'Infects': list(all_phages.values())})
    self.features_data = self.features_data.set_index('ID')
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
67
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def _filter_phage_domains(self):
    """
    Filters out unwanted proteins. Still in development.

    A protein is kept when the lower-cased first element of its entry in
    ``self.phagesProteins`` contains at least one keyword related to fibers, spikes,
    tails, enzymatic activity or binding, and none of the keywords of the exclusion
    list. Matching is by plain substring.

    The result is cached in ``files/phageTails.json``. When that file already exists
    it is loaded as is and nothing is recomputed (a cache produced with older keyword
    lists is therefore not refreshed). Otherwise the filtered dictionary is written
    there and ``files/tails.fasta`` is regenerated through ``__create_phage_fasta``.

    :return: phageTails, a dictionary {phage: {protein: entry}} containing only the
        retained proteins (also stored in ``self.phageTails``)
    """
    import json
    from pathlib import Path

    # Every keyword is followed by a comma on purpose: adjacent string literals are
    # concatenated by Python, which previously merged 'recognizing' + 'protein j'
    # and 'portal' + 'terminase' into keywords that never matched.
    wanted = (
        'fiber', 'fibre', 'spike', 'hydrolase', 'bind', 'depolymerase', 'peptidase', 'lyase',
        'sialidase', 'dextranase', 'lipase', 'adhesin', 'baseplate', 'protein h', 'recognizing',
        'protein j', 'protein g', 'gpe', 'duf4035', 'host specifity', 'cor protein', 'specificity',
        'baseplate component', 'gp38', 'gp12 tail', 'receptor', 'recognition', 'tail',
    )
    unwanted = (
        'nucle', 'dna', 'rna', 'ligase', 'transferase', 'inhibitor', 'assembly', 'connect',
        'nudix', 'atp', 'nad', 'transpos', 'ntp', 'molybdenum', 'hns',
        'gtp', 'riib', 'inhibitor', 'replicat', 'codon', 'pyruvate', 'catalyst', 'hinge',
        'sheath completion', 'head', 'capsid', 'tape', 'tip', 'strand', 'matur', 'portal',
        'terminase', 'nucl', 'promot', 'block', 'olfact', 'wedge', 'lysozyme', 'mur', 'sheat',
    )

    my_file = Path("files/phageTails.json")
    if my_file.is_file():
        with open('files/phageTails.json', encoding='utf-8') as F:
            self.phageTails = json.loads(F.read())
        return self.phageTails

    self.phageTails = {}
    for phage, proteins in self.phagesProteins.items():
        kept = {}
        for protein, entry in proteins.items():
            description = entry[0].lower()
            if any(word in description for word in wanted) \
                    and not any(word in description for word in unwanted):
                kept[protein] = entry
        # Phages without any retained protein still get an (empty) entry.
        self.phageTails[phage] = kept

    with open('files/phageTails.json', 'w') as f:
        json.dump(self.phageTails, f)
    self.__create_phage_fasta()
    return self.phageTails
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
112
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def _filter_bacteria(self):
    """
    Keep only the bacterial proteins predicted to be membrane-bound or secreted.

    For every protein of ``self.bactProteins`` that has a row in
    ``files/results_psort.txt`` (tab separated, looked up as ``'<protein> '`` with a
    trailing space in the ``SeqID`` column), the localisation score column with the
    highest strictly positive value is selected. The second element of the protein
    entry is kept when that column is ``CytoplasmicMembrane_Score``,
    ``OuterMembrane_Score`` or ``Extracellular_Score``. Proteins without any positive
    score are not kept.

    The result is cached in ``files/externalProts.json``; when that file exists it is
    loaded instead of being recomputed. After a fresh computation with a non-empty
    result, ``self.bactProteins`` is deleted.

    :return: externalProts, a dictionary {bacterium: {protein: value}} (also stored
        in ``self.externalProts``)
    """
    import json
    from pathlib import Path

    import pandas as pd

    my_file = Path("files/externalProts.json")
    if my_file.is_file():
        with open('files/externalProts.json', encoding='utf-8') as F:
            self.externalProts = json.loads(F.read())
        return self.externalProts

    score_columns = ('Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score',
                     'OuterMembrane_Score', 'Extracellular_Score')
    external_columns = ('CytoplasmicMembrane_Score', 'OuterMembrane_Score', 'Extracellular_Score')

    predictions = pd.read_csv('files/results_psort.txt', sep='\t', index_col=False)
    predictions = predictions.set_index('SeqID')
    # De-duplicate on the SeqID index itself. DataFrame.drop_duplicates() ignores
    # the index and would discard distinct proteins that share identical scores.
    predictions = predictions[~predictions.index.duplicated(keep='first')]

    self.externalProts = {}
    for bac, proteins in self.bactProteins.items():
        kept = {}
        for protein, entry in proteins.items():
            seq_id = protein + ' '
            if seq_id not in predictions.index:
                continue
            # Reset per protein so a protein without a positive score cannot
            # inherit the location of the previous one.
            location = None
            maxScore = 0.0
            for loc in score_columns:
                score = predictions.loc[seq_id, loc]
                if score > maxScore:
                    maxScore = score
                    location = loc
            if location in external_columns:
                kept[protein] = entry[1]
        self.externalProts[bac] = kept

    if self.externalProts != {}:
        del self.bactProteins
    # NOTE(review): the cache is written even when the result is empty - confirm.
    with open('files/externalProts.json', 'w') as f:
        json.dump(self.externalProts, f)
    return self.externalProts
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
142
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __create_phage_fasta(self):
    """
    Write every protein held in ``self.phageTails`` to ``files/tails.fasta``.

    Each protein becomes one FASTA record: a ``>`` header line with the protein
    identifier, followed by one line holding the second element of its entry.
    The file is overwritten.
    :return:
    """
    with open('files/tails.fasta', 'w') as handle:
        handle.writelines(
            ''.join(('>', identifier, '\n', entry[1], '\n'))
            for proteins in self.phageTails.values()
            for identifier, entry in proteins.items()
        )
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
152
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def add_kmers(self):
    """
    Add 2-mer frequency features of the encoded proteins to ``self.features_data``.

    For every row ``'<phage>--<bacterium>'`` of ``self.features_data`` two profiles of
    49 values (all ordered pairs over the alphabet ``'0'``-``'6'``) are computed:

    * one from the proteins of ``self.list_prot[phage]``;
    * one from the proteins stored in ``files/bacteria/<bacterium>.json``, each
      sequence being cut at its first double quote, if any.

    The profiles are stored in the columns ``phage_kmer_<pair>`` and
    ``bact_kmer_<pair>``. Each profile is computed once per phage / bacterium and
    reused for all the rows that mention it.

    Sequences are encoded with ``__get_conjoint_triad`` before counting; that method
    is assumed to return a string over ``'0'``-``'6'`` (TODO confirm - any other
    2-mer raises ``KeyError``).
    """
    import json

    from skbio import Sequence

    groups = '0123456'
    kmers = [first + second for first in groups for second in groups]
    # Residue codes replaced before encoding: X->D, B->N, Z->E, U->C, J->L.
    substitutions = str.maketrans('XBZUJ', 'DNECL')

    def mean_profile(sequences):
        """Return {2-mer: value averaged over the sequences}; all zeros if there are none."""
        totals = dict.fromkeys(kmers, 0.0)
        count_prots = 0
        for raw in sequences:
            count_prots += 1
            encoded = Sequence(self.__get_conjoint_triad(raw.translate(substitutions)))
            temp = encoded.kmer_frequencies(2, overlap=True, relative=True)
            if not temp:
                continue
            # "Normalisation" kept from the original implementation: every relative
            # frequency is shifted down by min/max of this protein's frequencies.
            # NOTE(review): this is not the usual (x - min) / (max - min) scaling.
            shift = min(temp.values()) / max(temp.values())
            for kmer, value in temp.items():
                totals[kmer] += value - shift
        if count_prots != 0:
            for kmer in totals:
                totals[kmer] = totals[kmer] / count_prots
        return totals

    phage_profiles = {}
    bact_profiles = {}
    phage_rows = []
    bact_rows = []
    for ID in self.features_data.index:
        phage, _, bact = ID.partition('--')
        if phage not in phage_profiles:
            proteins = self.list_prot[phage]
            phage_profiles[phage] = mean_profile(proteins[prot][1] for prot in proteins)
        if bact not in bact_profiles:
            with open('files/bacteria/' + bact + '.json', encoding='utf-8') as F:
                bact_prots = json.loads(F.read())
            # partition() keeps the whole sequence when it holds no double quote;
            # slicing with find() would silently drop the last residue.
            bact_profiles[bact] = mean_profile(
                bact_prots[prot][1].partition('"')[0] for prot in bact_prots)
        phage_rows.append(phage_profiles[phage])
        bact_rows.append(bact_profiles[bact])

    for kmer in kmers:
        self.features_data['phage_kmer_' + kmer] = [row[kmer] for row in phage_rows]
        self.features_data['bact_kmer_' + kmer] = [row[kmer] for row in bact_rows]
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
239
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def get_kmers(self, phage, bacteria):
    """
    Compute the 2-mer frequency features of one phage / bacterium pair.

    :param phage: mapping {protein: entry} whose entries hold the sequence at index 1
    :param bacteria: mapping {protein: entry} whose entries hold the sequence at
        index 1; each sequence is cut at its first double quote, if any
    :return: a list of 98 floats: for every ordered pair over the alphabet
        ``'0'``-``'6'`` the phage value followed by the bacterium value. The values
        of an empty mapping are all 0.0.

    Sequences are encoded with ``__get_conjoint_triad`` before counting; that method
    is assumed to return a string over ``'0'``-``'6'`` (TODO confirm - any other
    2-mer raises ``KeyError``).
    """
    from skbio import Sequence

    groups = '0123456'
    kmers = [first + second for first in groups for second in groups]
    # Residue codes replaced before encoding: X->D, B->N, Z->E, U->C.
    # NOTE(review): add_kmers additionally maps J->L - confirm whether it should apply here.
    substitutions = str.maketrans('XBZU', 'DNEC')

    def mean_profile(sequences):
        """Return {2-mer: value averaged over the sequences}; all zeros if there are none."""
        totals = dict.fromkeys(kmers, 0.0)
        count_prots = 0
        for raw in sequences:
            count_prots += 1
            encoded = Sequence(self.__get_conjoint_triad(raw.translate(substitutions)))
            temp = encoded.kmer_frequencies(2, overlap=True, relative=True)
            if not temp:
                continue
            # "Normalisation" kept from the original implementation: every relative
            # frequency is shifted down by min/max of this protein's frequencies.
            # NOTE(review): this is not the usual (x - min) / (max - min) scaling.
            shift = min(temp.values()) / max(temp.values())
            for kmer, value in temp.items():
                totals[kmer] += value - shift
        if count_prots != 0:
            for kmer in totals:
                totals[kmer] = totals[kmer] / count_prots
        return totals

    phage_profile = mean_profile(phage[prot][1] for prot in phage)
    # partition() keeps the whole sequence when it holds no double quote;
    # slicing with find() would silently drop the last residue.
    bact_profile = mean_profile(bacteria[prot][1].partition('"')[0] for prot in bacteria)

    solution = []
    for kmer in kmers:
        solution.append(phage_profile[kmer])
        solution.append(bact_profile[kmer])
    return solution
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
302
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def add_composition(self):
    """
    Add conjoint-triad class composition columns to ``self.features_data``.

    Every index entry of ``self.features_data`` is read as
    ``<phage>--<bacterium>``. Phage proteins come from
    ``self.list_prot[phage]``; bacterial proteins are loaded from
    ``files/bacteria/<bacterium>.json``. In both cases element ``[1]`` of
    each protein record is used as the sequence. For each organism the
    per-protein counts of the seven class digits are averaged over its
    proteins and then rescaled to sum to one. The results are stored as
    ``bact_comp_<d>`` / ``phage_comp_<d>`` columns (d = 0..6).

    An organism with no proteins, or whose sequences contain none of the
    class digits (e.g. an empty sequence), gets 0.0 in every column instead
    of raising ``ZeroDivisionError``.
    """
    import json

    groups = '0123456'

    def encode(raw):
        # Ambiguous/rare residue codes are replaced by one concrete residue
        # before the sequence is translated into class digits.
        cleaned = raw.replace('X', 'D').replace('B', 'N').replace('Z', 'E').replace('U', 'C')
        return self.__get_conjoint_triad(cleaned)

    def strip_tail(seq):
        # Keep everything before the first double quote.
        # NOTE(review): when there is no quote, find() returns -1 and the
        # last residue is dropped. Kept as-is so feature values do not
        # change - confirm whether that is intended.
        return seq[:seq.find('"')]

    def composition(raw_sequences):
        # Normalised class composition (digit -> fraction) of one organism.
        sums = dict.fromkeys(groups, 0)
        n_prots = 0
        for raw in raw_sequences:
            n_prots += 1
            encoded = encode(raw)
            counts = {g: encoded.count(g) for g in groups}
            highest = max(counts.values())
            # NOTE(review): the original "normalisation" subtracts the ratio
            # min/max from each raw count; preserved unchanged. The ratio is
            # taken as 0 when no class digit occurs at all (division by zero
            # in the original).
            offset = min(counts.values()) / highest if highest else 0.0
            for g in groups:
                sums[g] += counts[g] - offset
        if n_prots == 0:
            return dict.fromkeys(groups, 0.0)
        total = 0
        for g in groups:
            sums[g] = sums[g] / n_prots
            total += sums[g]
        if total == 0:
            return dict.fromkeys(groups, 0.0)
        return {g: sums[g] / total for g in groups}

    phage_cols = {g: [] for g in groups}
    bact_cols = {g: [] for g in groups}
    last_phage = ''
    last_bact = ''
    for pair_id in self.features_data.index:
        cut = pair_id.find('--')
        phage = pair_id[:cut]
        bact = pair_id[cut + 2:]

        # Consecutive rows that share an organism reuse the previous values.
        if phage == last_phage:
            phage_row = {g: phage_cols[g][-1] for g in groups}
        else:
            proteins = self.list_prot[phage]
            phage_row = composition(proteins[name][1] for name in proteins)

        if bact == last_bact:
            bact_row = {g: bact_cols[g][-1] for g in groups}
        else:
            with open('files/bacteria/' + bact + '.json', encoding='utf-8') as handle:
                bact_prots = json.load(handle)
            bact_row = composition(strip_tail(bact_prots[name][1]) for name in bact_prots)

        for g in groups:
            phage_cols[g].append(phage_row[g])
            bact_cols[g].append(bact_row[g])
        last_phage = phage
        last_bact = bact

    for g in groups:
        self.features_data['bact_comp_' + g] = bact_cols[g]
        self.features_data['phage_comp_' + g] = phage_cols[g]
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
397
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def get_composition(self, phage, bacteria):
    """
    Return the conjoint-triad class composition of one phage/bacterium pair.

    :param phage: mapping of protein id -> record; element ``[1]`` of each
        record is used as the protein sequence.
    :param bacteria: same layout; each sequence is additionally cut at its
        first double quote.
    :return: list of 14 floats ordered ``[bact_0, phage_0, bact_1, phage_1,
        ..., bact_6, phage_6]``. For each organism the per-protein class
        counts are averaged over its proteins and rescaled to sum to one.

    An organism with no proteins, or whose sequences contain none of the
    class digits (e.g. an empty sequence), yields 0.0 for all of its values
    instead of raising ``ZeroDivisionError``.
    """
    groups = '0123456'

    def encode(raw):
        # Ambiguous/rare residue codes are replaced by one concrete residue
        # before the sequence is translated into class digits.
        cleaned = raw.replace('X', 'D').replace('B', 'N').replace('Z', 'E').replace('U', 'C')
        return self.__get_conjoint_triad(cleaned)

    def strip_tail(seq):
        # Keep everything before the first double quote.
        # NOTE(review): when there is no quote, find() returns -1 and the
        # last residue is dropped. Kept as-is so feature values do not
        # change - confirm whether that is intended.
        return seq[:seq.find('"')]

    def composition(raw_sequences):
        # Normalised class composition (digit -> fraction) of one organism.
        sums = dict.fromkeys(groups, 0)
        n_prots = 0
        for raw in raw_sequences:
            n_prots += 1
            encoded = encode(raw)
            counts = {g: encoded.count(g) for g in groups}
            highest = max(counts.values())
            # NOTE(review): the original "normalisation" subtracts the ratio
            # min/max from each raw count; preserved unchanged. The ratio is
            # taken as 0 when no class digit occurs at all (division by zero
            # in the original).
            offset = min(counts.values()) / highest if highest else 0.0
            for g in groups:
                sums[g] += counts[g] - offset
        if n_prots == 0:
            return dict.fromkeys(groups, 0.0)
        total = 0
        for g in groups:
            sums[g] = sums[g] / n_prots
            total += sums[g]
        if total == 0:
            return dict.fromkeys(groups, 0.0)
        return {g: sums[g] / total for g in groups}

    phage_comp = composition(phage[name][1] for name in phage)
    bact_comp = composition(strip_tail(bacteria[name][1]) for name in bacteria)

    solution = []
    for g in groups:
        solution.append(bact_comp[g])
        solution.append(phage_comp[g])
    return solution
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
464
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def add_grouping(self):
    """
    Add windowed class-frequency ("grouping") columns to ``self.features_data``.

    Every index entry of ``self.features_data`` is read as
    ``<phage>--<bacterium>``. Phage proteins come from
    ``self.list_prot[phage]``; bacterial proteins are loaded from
    ``files/bacteria/<bacterium>.json``. In both cases element ``[1]`` of
    each protein record is used as the sequence. Each encoded protein is
    passed to ``self.__get_grouping`` once per window letter (A-J), and the
    returned per-digit values are averaged over the organism's proteins.
    The results are stored as ``bact_group<L>_<d>`` / ``phage_group<L>_<d>``
    columns (L = A..J, d = 0..6). An organism without proteins gets 0.0
    everywhere.
    """
    import json

    groups = '0123456'
    letters = 'ABCDEFGHIJ'

    def encode(raw):
        # Ambiguous/rare residue codes are replaced by one concrete residue
        # before the sequence is translated into class digits.
        cleaned = raw.replace('X', 'D').replace('B', 'N').replace('Z', 'E').replace('U', 'C')
        return self.__get_conjoint_triad(cleaned)

    def strip_tail(seq):
        # Keep everything before the first double quote.
        # NOTE(review): when there is no quote, find() returns -1 and the
        # last residue is dropped. Kept as-is so feature values do not
        # change - confirm whether that is intended.
        return seq[:seq.find('"')]

    def averaged(raw_sequences):
        # Per-window, per-digit values averaged over all given proteins.
        sums = {'group' + j + '_' + i: 0 for i in groups for j in letters}
        n_prots = 0
        for raw in raw_sequences:
            n_prots += 1
            encoded = encode(raw)
            for j in letters:
                window = self.__get_grouping(encoded, j)
                for i in groups:
                    sums['group' + j + '_' + i] += window[i]
        if n_prots == 0:
            return dict.fromkeys(sums, 0.0)
        return {key: value / n_prots for key, value in sums.items()}

    keys = ['group' + j + '_' + i for i in groups for j in letters]
    phage_cols = {key: [] for key in keys}
    bact_cols = {key: [] for key in keys}
    last_phage = ''
    last_bact = ''
    for pair_id in self.features_data.index:
        cut = pair_id.find('--')
        phage = pair_id[:cut]
        bact = pair_id[cut + 2:]

        # Consecutive rows that share an organism reuse the previous values.
        if phage == last_phage:
            phage_row = {key: phage_cols[key][-1] for key in keys}
        else:
            proteins = self.list_prot[phage]
            phage_row = averaged(proteins[name][1] for name in proteins)

        if bact == last_bact:
            bact_row = {key: bact_cols[key][-1] for key in keys}
        else:
            with open('files/bacteria/' + bact + '.json', encoding='utf-8') as handle:
                bact_prots = json.load(handle)
            bact_row = averaged(strip_tail(bact_prots[name][1]) for name in bact_prots)

        for key in keys:
            phage_cols[key].append(phage_row[key])
            bact_cols[key].append(bact_row[key])
        last_phage = phage
        last_bact = bact

    for key in keys:
        self.features_data['bact_' + key] = bact_cols[key]
        self.features_data['phage_' + key] = phage_cols[key]
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
548
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def get_grouping(self, phage, bacteria):
    """
    Return the windowed class-frequency features of one phage/bacterium pair.

    :param phage: mapping of protein id -> record; element ``[1]`` of each
        record is used as the protein sequence.
    :param bacteria: same layout; each sequence is additionally cut at its
        first double quote.
    :return: list of 140 values. For every class digit (0-6) and, inside
        that, every window letter (A-J), the bacterial value is followed by
        the phage value. Each value is the ``self.__get_grouping`` result
        for that window/digit averaged over the organism's proteins, or 0.0
        when the organism has no proteins.
    """
    groups = '0123456'
    letters = 'ABCDEFGHIJ'

    def encode(raw):
        # Ambiguous/rare residue codes are replaced by one concrete residue
        # before the sequence is translated into class digits.
        cleaned = raw.replace('X', 'D').replace('B', 'N').replace('Z', 'E').replace('U', 'C')
        return self.__get_conjoint_triad(cleaned)

    def strip_tail(seq):
        # Keep everything before the first double quote.
        # NOTE(review): when there is no quote, find() returns -1 and the
        # last residue is dropped. Kept as-is so feature values do not
        # change - confirm whether that is intended.
        return seq[:seq.find('"')]

    def averaged(raw_sequences):
        # Per-window, per-digit values averaged over all given proteins.
        sums = {'group' + j + '_' + i: 0 for i in groups for j in letters}
        n_prots = 0
        for raw in raw_sequences:
            n_prots += 1
            encoded = encode(raw)
            for j in letters:
                window = self.__get_grouping(encoded, j)
                for i in groups:
                    sums['group' + j + '_' + i] += window[i]
        if n_prots == 0:
            return dict.fromkeys(sums, 0.0)
        return {key: value / n_prots for key, value in sums.items()}

    phage_group = averaged(phage[name][1] for name in phage)
    bact_group = averaged(strip_tail(bacteria[name][1]) for name in bacteria)

    solution = []
    for i in groups:
        for j in letters:
            key = 'group' + j + '_' + i
            solution.append(bact_group[key])
            solution.append(phage_group[key])
    return solution
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
603
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __get_conjoint_triad(self, prot):
    """
    Translate an amino-acid string into class digits '0'-'6'.

    Each of the twenty residue letters listed below is replaced by the
    digit of its class; any other character is left untouched.

    :param prot: protein sequence as a string of one-letter residue codes.
    :return: string of the same length with mapped residues replaced.
    """
    classes = (
        ('AGV', '0'),
        ('C', '1'),
        ('FILP', '2'),
        ('MSTY', '3'),
        ('HNQW', '4'),
        ('KR', '5'),
        ('DE', '6'),
    )
    # The replacement digits are never themselves keys, so one simultaneous
    # translation gives the same result as replacing letter by letter.
    lookup = {ord(residue): digit for members, digit in classes for residue in members}
    return prot.translate(lookup)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
609
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __get_grouping(self, prot, let='A'):
    """
    Return the class-digit frequencies inside one window of a sequence.

    :param prot: class-encoded sequence; anything that supports ``len()``,
        slicing and ``.count(char)`` on the slice (a ``str`` or a
        scikit-bio ``Sequence``).
    :param let: window letter. Windows are fractions of the sequence
        length: A-D are the four quarters, E/F the two halves, G the middle
        half, H the first three quarters, I the last three quarters and J
        the span from 1/8 to 7/8.
    :return: dict mapping each digit '0'-'6' to ``count / window length``.
        All values are 0.0 when ``let`` is not a known letter or when the
        window is empty (very short sequences); the latter used to raise
        ``ZeroDivisionError``.
    """
    groups = '0123456'
    # letter -> (start fraction, end fraction) of the sequence length
    windows = {
        'A': (0.0, 0.25),
        'B': (0.25, 0.5),
        'C': (0.5, 0.75),
        'D': (0.75, 1.0),
        'E': (0.0, 0.5),
        'F': (0.5, 1.0),
        'G': (0.25, 0.75),
        'H': (0.0, 0.75),
        'I': (0.25, 1.0),
        'J': (0.125, 0.875),
    }
    group = dict.fromkeys(groups, 0.0)
    if let not in windows:
        return group

    length = len(prot)
    start_frac, end_frac = windows[let]
    segment = prot[int(length * start_frac):int(length * end_frac)]
    size = len(segment)
    if size == 0:
        # Nothing to count in an empty window; leave every frequency at 0.0.
        return group
    for digit in groups:
        group[digit] = segment.count(digit) / size
    return group
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
657
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def set_output(self):
    """
    Add the host-species label column and index the dataset by pair ID.

    For every value of ``self.features_data['ID']`` the part before ``--``
    is looked up in ``files/NCBI_Phage_Bacteria_Data.csv`` and its
    'Bacteria Name' is mapped to one of three species labels by a
    case-insensitive genus match. ``self.features_data`` is then re-indexed
    by 'ID' and the labels are stored in a new 'Bacteria' column.

    :raises ValueError: if a host name matches none of the known genera.
        Previously such rows were skipped and the column assignment failed
        with a length-mismatch ``ValueError`` after the frame had already
        been re-indexed; now nothing is modified before the error.
    """
    import pandas as pd

    # Checked in this order; the first genus found in the name wins.
    species_by_genus = (
        ('escherichia', 'Escherichia coli'),
        ('klebsiella', 'Klebsiella pneumoniae'),
        ('acinetobacter', 'Acinetobacter baumannii'),
    )
    data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0, names=['Phage Name', 'Bacteria Name', 'Bacteria ID'])

    output = []
    for pair_id in self.features_data['ID']:
        phage = pair_id[:pair_id.find('--')]
        bact = data.loc[phage, 'Bacteria Name']
        lowered = bact.lower()
        for genus, species in species_by_genus:
            if genus in lowered:
                output.append(species)
                break
        else:
            raise ValueError(
                'Cannot assign an output label for phage {0!r}: host {1!r} is not '
                'an Escherichia, Klebsiella or Acinetobacter'.format(phage, bact))

    self.features_data = self.features_data.set_index('ID')
    self.features_data['Bacteria'] = output
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
673
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def save_feat_data(self):
    """
    Pickle self.features_data into 'files/FeatureDataset' (overwriting any previous file) and return it.
    """
    from pickle import dump
    dataset = self.features_data
    with open('files/FeatureDataset', 'wb') as handle:
        dump(dataset, handle)
    return dataset
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
679
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def import_feat_data(self):
    """
    Load the pickled dataset from 'files/FeatureDataset' into self.features_data and return it.
    """
    from pickle import load
    # Unpickling can execute arbitrary code, so this file must come from a trusted source.
    with open('files/FeatureDataset', 'rb') as handle:
        restored = load(handle)
    self.features_data = restored
    return restored
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
685
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
686
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: build the feature dataset and pickle it to disk.
    test = FeatureConstruction()
    # Currently disabled, listed before the active steps: test.process_net_surf()
    for step_name in ('add_grouping', 'add_composition', 'add_kmers'):
        getattr(test, step_name)()
    # Currently disabled, listed before saving: test.set_output()
    test.save_feat_data()
    # Further steps that are currently disabled:
    # test.process_net_surf()
    # test.add_aa_freq()
    # test.add_aromaticity()
    # test.add_flexibility()
    # test.add_molecular_weight()
    # test.import_feat_data()
    # test.netSurf()