# machine_learning.py @ 0:e4b3fc88efe0 draft ("Uploaded")
# author: pedro_araujo
# date:   Wed, 27 Jan 2021 13:50:11 +0000
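"""Compare machine-learning classifiers for phage-host interaction prediction.

Trains and evaluates KNN, random forest, SVM, MLP and logistic-regression
models on a pickled feature table (assumed layout, from the code below: one
feature vector per phage-host pair plus an 'Infects' column of 'Yes'/'No'
labels), and predicts new interactions from GenBank accessions.
"""
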
class PredictInteraction:

    def __init__(self, data='FeatureDataset'):
        import pickle
        from sklearn.preprocessing import LabelEncoder
        with open('files/' + data, 'rb') as f:
            self.dataset = pickle.load(f)
        self.dataset = self.dataset.dropna()
        # Keep the encoder so predictions can be decoded back to 'Yes'/'No'.
        self.le = LabelEncoder()
        self.le.fit(['Yes', 'No'])  # classes_ is sorted: 'No' -> 0, 'Yes' -> 1
        self.output = self.le.transform(self.dataset['Infects'].values)
        self.dataset = self.dataset.drop('Infects', axis=1)  # positional axis is no longer accepted by pandas >= 2.0
        self.__standardize()
        self.__split_train_test()

    def __standardize(self):
        from sklearn.preprocessing import StandardScaler
        self.scaler = StandardScaler()
        self.scaler.fit(self.dataset)
        self.data_z = self.scaler.transform(self.dataset)

    def __cross_validation(self, method):
        from sklearn.model_selection import cross_val_score
        from sklearn.model_selection import StratifiedKFold
        # from sklearn.model_selection import ShuffleSplit
        # cv = ShuffleSplit(n_splits=4, test_size=0.3)  # plain (non-stratified) cross-validation
        skf = StratifiedKFold(5, shuffle=True)
        scores = cross_val_score(method, self.data_z, self.output, cv=skf, scoring='f1_weighted')
        print(scores)
        return skf

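    # Note on __cross_validation: 'f1_weighted' averages per-class F1 weighted
    # by class support, a sensible scoring choice if 'Yes'/'No' are imbalanced.
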
    def __split_train_test(self):
        from sklearn.model_selection import train_test_split
        # Hold out 20% as a validation set for GridSearchCV, then split the
        # remainder 70/30 into train/test (data_z and output are overwritten
        # with the non-validation portion).
        self.data_z, self.X_val, self.output, self.y_val = train_test_split(self.data_z, self.output, test_size=0.2)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.data_z, self.output, test_size=0.3)

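    # Net effect of __split_train_test: roughly 56% train (70% of the
    # remaining 80%), 24% test (30% of 80%) and 20% validation overall.
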
    def run_knn(self, n=2):
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.metrics import confusion_matrix
        neigh = KNeighborsClassifier(n_neighbors=n, weights='distance', p=1, algorithm='brute')
        neigh.fit(self.X_train, self.y_train)
        # print(neigh.score(self.X_test, self.y_test))
        self.__score_metrics(neigh)
        # print(confusion_matrix(self.y_test, neigh.predict(self.X_test)))
        import time
        start_time = time.time()
        cv = self.__cross_validation(neigh)
        print("--- %s seconds ---" % (time.time() - start_time))
        # self.__plot_roc_cv(neigh, cv)
        # self.__auc_curve(neigh)
        self.__permutation_importance(self._hyperparameters_knn(neigh))

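    # Example (a minimal sketch; assumes the pickled dataset loaded in
    # __init__ is available):
    #   ml = PredictInteraction()
    #   ml.run_knn(n=3)   # prints MCC, per-class F1, 5-fold weighted F1 and timing
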
    def _hyperparameters_knn(self, method):
        from sklearn.model_selection import GridSearchCV
        parameters = {'leaf_size': [5, 15, 30, 50], 'n_neighbors': [2, 3, 5, 7], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'p': [1, 2]}
        clf = GridSearchCV(method, parameters)
        clf.fit(self.X_val, self.y_val)
        self.__score_metrics(clf)
        print(clf.best_params_)
        return clf

    def run_random_forest(self):
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.metrics import confusion_matrix
        clf = RandomForestClassifier(n_estimators=250, bootstrap=False, ccp_alpha=0.0, max_features='sqrt')
        clf = clf.fit(self.X_train, self.y_train)
        # print(clf.score(self.X_test, self.y_test))
        self.__score_metrics(clf)
        # print(confusion_matrix(self.y_test, clf.predict(self.X_test)))
        # self.recursive_feature_elimination(clf)
        import time
        start_time = time.time()
        cv = self.__cross_validation(clf)
        print("--- %s seconds ---" % (time.time() - start_time))
        # self.__plot_roc_cv(clf, cv)
        # self.__auc_curve(clf)
        # self.__permutation_importance(clf)

    def _hyperparameters_rf(self, method):
        from sklearn.model_selection import GridSearchCV
        parameters = {'n_estimators': [50, 100, 150, 200, 250], 'criterion': ['gini', 'entropy'], 'min_samples_split': [2, 4, 6], 'min_samples_leaf': [2, 4, 6], 'bootstrap': [True, False], 'oob_score': [True, False]}
        clf = GridSearchCV(method, parameters)
        clf.fit(self.X_val, self.y_val)
        self.__score_metrics(clf)
        print(clf.best_params_)
        return clf

    def run_svm(self, c=0.1):
        from sklearn.svm import SVC
        from sklearn.metrics import confusion_matrix
        svm = SVC(C=c)
        svm = svm.fit(self.X_train, self.y_train)
        # print(svm.score(self.X_test, self.y_test))
        self.__score_metrics(svm)
        # print(confusion_matrix(self.y_test, svm.predict(self.X_test)))
        import time
        start_time = time.time()
        cv = self.__cross_validation(svm)
        print("--- %s seconds ---" % (time.time() - start_time))
        # self.recursive_feature_elimination(svm)
        # self.__plot_roc_cv(svm, cv)
        # self.__auc_curve(svm)
        # self.__permutation_importance(svm)

    def _hyperparameters_svm(self, method):
        from sklearn.model_selection import GridSearchCV
        # 'precomputed' is omitted from the kernel grid: it requires a square
        # kernel matrix rather than the raw feature matrix used here.
        parameters = {'C': [0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'degree': [2, 3, 4], 'gamma': ['scale', 'auto']}
        clf = GridSearchCV(method, parameters)
        clf.fit(self.X_val, self.y_val)
        self.__score_metrics(clf)
        print(clf.best_params_)
        return clf

    def run_neural_networks(self, alpha=1):
        from sklearn.neural_network import MLPClassifier
        from sklearn.metrics import confusion_matrix
        clf = MLPClassifier(alpha=alpha)
        clf.fit(self.X_train, self.y_train)
        self.__score_metrics(clf)
        # print(confusion_matrix(self.y_test, clf.predict(self.X_test)))
        import time
        start_time = time.time()
        cv = self.__cross_validation(clf)
        print("--- %s seconds ---" % (time.time() - start_time))
        # self.__plot_roc_cv(clf, cv)
        # self.__auc_curve(clf)
        # self.__permutation_importance(clf)

    def _hyperparameters_ann(self, method):
        from sklearn.model_selection import GridSearchCV
        parameters = {'hidden_layer_sizes': [50, 100, 200], 'activation': ['identity', 'logistic', 'tanh', 'relu'], 'solver': ['lbfgs', 'sgd', 'adam'], 'alpha': [0.0001, 0.05], 'learning_rate': ['constant', 'invscaling', 'adaptive']}
        clf = GridSearchCV(method, parameters)
        clf.fit(self.X_val, self.y_val)
        self.__score_metrics(clf)
        print(clf.best_params_)
        return clf

    def run_logistic_reg(self, c=1):
        from sklearn.linear_model import LogisticRegression
        from sklearn.metrics import confusion_matrix
        clf = LogisticRegression(C=c)
        clf.fit(self.X_train, self.y_train)
        self.__score_metrics(clf)
        # print(confusion_matrix(self.y_test, clf.predict(self.X_test)))
        import time
        start_time = time.time()
        cv = self.__cross_validation(clf)
        print("--- %s seconds ---" % (time.time() - start_time))
        # self.__plot_roc_cv(clf, cv)
        # self.__auc_curve(clf)
        # self.__permutation_importance(clf)

    def _hyperparameters_lr(self, method):
        from sklearn.model_selection import GridSearchCV
        # Not every penalty/solver pair is valid (e.g. 'l1' with 'lbfgs');
        # GridSearchCV scores the failing fits as NaN and keeps going.
        parameters = {'penalty': ['l1', 'l2', 'elasticnet', 'none'], 'C': [0.01, 0.1, 1, 10], 'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']}
        clf = GridSearchCV(method, parameters)
        clf.fit(self.X_val, self.y_val)
        self.__score_metrics(clf)
        print(clf.best_params_)
        return clf

    def hyperparameter_tuning(self):  # Best Params: {'bootstrap': False, 'ccp_alpha': 0.0, 'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 250}
        from sklearn.model_selection import GridSearchCV
        from sklearn.ensemble import RandomForestClassifier
        clf = RandomForestClassifier()
        n_estimators = [50, 100, 150, 200, 250]
        criterion = ['gini', 'entropy']
        max_features = ['auto', 'sqrt', 'log2']  # 'auto' was removed in scikit-learn 1.3; drop it on newer versions
        bootstrap = [True, False]
        ccp_alpha = [0.0, 0.01, 0.02]
        param_grid = dict(n_estimators=n_estimators, criterion=criterion, max_features=max_features, bootstrap=bootstrap, ccp_alpha=ccp_alpha)
        grid = GridSearchCV(clf, param_grid, n_jobs=-1)
        grid_result = grid.fit(self.X_val, self.y_val)
        print('Best Score: ', grid_result.best_score_)
        print('Best Params: ', grid_result.best_params_)

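    # Note: hyperparameter_tuning mirrors _hyperparameters_rf; the best
    # parameters recorded in the comment above are the ones hard-coded into
    # run_random_forest and predict_interaction.
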
    def __score_metrics(self, method):
        from sklearn.metrics import matthews_corrcoef, f1_score
        print('MCC:', matthews_corrcoef(self.y_test, method.predict(self.X_test)))
        print('F1 per class:', f1_score(self.y_test, method.predict(self.X_test), average=None))

    def __auc_curve(self, method):
        import matplotlib.pyplot as plt
        from sklearn import metrics
        # plot_roc_curve was removed in scikit-learn 1.2; the equivalent there
        # is metrics.RocCurveDisplay.from_estimator.
        metrics.plot_roc_curve(method, self.X_test, self.y_test)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim(0, 0.2)  # zoom in on the low false-positive-rate region
        plt.show()

    def __plot_roc_cv(self, method, cv):
        import numpy as np
        import matplotlib.pyplot as plt
        from sklearn.metrics import auc
        from sklearn.metrics import plot_roc_curve  # removed in scikit-learn 1.2; use RocCurveDisplay.from_estimator there
        tprs = []
        aucs = []
        mean_fpr = np.linspace(0, 1, 100)
        fig, ax = plt.subplots()
        for i, (train, test) in enumerate(cv.split(self.data_z, self.output)):
            method.fit(self.data_z[train], self.output[train])
            viz = plot_roc_curve(method, self.data_z[test], self.output[test], name='ROC fold {}'.format(i), alpha=0.3, lw=1, ax=ax)
            interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
            interp_tpr[0] = 0.0
            tprs.append(interp_tpr)
            aucs.append(viz.roc_auc)
        ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        ax.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')
        ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
        ax.legend(loc="lower right")
        plt.show()

    def __permutation_importance(self, method):
        from sklearn.inspection import permutation_importance
        r = permutation_importance(method, self.X_test, self.y_test, n_repeats=5)
        for i in r.importances_mean.argsort()[::-1]:
            if r.importances_mean[i] - 2 * r.importances_std[i] > 0.001:
                print(f"{self.dataset.columns[i]:<8}"
                      f" {r.importances_mean[i]:.3f}"
                      f" +/- {r.importances_std[i]:.3f}")

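    # Note on __permutation_importance: a feature is reported only when its
    # mean importance exceeds twice its standard deviation (plus 0.001), i.e.
    # it is positive by roughly two sigma across the 5 shuffling repeats.
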
    def recursive_feature_elimination(self, method):
        from sklearn.feature_selection import RFECV
        selector = RFECV(method, cv=5)  # , min_features_to_select=200
        selector.fit(self.data_z, self.output)
        print(selector.ranking_)
        self.data_reduced = selector.transform(self.data_z)

    def predict_interaction(self, phage, bacteria):
        from sklearn.svm import LinearSVC
        from sklearn.ensemble import RandomForestClassifier
        import numpy as np
        from feature_construction import FeatureConstruction

        phageProts = self.__find_phage_proteins(phage)  # dictionary
        bactProts = self.__find_bact_proteins(bacteria)
        if not phageProts or not bactProts:
            print('No proteins found for', phage, 'or', bacteria)
            return None
        # Separate proteins with carbohydrate-degrading annotations (lysins,
        # depolymerases, etc.) from the remaining phage proteins, based on
        # keywords in the product description.
        list_carb = {}
        list_prot = {}
        for prot in phageProts.keys():
            if any(z in phageProts[prot][0].lower() for z in ['lysin', 'collagen', 'glyco', 'galac', 'chitin', 'wall', 'pectin', 'glycan', 'sialidase', 'neuramin', 'amid', 'lysozyme', 'murami', 'pectate', 'sgnh']):
                list_carb[prot] = phageProts[prot]
            else:
                list_prot[prot] = phageProts[prot]
        inter = np.array([])
        fc = FeatureConstruction()
        grouping = fc.get_grouping(phage=list_prot, phage_carb=list_carb, bacteria=bactProts)  # takes a list of protein sequences
        inter = np.append(inter, grouping)
        composition = fc.get_composition(phage=list_prot, phage_carb=list_carb, bacteria=bactProts)
        inter = np.append(inter, composition)
        kmers = fc.get_kmers(phage=list_prot, phage_carb=list_carb, bacteria=bactProts)
        inter = np.append(inter, kmers)
        inter = inter.reshape(1, -1)
        inter = self.scaler.transform(inter)
        # svm = LinearSVC(C=0.01, tol=0.010, dual=False)
        clf = RandomForestClassifier(n_estimators=250, bootstrap=False, ccp_alpha=0.0, max_features='sqrt')
        clf = clf.fit(self.data_z, self.output)
        # Decode the 0/1 prediction back to 'No'/'Yes' with the encoder from __init__.
        pred = self.le.inverse_transform(clf.predict(inter))[0]
        print(pred)
        return pred

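    # Example (accessions taken from __main__ below; assumes the supporting
    # files under files/ are present):
    #   ml.predict_interaction('KM607000', 'NC_020524')   # -> 'Yes' or 'No'
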
    def __find_phage_proteins(self, phage):
        import json
        with open('files/phageTails.json', encoding='utf-8') as F:
            phageTails = json.loads(F.read())
        phageProts = {}
        if phage in phageTails.keys():
            for prot in phageTails[phage]:
                phageProts[prot] = [phageTails[phage][prot][0], phageTails[phage][prot][1]]
        else:
            from domain_search import DomainSearch
            phageProts = self.__find_proteins(phage)
            ds = DomainSearch()
            phageProts = ds.find_domains_interpro(phageProts)
            phageProts = ds.find_domains_blast(phageProts)
            phageProts = ds.find_domains_uniprot(phageProts)
        return phageProts

    def __find_bact_proteins(self, bacteria):
        import os
        import json
        bactProts = {}  # default to empty so the caller can detect a miss
        if bacteria + '.json' in os.listdir('files/bacteria'):
            with open('files/bacteria/' + bacteria + '.json', encoding='utf-8') as F:
                bactProts = json.loads(F.read())
        else:
            pass
            # bactProts = self.__find_proteins(bacteria)
            # TODO: implement prediction of cellular localization
        return bactProts

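    # Assumed JSON layout (mirroring phageTails.json above):
    #   {protein_id: [product_description, amino_acid_sequence], ...}
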
    def __find_proteins(self, genome_id):
        from Bio import Entrez
        from Bio import SeqIO
        Entrez.email = 'pedro_araujo97@hotmail.com'
        prots = {}
        with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=genome_id) as handle:
            genomeBac = SeqIO.read(handle, "gb")
            for feat in genomeBac.features:
                if feat.type == 'CDS':
                    try:
                        prots[feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]]
                    except KeyError:
                        pass
        if len(genomeBac.features) <= 5:
            # Some records ship without feature annotations; re-fetch the full
            # flat file and parse the CDS blocks by hand.
            with Entrez.efetch(db="nucleotide", rettype="gbwithparts", retmode="text", id=genome_id) as handle:
                genomeBac = handle.readlines()
            for i in range(len(genomeBac)):
                if ' CDS ' in genomeBac[i]:
                    j = i
                    protDone = False
                    while j < len(genomeBac):
                        if protDone:
                            break
                        if '/product=' in genomeBac[j]:
                            product = genomeBac[j].strip()[10:]
                            j += 1
                        elif '_id=' in genomeBac[j]:
                            protKey = genomeBac[j].strip()[13:-1]
                            j += 1
                        elif '/translation=' in genomeBac[j]:
                            protSeq = genomeBac[j].strip()[14:]
                            j += 1
                            for k in range(j, len(genomeBac)):
                                if genomeBac[k].islower():  # the lowercase ORIGIN sequence marks the end
                                    j = k
                                    protDone = True
                                    break
                                else:
                                    protSeq += genomeBac[k].strip()
                        else:
                            j += 1
                    prots[protKey] = [product, protSeq[:protSeq.find('"')]]
        return prots


if __name__ == '__main__':
    ml = PredictInteraction('dataset_reduced')  # FeatureDataset
    # ml.predict_interaction('NC_050143', 'NC_020524.1')  # NC_010468.1 NC_013941.1 NZ_CP029060.1 NZ_CP027394.1 NZ_CP025089.1
    ml.predict_interaction('KM607000', 'NC_020524')  # NC_010468.1 NC_013941.1 NZ_CP029060.1 NZ_CP027394.1 NZ_CP025089.1
    ml.run_knn(2)
    ml.run_random_forest()
    ml.run_svm(0.001)
    ml.run_neural_networks(0.0001)
    ml.run_logistic_reg(0.01)

    # Screen every Acinetobacter host in the NCBI table against phage KT588074.
    import pandas as pd
    import ast
    data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
    abaumannii = {}
    for phage in data.index:
        name = data.loc[phage, 'Host']
        if 'acinetobacter' in name.lower():
            for bact in ast.literal_eval(data.loc[phage, 'Host_ID']):
                abaumannii[bact] = 0
    list_yes = {}
    list_yes['KT588074'] = []
    for bact in abaumannii.keys():
        predict = ml.predict_interaction('KT588074', bact)
        if predict == 'Yes':
            list_yes['KT588074'].append(bact)
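
    # Summarize the screen above (a sketch; relies on predict_interaction
    # returning the decoded 'Yes'/'No' labels):
    print('%d candidate hosts predicted for KT588074' % len(list_yes['KT588074']))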