annotate phage_host_prediction/machine_learning.py @ 2:3e1e8be4e65c draft default tip

Uploaded
author pedro_araujo
date Fri, 02 Apr 2021 10:11:13 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
1
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
2 class PredictInteraction:
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
3
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __init__(self, data = 'FeatureDataset'):
    """
    Load the pickled feature dataset and prepare it for model training.

    The object stored in ``files/<data>`` is unpickled (it is used here through
    ``dropna``, ``drop`` and column indexing, so it is assumed to be a pandas
    DataFrame). Rows with missing values are dropped, the 'Infects' column is
    encoded into ``self.output`` and removed from the feature table, and then
    the features are standardized and split into partitions.

    :param data: name of the pickled dataset file inside the ``files`` directory
    """
    import pickle
    from sklearn.preprocessing import LabelEncoder
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted dataset files.
    with open('files/' + data, 'rb') as f:
        self.dataset = pickle.load(f)
    self.dataset = self.dataset.dropna()
    le = LabelEncoder()
    # LabelEncoder sorts its classes, so 'No' is encoded as 0 and 'Yes' as 1.
    le.fit(['Yes', 'No'])
    self.output = le.transform(self.dataset['Infects'].values)
    # Keyword form: passing the axis positionally (drop('Infects', 1)) is rejected by pandas >= 2.0.
    self.dataset = self.dataset.drop(columns='Infects')
    self.__standardize()
    self.__split_train_test()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
16
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __standardize(self):
    """Fit a StandardScaler on ``self.dataset`` and keep the scaled features in ``self.data_z``.

    The fitted scaler is stored in ``self.scaler`` so new samples can be scaled the same way.
    """
    from sklearn.preprocessing import StandardScaler
    fitted_scaler = StandardScaler()
    fitted_scaler.fit(self.dataset)
    self.scaler = fitted_scaler
    self.data_z = fitted_scaler.transform(self.dataset)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
22
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __cross_validation(self, method):
    """
    Cross-validate ``method`` with a shuffled, stratified 5-fold split and print the fold scores.

    Scores are computed with the 'f1_weighted' metric on ``self.data_z`` / ``self.output``.

    :param method: estimator to evaluate
    :return: the StratifiedKFold splitter that was used
    """
    from sklearn.model_selection import StratifiedKFold, cross_val_score
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    fold_scores = cross_val_score(
        method,
        self.data_z,
        self.output,
        scoring='f1_weighted',
        cv=folds,
    )
    print(fold_scores)
    return folds
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
32
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __split_train_test(self):
    """
    Partition the scaled data into validation, training and test sets.

    20% of the samples are first set aside as ``X_val`` / ``y_val``; ``self.data_z`` and
    ``self.output`` are overwritten with the remaining 80%, which is then split 70/30 into
    ``X_train`` / ``y_train`` and ``X_test`` / ``y_test``.
    """
    from sklearn.model_selection import train_test_split
    # Hold-out used by the hyperparameter searches.
    kept_X, held_X, kept_y, held_y = train_test_split(self.data_z, self.output, test_size=0.2)
    self.data_z, self.output = kept_X, kept_y
    self.X_val, self.y_val = held_X, held_y
    # Train/test partition of what is left.
    train_X, test_X, train_y, test_y = train_test_split(kept_X, kept_y, test_size=0.3)
    self.X_train, self.y_train = train_X, train_y
    self.X_test, self.y_test = test_X, test_y
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
37
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def run_knn(self, n=2):
    """
    Train a k-nearest-neighbours classifier on the training split and report test metrics.

    Prints the score metrics followed by the confusion matrix for the test split.

    :param n: number of neighbours to use (previously ignored in favour of a hard-coded 2)
    """
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import confusion_matrix
    neigh = KNeighborsClassifier(n_neighbors=n, weights='distance', p=1, algorithm='auto', leaf_size=5)
    neigh.fit(self.X_train, self.y_train)
    self.__score_metrics(neigh)
    print(confusion_matrix(self.y_test, neigh.predict(self.X_test)))
    # Optional diagnostics the author left disabled: timed cross-validation, ROC plots
    # (per fold and on the test split) and permutation importance of the tuned model.
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
53
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def _hyperparameters_knn(self, method):
    """
    Grid-search k-NN hyperparameters on the validation split.

    Prints the score metrics of the fitted search and the best parameter set.

    :param method: k-NN estimator to tune
    :return: the fitted GridSearchCV object
    """
    from sklearn.model_selection import GridSearchCV
    search_space = {
        'leaf_size': [5, 15, 30, 50],
        'n_neighbors': [2, 3, 5, 7],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p': [1, 2],
    }
    search = GridSearchCV(method, search_space)
    search.fit(self.X_val, self.y_val)
    self.__score_metrics(search)
    print(search.best_params_)
    return search
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
62
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def run_random_forest(self):
    """
    Train a random forest on the training split and report test metrics.

    Prints the score metrics followed by the confusion matrix for the test split.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix
    forest_settings = {
        'n_estimators': 200,
        'bootstrap': False,
        'criterion': 'gini',
        'min_samples_leaf': 2,
        'min_samples_split': 4,
        'oob_score': False,
    }
    forest = RandomForestClassifier(**forest_settings).fit(self.X_train, self.y_train)
    self.__score_metrics(forest)
    test_predictions = forest.predict(self.X_test)
    print(confusion_matrix(self.y_test, test_predictions))
    # Optional diagnostics the author left disabled: recursive feature elimination,
    # timed cross-validation, ROC plots and permutation importance.
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
79
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def _hyperparameters_rf(self, method):
    """
    Grid-search random-forest hyperparameters on the validation split.

    Prints the score metrics of the fitted search and the best parameter set.

    :param method: random-forest estimator to tune
    :return: the fitted GridSearchCV object
    """
    from sklearn.model_selection import GridSearchCV
    shared = {
        'n_estimators': [50, 100, 150, 200, 250],
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [2, 4, 6],
    }
    # Out-of-bag scoring needs bootstrap sampling: the bootstrap=False / oob_score=True
    # combination makes every fit fail, so it is left out of the search.
    parameters = [
        dict(shared, bootstrap=[True], oob_score=[True, False]),
        dict(shared, bootstrap=[False], oob_score=[False]),
    ]
    clf = GridSearchCV(method, parameters)
    clf.fit(self.X_val, self.y_val)
    self.__score_metrics(clf)
    print(clf.best_params_)
    return clf
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
88
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def run_svm(self, c=0.1):
    """
    Train an RBF-kernel support vector classifier on the training split and report test metrics.

    Prints the score metrics followed by the confusion matrix for the test split.

    :param c: NOTE(review): currently unused; the regularization constant is fixed at C=10
    """
    from sklearn.svm import SVC
    from sklearn.metrics import confusion_matrix
    svc_settings = {'C': 10, 'degree': 2, 'gamma': 'auto', 'kernel': 'rbf'}
    classifier = SVC(**svc_settings).fit(self.X_train, self.y_train)
    self.__score_metrics(classifier)
    test_predictions = classifier.predict(self.X_test)
    print(confusion_matrix(self.y_test, test_predictions))
    # Optional diagnostics the author left disabled: timed cross-validation, recursive
    # feature elimination, ROC plots and permutation importance.
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
105
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def _hyperparameters_svm(self, method):
    """
    Grid-search SVM hyperparameters on the validation split.

    Prints the score metrics of the fitted search and the best parameter set.

    :param method: SVC estimator to tune
    :return: the fitted GridSearchCV object
    """
    from sklearn.model_selection import GridSearchCV
    # 'precomputed' is not searched: it expects a square kernel matrix as input, while
    # the validation split is a plain feature matrix.
    parameters = {
        'C': [0.01, 0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'degree': [2, 3, 4],
        'gamma': ['scale', 'auto'],
    }
    clf = GridSearchCV(method, parameters)
    clf.fit(self.X_val, self.y_val)
    self.__score_metrics(clf)
    print(clf.best_params_)
    return clf
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
114
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def run_neural_networks(self, alpha=1):
    """
    Train a multi-layer perceptron on the training split and report test metrics.

    Prints the score metrics followed by the confusion matrix for the test split.

    :param alpha: NOTE(review): currently unused; the network is built with alpha=0.0001
    """
    from sklearn.neural_network import MLPClassifier
    from sklearn.metrics import confusion_matrix
    network = MLPClassifier(
        alpha=0.0001,
        activation='tanh',
        hidden_layer_sizes=200,
        learning_rate='adaptive',
        solver='adam',
    )
    network.fit(self.X_train, self.y_train)
    self.__score_metrics(network)
    test_predictions = network.predict(self.X_test)
    print(confusion_matrix(self.y_test, test_predictions))
    # Optional diagnostics the author left disabled: timed cross-validation, ROC plots
    # and permutation importance.
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
129
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def _hyperparameters_ann(self, method):
    """
    Grid-search neural-network hyperparameters on the validation split.

    Prints the score metrics of the fitted search and the best parameter set.

    :param method: MLP estimator to tune
    :return: the fitted GridSearchCV object
    """
    from sklearn.model_selection import GridSearchCV
    search_space = {
        'hidden_layer_sizes': [50, 100, 200],
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
    }
    search = GridSearchCV(method, search_space)
    search.fit(self.X_val, self.y_val)
    self.__score_metrics(search)
    print(search.best_params_)
    return search
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
138
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def run_logistic_reg(self, c=1):
    """
    Train an L2-penalized logistic regression on the training split and report test metrics.

    Prints the score metrics followed by the confusion matrix for the test split.

    :param c: NOTE(review): currently unused; the model is built with C=10
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import confusion_matrix
    model = LogisticRegression(C=10, penalty='l2', solver='liblinear')
    model.fit(self.X_train, self.y_train)
    self.__score_metrics(model)
    test_predictions = model.predict(self.X_test)
    print(confusion_matrix(self.y_test, test_predictions))
    # Optional diagnostics the author left disabled: timed cross-validation, ROC plots
    # and permutation importance.
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
153
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def _hyperparameters_lr(self, method):
    """
    Grid-search logistic-regression hyperparameters on the validation split.

    Prints the score metrics of the fitted search and the best parameter set.

    :param method: logistic-regression estimator to tune
    :return: the fitted GridSearchCV object
    """
    from sklearn.model_selection import GridSearchCV
    # NOTE(review): not every penalty/solver pair below is supported by scikit-learn
    # (and 'elasticnet' presumably needs an l1_ratio); unsupported pairs fail to fit.
    # Confirm against the installed version before narrowing the grid.
    search_space = {
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
    }
    search = GridSearchCV(method, search_space)
    search.fit(self.X_val, self.y_val)
    self.__score_metrics(search)
    print(search.best_params_)
    return search
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
162
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def hyperparameter_tuning(self):
    """
    Grid-search random-forest hyperparameters on the validation split using all CPU cores.

    Prints the best cross-validated score and the corresponding parameters.
    """
    # Best Params: {'bootstrap': False, 'ccp_alpha': 0.0, 'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 250}
    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier()
    param_grid = {
        'n_estimators': [50, 100, 150, 200, 250],
        'criterion': ['gini', 'entropy'],
        # 'auto' is omitted: for classifiers it was an alias of 'sqrt' and recent
        # scikit-learn releases no longer accept it.
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False],
        'ccp_alpha': [0.0, 0.01, 0.02],
    }
    grid = GridSearchCV(clf, param_grid, n_jobs=-1)
    grid_result = grid.fit(self.X_val, self.y_val)
    print('Best Score: ', grid_result.best_score_)
    print('Best Params: ', grid_result.best_params_)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
177
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __score_metrics(self, method):
    """
    Print test-split metrics for a fitted estimator.

    Printed in order: Matthews correlation coefficient, binary F1, precision and recall.

    :param method: fitted estimator exposing ``predict``
    """
    from sklearn.metrics import matthews_corrcoef, f1_score, precision_score, recall_score
    # Predict once and reuse the result instead of re-running the model for every metric.
    predicted = method.predict(self.X_test)
    print(matthews_corrcoef(self.y_test, predicted))
    print(f1_score(self.y_test, predicted, average='binary'))
    print(precision_score(self.y_test, predicted))
    print(recall_score(self.y_test, predicted))
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
184
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __auc_curve(self, method):
    """
    Show the ROC curve of a fitted estimator on the test split.

    The x axis is limited to the 0-0.2 false-positive-rate range and a dashed
    diagonal is drawn as reference.

    :param method: fitted estimator to evaluate
    """
    import matplotlib.pyplot as plt
    from sklearn import metrics
    # plot_roc_curve was removed in scikit-learn 1.2; use its replacement when it exists
    # and fall back to the old function on older releases.
    draw_roc = getattr(metrics.RocCurveDisplay, 'from_estimator', None)
    if draw_roc is None:
        draw_roc = metrics.plot_roc_curve
    draw_roc(method, self.X_test, self.y_test)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim(0, 0.2)
    plt.show()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
192
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __plot_roc_cv(self, method, cv):
    """
    Plot one ROC curve per cross-validation fold plus the mean curve and a +/- 1 std band.

    ``method`` is refit on the training part of every fold of ``self.data_z`` / ``self.output``.

    :param method: estimator to fit and evaluate on each fold
    :param cv: splitter exposing ``split(X, y)`` that yields (train, test) index arrays
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics
    # plot_roc_curve was removed in scikit-learn 1.2; use its replacement when it exists
    # and fall back to the old function on older releases.
    draw_roc = getattr(metrics.RocCurveDisplay, 'from_estimator', None)
    if draw_roc is None:
        draw_roc = metrics.plot_roc_curve
    tprs = []
    aucs = []
    # Common FPR grid so the per-fold curves can be averaged point by point.
    mean_fpr = np.linspace(0, 1, 100)
    _, ax = plt.subplots()
    for i, (train, test) in enumerate(cv.split(self.data_z, self.output)):
        method.fit(self.data_z[train], self.output[train])
        viz = draw_roc(method, self.data_z[test], self.output[test], name='ROC fold {}'.format(i), alpha=0.3, lw=1, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0  # force every curve to start at the origin
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
    mean_auc = metrics.auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)
    std_tpr = np.std(tprs, axis=0)
    # Clip the band to the valid [0, 1] range.
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')
    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic example")
    ax.legend(loc="lower right")
    plt.show()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
223
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __permutation_importance(self, method):
    """
    Print the features whose permutation importance on the test split is clearly positive.

    Importances are computed with 5 repeats; a feature is printed (most important first)
    when its mean importance minus two standard deviations exceeds 0.001.

    :param method: fitted estimator to inspect
    """
    from sklearn.inspection import permutation_importance
    result = permutation_importance(method, self.X_test, self.y_test, n_repeats=5)
    means = result.importances_mean
    spreads = result.importances_std
    for idx in means.argsort()[::-1]:
        if not (means[idx] - 2 * spreads[idx] > 0.001):
            continue
        feature_name = self.dataset.columns[idx]
        print(f"{feature_name:<8}"
              f" {means[idx]:.3f}"
              f" +/- {spreads[idx]:.3f}")
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
232
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def recursive_feature_elimination(self, method):
    """
    Run cross-validated recursive feature elimination (5 folds) with ``method``.

    Prints the feature ranking and stores the reduced feature matrix in ``self.data_reduced``.

    :param method: estimator used to rank the features
    """
    from sklearn.feature_selection import RFECV
    eliminator = RFECV(method, cv=5)
    eliminator.fit(self.data_z, self.output)
    print(eliminator.ranking_)
    reduced = eliminator.transform(self.data_z)
    self.data_reduced = reduced
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
239
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def predict_interaction(self, phage, bacteria):
    """
    Predict whether ``phage`` infects ``bacteria`` with a random forest fitted on ``self.data_z``.

    Phage proteins are divided into two groups according to keywords found in their
    description; grouping, composition and k-mer features are then built, scaled with the
    stored scaler and classified.

    :param phage: phage identifier
    :param bacteria: bacterium identifier
    :return: the predicted label, or None when proteins are missing for either organism
    """
    from sklearn.svm import LinearSVC  # kept from the original; not used by the current model
    from sklearn.ensemble import RandomForestClassifier
    import numpy as np
    from feature_construction import FeatureConstruction

    phage_proteins = self.__find_phage_proteins(phage)  # dictionary
    bact_proteins = self.__find_bact_proteins(bacteria)
    if not (phage_proteins and bact_proteins):
        print('oops')
        return None

    # Description keywords that send a protein to the "carb" group.
    carb_keywords = ['lysin', 'collagen', 'glyco', 'galac', 'chitin', 'wall', 'pectin', 'glycan', 'sialidase', 'neuramin', 'amid', 'lysozyme', 'murami', 'pectate', 'sgnh']
    carb_group = {}
    other_group = {}
    for prot_id, prot_info in phage_proteins.items():
        description = prot_info[0].lower()
        target = carb_group if any(word in description for word in carb_keywords) else other_group
        target[prot_id] = prot_info

    features = np.array([])
    builder = FeatureConstruction()
    # Feature groups are appended in a fixed order: grouping, composition, k-mers.
    grouping = builder.get_grouping(phage=other_group, phage_carb=carb_group, bacteria=bact_proteins)
    features = np.append(features, grouping)
    composition = builder.get_composition(phage=other_group, phage_carb=carb_group, bacteria=bact_proteins)
    features = np.append(features, composition)
    kmers = builder.get_kmers(phage=other_group, phage_carb=carb_group, bacteria=bact_proteins)
    features = np.append(features, kmers)

    # Single sample -> one row, scaled like the training data.
    sample = self.scaler.transform(features.reshape(1, -1))

    forest = RandomForestClassifier(n_estimators=250, bootstrap=False, ccp_alpha=0.0, max_features='sqrt')
    forest = forest.fit(self.data_z, self.output)
    prediction = forest.predict(sample)[0]
    print(prediction)
    return prediction
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
274
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
275 def __find_phage_proteins(self, phage):
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
276 import json
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
277 with open('files/phageTails.json', encoding='utf-8') as F:
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
278 phageTails = json.loads(F.read())
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
279 phageProts = {}
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
280 if phage in phageTails.keys():
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
281 for prot in phageTails[phage]:
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
282 phageProts[prot] = [phageTails[phage][prot][0], phageTails[phage][prot][1]]
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
283 else:
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
284 from domain_search import DomainSearch
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
285 phageProts = self.__find_proteins(phage)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
286 ds = DomainSearch()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
287 phageProts = ds.find_domains_interpro(phageProts)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
288 phageProts = ds.find_domains_blast(phageProts)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
289 phageProts = ds.find_domains_uniprot(phageProts)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
290 return phageProts
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
291
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
292 def __find_bact_proteins(self, bacteria):
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
293 import os
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
294 import json
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
295 if bacteria + '.json' in os.listdir('files/bacteria'):
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
296 with open('files/bacteria/' + bacteria + '.json', encoding='utf-8') as F:
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
297 bactProts = json.loads(F.read())
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
298 else:
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
299 pass
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
300 # bactProts = self.__find_proteins(bacteria)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
301 # Implementar previsão de localização celular
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
302 return bactProts
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
303
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __find_proteins(self, id):
    """
    Download a GenBank nucleotide record from NCBI and collect its annotated proteins.

    The record is fetched with rettype "gb" and parsed with SeqIO; every CDS feature
    carrying 'protein_id', 'product' and 'translation' qualifiers is stored. When the
    parsed record has five features or fewer, the record is fetched again with rettype
    "gbwithparts" and its text lines are scanned for CDS entries instead.

    :param id: accession passed to Entrez.efetch (name shadows the builtin; kept for compatibility)
    :return: dictionary mapping protein id to [product, translation]
    """
    from Bio import Entrez
    from Bio import SeqIO
    Entrez.email = 'pedro_araujo97@hotmail.com'
    prots = {}
    with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=id) as handle:
        genomeBac = SeqIO.read(handle, "gb")
    for feat in genomeBac.features:
        if feat.type != 'CDS':
            continue
        quals = feat.qualifiers
        try:
            prots[quals['protein_id'][0]] = [quals['product'][0], quals['translation'][0]]
        except (KeyError, IndexError):
            # CDS without one of the three qualifiers (e.g. no translation): skip it.
            continue
    if len(genomeBac.features) > 5:
        return prots

    # Few or no features were parsed: re-download the full flat file and scan its text.
    # NOTE(review): presumably this targets records whose annotation is only delivered
    # with "gbwithparts" - confirm.
    with Entrez.efetch(db="nucleotide", rettype="gbwithparts", retmode="text", id=id) as handle:
        flat = handle.readlines()
    total = len(flat)
    # NOTE(review): the ' CDS ' marker is reproduced as it appears in the reviewed
    # listing; confirm its exact spacing against the repository file.
    for start, line in enumerate(flat):
        if ' CDS ' not in line:
            continue
        # Reset per CDS so values from a previous entry can never leak into this one.
        product = protKey = protSeq = None
        for pos in range(start + 1, total):
            entry = flat[pos]
            if ' CDS ' in entry:
                # Next CDS reached before a translation was seen: nothing to store.
                break
            if '/product=' in entry:
                # Drop the '/product="' prefix and the closing quote.
                product = entry.strip()[10:].rstrip('"')
            elif '/protein_id=' in entry:
                # Drop the '/protein_id="' prefix and the closing quote.
                protKey = entry.strip()[13:-1]
            elif '/translation=' in entry:
                # The sequence may wrap over several lines; join until the closing quote.
                chunks = [entry.strip()[14:]]
                nxt = pos + 1
                while '"' not in chunks[-1] and nxt < total:
                    chunks.append(flat[nxt].strip())
                    nxt += 1
                protSeq = ''.join(chunks).partition('"')[0]
                break
        if product is not None and protKey is not None and protSeq is not None:
            prots[protKey] = [product, protSeq]
    return prots
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
345
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
346
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
if __name__ == '__main__':
    import ast

    import pandas as pd

    ml = PredictInteraction('dataset_reduced')  # alternative dataset: feature_dataset

    # Single prediction for one phage/bacterium pair, then every model runner.
    # Other bacterial accessions noted in the original:
    # NC_010468.1 NC_013941.1 NZ_CP029060.1 NZ_CP027394.1 NZ_CP025089.1
    ml.predict_interaction('KM607000', 'NC_020524')
    ml.run_knn(2)
    ml.run_random_forest()
    ml.run_svm(0.001)
    ml.run_neural_networks(0.0001)
    ml.run_logistic_reg(0.01)

    # Collect the host ids of every phage whose host name mentions Acinetobacter.
    data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
    abaumannii = {}  # dict used as an insertion-ordered set of host ids
    for host, host_ids in zip(data['Host'], data['Host_ID']):
        # Empty CSV cells are read as NaN (float); skip them instead of crashing.
        if not isinstance(host, str) or not isinstance(host_ids, str):
            continue
        if 'acinetobacter' not in host.lower():
            continue
        for bact in ast.literal_eval(host_ids):
            abaumannii[bact] = 0

    # Keep the hosts for which the model predicts an infection by this phage.
    phage_id = 'KT588074'
    list_yes = {phage_id: []}
    for bact in abaumannii:
        predict = ml.predict_interaction(phage_id, bact)
        if predict == 'Yes':
            list_yes[phage_id].append(bact)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
370 if predict == 'Yes':
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
371 list_yes['KT588074'].append(bact)