| 0 | 1 | 
|  | 2 class PredictInteraction: | 
|  | 3 | 
|  | 4 	def __init__(self, data = 'FeatureDataset'): | 
|  | 5 		import pickle | 
|  | 6 		from sklearn.preprocessing import LabelEncoder | 
|  | 7 		with open('files/' + data, 'rb') as f: | 
|  | 8 			self.dataset = pickle.load(f) | 
|  | 9 		self.dataset = self.dataset.dropna() | 
|  | 10 		le = LabelEncoder() | 
|  | 11 		le.fit(['Yes', 'No']) | 
|  | 12 		self.output = le.transform(self.dataset['Infects'].values) | 
|  | 13 		self.dataset = self.dataset.drop('Infects', 1) | 
|  | 14 		self.__standardize() | 
|  | 15 		self.__split_train_test() | 
|  | 16 | 
|  | 17 	def __standardize(self): | 
|  | 18 		from sklearn.preprocessing import StandardScaler | 
|  | 19 		self.scaler = StandardScaler() | 
|  | 20 		self.scaler.fit(self.dataset) | 
|  | 21 		self.data_z = self.scaler.transform(self.dataset) | 
|  | 22 | 
|  | 23 	def __cross_validation(self, method): | 
|  | 24 		from sklearn.model_selection import cross_val_score | 
|  | 25 		from sklearn.model_selection import StratifiedKFold | 
|  | 26 		# from sklearn.model_selection import ShuffleSplit | 
|  | 27 		# cv = ShuffleSplit(n_splits=4, test_size=0.3) # cross validation normal | 
|  | 28 		skf = StratifiedKFold(5, shuffle=True) | 
|  | 29 		scores = cross_val_score(method, self.data_z, self.output, cv=skf, scoring='f1_weighted') | 
|  | 30 		print(scores) | 
|  | 31 		return skf | 
|  | 32 | 
|  | 33 	def __split_train_test(self): | 
|  | 34 		from sklearn.model_selection import train_test_split | 
|  | 35 		self.data_z, self.X_val, self.output, self.y_val = train_test_split(self.data_z, self.output, test_size=0.2) | 
|  | 36 		self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.data_z, self.output, test_size=0.3) | 
|  | 37 | 
|  | 38 	def run_knn(self, n=2): | 
|  | 39 		from sklearn.neighbors import KNeighborsClassifier | 
|  | 40 		from sklearn.metrics import confusion_matrix | 
|  | 41 		neigh = KNeighborsClassifier(n_neighbors=n, weights='distance', p=1, algorithm='brute') | 
|  | 42 		neigh.fit(self.X_train, self.y_train) | 
|  | 43 		# print(neigh.score(self.X_test, self.y_test)) | 
|  | 44 		self.__score_metrics(neigh) | 
|  | 45 		# print(confusion_matrix(self.y_test, neigh.predict(self.X_test))) | 
|  | 46 		import time | 
|  | 47 		start_time = time.time() | 
|  | 48 		cv = self.__cross_validation(neigh) | 
|  | 49 		print("--- %s seconds ---" % (time.time() - start_time)) | 
|  | 50 		# self.__plot_roc_cv(neigh, cv) | 
|  | 51 		# self.__auc_curve(neigh) | 
|  | 52 		self.__permutation_importance(self._hyperparameters_knn(neigh)) | 
|  | 53 | 
|  | 54 	def _hyperparameters_knn(self, method): | 
|  | 55 		from sklearn.model_selection import GridSearchCV | 
|  | 56 		parameters = {'leaf_size': [5, 15, 30, 50], 'n_neighbors': [2, 3, 5, 7], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'p': [1, 2]} | 
|  | 57 		clf = GridSearchCV(method, parameters) | 
|  | 58 		clf.fit(self.X_val, self.y_val) | 
|  | 59 		self.__score_metrics(clf) | 
|  | 60 		print(clf.best_params_) | 
|  | 61 		return clf | 
|  | 62 | 
|  | 63 	def run_random_forest(self): | 
|  | 64 		from sklearn.ensemble import RandomForestClassifier | 
|  | 65 		from sklearn.metrics import confusion_matrix | 
|  | 66 		clf = RandomForestClassifier(n_estimators=250, bootstrap=False, ccp_alpha=0.0, max_features='sqrt') | 
|  | 67 		clf = clf.fit(self.X_train, self.y_train) | 
|  | 68 		# print(clf.score(self.X_test, self.y_test)) | 
|  | 69 		self.__score_metrics(clf) | 
|  | 70 		# print(confusion_matrix(self.y_test, clf.predict(self.X_test))) | 
|  | 71 		# self.recursive_feature_elimination(clf) | 
|  | 72 		import time | 
|  | 73 		start_time = time.time() | 
|  | 74 		cv = self.__cross_validation(clf) | 
|  | 75 		print("--- %s seconds ---" % (time.time() - start_time)) | 
|  | 76 		# self.__plot_roc_cv(clf, cv) | 
|  | 77 		# self.__auc_curve(clf) | 
|  | 78 		# self.__permutation_importance(clf) | 
|  | 79 | 
|  | 80 	def _hyperparameters_rf(self, method): | 
|  | 81 		from sklearn.model_selection import GridSearchCV | 
|  | 82 		parameters = {'n_estimators': [50, 100, 150, 200, 250], 'criterion': ['gini', 'entropy'], 'min_samples_split': [2, 4, 6], 'min_samples_leaf': [2, 4, 6], 'bootstrap': [True, False], 'oob_score': [True, False]} | 
|  | 83 		clf = GridSearchCV(method, parameters) | 
|  | 84 		clf.fit(self.X_val, self.y_val) | 
|  | 85 		self.__score_metrics(clf) | 
|  | 86 		print(clf.best_params_) | 
|  | 87 		return clf | 
|  | 88 | 
|  | 89 	def run_svm(self, c=0.1): | 
|  | 90 		from sklearn.svm import SVC | 
|  | 91 		from sklearn.metrics import confusion_matrix | 
|  | 92 		svm = SVC(C=c) | 
|  | 93 		svm = svm.fit(self.X_train, self.y_train) | 
|  | 94 		# print(svm.score(self.X_test, self.y_test)) | 
|  | 95 		self.__score_metrics(svm) | 
|  | 96 		# print(confusion_matrix(self.y_test, svm.predict(self.X_test))) | 
|  | 97 		import time | 
|  | 98 		start_time = time.time() | 
|  | 99 		cv = self.__cross_validation(svm) | 
|  | 100 		print("--- %s seconds ---" % (time.time() - start_time)) | 
|  | 101 		# self.recursive_feature_elimination(svm) | 
|  | 102 		# self.__plot_roc_cv(svm, cv) | 
|  | 103 		# self.__auc_curve(svm) | 
|  | 104 		# self.__permutation_importance(svm) | 
|  | 105 | 
|  | 106 	def _hyperparameters_svm(self, method): | 
|  | 107 		from sklearn.model_selection import GridSearchCV | 
|  | 108 		parameters = {'C': [0.01, 0.1, 1, 10], 'kernel': ['linear','rbf','poly','sigmoid', 'precomputed'], 'degree': [2, 3, 4], 'gamma': ['scale', 'auto']} | 
|  | 109 		clf = GridSearchCV(method, parameters) | 
|  | 110 		clf.fit(self.X_val, self.y_val) | 
|  | 111 		self.__score_metrics(clf) | 
|  | 112 		print(clf.best_params_) | 
|  | 113 		return clf | 
|  | 114 | 
|  | 115 	def run_neural_networks(self, alpha=1): | 
|  | 116 		from sklearn.neural_network import MLPClassifier | 
|  | 117 		from sklearn.metrics import confusion_matrix | 
|  | 118 		clf = MLPClassifier(alpha=alpha) | 
|  | 119 		clf.fit(self.X_train, self.y_train) | 
|  | 120 		self.__score_metrics(clf) | 
|  | 121 		# print(confusion_matrix(self.y_test, clf.predict(self.X_test))) | 
|  | 122 		import time | 
|  | 123 		start_time = time.time() | 
|  | 124 		cv = self.__cross_validation(clf) | 
|  | 125 		print("--- %s seconds ---" % (time.time() - start_time)) | 
|  | 126 		# self.__plot_roc_cv(clf, cv) | 
|  | 127 		# self.__auc_curve(clf) | 
|  | 128 		# self.__permutation_importance(clf) | 
|  | 129 | 
|  | 130 	def _hyperparameters_ann(self, method): | 
|  | 131 		from sklearn.model_selection import GridSearchCV | 
|  | 132 		parameters = {'hidden_layer_sizes': [50, 100, 200], 'activation': ['identity', 'logistic', 'tanh', 'relu'], 'solver': ['lbfgs', 'sgd', 'adam'], 'alpha': [0.0001, 0.05], 'learning_rate': ['constant', 'invscaling', 'adaptive']} | 
|  | 133 		clf = GridSearchCV(method, parameters) | 
|  | 134 		clf.fit(self.X_val, self.y_val) | 
|  | 135 		self.__score_metrics(clf) | 
|  | 136 		print(clf.best_params_) | 
|  | 137 		return clf | 
|  | 138 | 
|  | 139 	def run_logistic_reg(self, c=1): | 
|  | 140 		from sklearn.linear_model import LogisticRegression | 
|  | 141 		from sklearn.metrics import confusion_matrix | 
|  | 142 		clf = LogisticRegression(C=c) | 
|  | 143 		clf.fit(self.X_train, self.y_train) | 
|  | 144 		self.__score_metrics(clf) | 
|  | 145 		# print(confusion_matrix(self.y_test, clf.predict(self.X_test))) | 
|  | 146 		import time | 
|  | 147 		start_time = time.time() | 
|  | 148 		cv = self.__cross_validation(clf) | 
|  | 149 		print("--- %s seconds ---" % (time.time() - start_time)) | 
|  | 150 		# self.__plot_roc_cv(clf, cv) | 
|  | 151 		# self.__auc_curve(clf) | 
|  | 152 		# self.__permutation_importance(clf) | 
|  | 153 | 
|  | 154 	def _hyperparameters_lr(self, method): | 
|  | 155 		from sklearn.model_selection import GridSearchCV | 
|  | 156 		parameters = {'penalty': ['l1', 'l2', 'elasticnet', 'none'], 'C': [0.01, 0.1, 1, 10], 'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']} | 
|  | 157 		clf = GridSearchCV(method, parameters) | 
|  | 158 		clf.fit(self.X_val, self.y_val) | 
|  | 159 		self.__score_metrics(clf) | 
|  | 160 		print(clf.best_params_) | 
|  | 161 		return clf | 
|  | 162 | 
|  | 163 	def hyperparameter_tuning(self):  # Best Params:  {'bootstrap': False, 'ccp_alpha': 0.0, 'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 250} | 
|  | 164 		from sklearn.model_selection import GridSearchCV | 
|  | 165 		from sklearn.ensemble import RandomForestClassifier | 
|  | 166 		clf = RandomForestClassifier() | 
|  | 167 		n_estimators = [50, 100, 150, 200, 250] | 
|  | 168 		criterion = ['gini', 'entropy'] | 
|  | 169 		max_features = ['auto', 'sqrt', 'log2'] | 
|  | 170 		bootstrap = [True, False] | 
|  | 171 		ccp_alpha = [0.0, 0.01, 0.02] | 
|  | 172 		param_grid = dict(n_estimators=n_estimators, criterion=criterion, max_features=max_features, bootstrap=bootstrap, ccp_alpha=ccp_alpha) | 
|  | 173 		grid = GridSearchCV(clf, param_grid, n_jobs=-1) | 
|  | 174 		grid_result = grid.fit(self.X_val, self.y_val) | 
|  | 175 		print('Best Score: ', grid_result.best_score_) | 
|  | 176 		print('Best Params: ', grid_result.best_params_) | 
|  | 177 | 
|  | 178 	def __score_metrics(self, method): | 
|  | 179 		from sklearn.metrics import matthews_corrcoef, f1_score, precision_score, recall_score | 
|  | 180 		print(matthews_corrcoef(self.y_test, method.predict(self.X_test))) | 
|  | 181 		print(f1_score(self.y_test, method.predict(self.X_test), average=None)) | 
|  | 182 | 
|  | 183 	def __auc_curve(self, method): | 
|  | 184 		import matplotlib.pyplot as plt | 
|  | 185 		from sklearn import metrics | 
|  | 186 		metrics.plot_roc_curve(method, self.X_test, self.y_test) | 
|  | 187 		plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') | 
|  | 188 		plt.xlim(0, 0.2) | 
|  | 189 		plt.show() | 
|  | 190 | 
|  | 191 	def __plot_roc_cv(self, method, cv): | 
|  | 192 		import numpy as np | 
|  | 193 		import matplotlib.pyplot as plt | 
|  | 194 		from sklearn import datasets | 
|  | 195 		from sklearn.metrics import auc | 
|  | 196 		from sklearn.metrics import plot_roc_curve | 
|  | 197 		tprs = [] | 
|  | 198 		aucs = [] | 
|  | 199 		mean_fpr = np.linspace(0, 1, 100) | 
|  | 200 		fig, ax = plt.subplots() | 
|  | 201 		for i, (train, test) in enumerate(cv.split(self.data_z, self.output)): | 
|  | 202 			method.fit(self.data_z[train], self.output[train]) | 
|  | 203 			viz = plot_roc_curve(method, self.data_z[test], self.output[test], name='ROC fold {}'.format(i), alpha=0.3, lw=1, ax=ax) | 
|  | 204 			interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr) | 
|  | 205 			interp_tpr[0] = 0.0 | 
|  | 206 			tprs.append(interp_tpr) | 
|  | 207 			aucs.append(viz.roc_auc) | 
|  | 208 		ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8) | 
|  | 209 		mean_tpr = np.mean(tprs, axis=0) | 
|  | 210 		mean_tpr[-1] = 1.0 | 
|  | 211 		mean_auc = auc(mean_fpr, mean_tpr) | 
|  | 212 		std_auc = np.std(aucs) | 
|  | 213 		ax.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8) | 
|  | 214 		std_tpr = np.std(tprs, axis=0) | 
|  | 215 		tprs_upper = np.minimum(mean_tpr + std_tpr, 1) | 
|  | 216 		tprs_lower = np.maximum(mean_tpr - std_tpr, 0) | 
|  | 217 		ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.') | 
|  | 218 		ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic example") | 
|  | 219 		ax.legend(loc="lower right") | 
|  | 220 		plt.show() | 
|  | 221 | 
|  | 222 	def __permutation_importance(self, method): | 
|  | 223 		from sklearn.inspection import permutation_importance | 
|  | 224 		r = permutation_importance(method, self.X_test, self.y_test, n_repeats=5) | 
|  | 225 		for i in r.importances_mean.argsort()[::-1]: | 
|  | 226 			if r.importances_mean[i] - 2 * r.importances_std[i] > 0.001: | 
|  | 227 				print(f"{self.dataset.columns[i]:<8}" | 
|  | 228 					  f" {r.importances_mean[i]:.3f}" | 
|  | 229 					  f" +/- {r.importances_std[i]:.3f}") | 
|  | 230 | 
|  | 231 	def recursive_feature_elimination(self, method): | 
|  | 232 		from sklearn.feature_selection import RFECV | 
|  | 233 		selector = RFECV(method, cv=5)#, min_features_to_select=200) | 
|  | 234 		selector.fit(self.data_z, self.output) | 
|  | 235 		print(selector.ranking_) | 
|  | 236 		self.data_reduced = selector.transform(self.data_z) | 
|  | 237 | 
|  | 238 	def predict_interaction(self, phage, bacteria): | 
|  | 239 		from sklearn.svm import LinearSVC | 
|  | 240 		from sklearn.ensemble import RandomForestClassifier | 
|  | 241 		import numpy as np | 
|  | 242 		from feature_construction import FeatureConstruction | 
|  | 243 | 
|  | 244 		phageProts = self.__find_phage_proteins(phage)  # dictionary | 
|  | 245 		bactProts = self.__find_bact_proteins(bacteria) | 
|  | 246 		if not phageProts or not bactProts: | 
|  | 247 			print('oops') | 
|  | 248 			return None | 
|  | 249 		list_carb = {} | 
|  | 250 		list_prot = {} | 
|  | 251 		for prot in phageProts.keys(): | 
|  | 252 			if any(z in phageProts[prot][0].lower() for z in ['lysin', 'collagen', 'glyco', 'galac', 'chitin', 'wall', 'pectin', 'glycan', 'sialidase', 'neuramin', 'amid', 'lysozyme', 'murami', 'pectate', 'sgnh']): | 
|  | 253 				list_carb[prot] = phageProts[prot] | 
|  | 254 			else: | 
|  | 255 				list_prot[prot] = phageProts[prot] | 
|  | 256 		inter = np.array([]) | 
|  | 257 		fc = FeatureConstruction() | 
|  | 258 		grouping = fc.get_grouping(phage=list_prot, phage_carb=list_carb, bacteria=bactProts)  # takes a list of protein sequences | 
|  | 259 		inter = np.append(inter, grouping) | 
|  | 260 		composition = fc.get_composition(phage=list_prot, phage_carb=list_carb, bacteria=bactProts) | 
|  | 261 		inter = np.append(inter, composition) | 
|  | 262 		kmers = fc.get_kmers(phage=list_prot, phage_carb=list_carb, bacteria=bactProts) | 
|  | 263 		inter = np.append(inter, kmers) | 
|  | 264 		inter = inter.reshape(1, -1) | 
|  | 265 		inter = self.scaler.transform(inter) | 
|  | 266 		# svm = LinearSVC(C=0.01, tol=0.010, dual=False) | 
|  | 267 		clf = RandomForestClassifier(n_estimators=250, bootstrap=False, ccp_alpha=0.0, max_features='sqrt') | 
|  | 268 		clf = clf.fit(self.data_z, self.output) | 
|  | 269 		pred = clf.predict(inter)[0] | 
|  | 270 		print(pred) | 
|  | 271 		return pred | 
|  | 272 | 
|  | 273 	def __find_phage_proteins(self, phage): | 
|  | 274 		import json | 
|  | 275 		with open('files/phageTails.json', encoding='utf-8') as F: | 
|  | 276 			phageTails = json.loads(F.read()) | 
|  | 277 		phageProts = {} | 
|  | 278 		if phage in phageTails.keys(): | 
|  | 279 			for prot in phageTails[phage]: | 
|  | 280 				phageProts[prot] = [phageTails[phage][prot][0], phageTails[phage][prot][1]] | 
|  | 281 		else: | 
|  | 282 			from domain_search import DomainSearch | 
|  | 283 			phageProts = self.__find_proteins(phage) | 
|  | 284 			ds = DomainSearch() | 
|  | 285 			phageProts = ds.find_domains_interpro(phageProts) | 
|  | 286 			phageProts = ds.find_domains_blast(phageProts) | 
|  | 287 			phageProts = ds.find_domains_uniprot(phageProts) | 
|  | 288 		return phageProts | 
|  | 289 | 
|  | 290 	def __find_bact_proteins(self, bacteria): | 
|  | 291 		import os | 
|  | 292 		import json | 
|  | 293 		if bacteria + '.json' in os.listdir('files/bacteria'): | 
|  | 294 			with open('files/bacteria/' + bacteria + '.json', encoding='utf-8') as F: | 
|  | 295 				bactProts = json.loads(F.read()) | 
|  | 296 		else: | 
|  | 297 			pass | 
|  | 298 			# bactProts = self.__find_proteins(bacteria) | 
|  | 299 			# Implementar previsão de localização celular | 
|  | 300 		return bactProts | 
|  | 301 | 
|  | 302 	def __find_proteins(self, id): | 
|  | 303 		from Bio import Entrez | 
|  | 304 		from Bio import SeqIO | 
|  | 305 		Entrez.email = 'pedro_araujo97@hotmail.com' | 
|  | 306 		prots = {} | 
|  | 307 		with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=id) as handle: | 
|  | 308 			genomeBac = SeqIO.read(handle, "gb") | 
|  | 309 		for feat in genomeBac.features: | 
|  | 310 			if feat.type == 'CDS': | 
|  | 311 				try: prots[feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] | 
|  | 312 				except: pass | 
|  | 313 		if len(genomeBac.features) <= 5: | 
|  | 314 			with Entrez.efetch(db="nucleotide", rettype="gbwithparts", retmode="text", id=id) as handle: | 
|  | 315 				genomeBac = handle.readlines() | 
|  | 316 			for i in range(len(genomeBac)): | 
|  | 317 				if ' CDS ' in genomeBac[i]: | 
|  | 318 					j = i | 
|  | 319 					protDone = False | 
|  | 320 					while j < len(genomeBac): | 
|  | 321 						if protDone: | 
|  | 322 							break | 
|  | 323 						if '/product=' in genomeBac[j]: | 
|  | 324 							product = genomeBac[j].strip()[10:] | 
|  | 325 							j += 1 | 
|  | 326 						elif '_id=' in genomeBac[j]: | 
|  | 327 							protKey = genomeBac[j].strip()[13:-1] | 
|  | 328 							j += 1 | 
|  | 329 						elif '/translation=' in genomeBac[j]: | 
|  | 330 							protSeq = genomeBac[j].strip()[14:] | 
|  | 331 							j += 1 | 
|  | 332 							for k in range(j, len(genomeBac)): | 
|  | 333 								if genomeBac[k].islower(): | 
|  | 334 									j = k | 
|  | 335 									protDone = True | 
|  | 336 									break | 
|  | 337 								else: | 
|  | 338 									protSeq += genomeBac[k].strip() | 
|  | 339 						else: | 
|  | 340 							j += 1 | 
|  | 341 					prots[protKey] = [product, protSeq[:protSeq.find('"')]] | 
|  | 342 		return prots | 
|  | 343 | 
|  | 344 | 
|  | 345 if __name__ == '__main__': | 
|  | 346 	ml = PredictInteraction('dataset_reduced')  # FeatureDataset | 
|  | 347 	# ml.predict_interaction('NC_050143', 'NC_020524.1')  # NC_010468.1 NC_013941.1 NZ_CP029060.1 NZ_CP027394.1 NZ_CP025089.1 | 
|  | 348 	ml.predict_interaction('KM607000', 'NC_020524')  # NC_010468.1 NC_013941.1 NZ_CP029060.1 NZ_CP027394.1 NZ_CP025089.1 | 
|  | 349 	ml.run_knn(2) | 
|  | 350 	ml.run_random_forest() | 
|  | 351 	ml.run_svm(0.001) | 
|  | 352 	ml.run_neural_networks(0.0001) | 
|  | 353 	ml.run_logistic_reg(0.01) | 
|  | 354 | 
|  | 355 	import pandas as pd | 
|  | 356 	import ast | 
|  | 357 	data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0) | 
|  | 358 	abaumannii = {} | 
|  | 359 	for phage in data.index: | 
|  | 360 		name = data.loc[phage, 'Host'] | 
|  | 361 		if 'acinetobacter' in name.lower(): | 
|  | 362 			for bact in ast.literal_eval(data.loc[phage, 'Host_ID']): | 
|  | 363 				abaumannii[bact] = 0 | 
|  | 364 	list_yes = {} | 
|  | 365 	list_yes['KT588074'] = [] | 
|  | 366 	for bact in abaumannii.keys(): | 
|  | 367 		predict = ml.predict_interaction('KT588074', bact) | 
|  | 368 		if predict == 'Yes': | 
|  | 369 			list_yes['KT588074'].append(bact) |