Mercurial > repos > pedro_araujo > phage_host
comparison phage_host_prediction/run_galaxy.py @ 2:3e1e8be4e65c draft default tip
Uploaded
| author | pedro_araujo |
|---|---|
| date | Fri, 02 Apr 2021 10:11:13 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:d9cda08472ea | 2:3e1e8be4e65c |
|---|---|
| 1 | |
| 2 | |
| 3 class GalaxyPrediction: | |
| 4 | |
| 5 def __init__(self, phage_input_type='ID', bact_input_type='ID', phage='', bacteria='', ml_model='RandomForests', run_interpro=False): | |
| 6 import pickle | |
| 7 import os | |
| 8 import re | |
| 9 with open('files/feature_dataset', 'rb') as f: | |
| 10 dataset = pickle.load(f) | |
| 11 self.all_phages = [] | |
| 12 self.all_bacteria = [] | |
| 13 for ID in dataset.index: | |
| 14 temp_phage = ID[:ID.find('--')] | |
| 15 temp_bacteria = ID[ID.find('--')+2:] | |
| 16 if temp_phage not in self.all_phages: | |
| 17 self.all_phages.append(temp_phage) | |
| 18 if temp_bacteria not in self.all_bacteria: | |
| 19 self.all_bacteria.append(temp_bacteria) | |
| 20 if phage_input_type == 'ID': | |
| 21 phage = re.split('\W', phage.replace(' ', '')) | |
| 22 len_phage_id = len(phage) | |
| 23 phage_seqs = self._retrieve_from_phage_id(phage) | |
| 24 elif phage_input_type == 'seq_file': | |
| 25 phage_seqs = {} | |
| 26 phage_seqs['PhageFasta'] = {} | |
| 27 with open(phage, 'r') as f: | |
| 28 temp = f.readlines() | |
| 29 count_prot = 0 | |
| 30 prot = '' | |
| 31 i=0 | |
| 32 while i < len(temp): | |
| 33 if '>' in temp[i]: | |
| 34 if prot: | |
| 35 phage_seqs['PhageFasta']['Protein' + str(count_prot)] = ['Unknown', prot] | |
| 36 count_prot += 1 | |
| 37 prot = '' | |
| 38 i+=1 | |
| 39 else: | |
| 40 prot += temp[i].strip() | |
| 41 i+=1 | |
| 42 if bact_input_type == 'ID': | |
| 43 bacteria = re.split('\W', bacteria.replace(' ', '')) | |
| 44 if len(bacteria) > 1 and len_phage_id == 1 or len(bacteria) == 1: | |
| 45 bact_seqs = self._retrieve_from_bact_id(bacteria) | |
| 46 elif bact_input_type == 'seq_file': | |
| 47 bact_seqs = {} | |
| 48 bact_seqs['BacteriaFasta'] = {} | |
| 49 with open(bacteria, 'r') as f: | |
| 50 temp = f.readlines() | |
| 51 count_prot = 0 | |
| 52 prot = '' | |
| 53 i=0 | |
| 54 while i < len(temp): | |
| 55 if '>' in temp[i]: | |
| 56 if prot: | |
| 57 bact_seqs['BacteriaFasta']['Protein' + str(count_prot)] = ['Unknown', prot] | |
| 58 count_prot += 1 | |
| 59 prot = '' | |
| 60 i+=1 | |
| 61 else: | |
| 62 prot += temp[i].strip() | |
| 63 i+=1 | |
| 64 phage_seqs = self._find_phage_functions(phage_seqs, run_interpro) | |
| 65 phage_seqs = self._find_phage_tails(phage_seqs) | |
| 66 | |
| 67 list_remove = [] | |
| 68 for org in phage_seqs: | |
| 69 if not phage_seqs[org]: | |
| 70 print('Could not find tails for phage ' + org + '. Deleting entry...') | |
| 71 list_remove.append(org) | |
| 72 for org in list_remove: | |
| 73 del phage_seqs[org] | |
| 74 | |
| 75 if phage_seqs: | |
| 76 output = self.run_prediction(phage_seqs, bact_seqs, ml_model) | |
| 77 self.create_output(output, phage_seqs, bact_seqs) | |
| 78 else: | |
| 79 with open(or_location + '/output.tsv', 'w') as f: | |
| 80 f.write('No phage tails found in query') | |
| 81 for file in os.listdir('files'): | |
| 82 if file.startswith('temp'): | |
| 83 os.remove('files/' + file) | |
| 84 | |
| 85 def _retrieve_from_phage_id(self, phage): | |
| 86 temp_phage = {} | |
| 87 for ID in phage: | |
| 88 temp_phage[ID] = {} | |
| 89 if ID in self.all_phages: | |
| 90 import json | |
| 91 with open('files/phageTails.json', encoding='utf-8') as f: | |
| 92 phage_tails = json.loads(f.read()) | |
| 93 temp_phage[ID] = phage_tails[ID] | |
| 94 else: | |
| 95 from Bio import Entrez | |
| 96 from Bio import SeqIO | |
| 97 phage = {} | |
| 98 Entrez.email = 'insert@email.com' | |
| 99 try: | |
| 100 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID) as handle: | |
| 101 genome = SeqIO.read(handle, "gb") | |
| 102 for feat in genome.features: | |
| 103 if feat.type == 'CDS': | |
| 104 try: temp_phage[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] | |
| 105 except: pass | |
| 106 except: | |
| 107 print(ID, 'not found in GenBank') | |
| 108 return temp_phage | |
| 109 | |
| 110 def _retrieve_from_bact_id(self, bacteria): | |
| 111 temp_bacteria = {} | |
| 112 for ID in bacteria: | |
| 113 temp_bacteria[ID] = {} | |
| 114 if '.' in ID: | |
| 115 ID = ID[:ID.find('.')] | |
| 116 #if ID in self.all_bacteria: | |
| 117 # import json | |
| 118 # with open('files/bacteria/' + ID + '.json', encoding='utf-8') as f: | |
| 119 # temp_bacteria[ID] = json.loads(f.read()) | |
| 120 #else: | |
| 121 from Bio import Entrez | |
| 122 from Bio import SeqIO | |
| 123 bacteria = {} | |
| 124 Entrez.email = 'insert@email.com' | |
| 125 try: | |
| 126 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID+'.1') as handle: | |
| 127 genome = SeqIO.read(handle, "gb") | |
| 128 for feat in genome.features: | |
| 129 if feat.type == 'CDS': | |
| 130 try: temp_bacteria[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] | |
| 131 except: pass | |
| 132 if len(genome.features) <= 5: | |
| 133 with Entrez.efetch(db="nucleotide", rettype="gbwithparts", retmode="text", id=ID) as handle: | |
| 134 genome = handle.readlines() | |
| 135 for i in range(len(genome)): | |
| 136 if ' CDS ' in genome[i]: | |
| 137 j = i | |
| 138 protDone = False | |
| 139 while j < len(genome): | |
| 140 if protDone: | |
| 141 break | |
| 142 if '/product=' in genome[j]: | |
| 143 product = genome[j].strip()[10:] | |
| 144 j += 1 | |
| 145 elif '_id=' in genome[j]: | |
| 146 protKey = genome[j].strip()[13:-1] | |
| 147 j += 1 | |
| 148 elif '/translation=' in genome[j]: | |
| 149 protSeq = genome[j].strip()[14:] | |
| 150 j += 1 | |
| 151 for k in range(j, len(genome)): | |
| 152 if genome[k].islower(): | |
| 153 j = k | |
| 154 protDone = True | |
| 155 break | |
| 156 else: | |
| 157 protSeq += genome[k].strip() | |
| 158 else: | |
| 159 j += 1 | |
| 160 temp_bacteria[ID][protKey] = [product, protSeq[:protSeq.find('"')]] | |
| 161 except: | |
| 162 print(ID, 'not found in GenBank') | |
| 163 return temp_bacteria | |
| 164 | |
| 165 def _find_phage_functions(self, phage_dict, run_interpro): | |
| 166 import os | |
| 167 import json | |
| 168 with open('files/known_function.json', encoding='utf-8') as F: | |
| 169 known_function = json.loads(F.read()) | |
| 170 with open('files/temp_database.fasta', 'w') as F: | |
| 171 for phage in known_function: | |
| 172 for prot in known_function[phage]: | |
| 173 F.write('>' + phage + '-' + prot + '\n' + known_function[phage][prot][1] + '\n') | |
| 174 os.system('makeblastdb -in files/temp_database.fasta -dbtype prot -title PhageProts -parse_seqids -out files/temp_database -logfile files/temp_log') | |
| 175 for org in phage_dict: | |
| 176 with open('files/temp.fasta', 'w') as F: | |
| 177 for prot in phage_dict[org]: | |
| 178 F.write('>' + prot + '\n' + phage_dict[org][prot][1] + '\n') | |
| 179 os.system('blastp -db files/temp_database -query files/temp.fasta -out files/temp_blast -num_threads 2 -outfmt 6') | |
| 180 phage_dict[org] = self.process_blast(phage_dict[org], known_function) | |
| 181 if run_interpro: phage_dict[org] = self.interpro(phage_dict[org]) | |
| 182 return phage_dict | |
| 183 | |
| 184 def process_blast(self, phage_dict, known_function): | |
| 185 import pandas as pd | |
| 186 import re | |
| 187 blast_domains = pd.read_csv('files/temp_blast', sep='\t', header=None) | |
| 188 for prot in phage_dict: | |
| 189 func = phage_dict[prot][0] | |
| 190 known = False | |
| 191 if (not any(i in func.lower() for i in ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']) and len(func) > 3) and not ('gp' in func.lower() and len(func.split(' ')) < 2) and not (len(func.split(' ')) == 1 and len(func) < 5): | |
| 192 known = True | |
| 193 if not known: | |
| 194 evalue = [] | |
| 195 bitscore = [] | |
| 196 pred = blast_domains[blast_domains[0] == prot] | |
| 197 if pred.shape[0] == 0: break | |
| 198 for i in pred[10]: | |
| 199 evalue.append(float(i)) | |
| 200 for i in pred[11]: | |
| 201 bitscore.append(float(i)) | |
| 202 if min(evalue) < 1.0 and max(bitscore) > 30.0: | |
| 203 ind = evalue.index(min(evalue)) | |
| 204 if ind != bitscore.index(max(bitscore)): | |
| 205 ind = bitscore.index(max(bitscore)) | |
| 206 temp = pred.iloc[ind, 1] | |
| 207 known_phage = temp[:temp.find('-')] | |
| 208 known_prot = temp[temp.find('-') + 1:] | |
| 209 if known_function[known_phage][known_prot]: | |
| 210 new_func = known_function[known_phage][known_prot][0] | |
| 211 # for j in known_function.keys(): | |
| 212 # if pred.iloc[ind, 1] in known_function[j].keys(): | |
| 213 # new_func = known_function[j][pred.iloc[ind, 1]][0] | |
| 214 # break | |
| 215 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover | |
| 216 if not any(z in new_func.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(new_func) > 3 and not x: | |
| 217 phage_dict[prot][0] = new_func | |
| 218 return phage_dict | |
| 219 | |
| 220 def interpro(self, phage_dict): | |
| 221 import os | |
| 222 import pandas as pd | |
| 223 import re | |
| 224 os.system('interproscan.sh -b ' + 'files/temp_interpro -i ' + 'files/temp.fasta -f tsv > files/temp_interpro_log') | |
| 225 domains = pd.read_csv('files/temp_interpro.tsv', sep='\t', index_col=0, header=None, names=list(range(13))) | |
| 226 domains = domains.fillna('-') | |
| 227 domains = domains[domains.loc[:, 3] != 'Coils'] | |
| 228 domains = domains[domains.loc[:, 3] != 'MobiDBLite'] | |
| 229 for prot in phage_dict: | |
| 230 func = phage_dict[prot][0] | |
| 231 known = False | |
| 232 if (not any(i in func.lower() for i in ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']) and len(func) > 3) and not ('gp' in func.lower() and len(func.split(' ')) < 2) and not (len(func.split(' ')) == 1 and len(func) < 5): | |
| 233 known = True | |
| 234 if prot in domains.index and not known: | |
| 235 temp = '-' | |
| 236 try: | |
| 237 for i in range(domains.loc[prot, :].shape[0]): | |
| 238 if '-' not in domains.loc[prot, 12].iloc[i].lower(): | |
| 239 if float(domains.loc[prot, 8].iloc[i]) < 1.0: | |
| 240 temp = domains.loc[prot, 12].iloc[i] | |
| 241 break | |
| 242 except: | |
| 243 if float(domains.loc[prot, 8]) < 1.0: | |
| 244 temp = domains.loc[prot, 12] | |
| 245 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover | |
| 246 if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x: | |
| 247 phage_dict[prot][0] = temp | |
| 248 else: | |
| 249 try: | |
| 250 for i in range(domains.loc[prot, :].shape[0]): | |
| 251 if '-' not in domains.loc[prot, 5].iloc[i].lower(): | |
| 252 temp = domains.loc[prot, 5].iloc[i] | |
| 253 break | |
| 254 except: | |
| 255 temp = domains.loc[prot, 5] | |
| 256 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) | |
| 257 if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x: | |
| 258 phage_dict[prot][0] = temp | |
| 259 return phage_dict | |
| 260 | |
| 261 def _find_phage_tails(self, phage_dict): | |
| 262 for org in phage_dict: | |
| 263 list_remove = [] | |
| 264 for protein in phage_dict[org]: | |
| 265 if any(z in phage_dict[org][protein][0].lower() for z in ['fiber', 'fibre', 'spike', 'hydrolase', 'bind', 'depolymerase', 'peptidase', 'lyase', 'sialidase', 'dextranase', 'lipase', 'adhesin', 'baseplate', 'protein h', 'recognizing', 'protein j', 'protein g', 'gpe', 'duf4035', 'host specifity', 'cor protein', 'specificity', 'baseplate component', 'gp38', 'gp12 tail', 'receptor', 'recognition', 'tail']) \ | |
| 266 and not any(z in phage_dict[org][protein][0].lower() for z in ['nucle', 'dna', 'rna', 'ligase', 'transferase', 'inhibitor', 'assembly', 'connect', 'nudix', 'atp', 'nad', 'transpos', 'ntp', 'molybdenum', 'hns', 'gtp', 'riib', 'inhibitor', 'replicat', 'codon', 'pyruvate', 'catalyst', 'hinge', 'sheath completion', 'head', 'capsid', 'tape', 'tip', 'strand', 'matur', 'portal', 'terminase', 'nucl', 'promot', 'block', 'olfact', 'wedge', 'lysozyme', 'mur', 'sheat']): | |
| 267 pass | |
| 268 else: | |
| 269 list_remove.append(protein) | |
| 270 for protein in list_remove: | |
| 271 del phage_dict[org][protein] | |
| 272 return phage_dict | |
| 273 | |
| 274 def run_prediction(self, phage_dict, bact_dict, ml_model): | |
| 275 from feature_construction import FeatureConstruction | |
| 276 import pickle | |
| 277 from sklearn.preprocessing import LabelEncoder | |
| 278 from sklearn.preprocessing import StandardScaler | |
| 279 import numpy as np | |
| 280 | |
| 281 if ml_model == 'RandomForests': | |
| 282 with open('files/dataset_reduced', 'rb') as f: | |
| 283 dataset = pickle.load(f) | |
| 284 columns_remove = [3, 7, 9, 11, 24, 28, 32, 34, 38, 42, 45, 52, 53, 61, 65, 73, 75, 79, 104, 122, 141, 151, 154, 155, 157, 159, 160, 161, 163, 165, 169, 170, 173, 176, 178, 180, 182, 183, 185, 186, 187, 190, 193, 194, 196, 197, 201, 202, 203, 206, 207, 209, 210, 212, 216, 217, 221, 223, 225, 226, 230, 233, 235, 236, 245, 251] | |
| 285 elif ml_model == 'SVM': | |
| 286 with open('files/feature_dataset', 'rb') as f: | |
| 287 dataset = pickle.load(f) | |
| 288 columns_remove = [] | |
| 289 | |
| 290 dataset = dataset.dropna() | |
| 291 le = LabelEncoder() | |
| 292 le.fit(['Yes', 'No']) | |
| 293 output = le.transform(dataset['Infects'].values) | |
| 294 dataset = dataset.drop('Infects', 1) | |
| 295 scaler = StandardScaler() | |
| 296 scaler.fit(dataset) | |
| 297 data_z = scaler.transform(dataset) | |
| 298 | |
| 299 fc = FeatureConstruction() | |
| 300 solution = [] | |
| 301 for phage in phage_dict: | |
| 302 for bacteria in bact_dict: | |
| 303 temp_solution = np.array([]) | |
| 304 temp_solution = np.append(temp_solution, fc.get_grouping(phage_dict[phage], bact_dict[bacteria])) | |
| 305 temp_solution = np.append(temp_solution, fc.get_composition(phage_dict[phage], bact_dict[bacteria])) | |
| 306 temp_solution = np.append(temp_solution, fc.get_kmers(phage_dict[phage], bact_dict[bacteria])) | |
| 307 temp_solution = temp_solution.reshape(1, -1) | |
| 308 if columns_remove: | |
| 309 temp_solution = np.delete(temp_solution, columns_remove, 1) | |
| 310 if phage in self.all_phages: | |
| 311 for ID in dataset.index: | |
| 312 if phage in ID: | |
| 313 for i in range(len(dataset.loc[ID].index)): | |
| 314 if 'phage' in dataset.loc[ID].index[i]: | |
| 315 temp_solution[0][i] = dataset.loc[ID, dataset.loc[ID].index[i]] | |
| 316 break | |
| 317 if bacteria in self.all_bacteria: | |
| 318 for ID in dataset.index: | |
| 319 if bacteria in ID: | |
| 320 for i in range(len(dataset.loc[ID].index)): | |
| 321 if 'bact' in dataset.loc[ID].index[i]: | |
| 322 temp_solution[0][i] = dataset.loc[ID, dataset.loc[ID].index[i]] | |
| 323 break | |
| 324 if type(solution) != np.ndarray: | |
| 325 solution = temp_solution | |
| 326 else: | |
| 327 solution = np.append(solution, temp_solution, axis=0) | |
| 328 # solution = solution.reshape(1, -1) | |
| 329 solution = scaler.transform(solution) | |
| 330 | |
| 331 if ml_model == 'RandomForests': | |
| 332 from sklearn.ensemble import RandomForestClassifier | |
| 333 clf = RandomForestClassifier(n_estimators=200, bootstrap=False, criterion='gini', min_samples_leaf=2, min_samples_split=4, oob_score=False) | |
| 334 clf = clf.fit(data_z, output) | |
| 335 elif ml_model == 'SVM': | |
| 336 from sklearn.svm import SVC | |
| 337 clf = SVC(C=10, degree=2, gamma='auto', kernel='rbf') | |
| 338 clf = clf.fit(data_z, output) | |
| 339 pred = clf.predict(solution) | |
| 340 pred = list(le.inverse_transform(pred)) | |
| 341 return pred | |
| 342 | |
| 343 def create_output(self, output, phage_seqs, bact_seqs): | |
| 344 import pandas as pd | |
| 345 list_orgs = [] | |
| 346 for phage in phage_seqs: | |
| 347 for bact in bact_seqs: | |
| 348 list_orgs.append(phage + ' - ' + bact) | |
| 349 file = pd.DataFrame({'Phage - Bacteria': list_orgs, 'Infects': output}) | |
| 350 file.to_csv('files/output.tsv', sep='\t', index=False, header=True) | |
| 351 file.to_csv(or_location + '/output.tsv', sep='\t', index=False, header=True) | |
| 352 | |
| 353 | |
| 354 if __name__ == '__main__': | |
| 355 import sys | |
| 356 import os | |
| 357 global or_location | |
| 358 or_location = os.getcwd() | |
| 359 os.chdir(os.path.dirname(__file__)) | |
| 360 | |
| 361 phage_input_type = sys.argv[1] | |
| 362 Phages = sys.argv[2] | |
| 363 bact_input_type = sys.argv[3] | |
| 364 Bacts = sys.argv[4] | |
| 365 run_interpro = sys.argv[5] | |
| 366 if run_interpro == 'True': | |
| 367 run_interpro = True | |
| 368 else: | |
| 369 run_interpro = False | |
| 370 model = sys.argv[6] | |
| 371 GalaxyPrediction(phage_input_type=phage_input_type, bact_input_type=bact_input_type, phage=Phages, bacteria=Bacts, ml_model=model, run_interpro=run_interpro) | |
| 372 #rg = GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_050154', bacteria='NC_007414,NZ_MK033499,NZ_CP031214') | |
| 373 # GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_031087,NC_049833,NC_049838,NC_049444', bacteria='LR133964', ml_model='SVM') |
