Mercurial > repos > pedro_araujo > phage_host
comparison phage_host_prediction/run_galaxy.py @ 2:3e1e8be4e65c draft default tip
Uploaded
author | pedro_araujo |
---|---|
date | Fri, 02 Apr 2021 10:11:13 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:d9cda08472ea | 2:3e1e8be4e65c |
---|---|
1 | |
2 | |
3 class GalaxyPrediction: | |
4 | |
5 def __init__(self, phage_input_type='ID', bact_input_type='ID', phage='', bacteria='', ml_model='RandomForests', run_interpro=False): | |
6 import pickle | |
7 import os | |
8 import re | |
9 with open('files/feature_dataset', 'rb') as f: | |
10 dataset = pickle.load(f) | |
11 self.all_phages = [] | |
12 self.all_bacteria = [] | |
13 for ID in dataset.index: | |
14 temp_phage = ID[:ID.find('--')] | |
15 temp_bacteria = ID[ID.find('--')+2:] | |
16 if temp_phage not in self.all_phages: | |
17 self.all_phages.append(temp_phage) | |
18 if temp_bacteria not in self.all_bacteria: | |
19 self.all_bacteria.append(temp_bacteria) | |
20 if phage_input_type == 'ID': | |
21 phage = re.split('\W', phage.replace(' ', '')) | |
22 len_phage_id = len(phage) | |
23 phage_seqs = self._retrieve_from_phage_id(phage) | |
24 elif phage_input_type == 'seq_file': | |
25 phage_seqs = {} | |
26 phage_seqs['PhageFasta'] = {} | |
27 with open(phage, 'r') as f: | |
28 temp = f.readlines() | |
29 count_prot = 0 | |
30 prot = '' | |
31 i=0 | |
32 while i < len(temp): | |
33 if '>' in temp[i]: | |
34 if prot: | |
35 phage_seqs['PhageFasta']['Protein' + str(count_prot)] = ['Unknown', prot] | |
36 count_prot += 1 | |
37 prot = '' | |
38 i+=1 | |
39 else: | |
40 prot += temp[i].strip() | |
41 i+=1 | |
42 if bact_input_type == 'ID': | |
43 bacteria = re.split('\W', bacteria.replace(' ', '')) | |
44 if len(bacteria) > 1 and len_phage_id == 1 or len(bacteria) == 1: | |
45 bact_seqs = self._retrieve_from_bact_id(bacteria) | |
46 elif bact_input_type == 'seq_file': | |
47 bact_seqs = {} | |
48 bact_seqs['BacteriaFasta'] = {} | |
49 with open(bacteria, 'r') as f: | |
50 temp = f.readlines() | |
51 count_prot = 0 | |
52 prot = '' | |
53 i=0 | |
54 while i < len(temp): | |
55 if '>' in temp[i]: | |
56 if prot: | |
57 bact_seqs['BacteriaFasta']['Protein' + str(count_prot)] = ['Unknown', prot] | |
58 count_prot += 1 | |
59 prot = '' | |
60 i+=1 | |
61 else: | |
62 prot += temp[i].strip() | |
63 i+=1 | |
64 phage_seqs = self._find_phage_functions(phage_seqs, run_interpro) | |
65 phage_seqs = self._find_phage_tails(phage_seqs) | |
66 | |
67 list_remove = [] | |
68 for org in phage_seqs: | |
69 if not phage_seqs[org]: | |
70 print('Could not find tails for phage ' + org + '. Deleting entry...') | |
71 list_remove.append(org) | |
72 for org in list_remove: | |
73 del phage_seqs[org] | |
74 | |
75 if phage_seqs: | |
76 output = self.run_prediction(phage_seqs, bact_seqs, ml_model) | |
77 self.create_output(output, phage_seqs, bact_seqs) | |
78 else: | |
79 with open(or_location + '/output.tsv', 'w') as f: | |
80 f.write('No phage tails found in query') | |
81 for file in os.listdir('files'): | |
82 if file.startswith('temp'): | |
83 os.remove('files/' + file) | |
84 | |
85 def _retrieve_from_phage_id(self, phage): | |
86 temp_phage = {} | |
87 for ID in phage: | |
88 temp_phage[ID] = {} | |
89 if ID in self.all_phages: | |
90 import json | |
91 with open('files/phageTails.json', encoding='utf-8') as f: | |
92 phage_tails = json.loads(f.read()) | |
93 temp_phage[ID] = phage_tails[ID] | |
94 else: | |
95 from Bio import Entrez | |
96 from Bio import SeqIO | |
97 phage = {} | |
98 Entrez.email = 'insert@email.com' | |
99 try: | |
100 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID) as handle: | |
101 genome = SeqIO.read(handle, "gb") | |
102 for feat in genome.features: | |
103 if feat.type == 'CDS': | |
104 try: temp_phage[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] | |
105 except: pass | |
106 except: | |
107 print(ID, 'not found in GenBank') | |
108 return temp_phage | |
109 | |
110 def _retrieve_from_bact_id(self, bacteria): | |
111 temp_bacteria = {} | |
112 for ID in bacteria: | |
113 temp_bacteria[ID] = {} | |
114 if '.' in ID: | |
115 ID = ID[:ID.find('.')] | |
116 #if ID in self.all_bacteria: | |
117 # import json | |
118 # with open('files/bacteria/' + ID + '.json', encoding='utf-8') as f: | |
119 # temp_bacteria[ID] = json.loads(f.read()) | |
120 #else: | |
121 from Bio import Entrez | |
122 from Bio import SeqIO | |
123 bacteria = {} | |
124 Entrez.email = 'insert@email.com' | |
125 try: | |
126 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID+'.1') as handle: | |
127 genome = SeqIO.read(handle, "gb") | |
128 for feat in genome.features: | |
129 if feat.type == 'CDS': | |
130 try: temp_bacteria[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] | |
131 except: pass | |
132 if len(genome.features) <= 5: | |
133 with Entrez.efetch(db="nucleotide", rettype="gbwithparts", retmode="text", id=ID) as handle: | |
134 genome = handle.readlines() | |
135 for i in range(len(genome)): | |
136 if ' CDS ' in genome[i]: | |
137 j = i | |
138 protDone = False | |
139 while j < len(genome): | |
140 if protDone: | |
141 break | |
142 if '/product=' in genome[j]: | |
143 product = genome[j].strip()[10:] | |
144 j += 1 | |
145 elif '_id=' in genome[j]: | |
146 protKey = genome[j].strip()[13:-1] | |
147 j += 1 | |
148 elif '/translation=' in genome[j]: | |
149 protSeq = genome[j].strip()[14:] | |
150 j += 1 | |
151 for k in range(j, len(genome)): | |
152 if genome[k].islower(): | |
153 j = k | |
154 protDone = True | |
155 break | |
156 else: | |
157 protSeq += genome[k].strip() | |
158 else: | |
159 j += 1 | |
160 temp_bacteria[ID][protKey] = [product, protSeq[:protSeq.find('"')]] | |
161 except: | |
162 print(ID, 'not found in GenBank') | |
163 return temp_bacteria | |
164 | |
165 def _find_phage_functions(self, phage_dict, run_interpro): | |
166 import os | |
167 import json | |
168 with open('files/known_function.json', encoding='utf-8') as F: | |
169 known_function = json.loads(F.read()) | |
170 with open('files/temp_database.fasta', 'w') as F: | |
171 for phage in known_function: | |
172 for prot in known_function[phage]: | |
173 F.write('>' + phage + '-' + prot + '\n' + known_function[phage][prot][1] + '\n') | |
174 os.system('makeblastdb -in files/temp_database.fasta -dbtype prot -title PhageProts -parse_seqids -out files/temp_database -logfile files/temp_log') | |
175 for org in phage_dict: | |
176 with open('files/temp.fasta', 'w') as F: | |
177 for prot in phage_dict[org]: | |
178 F.write('>' + prot + '\n' + phage_dict[org][prot][1] + '\n') | |
179 os.system('blastp -db files/temp_database -query files/temp.fasta -out files/temp_blast -num_threads 2 -outfmt 6') | |
180 phage_dict[org] = self.process_blast(phage_dict[org], known_function) | |
181 if run_interpro: phage_dict[org] = self.interpro(phage_dict[org]) | |
182 return phage_dict | |
183 | |
184 def process_blast(self, phage_dict, known_function): | |
185 import pandas as pd | |
186 import re | |
187 blast_domains = pd.read_csv('files/temp_blast', sep='\t', header=None) | |
188 for prot in phage_dict: | |
189 func = phage_dict[prot][0] | |
190 known = False | |
191 if (not any(i in func.lower() for i in ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']) and len(func) > 3) and not ('gp' in func.lower() and len(func.split(' ')) < 2) and not (len(func.split(' ')) == 1 and len(func) < 5): | |
192 known = True | |
193 if not known: | |
194 evalue = [] | |
195 bitscore = [] | |
196 pred = blast_domains[blast_domains[0] == prot] | |
197 if pred.shape[0] == 0: break | |
198 for i in pred[10]: | |
199 evalue.append(float(i)) | |
200 for i in pred[11]: | |
201 bitscore.append(float(i)) | |
202 if min(evalue) < 1.0 and max(bitscore) > 30.0: | |
203 ind = evalue.index(min(evalue)) | |
204 if ind != bitscore.index(max(bitscore)): | |
205 ind = bitscore.index(max(bitscore)) | |
206 temp = pred.iloc[ind, 1] | |
207 known_phage = temp[:temp.find('-')] | |
208 known_prot = temp[temp.find('-') + 1:] | |
209 if known_function[known_phage][known_prot]: | |
210 new_func = known_function[known_phage][known_prot][0] | |
211 # for j in known_function.keys(): | |
212 # if pred.iloc[ind, 1] in known_function[j].keys(): | |
213 # new_func = known_function[j][pred.iloc[ind, 1]][0] | |
214 # break | |
215 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover | |
216 if not any(z in new_func.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(new_func) > 3 and not x: | |
217 phage_dict[prot][0] = new_func | |
218 return phage_dict | |
219 | |
220 def interpro(self, phage_dict): | |
221 import os | |
222 import pandas as pd | |
223 import re | |
224 os.system('interproscan.sh -b ' + 'files/temp_interpro -i ' + 'files/temp.fasta -f tsv > files/temp_interpro_log') | |
225 domains = pd.read_csv('files/temp_interpro.tsv', sep='\t', index_col=0, header=None, names=list(range(13))) | |
226 domains = domains.fillna('-') | |
227 domains = domains[domains.loc[:, 3] != 'Coils'] | |
228 domains = domains[domains.loc[:, 3] != 'MobiDBLite'] | |
229 for prot in phage_dict: | |
230 func = phage_dict[prot][0] | |
231 known = False | |
232 if (not any(i in func.lower() for i in ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']) and len(func) > 3) and not ('gp' in func.lower() and len(func.split(' ')) < 2) and not (len(func.split(' ')) == 1 and len(func) < 5): | |
233 known = True | |
234 if prot in domains.index and not known: | |
235 temp = '-' | |
236 try: | |
237 for i in range(domains.loc[prot, :].shape[0]): | |
238 if '-' not in domains.loc[prot, 12].iloc[i].lower(): | |
239 if float(domains.loc[prot, 8].iloc[i]) < 1.0: | |
240 temp = domains.loc[prot, 12].iloc[i] | |
241 break | |
242 except: | |
243 if float(domains.loc[prot, 8]) < 1.0: | |
244 temp = domains.loc[prot, 12] | |
245 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover | |
246 if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x: | |
247 phage_dict[prot][0] = temp | |
248 else: | |
249 try: | |
250 for i in range(domains.loc[prot, :].shape[0]): | |
251 if '-' not in domains.loc[prot, 5].iloc[i].lower(): | |
252 temp = domains.loc[prot, 5].iloc[i] | |
253 break | |
254 except: | |
255 temp = domains.loc[prot, 5] | |
256 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) | |
257 if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x: | |
258 phage_dict[prot][0] = temp | |
259 return phage_dict | |
260 | |
261 def _find_phage_tails(self, phage_dict): | |
262 for org in phage_dict: | |
263 list_remove = [] | |
264 for protein in phage_dict[org]: | |
265 if any(z in phage_dict[org][protein][0].lower() for z in ['fiber', 'fibre', 'spike', 'hydrolase', 'bind', 'depolymerase', 'peptidase', 'lyase', 'sialidase', 'dextranase', 'lipase', 'adhesin', 'baseplate', 'protein h', 'recognizing', 'protein j', 'protein g', 'gpe', 'duf4035', 'host specifity', 'cor protein', 'specificity', 'baseplate component', 'gp38', 'gp12 tail', 'receptor', 'recognition', 'tail']) \ | |
266 and not any(z in phage_dict[org][protein][0].lower() for z in ['nucle', 'dna', 'rna', 'ligase', 'transferase', 'inhibitor', 'assembly', 'connect', 'nudix', 'atp', 'nad', 'transpos', 'ntp', 'molybdenum', 'hns', 'gtp', 'riib', 'inhibitor', 'replicat', 'codon', 'pyruvate', 'catalyst', 'hinge', 'sheath completion', 'head', 'capsid', 'tape', 'tip', 'strand', 'matur', 'portal', 'terminase', 'nucl', 'promot', 'block', 'olfact', 'wedge', 'lysozyme', 'mur', 'sheat']): | |
267 pass | |
268 else: | |
269 list_remove.append(protein) | |
270 for protein in list_remove: | |
271 del phage_dict[org][protein] | |
272 return phage_dict | |
273 | |
274 def run_prediction(self, phage_dict, bact_dict, ml_model): | |
275 from feature_construction import FeatureConstruction | |
276 import pickle | |
277 from sklearn.preprocessing import LabelEncoder | |
278 from sklearn.preprocessing import StandardScaler | |
279 import numpy as np | |
280 | |
281 if ml_model == 'RandomForests': | |
282 with open('files/dataset_reduced', 'rb') as f: | |
283 dataset = pickle.load(f) | |
284 columns_remove = [3, 7, 9, 11, 24, 28, 32, 34, 38, 42, 45, 52, 53, 61, 65, 73, 75, 79, 104, 122, 141, 151, 154, 155, 157, 159, 160, 161, 163, 165, 169, 170, 173, 176, 178, 180, 182, 183, 185, 186, 187, 190, 193, 194, 196, 197, 201, 202, 203, 206, 207, 209, 210, 212, 216, 217, 221, 223, 225, 226, 230, 233, 235, 236, 245, 251] | |
285 elif ml_model == 'SVM': | |
286 with open('files/feature_dataset', 'rb') as f: | |
287 dataset = pickle.load(f) | |
288 columns_remove = [] | |
289 | |
290 dataset = dataset.dropna() | |
291 le = LabelEncoder() | |
292 le.fit(['Yes', 'No']) | |
293 output = le.transform(dataset['Infects'].values) | |
294 dataset = dataset.drop('Infects', 1) | |
295 scaler = StandardScaler() | |
296 scaler.fit(dataset) | |
297 data_z = scaler.transform(dataset) | |
298 | |
299 fc = FeatureConstruction() | |
300 solution = [] | |
301 for phage in phage_dict: | |
302 for bacteria in bact_dict: | |
303 temp_solution = np.array([]) | |
304 temp_solution = np.append(temp_solution, fc.get_grouping(phage_dict[phage], bact_dict[bacteria])) | |
305 temp_solution = np.append(temp_solution, fc.get_composition(phage_dict[phage], bact_dict[bacteria])) | |
306 temp_solution = np.append(temp_solution, fc.get_kmers(phage_dict[phage], bact_dict[bacteria])) | |
307 temp_solution = temp_solution.reshape(1, -1) | |
308 if columns_remove: | |
309 temp_solution = np.delete(temp_solution, columns_remove, 1) | |
310 if phage in self.all_phages: | |
311 for ID in dataset.index: | |
312 if phage in ID: | |
313 for i in range(len(dataset.loc[ID].index)): | |
314 if 'phage' in dataset.loc[ID].index[i]: | |
315 temp_solution[0][i] = dataset.loc[ID, dataset.loc[ID].index[i]] | |
316 break | |
317 if bacteria in self.all_bacteria: | |
318 for ID in dataset.index: | |
319 if bacteria in ID: | |
320 for i in range(len(dataset.loc[ID].index)): | |
321 if 'bact' in dataset.loc[ID].index[i]: | |
322 temp_solution[0][i] = dataset.loc[ID, dataset.loc[ID].index[i]] | |
323 break | |
324 if type(solution) != np.ndarray: | |
325 solution = temp_solution | |
326 else: | |
327 solution = np.append(solution, temp_solution, axis=0) | |
328 # solution = solution.reshape(1, -1) | |
329 solution = scaler.transform(solution) | |
330 | |
331 if ml_model == 'RandomForests': | |
332 from sklearn.ensemble import RandomForestClassifier | |
333 clf = RandomForestClassifier(n_estimators=200, bootstrap=False, criterion='gini', min_samples_leaf=2, min_samples_split=4, oob_score=False) | |
334 clf = clf.fit(data_z, output) | |
335 elif ml_model == 'SVM': | |
336 from sklearn.svm import SVC | |
337 clf = SVC(C=10, degree=2, gamma='auto', kernel='rbf') | |
338 clf = clf.fit(data_z, output) | |
339 pred = clf.predict(solution) | |
340 pred = list(le.inverse_transform(pred)) | |
341 return pred | |
342 | |
343 def create_output(self, output, phage_seqs, bact_seqs): | |
344 import pandas as pd | |
345 list_orgs = [] | |
346 for phage in phage_seqs: | |
347 for bact in bact_seqs: | |
348 list_orgs.append(phage + ' - ' + bact) | |
349 file = pd.DataFrame({'Phage - Bacteria': list_orgs, 'Infects': output}) | |
350 file.to_csv('files/output.tsv', sep='\t', index=False, header=True) | |
351 file.to_csv(or_location + '/output.tsv', sep='\t', index=False, header=True) | |
352 | |
353 | |
354 if __name__ == '__main__': | |
355 import sys | |
356 import os | |
357 global or_location | |
358 or_location = os.getcwd() | |
359 os.chdir(os.path.dirname(__file__)) | |
360 | |
361 phage_input_type = sys.argv[1] | |
362 Phages = sys.argv[2] | |
363 bact_input_type = sys.argv[3] | |
364 Bacts = sys.argv[4] | |
365 run_interpro = sys.argv[5] | |
366 if run_interpro == 'True': | |
367 run_interpro = True | |
368 else: | |
369 run_interpro = False | |
370 model = sys.argv[6] | |
371 GalaxyPrediction(phage_input_type=phage_input_type, bact_input_type=bact_input_type, phage=Phages, bacteria=Bacts, ml_model=model, run_interpro=run_interpro) | |
372 #rg = GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_050154', bacteria='NC_007414,NZ_MK033499,NZ_CP031214') | |
373 # GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_031087,NC_049833,NC_049838,NC_049444', bacteria='LR133964', ml_model='SVM') |