Mercurial > repos > pedro_araujo > phage_host_prediction
comparison run_galaxy.py @ 0:e4b3fc88efe0 draft
Uploaded
author | pedro_araujo |
---|---|
date | Wed, 27 Jan 2021 13:50:11 +0000 |
parents | |
children | f8dee15a72a4 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e4b3fc88efe0 |
---|---|
1 | |
2 | |
3 class GalaxyPrediction: | |
4 | |
5 def __init__(self, phage_input_type='ID', bact_input_type='ID', phage='', bacteria='', ml_model='RandomForests', run_interpro=False): | |
6 import pickle | |
7 import os | |
8 import re | |
9 with open('files/FeatureDataset', 'rb') as f: | |
10 dataset = pickle.load(f) | |
11 self.all_phages = [] | |
12 self.all_bacteria = [] | |
13 for ID in dataset.index: | |
14 temp_phage = ID[:ID.find('--')] | |
15 temp_bacteria = ID[ID.find('--')+2:] | |
16 if temp_phage not in self.all_phages: | |
17 self.all_phages.append(temp_phage) | |
18 if temp_bacteria not in self.all_bacteria: | |
19 self.all_bacteria.append(temp_bacteria) | |
20 if phage_input_type == 'ID': | |
21 phage = re.split('\W', phage.replace(' ', '')) | |
22 len_phage_id = len(phage) | |
23 phage_seqs = self._retrieve_from_phage_id(phage) | |
24 elif phage_input_type == 'seq_file': | |
25 phage_seqs = {} | |
26 phage_seqs['PhageFasta'] = {} | |
27 with open(phage, 'r') as f: | |
28 temp = f.readlines() | |
29 count_prot = 0 | |
30 prot = '' | |
31 i=0 | |
32 while i < len(temp): | |
33 if '>' in temp[i]: | |
34 if prot: | |
35 phage_seqs['PhageFasta']['Protein' + str(count_prot)] = ['Unknown', prot] | |
36 count_prot += 1 | |
37 prot = '' | |
38 i+=1 | |
39 else: | |
40 prot += temp[i].strip() | |
41 i+=1 | |
42 if bact_input_type == 'ID': | |
43 bacteria = re.split('\W', bacteria.replace(' ', '')) | |
44 if len(bacteria) > 1 and len_phage_id == 1 or len(bacteria) == 1: | |
45 bact_seqs = self._retrieve_from_bact_id(bacteria) | |
46 elif bact_input_type == 'seq_file': | |
47 bact_seqs = {} | |
48 bact_seqs['BacteriaFasta'] = {} | |
49 with open(bacteria, 'r') as f: | |
50 temp = f.readlines() | |
51 count_prot = 0 | |
52 prot = '' | |
53 i=0 | |
54 while i < len(temp): | |
55 if '>' in temp[i]: | |
56 if prot: | |
57 bact_seqs['BacteriaFasta']['Protein' + str(count_prot)] = ['Unknown', prot] | |
58 count_prot += 1 | |
59 prot = '' | |
60 i+=1 | |
61 else: | |
62 prot += temp[i].strip() | |
63 i+=1 | |
64 phage_seqs = self._find_phage_functions(phage_seqs, run_interpro) | |
65 phage_seqs = self._find_phage_tails(phage_seqs) | |
66 | |
67 list_remove = [] | |
68 for org in phage_seqs: | |
69 if not phage_seqs[org]: | |
70 print('Could not find tails for phage ' + org + '. Deleting entry...') | |
71 list_remove.append(org) | |
72 for org in list_remove: | |
73 del phage_seqs[org] | |
74 | |
75 if phage_seqs: | |
76 output = self.run_prediction(phage_seqs, bact_seqs, ml_model) | |
77 self.create_output(output, phage_seqs, bact_seqs) | |
78 else: | |
79 with open(or_location + '/output.tsv', 'w') as f: | |
80 f.write('No phage tails found in query') | |
81 for file in os.listdir('files'): | |
82 if file.startswith('temp'): | |
83 os.remove('files/' + file) | |
84 | |
85 def _retrieve_from_phage_id(self, phage): | |
86 temp_phage = {} | |
87 for ID in phage: | |
88 temp_phage[ID] = {} | |
89 if ID in self.all_phages: | |
90 import json | |
91 with open('files/phageTails.json', encoding='utf-8') as f: | |
92 phage_tails = json.loads(f.read()) | |
93 temp_phage[ID] = phage_tails[ID] | |
94 else: | |
95 from Bio import Entrez | |
96 from Bio import SeqIO | |
97 phage = {} | |
98 Entrez.email = 'insert@email.com' | |
99 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID) as handle: | |
100 genome = SeqIO.read(handle, "gb") | |
101 for feat in genome.features: | |
102 if feat.type == 'CDS': | |
103 try: temp_phage[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] | |
104 except: pass | |
105 return temp_phage | |
106 | |
107 def _retrieve_from_bact_id(self, bacteria): | |
108 temp_bacteria = {} | |
109 for ID in bacteria: | |
110 temp_bacteria[ID] = {} | |
111 if '.' in ID: | |
112 ID = ID[:ID.find('.')] | |
113 if ID in self.all_bacteria: | |
114 import json | |
115 with open('files/bacteria/' + ID + '.json', encoding='utf-8') as f: | |
116 temp_bacteria[ID] = json.loads(f.read()) | |
117 else: | |
118 from Bio import Entrez | |
119 from Bio import SeqIO | |
120 bacteria = {} | |
121 Entrez.email = 'insert@email.com' | |
122 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID+'.1') as handle: | |
123 genome = SeqIO.read(handle, "gb") | |
124 for feat in genome.features: | |
125 if feat.type == 'CDS': | |
126 try: temp_bacteria[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] | |
127 except: pass | |
128 if len(genome.features) <= 5: | |
129 with Entrez.efetch(db="nucleotide", rettype="gbwithparts", retmode="text", id=ID) as handle: | |
130 genome = handle.readlines() | |
131 for i in range(len(genome)): | |
132 if ' CDS ' in genome[i]: | |
133 j = i | |
134 protDone = False | |
135 while j < len(genome): | |
136 if protDone: | |
137 break | |
138 if '/product=' in genome[j]: | |
139 product = genome[j].strip()[10:] | |
140 j += 1 | |
141 elif '_id=' in genome[j]: | |
142 protKey = genome[j].strip()[13:-1] | |
143 j += 1 | |
144 elif '/translation=' in genome[j]: | |
145 protSeq = genome[j].strip()[14:] | |
146 j += 1 | |
147 for k in range(j, len(genome)): | |
148 if genome[k].islower(): | |
149 j = k | |
150 protDone = True | |
151 break | |
152 else: | |
153 protSeq += genome[k].strip() | |
154 else: | |
155 j += 1 | |
156 temp_bacteria[ID][protKey] = [product, protSeq[:protSeq.find('"')]] | |
157 return temp_bacteria | |
158 | |
159 def _find_phage_functions(self, phage_dict, run_interpro): | |
160 import os | |
161 import json | |
162 with open('files/known_function.json', encoding='utf-8') as F: | |
163 known_function = json.loads(F.read()) | |
164 with open('files/temp_database.fasta', 'w') as F: | |
165 for phage in known_function: | |
166 for prot in known_function[phage]: | |
167 F.write('>' + phage + '-' + prot + '\n' + known_function[phage][prot][1] + '\n') | |
168 os.system('makeblastdb -in files/temp_database.fasta -dbtype prot -title PhageProts -parse_seqids -out files/temp_database -logfile files/temp_log') | |
169 for org in phage_dict: | |
170 with open('files/temp.fasta', 'w') as F: | |
171 for prot in phage_dict[org]: | |
172 F.write('>' + prot + '\n' + phage_dict[org][prot][1] + '\n') | |
173 os.system('blastp -db files/temp_database -query files/temp.fasta -out files/temp_blast -num_threads 2 -outfmt 6') | |
174 phage_dict[org] = self.process_blast(phage_dict[org], known_function) | |
175 if run_interpro: phage_dict[org] = self.interpro(phage_dict[org]) | |
176 return phage_dict | |
177 | |
178 def process_blast(self, phage_dict, known_function): | |
179 import pandas as pd | |
180 import re | |
181 blast_domains = pd.read_csv('files/temp_blast', sep='\t', header=None) | |
182 for prot in phage_dict: | |
183 func = phage_dict[prot][0] | |
184 known = False | |
185 if (not any(i in func.lower() for i in ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']) and len(func) > 3) and not ('gp' in func.lower() and len(func.split(' ')) < 2) and not (len(func.split(' ')) == 1 and len(func) < 5): | |
186 known = True | |
187 if not known: | |
188 evalue = [] | |
189 bitscore = [] | |
190 pred = blast_domains[blast_domains[0] == prot] | |
191 if pred.shape[0] == 0: break | |
192 for i in pred[10]: | |
193 evalue.append(float(i)) | |
194 for i in pred[11]: | |
195 bitscore.append(float(i)) | |
196 if min(evalue) < 1.0 and max(bitscore) > 30.0: | |
197 ind = evalue.index(min(evalue)) | |
198 if ind != bitscore.index(max(bitscore)): | |
199 ind = bitscore.index(max(bitscore)) | |
200 temp = pred.iloc[ind, 1] | |
201 known_phage = temp[:temp.find('-')] | |
202 known_prot = temp[temp.find('-') + 1:] | |
203 if known_function[known_phage][known_prot]: | |
204 new_func = known_function[known_phage][known_prot][0] | |
205 # for j in known_function.keys(): | |
206 # if pred.iloc[ind, 1] in known_function[j].keys(): | |
207 # new_func = known_function[j][pred.iloc[ind, 1]][0] | |
208 # break | |
209 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover | |
210 if not any(z in new_func.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(new_func) > 3 and not x: | |
211 phage_dict[prot][0] = new_func | |
212 return phage_dict | |
213 | |
214 def interpro(self, phage_dict): | |
215 import os | |
216 import pandas as pd | |
217 import re | |
218 os.system('interproscan.sh -b ' + 'files/temp_interpro -i ' + 'files/temp.fasta -f tsv > files/temp_interpro_log') | |
219 domains = pd.read_csv('files/temp_interpro.tsv', sep='\t', index_col=0, header=None, names=list(range(13))) | |
220 domains = domains.fillna('-') | |
221 domains = domains[domains.loc[:, 3] != 'Coils'] | |
222 domains = domains[domains.loc[:, 3] != 'MobiDBLite'] | |
223 for prot in phage_dict: | |
224 func = phage_dict[prot][0] | |
225 known = False | |
226 if (not any(i in func.lower() for i in ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']) and len(func) > 3) and not ('gp' in func.lower() and len(func.split(' ')) < 2) and not (len(func.split(' ')) == 1 and len(func) < 5): | |
227 known = True | |
228 if prot in domains.index and not known: | |
229 temp = '-' | |
230 try: | |
231 for i in range(domains.loc[prot, :].shape[0]): | |
232 if '-' not in domains.loc[prot, 12].iloc[i].lower(): | |
233 if float(domains.loc[prot, 8].iloc[i]) < 1.0: | |
234 temp = domains.loc[prot, 12].iloc[i] | |
235 break | |
236 except: | |
237 if float(domains.loc[prot, 8]) < 1.0: | |
238 temp = domains.loc[prot, 12] | |
239 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover | |
240 if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x: | |
241 phage_dict[prot][0] = temp | |
242 else: | |
243 try: | |
244 for i in range(domains.loc[prot, :].shape[0]): | |
245 if '-' not in domains.loc[prot, 5].iloc[i].lower(): | |
246 temp = domains.loc[prot, 5].iloc[i] | |
247 break | |
248 except: | |
249 temp = domains.loc[prot, 5] | |
250 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) | |
251 if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x: | |
252 phage_dict[prot][0] = temp | |
253 return phage_dict | |
254 | |
255 def _find_phage_tails(self, phage_dict): | |
256 for org in phage_dict: | |
257 list_remove = [] | |
258 for protein in phage_dict[org]: | |
259 if any(z in phage_dict[org][protein][0].lower() for z in ['fiber', 'fibre', 'spike', 'hydrolase', 'bind', 'depolymerase', 'peptidase', 'lyase', 'sialidase', 'dextranase', 'lipase', 'adhesin', 'baseplate', 'protein h', 'recognizing', 'protein j', 'protein g', 'gpe', 'duf4035', 'host specifity', 'cor protein', 'specificity', 'baseplate component', 'gp38', 'gp12 tail', 'receptor', 'recognition', 'tail']) \ | |
260 and not any(z in phage_dict[org][protein][0].lower() for z in ['nucle', 'dna', 'rna', 'ligase', 'transferase', 'inhibitor', 'assembly', 'connect', 'nudix', 'atp', 'nad', 'transpos', 'ntp', 'molybdenum', 'hns', 'gtp', 'riib', 'inhibitor', 'replicat', 'codon', 'pyruvate', 'catalyst', 'hinge', 'sheath completion', 'head', 'capsid', 'tape', 'tip', 'strand', 'matur', 'portal', 'terminase', 'nucl', 'promot', 'block', 'olfact', 'wedge', 'lysozyme', 'mur', 'sheat']): | |
261 pass | |
262 else: | |
263 list_remove.append(protein) | |
264 for protein in list_remove: | |
265 del phage_dict[org][protein] | |
266 return phage_dict | |
267 | |
268 def run_prediction(self, phage_dict, bact_dict, ml_model): | |
269 from feature_construction import FeatureConstruction | |
270 import pickle | |
271 from sklearn.preprocessing import LabelEncoder | |
272 from sklearn.preprocessing import StandardScaler | |
273 import numpy as np | |
274 | |
275 if ml_model == 'RandomForests': | |
276 with open('files/dataset_reduced', 'rb') as f: | |
277 dataset = pickle.load(f) | |
278 columns_remove = [3, 7, 9, 11, 24, 28, 32, 34, 38, 42, 45, 52, 53, 61, 65, 73, 75, 79, 104, 122, 141, 151, 154, 155, 157, 159, 160, 161, 163, 165, 169, 170, 173, 176, 178, 180, 182, 183, 185, 186, 187, 190, 193, 194, 196, 197, 201, 202, 203, 206, 207, 209, 210, 212, 216, 217, 221, 223, 225, 226, 230, 233, 235, 236, 245, 251] | |
279 elif ml_model == 'SVM': | |
280 with open('files/feature_dataset', 'rb') as f: | |
281 dataset = pickle.load(f) | |
282 columns_remove = [] | |
283 | |
284 dataset = dataset.dropna() | |
285 le = LabelEncoder() | |
286 le.fit(['Yes', 'No']) | |
287 output = le.transform(dataset['Infects'].values) | |
288 dataset = dataset.drop('Infects', 1) | |
289 scaler = StandardScaler() | |
290 scaler.fit(dataset) | |
291 data_z = scaler.transform(dataset) | |
292 | |
293 fc = FeatureConstruction() | |
294 solution = [] | |
295 for phage in phage_dict: | |
296 for bacteria in bact_dict: | |
297 temp_solution = np.array([]) | |
298 temp_solution = np.append(temp_solution, fc.get_grouping(phage_dict[phage], bact_dict[bacteria])) | |
299 temp_solution = np.append(temp_solution, fc.get_composition(phage_dict[phage], bact_dict[bacteria])) | |
300 temp_solution = np.append(temp_solution, fc.get_kmers(phage_dict[phage], bact_dict[bacteria])) | |
301 temp_solution = temp_solution.reshape(1, -1) | |
302 if columns_remove: | |
303 temp_solution = np.delete(temp_solution, columns_remove, 1) | |
304 if phage in self.all_phages: | |
305 for ID in dataset.index: | |
306 if phage in ID: | |
307 for i in range(len(dataset.loc[ID].index)): | |
308 if 'phage' in dataset.loc[ID].index[i]: | |
309 temp_solution[0][i] = dataset.loc[ID, dataset.loc[ID].index[i]] | |
310 break | |
311 if bacteria in self.all_bacteria: | |
312 for ID in dataset.index: | |
313 if bacteria in ID: | |
314 for i in range(len(dataset.loc[ID].index)): | |
315 if 'bact' in dataset.loc[ID].index[i]: | |
316 temp_solution[0][i] = dataset.loc[ID, dataset.loc[ID].index[i]] | |
317 break | |
318 if type(solution) != np.ndarray: | |
319 solution = temp_solution | |
320 else: | |
321 solution = np.append(solution, temp_solution, axis=0) | |
322 # solution = solution.reshape(1, -1) | |
323 solution = scaler.transform(solution) | |
324 | |
325 if ml_model == 'RandomForests': | |
326 from sklearn.ensemble import RandomForestClassifier | |
327 clf = RandomForestClassifier(n_estimators=200, bootstrap=False, criterion='gini', min_samples_leaf=2, min_samples_split=4, oob_score=False) | |
328 clf = clf.fit(data_z, output) | |
329 elif ml_model == 'SVM': | |
330 from sklearn.svm import SVC | |
331 clf = SVC(C=10, degree=2, gamma='auto', kernel='rbf') | |
332 clf = clf.fit(data_z, output) | |
333 pred = clf.predict(solution) | |
334 pred = list(le.inverse_transform(pred)) | |
335 return pred | |
336 | |
337 def create_output(self, output, phage_seqs, bact_seqs): | |
338 import pandas as pd | |
339 list_orgs = [] | |
340 for phage in phage_seqs: | |
341 for bact in bact_seqs: | |
342 list_orgs.append(phage + ' - ' + bact) | |
343 file = pd.DataFrame({'Phage - Bacteria': list_orgs, 'Infects': output}) | |
344 file.to_csv('files/output.tsv', sep='\t', index=False, header=True) | |
345 file.to_csv(or_location + '/output.tsv', sep='\t', index=False, header=True) | |
346 | |
347 | |
348 if __name__ == '__main__': | |
349 import sys | |
350 import os | |
351 global or_location | |
352 or_location = os.getcwd() | |
353 os.chdir(os.path.dirname(__file__)) | |
354 | |
355 phage_input_type = sys.argv[1] | |
356 Phages = sys.argv[2] | |
357 bact_input_type = sys.argv[3] | |
358 Bacts = sys.argv[4] | |
359 run_interpro = sys.argv[5] | |
360 if run_interpro == 'True': | |
361 run_interpro = True | |
362 else: | |
363 run_interpro = False | |
364 model = sys.argv[6] | |
365 GalaxyPrediction(phage_input_type=phage_input_type, bact_input_type=bact_input_type, phage=Phages, bacteria=Bacts, ml_model=model, run_interpro=run_interpro) | |
366 # rg = GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_050154', bacteria='NC_007414,NZ_MK033499,NZ_CP031214') | |
367 # GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_031087,NC_049833,NC_049838,NC_049444', bacteria='LR133964', ml_model='SVM') |