comparison phage_host_prediction/domain_search.py @ 2:3e1e8be4e65c draft default tip

Uploaded
author pedro_araujo
date Fri, 02 Apr 2021 10:11:13 +0000
parents
children
comparison
equal deleted inserted replaced
1:d9cda08472ea 2:3e1e8be4e65c
1
2 class DomainSearch:
3
4 def __init__(self):
5 '''
6 This still needs a bit of modifications
7 :param phagesProteins: protein function and sequences, as provided in NCBI. Each phage ID has every protein represented with a dicionary with keys as protein IDs
8 :param phageDomains: for each phage and each of it's proteins, a list of predicted domains is given. If unavailable, it returns an empty list
9 '''
10 import json
11 import pandas as pd
12 with open('files/phagesProteins.json', encoding='utf-8') as F:
13 self.phagesProteins = json.loads(F.read())
14 # with open('files/bactProteins.json', encoding='utf-8') as F: # For later use, implement the same way as phage, more or less. Include psort
15 # self.bacProt = json.loads(F.read())
16 data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
17 for phage in data.index:
18 if data.loc[phage, 'Host_ID'] == '[]':
19 try: del self.phagesProteins[phage]
20 except: pass
21 self._filter_phage()
22
23 def _create_fasta(self, dic, name):
24 '''
25 Creates a fasta file containing every protein sequence for a given dictionary.
26 :return:
27 '''
28 with open('files/' + name, 'w') as F:
29 for org in dic:
30 for prot in dic[org]:
31 F.write('>' + org + '-' + prot + '\n' + dic[org][prot][1] + '\n')
32
33 def _filter_phage(self):
34 self.known_function = {}
35 self.unknown_function = {}
36 for phage in self.phagesProteins.keys():
37 self.known_function[phage] = {}
38 self.unknown_function[phage] = {}
39 for prot in self.phagesProteins[phage].keys():
40 func = self.phagesProteins[phage][prot][0]
41 if (not any(i in func.lower() for i in ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']) and len(func) > 3) and not ('gp' in func.lower() and len(func.split(' ')) < 2) and not (len(func.split(' ')) == 1 and len(func) < 5):
42 self.known_function[phage][prot] = self.phagesProteins[phage][prot]
43 else:
44 self.unknown_function[phage][prot] = self.phagesProteins[phage][prot]
45
46 def scanInterPro(self, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/'):
47 '''
48 Creates a fasta file containing every protein and scans it using Interproscan. Creates a tsv file
49 :param InterPro_path: path to the interproscan executable
50 :param out_path: path to save the tsv output
51 :return: domains_output.tsv, a file that contains the domain associated with each protein
52 '''
53 import os
54 self._create_fasta(self.unknown_function, 'unknown_phages.fasta')
55 os.system(InterPro_path + 'interproscan.sh -b ' + out_path + '/interpro/domains_output -i ' + out_path + 'unknown_phages.fasta -f tsv')
56
57 def iter_interpro(self, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/interpro/'):
58 import os
59 from pathlib import Path
60 count = 0
61 F = open('files/interpro/temp_100.fasta', 'w')
62 for phage in self.unknown_function:
63 for prot in self.unknown_function[phage]:
64 count += 1
65 my_file = Path("files/interpro/domains_output" + str(count) + ".tsv")
66 if count % 100 == 0 and not my_file.is_file():
67 F.write('>' + prot + '\n' + self.unknown_function[phage][prot][1] + '\n')
68 F.close()
69 os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'domains_output' + str(count) + ' -i ' + out_path + 'temp_100.fasta -f tsv')
70 F = open('files/interpro/temp_100.fasta', 'w')
71 else:
72 F.write('>' + prot + '\n' + self.unknown_function[phage][prot][1] + '\n')
73 if count % 100 != 0:
74 F.close()
75 os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'domains_output' + str(count) + ' -i ' + out_path + 'temp_100.fasta -f tsv')
76
77 def processInterPro(self):
78 '''
79 Processes the tsv file created from scanInterPro. Domains are saved in the protdomains variable.
80 :return: phageDomains, a dictionary that, for each protein in a given species, has domains associated
81 '''
82 import os
83 from pathlib import Path
84 import pandas as pd
85 import re
86 my_file = Path("files/interpro/domains_output.tsv")
87 if not my_file.is_file():
88 with open('files/interpro/domains_output.tsv', 'w') as F:
89 for file in os.listdir('files/interpro/'):
90 if 'temp_100' not in file:
91 with open('files/interpro/' + file, 'r') as f:
92 F.write(f.read())
93 domains = pd.read_csv('files/interpro/domains_output.tsv', sep='\t', index_col=0, header=None, names=list(range(13)))
94 domains = domains.fillna('-')
95 domains = domains[domains.loc[:, 3] != 'Coils']
96 domains = domains[domains.loc[:, 3] != 'MobiDBLite']
97 # domains = domains.groupby(domains.index).last()
98 add_domains = {}
99 for spec in self.phagesProteins:
100 for prot in self.phagesProteins[spec]:
101 if prot in domains.index:
102 temp = '-'
103 try:
104 for i in range(domains.loc[prot, :].shape[0]):
105 if '-' not in domains.loc[prot, 12].iloc[i].lower():
106 if float(domains.loc[id, 8].iloc[i]) < 1.0:
107 temp = domains.loc[id, 12].iloc[i]
108 break
109 except:
110 if float(domains.loc[id, 8]) < 1.0:
111 temp = domains.loc[id, 12]
112 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover
113 if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x:
114 if temp not in add_domains.keys():
115 add_domains[temp] = input('Add function: ' + temp).lower()
116 if 'y' in add_domains[temp]:
117 self.phagesProteins[spec][prot][0] = temp
118 else:
119 try:
120 for i in range(domains.loc[prot, :].shape[0]):
121 if '-' not in domains.loc[prot, 5].iloc[i].lower():
122 temp = domains.loc[prot, 5].iloc[i]
123 break
124 except:
125 temp = domains.loc[prot, 5]
126 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp)
127 if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x:
128 if temp not in add_domains.keys():
129 add_domains[temp] = input('Add function: ' + temp).lower()
130 if 'y' in add_domains[temp]:
131 self.phagesProteins[spec][prot][0] = temp
132
133 def find_domains_interpro(self, dic):
134 import os
135 import pandas as pd
136 import re
137 InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/'
138 out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/WholeProcess/files/'
139 with open('files/SinglePhageProteins.fasta', 'w') as F:
140 for prot in dic.keys():
141 F.write('>' + dic[prot][0] + '\n' + dic[prot][1] + '\n')
142 os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'single_phage_domains -i ' + out_path + 'SinglePhageProteins.fasta -f tsv')
143
144 domains = pd.read_csv('files/single_phage_domains.tsv', sep='\t', index_col=0, header=None, names=list(range(13)))
145 domains = domains.fillna('-')
146 for prot in dic:
147 if prot in domains.index:
148 temp = '-'
149 try:
150 for i in range(domains.loc[prot, :].shape[0]):
151 if 'coil' not in domains.loc[prot, 12].iloc[i].lower() and '-' not in domains.loc[prot, 12].iloc[i].lower():
152 temp = domains.loc[prot, 12].iloc[i]
153 break
154 except:
155 temp = domains.loc[prot, 12]
156 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover
157 if temp != '-' and 'unknown' not in temp and 'UCP' not in temp and len(temp)>3 and not x:
158 dic[prot][0] = temp
159 else:
160 try:
161 for i in range(domains.loc[prot, :].shape[0]):
162 if 'coil' not in domains.loc[prot, 5].iloc[i].lower() and '-' not in domains.loc[prot, 12].iloc[i].lower():
163 temp = domains.loc[prot, 5].iloc[i]
164 break
165 except:
166 temp = domains.loc[prot, 5]
167 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp)
168 if temp != '-' and 'unknown' not in temp and 'UCP' not in temp and len(temp) > 3 and not x:
169 dic[prot][0] = temp
170 return dic
171
172 def fillDomainsBLAST(self):
173 '''
174 Using the NCBIWWW package, it searches for domains with BLAST. Domains are saved in the protdomains variable.
175 :return: phageDomains, a dictionary that, for each protein in a given species, has domains associated
176 '''
177 print('Finding functions/domains with BLAST')
178 from Bio.Blast import NCBIWWW
179 from Bio.Blast import NCBIXML
180 import pickle
181 from pathlib import Path
182 my_file = Path("files/phage_list_blast")
183 if my_file.is_file():
184 with open('files/phage_list_blast', 'rb') as f:
185 list_done = pickle.load(f)
186 else:
187 list_done = []
188 for spec in self.phagesProteins:
189 if spec not in list_done:
190 for prot in self.phagesProteins[spec]:
191 if 'hypothetical' in self.phagesProteins[spec][prot][0].lower() or 'uncharacterized' in self.phagesProteins[spec][prot][0].lower() or 'unknown' in self.phagesProteins[spec][prot][0].lower():
192 # if not self.phageDomains[bac][prot]:
193 result_handle = NCBIWWW.qblast('blastp', 'nr', self.phagesProteins[spec][prot][1], entrez_query='Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)')
194 blastout = NCBIXML.read(result_handle)
195 for ali in blastout.alignments:
196 if 'hypothetical' not in ali.hit_def.lower() and 'uncharacterized' not in ali.hit_def.lower():
197 print(ali.hit_def[:ali.hit_def.find(' [')])
198 self.phagesProteins[spec][prot][0] = ali.hit_def[:ali.hit_def.find(' [')]
199 break
200 list_done.append(spec)
201 with open('files/phage_list_blast', 'wb') as f:
202 pickle.dump(list_done, f)
203 self.saveDomains()
204
205 def find_domains_blast(self, dic):
206 from Bio.Blast import NCBIWWW
207 from Bio.Blast import NCBIXML
208
209 for prot in dic.keys():
210 if 'hypothetical' in dic[prot][0].lower() or 'uncharacterized' in dic[prot][0].lower() or 'unknown' in dic[prot][0].lower():
211 result_handle = NCBIWWW.qblast('blastp', 'nr', prot, entrez_query='Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)')
212 blastout = NCBIXML.read(result_handle)
213 for ali in blastout.alignments:
214 if 'hypothetical' not in ali.hit_def.lower() and 'uncharacterized' not in ali.hit_def.lower():
215 print(ali.hit_def[:ali.hit_def.find(' [')])
216 self.phagesProteins[spec][prot][0] = ali.hit_def[:ali.hit_def.find(' [')]
217 break
218 return dic
219
220 def fillDomainsUniProt(self):
221 '''
222 Using the UniProt website, similar sequences are obtained and the ones with function assigned are saved into the domains. Domains are saved in the protdomains variable.
223 :return: phageDomains, a dictionary that, for each protein in a given species, has domains associated
224 '''
225 print('Finding functions/domains with UniProt')
226 import requests
227 import pickle
228 from pathlib import Path
229 my_file = Path("files/phage_list_uniprot")
230 if my_file.is_file():
231 with open('files/phage_list_uniprot', 'rb') as f:
232 list_done = pickle.load(f)
233 else:
234 list_done = []
235 for phage in self.phagesProteins:
236 if phage not in list_done:
237 for accID in self.phagesProteins[phage]:
238 if 'hypothetical' in self.phagesProteins[phage][accID][0].lower() or 'uncharacterized' in self.phagesProteins[phage][accID][0].lower() or 'unknown' in self.phagesProteins[phage][accID][0].lower():
239 # if not self.phageDomains[phage][accID]:
240 fullURL = ('https://www.uniprot.org/uniprot/?query=' + accID + '&sort=score&format=list')
241 result = requests.get(fullURL)
242 uniprot_acc = result.text.strip()
243 fullURL = ('https://www.uniprot.org/uniprot/?query=cluster:(uniprot:' + uniprot_acc + '* identity:1.0) not id:' + uniprot_acc + '&format=txt')
244 result = requests.get(fullURL)
245 listResults = result.text.split('\n')
246 for entry in listResults:
247 if entry[:2] == 'DE':
248 start_pos = entry.find('Full=') + 5
249 end_pos = entry.find(' {ECO')
250 domain = entry[start_pos:end_pos]
251 if not any(z in domain.lower() for z in ['uncharacterized', 'flags', 'domain', 'bacteriophage protein', 'family protein', 'phage-like', 'phage protein', 'unassigned', 'orf', 'gene']) and len(domain) > 5:
252 print(domain)
253 self.phagesProteins[phage][accID][0] = domain
254 break
255 list_done.append(phage)
256 with open('files/phage_list_uniprot', 'wb') as f:
257 pickle.dump(list_done, f)
258 self.saveDomains()
259
260 def find_domains_uniprot(self, dic):
261 import requests
262 for accID in dic.keys():
263 if 'hypothetical' in dic[accID][0].lower() or 'uncharacterized' in dic[accID][0].lower() or 'unknown' in dic[accID][0].lower():
264 fullURL = ('https://www.uniprot.org/uniprot/?query=' + accID + '&sort=score&format=list')
265 result = requests.get(fullURL)
266 uniprot_acc = result.text.strip()
267 fullURL = ('https://www.uniprot.org/uniprot/?query=cluster:(uniprot:' + uniprot_acc + '* identity:1.0) not id:' + uniprot_acc + '&format=txt')
268 result = requests.get(fullURL)
269 listResults = result.text.split('\n')
270 for entry in listResults:
271 if entry[:2] == 'DE':
272 start_pos = entry.find('Full=') + 5
273 end_pos = entry.find(' {ECO')
274 domain = entry[start_pos:end_pos]
275 if not any(z in domain.lower() for z in ['uncharacterized', 'flags', 'domain', 'bacteriophage protein', 'family protein', 'phage-like', 'phage protein', 'unassigned', 'orf', 'gene']) and len(domain) > 5:
276 dic[accID][0] = domain
277 break
278 return dic
279
280 def cdHit(self):
281 import os
282 from pathlib import Path
283 my_file = Path('files/phagesProteins.fasta')
284 if not my_file.is_file():
285 self._create_fasta(self.phagesProteins, 'phagesProteins.fasta')
286 my_file = Path('files/complete_cdhit.clstr')
287 if not my_file.is_file():
288 os.system('cd-hit -i files/phagesProteins.fasta -d 50 -o files/complete_cdhit')
289 # clusters = {}
290 temp_cluster = []
291 list_found = []
292 found = False
293 with open('files/complete_cdhit.clstr', 'r') as f:
294 for line in f.readlines():
295 if '>Cluster' in line:
296 if temp_cluster and found:
297 if len(list_found) == 1:
298 function = list_found[0]
299 else:
300 x = int(input(str(list_found) + '\nChoose from 1 to ' + str(len(list_found)) + ': ')) - 1
301 function = list_found[x]
302 for clust in temp_cluster:
303 self.phagesProteins[clust[clust.find('-') + 1:]][clust[:clust.find('-')]][0] = function
304
305 temp_cluster = []
306 list_found = []
307 found = False
308 else:
309 pos_i = line.find('>') + 1
310 pos_f = line.find('...')
311 pos_m = line.find('-')
312 prot = line[pos_i:pos_m]
313 phage = line[pos_m + 1:pos_f]
314 if prot in self.known_function[phage].keys() and not found:
315 function = self.known_function[phage][prot][0]
316 list_found.append(function)
317 found = True
318 elif prot in self.known_function[phage].keys() and found:
319 if function != self.known_function[phage][prot][0] and self.known_function[phage][prot][0] not in list_found:
320 function = self.known_function[phage][prot][0]
321 list_found.append(function)
322 elif prot in self.unknown_function[phage].keys():
323 temp_cluster.append(line[pos_i:pos_f])
324
325 def create_blast_db(self):
326 import os
327 self._create_fasta(self.known_function, 'database_phages.fasta')
328 os.system('makeblastdb -in files/database_phages.fasta -dbtype prot -title PhageProts -parse_seqids -out files/database_phages')
329 self._create_fasta(self.unknown_function, 'unknown_phages.fasta')
330 os.system('blastp -db files/database_phages -query files/unknown_phages.fasta -out files/test_blast -num_threads 2 -outfmt 6')
331
332 def process_blastdb(self, blastdb):
333 import pandas as pd
334 blast_domains = pd.read_csv('files/' + blastdb, sep='\t', header=None)
335 for phage in self.unknown_function.keys():
336 for prot in self.unknown_function[phage]:
337 evalue = []
338 bitscore = []
339 pred = blast_domains[blast_domains[0] == phage + '-' + prot]
340 if pred.shape[0] == 0: break
341 for i in pred[10]:
342 evalue.append(float(i))
343 for i in pred[11]:
344 bitscore.append(float(i))
345 if min(evalue) < 1.0 and max(bitscore) > 30.0:
346 ind = evalue.index(min(evalue))
347 if ind != bitscore.index(max(bitscore)):
348 ind = bitscore.index(max(bitscore))
349 temp = pred.iloc[ind,1]
350 known_phage = temp[:temp.find('-')]
351 known_prot = temp[temp.find('-')+1:]
352 if self.known_function[known_phage][known_prot]:
353 new_func = self.known_function[known_phage][known_prot][0]
354 for j in self.known_function.keys():
355 if pred.iloc[ind,1] in self.known_function[j].keys():
356 new_func = self.known_function[j][pred.iloc[ind,1]][0]
357 break
358 self.phagesProteins[phage][prot][0] = new_func
359 self.saveDomains()
360
361 def extract_bact_location(self):
362 import pandas as pd
363 import ast
364 import requests
365 import re
366 from pathlib import Path
367 data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
368 all_bact = []
369 for i in data.index:
370 for bact in ast.literal_eval(data.loc[i, 'Host_ID']):
371 if bact[:-2] not in all_bact:
372 all_bact.append(bact[:-2])
373 fullURL = ('https://db.psort.org/downloads/precomputed?version=3.00')
374 result = requests.get(fullURL)
375 psort = result.text.strip()
376 urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/[a-z]+/\S+\"{1}', psort)
377 i = 1
378 while i < len(urls):
379 temp = urls[i]
380 bact = temp[temp.rfind('=') + 1:temp.find('"')]
381 if bact not in all_bact:
382 i += 3
383 else:
384 my_file = Path('files/psort/' + bact + ".faa.out")
385 if not my_file.is_file():
386 temp_url = urls[i+1].strip('"')
387 r = requests.get(temp_url)
388 with open('files/psort/' + bact + ".faa.out", 'wb') as f:
389 f.write(r.content)
390 i += 3
391
392 def create_fasta_psort(self):
393 from pathlib import Path
394 import json
395 import os
396 for bact in os.listdir('files/bacteria'):
397 my_file = Path('files/psort/' + bact[:-5] + '.faa.out')
398 if not my_file.is_file():
399 with open('files/bacteria/' + bact, encoding='utf-8') as F:
400 bact_prots = json.loads(F.read())
401 self._create_fasta(bact_prots, 'psort/' + bact[:-5] + '.fasta')
402 os.system('./psortb -n -i /home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/psort/' + bact[:-5] + '.fasta -r . -o long')
403 os.listdir('./psortb')
404 os.replace('', '/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/psort/' + bact[:-5] + '.faa.out') # move and rename output
405 os.remove('files/psort/' + bact[:-5] + '.fasta')
406
407 def saveDomains(self):
408 '''
409 Saves the protdomain variable in a file.
410 :return: SearchedDomains.json
411 '''
412 import json
413 with open('files/phagesProteins.json', 'w') as f:
414 json.dump(self.phagesProteins, f)
415 # with open('files/phagesProteins.fasta', 'w') as F:
416 # for phage in self.phagesProteins.keys():
417 # for prot in self.phagesProteins[phage]:
418 # F.write('>' + prot + '\n' + self.phagesProteins[phage][prot][1] + '\n')
419
420
421 if __name__ == '__main__':
422 test = DomainSearch()
423
424 test.extract_bact_location()
425 test.create_fasta_psort()
426
427 test.create_blast_db()
428 test.process_blastdb('test_blast')
429
430 test.cdHit()
431 test.scanInterPro()
432 test.processInterPro()
433
434 test.fillDomainsBLAST()
435 test.fillDomainsUniProt()
436 test.saveDomains()