Mercurial > repos > pedro_araujo > phage_host_prediction
comparison domain_search.py @ 0:e4b3fc88efe0 draft
Uploaded
author | pedro_araujo |
---|---|
date | Wed, 27 Jan 2021 13:50:11 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e4b3fc88efe0 |
---|---|
1 | |
2 class DomainSearch: | |
3 | |
4 def __init__(self): | |
5 ''' | |
6 This still needs a bit of modifications | |
7 :param phagesProteins: protein function and sequences, as provided in NCBI. Each phage ID has every protein represented with a dicionary with keys as protein IDs | |
8 :param phageDomains: for each phage and each of it's proteins, a list of predicted domains is given. If unavailable, it returns an empty list | |
9 ''' | |
10 import json | |
11 import pandas as pd | |
12 with open('files/phagesProteins.json', encoding='utf-8') as F: | |
13 self.phagesProteins = json.loads(F.read()) | |
14 # with open('files/bactProteins.json', encoding='utf-8') as F: # For later use, implement the same way as phage, more or less. Include psort | |
15 # self.bacProt = json.loads(F.read()) | |
16 data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0) | |
17 for phage in data.index: | |
18 if data.loc[phage, 'Host_ID'] == '[]': | |
19 try: del self.phagesProteins[phage] | |
20 except: pass | |
21 self._filter_phage() | |
22 | |
23 def _create_fasta(self, dic, name): | |
24 ''' | |
25 Creates a fasta file containing every protein sequence for a given dictionary. | |
26 :return: | |
27 ''' | |
28 with open('files/' + name, 'w') as F: | |
29 for org in dic: | |
30 for prot in dic[org]: | |
31 F.write('>' + org + '-' + prot + '\n' + dic[org][prot][1] + '\n') | |
32 | |
33 def _filter_phage(self): | |
34 self.known_function = {} | |
35 self.unknown_function = {} | |
36 for phage in self.phagesProteins.keys(): | |
37 self.known_function[phage] = {} | |
38 self.unknown_function[phage] = {} | |
39 for prot in self.phagesProteins[phage].keys(): | |
40 func = self.phagesProteins[phage][prot][0] | |
41 if (not any(i in func.lower() for i in ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']) and len(func) > 3) and not ('gp' in func.lower() and len(func.split(' ')) < 2) and not (len(func.split(' ')) == 1 and len(func) < 5): | |
42 self.known_function[phage][prot] = self.phagesProteins[phage][prot] | |
43 else: | |
44 self.unknown_function[phage][prot] = self.phagesProteins[phage][prot] | |
45 | |
46 def scanInterPro(self, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/'): | |
47 ''' | |
48 Creates a fasta file containing every protein and scans it using Interproscan. Creates a tsv file | |
49 :param InterPro_path: path to the interproscan executable | |
50 :param out_path: path to save the tsv output | |
51 :return: domains_output.tsv, a file that contains the domain associated with each protein | |
52 ''' | |
53 import os | |
54 self._create_fasta(self.unknown_function, 'unknown_phages.fasta') | |
55 os.system(InterPro_path + 'interproscan.sh -b ' + out_path + '/interpro/domains_output -i ' + out_path + 'unknown_phages.fasta -f tsv') | |
56 | |
57 def iter_interpro(self, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/interpro/'): | |
58 import os | |
59 from pathlib import Path | |
60 count = 0 | |
61 F = open('files/interpro/temp_100.fasta', 'w') | |
62 for phage in self.unknown_function: | |
63 for prot in self.unknown_function[phage]: | |
64 count += 1 | |
65 my_file = Path("files/interpro/domains_output" + str(count) + ".tsv") | |
66 if count % 100 == 0 and not my_file.is_file(): | |
67 F.write('>' + prot + '\n' + self.unknown_function[phage][prot][1] + '\n') | |
68 F.close() | |
69 os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'domains_output' + str(count) + ' -i ' + out_path + 'temp_100.fasta -f tsv') | |
70 F = open('files/interpro/temp_100.fasta', 'w') | |
71 else: | |
72 F.write('>' + prot + '\n' + self.unknown_function[phage][prot][1] + '\n') | |
73 if count % 100 != 0: | |
74 F.close() | |
75 os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'domains_output' + str(count) + ' -i ' + out_path + 'temp_100.fasta -f tsv') | |
76 | |
77 def processInterPro(self): | |
78 ''' | |
79 Processes the tsv file created from scanInterPro. Domains are saved in the protdomains variable. | |
80 :return: phageDomains, a dictionary that, for each protein in a given species, has domains associated | |
81 ''' | |
82 import os | |
83 from pathlib import Path | |
84 import pandas as pd | |
85 import re | |
86 my_file = Path("files/interpro/domains_output.tsv") | |
87 if not my_file.is_file(): | |
88 with open('files/interpro/domains_output.tsv', 'w') as F: | |
89 for file in os.listdir('files/interpro/'): | |
90 if 'temp_100' not in file: | |
91 with open('files/interpro/' + file, 'r') as f: | |
92 F.write(f.read()) | |
93 domains = pd.read_csv('files/interpro/domains_output.tsv', sep='\t', index_col=0, header=None, names=list(range(13))) | |
94 domains = domains.fillna('-') | |
95 domains = domains[domains.loc[:, 3] != 'Coils'] | |
96 domains = domains[domains.loc[:, 3] != 'MobiDBLite'] | |
97 # domains = domains.groupby(domains.index).last() | |
98 add_domains = {} | |
99 for spec in self.phagesProteins: | |
100 for prot in self.phagesProteins[spec]: | |
101 if prot in domains.index: | |
102 temp = '-' | |
103 try: | |
104 for i in range(domains.loc[prot, :].shape[0]): | |
105 if '-' not in domains.loc[prot, 12].iloc[i].lower(): | |
106 if float(domains.loc[id, 8].iloc[i]) < 1.0: | |
107 temp = domains.loc[id, 12].iloc[i] | |
108 break | |
109 except: | |
110 if float(domains.loc[id, 8]) < 1.0: | |
111 temp = domains.loc[id, 12] | |
112 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover | |
113 if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x: | |
114 if temp not in add_domains.keys(): | |
115 add_domains[temp] = input('Add function: ' + temp).lower() | |
116 if 'y' in add_domains[temp]: | |
117 self.phagesProteins[spec][prot][0] = temp | |
118 else: | |
119 try: | |
120 for i in range(domains.loc[prot, :].shape[0]): | |
121 if '-' not in domains.loc[prot, 5].iloc[i].lower(): | |
122 temp = domains.loc[prot, 5].iloc[i] | |
123 break | |
124 except: | |
125 temp = domains.loc[prot, 5] | |
126 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) | |
127 if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x: | |
128 if temp not in add_domains.keys(): | |
129 add_domains[temp] = input('Add function: ' + temp).lower() | |
130 if 'y' in add_domains[temp]: | |
131 self.phagesProteins[spec][prot][0] = temp | |
132 | |
133 def find_domains_interpro(self, dic): | |
134 import os | |
135 import pandas as pd | |
136 import re | |
137 InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/' | |
138 out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/WholeProcess/files/' | |
139 with open('files/SinglePhageProteins.fasta', 'w') as F: | |
140 for prot in dic.keys(): | |
141 F.write('>' + dic[prot][0] + '\n' + dic[prot][1] + '\n') | |
142 os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'single_phage_domains -i ' + out_path + 'SinglePhageProteins.fasta -f tsv') | |
143 | |
144 domains = pd.read_csv('files/single_phage_domains.tsv', sep='\t', index_col=0, header=None, names=list(range(13))) | |
145 domains = domains.fillna('-') | |
146 for prot in dic: | |
147 if prot in domains.index: | |
148 temp = '-' | |
149 try: | |
150 for i in range(domains.loc[prot, :].shape[0]): | |
151 if 'coil' not in domains.loc[prot, 12].iloc[i].lower() and '-' not in domains.loc[prot, 12].iloc[i].lower(): | |
152 temp = domains.loc[prot, 12].iloc[i] | |
153 break | |
154 except: | |
155 temp = domains.loc[prot, 12] | |
156 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover | |
157 if temp != '-' and 'unknown' not in temp and 'UCP' not in temp and len(temp)>3 and not x: | |
158 dic[prot][0] = temp | |
159 else: | |
160 try: | |
161 for i in range(domains.loc[prot, :].shape[0]): | |
162 if 'coil' not in domains.loc[prot, 5].iloc[i].lower() and '-' not in domains.loc[prot, 12].iloc[i].lower(): | |
163 temp = domains.loc[prot, 5].iloc[i] | |
164 break | |
165 except: | |
166 temp = domains.loc[prot, 5] | |
167 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) | |
168 if temp != '-' and 'unknown' not in temp and 'UCP' not in temp and len(temp) > 3 and not x: | |
169 dic[prot][0] = temp | |
170 return dic | |
171 | |
172 def fillDomainsBLAST(self): | |
173 ''' | |
174 Using the NCBIWWW package, it searches for domains with BLAST. Domains are saved in the protdomains variable. | |
175 :return: phageDomains, a dictionary that, for each protein in a given species, has domains associated | |
176 ''' | |
177 print('Finding functions/domains with BLAST') | |
178 from Bio.Blast import NCBIWWW | |
179 from Bio.Blast import NCBIXML | |
180 import pickle | |
181 from pathlib import Path | |
182 my_file = Path("files/phage_list_blast") | |
183 if my_file.is_file(): | |
184 with open('files/phage_list_blast', 'rb') as f: | |
185 list_done = pickle.load(f) | |
186 else: | |
187 list_done = [] | |
188 for spec in self.phagesProteins: | |
189 if spec not in list_done: | |
190 for prot in self.phagesProteins[spec]: | |
191 if 'hypothetical' in self.phagesProteins[spec][prot][0].lower() or 'uncharacterized' in self.phagesProteins[spec][prot][0].lower() or 'unknown' in self.phagesProteins[spec][prot][0].lower(): | |
192 # if not self.phageDomains[bac][prot]: | |
193 result_handle = NCBIWWW.qblast('blastp', 'nr', self.phagesProteins[spec][prot][1], entrez_query='Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)') | |
194 blastout = NCBIXML.read(result_handle) | |
195 for ali in blastout.alignments: | |
196 if 'hypothetical' not in ali.hit_def.lower() and 'uncharacterized' not in ali.hit_def.lower(): | |
197 print(ali.hit_def[:ali.hit_def.find(' [')]) | |
198 self.phagesProteins[spec][prot][0] = ali.hit_def[:ali.hit_def.find(' [')] | |
199 break | |
200 list_done.append(spec) | |
201 with open('files/phage_list_blast', 'wb') as f: | |
202 pickle.dump(list_done, f) | |
203 self.saveDomains() | |
204 | |
205 def find_domains_blast(self, dic): | |
206 from Bio.Blast import NCBIWWW | |
207 from Bio.Blast import NCBIXML | |
208 | |
209 for prot in dic.keys(): | |
210 if 'hypothetical' in dic[prot][0].lower() or 'uncharacterized' in dic[prot][0].lower() or 'unknown' in dic[prot][0].lower(): | |
211 result_handle = NCBIWWW.qblast('blastp', 'nr', prot, entrez_query='Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)') | |
212 blastout = NCBIXML.read(result_handle) | |
213 for ali in blastout.alignments: | |
214 if 'hypothetical' not in ali.hit_def.lower() and 'uncharacterized' not in ali.hit_def.lower(): | |
215 print(ali.hit_def[:ali.hit_def.find(' [')]) | |
216 self.phagesProteins[spec][prot][0] = ali.hit_def[:ali.hit_def.find(' [')] | |
217 break | |
218 return dic | |
219 | |
220 def fillDomainsUniProt(self): | |
221 ''' | |
222 Using the UniProt website, similar sequences are obtained and the ones with function assigned are saved into the domains. Domains are saved in the protdomains variable. | |
223 :return: phageDomains, a dictionary that, for each protein in a given species, has domains associated | |
224 ''' | |
225 print('Finding functions/domains with UniProt') | |
226 import requests | |
227 import pickle | |
228 from pathlib import Path | |
229 my_file = Path("files/phage_list_uniprot") | |
230 if my_file.is_file(): | |
231 with open('files/phage_list_uniprot', 'rb') as f: | |
232 list_done = pickle.load(f) | |
233 else: | |
234 list_done = [] | |
235 for phage in self.phagesProteins: | |
236 if phage not in list_done: | |
237 for accID in self.phagesProteins[phage]: | |
238 if 'hypothetical' in self.phagesProteins[phage][accID][0].lower() or 'uncharacterized' in self.phagesProteins[phage][accID][0].lower() or 'unknown' in self.phagesProteins[phage][accID][0].lower(): | |
239 # if not self.phageDomains[phage][accID]: | |
240 fullURL = ('https://www.uniprot.org/uniprot/?query=' + accID + '&sort=score&format=list') | |
241 result = requests.get(fullURL) | |
242 uniprot_acc = result.text.strip() | |
243 fullURL = ('https://www.uniprot.org/uniprot/?query=cluster:(uniprot:' + uniprot_acc + '* identity:1.0) not id:' + uniprot_acc + '&format=txt') | |
244 result = requests.get(fullURL) | |
245 listResults = result.text.split('\n') | |
246 for entry in listResults: | |
247 if entry[:2] == 'DE': | |
248 start_pos = entry.find('Full=') + 5 | |
249 end_pos = entry.find(' {ECO') | |
250 domain = entry[start_pos:end_pos] | |
251 if not any(z in domain.lower() for z in ['uncharacterized', 'flags', 'domain', 'bacteriophage protein', 'family protein', 'phage-like', 'phage protein', 'unassigned', 'orf', 'gene']) and len(domain) > 5: | |
252 print(domain) | |
253 self.phagesProteins[phage][accID][0] = domain | |
254 break | |
255 list_done.append(phage) | |
256 with open('files/phage_list_uniprot', 'wb') as f: | |
257 pickle.dump(list_done, f) | |
258 self.saveDomains() | |
259 | |
260 def find_domains_uniprot(self, dic): | |
261 import requests | |
262 for accID in dic.keys(): | |
263 if 'hypothetical' in dic[accID][0].lower() or 'uncharacterized' in dic[accID][0].lower() or 'unknown' in dic[accID][0].lower(): | |
264 fullURL = ('https://www.uniprot.org/uniprot/?query=' + accID + '&sort=score&format=list') | |
265 result = requests.get(fullURL) | |
266 uniprot_acc = result.text.strip() | |
267 fullURL = ('https://www.uniprot.org/uniprot/?query=cluster:(uniprot:' + uniprot_acc + '* identity:1.0) not id:' + uniprot_acc + '&format=txt') | |
268 result = requests.get(fullURL) | |
269 listResults = result.text.split('\n') | |
270 for entry in listResults: | |
271 if entry[:2] == 'DE': | |
272 start_pos = entry.find('Full=') + 5 | |
273 end_pos = entry.find(' {ECO') | |
274 domain = entry[start_pos:end_pos] | |
275 if not any(z in domain.lower() for z in ['uncharacterized', 'flags', 'domain', 'bacteriophage protein', 'family protein', 'phage-like', 'phage protein', 'unassigned', 'orf', 'gene']) and len(domain) > 5: | |
276 dic[accID][0] = domain | |
277 break | |
278 return dic | |
279 | |
280 def cdHit(self): | |
281 import os | |
282 from pathlib import Path | |
283 my_file = Path('files/phagesProteins.fasta') | |
284 if not my_file.is_file(): | |
285 self._create_fasta(self.phagesProteins, 'phagesProteins.fasta') | |
286 my_file = Path('files/complete_cdhit.clstr') | |
287 if not my_file.is_file(): | |
288 os.system('cd-hit -i files/phagesProteins.fasta -d 50 -o files/complete_cdhit') | |
289 # clusters = {} | |
290 temp_cluster = [] | |
291 list_found = [] | |
292 found = False | |
293 with open('files/complete_cdhit.clstr', 'r') as f: | |
294 for line in f.readlines(): | |
295 if '>Cluster' in line: | |
296 if temp_cluster and found: | |
297 if len(list_found) == 1: | |
298 function = list_found[0] | |
299 else: | |
300 x = int(input(str(list_found) + '\nChoose from 1 to ' + str(len(list_found)) + ': ')) - 1 | |
301 function = list_found[x] | |
302 for clust in temp_cluster: | |
303 self.phagesProteins[clust[clust.find('-') + 1:]][clust[:clust.find('-')]][0] = function | |
304 | |
305 temp_cluster = [] | |
306 list_found = [] | |
307 found = False | |
308 else: | |
309 pos_i = line.find('>') + 1 | |
310 pos_f = line.find('...') | |
311 pos_m = line.find('-') | |
312 prot = line[pos_i:pos_m] | |
313 phage = line[pos_m + 1:pos_f] | |
314 if prot in self.known_function[phage].keys() and not found: | |
315 function = self.known_function[phage][prot][0] | |
316 list_found.append(function) | |
317 found = True | |
318 elif prot in self.known_function[phage].keys() and found: | |
319 if function != self.known_function[phage][prot][0] and self.known_function[phage][prot][0] not in list_found: | |
320 function = self.known_function[phage][prot][0] | |
321 list_found.append(function) | |
322 elif prot in self.unknown_function[phage].keys(): | |
323 temp_cluster.append(line[pos_i:pos_f]) | |
324 | |
325 def create_blast_db(self): | |
326 import os | |
327 self._create_fasta(self.known_function, 'database_phages.fasta') | |
328 os.system('makeblastdb -in files/database_phages.fasta -dbtype prot -title PhageProts -parse_seqids -out files/database_phages') | |
329 self._create_fasta(self.unknown_function, 'unknown_phages.fasta') | |
330 os.system('blastp -db files/database_phages -query files/unknown_phages.fasta -out files/test_blast -num_threads 2 -outfmt 6') | |
331 | |
332 def process_blastdb(self, blastdb): | |
333 import pandas as pd | |
334 blast_domains = pd.read_csv('files/' + blastdb, sep='\t', header=None) | |
335 for phage in self.unknown_function.keys(): | |
336 for prot in self.unknown_function[phage]: | |
337 evalue = [] | |
338 bitscore = [] | |
339 pred = blast_domains[blast_domains[0] == phage + '-' + prot] | |
340 if pred.shape[0] == 0: break | |
341 for i in pred[10]: | |
342 evalue.append(float(i)) | |
343 for i in pred[11]: | |
344 bitscore.append(float(i)) | |
345 if min(evalue) < 1.0 and max(bitscore) > 30.0: | |
346 ind = evalue.index(min(evalue)) | |
347 if ind != bitscore.index(max(bitscore)): | |
348 ind = bitscore.index(max(bitscore)) | |
349 temp = pred.iloc[ind,1] | |
350 known_phage = temp[:temp.find('-')] | |
351 known_prot = temp[temp.find('-')+1:] | |
352 if self.known_function[known_phage][known_prot]: | |
353 new_func = self.known_function[known_phage][known_prot][0] | |
354 for j in self.known_function.keys(): | |
355 if pred.iloc[ind,1] in self.known_function[j].keys(): | |
356 new_func = self.known_function[j][pred.iloc[ind,1]][0] | |
357 break | |
358 self.phagesProteins[phage][prot][0] = new_func | |
359 self.saveDomains() | |
360 | |
361 def extract_bact_location(self): | |
362 import pandas as pd | |
363 import ast | |
364 import requests | |
365 import re | |
366 from pathlib import Path | |
367 data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0) | |
368 all_bact = [] | |
369 for i in data.index: | |
370 for bact in ast.literal_eval(data.loc[i, 'Host_ID']): | |
371 if bact[:-2] not in all_bact: | |
372 all_bact.append(bact[:-2]) | |
373 fullURL = ('https://db.psort.org/downloads/precomputed?version=3.00') | |
374 result = requests.get(fullURL) | |
375 psort = result.text.strip() | |
376 urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/[a-z]+/\S+\"{1}', psort) | |
377 i = 1 | |
378 while i < len(urls): | |
379 temp = urls[i] | |
380 bact = temp[temp.rfind('=') + 1:temp.find('"')] | |
381 if bact not in all_bact: | |
382 i += 3 | |
383 else: | |
384 my_file = Path('files/psort/' + bact + ".faa.out") | |
385 if not my_file.is_file(): | |
386 temp_url = urls[i+1].strip('"') | |
387 r = requests.get(temp_url) | |
388 with open('files/psort/' + bact + ".faa.out", 'wb') as f: | |
389 f.write(r.content) | |
390 i += 3 | |
391 | |
392 def create_fasta_psort(self): | |
393 from pathlib import Path | |
394 import json | |
395 import os | |
396 for bact in os.listdir('files/bacteria'): | |
397 my_file = Path('files/psort/' + bact[:-5] + '.faa.out') | |
398 if not my_file.is_file(): | |
399 with open('files/bacteria/' + bact, encoding='utf-8') as F: | |
400 bact_prots = json.loads(F.read()) | |
401 self._create_fasta(bact_prots, 'psort/' + bact[:-5] + '.fasta') | |
402 os.system('./psortb -n -i /home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/psort/' + bact[:-5] + '.fasta -r . -o long') | |
403 os.listdir('./psortb') | |
404 os.replace('', '/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/psort/' + bact[:-5] + '.faa.out') # move and rename output | |
405 os.remove('files/psort/' + bact[:-5] + '.fasta') | |
406 | |
407 def saveDomains(self): | |
408 ''' | |
409 Saves the protdomain variable in a file. | |
410 :return: SearchedDomains.json | |
411 ''' | |
412 import json | |
413 with open('files/phagesProteins.json', 'w') as f: | |
414 json.dump(self.phagesProteins, f) | |
415 # with open('files/phagesProteins.fasta', 'w') as F: | |
416 # for phage in self.phagesProteins.keys(): | |
417 # for prot in self.phagesProteins[phage]: | |
418 # F.write('>' + prot + '\n' + self.phagesProteins[phage][prot][1] + '\n') | |
419 | |
420 | |
421 if __name__ == '__main__': | |
422 test = DomainSearch() | |
423 | |
424 test.extract_bact_location() | |
425 test.create_fasta_psort() | |
426 | |
427 test.create_blast_db() | |
428 test.process_blastdb('test_blast') | |
429 | |
430 test.cdHit() | |
431 test.scanInterPro() | |
432 test.processInterPro() | |
433 | |
434 test.fillDomainsBLAST() | |
435 test.fillDomainsUniProt() | |
436 test.saveDomains() |