2
|
1
|
|
2 class DomainSearch:
|
|
3
|
|
4 def __init__(self):
|
|
5 '''
|
|
6 This still needs a bit of modifications
|
|
7 :param phagesProteins: protein function and sequences, as provided in NCBI. Each phage ID has every protein represented with a dicionary with keys as protein IDs
|
|
8 :param phageDomains: for each phage and each of it's proteins, a list of predicted domains is given. If unavailable, it returns an empty list
|
|
9 '''
|
|
10 import json
|
|
11 import pandas as pd
|
|
12 with open('files/phagesProteins.json', encoding='utf-8') as F:
|
|
13 self.phagesProteins = json.loads(F.read())
|
|
14 # with open('files/bactProteins.json', encoding='utf-8') as F: # For later use, implement the same way as phage, more or less. Include psort
|
|
15 # self.bacProt = json.loads(F.read())
|
|
16 data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
|
|
17 for phage in data.index:
|
|
18 if data.loc[phage, 'Host_ID'] == '[]':
|
|
19 try: del self.phagesProteins[phage]
|
|
20 except: pass
|
|
21 self._filter_phage()
|
|
22
|
|
23 def _create_fasta(self, dic, name):
|
|
24 '''
|
|
25 Creates a fasta file containing every protein sequence for a given dictionary.
|
|
26 :return:
|
|
27 '''
|
|
28 with open('files/' + name, 'w') as F:
|
|
29 for org in dic:
|
|
30 for prot in dic[org]:
|
|
31 F.write('>' + org + '-' + prot + '\n' + dic[org][prot][1] + '\n')
|
|
32
|
|
33 def _filter_phage(self):
|
|
34 self.known_function = {}
|
|
35 self.unknown_function = {}
|
|
36 for phage in self.phagesProteins.keys():
|
|
37 self.known_function[phage] = {}
|
|
38 self.unknown_function[phage] = {}
|
|
39 for prot in self.phagesProteins[phage].keys():
|
|
40 func = self.phagesProteins[phage][prot][0]
|
|
41 if (not any(i in func.lower() for i in ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']) and len(func) > 3) and not ('gp' in func.lower() and len(func.split(' ')) < 2) and not (len(func.split(' ')) == 1 and len(func) < 5):
|
|
42 self.known_function[phage][prot] = self.phagesProteins[phage][prot]
|
|
43 else:
|
|
44 self.unknown_function[phage][prot] = self.phagesProteins[phage][prot]
|
|
45
|
|
46 def scanInterPro(self, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/'):
|
|
47 '''
|
|
48 Creates a fasta file containing every protein and scans it using Interproscan. Creates a tsv file
|
|
49 :param InterPro_path: path to the interproscan executable
|
|
50 :param out_path: path to save the tsv output
|
|
51 :return: domains_output.tsv, a file that contains the domain associated with each protein
|
|
52 '''
|
|
53 import os
|
|
54 self._create_fasta(self.unknown_function, 'unknown_phages.fasta')
|
|
55 os.system(InterPro_path + 'interproscan.sh -b ' + out_path + '/interpro/domains_output -i ' + out_path + 'unknown_phages.fasta -f tsv')
|
|
56
|
|
57 def iter_interpro(self, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/interpro/'):
|
|
58 import os
|
|
59 from pathlib import Path
|
|
60 count = 0
|
|
61 F = open('files/interpro/temp_100.fasta', 'w')
|
|
62 for phage in self.unknown_function:
|
|
63 for prot in self.unknown_function[phage]:
|
|
64 count += 1
|
|
65 my_file = Path("files/interpro/domains_output" + str(count) + ".tsv")
|
|
66 if count % 100 == 0 and not my_file.is_file():
|
|
67 F.write('>' + prot + '\n' + self.unknown_function[phage][prot][1] + '\n')
|
|
68 F.close()
|
|
69 os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'domains_output' + str(count) + ' -i ' + out_path + 'temp_100.fasta -f tsv')
|
|
70 F = open('files/interpro/temp_100.fasta', 'w')
|
|
71 else:
|
|
72 F.write('>' + prot + '\n' + self.unknown_function[phage][prot][1] + '\n')
|
|
73 if count % 100 != 0:
|
|
74 F.close()
|
|
75 os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'domains_output' + str(count) + ' -i ' + out_path + 'temp_100.fasta -f tsv')
|
|
76
|
|
77 def processInterPro(self):
|
|
78 '''
|
|
79 Processes the tsv file created from scanInterPro. Domains are saved in the protdomains variable.
|
|
80 :return: phageDomains, a dictionary that, for each protein in a given species, has domains associated
|
|
81 '''
|
|
82 import os
|
|
83 from pathlib import Path
|
|
84 import pandas as pd
|
|
85 import re
|
|
86 my_file = Path("files/interpro/domains_output.tsv")
|
|
87 if not my_file.is_file():
|
|
88 with open('files/interpro/domains_output.tsv', 'w') as F:
|
|
89 for file in os.listdir('files/interpro/'):
|
|
90 if 'temp_100' not in file:
|
|
91 with open('files/interpro/' + file, 'r') as f:
|
|
92 F.write(f.read())
|
|
93 domains = pd.read_csv('files/interpro/domains_output.tsv', sep='\t', index_col=0, header=None, names=list(range(13)))
|
|
94 domains = domains.fillna('-')
|
|
95 domains = domains[domains.loc[:, 3] != 'Coils']
|
|
96 domains = domains[domains.loc[:, 3] != 'MobiDBLite']
|
|
97 # domains = domains.groupby(domains.index).last()
|
|
98 add_domains = {}
|
|
99 for spec in self.phagesProteins:
|
|
100 for prot in self.phagesProteins[spec]:
|
|
101 if prot in domains.index:
|
|
102 temp = '-'
|
|
103 try:
|
|
104 for i in range(domains.loc[prot, :].shape[0]):
|
|
105 if '-' not in domains.loc[prot, 12].iloc[i].lower():
|
|
106 if float(domains.loc[id, 8].iloc[i]) < 1.0:
|
|
107 temp = domains.loc[id, 12].iloc[i]
|
|
108 break
|
|
109 except:
|
|
110 if float(domains.loc[id, 8]) < 1.0:
|
|
111 temp = domains.loc[id, 12]
|
|
112 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover
|
|
113 if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x:
|
|
114 if temp not in add_domains.keys():
|
|
115 add_domains[temp] = input('Add function: ' + temp).lower()
|
|
116 if 'y' in add_domains[temp]:
|
|
117 self.phagesProteins[spec][prot][0] = temp
|
|
118 else:
|
|
119 try:
|
|
120 for i in range(domains.loc[prot, :].shape[0]):
|
|
121 if '-' not in domains.loc[prot, 5].iloc[i].lower():
|
|
122 temp = domains.loc[prot, 5].iloc[i]
|
|
123 break
|
|
124 except:
|
|
125 temp = domains.loc[prot, 5]
|
|
126 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp)
|
|
127 if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x:
|
|
128 if temp not in add_domains.keys():
|
|
129 add_domains[temp] = input('Add function: ' + temp).lower()
|
|
130 if 'y' in add_domains[temp]:
|
|
131 self.phagesProteins[spec][prot][0] = temp
|
|
132
|
|
133 def find_domains_interpro(self, dic):
|
|
134 import os
|
|
135 import pandas as pd
|
|
136 import re
|
|
137 InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/'
|
|
138 out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/WholeProcess/files/'
|
|
139 with open('files/SinglePhageProteins.fasta', 'w') as F:
|
|
140 for prot in dic.keys():
|
|
141 F.write('>' + dic[prot][0] + '\n' + dic[prot][1] + '\n')
|
|
142 os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'single_phage_domains -i ' + out_path + 'SinglePhageProteins.fasta -f tsv')
|
|
143
|
|
144 domains = pd.read_csv('files/single_phage_domains.tsv', sep='\t', index_col=0, header=None, names=list(range(13)))
|
|
145 domains = domains.fillna('-')
|
|
146 for prot in dic:
|
|
147 if prot in domains.index:
|
|
148 temp = '-'
|
|
149 try:
|
|
150 for i in range(domains.loc[prot, :].shape[0]):
|
|
151 if 'coil' not in domains.loc[prot, 12].iloc[i].lower() and '-' not in domains.loc[prot, 12].iloc[i].lower():
|
|
152 temp = domains.loc[prot, 12].iloc[i]
|
|
153 break
|
|
154 except:
|
|
155 temp = domains.loc[prot, 12]
|
|
156 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover
|
|
157 if temp != '-' and 'unknown' not in temp and 'UCP' not in temp and len(temp)>3 and not x:
|
|
158 dic[prot][0] = temp
|
|
159 else:
|
|
160 try:
|
|
161 for i in range(domains.loc[prot, :].shape[0]):
|
|
162 if 'coil' not in domains.loc[prot, 5].iloc[i].lower() and '-' not in domains.loc[prot, 12].iloc[i].lower():
|
|
163 temp = domains.loc[prot, 5].iloc[i]
|
|
164 break
|
|
165 except:
|
|
166 temp = domains.loc[prot, 5]
|
|
167 x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp)
|
|
168 if temp != '-' and 'unknown' not in temp and 'UCP' not in temp and len(temp) > 3 and not x:
|
|
169 dic[prot][0] = temp
|
|
170 return dic
|
|
171
|
|
172 def fillDomainsBLAST(self):
|
|
173 '''
|
|
174 Using the NCBIWWW package, it searches for domains with BLAST. Domains are saved in the protdomains variable.
|
|
175 :return: phageDomains, a dictionary that, for each protein in a given species, has domains associated
|
|
176 '''
|
|
177 print('Finding functions/domains with BLAST')
|
|
178 from Bio.Blast import NCBIWWW
|
|
179 from Bio.Blast import NCBIXML
|
|
180 import pickle
|
|
181 from pathlib import Path
|
|
182 my_file = Path("files/phage_list_blast")
|
|
183 if my_file.is_file():
|
|
184 with open('files/phage_list_blast', 'rb') as f:
|
|
185 list_done = pickle.load(f)
|
|
186 else:
|
|
187 list_done = []
|
|
188 for spec in self.phagesProteins:
|
|
189 if spec not in list_done:
|
|
190 for prot in self.phagesProteins[spec]:
|
|
191 if 'hypothetical' in self.phagesProteins[spec][prot][0].lower() or 'uncharacterized' in self.phagesProteins[spec][prot][0].lower() or 'unknown' in self.phagesProteins[spec][prot][0].lower():
|
|
192 # if not self.phageDomains[bac][prot]:
|
|
193 result_handle = NCBIWWW.qblast('blastp', 'nr', self.phagesProteins[spec][prot][1], entrez_query='Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)')
|
|
194 blastout = NCBIXML.read(result_handle)
|
|
195 for ali in blastout.alignments:
|
|
196 if 'hypothetical' not in ali.hit_def.lower() and 'uncharacterized' not in ali.hit_def.lower():
|
|
197 print(ali.hit_def[:ali.hit_def.find(' [')])
|
|
198 self.phagesProteins[spec][prot][0] = ali.hit_def[:ali.hit_def.find(' [')]
|
|
199 break
|
|
200 list_done.append(spec)
|
|
201 with open('files/phage_list_blast', 'wb') as f:
|
|
202 pickle.dump(list_done, f)
|
|
203 self.saveDomains()
|
|
204
|
|
205 def find_domains_blast(self, dic):
|
|
206 from Bio.Blast import NCBIWWW
|
|
207 from Bio.Blast import NCBIXML
|
|
208
|
|
209 for prot in dic.keys():
|
|
210 if 'hypothetical' in dic[prot][0].lower() or 'uncharacterized' in dic[prot][0].lower() or 'unknown' in dic[prot][0].lower():
|
|
211 result_handle = NCBIWWW.qblast('blastp', 'nr', prot, entrez_query='Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)')
|
|
212 blastout = NCBIXML.read(result_handle)
|
|
213 for ali in blastout.alignments:
|
|
214 if 'hypothetical' not in ali.hit_def.lower() and 'uncharacterized' not in ali.hit_def.lower():
|
|
215 print(ali.hit_def[:ali.hit_def.find(' [')])
|
|
216 self.phagesProteins[spec][prot][0] = ali.hit_def[:ali.hit_def.find(' [')]
|
|
217 break
|
|
218 return dic
|
|
219
|
|
220 def fillDomainsUniProt(self):
|
|
221 '''
|
|
222 Using the UniProt website, similar sequences are obtained and the ones with function assigned are saved into the domains. Domains are saved in the protdomains variable.
|
|
223 :return: phageDomains, a dictionary that, for each protein in a given species, has domains associated
|
|
224 '''
|
|
225 print('Finding functions/domains with UniProt')
|
|
226 import requests
|
|
227 import pickle
|
|
228 from pathlib import Path
|
|
229 my_file = Path("files/phage_list_uniprot")
|
|
230 if my_file.is_file():
|
|
231 with open('files/phage_list_uniprot', 'rb') as f:
|
|
232 list_done = pickle.load(f)
|
|
233 else:
|
|
234 list_done = []
|
|
235 for phage in self.phagesProteins:
|
|
236 if phage not in list_done:
|
|
237 for accID in self.phagesProteins[phage]:
|
|
238 if 'hypothetical' in self.phagesProteins[phage][accID][0].lower() or 'uncharacterized' in self.phagesProteins[phage][accID][0].lower() or 'unknown' in self.phagesProteins[phage][accID][0].lower():
|
|
239 # if not self.phageDomains[phage][accID]:
|
|
240 fullURL = ('https://www.uniprot.org/uniprot/?query=' + accID + '&sort=score&format=list')
|
|
241 result = requests.get(fullURL)
|
|
242 uniprot_acc = result.text.strip()
|
|
243 fullURL = ('https://www.uniprot.org/uniprot/?query=cluster:(uniprot:' + uniprot_acc + '* identity:1.0) not id:' + uniprot_acc + '&format=txt')
|
|
244 result = requests.get(fullURL)
|
|
245 listResults = result.text.split('\n')
|
|
246 for entry in listResults:
|
|
247 if entry[:2] == 'DE':
|
|
248 start_pos = entry.find('Full=') + 5
|
|
249 end_pos = entry.find(' {ECO')
|
|
250 domain = entry[start_pos:end_pos]
|
|
251 if not any(z in domain.lower() for z in ['uncharacterized', 'flags', 'domain', 'bacteriophage protein', 'family protein', 'phage-like', 'phage protein', 'unassigned', 'orf', 'gene']) and len(domain) > 5:
|
|
252 print(domain)
|
|
253 self.phagesProteins[phage][accID][0] = domain
|
|
254 break
|
|
255 list_done.append(phage)
|
|
256 with open('files/phage_list_uniprot', 'wb') as f:
|
|
257 pickle.dump(list_done, f)
|
|
258 self.saveDomains()
|
|
259
|
|
260 def find_domains_uniprot(self, dic):
|
|
261 import requests
|
|
262 for accID in dic.keys():
|
|
263 if 'hypothetical' in dic[accID][0].lower() or 'uncharacterized' in dic[accID][0].lower() or 'unknown' in dic[accID][0].lower():
|
|
264 fullURL = ('https://www.uniprot.org/uniprot/?query=' + accID + '&sort=score&format=list')
|
|
265 result = requests.get(fullURL)
|
|
266 uniprot_acc = result.text.strip()
|
|
267 fullURL = ('https://www.uniprot.org/uniprot/?query=cluster:(uniprot:' + uniprot_acc + '* identity:1.0) not id:' + uniprot_acc + '&format=txt')
|
|
268 result = requests.get(fullURL)
|
|
269 listResults = result.text.split('\n')
|
|
270 for entry in listResults:
|
|
271 if entry[:2] == 'DE':
|
|
272 start_pos = entry.find('Full=') + 5
|
|
273 end_pos = entry.find(' {ECO')
|
|
274 domain = entry[start_pos:end_pos]
|
|
275 if not any(z in domain.lower() for z in ['uncharacterized', 'flags', 'domain', 'bacteriophage protein', 'family protein', 'phage-like', 'phage protein', 'unassigned', 'orf', 'gene']) and len(domain) > 5:
|
|
276 dic[accID][0] = domain
|
|
277 break
|
|
278 return dic
|
|
279
|
|
280 def cdHit(self):
|
|
281 import os
|
|
282 from pathlib import Path
|
|
283 my_file = Path('files/phagesProteins.fasta')
|
|
284 if not my_file.is_file():
|
|
285 self._create_fasta(self.phagesProteins, 'phagesProteins.fasta')
|
|
286 my_file = Path('files/complete_cdhit.clstr')
|
|
287 if not my_file.is_file():
|
|
288 os.system('cd-hit -i files/phagesProteins.fasta -d 50 -o files/complete_cdhit')
|
|
289 # clusters = {}
|
|
290 temp_cluster = []
|
|
291 list_found = []
|
|
292 found = False
|
|
293 with open('files/complete_cdhit.clstr', 'r') as f:
|
|
294 for line in f.readlines():
|
|
295 if '>Cluster' in line:
|
|
296 if temp_cluster and found:
|
|
297 if len(list_found) == 1:
|
|
298 function = list_found[0]
|
|
299 else:
|
|
300 x = int(input(str(list_found) + '\nChoose from 1 to ' + str(len(list_found)) + ': ')) - 1
|
|
301 function = list_found[x]
|
|
302 for clust in temp_cluster:
|
|
303 self.phagesProteins[clust[clust.find('-') + 1:]][clust[:clust.find('-')]][0] = function
|
|
304
|
|
305 temp_cluster = []
|
|
306 list_found = []
|
|
307 found = False
|
|
308 else:
|
|
309 pos_i = line.find('>') + 1
|
|
310 pos_f = line.find('...')
|
|
311 pos_m = line.find('-')
|
|
312 prot = line[pos_i:pos_m]
|
|
313 phage = line[pos_m + 1:pos_f]
|
|
314 if prot in self.known_function[phage].keys() and not found:
|
|
315 function = self.known_function[phage][prot][0]
|
|
316 list_found.append(function)
|
|
317 found = True
|
|
318 elif prot in self.known_function[phage].keys() and found:
|
|
319 if function != self.known_function[phage][prot][0] and self.known_function[phage][prot][0] not in list_found:
|
|
320 function = self.known_function[phage][prot][0]
|
|
321 list_found.append(function)
|
|
322 elif prot in self.unknown_function[phage].keys():
|
|
323 temp_cluster.append(line[pos_i:pos_f])
|
|
324
|
|
325 def create_blast_db(self):
|
|
326 import os
|
|
327 self._create_fasta(self.known_function, 'database_phages.fasta')
|
|
328 os.system('makeblastdb -in files/database_phages.fasta -dbtype prot -title PhageProts -parse_seqids -out files/database_phages')
|
|
329 self._create_fasta(self.unknown_function, 'unknown_phages.fasta')
|
|
330 os.system('blastp -db files/database_phages -query files/unknown_phages.fasta -out files/test_blast -num_threads 2 -outfmt 6')
|
|
331
|
|
332 def process_blastdb(self, blastdb):
|
|
333 import pandas as pd
|
|
334 blast_domains = pd.read_csv('files/' + blastdb, sep='\t', header=None)
|
|
335 for phage in self.unknown_function.keys():
|
|
336 for prot in self.unknown_function[phage]:
|
|
337 evalue = []
|
|
338 bitscore = []
|
|
339 pred = blast_domains[blast_domains[0] == phage + '-' + prot]
|
|
340 if pred.shape[0] == 0: break
|
|
341 for i in pred[10]:
|
|
342 evalue.append(float(i))
|
|
343 for i in pred[11]:
|
|
344 bitscore.append(float(i))
|
|
345 if min(evalue) < 1.0 and max(bitscore) > 30.0:
|
|
346 ind = evalue.index(min(evalue))
|
|
347 if ind != bitscore.index(max(bitscore)):
|
|
348 ind = bitscore.index(max(bitscore))
|
|
349 temp = pred.iloc[ind,1]
|
|
350 known_phage = temp[:temp.find('-')]
|
|
351 known_prot = temp[temp.find('-')+1:]
|
|
352 if self.known_function[known_phage][known_prot]:
|
|
353 new_func = self.known_function[known_phage][known_prot][0]
|
|
354 for j in self.known_function.keys():
|
|
355 if pred.iloc[ind,1] in self.known_function[j].keys():
|
|
356 new_func = self.known_function[j][pred.iloc[ind,1]][0]
|
|
357 break
|
|
358 self.phagesProteins[phage][prot][0] = new_func
|
|
359 self.saveDomains()
|
|
360
|
|
361 def extract_bact_location(self):
|
|
362 import pandas as pd
|
|
363 import ast
|
|
364 import requests
|
|
365 import re
|
|
366 from pathlib import Path
|
|
367 data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
|
|
368 all_bact = []
|
|
369 for i in data.index:
|
|
370 for bact in ast.literal_eval(data.loc[i, 'Host_ID']):
|
|
371 if bact[:-2] not in all_bact:
|
|
372 all_bact.append(bact[:-2])
|
|
373 fullURL = ('https://db.psort.org/downloads/precomputed?version=3.00')
|
|
374 result = requests.get(fullURL)
|
|
375 psort = result.text.strip()
|
|
376 urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/[a-z]+/\S+\"{1}', psort)
|
|
377 i = 1
|
|
378 while i < len(urls):
|
|
379 temp = urls[i]
|
|
380 bact = temp[temp.rfind('=') + 1:temp.find('"')]
|
|
381 if bact not in all_bact:
|
|
382 i += 3
|
|
383 else:
|
|
384 my_file = Path('files/psort/' + bact + ".faa.out")
|
|
385 if not my_file.is_file():
|
|
386 temp_url = urls[i+1].strip('"')
|
|
387 r = requests.get(temp_url)
|
|
388 with open('files/psort/' + bact + ".faa.out", 'wb') as f:
|
|
389 f.write(r.content)
|
|
390 i += 3
|
|
391
|
|
392 def create_fasta_psort(self):
|
|
393 from pathlib import Path
|
|
394 import json
|
|
395 import os
|
|
396 for bact in os.listdir('files/bacteria'):
|
|
397 my_file = Path('files/psort/' + bact[:-5] + '.faa.out')
|
|
398 if not my_file.is_file():
|
|
399 with open('files/bacteria/' + bact, encoding='utf-8') as F:
|
|
400 bact_prots = json.loads(F.read())
|
|
401 self._create_fasta(bact_prots, 'psort/' + bact[:-5] + '.fasta')
|
|
402 os.system('./psortb -n -i /home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/psort/' + bact[:-5] + '.fasta -r . -o long')
|
|
403 os.listdir('./psortb')
|
|
404 os.replace('', '/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/psort/' + bact[:-5] + '.faa.out') # move and rename output
|
|
405 os.remove('files/psort/' + bact[:-5] + '.fasta')
|
|
406
|
|
407 def saveDomains(self):
|
|
408 '''
|
|
409 Saves the protdomain variable in a file.
|
|
410 :return: SearchedDomains.json
|
|
411 '''
|
|
412 import json
|
|
413 with open('files/phagesProteins.json', 'w') as f:
|
|
414 json.dump(self.phagesProteins, f)
|
|
415 # with open('files/phagesProteins.fasta', 'w') as F:
|
|
416 # for phage in self.phagesProteins.keys():
|
|
417 # for prot in self.phagesProteins[phage]:
|
|
418 # F.write('>' + prot + '\n' + self.phagesProteins[phage][prot][1] + '\n')
|
|
419
|
|
420
|
|
421 if __name__ == '__main__':
|
|
422 test = DomainSearch()
|
|
423
|
|
424 test.extract_bact_location()
|
|
425 test.create_fasta_psort()
|
|
426
|
|
427 test.create_blast_db()
|
|
428 test.process_blastdb('test_blast')
|
|
429
|
|
430 test.cdHit()
|
|
431 test.scanInterPro()
|
|
432 test.processInterPro()
|
|
433
|
|
434 test.fillDomainsBLAST()
|
|
435 test.fillDomainsUniProt()
|
|
436 test.saveDomains()
|