0
|
1 """
|
|
2 https://www.ncbi.nlm.nih.gov/Sequin/acc.html
|
|
3 https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly
|
|
4 """
|
|
5
|
|
6
|
|
7 class PhageBacteriaData:
|
|
8
|
|
9 def __init__(self, dataset=None):
|
|
10 """
|
|
11 Imports a dataset from NCBI Virus, where the columns are Phage ID, Phage Name, Bacteria Name, Bacteria ID.
|
|
12 If a phage entry does not have a bacteria associated, it is deleted
|
|
13 :param dataset:
|
|
14 """
|
|
15 import pandas as pd
|
|
16 import json
|
|
17 self.listBacID = []
|
|
18 if dataset is None:
|
|
19 file = False
|
|
20 while not file:
|
|
21 try:
|
|
22 name = input('File name: ')
|
|
23 self.data = pd.read_csv('files/' + name, header=0, index_col=0)
|
|
24 file = True
|
|
25 except:
|
|
26 print('Couldn\'t find file')
|
|
27 else:
|
|
28 self.data = pd.read_csv('files/' + dataset, header=0, index_col=0)
|
|
29 self.data = self.data.dropna(how='any')
|
|
30 self.data = self.data[self.data['Host'] != 'unclassified bacterium']
|
|
31 index_remove = []
|
|
32 for i in range(len(self.data)):
|
|
33 if 'uncultured' in self.data.iloc[i]['Species']:
|
|
34 index_remove.append(i)
|
|
35 elif 'virus' not in self.data.iloc[i]['Species'] and 'phage' not in self.data.iloc[i]['Species']:
|
|
36 index_remove.append(i)
|
|
37 self.data = self.data.drop(self.data.index[index_remove])
|
|
38 index_remove = []
|
|
39 for i in range(len(self.data)):
|
|
40 temp = self.data['Host'][i].split(' ')
|
|
41 if len(temp) <= 1:
|
|
42 index_remove.append(i)
|
|
43 self.data = self.data.drop(self.data.index[index_remove])
|
|
44 if 'Host_ID' not in self.data.columns:
|
|
45 temp = []
|
|
46 for i in self.data.index:
|
|
47 temp.append([])
|
|
48 self.data['Host_ID'] = temp
|
|
49 try:
|
|
50 with open('files/searched_accessions', 'r') as f:
|
|
51 self.searched = json.loads(f.read())
|
|
52 except:
|
|
53 self.searched = {}
|
|
54
|
|
55 def addPhageName(self):
|
|
56 """
|
|
57 Using the entrez service, from NCBI, for each phage the name is added, from its features.
|
|
58 :return:
|
|
59 """
|
|
60 from Bio import Entrez
|
|
61 from Bio import SeqIO
|
|
62 Entrez.email = "pedro_araujo97@hotmail.com"
|
|
63 listNames = []
|
|
64 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=self.data.index) as handle:
|
|
65 for seq_record in SeqIO.parse(handle, "gb"):
|
|
66 listNames.append(seq_record.annotations['organism'])
|
|
67 self.data['Species'] = listNames
|
|
68
|
|
69 def addBacteriaName(self):
|
|
70 """
|
|
71 Using the entrez service, from NCBI, for each phage the infecting bacteria name is added, from its features.
|
|
72 :return:
|
|
73 """
|
|
74 from Bio import Entrez
|
|
75 from Bio import SeqIO
|
|
76 Entrez.email = "pedro_araujo97@hotmail.com"
|
|
77 for phage in self.data.index:
|
|
78 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=phage) as handle:
|
|
79 seq_record = SeqIO.read(handle, "gb")
|
|
80 try:
|
|
81 if len(seq_record.features[0].qualifiers['host'][0].split(' ')) > 2:
|
|
82 self.data.loc[phage, 'Host'] = seq_record.features[0].qualifiers['host'][0]
|
|
83 except:
|
|
84 if len(seq_record.features[0].qualifiers['lab_host'][0].split(' ')) > 2:
|
|
85 self.data.loc[phage, 'Host'] = seq_record.features[0].qualifiers['lab_host'][0]
|
|
86 self.save_data()
|
|
87
|
|
88 def addBacteriaGenome(self):
|
|
89 """
|
|
90 For each phage, the associated scientific articles in pubmed are looked. From theses pubmed articles, all associated IDs are extracted.
|
|
91 If the ID corresponds to a bacterial strain, its accession ID is added to the Bacteria ID column.
|
|
92 :return:
|
|
93 """
|
|
94 from Bio import Entrez
|
|
95 import ast
|
|
96 import pickle
|
|
97 from pathlib import Path
|
|
98 Entrez.email = "pedro_araujo97@hotmail.com"
|
|
99 my_file = Path("files/searched_hosts")
|
|
100 if my_file.is_file():
|
|
101 with open('files/searched_hosts', 'rb') as f:
|
|
102 list_done = pickle.load(f)
|
|
103 else:
|
|
104 list_done = []
|
|
105 count = 0
|
|
106 try:
|
|
107 for phageID in self.data.index:
|
|
108 if phageID in list_done: continue
|
|
109 listBactID = ast.literal_eval(self.data.loc[phageID, 'Host_ID'])
|
|
110 try:
|
|
111 with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
|
|
112 pubmed = Entrez.read(handle)
|
|
113 for i in pubmed[0]['LinkSetDb']:
|
|
114 if 'weighted' not in i["LinkName"]:
|
|
115 for link in i["Link"]:
|
|
116 try:
|
|
117 with Entrez.elink(dbfrom='pubmed', db="nucleotide", id=link['Id']) as handle:
|
|
118 genomes = Entrez.read(handle)
|
|
119 for i in genomes[0]['LinkSetDb']:
|
|
120 if 'weighted' not in i['LinkName']:
|
|
121 for id in i['Link']:
|
|
122 with Entrez.esummary(db='nucleotide', id=id['Id']) as handle:
|
|
123 bacorg = Entrez.read(handle)
|
|
124 if bacorg[0]['Caption'] != phageID and 'phage' not in bacorg[0][
|
|
125 'Title'].lower() \
|
|
126 and bacorg[0][
|
|
127 'AccessionVersion'] not in listBactID and 'cds' not in \
|
|
128 bacorg[0]['Title'].lower() and 'shotgun' not in bacorg[0][
|
|
129 'Title'].lower():
|
|
130 if any(z in bacorg[0]['AccessionVersion'][:3] for z in
|
|
131 ['NC_', 'AC_', 'NZ_', 'CP', 'AE', 'CY', 'AP']):
|
|
132 listBactID.append(bacorg[0]['AccessionVersion'])
|
|
133 self.searched[bacorg[0]['AccessionVersion']] = 'yes'
|
|
134 count += 1
|
|
135 elif not any(z in bacorg[0]['AccessionVersion'][:3] for z in
|
|
136 ['MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP',
|
|
137 'NM_', 'KC', 'MH', 'AY', 'FN', 'AY']) \
|
|
138 and 'complete' in bacorg[0]['Title'].lower():
|
|
139 if bacorg[0]['AccessionVersion'] in self.searched.keys():
|
|
140 add = self.searched[bacorg[0]['AccessionVersion']]
|
|
141 else:
|
|
142 add = input('Check ' + bacorg[0][
|
|
143 'AccessionVersion'] + '\nDo you wish to add it? (yes/no)')
|
|
144 if 'y' in add.lower():
|
|
145 listBactID.append(bacorg[0]['AccessionVersion'])
|
|
146 self.searched[bacorg[0]['AccessionVersion']] = 'yes'
|
|
147 count += 1
|
|
148 else:
|
|
149 self.searched[bacorg[0]['AccessionVersion']] = 'no'
|
|
150 except:
|
|
151 pass
|
|
152 except:
|
|
153 pass
|
|
154 self.data.loc[phageID, 'Host_ID'] = listBactID
|
|
155 list_done.append(phageID)
|
|
156 with open('files/searched_hosts', 'wb') as f:
|
|
157 pickle.dump(list_done, f)
|
|
158 self.save_data()
|
|
159 print(phageID)
|
|
160 except:
|
|
161 print('Bacterial host name missing. Searching from phage id')
|
|
162 pass
|
|
163 print('For future reference,', count, "new bacterial ID's were added.")
|
|
164 self.save_data()
|
|
165
|
|
166 def checkAbstracts(self):
|
|
167 """
|
|
168 For each phage, the associated scientific articles in pubmed are looked. From theses pubmed articles, the abstracted is searched for mentions of bacterial strains.
|
|
169 If bacterial strains are found, its accession IDs are added to the Bacteria ID column.
|
|
170 :return:
|
|
171 """
|
|
172 from Bio import Entrez
|
|
173 import re
|
|
174 import ast
|
|
175 Entrez.email = 'pedro_araujo97@hotmail.com'
|
|
176 count = 0
|
|
177 for phageID in self.data.index:
|
|
178 if len(self.data.loc[phageID, 'Host'].split()) < 3:
|
|
179 with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
|
|
180 pubmed = Entrez.read(handle)
|
|
181 listBactID = ast.literal_eval(self.data.loc[phageID, 'Host_ID'])
|
|
182 for i in pubmed[0]['LinkSetDb']:
|
|
183 if 'weighted' not in i["LinkName"]:
|
|
184 for link in i["Link"]:
|
|
185 try:
|
|
186 with Entrez.efetch(db="pubmed", rettype="medline", retmode="xml",
|
|
187 id=link['Id']) as handle:
|
|
188 article = Entrez.read(handle)
|
|
189 abstract = \
|
|
190 article['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
|
|
191 x = re.findall('\w{0,1}[A-Z]{1,5}[0-9]{1,5}[-,:]{0,1}[A-Z]{0,5}[1-9]{0,5}', abstract)
|
|
192 for i in range(len(x)):
|
|
193 x[i] = x[i].strip(',;')
|
|
194 x = list(set(x))
|
|
195 for i in x:
|
|
196 if 'ORF' in i:
|
|
197 x.remove(i)
|
|
198 for strain in x:
|
|
199 with Entrez.esearch(db='nucleotide', term='((((' + self.data.loc[
|
|
200 phageID, 'Host'] + ' ' + strain + ' AND complete sequence) NOT shotgun[Title]) NOT phage[Title]) NOT cds[Title]) NOT gene[Title]',
|
|
201 idtype="acc") as handle:
|
|
202 species = Entrez.read(handle)
|
|
203 strains = species['IdList']
|
|
204 for i in strains:
|
|
205 if any(z in i for z in ['NC_', 'NZ_', 'AC_', 'CP', 'AE', 'CY',
|
|
206 'AP']) and i not in listBactID:
|
|
207 listBactID.append(i)
|
|
208 self.searched[i] = 'yes'
|
|
209 count += 1
|
|
210 elif not any(z in i[:3] for z in
|
|
211 ['MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC',
|
|
212 'MH', 'AY', 'FN', 'AY']) and i not in listBactID:
|
|
213 if i in self.searched.keys():
|
|
214 add = self.searched[i]
|
|
215 else:
|
|
216 add = input('Check ' + i + '\nDo you wish to add it? (yes/no)')
|
|
217 if 'y' in add.lower():
|
|
218 listBactID.append(i)
|
|
219 self.searched[i] = 'yes'
|
|
220 count += 1
|
|
221 else:
|
|
222 self.searched = 'no'
|
|
223 except:
|
|
224 pass
|
|
225 self.data.loc[phageID, 'Host_ID'] = listBactID
|
|
226 print('For future reference,', count, "new bacterial ID's were added.")
|
|
227 self.save_data()
|
|
228
|
|
229 def saveAbstracts(self):
|
|
230 """
|
|
231 For each phage, the associated scientific articles in pubmed are looked.
|
|
232 From theses pubmed articles, the abstracted is saved in a dictionary structure, where each phage ID is associated with a list of abstracts.
|
|
233 These abstracts can be used for later processing.
|
|
234 :return:
|
|
235 """
|
|
236 from Bio import Entrez
|
|
237 import json
|
|
238 Entrez.email = 'pedro_araujo97@hotmail.com'
|
|
239 dicPhageAbstracts = {}
|
|
240
|
|
241 for phageID in self.data.index:
|
|
242 print(phageID, end='\n')
|
|
243 dicPhageAbstracts[phageID] = []
|
|
244 with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
|
|
245 pubmed = Entrez.read(handle)
|
|
246 lista = []
|
|
247 for i in pubmed[0]['LinkSetDb']:
|
|
248 if 'weighted' not in i["LinkName"]:
|
|
249 for link in i["Link"]:
|
|
250 try:
|
|
251 with Entrez.efetch(db="pubmed", rettype="medline", retmode="xml", id=link['Id']) as handle:
|
|
252 article = Entrez.read(handle)
|
|
253 abstract = \
|
|
254 article['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
|
|
255 dicPhageAbstracts[phageID].append([link['Id'], abstract])
|
|
256 except:
|
|
257 pass
|
|
258 with open('files/phageAbstracts.json', 'w') as f:
|
|
259 json.dump(dicPhageAbstracts, f)
|
|
260
|
|
261 def searchBacName(self):
|
|
262 from Bio import Entrez
|
|
263 import ast
|
|
264 Entrez.email = "pedro_araujo97@hotmail.com"
|
|
265 count = 0
|
|
266 for phageID in self.data.index:
|
|
267 if len(self.data.loc[phageID, 'Host'].split()) > 2:
|
|
268 listBactID = ast.literal_eval(self.data.loc[phageID, 'Host_ID'])
|
|
269 with Entrez.esearch(db='nucleotide', term='((((' + self.data.loc[
|
|
270 phageID, 'Host'] + ' AND complete sequence) NOT shotgun[Title]) NOT phage[Title]) NOT cds[Title]) NOT gene[Title]',
|
|
271 idtype="acc") as handle:
|
|
272 species = Entrez.read(handle)
|
|
273 strains = species['IdList']
|
|
274 for j in strains:
|
|
275 if any(z in j[:3] for z in
|
|
276 ['NC_', 'NZ_', 'AC_', 'CP', 'AE', 'CY', 'AP']) and j not in listBactID:
|
|
277 listBactID.append(j)
|
|
278 self.searched[j] = 'yes'
|
|
279 count += 1
|
|
280 elif not any(z in j[:3] for z in
|
|
281 ['MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC', 'MH', 'AY', 'FN',
|
|
282 'AY']) and j not in listBactID:
|
|
283 if j in self.searched.keys():
|
|
284 add = self.searched[j]
|
|
285 else:
|
|
286 add = input('Check ' + j + '\nDo you wish to add it? (yes/no)')
|
|
287 if 'y' in add.lower():
|
|
288 listBactID.append(j)
|
|
289 self.searched[j] = 'yes'
|
|
290 count += 1
|
|
291 else:
|
|
292 self.searched[j] = 'no'
|
|
293 self.data.loc[phageID, 'Host_ID'] = listBactID
|
|
294 print('For future reference,', count, "new bacterial ID's were added.")
|
|
295 self.save_data()
|
|
296
|
|
297 def createListBacID(self, lower=0, upper=100):
|
|
298 """
|
|
299 More sequential than previous methods. Maybe include every single one...
|
|
300 :param lower: lower index from the phage list (numeric)
|
|
301 :param upper: upper index from the phage list (numeric)
|
|
302 :return:
|
|
303 """
|
|
304 from Bio import Entrez
|
|
305 Entrez.email = 'pedro_araujo97@hotmail.com'
|
|
306 for i in range(lower, upper):
|
|
307 phageID = self.data.index[i]
|
|
308 BactID = []
|
|
309 name = test.data.loc[phageID]['Bacteria Name']
|
|
310 try:
|
|
311 if name != 'unclassified bacterium' and not name != name: # Verificação de hosts válidos
|
|
312 with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
|
|
313 pubmed = Entrez.read(handle)
|
|
314 for link in pubmed[0]["LinkSetDb"][0]["Link"]:
|
|
315 try:
|
|
316 with Entrez.elink(dbfrom='pubmed', db="nucleotide", id=link['Id']) as handle:
|
|
317 genomes = Entrez.read(handle)
|
|
318 for id in genomes[0]['LinkSetDb'][0]['Link']:
|
|
319 with Entrez.esummary(db='nucleotide', id=id['Id']) as handle:
|
|
320 bacorg = Entrez.read(handle)
|
|
321 if 'NC_' in bacorg[0]['AccessionVersion'] or 'NZ_' in bacorg[0]['AccessionVersion']:
|
|
322 if bacorg[0]['Caption'] != phageID:
|
|
323 BactID.append(bacorg[0]['AccessionVersion'])
|
|
324 except:
|
|
325 pass
|
|
326 else:
|
|
327 pass
|
|
328 except:
|
|
329 pass
|
|
330 self.listBacID.append(BactID)
|
|
331
|
|
332 def check_bacteria(self):
|
|
333 from Bio import Entrez
|
|
334 from Bio import SeqIO
|
|
335 import ast
|
|
336 Entrez.email = "pedro_araujo97@hotmail.com"
|
|
337 all_bact = []
|
|
338 for i in self.data.index:
|
|
339 for bact in ast.literal_eval(self.data.loc[i, 'Host_ID']):
|
|
340 if bact[:-2] not in all_bact:
|
|
341 all_bact.append(bact[:-2])
|
|
342 list_remove = []
|
|
343 for bact in all_bact:
|
|
344 if bact not in list_remove:
|
|
345 try:
|
|
346 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=bact) as handle:
|
|
347 seq_record = SeqIO.read(handle, "gb")
|
|
348 if not any(i in seq_record.description.lower() for i in ['pneumoniae', 'coli', 'baumannii']) or not any(i in seq_record.description.lower() for i in ['escherichia', 'acinetobacter', 'klebsiella']) \
|
|
349 or 'phage' in seq_record.description.lower() or 'virus' in seq_record.description.lower():
|
|
350 list_remove.append(bact)
|
|
351 except:
|
|
352 list_remove.append(bact)
|
|
353 print(list_remove)
|
|
354 for phage in self.data.index:
|
|
355 listBactID = ast.literal_eval(self.data.loc[phage, 'Host_ID'])
|
|
356 for bact in listBactID:
|
|
357 if bact[:-2] in list_remove:
|
|
358 listBactID.remove(bact)
|
|
359 self.data.loc[phage, 'Host_ID'] = listBactID
|
|
360 self.data.to_csv('files/NCBI_Phage_Bacteria_Data.csv')
|
|
361
|
|
362 def save_data(self):
|
|
363 """
|
|
364 Saves the data in csv format.
|
|
365 :return:
|
|
366 """
|
|
367 import json
|
|
368 self.data.to_csv('files/NCBI_Phage_Bacteria_Data.csv')
|
|
369 with open('files/searched_accessions', 'w') as f:
|
|
370 json.dump(self.searched, f)
|
|
371
|
|
372
|
|
373 if __name__ == '__main__':
|
|
374 test = PhageBacteriaData('NCBI_Phage_Bacteria_Data.csv') # sequences
|
|
375 test.addBacteriaName()
|
|
376 test.addBacteriaGenome()
|
|
377 test.searchBacName() # 2266 bacteria added
|
|
378 test.checkAbstracts()
|
|
379 # test.data = test.data.drop(columns=['Bacteria ID'])
|
|
380 test.searchBacName()
|
|
381 # test.createListBacID(0, 100)
|
|
382 # test.data = test.data.iloc[:, 0:3]
|
|
383 test.check_bacteria()
|
|
384 test.save_data()
|
|
385 # test.extractProtein()
|
|
386 # test.importProtein('Phage')
|