comparison process_raw_data.py @ 0:e4b3fc88efe0 draft

Uploaded
author pedro_araujo
date Wed, 27 Jan 2021 13:50:11 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e4b3fc88efe0
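"""
Build a table of phages and their bacterial hosts from NCBI records.

Reference pages for NCBI accession-number prefixes:
https://www.ncbi.nlm.nih.gov/Sequin/acc.html
https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly
"""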
5
6
class PhageBacteriaData:
    """
    Phage / bacterial-host table enriched through NCBI Entrez queries.

    ``self.data`` is a pandas DataFrame read from ``files/<dataset>`` with the first CSV column as
    index; that index value is what every method passes to Entrez as the phage identifier.
    The methods read the columns ``Species`` and ``Host`` and maintain ``Host_ID``, a per-phage
    list of host accessions. ``self.searched`` caches the yes/no decision taken for each accession
    and is persisted as JSON in ``files/searched_accessions``.
    """

    # Contact address sent to NCBI with every Entrez request.
    ENTREZ_EMAIL = "pedro_araujo97@hotmail.com"
    # Accessions whose first three characters contain one of these are added without asking.
    TRUSTED_PREFIXES = ('NC_', 'AC_', 'NZ_', 'CP', 'AE', 'CY', 'AP')
    # Accessions whose first three characters contain one of these are never offered for review.
    REJECTED_PREFIXES = ('MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC', 'MH', 'AY', 'FN')
    # A record description must mention one species word AND one genus word to be kept
    # by check_bacteria.
    HOST_SPECIES = ('pneumoniae', 'coli', 'baumannii')
    HOST_GENERA = ('escherichia', 'acinetobacter', 'klebsiella')
    # Pattern used to spot strain-like tokens in abstracts.
    STRAIN_PATTERN = r'\w{0,1}[A-Z]{1,5}[0-9]{1,5}[-,:]{0,1}[A-Z]{0,5}[1-9]{0,5}'

    def __init__(self, dataset=None):
        """
        Imports a dataset from ``files/`` and filters it.
        Rows with any missing value, an 'unclassified bacterium' host, an 'uncultured' species,
        a species that mentions neither 'virus' nor 'phage', or a single-word host are deleted.
        A ``Host_ID`` column of empty lists is created when the file does not have one.
        :param dataset: file name inside ``files/``; when None the name is asked interactively
            until a file can be read.
        """
        import json
        import pandas as pd

        self.listBacID = []
        if dataset is None:
            self.data = None
            while self.data is None:
                name = input('File name: ')
                try:
                    self.data = pd.read_csv('files/' + name, header=0, index_col=0)
                except (OSError, ValueError):
                    # OSError: missing/unreadable file; ValueError: pandas parser and decoding
                    # errors. Ctrl-C and EOF are deliberately not caught so the prompt can be left.
                    print('Couldn\'t find file')
        else:
            self.data = pd.read_csv('files/' + dataset, header=0, index_col=0)

        self.data = self.data.dropna(how='any')
        self.data = self.data[self.data['Host'] != 'unclassified bacterium']
        # Boolean masks instead of positional drops: no integer lookups on a label index.
        keep = self.data['Species'].map(self._is_phage_species).astype(bool)
        self.data = self.data[keep]
        keep = self.data['Host'].map(lambda host: len(str(host).split(' ')) > 1).astype(bool)
        self.data = self.data[keep].copy()

        if 'Host_ID' in self.data.columns:
            # Cells must be able to hold Python lists later on.
            self.data['Host_ID'] = self.data['Host_ID'].astype(object)
        else:
            self.data['Host_ID'] = [[] for _ in range(len(self.data))]

        try:
            with open('files/searched_accessions', 'r') as f:
                self.searched = json.loads(f.read())
        except (OSError, ValueError):
            # No cache yet (or an unreadable one): start with an empty decision cache.
            self.searched = {}

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _is_phage_species(name):
        """Return True when a species name is not 'uncultured' and mentions 'virus' or 'phage'."""
        name = str(name)
        return 'uncultured' not in name and ('virus' in name or 'phage' in name)

    @staticmethod
    def _has_prefix(accession, prefixes):
        """Return True when any prefix occurs within the first three characters of the accession."""
        head = accession[:3]
        return any(prefix in head for prefix in prefixes)

    @staticmethod
    def _strip_version(accession):
        """Return the accession without its trailing '.<version>' (versions may have several digits)."""
        return accession.rsplit('.', 1)[0]

    @staticmethod
    def _complete_genome_query(organism):
        """Build the nucleotide search term for complete sequences of an organism."""
        return ('((((' + organism + ' AND complete sequence) NOT shotgun[Title]) NOT phage[Title])'
                ' NOT cds[Title]) NOT gene[Title]')

    @staticmethod
    def _unweighted_link_ids(elink_result):
        """Yield the linked ids of every link set whose LinkName does not contain 'weighted'."""
        for link_set in elink_result[0]['LinkSetDb']:
            if 'weighted' not in link_set['LinkName']:
                for link in link_set['Link']:
                    yield link['Id']

    @staticmethod
    def _first_link_ids(elink_result):
        """Return the linked ids of the first link set only, or an empty list when there is none."""
        link_sets = elink_result[0]['LinkSetDb'] if elink_result else []
        if not link_sets:
            return []
        return [link['Id'] for link in link_sets[0]['Link']]

    @staticmethod
    def _fetch_abstract(entrez, pubmed_id):
        """Return the first abstract text of a PubMed article, or None when it has no abstract."""
        with entrez.efetch(db="pubmed", rettype="medline", retmode="xml", id=pubmed_id) as handle:
            article = entrez.read(handle)
        try:
            return article['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
        except (KeyError, IndexError):
            return None

    @classmethod
    def _strain_names(cls, abstract):
        """Return the sorted, de-duplicated strain-like tokens of an abstract, without 'ORF' tokens."""
        import re
        tokens = {token.strip(',;') for token in re.findall(cls.STRAIN_PATTERN, abstract)}
        return sorted(token for token in tokens if 'ORF' not in token)

    @classmethod
    def _is_target_host(cls, description):
        """Return True when a record description names a target host and is not a phage/virus."""
        text = description.lower()
        if 'phage' in text or 'virus' in text:
            return False
        return (any(word in text for word in cls.HOST_SPECIES)
                and any(word in text for word in cls.HOST_GENERA))

    def _entrez(self):
        """Return the Bio.Entrez module with the contact e-mail configured."""
        from Bio import Entrez
        Entrez.email = self.ENTREZ_EMAIL
        return Entrez

    def _host_ids(self, phage_id):
        """
        Return a copy of the host accession list of a phage.
        Cells read back from CSV are strings such as "['NC_000913.3']"; cells created or updated
        in memory are real lists. Both forms are accepted.
        """
        import ast
        cell = self.data.loc[phage_id, 'Host_ID']
        if isinstance(cell, str):
            return list(ast.literal_eval(cell))
        return list(cell)

    def _store_host_ids(self, phage_id, host_ids):
        """Store a list of host accessions in the Host_ID cell of a phage."""
        # .at writes one cell; the list is stored as a single object value.
        self.data.at[phage_id, 'Host_ID'] = host_ids

    def _register_accession(self, accession, host_ids, reviewable=True):
        """
        Decide whether an accession joins ``host_ids`` and remember the decision.
        Trusted prefixes are added directly. Other accessions, unless they carry a rejected prefix
        or ``reviewable`` is False, are settled from ``self.searched`` or by asking the user.
        :param accession: accession (with version) to evaluate
        :param host_ids: list that receives the accession when accepted (modified in place)
        :param reviewable: whether a non-trusted accession may be offered for review
        :return: True when the accession was appended
        """
        if accession in host_ids:
            return False
        if self._has_prefix(accession, self.TRUSTED_PREFIXES):
            answer = 'yes'
        elif reviewable and not self._has_prefix(accession, self.REJECTED_PREFIXES):
            answer = self.searched.get(accession)
            if answer is None:
                answer = input('Check ' + accession + '\nDo you wish to add it? (yes/no)')
        else:
            return False
        if 'y' in answer.lower():
            host_ids.append(accession)
            self.searched[accession] = 'yes'
            return True
        self.searched[accession] = 'no'
        return False

    def _collect_linked_genomes(self, entrez, phage_id, pubmed_id, host_ids):
        """Evaluate every nucleotide record linked to a PubMed article as a possible host genome."""
        with entrez.elink(dbfrom='pubmed', db="nucleotide", id=pubmed_id) as handle:
            genomes = entrez.read(handle)
        for nucleotide_id in self._unweighted_link_ids(genomes):
            with entrez.esummary(db='nucleotide', id=nucleotide_id) as handle:
                summary = entrez.read(handle)[0]
            title = summary['Title'].lower()
            if summary['Caption'] == phage_id:
                continue  # the phage itself
            if any(word in title for word in ('phage', 'cds', 'shotgun')):
                continue
            self._register_accession(summary['AccessionVersion'], host_ids,
                                     reviewable='complete' in title)

    # ------------------------------------------------------------------ public API

    def addPhageName(self):
        """
        Using the entrez service, from NCBI, for each phage the organism name is stored in the
        Species column.
        :return:
        """
        from Bio import SeqIO
        entrez = self._entrez()
        # A plain list is passed so the ids are joined the same way by every Biopython version.
        with entrez.efetch(db="nucleotide", rettype="gb", retmode="text",
                           id=list(self.data.index)) as handle:
            names = [record.annotations['organism'] for record in SeqIO.parse(handle, "gb")]
        self.data['Species'] = names

    def addBacteriaName(self):
        """
        Using the entrez service, from NCBI, for each phage the infecting bacteria name is updated
        from the 'host' qualifier of the first feature, or from 'lab_host' when 'host' is absent.
        Only names with more than two space-separated words replace the current Host value.
        :return:
        """
        from Bio import SeqIO
        entrez = self._entrez()
        for phage in self.data.index:
            with entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=phage) as handle:
                record = SeqIO.read(handle, "gb")
            qualifiers = record.features[0].qualifiers if record.features else {}
            for key in ('host', 'lab_host'):
                values = qualifiers.get(key)
                if values:
                    if len(values[0].split(' ')) > 2:
                        self.data.loc[phage, 'Host'] = values[0]
                    break  # lab_host is only a fallback for a missing host qualifier
        self.save_data()

    def addBacteriaGenome(self):
        """
        For each phage, the associated scientific articles in pubmed are looked up. From these
        pubmed articles, all linked nucleotide records are examined and accepted accessions are
        added to the Host_ID column. Phages already listed in ``files/searched_hosts`` are skipped;
        progress is written after every phage.
        :return:
        """
        import pickle
        from pathlib import Path

        entrez = self._entrez()
        progress_file = Path("files/searched_hosts")
        if progress_file.is_file():
            # NOTE(security): pickle can execute code on load; only use a trusted local file here.
            with open(progress_file, 'rb') as f:
                list_done = pickle.load(f)
        else:
            list_done = []
        count = 0
        try:
            for phage_id in self.data.index:
                if phage_id in list_done:
                    continue
                host_ids = self._host_ids(phage_id)
                known_before = len(host_ids)
                try:
                    with entrez.elink(dbfrom='nuccore', db='pubmed', id=phage_id) as handle:
                        pubmed = entrez.read(handle)
                    for pubmed_id in self._unweighted_link_ids(pubmed):
                        try:
                            self._collect_linked_genomes(entrez, phage_id, pubmed_id, host_ids)
                        except Exception as error:
                            # Best effort: one failing article must not lose the others.
                            print('Skipping PubMed article', pubmed_id, '-', error)
                except Exception as error:
                    print('Skipping PubMed links for', phage_id, '-', error)
                count += len(host_ids) - known_before
                self._store_host_ids(phage_id, host_ids)
                list_done.append(phage_id)
                with open(progress_file, 'wb') as f:
                    pickle.dump(list_done, f)
                self.save_data()
                print(phage_id)
        except Exception as error:
            # Anything raised here stops the loop; what was gathered so far is still saved below.
            print('Stopped early while processing phage hosts:', error)
        print('For future reference,', count, "new bacterial ID's were added.")
        self.save_data()

    def checkAbstracts(self):
        """
        For each phage whose Host has fewer than three words, the associated pubmed articles are
        looked up. Their abstracts are searched for strain-like tokens, and each token is combined
        with the Host name in a nucleotide search. Accepted accessions are added to the Host_ID
        column.
        :return:
        """
        entrez = self._entrez()
        count = 0
        for phage_id in self.data.index:
            host = self.data.loc[phage_id, 'Host']
            if len(host.split()) >= 3:
                continue
            with entrez.elink(dbfrom='nuccore', db='pubmed', id=phage_id) as handle:
                pubmed = entrez.read(handle)
            host_ids = self._host_ids(phage_id)
            known_before = len(host_ids)
            for pubmed_id in self._unweighted_link_ids(pubmed):
                try:
                    abstract = self._fetch_abstract(entrez, pubmed_id)
                    if abstract is None:
                        continue
                    for strain in self._strain_names(abstract):
                        term = self._complete_genome_query(host + ' ' + strain)
                        with entrez.esearch(db='nucleotide', term=term, idtype="acc") as handle:
                            accessions = entrez.read(handle)['IdList']
                        for accession in accessions:
                            self._register_accession(accession, host_ids)
                except Exception as error:
                    # Best effort: one failing article must not lose the others.
                    print('Skipping PubMed article', pubmed_id, '-', error)
            count += len(host_ids) - known_before
            self._store_host_ids(phage_id, host_ids)
        print('For future reference,', count, "new bacterial ID's were added.")
        self.save_data()

    def saveAbstracts(self):
        """
        For each phage, the associated scientific articles in pubmed are looked up.
        The abstracts are saved in ``files/phageAbstracts.json`` as a dictionary where each phage ID
        is associated with a list of [pubmed id, abstract] pairs.
        These abstracts can be used for later processing.
        :return:
        """
        import json

        entrez = self._entrez()
        abstracts_by_phage = {}
        for phage_id in self.data.index:
            print(phage_id, end='\n')
            abstracts_by_phage[phage_id] = []
            with entrez.elink(dbfrom='nuccore', db='pubmed', id=phage_id) as handle:
                pubmed = entrez.read(handle)
            for pubmed_id in self._unweighted_link_ids(pubmed):
                try:
                    abstract = self._fetch_abstract(entrez, pubmed_id)
                except Exception as error:
                    print('Skipping PubMed article', pubmed_id, '-', error)
                    continue
                if abstract is not None:
                    abstracts_by_phage[phage_id].append([pubmed_id, abstract])
        with open('files/phageAbstracts.json', 'w') as f:
            json.dump(abstracts_by_phage, f)

    def searchBacName(self):
        """
        For each phage whose Host has more than two words, the nucleotide database is searched for
        complete sequences of that host. Accepted accessions are added to the Host_ID column.
        :return:
        """
        entrez = self._entrez()
        count = 0
        for phage_id in self.data.index:
            host = self.data.loc[phage_id, 'Host']
            if len(host.split()) <= 2:
                continue
            host_ids = self._host_ids(phage_id)
            known_before = len(host_ids)
            with entrez.esearch(db='nucleotide', term=self._complete_genome_query(host),
                                idtype="acc") as handle:
                accessions = entrez.read(handle)['IdList']
            for accession in accessions:
                self._register_accession(accession, host_ids)
            count += len(host_ids) - known_before
            self._store_host_ids(phage_id, host_ids)
        print('For future reference,', count, "new bacterial ID's were added.")
        self.save_data()

    def createListBacID(self, lower=0, upper=100):
        """
        More sequential than previous methods. Maybe include every single one...
        Appends to ``self.listBacID`` one list of 'NC_'/'NZ_' accessions per phage in the range.
        :param lower: lower index from the phage list (numeric)
        :param upper: upper index from the phage list (numeric); clamped to the table length
        :return:
        """
        entrez = self._entrez()
        # Older datasets name the host column 'Bacteria Name'; current ones use 'Host'.
        name_column = 'Bacteria Name' if 'Bacteria Name' in self.data.columns else 'Host'
        for position in range(lower, min(upper, len(self.data))):
            phage_id = self.data.index[position]
            host_ids = []
            name = self.data.loc[phage_id, name_column]
            # name == name is False only for NaN, i.e. a missing host.
            if name != 'unclassified bacterium' and name == name:
                try:
                    with entrez.elink(dbfrom='nuccore', db='pubmed', id=phage_id) as handle:
                        pubmed = entrez.read(handle)
                    for pubmed_id in self._first_link_ids(pubmed):
                        try:
                            with entrez.elink(dbfrom='pubmed', db="nucleotide", id=pubmed_id) as handle:
                                genomes = entrez.read(handle)
                            for nucleotide_id in self._first_link_ids(genomes):
                                with entrez.esummary(db='nucleotide', id=nucleotide_id) as handle:
                                    summary = entrez.read(handle)[0]
                                accession = summary['AccessionVersion']
                                if ('NC_' in accession or 'NZ_' in accession) \
                                        and summary['Caption'] != phage_id:
                                    host_ids.append(accession)
                        except Exception as error:
                            # Best effort: one failing article must not lose the others.
                            print('Skipping PubMed article', pubmed_id, '-', error)
                except Exception as error:
                    print('Skipping PubMed links for', phage_id, '-', error)
            self.listBacID.append(host_ids)

    def check_bacteria(self):
        """
        Fetches every host accession (without version) stored in Host_ID and removes, from all
        phages, those whose record description is not a target host, mentions 'phage' or 'virus',
        or could not be fetched. The removed accessions are printed and the table is written to
        ``files/NCBI_Phage_Bacteria_Data.csv``.
        :return:
        """
        from Bio import SeqIO

        entrez = self._entrez()
        all_bact = []
        for phage_id in self.data.index:
            for host_id in self._host_ids(phage_id):
                base = self._strip_version(host_id)
                if base not in all_bact:
                    all_bact.append(base)

        list_remove = []
        for bact in all_bact:
            try:
                with entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=bact) as handle:
                    record = SeqIO.read(handle, "gb")
                if not self._is_target_host(record.description):
                    list_remove.append(bact)
            except Exception:
                # An accession that cannot be fetched or parsed is treated as invalid.
                list_remove.append(bact)
        print(list_remove)

        rejected = set(list_remove)
        for phage_id in self.data.index:
            # Build a new list: removing while iterating would skip neighbouring entries.
            kept = [host_id for host_id in self._host_ids(phage_id)
                    if self._strip_version(host_id) not in rejected]
            self._store_host_ids(phage_id, kept)
        self.data.to_csv('files/NCBI_Phage_Bacteria_Data.csv')

    def save_data(self):
        """
        Saves the data in csv format and the accession decision cache in json format.
        :return:
        """
        import json
        self.data.to_csv('files/NCBI_Phage_Bacteria_Data.csv')
        with open('files/searched_accessions', 'w') as f:
            json.dump(self.searched, f)
371
372
if __name__ == '__main__':
    # Script entry point: load the dataset and run the enrichment steps in order.
    # The instance stays bound to the module-level name ``test``.
    test = PhageBacteriaData('NCBI_Phage_Bacteria_Data.csv')  # sequences
    steps = (
        test.addBacteriaName,
        test.addBacteriaGenome,
        test.searchBacName,  # 2266 bacteria added
        test.checkAbstracts,
        test.searchBacName,  # second pass, run after the abstract search
        test.check_bacteria,
        test.save_data,
    )
    for step in steps:
        step()
    # Steps left disabled by the author:
    #   test.data = test.data.drop(columns=['Bacteria ID'])   (before the second searchBacName)
    #   test.createListBacID(0, 100)                           (before check_bacteria)
    #   test.data = test.data.iloc[:, 0:3]                     (before check_bacteria)
    #   test.extractProtein()
    #   test.importProtein('Phage')