Mercurial > repos > pedro_araujo > phage_host_prediction
comparison process_raw_data.py @ 0:e4b3fc88efe0 draft
Uploaded
author | pedro_araujo |
---|---|
date | Wed, 27 Jan 2021 13:50:11 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e4b3fc88efe0 |
---|---|
1 """ | |
2 https://www.ncbi.nlm.nih.gov/Sequin/acc.html | |
3 https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly | |
4 """ | |
5 | |
6 | |
class PhageBacteriaData:
    """
    Curates an NCBI Virus phage table and links each phage to bacterial host genomes.

    The table is indexed by phage accession and uses the columns ``Species`` (phage name),
    ``Host`` (bacterial host name) and ``Host_ID`` (host accessions, stored as the string
    form of a Python list so the value survives a CSV round trip).
    """

    # Contact address sent to NCBI with every Entrez request.
    ENTREZ_EMAIL = "pedro_araujo97@hotmail.com"
    DATA_FILE = 'files/NCBI_Phage_Bacteria_Data.csv'
    SEARCHED_FILE = 'files/searched_accessions'
    # Accession prefixes accepted automatically as genome records.
    GENOME_PREFIXES = ('NC_', 'AC_', 'NZ_', 'CP', 'AE', 'CY', 'AP')
    # Accession prefixes that are never offered for manual confirmation.
    REJECTED_PREFIXES = ('MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC', 'MH', 'AY', 'FN')

    def __init__(self, dataset=None):
        """
        Imports a dataset from NCBI Virus, indexed by phage ID, with the columns Species, Host and
        (optionally) Host_ID. Rows with missing values, with an unclassified or single-word host,
        or whose species is uncultured or is neither a virus nor a phage are deleted.
        :param dataset: file name inside the 'files' folder; when None the name is asked interactively
        """
        import json
        import pandas as pd
        self.listBacID = []
        if dataset is None:
            while True:
                name = input('File name: ')
                try:
                    self.data = pd.read_csv('files/' + name, header=0, index_col=0)
                    break
                # Only file/parse problems re-prompt; Ctrl-C and EOF are allowed to propagate.
                except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
                    print('Couldn\'t find file')
        else:
            self.data = pd.read_csv('files/' + dataset, header=0, index_col=0)
        self.data = self.data.dropna(how='any')
        self.data = self.data[self.data['Host'] != 'unclassified bacterium']
        # Boolean masks avoid positional/label confusion and duplicated-index surprises.
        species = self.data['Species'].astype(str)
        is_viral = species.str.contains('virus', regex=False) | species.str.contains('phage', regex=False)
        is_uncultured = species.str.contains('uncultured', regex=False)
        self.data = self.data[is_viral & ~is_uncultured]
        host_words = self.data['Host'].astype(str).str.split(' ').str.len()
        self.data = self.data[host_words > 1].copy()
        if 'Host_ID' not in self.data.columns:
            # Same text that an empty list would produce once written to CSV.
            self.data['Host_ID'] = '[]'
        try:
            with open(self.SEARCHED_FILE, 'r') as f:
                self.searched = json.load(f)
        except (OSError, ValueError):
            self.searched = {}
        if not isinstance(self.searched, dict):
            # A cache file that does not hold a mapping is unusable; start over.
            self.searched = {}

    # ------------------------------------------------------------------ helpers

    def _get_host_ids(self, phage_id):
        """Returns the host accession list of a phage, whether the cell holds a list or its string form."""
        import ast
        raw = self.data.loc[phage_id, 'Host_ID']
        if isinstance(raw, str):
            raw = ast.literal_eval(raw)
        return list(raw)

    def _set_host_ids(self, phage_id, host_ids):
        """Stores the host accession list of a phage as text (what the CSV would contain anyway)."""
        self.data.loc[phage_id, 'Host_ID'] = str(list(host_ids))

    def _consider_accession(self, accession, host_ids, manual_allowed=True):
        """
        Decides whether an accession is added to host_ids and records the decision in self.searched.
        :param accession: accession.version string
        :param host_ids: list of accessions already associated with the phage (modified in place)
        :param manual_allowed: when False, accessions without a genome prefix are skipped silently
        :return: True if the accession was appended to host_ids
        """
        if accession in host_ids:
            return False
        if accession.startswith(self.GENOME_PREFIXES):
            host_ids.append(accession)
            self.searched[accession] = 'yes'
            return True
        if not manual_allowed or accession.startswith(self.REJECTED_PREFIXES):
            return False
        decision = self.searched.get(accession)
        if decision is None:
            decision = input('Check ' + accession + '\nDo you wish to add it? (yes/no)')
        if 'y' in decision.lower():
            host_ids.append(accession)
            self.searched[accession] = 'yes'
            return True
        self.searched[accession] = 'no'
        return False

    @staticmethod
    def _unweighted_links(link_result):
        """Yields the linked IDs of a parsed elink result, skipping the 'weighted' link sets."""
        if not link_result:
            return
        for link_set in link_result[0]['LinkSetDb']:
            if 'weighted' not in link_set['LinkName']:
                for link in link_set['Link']:
                    yield link['Id']

    @staticmethod
    def _first_abstract(article):
        """Returns the first abstract text of a parsed PubMed record, or None when there is none."""
        try:
            return article['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
        except (KeyError, IndexError, TypeError):
            return None

    @staticmethod
    def _strain_tokens(abstract):
        """Returns the sorted, unique strain-like tokens of an abstract, excluding ORF names."""
        import re
        found = re.findall(r'\w{0,1}[A-Z]{1,5}[0-9]{1,5}[-,:]{0,1}[A-Z]{0,5}[1-9]{0,5}', str(abstract))
        unique = {token.strip(',;') for token in found}
        return sorted(token for token in unique if 'ORF' not in token)

    @staticmethod
    def _strip_version(accession):
        """Removes the '.N' version suffix of an accession, whatever the number of digits."""
        return accession.partition('.')[0]

    @staticmethod
    def _host_from_record(seq_record):
        """
        Returns the host name of a GenBank record (qualifier 'host', else 'lab_host') when it has more
        than two words, otherwise None.
        """
        if not seq_record.features:
            return None
        qualifiers = seq_record.features[0].qualifiers
        for key in ('host', 'lab_host'):
            values = qualifiers.get(key)
            if values:
                name = values[0]
                return name if len(name.split(' ')) > 2 else None
        return None

    @staticmethod
    def _is_target_host(description):
        """True when a record description names one of the accepted species and genera and is not viral."""
        text = description.lower()
        has_species = any(word in text for word in ('pneumoniae', 'coli', 'baumannii'))
        has_genus = any(word in text for word in ('escherichia', 'acinetobacter', 'klebsiella'))
        return has_species and has_genus and 'phage' not in text and 'virus' not in text

    @staticmethod
    def _host_search_term(host, strain=None):
        """Builds the nucleotide search term used to look for complete host sequences."""
        subject = host if strain is None else host + ' ' + strain
        return '((((' + subject + ' AND complete sequence) NOT shotgun[Title]) NOT phage[Title]) NOT cds[Title]) NOT gene[Title]'

    # ------------------------------------------------------------ public methods

    def addPhageName(self):
        """
        Using the entrez service, from NCBI, for each phage the name is added, from its features.
        :return:
        """
        from Bio import Entrez
        from Bio import SeqIO
        Entrez.email = self.ENTREZ_EMAIL
        listNames = []
        with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=list(self.data.index)) as handle:
            for seq_record in SeqIO.parse(handle, "gb"):
                listNames.append(seq_record.annotations['organism'])
        # Raises ValueError if NCBI returned a different number of records than phages.
        self.data['Species'] = listNames

    def addBacteriaName(self):
        """
        Using the entrez service, from NCBI, for each phage the infecting bacteria name is added, from its features.
        Only host names with more than two words (i.e. including a strain) replace the current one.
        :return:
        """
        from Bio import Entrez
        from Bio import SeqIO
        Entrez.email = self.ENTREZ_EMAIL
        for phage in self.data.index:
            with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=phage) as handle:
                seq_record = SeqIO.read(handle, "gb")
            host = self._host_from_record(seq_record)
            if host is not None:
                self.data.loc[phage, 'Host'] = host
        self.save_data()

    def addBacteriaGenome(self):
        """
        For each phage, the associated scientific articles in pubmed are looked. From theses pubmed articles, all associated IDs are extracted.
        If the ID corresponds to a bacterial strain, its accession ID is added to the Host_ID column.
        Progress is saved after every phage, so an interrupted run resumes where it stopped.
        :return:
        """
        from Bio import Entrez
        import pickle
        from pathlib import Path
        Entrez.email = self.ENTREZ_EMAIL
        done_file = Path("files/searched_hosts")
        if done_file.is_file():
            # NOTE: pickle is acceptable only because this file is written by this class itself.
            with open('files/searched_hosts', 'rb') as f:
                list_done = pickle.load(f)
        else:
            list_done = []
        done = set(list_done)
        count = 0
        for phageID in self.data.index:
            if phageID in done:
                continue
            listBactID = self._get_host_ids(phageID)
            try:
                with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
                    pubmed = Entrez.read(handle)
                for pubmed_id in self._unweighted_links(pubmed):
                    try:
                        with Entrez.elink(dbfrom='pubmed', db="nucleotide", id=pubmed_id) as handle:
                            genomes = Entrez.read(handle)
                        for nucleotide_id in self._unweighted_links(genomes):
                            with Entrez.esummary(db='nucleotide', id=nucleotide_id) as handle:
                                summary = Entrez.read(handle)[0]
                            title = summary['Title'].lower()
                            if summary['Caption'] == phageID or any(
                                    word in title for word in ('phage', 'cds', 'shotgun')):
                                continue
                            # Records without a genome prefix are only offered when the title says 'complete'.
                            if self._consider_accession(summary['AccessionVersion'], listBactID,
                                                        manual_allowed='complete' in title):
                                count += 1
                    except Exception as err:
                        print('Skipping article', pubmed_id, 'of', phageID, '-', err)
            except Exception as err:
                print('Could not retrieve articles of', phageID, '-', err)
            self._set_host_ids(phageID, listBactID)
            list_done.append(phageID)
            done.add(phageID)
            with open('files/searched_hosts', 'wb') as f:
                pickle.dump(list_done, f)
            self.save_data()
            print(phageID)
        print('For future reference,', count, "new bacterial ID's were added.")
        self.save_data()

    def checkAbstracts(self):
        """
        For each phage, the associated scientific articles in pubmed are looked. From theses pubmed articles, the abstracted is searched for mentions of bacterial strains.
        If bacterial strains are found, its accession IDs are added to the Host_ID column.
        Only phages whose host name has fewer than three words are processed.
        :return:
        """
        from Bio import Entrez
        Entrez.email = self.ENTREZ_EMAIL
        count = 0
        for phageID in self.data.index:
            host = self.data.loc[phageID, 'Host']
            if len(host.split()) >= 3:
                continue
            with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
                pubmed = Entrez.read(handle)
            listBactID = self._get_host_ids(phageID)
            for pubmed_id in self._unweighted_links(pubmed):
                try:
                    with Entrez.efetch(db="pubmed", rettype="medline", retmode="xml", id=pubmed_id) as handle:
                        article = Entrez.read(handle)
                    abstract = self._first_abstract(article)
                    if abstract is None:
                        continue
                    for strain in self._strain_tokens(abstract):
                        with Entrez.esearch(db='nucleotide', term=self._host_search_term(host, strain),
                                            idtype="acc") as handle:
                            species = Entrez.read(handle)
                        for accession in species['IdList']:
                            if self._consider_accession(accession, listBactID):
                                count += 1
                except Exception as err:
                    print('Skipping article', pubmed_id, 'of', phageID, '-', err)
            self._set_host_ids(phageID, listBactID)
        print('For future reference,', count, "new bacterial ID's were added.")
        self.save_data()

    def saveAbstracts(self):
        """
        For each phage, the associated scientific articles in pubmed are looked.
        From theses pubmed articles, the abstracted is saved in a dictionary structure, where each phage ID is associated with a list of abstracts.
        These abstracts can be used for later processing.
        :return:
        """
        from Bio import Entrez
        import json
        Entrez.email = self.ENTREZ_EMAIL
        dicPhageAbstracts = {}

        for phageID in self.data.index:
            print(phageID, end='\n')
            dicPhageAbstracts[phageID] = []
            with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
                pubmed = Entrez.read(handle)
            for pubmed_id in self._unweighted_links(pubmed):
                try:
                    with Entrez.efetch(db="pubmed", rettype="medline", retmode="xml", id=pubmed_id) as handle:
                        article = Entrez.read(handle)
                except Exception as err:
                    print('Skipping article', pubmed_id, 'of', phageID, '-', err)
                    continue
                abstract = self._first_abstract(article)
                if abstract is not None:
                    dicPhageAbstracts[phageID].append([pubmed_id, abstract])
        with open('files/phageAbstracts.json', 'w') as f:
            json.dump(dicPhageAbstracts, f)

    def searchBacName(self):
        """
        For each phage whose host name has more than two words (i.e. includes a strain), the nucleotide
        database is searched for complete sequences of that host and the hits are added to the Host_ID column.
        :return:
        """
        from Bio import Entrez
        Entrez.email = self.ENTREZ_EMAIL
        count = 0
        for phageID in self.data.index:
            host = self.data.loc[phageID, 'Host']
            if len(host.split()) <= 2:
                continue
            listBactID = self._get_host_ids(phageID)
            with Entrez.esearch(db='nucleotide', term=self._host_search_term(host), idtype="acc") as handle:
                species = Entrez.read(handle)
            for accession in species['IdList']:
                if self._consider_accession(accession, listBactID):
                    count += 1
            self._set_host_ids(phageID, listBactID)
        print('For future reference,', count, "new bacterial ID's were added.")
        self.save_data()

    def createListBacID(self, lower=0, upper=100):
        """
        More sequential than previous methods. Maybe include every single one...
        One list of RefSeq host accessions (NC_/NZ_) is appended to self.listBacID per processed phage.
        :param lower: lower index from the phage list (numeric)
        :param upper: upper index from the phage list (numeric); clamped to the table length
        :return:
        """
        from Bio import Entrez
        Entrez.email = self.ENTREZ_EMAIL
        # The host column is 'Host' everywhere else in this class; the legacy name is still honoured.
        host_column = 'Bacteria Name' if 'Bacteria Name' in self.data.columns else 'Host'
        for position in range(lower, min(upper, len(self.data))):
            phageID = self.data.index[position]
            BactID = []
            try:
                name = self.data.iloc[position][host_column]
                # name == name is False only for NaN, i.e. a missing host.
                if name != 'unclassified bacterium' and name == name:
                    with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
                        pubmed = Entrez.read(handle)
                    for link in pubmed[0]["LinkSetDb"][0]["Link"]:
                        try:
                            with Entrez.elink(dbfrom='pubmed', db="nucleotide", id=link['Id']) as handle:
                                genomes = Entrez.read(handle)
                            for nucleotide in genomes[0]['LinkSetDb'][0]['Link']:
                                with Entrez.esummary(db='nucleotide', id=nucleotide['Id']) as handle:
                                    bacorg = Entrez.read(handle)
                                accession = bacorg[0]['AccessionVersion']
                                if ('NC_' in accession or 'NZ_' in accession) and bacorg[0]['Caption'] != phageID:
                                    BactID.append(accession)
                        except Exception as err:
                            print('Skipping article', link['Id'], 'of', phageID, '-', err)
            except Exception as err:
                print('Could not process', phageID, '-', err)
            self.listBacID.append(BactID)

    def check_bacteria(self):
        """
        Fetches every host accession and removes from the Host_ID column those whose description is not
        an Escherichia coli, Klebsiella pneumoniae or Acinetobacter baumannii record, is viral, or could
        not be retrieved. The table is then written to csv.
        :return:
        """
        from Bio import Entrez
        from Bio import SeqIO
        Entrez.email = self.ENTREZ_EMAIL
        host_lists = {phage: self._get_host_ids(phage) for phage in self.data.index}
        # dict.fromkeys keeps first-seen order while removing duplicates.
        all_bact = list(dict.fromkeys(
            self._strip_version(bact) for ids in host_lists.values() for bact in ids))
        list_remove = []
        for bact in all_bact:
            try:
                with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=bact) as handle:
                    seq_record = SeqIO.read(handle, "gb")
                if not self._is_target_host(seq_record.description):
                    list_remove.append(bact)
            except Exception as err:
                # As before, a record that cannot be fetched is treated as invalid.
                print('Could not verify', bact, '-', err)
                list_remove.append(bact)
        print(list_remove)
        rejected = set(list_remove)
        for phage, ids in host_lists.items():
            kept = [bact for bact in ids if self._strip_version(bact) not in rejected]
            self._set_host_ids(phage, kept)
        self.data.to_csv(self.DATA_FILE)

    def save_data(self):
        """
        Saves the data in csv format, together with the cache of accession decisions (json).
        :return:
        """
        import json
        self.data.to_csv(self.DATA_FILE)
        with open(self.SEARCHED_FILE, 'w') as f:
            json.dump(self.searched, f)
371 | |
372 | |
if __name__ == '__main__':
    # Curation pipeline: refine host names, gather host genomes from linked articles,
    # search by host name (before and after mining abstracts), validate and save.
    test = PhageBacteriaData('NCBI_Phage_Bacteria_Data.csv')  # sequences
    pipeline = (
        test.addBacteriaName,
        test.addBacteriaGenome,
        test.searchBacName,  # 2266 bacteria added
        test.checkAbstracts,
        # test.data = test.data.drop(columns=['Bacteria ID'])
        test.searchBacName,
        # test.createListBacID(0, 100)
        # test.data = test.data.iloc[:, 0:3]
        test.check_bacteria,
        test.save_data,
    )
    for step in pipeline:
        step()
    # test.extractProtein()
    # test.importProtein('Phage')