annotate phage_host_prediction/process_raw_data.py @ 2:3e1e8be4e65c draft default tip

Uploaded
author pedro_araujo
date Fri, 02 Apr 2021 10:11:13 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
1 """
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
2 https://www.ncbi.nlm.nih.gov/Sequin/acc.html
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
3 https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
4 """
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
5
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
6
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
7 class PhageBacteriaData:
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
8
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __init__(self, dataset=None):
    """
    Load a phage/host table from ``files/<dataset>`` and discard unusable rows.

    The CSV is read with its first column as the index (the other methods use that index as the
    phage accession) and must contain the columns ``Species`` and ``Host``. A row is removed when:

    * any of its fields is missing;
    * ``Host`` is exactly ``'unclassified bacterium'``;
    * ``Species`` contains ``'uncultured'``, or contains neither ``'virus'`` nor ``'phage'``;
    * ``Host`` has no space in it (a single word, so no species-level name).

    When the table has no ``Host_ID`` column, one is added holding an empty list per phage.
    The cache of already reviewed accessions is read from ``files/searched_accessions`` (JSON);
    a missing or unreadable cache yields an empty dict.

    :param dataset: file name inside ``files/``. When None, the name is requested on stdin until
        a readable CSV is given.
    """
    import json
    import pandas as pd

    self.listBacID = []
    if dataset is None:
        while True:
            # input() stays outside the try: a closed stdin (EOFError) or Ctrl-C must end the
            # prompt loop instead of being reported as a missing file forever.
            name = input('File name: ')
            try:
                self.data = pd.read_csv('files/' + name, header=0, index_col=0)
                break
            except (OSError, ValueError):
                # OSError: missing/unreadable file; ValueError: pandas parser and decoding errors.
                print('Couldn\'t find file')
    else:
        self.data = pd.read_csv('files/' + dataset, header=0, index_col=0)

    self.data = self.data.dropna(how='any')
    self.data = self.data[self.data['Host'] != 'unclassified bacterium']

    # Row masks are built positionally and wrapped in a Series sharing the frame's index, so the
    # selection stays correct even if an accession appears twice in the index.
    is_cultured_phage = [
        'uncultured' not in species and ('virus' in species or 'phage' in species)
        for species in self.data['Species']
    ]
    self.data = self.data[pd.Series(is_cultured_phage, index=self.data.index, dtype=bool)]

    has_species_level_host = [' ' in host for host in self.data['Host']]
    # .copy() so that adding a column below never writes into a view of the unfiltered table.
    self.data = self.data[pd.Series(has_species_level_host, index=self.data.index, dtype=bool)].copy()

    if 'Host_ID' not in self.data.columns:
        # One independent list per row (never a shared list object).
        self.data['Host_ID'] = [[] for _ in range(len(self.data))]

    try:
        with open('files/searched_accessions', 'r') as f:
            self.searched = json.load(f)
    except (OSError, ValueError):
        # No cache yet, or the cache is not valid JSON: start with an empty one.
        self.searched = {}
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
54
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def addPhageName(self):
    """
    Replace the ``Species`` column with the organism name stored in each phage's GenBank record.

    All accessions in the index of ``self.data`` are requested from the NCBI nucleotide database
    with one Entrez ``efetch`` call (GenBank text), and the ``organism`` annotation of every
    returned record is collected in the order the records arrive.

    :return: None
    :raises ValueError: when the number of records returned differs from the number of phages,
        since the names could then no longer be matched to rows.
    """
    from Bio import Entrez
    from Bio import SeqIO
    Entrez.email = "pedro_araujo97@hotmail.com"

    # Plain strings joined by commas is the id format Entrez documents; a pandas Index is not.
    accessions = [str(accession) for accession in self.data.index]
    if not accessions:
        # Nothing to look up; avoid sending an empty id list to NCBI.
        self.data['Species'] = []
        return

    with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=",".join(accessions)) as handle:
        organisms = [record.annotations['organism'] for record in SeqIO.parse(handle, "gb")]

    # NOTE(review): rows and names are paired by position, which presumes NCBI answers in request
    # order - confirm. A count mismatch at least proves the pairing is broken.
    if len(organisms) != len(accessions):
        raise ValueError(
            'NCBI returned ' + str(len(organisms)) + ' records for ' + str(len(accessions)) + ' phage accessions'
        )
    self.data['Species'] = organisms
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
68
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def addBacteriaName(self):
    """
    Refine the ``Host`` column with the strain-level host named in each phage's GenBank record.

    For every phage the GenBank record is fetched through Entrez and the qualifiers of its first
    feature are inspected. The ``host`` qualifier is used when present, otherwise ``lab_host``.
    The value only replaces the current ``Host`` when it has more than two space-separated words.
    Records with neither qualifier (or with no features) leave the row untouched.
    The table is saved once at the end.

    :return: None
    """
    from Bio import Entrez
    from Bio import SeqIO
    Entrez.email = "pedro_araujo97@hotmail.com"

    for phage in self.data.index:
        with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=phage) as handle:
            record = SeqIO.read(handle, "gb")

        # A missing qualifier is an expected situation, so it is looked up explicitly rather than
        # being discovered through an exception that would abort the whole loop.
        qualifiers = record.features[0].qualifiers if record.features else {}
        host_names = qualifiers.get('host') or qualifiers.get('lab_host')
        if host_names and len(host_names[0].split(' ')) > 2:
            self.data.loc[phage, 'Host'] = host_names[0]
    self.save_data()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
87
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def addBacteriaGenome(self):
    """
    Collect host genome accessions through the PubMed articles linked to each phage.

    For every phage not yet listed in the progress file ``files/searched_hosts``:

    1. the PubMed articles linked to the phage accession are looked up (``elink``);
    2. the nucleotide records linked to each article are looked up (``elink``);
    3. each record's summary is inspected. Records that are the phage itself, are already in the
       phage's ``Host_ID`` list, or whose title mentions ``phage``, ``cds`` or ``shotgun`` are ignored.
       Accessions starting with a trusted prefix are added directly. Other accessions whose title
       contains ``complete`` and that do not start with a rejected prefix are added according to the
       cached answer in ``self.searched`` or, failing that, the answer typed by the user.

    After each phage the ``Host_ID`` cell, the progress file and the table are saved and the phage
    accession is printed. Failures while querying NCBI for one article or one phage are skipped.

    :return: None
    """
    from Bio import Entrez
    import ast
    import pickle
    from pathlib import Path
    Entrez.email = "pedro_araujo97@hotmail.com"

    trusted_prefixes = ('NC_', 'AC_', 'NZ_', 'CP', 'AE', 'CY', 'AP')
    rejected_prefixes = ('MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC', 'MH', 'AY', 'FN')
    excluded_title_words = ('phage', 'cds', 'shotgun')

    progress_file = Path("files/searched_hosts")
    if progress_file.is_file():
        # NOTE(review): pickle executes code on load; this file must only ever be one written by
        # this method, never a file obtained from someone else.
        with open('files/searched_hosts', 'rb') as f:
            list_done = pickle.load(f)
    else:
        list_done = []

    count = 0
    try:
        # Cells of this column hold Python lists, which needs an object column.
        self.data['Host_ID'] = self.data['Host_ID'].astype(object)
        for phageID in self.data.index:
            if phageID in list_done:
                continue
            # The cell is a string such as "['NC_000913.3']" after a CSV round trip, but a real
            # list on a freshly built table; accept both.
            stored = self.data.loc[phageID, 'Host_ID']
            listBactID = list(ast.literal_eval(stored)) if isinstance(stored, str) else list(stored)
            try:
                with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
                    pubmed = Entrez.read(handle)
                for article_set in pubmed[0]['LinkSetDb']:
                    if 'weighted' in article_set["LinkName"]:
                        continue
                    for link in article_set["Link"]:
                        try:
                            with Entrez.elink(dbfrom='pubmed', db="nucleotide", id=link['Id']) as handle:
                                genomes = Entrez.read(handle)
                            for genome_set in genomes[0]['LinkSetDb']:
                                if 'weighted' in genome_set['LinkName']:
                                    continue
                                for genome in genome_set['Link']:
                                    with Entrez.esummary(db='nucleotide', id=genome['Id']) as handle:
                                        summary = Entrez.read(handle)[0]
                                    accession = summary['AccessionVersion']
                                    title = summary['Title'].lower()
                                    if summary['Caption'] == phageID or accession in listBactID:
                                        continue
                                    if any(word in title for word in excluded_title_words):
                                        continue
                                    # Real prefix test: a substring test on the first three
                                    # characters also accepts e.g. 'AAE...' for the prefix 'AE'.
                                    if accession.startswith(trusted_prefixes):
                                        answer = 'yes'
                                    elif not accession.startswith(rejected_prefixes) and 'complete' in title:
                                        if accession in self.searched:
                                            answer = self.searched[accession]
                                        else:
                                            answer = input('Check ' + accession + '\nDo you wish to add it? (yes/no)')
                                    else:
                                        continue
                                    if 'y' in answer.lower():
                                        listBactID.append(accession)
                                        self.searched[accession] = 'yes'
                                        count += 1
                                    else:
                                        self.searched[accession] = 'no'
                        except Exception:
                            # Best effort: one article that cannot be resolved must not stop the
                            # others. Ctrl-C is deliberately not caught.
                            pass
            except Exception:
                # Best effort: a phage without usable PubMed links keeps what was found so far.
                pass
            # .at stores the list as a single cell value; .loc treats a list as values to spread.
            self.data.at[phageID, 'Host_ID'] = listBactID
            list_done.append(phageID)
            with open('files/searched_hosts', 'wb') as f:
                pickle.dump(list_done, f)
            self.save_data()
            print(phageID)
    except Exception:
        print('Bacterial host name missing. Searching from phage id')
    print('For future reference,', count, "new bacterial ID's were added.")
    self.save_data()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
165
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def checkAbstracts(self):
    """
    Look for host strain names in the abstracts of the PubMed articles linked to each phage.

    Only phages whose ``Host`` has fewer than three words (no strain given) are processed. For each
    linked article the first abstract text is scanned for strain-like tokens (capital letters
    followed by digits, e.g. ``K12``); tokens containing ``ORF`` are ignored. Every remaining token
    is searched in the NCBI nucleotide database together with the host name, restricted to complete
    sequences that are not shotgun, phage, cds or gene records.

    Returned accessions not yet in the phage's ``Host_ID`` list are added directly when they start
    with a trusted prefix. Accessions that do not start with a rejected prefix are added according
    to the cached answer in ``self.searched`` or, failing that, the answer typed by the user.
    The table is saved once at the end.

    :return: None
    """
    from Bio import Entrez
    import re
    import ast
    Entrez.email = 'pedro_araujo97@hotmail.com'

    trusted_prefixes = ('NC_', 'NZ_', 'AC_', 'CP', 'AE', 'CY', 'AP')
    rejected_prefixes = ('MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC', 'MH', 'AY', 'FN')
    # Raw string so that \w reaches the regex engine instead of being an invalid string escape.
    strain_pattern = re.compile(r'\w{0,1}[A-Z]{1,5}[0-9]{1,5}[-,:]{0,1}[A-Z]{0,5}[1-9]{0,5}')

    count = 0
    # Cells of this column hold Python lists, which needs an object column.
    self.data['Host_ID'] = self.data['Host_ID'].astype(object)
    for phageID in self.data.index:
        host = self.data.loc[phageID, 'Host']
        if len(host.split()) >= 3:
            continue
        with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
            pubmed = Entrez.read(handle)
        # The cell is a string after a CSV round trip but a real list on a fresh table.
        stored = self.data.loc[phageID, 'Host_ID']
        listBactID = list(ast.literal_eval(stored)) if isinstance(stored, str) else list(stored)
        for article_set in pubmed[0]['LinkSetDb']:
            if 'weighted' in article_set["LinkName"]:
                continue
            for link in article_set["Link"]:
                try:
                    with Entrez.efetch(db="pubmed", rettype="medline", retmode="xml",
                                       id=link['Id']) as handle:
                        article = Entrez.read(handle)
                    abstract = \
                        article['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
                    # dict.fromkeys removes duplicates while keeping the order of appearance.
                    tokens = dict.fromkeys(token.strip(',;') for token in strain_pattern.findall(abstract))
                    # Filtering builds a new list; removing from a list while looping over it
                    # skips the element that follows each removal.
                    strain_names = [token for token in tokens if 'ORF' not in token]
                    for strain in strain_names:
                        with Entrez.esearch(db='nucleotide', term='((((' + host + ' ' + strain + ' AND complete sequence) NOT shotgun[Title]) NOT phage[Title]) NOT cds[Title]) NOT gene[Title]',
                                            idtype="acc") as handle:
                            species = Entrez.read(handle)
                        for accession in species['IdList']:
                            if accession in listBactID:
                                continue
                            if accession.startswith(trusted_prefixes):
                                answer = 'yes'
                            elif not accession.startswith(rejected_prefixes):
                                if accession in self.searched:
                                    answer = self.searched[accession]
                                else:
                                    answer = input('Check ' + accession + '\nDo you wish to add it? (yes/no)')
                            else:
                                continue
                            if 'y' in answer.lower():
                                listBactID.append(accession)
                                self.searched[accession] = 'yes'
                                count += 1
                            else:
                                # Record the refusal for this accession only; the cache itself
                                # must stay a dict.
                                self.searched[accession] = 'no'
                except Exception:
                    # Best effort: articles without an abstract or failed queries are skipped.
                    # Ctrl-C is deliberately not caught.
                    pass
        # .at stores the list as a single cell value; .loc treats a list as values to spread.
        self.data.at[phageID, 'Host_ID'] = listBactID
    print('For future reference,', count, "new bacterial ID's were added.")
    self.save_data()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
228
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def saveAbstracts(self):
    """
    Save the abstracts of the PubMed articles linked to each phage for later processing.

    For every phage accession in the index of ``self.data`` the linked PubMed articles are looked
    up, and the first abstract text of each article is stored as a ``[pubmed id, abstract]`` pair.
    The result, a dict mapping each phage accession to its list of pairs (possibly empty), is
    written as JSON to ``files/phageAbstracts.json``. Each accession is printed as it is processed.

    :return: None
    """
    from Bio import Entrez
    import json
    Entrez.email = 'pedro_araujo97@hotmail.com'
    dicPhageAbstracts = {}

    for phageID in self.data.index:
        print(phageID, end='\n')
        abstracts = dicPhageAbstracts[phageID] = []
        with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
            pubmed = Entrez.read(handle)
        for article_set in pubmed[0]['LinkSetDb']:
            if 'weighted' in article_set["LinkName"]:
                continue
            for link in article_set["Link"]:
                try:
                    with Entrez.efetch(db="pubmed", rettype="medline", retmode="xml", id=link['Id']) as handle:
                        article = Entrez.read(handle)
                    abstract = \
                        article['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
                    abstracts.append([link['Id'], abstract])
                except Exception:
                    # Best effort: articles without an abstract, or that cannot be fetched, are
                    # left out. Ctrl-C is deliberately not caught.
                    pass
    with open('files/phageAbstracts.json', 'w') as f:
        json.dump(dicPhageAbstracts, f)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
260
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def searchBacName(self):
    """
    Search NCBI directly for the genome of hosts that are named down to the strain.

    Only phages whose ``Host`` has more than two words are processed. The host name is searched
    in the NCBI nucleotide database, restricted to complete sequences that are not shotgun, phage,
    cds or gene records.

    Returned accessions not yet in the phage's ``Host_ID`` list are added directly when they start
    with a trusted prefix. Accessions that do not start with a rejected prefix are added according
    to the cached answer in ``self.searched`` or, failing that, the answer typed by the user.
    The table is saved once at the end.

    :return: None
    """
    from Bio import Entrez
    import ast
    Entrez.email = "pedro_araujo97@hotmail.com"

    trusted_prefixes = ('NC_', 'NZ_', 'AC_', 'CP', 'AE', 'CY', 'AP')
    rejected_prefixes = ('MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC', 'MH', 'AY', 'FN')

    count = 0
    # Cells of this column hold Python lists, which needs an object column.
    self.data['Host_ID'] = self.data['Host_ID'].astype(object)
    for phageID in self.data.index:
        host = self.data.loc[phageID, 'Host']
        if len(host.split()) <= 2:
            continue
        # The cell is a string after a CSV round trip but a real list on a fresh table.
        stored = self.data.loc[phageID, 'Host_ID']
        listBactID = list(ast.literal_eval(stored)) if isinstance(stored, str) else list(stored)
        with Entrez.esearch(db='nucleotide', term='((((' + host + ' AND complete sequence) NOT shotgun[Title]) NOT phage[Title]) NOT cds[Title]) NOT gene[Title]',
                            idtype="acc") as handle:
            species = Entrez.read(handle)
        for accession in species['IdList']:
            if accession in listBactID:
                continue
            # Real prefix test: a substring test on the first three characters also accepts
            # e.g. 'AAE...' for the prefix 'AE'.
            if accession.startswith(trusted_prefixes):
                answer = 'yes'
            elif not accession.startswith(rejected_prefixes):
                if accession in self.searched:
                    answer = self.searched[accession]
                else:
                    answer = input('Check ' + accession + '\nDo you wish to add it? (yes/no)')
            else:
                continue
            if 'y' in answer.lower():
                listBactID.append(accession)
                self.searched[accession] = 'yes'
                count += 1
            else:
                self.searched[accession] = 'no'
        # .at stores the list as a single cell value; .loc treats a list as values to spread.
        self.data.at[phageID, 'Host_ID'] = listBactID
    print('For future reference,', count, "new bacterial ID's were added.")
    self.save_data()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
296
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def createListBacID(self, lower=0, upper=100):
    """
    Append to ``self.listBacID`` one list of RefSeq host accessions per phage in a row range.

    For each phage at positions ``lower`` to ``upper - 1`` of the table, the first set of linked
    PubMed articles is followed to the first set of nucleotide records linked to each article.
    Records whose accession starts with ``NC_`` or ``NZ_`` and that are not the phage itself are
    collected. Phages with a missing or ``'unclassified bacterium'`` host, and phages for which a
    query fails, contribute an empty (or partial) list. The table itself is not modified.

    :param lower: lower index from the phage list (numeric)
    :param upper: upper index from the phage list (numeric, exclusive); positions past the end of
        the table are ignored
    :return: None
    """
    from Bio import Entrez
    Entrez.email = 'pedro_araujo97@hotmail.com'

    # The host name column is called 'Host' everywhere else in this class; 'Bacteria Name' is
    # still honoured for tables that use that header.
    host_column = 'Bacteria Name' if 'Bacteria Name' in self.data.columns else 'Host'

    for i in range(lower, min(upper, len(self.data.index))):
        phageID = self.data.index[i]
        BactID = []
        name = self.data.loc[phageID, host_column]
        try:
            # Valid-host check; name == name is False only for NaN (missing host).
            if name != 'unclassified bacterium' and name == name:
                with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
                    pubmed = Entrez.read(handle)
                for link in pubmed[0]["LinkSetDb"][0]["Link"]:
                    try:
                        with Entrez.elink(dbfrom='pubmed', db="nucleotide", id=link['Id']) as handle:
                            genomes = Entrez.read(handle)
                        for genome in genomes[0]['LinkSetDb'][0]['Link']:
                            with Entrez.esummary(db='nucleotide', id=genome['Id']) as handle:
                                summary = Entrez.read(handle)[0]
                            accession = summary['AccessionVersion']
                            if accession.startswith(('NC_', 'NZ_')) and summary['Caption'] != phageID:
                                BactID.append(accession)
                    except Exception:
                        # Best effort: an article with no linked records is skipped.
                        pass
        except Exception:
            # Best effort: a phage with no linked articles keeps an empty list.
            pass
        self.listBacID.append(BactID)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
331
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def check_bacteria(self):
    """
    Validates every host accession listed in the 'Host_ID' column and removes the invalid ones.

    Each distinct accession (version suffix stripped) is fetched from the NCBI nucleotide database as a GenBank
    record. An accession is discarded when the record description
      - contains none of the species epithets 'pneumoniae', 'coli', 'baumannii', or
      - contains none of the genera 'escherichia', 'acinetobacter', 'klebsiella', or
      - contains 'phage' or 'virus'.
    Accessions whose record cannot be fetched or parsed are discarded as well (best effort).
    The discarded accessions are printed, the filtered host lists are written back to self.data and the table is
    saved to 'files/NCBI_Phage_Bacteria_Data.csv'.

    NOTE(review): this is a method of PhageBacteriaData (it is called as test.check_bacteria() in the script
    section); keep it indented under the class when restoring the file.
    :return:
    """
    from Bio import Entrez
    from Bio import SeqIO
    import ast
    Entrez.email = "pedro_araujo97@hotmail.com"

    species_terms = ('pneumoniae', 'coli', 'baumannii')
    genus_terms = ('escherichia', 'acinetobacter', 'klebsiella')
    excluded_terms = ('phage', 'virus')

    def strip_version(accession):
        # 'NC_000913.3' -> 'NC_000913'. Splitting on the dot (instead of slicing off two characters) also
        # handles multi-digit versions ('.10') and accessions stored without a version.
        return accession.partition('.')[0]

    # Host_ID cells hold the string form of a list; parse every row once and reuse the result below.
    parsed_hosts = [(phage, ast.literal_eval(self.data.loc[phage, 'Host_ID'])) for phage in self.data.index]

    # Distinct accessions in order of first appearance, so each one is fetched only once.
    all_bact = []
    seen = set()
    for _, hosts in parsed_hosts:
        for bact in hosts:
            accession = strip_version(bact)
            if accession not in seen:
                seen.add(accession)
                all_bact.append(accession)

    list_remove = []
    for bact in all_bact:
        try:
            with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=bact) as handle:
                seq_record = SeqIO.read(handle, "gb")
            description = seq_record.description.lower()
        except Exception:
            # Best effort: a record that cannot be retrieved or parsed is treated as an invalid host.
            # Exception (not a bare except) so that Ctrl-C still interrupts the long fetch loop.
            list_remove.append(bact)
        else:
            if not any(term in description for term in species_terms) \
                    or not any(term in description for term in genus_terms) \
                    or any(term in description for term in excluded_terms):
                list_remove.append(bact)
    print(list_remove)

    rejected = set(list_remove)
    for phage, hosts in parsed_hosts:
        # Build a new list instead of calling remove() while iterating, which skipped the element that
        # followed every removed one.
        kept = [bact for bact in hosts if strip_version(bact) not in rejected]
        # .at addresses exactly one cell, so the list is stored as a single object rather than being
        # treated as a sequence of values to broadcast.
        self.data.at[phage, 'Host_ID'] = kept
    self.data.to_csv('files/NCBI_Phage_Bacteria_Data.csv')
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
361
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def save_data(self):
    """
    Writes the current state to disk: self.data as csv and self.searched as json.
    :return:
    """
    from json import dump

    csv_path = 'files/NCBI_Phage_Bacteria_Data.csv'
    accessions_path = 'files/searched_accessions'

    # Table first, then the accessions, in the same order as before.
    self.data.to_csv(csv_path)
    with open(accessions_path, 'w') as out_file:
        dump(self.searched, out_file)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
371
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
372
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: load the dataset and run the processing steps in a fixed order.
    pipeline = PhageBacteriaData('NCBI_Phage_Bacteria_Data.csv')  # sequences

    pipeline.addBacteriaName()
    pipeline.addBacteriaGenome()
    pipeline.searchBacName()  # author's note: 2266 bacteria added
    pipeline.checkAbstracts()
    pipeline.searchBacName()  # second pass, run again after checkAbstracts

    # Optional steps, currently disabled:
    #   pipeline.createListBacID(0, 100)
    #   pipeline.data = pipeline.data.iloc[:, 0:3]

    pipeline.check_bacteria()
    pipeline.save_data()

    # Optional steps, currently disabled:
    #   pipeline.extractProtein()
    #   pipeline.importProtein('Phage')