comparison get_proteins.py @ 0:e4b3fc88efe0 draft

Uploaded
author pedro_araujo
date Wed, 27 Jan 2021 13:50:11 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e4b3fc88efe0
1
class PhageBacteriaInformation:
    """
    Collects the annotated proteins of phages and of their bacterial hosts from NCBI.

    The phage/host table is read from a CSV file whose index column holds the phage
    accession and which has (at least) the columns ``Host`` and ``Host_ID``; ``Host_ID``
    holds the text of a Python list of bacterial accessions. Downloaded proteins are
    kept as ``{genome accession: {protein_id: [product, translation]}}`` and written
    as JSON and FASTA files inside ``files_dir``.
    """

    # Contact address sent to NCBI with every Entrez request.
    ENTREZ_EMAIL = 'pedro_araujo97@hotmail.com'
    # Directory holding the input dataset and every generated file.
    files_dir = 'files'

    def __init__(self, dataset=None, files_dir=None):
        """
        Imports a dataset from NCBI Virus and keeps only the usable phage entries.
        A phage entry is deleted when any of its columns is empty, when its Host_ID
        list is empty or when its Host name is a single word (no species given).
        The filtered table is saved as 'Filtered_Phage_Bacteria.csv'.
        :param dataset: name of the CSV file, relative to the files directory
        :param files_dir: optional replacement for the default 'files' directory
        :raises TypeError: if no dataset name is given
        """
        import ast
        import pandas as pd
        if dataset is None:
            raise TypeError('dataset must be the name of a CSV file inside the files directory')
        if files_dir is not None:
            self.files_dir = files_dir
        self.phagesProteins = {}
        self.bactProteins = {}
        data = pd.read_csv(self._path(dataset), header=0, index_col=0)
        data = data.dropna(how='any')
        data = data[data['Host_ID'] != '[]']
        # A boolean mask replaces integer lookups on the label-indexed Series
        # (deprecated in pandas) and also copes with repeated phage accessions.
        has_species = data['Host'].map(lambda host: ' ' in host).astype(bool)
        self.data = data[has_species]
        # A dict keeps first-seen order while removing duplicates in linear time.
        unique_hosts = {}
        for host_ids in self.data['Host_ID']:
            for bact in ast.literal_eval(host_ids):
                unique_hosts[self._strip_version(bact)] = None
        self.all_bact = list(unique_hosts)
        self.data.to_csv(self._path('Filtered_Phage_Bacteria.csv'))

    @staticmethod
    def _strip_version(accession):
        """
        Removes the trailing '.<digits>' version from an accession.
        Accessions without such a suffix are returned unchanged.
        :param accession: accession such as 'NC_000913.3'
        :return: the accession without its version, e.g. 'NC_000913'
        """
        base, dot, version = accession.rpartition('.')
        if dot and version.isdigit():
            return base
        return accession

    def _path(self, *parts):
        """
        Builds a path inside the files directory.
        :param parts: path components below the files directory
        :return: the joined path
        """
        import os
        return os.path.join(self.files_dir, *parts)

    @staticmethod
    def _cds_proteins(record):
        """
        Extracts the proteins of every CDS feature of a parsed GenBank record.
        CDS features lacking a protein_id, product or translation are skipped.
        :param record: object with a ``features`` list whose items have ``type`` and ``qualifiers``
        :return: dictionary {protein_id: [product, translation]}
        """
        proteins = {}
        for feat in record.features:
            if feat.type != 'CDS':
                continue
            quals = feat.qualifiers
            try:
                proteins[quals['protein_id'][0]] = [quals['product'][0], quals['translation'][0]]
            except (KeyError, IndexError):
                # Incomplete annotation (e.g. a pseudogene): nothing to store.
                continue
        return proteins

    @staticmethod
    def _parse_cds_from_flatfile(lines):
        """
        Extracts the proteins of every CDS feature from the text of a GenBank flat file.
        Feature keys are expected at an indentation of five spaces, as in the GenBank
        feature table layout; qualifiers and their continuation lines are indented
        further. Only the first product, protein_id and translation qualifier of each
        CDS is used and a CDS lacking any of them is skipped.
        :param lines: iterable of text (or bytes) lines of the flat file
        :return: dictionary {protein_id: [product, translation]}
        """
        wanted = ('product', 'protein_id', 'translation')
        proteins = {}
        quals = None   # qualifiers of the CDS being read; None outside a CDS
        active = None  # wanted qualifier that receives the continuation lines

        def store(found):
            if found is None or any(name not in found for name in wanted):
                return
            # Wrapped free text is re-joined with spaces, sequences without.
            product = ' '.join(found['product']).strip('"')
            protein_id = ''.join(found['protein_id']).strip('"')
            translation = ''.join(found['translation']).strip('"')
            proteins[protein_id] = [product, translation]

        for raw in lines:
            if isinstance(raw, bytes):
                raw = raw.decode('utf-8', 'replace')
            text = raw.strip()
            if not text:
                continue
            indent = len(raw) - len(raw.lstrip())
            if indent <= 5:
                # A new feature key or a new section closes the current CDS.
                store(quals)
                quals, active = None, None
                if indent == 5 and text.split()[0] == 'CDS':
                    quals = {}
            elif quals is not None:
                if text.startswith('/'):
                    name, _, value = text[1:].partition('=')
                    if name in wanted and name not in quals:
                        quals[name] = [value]
                        active = name
                    else:
                        active = None
                elif active is not None:
                    quals[active].append(text)
        store(quals)
        return proteins

    def _fetch_proteins(self, accession, with_parts_fallback=False):
        """
        Downloads a GenBank record from NCBI and returns its annotated proteins.
        :param accession: nucleotide accession of the genome
        :param with_parts_fallback: when True and the record has five features or fewer,
            the 'gbwithparts' flat file is downloaded as well and its CDS features are added
        :return: dictionary {protein_id: [product, translation]}
        """
        from Bio import Entrez
        from Bio import SeqIO
        Entrez.email = self.ENTREZ_EMAIL
        with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=accession) as handle:
            record = SeqIO.read(handle, "gb")
        proteins = self._cds_proteins(record)
        if with_parts_fallback and len(record.features) <= 5:
            with Entrez.efetch(db="nucleotide", rettype="gbwithparts", retmode="text", id=accession) as handle:
                proteins.update(self._parse_cds_from_flatfile(handle))
        return proteins

    def _save_proteins(self, proteins, name):
        """
        Writes a protein dictionary as '<name>.json' and '<name>.fasta' in the files directory.
        :param proteins: dictionary {genome accession: {protein_id: [product, translation]}}
        :param name: file name without extension
        """
        import json
        with open(self._path(name + '.json'), 'w') as f:
            json.dump(proteins, f)
        self.__createFasta(proteins, name)

    def addFeatures(self):
        """
        Downloads the proteins of every phage in the data and of every associated bacteria.
        The results are stored in phagesProteins and bactProteins and saved as JSON and FASTA.
        A failed download stops the whole run.
        :return:
        """
        print('Working...')
        for phageID in self.data.index:
            self.phagesProteins[phageID] = self._fetch_proteins(phageID)
        for bact in self.all_bact:
            self.bactProteins[bact] = self._fetch_proteins(bact)
        self._save_proteins(self.phagesProteins, 'phagesProteins')
        self._save_proteins(self.bactProteins, 'bactProteins')
        print('Done')

    def addBacteriaFeatures(self):
        """
        Downloads the proteins of each bacteria that is not yet present in bactProteins.
        A bacteria whose download fails is reported and skipped. The complete bactProteins
        dictionary is then saved as JSON and FASTA.
        :return:
        """
        print('Working...')
        for bact in self.all_bact:
            if bact in self.bactProteins:
                continue
            try:
                self.bactProteins[bact] = self._fetch_proteins(bact, with_parts_fallback=True)
            except Exception:
                # Exception (not a bare except) so that Ctrl-C still stops the run.
                print(bact + ' failed')
        self._save_proteins(self.bactProteins, 'bactProteins')
        print('Done')

    def add_individual_bacteria(self):
        """
        Downloads the proteins of each bacteria and saves them as 'bacteria/<accession>.json'.
        Bacteria whose file already exists are skipped; a bacteria whose download or save
        fails is reported and skipped. Nothing is stored in bactProteins.
        :return:
        """
        import json
        import os
        print('Working...')
        out_dir = self._path('bacteria')
        # Without the directory every download would be wasted on a failing write.
        os.makedirs(out_dir, exist_ok=True)
        for bact in self.all_bact:
            target = os.path.join(out_dir, bact + '.json')
            if os.path.isfile(target):
                continue
            try:
                proteins = self._fetch_proteins(bact, with_parts_fallback=True)
                with open(target, 'w') as f:
                    json.dump(proteins, f)
            except Exception:
                # Exception (not a bare except) so that Ctrl-C still stops the run.
                print(bact + ' failed')
        print('Done')

    def importData(self):
        """
        Imports the previously saved phage proteins from 'phagesProteins.json' into phagesProteins.
        The bacteria proteins are not imported.
        :return:
        """
        import json
        with open(self._path('phagesProteins.json'), encoding='utf-8') as F:
            self.phagesProteins = json.load(F)

    def addPhageProt(self):
        """
        Downloads the proteins of every phage in the data, replacing any entry already present,
        and saves phagesProteins as JSON and FASTA.
        :return: the phagesProteins dictionary
        """
        print('Working...')
        for phageID in self.data.index:
            self.phagesProteins[phageID] = self._fetch_proteins(phageID)
        self._save_proteins(self.phagesProteins, 'phagesProteins')
        return self.phagesProteins

    def add_missing_phage(self):
        """
        Downloads the proteins of each phage that is not yet present in phagesProteins,
        printing its accession, and saves phagesProteins as JSON and FASTA.
        :return: the phagesProteins dictionary
        """
        print('Working...')
        for phageID in self.data.index:
            if phageID in self.phagesProteins:
                continue
            print(phageID)
            self.phagesProteins[phageID] = self._fetch_proteins(phageID)
        self._save_proteins(self.phagesProteins, 'phagesProteins')
        return self.phagesProteins

    def __createFasta(self, var, name):
        """
        Writes '<name>.fasta' in the files directory.
        :param var: dictionary whose values are either {protein_id: [product, sequence]}
            (one '>protein_id-key' record per protein) or a plain sequence string
            (one '>key' record)
        :param name: file name without extension
        """
        with open(self._path(name + '.fasta'), 'w') as F:
            for spec, entry in var.items():
                if isinstance(entry, str):
                    F.write('>' + spec + '\n' + entry + '\n')
                    continue
                for prot, info in entry.items():
                    F.write('>' + prot + '-' + spec + '\n' + info[1] + '\n')
263
if __name__ == '__main__':
    # Script entry point: load and filter the phage/host table shipped in 'files/'.
    phage_info = PhageBacteriaInformation('NCBI_Phage_Bacteria_Data.csv')
    # One JSON file per bacteria that has not been saved yet.
    phage_info.add_individual_bacteria()
    # Proteins of every phage, then of any phage still absent from the dictionary.
    phage_info.addPhageProt()
    phage_info.add_missing_phage()
    # Reload the phage proteins from the JSON file written above.
    phage_info.importData()