Mercurial > repos > pedro_araujo > phage_host_prediction
comparison get_proteins.py @ 0:e4b3fc88efe0 draft
Uploaded
author | pedro_araujo |
---|---|
date | Wed, 27 Jan 2021 13:50:11 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e4b3fc88efe0 |
---|---|
1 | |
class PhageBacteriaInformation:
    """Collect the proteins of phages and of their bacterial hosts from NCBI.

    The phage/host table is read from ``files/<dataset>``; protein dictionaries
    are written back to the ``files/`` directory as JSON and FASTA.

    Attributes:
        data: filtered phage/host table (a pandas DataFrame indexed by the
            first CSV column, which is used as the phage accession).
        all_bact: unique host accessions, version suffix removed, in order of
            first appearance in the ``Host_ID`` column.
        phagesProteins: ``{phage accession: {protein_id: [product, sequence]}}``.
        bactProteins: same layout, keyed by host accession.
    """

    # Contact address handed to NCBI with every Entrez request.
    ENTREZ_EMAIL = 'pedro_araujo97@hotmail.com'

    # A parsed record with at most this many features triggers a second
    # download as "gbwithparts" (presumably because large genomes come back
    # without their annotation in the plain "gb" view - TODO confirm).
    SPARSE_RECORD_FEATURES = 5

    def __init__(self, dataset=None):
        """
        Load and filter the phage/host table, then list the unique hosts.

        Rows are discarded when any cell is missing, when ``Host_ID`` is the
        literal string ``'[]'``, or when ``Host`` consists of a single word.
        The filtered table is saved as ``files/Filtered_Phage_Bacteria.csv``.

        :param dataset: name of a CSV file located inside ``files/``. It must
            provide the columns ``Host`` and ``Host_ID``; ``Host_ID`` holds a
            Python list literal of accessions.
        :raises TypeError: if ``dataset`` is not given.
        """
        import ast
        import pandas as pd

        if dataset is None:
            # Same exception type as the former "'files/' + None" failure,
            # but with a message that says what is wrong.
            raise TypeError("dataset must be the name of a CSV file inside 'files/'")

        self.phagesProteins = {}
        self.bactProteins = {}

        data = pd.read_csv('files/' + dataset, header=0, index_col=0)
        data = data.dropna(how='any')
        data = data[data['Host_ID'] != '[]']
        # Select rows by position: the index holds labels, so integer
        # look-ups on the column would rely on deprecated positional fallback.
        keep = [pos for pos, host in enumerate(data['Host'])
                if len(host.split(' ')) > 1]
        self.data = data.iloc[keep]

        self.all_bact = []
        seen = set()  # O(1) duplicate check; the list keeps the order
        for host_ids in self.data['Host_ID']:
            for bact in ast.literal_eval(host_ids):
                accession = self._strip_version(bact)
                if accession not in seen:
                    seen.add(accession)
                    self.all_bact.append(accession)
        self.data.to_csv('files/Filtered_Phage_Bacteria.csv')

    @staticmethod
    def _strip_version(accession):
        """Return ``accession`` without a trailing ``.<digits>`` version suffix."""
        base, dot, version = accession.rpartition('.')
        if dot and version.isdigit():
            return base
        return accession

    @staticmethod
    def _cds_proteins(features):
        """
        Build ``{protein_id: [product, translation]}`` from parsed features.

        Only features whose ``type`` is ``'CDS'`` are considered; a CDS that
        lacks any of the three qualifiers is skipped.

        :param features: iterable of objects exposing ``type`` and
            ``qualifiers`` (a mapping of qualifier name to list of values).
        :return: dictionary of proteins.
        """
        proteins = {}
        for feat in features:
            if feat.type != 'CDS':
                continue
            quals = feat.qualifiers
            try:
                proteins[quals['protein_id'][0]] = [quals['product'][0],
                                                    quals['translation'][0]]
            except (KeyError, IndexError):
                # Incomplete annotation (e.g. no translation): nothing to keep.
                continue
        return proteins

    @staticmethod
    def _parse_flatfile_cds(lines):
        """
        Extract ``{protein_id: [product, translation]}`` from GenBank text.

        A feature starts on a line with exactly five leading blanks followed
        by the feature key; only ``CDS`` features are read and each one ends
        where the next feature or section begins, so qualifiers can never
        leak from one CDS into another. Quoted values may span several
        lines: translation lines are concatenated, other values are joined
        with a blank. A CDS missing ``/product``, ``/protein_id`` or
        ``/translation`` is skipped.

        :param lines: iterable of text lines of a GenBank flat file.
        :return: dictionary of proteins.
        """
        from itertools import chain

        wanted = ('product', 'protein_id', 'translation')
        proteins = {}
        in_cds = False
        quals = {}
        name = None   # qualifier whose value is currently being collected
        pieces = []   # physical lines of that value
        quotes = 0    # quote characters seen so far in that value

        # The extra '//' guarantees that the last CDS is flushed as well.
        for raw in chain(lines, ('//',)):
            line = raw.rstrip()
            if not line:
                continue
            starts_section = not line[0].isspace()
            starts_feature = line[:5] == '     ' and not line[5].isspace()
            if starts_section or starts_feature:
                if in_cds and all(key in quals for key in wanted):
                    proteins[quals['protein_id']] = [quals['product'],
                                                     quals['translation']]
                in_cds = starts_feature and line.split()[0] == 'CDS'
                quals, name, pieces, quotes = {}, None, [], 0
                continue
            if not in_cds:
                continue

            text = line.strip()
            if name is None:
                if not text.startswith('/'):
                    continue  # continuation of the feature location
                name, _, text = text[1:].partition('=')
                pieces, quotes = [], 0
            pieces.append(text)
            quotes += text.count('"')
            if quotes % 2:
                # Embedded quotes are doubled, so an odd count means the
                # closing quote has not been reached yet.
                continue
            if name in wanted and name not in quals:
                glue = '' if name == 'translation' else ' '
                value = glue.join(pieces)
                if len(value) > 1 and value[0] == value[-1] == '"':
                    value = value[1:-1]
                quals[name] = value.replace('""', '"')
            name = None
        return proteins

    @staticmethod
    def _fasta_records(var):
        """
        Yield the FASTA records for ``var``.

        A dictionary entry produces one ``>protein-species`` record per
        protein (the sequence is element 1 of the stored list); a non-empty
        string entry produces a single ``>species`` record.
        """
        for spec, entry in var.items():
            if isinstance(entry, str):
                if entry:
                    yield '>' + spec + '\n' + entry + '\n'
            else:
                for prot, info in entry.items():
                    yield '>' + prot + '-' + spec + '\n' + info[1] + '\n'

    def _fetch_proteins(self, accession, with_fallback=False):
        """
        Download one nucleotide record from NCBI and return its proteins.

        :param accession: identifier passed to ``Entrez.efetch``.
        :param with_fallback: when true and the parsed record has at most
            ``SPARSE_RECORD_FEATURES`` features, the record is fetched again
            as ``gbwithparts`` and its text is scanned for CDS features.
        :return: ``{protein_id: [product, translation]}``.
        """
        from Bio import Entrez
        from Bio import SeqIO

        Entrez.email = self.ENTREZ_EMAIL
        with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=accession) as handle:
            record = SeqIO.read(handle, "gb")
        proteins = self._cds_proteins(record.features)
        if with_fallback and len(record.features) <= self.SPARSE_RECORD_FEATURES:
            with Entrez.efetch(db="nucleotide", rettype="gbwithparts", retmode="text", id=accession) as handle:
                proteins.update(self._parse_flatfile_cds(handle))
        return proteins

    def _save_proteins(self, proteins, name):
        """Write ``proteins`` to ``files/<name>.json`` and ``files/<name>.fasta``."""
        import json

        with open('files/' + name + '.json', 'w') as f:
            json.dump(proteins, f)
        self.__createFasta(proteins, name)

    def addFeatures(self):
        """
        Download the proteins of every phage and of every host.

        Results are stored in ``phagesProteins`` and ``bactProteins`` and
        saved as JSON and FASTA under ``files/``. Hosts are fetched without
        the ``gbwithparts`` fallback and any download error is propagated.
        :return:
        """
        print('Working...')
        for phageID in self.data.index:
            self.phagesProteins[phageID] = self._fetch_proteins(phageID)
        for bact in self.all_bact:
            self.bactProteins[bact] = self._fetch_proteins(bact)
        self._save_proteins(self.phagesProteins, 'phagesProteins')
        self._save_proteins(self.bactProteins, 'bactProteins')
        print('Done')

    def addBacteriaFeatures(self):
        """
        Download the proteins of every host not yet present in ``bactProteins``.

        A host whose download or parsing fails is reported on stdout and
        skipped. The complete dictionary is then saved as
        ``files/bactProteins.json`` and ``files/bactProteins.fasta``.
        :return:
        """
        print('Working...')
        for bact in self.all_bact:
            if bact in self.bactProteins:
                continue
            try:
                self.bactProteins[bact] = self._fetch_proteins(bact, with_fallback=True)
            except Exception:
                # Best effort per host; Ctrl-C is no longer swallowed.
                print(bact + ' failed')
        self._save_proteins(self.bactProteins, 'bactProteins')
        print('Done')

    def add_individual_bacteria(self):
        """
        Save the proteins of each host in its own ``files/bacteria/<id>.json``.

        Hosts that already have a file are skipped, so an interrupted run can
        be resumed. A host whose download, parsing or saving fails is
        reported on stdout and skipped.
        :return:
        """
        import json
        from pathlib import Path

        print('Working...')
        out_dir = Path('files/bacteria')
        # Without the directory every download would end in a failed write.
        out_dir.mkdir(parents=True, exist_ok=True)
        for bact in self.all_bact:
            target = out_dir / (bact + '.json')
            if target.is_file():
                continue
            try:
                proteins = self._fetch_proteins(bact, with_fallback=True)
                # Serialise before opening so a failure leaves no empty file
                # that later runs would mistake for a finished download.
                payload = json.dumps(proteins)
                with open(str(target), 'w') as f:
                    f.write(payload)
            except Exception:
                print(bact + ' failed')
        print('Done')

    def importData(self):
        """
        Reload ``phagesProteins`` from ``files/phagesProteins.json``.

        Only the phage proteins are imported; ``bactProteins`` is left as is.
        :return:
        """
        import json

        with open('files/phagesProteins.json', encoding='utf-8') as F:
            self.phagesProteins = json.load(F)

    def addPhageProt(self):
        """
        Download the proteins of every phage in the table.

        The result is saved as ``files/phagesProteins.json`` and
        ``files/phagesProteins.fasta``.
        :return: the ``phagesProteins`` dictionary.
        """
        print('Working...')
        for phageID in self.data.index:
            self.phagesProteins[phageID] = self._fetch_proteins(phageID)
        self._save_proteins(self.phagesProteins, 'phagesProteins')
        return self.phagesProteins

    def add_missing_phage(self):
        """
        Download the proteins of the phages absent from ``phagesProteins``.

        Each phage being fetched is printed. The complete dictionary is then
        saved as ``files/phagesProteins.json`` and ``files/phagesProteins.fasta``.
        :return: the ``phagesProteins`` dictionary.
        """
        print('Working...')
        for phageID in self.data.index:
            if phageID in self.phagesProteins:
                continue
            print(phageID)
            self.phagesProteins[phageID] = self._fetch_proteins(phageID)
        self._save_proteins(self.phagesProteins, 'phagesProteins')
        return self.phagesProteins

    def __createFasta(self, var, name):
        """Write the FASTA records of ``var`` to ``files/<name>.fasta``."""
        with open('files/' + name + '.fasta', 'w') as F:
            F.writelines(self._fasta_records(var))
262 | |
263 | |
if __name__ == '__main__':
    # Script entry point: build the object from the NCBI dataset file name,
    # then call the four steps one after the other, in this fixed order.
    pipeline = PhageBacteriaInformation('NCBI_Phage_Bacteria_Data.csv')
    for step_name in ('add_individual_bacteria',
                      'addPhageProt',
                      'add_missing_phage',
                      'importData'):
        getattr(pipeline, step_name)()