annotate phage_host_prediction/get_proteins.py @ 2:3e1e8be4e65c draft default tip

Uploaded
author pedro_araujo
date Fri, 02 Apr 2021 10:11:13 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
1
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
class PhageBacteriaInformation:
    """
    Downloads and caches the protein sequences of phages and of their bacterial hosts.

    The phage/host pairs come from a CSV table stored in the ``files`` folder. Sequences are
    fetched from the NCBI nucleotide database through Biopython's Entrez module and are stored
    as ``{accession: {protein_id: [product, translation]}}`` dictionaries, which are written to
    JSON and FASTA files inside ``files``.
    """

    # Contact address passed to NCBI with every Entrez request.
    ENTREZ_EMAIL = 'pedro_araujo97@hotmail.com'

    def __init__(self, dataset=None):
        """
        Imports the phage/host table, removes unusable rows and lists the unique host accessions.

        Rows are removed when any cell is missing, when 'Host_ID' is the literal string '[]', or
        when the 'Host' name is a single word. The filtered table is written to
        'files/Filtered_Phage_Bacteria.csv'.
        :param dataset: name of a CSV file inside 'files'; its first column is used as index
            (the phage accession) and it must have 'Host' and 'Host_ID' columns
        :raises TypeError: if no dataset name is given
        """
        import pandas as pd
        import ast
        if dataset is None:
            raise TypeError('dataset must be the name of a CSV file inside the "files" folder')
        self.phagesProteins = {}
        self.bactProteins = {}
        self.data = pd.read_csv('files/' + dataset, header=0, index_col=0)
        self.data = self.data.dropna(how='any')
        self.data = self.data[self.data['Host_ID'] != '[]']
        # Keep rows by position: the index holds phage accessions, so integer look-ups on the
        # column would depend on a deprecated positional fallback, and dropping by label would
        # also remove unrelated rows that share a label.
        keep = [pos for pos, host in enumerate(self.data['Host']) if len(str(host).split(' ')) > 1]
        self.data = self.data.iloc[keep]
        self.all_bact = []
        seen = set()
        for host_ids in self.data['Host_ID']:
            # 'Host_ID' holds the textual form of a Python list of accessions.
            for bact in ast.literal_eval(host_ids):
                # The last two characters are dropped from every identifier.
                # NOTE(review): presumably a one-digit version suffix such as '.1' - confirm.
                accession = bact[:-2]
                if accession not in seen:
                    seen.add(accession)
                    self.all_bact.append(accession)
        self.data.to_csv('files/Filtered_Phage_Bacteria.csv')

    def _open_genbank(self, accession, rettype="gb"):
        """Opens an Entrez handle on the GenBank text record of a nucleotide accession."""
        from Bio import Entrez
        Entrez.email = self.ENTREZ_EMAIL
        return Entrez.efetch(db="nucleotide", rettype=rettype, retmode="text", id=accession)

    def _fetch_record(self, accession):
        """Downloads a GenBank record and parses it with SeqIO."""
        from Bio import SeqIO
        with self._open_genbank(accession) as handle:
            return SeqIO.read(handle, "gb")

    @staticmethod
    def _cds_proteins(record):
        """Returns {protein_id: [product, translation]} for the CDS features of a parsed record."""
        proteins = {}
        for feat in record.features:
            if feat.type != 'CDS':
                continue
            try:
                proteins[feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0],
                                                              feat.qualifiers['translation'][0]]
            except (KeyError, IndexError):
                # CDS without a protein id, product or translation: nothing to store.
                continue
        return proteins

    @staticmethod
    def _parse_flatfile_proteins(lines):
        """
        Extracts {protein_id: [product, translation]} from the text lines of a GenBank flat file.

        A CDS is stored only when its own /product, /protein_id and /translation qualifiers are
        all present, so values never leak from one CDS into another.
        :param lines: iterable of text lines of a GenBank record
        :return: dictionary of the proteins found
        """
        wanted = ('product', 'protein_id', 'translation')
        proteins = {}
        fields = None  # qualifiers of the CDS being read; None while outside a CDS
        open_key = None  # qualifier whose quoted value continues on the following lines

        def store(cds):
            if cds is not None and all(key in cds for key in wanted):
                proteins[cds['protein_id']] = [cds['product'].strip(), cds['translation']]

        for raw in lines:
            line = raw.rstrip()
            if not line:
                continue
            starts_section = not line[0].isspace()
            # Feature keys start in column 6; qualifiers and their continuations are indented deeper.
            starts_feature = line.startswith('     ') and not line[5].isspace()
            if starts_section or starts_feature:
                store(fields)
                fields = {} if starts_feature and line.split()[0] == 'CDS' else None
                open_key = None
                continue
            if fields is None:
                continue
            text = line.strip()
            if open_key is not None:
                closing = text.endswith('"')
                if open_key in fields:
                    piece = text[:-1] if closing else text
                    # Sequences are glued together, free text is re-joined with spaces.
                    fields[open_key] += ('' if open_key == 'translation' else ' ') + piece
                if closing:
                    open_key = None
                continue
            if not text.startswith('/') or '=' not in text:
                continue  # location continuation or value-less qualifier
            key, value = text[1:].split('=', 1)
            if value.startswith('"'):
                value = value[1:]
                if value.endswith('"'):
                    value = value[:-1]
                else:
                    open_key = key
            if key in wanted:
                fields[key] = value
        store(fields)
        return proteins

    def _fetch_bacteria_proteins(self, accession):
        """
        Downloads the proteins of one bacterial accession.

        When the parsed record has five features or fewer, the record is requested again with
        rettype 'gbwithparts' and its text is parsed for further CDS entries.
        """
        record = self._fetch_record(accession)
        proteins = self._cds_proteins(record)
        if len(record.features) <= 5:
            with self._open_genbank(accession, rettype="gbwithparts") as handle:
                proteins.update(self._parse_flatfile_proteins(handle.readlines()))
        return proteins

    def _save_proteins(self, proteins, name):
        """Writes a protein dictionary to 'files/<name>.json' and 'files/<name>.fasta'."""
        import json
        with open('files/' + name + '.json', 'w') as f:
            json.dump(proteins, f)
        self.__createFasta(proteins, name)

    def _download_phages(self, only_missing):
        """Fetches the proteins of the phages in the table, saves them and returns the dictionary."""
        print('Working...')
        for phageID in self.data.index:
            if only_missing:
                if phageID in self.phagesProteins:
                    continue
                print(phageID)
            self.phagesProteins[phageID] = self._cds_proteins(self._fetch_record(phageID))
        self._save_proteins(self.phagesProteins, 'phagesProteins')
        return self.phagesProteins

    def addFeatures(self):
        """
        Downloads the proteins of every phage in the table and of every host accession.

        Results are kept in self.phagesProteins and self.bactProteins and written to the
        'phagesProteins' and 'bactProteins' JSON and FASTA files inside 'files'.
        :return:
        """
        print('Working...')
        for phageID in self.data.index:
            self.phagesProteins[phageID] = self._cds_proteins(self._fetch_record(phageID))
        for bact in self.all_bact:
            self.bactProteins[bact] = self._cds_proteins(self._fetch_record(bact))
        self._save_proteins(self.phagesProteins, 'phagesProteins')
        self._save_proteins(self.bactProteins, 'bactProteins')
        print('Done')

    def addBacteriaFeatures(self):
        """
        Downloads the proteins of every host accession not yet present in self.bactProteins.

        A host whose download or parsing fails is reported on standard output and skipped. The
        dictionary is then written to 'files/bactProteins.json' and 'files/bactProteins.fasta'.
        :return:
        """
        print('Working...')
        for bact in self.all_bact:
            if bact in self.bactProteins:
                continue
            try:
                self.bactProteins[bact] = self._fetch_bacteria_proteins(bact)
            except Exception:
                # Best effort: one bad record must not stop the run (Ctrl-C still does).
                print(bact + ' failed')
        self._save_proteins(self.bactProteins, 'bactProteins')
        print('Done')

    def add_individual_bacteria(self):
        """
        Downloads the proteins of every host accession into its own 'files/bacteria/<id>.json'.

        Accessions whose file already exists are skipped, so an interrupted run can be resumed.
        A host whose download or parsing fails is reported on standard output and skipped.
        :return:
        """
        import json
        from pathlib import Path
        print('Working...')
        out_dir = Path('files/bacteria')
        # Without the folder every write would fail after the download had already been paid for.
        out_dir.mkdir(parents=True, exist_ok=True)
        for bact in self.all_bact:
            target = out_dir / (bact + '.json')
            if target.is_file():
                continue
            try:
                proteins = self._fetch_bacteria_proteins(bact)
                # Serialise first so a failure cannot leave a half-written file behind.
                target.write_text(json.dumps(proteins))
            except Exception:
                print(bact + ' failed')
        print('Done')

    def importData(self):
        """
        Loads the previously saved phage proteins from 'files/phagesProteins.json'.
        :return:
        """
        import json
        with open('files/phagesProteins.json', encoding='utf-8') as F:
            self.phagesProteins = json.load(F)

    def addPhageProt(self):
        """
        Downloads the proteins of every phage in the table and saves them as JSON and FASTA.
        :return: the dictionary of phage proteins
        """
        return self._download_phages(only_missing=False)

    def add_missing_phage(self):
        """
        Downloads the proteins of the phages that are not yet in self.phagesProteins.

        Each newly fetched accession is printed; the full dictionary is saved as JSON and FASTA.
        :return: the dictionary of phage proteins
        """
        return self._download_phages(only_missing=True)

    def __createFasta(self, var, name):
        """
        Writes 'files/<name>.fasta' from a dictionary of sequences.

        :param var: maps an identifier either to a plain sequence string or to a dictionary
            {protein_id: [product, sequence]}
        :param name: file name without extension
        """
        with open('files/' + name + '.fasta', 'w') as F:
            for spec, content in var.items():
                if isinstance(content, str):
                    F.write('>' + spec + '\n' + content + '\n')
                    continue
                for prot, info in content.items():
                    F.write('>' + prot + '-' + spec + '\n' + info[1] + '\n')
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
262
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
263
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: load and filter the NCBI phage/host table, then run the
    # collection steps one after another in their original order.
    collector = PhageBacteriaInformation('NCBI_Phage_Bacteria_Data.csv')
    pipeline = (
        collector.add_individual_bacteria,  # one JSON file per host accession
        collector.addPhageProt,             # proteins of every phage in the table
        collector.add_missing_phage,        # phages still absent from the dictionary
        collector.importData,               # reload the saved phage proteins
    )
    for step in pipeline:
        step()