2
|
1 #!/usr/bin/env python
|
|
2
|
|
3 __author__= "Gianmarco Piccinno"
|
|
4 __version__ = "1.0.0"
|
|
5
|
|
6 import Bio
|
|
7 from Bio import SeqIO
|
|
8 from Bio.Seq import Seq
|
|
9 from Bio.Alphabet import IUPAC
|
|
10 from Bio.Data import IUPACData
|
|
11 from Bio.Data import CodonTable
|
|
12 import re
|
|
13 import sre_yield
|
|
14
|
|
15 import re
|
|
16 import itertools
|
|
17 from functools import reduce
|
|
18
|
|
19 import Bio
|
|
20 from Bio import Data
|
|
21 from Bio.Data import IUPACData
|
|
22 from Bio.Data import CodonTable
|
|
23
|
|
24 from pprint import pprint
|
|
25
|
|
26 import pandas as pd
|
|
27
|
|
28 def _check_bases(seq_string):
|
|
29 """
|
|
30 Check characters in a string (PRIVATE).
|
|
31 Remove digits and white space present in string. Allows any valid ambiguous
|
|
32 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted).
|
|
33
|
|
34 Other characters (e.g. symbols) trigger a TypeError.
|
|
35
|
|
36 Returns the string WITH A LEADING SPACE (!). This is for backwards
|
|
37 compatibility, and may in part be explained by the fact that
|
|
38 Bio.Restriction doesn't use zero based counting.
|
|
39 """
|
|
40 # Remove white space and make upper case:
|
|
41 seq_string = "".join(seq_string.split()).upper()
|
|
42 # Remove digits
|
|
43 for c in "0123456789":
|
|
44 seq_string = seq_string.replace(c, "")
|
|
45 # Check only allowed IUPAC letters
|
|
46 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")):
|
|
47 raise TypeError("Invalid character found in %s" % repr(seq_string))
|
|
48 return " " + seq_string
|
|
49
|
|
50 matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN',
|
|
51 'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY',
|
|
52 'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY',
|
|
53 'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY',
|
|
54 'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY',
|
|
55 'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'}
|
|
56
|
|
57 class pattern(object):
|
|
58
|
|
59
|
|
60 def __init__(self, pattern_input):
|
|
61 s = str(pattern_input)
|
|
62 self.upper = s.isupper()
|
|
63 self.data = _check_bases(s)
|
|
64 self.pattern = s
|
|
65
|
|
66 def plan_ambiguity(self):
|
|
67 val = Bio.Data.IUPACData.ambiguous_dna_values
|
|
68 re_pattern = ""
|
|
69 for el in self.pattern:
|
|
70 re_pattern = re_pattern + "[" + val[el] + "]"
|
|
71 return re_pattern
|
|
72
|
|
73 class annotated_genome(object):
|
|
74
|
|
75 def __init__(self, seq):
|
|
76 s = str(seq)
|
|
77 self.upper = s.isupper()
|
|
78 self.data = _check_bases(s)
|
|
79 self.seq = s
|
|
80
|
|
81 def codon_usage(self, codonTable):
|
|
82
|
|
83 codon_usage = {}
|
|
84 tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""]
|
|
85
|
|
86 b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table
|
|
87 aas = set(b_cod_table.values())
|
|
88
|
|
89 for aa in aas:
|
|
90 codon_usage[aa] = {}
|
|
91 for codon in b_cod_table.keys():
|
|
92 if b_cod_table[codon] == aa:
|
|
93 codon_usage[aa][codon] = tmp.count(codon)
|
|
94
|
|
95 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
|
|
96
|
|
97 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
|
|
98 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
|
|
99 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
|
|
100
|
|
101 codon_usage_.reset_index(inplace=True)
|
|
102 codon_usage_.index = codon_usage_["Codon"]
|
|
103
|
|
104 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
|
|
105
|
|
106 class plasmid(object):
|
|
107 """
|
|
108 This class represents a circular plasmid
|
|
109 """
|
|
110
|
|
111 def __init__(self, seq = "", circular=True, features = None):
|
|
112
|
|
113 if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]:
|
|
114 s = str(seq.seq)
|
|
115 self.features = seq.features
|
|
116 else:
|
|
117 s = str(seq)
|
|
118 if features != None:
|
|
119 self.features = features
|
|
120 self.upper = s.isupper()
|
|
121 #self.data = _check_bases(s)
|
|
122 self.data = s
|
|
123 self.circular = circular
|
|
124 self.Class = s.__class__
|
|
125 self.size = len(s)
|
|
126 self.seq = self.data
|
|
127
|
|
128 def __len__(self):
|
|
129 return len(self.data)
|
|
130
|
|
131 def __repr__(self):
|
|
132 return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular))
|
|
133
|
|
134 def __eq__(self, other):
|
|
135 if isinstance(other, plasmid):
|
|
136 if repr(self) == repr(other):
|
|
137 return True
|
|
138 else:
|
|
139 return False
|
|
140 return False
|
|
141
|
|
142 def __getitem__(self, i):
|
|
143 if self.upper:
|
|
144 return self.Class((self.data[i]).upper())
|
|
145 return self.Class(self.data[i])
|
|
146
|
|
147 def sequence(self):
|
|
148 return self.data
|
|
149
|
|
150
|
|
151 def extract(self, feature):
|
|
152 return self.data[feature.location.start:feature.location.end]#[::feature.strand]
|
|
153
|
|
154
|
|
155 def findpattern(self, pattern, ambiguous_pattern):
|
|
156 """
|
|
157 Return a list of a given pattern which occurs in the sequence.
|
|
158 The list is made of tuple (location, pattern.group).
|
|
159 The latter is used with non palindromic sites.
|
|
160 Pattern is the regular expression pattern corresponding to the
|
|
161 enzyme restriction site.
|
|
162 Size is the size of the restriction enzyme recognition-site size.
|
|
163 """
|
|
164 if not self.circular:
|
|
165 data = self.data
|
|
166 else:
|
|
167 data = self.data + self.data[:len(ambiguous_pattern)]
|
|
168 print(data)
|
|
169 print(pattern)
|
|
170 return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)]
|
|
171
|
|
172 def findpatterns(self, patterns, ambiguous_patterns):
|
|
173 """
|
|
174 Return a list of a given pattern which occurs in the sequence.
|
|
175 The list is made of tuple (location, pattern.group).
|
|
176 The latter is used with non palindromic sites.
|
|
177 Pattern is the regular expression pattern corresponding to the
|
|
178 enzyme restriction site.
|
|
179 Size is the size of the restriction enzyme recognition-site size.
|
|
180 """
|
|
181 patts = {}
|
|
182 for i in range(len(patterns)):
|
|
183 #print(patterns[i])
|
|
184 #print(ambiguous_patterns[i])
|
|
185 if not self.circular:
|
|
186 data = self.data
|
|
187 else:
|
|
188 data = self.data + self.data[:len(ambiguous_patterns[i])]
|
|
189 #print(data)
|
|
190 patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)]
|
|
191
|
|
192 return patts
|
|
193
|
|
194
|
|
195 def codon_switch(self, patterns, annotation, codon_usage):
|
|
196
|
|
197 table = {}
|
|
198
|
|
199 for pattern in patterns:
|
|
200 if pattern[1] >= annotation & pattern[2] <= annotation:
|
|
201 # the pattern is in a coding sequence
|
|
202 # do codon switch
|
|
203 print("Do codon switch!")
|
|
204 else:
|
|
205 next
|
|
206 return table
|
|
207
|
|
208 def transversion(self, patterns):
|
|
209 return
|
|
210
|
|
211 def read_patterns(input):
|
|
212
|
|
213 patterns = []
|
|
214
|
|
215 with open(input, "rU") as handle:
|
|
216 for pat in handle.readlines():
|
|
217
|
|
218 tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna)
|
|
219
|
|
220 patterns.append(tmp)
|
|
221 #patterns.append(tmp.reverse_complement())
|
|
222
|
|
223
|
|
224 return patterns
|
|
225
|
|
226
|
|
227 def codon_usage(seqs, codonTable):
|
|
228
|
|
229 codon_usage = {}
|
|
230 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
|
|
231
|
|
232 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
|
|
233
|
|
234 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
|
|
235 b_cod_table[cod] = "_Stop"
|
|
236
|
|
237 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
|
|
238 #print(cod)
|
|
239 b_cod_table[cod] = b_cod_table[cod]
|
|
240
|
|
241 aas = set(b_cod_table.values())
|
|
242
|
|
243 for aa in aas:
|
|
244 #print(aa)
|
|
245 #codon_usage[aa] = {}
|
|
246 for codon in b_cod_table.keys():
|
|
247 if b_cod_table[codon] == aa:
|
|
248 codon_usage[codon] = tmp.count(codon.split(" ")[0])
|
|
249
|
|
250 return codon_usage
|
|
251
|
|
252
|
|
253 def read_annotated_genome(data="example.fna", type_="fasta"):
|
|
254 """
|
|
255 Accepted formats:
|
|
256 - fasta (multifasta)
|
|
257 - gbk
|
|
258
|
|
259 """
|
|
260
|
|
261 seqs = ""
|
|
262
|
|
263 if type_ == "fasta":
|
|
264 with open(data, "rU") as handle:
|
|
265 for record in SeqIO.parse(handle, type_):
|
|
266 seqs = seqs + str(record.seq)
|
|
267
|
|
268 elif type_ == "genbank":
|
|
269 with open(data, "rU") as input_handle:
|
|
270 types = []
|
|
271 for record in SeqIO.parse(input_handle, "genbank"):
|
|
272 for feature in record.features:
|
|
273 types.append(feature.type)
|
|
274 if feature.type == "CDS":
|
|
275 if feature.location.strand == +1:
|
|
276 seq = record.seq[feature.location.start:feature.location.end]
|
|
277 seqs = seqs + str(seq)
|
|
278 elif feature.location.strand == -1:
|
|
279 seq = record.seq[feature.location.start:
|
|
280 feature.location.end].reverse_complement()
|
|
281 seqs = seqs + str(seq)
|
|
282 return seqs
|
|
283
|
|
284
|
|
285 def synonims_(table_name="Bacterial"):
|
|
286
|
|
287 b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table
|
|
288
|
|
289 print(b_cod_table)
|
|
290
|
|
291 for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons:
|
|
292 b_cod_table[cod] = "_Stop"
|
|
293
|
|
294 for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons:
|
|
295 b_cod_table[cod] = "_Start"
|
|
296
|
|
297 #pprint(b_cod_table)
|
|
298 codons = {}
|
|
299
|
|
300 aas = set(b_cod_table.values())
|
|
301
|
|
302 for aa in aas:
|
|
303 codons[aa] = []
|
|
304 for codon in b_cod_table.keys():
|
|
305 if b_cod_table[codon] == aa:
|
|
306 codons[aa].append(codon)
|
|
307
|
|
308 #break
|
|
309
|
|
310 synonims = {}
|
|
311
|
|
312 for el1 in codons.keys():
|
|
313 print(el1)
|
|
314 for el2 in codons[el1]:
|
|
315 print(el2)
|
|
316 synonims[el2] = codons[el1]
|
|
317 #synonims[el2] = []
|
|
318 #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}):
|
|
319 # print(el3)
|
|
320 # synonims[el2].append(el3)
|
|
321 #break
|
|
322 #break
|
|
323 #break
|
|
324
|
|
325
|
|
326 anti_codons = {}
|
|
327
|
|
328 for codon in synonims.keys():
|
|
329 tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna)
|
|
330 tmp_anticodon = str(tmp_codon.reverse_complement())
|
|
331
|
|
332 anti_codons[tmp_anticodon] = []
|
|
333
|
|
334 for synonim in synonims[codon]:
|
|
335 tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna)
|
|
336 tmp_antisynonim = str(tmp_synonim.reverse_complement())
|
|
337 anti_codons[tmp_anticodon].append(tmp_antisynonim)
|
|
338
|
|
339 check = Bio.Seq.Seq("CTT")
|
|
340 anti_check = check.reverse_complement()
|
|
341 print("\nCheck:\n" + str(check))
|
|
342 print("\nCodons:\n")
|
|
343
|
|
344 for key in codons.keys():
|
|
345 if str(check) in codons[key]:
|
|
346 print(codons[key])
|
|
347
|
|
348 #pprint(codons)
|
|
349 print("\nSynonims:\n")
|
|
350 pprint(synonims[str(check)])
|
|
351 print("\nAnti_Codons:\n")
|
|
352 pprint(anti_codons[str(anti_check)])
|
|
353
|
|
354 #i = synonims.keys()
|
|
355 #right = True
|
|
356 #while len(i) > 0:
|
|
357 # tmp = i.pop()
|
|
358 # check = Bio.Seq.Seq(tmp)
|
|
359 # anti_check = check.reverse_complement()
|
|
360
|
|
361
|
|
362 return {"synonims":synonims, "anti_synonims":anti_codons}
|