annotate CodonSwitchTool/syngenic.py @ 2:aad5e435e4dc draft default tip

Uploaded
author gianmarco_piccinno
date Tue, 21 May 2019 05:24:56 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
1 #!/usr/bin/env python
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
2
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
3 __author__= "Gianmarco Piccinno"
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
4 __version__ = "1.0.0"
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
5
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
6 import Bio
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
7 from Bio import SeqIO
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
8 from Bio.Seq import Seq
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
9 from Bio.Alphabet import IUPAC
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
10 from Bio.Data import IUPACData
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
11 from Bio.Data import CodonTable
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
12 import re
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
13 import sre_yield
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
14
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
15 import re
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
16 import itertools
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
17 from functools import reduce
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
18
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
19 import Bio
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
20 from Bio import Data
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
21 from Bio.Data import IUPACData
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
22 from Bio.Data import CodonTable
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
23
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
24 from pprint import pprint
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
25
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
26 import pandas as pd
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
27
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
28 def _check_bases(seq_string):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
29 """
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
30 Check characters in a string (PRIVATE).
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
31 Remove digits and white space present in string. Allows any valid ambiguous
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
32 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted).
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
33
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
34 Other characters (e.g. symbols) trigger a TypeError.
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
35
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
36 Returns the string WITH A LEADING SPACE (!). This is for backwards
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
37 compatibility, and may in part be explained by the fact that
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
38 Bio.Restriction doesn't use zero based counting.
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
39 """
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
40 # Remove white space and make upper case:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
41 seq_string = "".join(seq_string.split()).upper()
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
42 # Remove digits
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
43 for c in "0123456789":
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
44 seq_string = seq_string.replace(c, "")
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
45 # Check only allowed IUPAC letters
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
46 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
47 raise TypeError("Invalid character found in %s" % repr(seq_string))
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
48 return " " + seq_string
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
49
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
50 matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN',
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
51 'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY',
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
52 'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY',
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
53 'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY',
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
54 'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY',
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
55 'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'}
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
56
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
57 class pattern(object):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
58
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
59
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
60 def __init__(self, pattern_input):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
61 s = str(pattern_input)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
62 self.upper = s.isupper()
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
63 self.data = _check_bases(s)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
64 self.pattern = s
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
65
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
66 def plan_ambiguity(self):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
67 val = Bio.Data.IUPACData.ambiguous_dna_values
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
68 re_pattern = ""
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
69 for el in self.pattern:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
70 re_pattern = re_pattern + "[" + val[el] + "]"
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
71 return re_pattern
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
72
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
73 class annotated_genome(object):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
74
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
75 def __init__(self, seq):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
76 s = str(seq)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
77 self.upper = s.isupper()
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
78 self.data = _check_bases(s)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
79 self.seq = s
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
80
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
81 def codon_usage(self, codonTable):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
82
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
83 codon_usage = {}
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
84 tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""]
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
85
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
86 b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
87 aas = set(b_cod_table.values())
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
88
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
89 for aa in aas:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
90 codon_usage[aa] = {}
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
91 for codon in b_cod_table.keys():
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
92 if b_cod_table[codon] == aa:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
93 codon_usage[aa][codon] = tmp.count(codon)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
94
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
95 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
96
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
97 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
98 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
99 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
100
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
101 codon_usage_.reset_index(inplace=True)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
102 codon_usage_.index = codon_usage_["Codon"]
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
103
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
104 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
105
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
106 class plasmid(object):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
107 """
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
108 This class represents a circular plasmid
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
109 """
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
110
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
111 def __init__(self, seq = "", circular=True, features = None):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
112
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
113 if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
114 s = str(seq.seq)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
115 self.features = seq.features
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
116 else:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
117 s = str(seq)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
118 if features != None:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
119 self.features = features
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
120 self.upper = s.isupper()
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
121 #self.data = _check_bases(s)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
122 self.data = s
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
123 self.circular = circular
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
124 self.Class = s.__class__
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
125 self.size = len(s)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
126 self.seq = self.data
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
127
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
128 def __len__(self):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
129 return len(self.data)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
130
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
131 def __repr__(self):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
132 return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular))
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
133
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
134 def __eq__(self, other):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
135 if isinstance(other, plasmid):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
136 if repr(self) == repr(other):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
137 return True
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
138 else:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
139 return False
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
140 return False
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
141
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
142 def __getitem__(self, i):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
143 if self.upper:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
144 return self.Class((self.data[i]).upper())
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
145 return self.Class(self.data[i])
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
146
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
147 def sequence(self):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
148 return self.data
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
149
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
150
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
151 def extract(self, feature):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
152 return self.data[feature.location.start:feature.location.end]#[::feature.strand]
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
153
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
154
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
155 def findpattern(self, pattern, ambiguous_pattern):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
156 """
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
157 Return a list of a given pattern which occurs in the sequence.
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
158 The list is made of tuple (location, pattern.group).
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
159 The latter is used with non palindromic sites.
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
160 Pattern is the regular expression pattern corresponding to the
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
161 enzyme restriction site.
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
162 Size is the size of the restriction enzyme recognition-site size.
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
163 """
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
164 if not self.circular:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
165 data = self.data
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
166 else:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
167 data = self.data + self.data[:len(ambiguous_pattern)]
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
168 print(data)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
169 print(pattern)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
170 return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)]
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
171
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
172 def findpatterns(self, patterns, ambiguous_patterns):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
173 """
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
174 Return a list of a given pattern which occurs in the sequence.
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
175 The list is made of tuple (location, pattern.group).
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
176 The latter is used with non palindromic sites.
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
177 Pattern is the regular expression pattern corresponding to the
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
178 enzyme restriction site.
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
179 Size is the size of the restriction enzyme recognition-site size.
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
180 """
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
181 patts = {}
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
182 for i in range(len(patterns)):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
183 #print(patterns[i])
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
184 #print(ambiguous_patterns[i])
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
185 if not self.circular:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
186 data = self.data
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
187 else:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
188 data = self.data + self.data[:len(ambiguous_patterns[i])]
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
189 #print(data)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
190 patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)]
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
191
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
192 return patts
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
193
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
194
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
195 def codon_switch(self, patterns, annotation, codon_usage):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
196
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
197 table = {}
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
198
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
199 for pattern in patterns:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
200 if pattern[1] >= annotation & pattern[2] <= annotation:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
201 # the pattern is in a coding sequence
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
202 # do codon switch
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
203 print("Do codon switch!")
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
204 else:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
205 next
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
206 return table
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
207
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
208 def transversion(self, patterns):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
209 return
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
210
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
211 def read_patterns(input):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
212
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
213 patterns = []
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
214
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
215 with open(input, "rU") as handle:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
216 for pat in handle.readlines():
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
217
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
218 tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
219
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
220 patterns.append(tmp)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
221 #patterns.append(tmp.reverse_complement())
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
222
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
223
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
224 return patterns
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
225
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
226
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
227 def codon_usage(seqs, codonTable):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
228
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
229 codon_usage = {}
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
230 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
231
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
232 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
233
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
234 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
235 b_cod_table[cod] = "_Stop"
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
236
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
237 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
238 #print(cod)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
239 b_cod_table[cod] = b_cod_table[cod]
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
240
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
241 aas = set(b_cod_table.values())
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
242
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
243 for aa in aas:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
244 #print(aa)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
245 #codon_usage[aa] = {}
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
246 for codon in b_cod_table.keys():
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
247 if b_cod_table[codon] == aa:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
248 codon_usage[codon] = tmp.count(codon.split(" ")[0])
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
249
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
250 return codon_usage
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
251
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
252
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
253 def read_annotated_genome(data="example.fna", type_="fasta"):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
254 """
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
255 Accepted formats:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
256 - fasta (multifasta)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
257 - gbk
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
258
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
259 """
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
260
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
261 seqs = ""
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
262
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
263 if type_ == "fasta":
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
264 with open(data, "rU") as handle:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
265 for record in SeqIO.parse(handle, type_):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
266 seqs = seqs + str(record.seq)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
267
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
268 elif type_ == "genbank":
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
269 with open(data, "rU") as input_handle:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
270 types = []
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
271 for record in SeqIO.parse(input_handle, "genbank"):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
272 for feature in record.features:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
273 types.append(feature.type)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
274 if feature.type == "CDS":
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
275 if feature.location.strand == +1:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
276 seq = record.seq[feature.location.start:feature.location.end]
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
277 seqs = seqs + str(seq)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
278 elif feature.location.strand == -1:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
279 seq = record.seq[feature.location.start:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
280 feature.location.end].reverse_complement()
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
281 seqs = seqs + str(seq)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
282 return seqs
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
283
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
284
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
285 def synonims_(table_name="Bacterial"):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
286
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
287 b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
288
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
289 print(b_cod_table)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
290
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
291 for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
292 b_cod_table[cod] = "_Stop"
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
293
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
294 for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
295 b_cod_table[cod] = "_Start"
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
296
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
297 #pprint(b_cod_table)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
298 codons = {}
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
299
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
300 aas = set(b_cod_table.values())
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
301
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
302 for aa in aas:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
303 codons[aa] = []
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
304 for codon in b_cod_table.keys():
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
305 if b_cod_table[codon] == aa:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
306 codons[aa].append(codon)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
307
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
308 #break
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
309
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
310 synonims = {}
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
311
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
312 for el1 in codons.keys():
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
313 print(el1)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
314 for el2 in codons[el1]:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
315 print(el2)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
316 synonims[el2] = codons[el1]
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
317 #synonims[el2] = []
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
318 #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}):
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
319 # print(el3)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
320 # synonims[el2].append(el3)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
321 #break
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
322 #break
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
323 #break
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
324
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
325
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
326 anti_codons = {}
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
327
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
328 for codon in synonims.keys():
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
329 tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
330 tmp_anticodon = str(tmp_codon.reverse_complement())
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
331
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
332 anti_codons[tmp_anticodon] = []
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
333
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
334 for synonim in synonims[codon]:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
335 tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
336 tmp_antisynonim = str(tmp_synonim.reverse_complement())
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
337 anti_codons[tmp_anticodon].append(tmp_antisynonim)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
338
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
339 check = Bio.Seq.Seq("CTT")
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
340 anti_check = check.reverse_complement()
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
341 print("\nCheck:\n" + str(check))
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
342 print("\nCodons:\n")
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
343
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
344 for key in codons.keys():
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
345 if str(check) in codons[key]:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
346 print(codons[key])
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
347
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
348 #pprint(codons)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
349 print("\nSynonims:\n")
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
350 pprint(synonims[str(check)])
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
351 print("\nAnti_Codons:\n")
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
352 pprint(anti_codons[str(anti_check)])
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
353
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
354 #i = synonims.keys()
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
355 #right = True
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
356 #while len(i) > 0:
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
357 # tmp = i.pop()
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
358 # check = Bio.Seq.Seq(tmp)
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
359 # anti_check = check.reverse_complement()
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
360
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
361
aad5e435e4dc Uploaded
gianmarco_piccinno
parents:
diff changeset
362 return {"synonims":synonims, "anti_synonims":anti_codons}