Mercurial > repos > gianmarco_piccinno > project_rm
comparison CodonSwitchTool/syngenic.py @ 44:5a6d2678217e draft
Uploaded
| author | gianmarco_piccinno |
|---|---|
| date | Mon, 20 May 2019 17:08:56 -0400 |
| parents | bd35b13fabfb |
| children |
comparison
equal
deleted
inserted
replaced
| 43:41dec4e44c35 | 44:5a6d2678217e |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 __author__= "Gianmarco Piccinno" | |
| 4 __version__ = "1.0.0" | |
| 5 | |
| 6 import Bio | |
| 7 from Bio import SeqIO | |
| 8 from Bio.Seq import Seq | |
| 9 from Bio.Alphabet import IUPAC | |
| 10 from Bio.Data import IUPACData | |
| 11 from Bio.Data import CodonTable | |
| 12 import re | |
| 13 import sre_yield | |
| 14 | |
| 15 import re | |
| 16 import itertools | |
| 17 from functools import reduce | |
| 18 | |
| 19 import Bio | |
| 20 from Bio import Data | |
| 21 from Bio.Data import IUPACData | |
| 22 from Bio.Data import CodonTable | |
| 23 | |
| 24 from pprint import pprint | |
| 25 | |
| 26 import pandas as pd | |
| 27 | |
| 28 def _check_bases(seq_string): | |
| 29 """ | |
| 30 Check characters in a string (PRIVATE). | |
| 31 Remove digits and white space present in string. Allows any valid ambiguous | |
| 32 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted). | |
| 33 | |
| 34 Other characters (e.g. symbols) trigger a TypeError. | |
| 35 | |
| 36 Returns the string WITH A LEADING SPACE (!). This is for backwards | |
| 37 compatibility, and may in part be explained by the fact that | |
| 38 Bio.Restriction doesn't use zero based counting. | |
| 39 """ | |
| 40 # Remove white space and make upper case: | |
| 41 seq_string = "".join(seq_string.split()).upper() | |
| 42 # Remove digits | |
| 43 for c in "0123456789": | |
| 44 seq_string = seq_string.replace(c, "") | |
| 45 # Check only allowed IUPAC letters | |
| 46 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")): | |
| 47 raise TypeError("Invalid character found in %s" % repr(seq_string)) | |
| 48 return " " + seq_string | |
| 49 | |
| 50 matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN', | |
| 51 'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY', | |
| 52 'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY', | |
| 53 'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY', | |
| 54 'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY', | |
| 55 'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'} | |
| 56 | |
| 57 class pattern(object): | |
| 58 | |
| 59 | |
| 60 def __init__(self, pattern_input): | |
| 61 s = str(pattern_input) | |
| 62 self.upper = s.isupper() | |
| 63 self.data = _check_bases(s) | |
| 64 self.pattern = s | |
| 65 | |
| 66 def plan_ambiguity(self): | |
| 67 val = Bio.Data.IUPACData.ambiguous_dna_values | |
| 68 re_pattern = "" | |
| 69 for el in self.pattern: | |
| 70 re_pattern = re_pattern + "[" + val[el] + "]" | |
| 71 return re_pattern | |
| 72 | |
| 73 class annotated_genome(object): | |
| 74 | |
| 75 def __init__(self, seq): | |
| 76 s = str(seq) | |
| 77 self.upper = s.isupper() | |
| 78 self.data = _check_bases(s) | |
| 79 self.seq = s | |
| 80 | |
| 81 def codon_usage(self, codonTable): | |
| 82 | |
| 83 codon_usage = {} | |
| 84 tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""] | |
| 85 | |
| 86 b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table | |
| 87 aas = set(b_cod_table.values()) | |
| 88 | |
| 89 for aa in aas: | |
| 90 codon_usage[aa] = {} | |
| 91 for codon in b_cod_table.keys(): | |
| 92 if b_cod_table[codon] == aa: | |
| 93 codon_usage[aa][codon] = tmp.count(codon) | |
| 94 | |
| 95 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} | |
| 96 | |
| 97 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) | |
| 98 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) | |
| 99 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) | |
| 100 | |
| 101 codon_usage_.reset_index(inplace=True) | |
| 102 codon_usage_.index = codon_usage_["Codon"] | |
| 103 | |
| 104 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} | |
| 105 | |
| 106 class plasmid(object): | |
| 107 """ | |
| 108 This class represents a circular plasmid | |
| 109 """ | |
| 110 | |
| 111 def __init__(self, seq = "", circular=True, features = None): | |
| 112 | |
| 113 if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]: | |
| 114 s = str(seq.seq) | |
| 115 self.features = seq.features | |
| 116 else: | |
| 117 s = str(seq) | |
| 118 if features != None: | |
| 119 self.features = features | |
| 120 self.upper = s.isupper() | |
| 121 #self.data = _check_bases(s) | |
| 122 self.data = s | |
| 123 self.circular = circular | |
| 124 self.Class = s.__class__ | |
| 125 self.size = len(s) | |
| 126 self.seq = self.data | |
| 127 | |
| 128 def __len__(self): | |
| 129 return len(self.data) | |
| 130 | |
| 131 def __repr__(self): | |
| 132 return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular)) | |
| 133 | |
| 134 def __eq__(self, other): | |
| 135 if isinstance(other, plasmid): | |
| 136 if repr(self) == repr(other): | |
| 137 return True | |
| 138 else: | |
| 139 return False | |
| 140 return False | |
| 141 | |
| 142 def __getitem__(self, i): | |
| 143 if self.upper: | |
| 144 return self.Class((self.data[i]).upper()) | |
| 145 return self.Class(self.data[i]) | |
| 146 | |
| 147 def sequence(self): | |
| 148 return self.data | |
| 149 | |
| 150 | |
| 151 def extract(self, feature): | |
| 152 return self.data[feature.location.start:feature.location.end]#[::feature.strand] | |
| 153 | |
| 154 | |
| 155 def findpattern(self, pattern, ambiguous_pattern): | |
| 156 """ | |
| 157 Return a list of a given pattern which occurs in the sequence. | |
| 158 The list is made of tuple (location, pattern.group). | |
| 159 The latter is used with non palindromic sites. | |
| 160 Pattern is the regular expression pattern corresponding to the | |
| 161 enzyme restriction site. | |
| 162 Size is the size of the restriction enzyme recognition-site size. | |
| 163 """ | |
| 164 if not self.circular: | |
| 165 data = self.data | |
| 166 else: | |
| 167 data = self.data + self.data[:len(ambiguous_pattern)] | |
| 168 print(data) | |
| 169 print(pattern) | |
| 170 return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)] | |
| 171 | |
| 172 def findpatterns(self, patterns, ambiguous_patterns): | |
| 173 """ | |
| 174 Return a list of a given pattern which occurs in the sequence. | |
| 175 The list is made of tuple (location, pattern.group). | |
| 176 The latter is used with non palindromic sites. | |
| 177 Pattern is the regular expression pattern corresponding to the | |
| 178 enzyme restriction site. | |
| 179 Size is the size of the restriction enzyme recognition-site size. | |
| 180 """ | |
| 181 patts = {} | |
| 182 for i in range(len(patterns)): | |
| 183 #print(patterns[i]) | |
| 184 #print(ambiguous_patterns[i]) | |
| 185 if not self.circular: | |
| 186 data = self.data | |
| 187 else: | |
| 188 data = self.data + self.data[:len(ambiguous_patterns[i])] | |
| 189 #print(data) | |
| 190 patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)] | |
| 191 | |
| 192 return patts | |
| 193 | |
| 194 | |
| 195 def codon_switch(self, patterns, annotation, codon_usage): | |
| 196 | |
| 197 table = {} | |
| 198 | |
| 199 for pattern in patterns: | |
| 200 if pattern[1] >= annotation & pattern[2] <= annotation: | |
| 201 # the pattern is in a coding sequence | |
| 202 # do codon switch | |
| 203 print("Do codon switch!") | |
| 204 else: | |
| 205 next | |
| 206 return table | |
| 207 | |
| 208 def transversion(self, patterns): | |
| 209 return | |
| 210 | |
| 211 def read_patterns(input): | |
| 212 | |
| 213 patterns = [] | |
| 214 | |
| 215 with open(input, "rU") as handle: | |
| 216 for pat in handle.readlines(): | |
| 217 | |
| 218 tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna) | |
| 219 | |
| 220 patterns.append(tmp) | |
| 221 #patterns.append(tmp.reverse_complement()) | |
| 222 | |
| 223 | |
| 224 return patterns | |
| 225 | |
| 226 | |
| 227 def codon_usage(seqs, codonTable): | |
| 228 | |
| 229 codon_usage = {} | |
| 230 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] | |
| 231 | |
| 232 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table | |
| 233 | |
| 234 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: | |
| 235 b_cod_table[cod] = "_Stop" | |
| 236 | |
| 237 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: | |
| 238 #print(cod) | |
| 239 b_cod_table[cod] = b_cod_table[cod] | |
| 240 | |
| 241 aas = set(b_cod_table.values()) | |
| 242 | |
| 243 for aa in aas: | |
| 244 #print(aa) | |
| 245 #codon_usage[aa] = {} | |
| 246 for codon in b_cod_table.keys(): | |
| 247 if b_cod_table[codon] == aa: | |
| 248 codon_usage[codon] = tmp.count(codon.split(" ")[0]) | |
| 249 | |
| 250 return codon_usage | |
| 251 | |
| 252 | |
| 253 def read_annotated_genome(data="example.fna", type_="fasta"): | |
| 254 """ | |
| 255 Accepted formats: | |
| 256 - fasta (multifasta) | |
| 257 - gbk | |
| 258 | |
| 259 """ | |
| 260 | |
| 261 seqs = "" | |
| 262 | |
| 263 if type_ == "fasta": | |
| 264 with open(data, "rU") as handle: | |
| 265 for record in SeqIO.parse(handle, type_): | |
| 266 seqs = seqs + str(record.seq) | |
| 267 | |
| 268 elif type_ == "genbank": | |
| 269 with open(data, "rU") as input_handle: | |
| 270 types = [] | |
| 271 for record in SeqIO.parse(input_handle, "genbank"): | |
| 272 for feature in record.features: | |
| 273 types.append(feature.type) | |
| 274 if feature.type == "CDS": | |
| 275 if feature.location.strand == +1: | |
| 276 seq = record.seq[feature.location.start:feature.location.end] | |
| 277 seqs = seqs + str(seq) | |
| 278 elif feature.location.strand == -1: | |
| 279 seq = record.seq[feature.location.start: | |
| 280 feature.location.end].reverse_complement() | |
| 281 seqs = seqs + str(seq) | |
| 282 return seqs | |
| 283 | |
| 284 | |
| 285 def synonims_(table_name="Bacterial"): | |
| 286 | |
| 287 b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table | |
| 288 | |
| 289 print(b_cod_table) | |
| 290 | |
| 291 for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons: | |
| 292 b_cod_table[cod] = "_Stop" | |
| 293 | |
| 294 for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons: | |
| 295 b_cod_table[cod] = "_Start" | |
| 296 | |
| 297 #pprint(b_cod_table) | |
| 298 codons = {} | |
| 299 | |
| 300 aas = set(b_cod_table.values()) | |
| 301 | |
| 302 for aa in aas: | |
| 303 codons[aa] = [] | |
| 304 for codon in b_cod_table.keys(): | |
| 305 if b_cod_table[codon] == aa: | |
| 306 codons[aa].append(codon) | |
| 307 | |
| 308 #break | |
| 309 | |
| 310 synonims = {} | |
| 311 | |
| 312 for el1 in codons.keys(): | |
| 313 print(el1) | |
| 314 for el2 in codons[el1]: | |
| 315 print(el2) | |
| 316 synonims[el2] = codons[el1] | |
| 317 #synonims[el2] = [] | |
| 318 #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}): | |
| 319 # print(el3) | |
| 320 # synonims[el2].append(el3) | |
| 321 #break | |
| 322 #break | |
| 323 #break | |
| 324 | |
| 325 | |
| 326 anti_codons = {} | |
| 327 | |
| 328 for codon in synonims.keys(): | |
| 329 tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna) | |
| 330 tmp_anticodon = str(tmp_codon.reverse_complement()) | |
| 331 | |
| 332 anti_codons[tmp_anticodon] = [] | |
| 333 | |
| 334 for synonim in synonims[codon]: | |
| 335 tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna) | |
| 336 tmp_antisynonim = str(tmp_synonim.reverse_complement()) | |
| 337 anti_codons[tmp_anticodon].append(tmp_antisynonim) | |
| 338 | |
| 339 check = Bio.Seq.Seq("CTT") | |
| 340 anti_check = check.reverse_complement() | |
| 341 print("\nCheck:\n" + str(check)) | |
| 342 print("\nCodons:\n") | |
| 343 | |
| 344 for key in codons.keys(): | |
| 345 if str(check) in codons[key]: | |
| 346 print(codons[key]) | |
| 347 | |
| 348 #pprint(codons) | |
| 349 print("\nSynonims:\n") | |
| 350 pprint(synonims[str(check)]) | |
| 351 print("\nAnti_Codons:\n") | |
| 352 pprint(anti_codons[str(anti_check)]) | |
| 353 | |
| 354 #i = synonims.keys() | |
| 355 #right = True | |
| 356 #while len(i) > 0: | |
| 357 # tmp = i.pop() | |
| 358 # check = Bio.Seq.Seq(tmp) | |
| 359 # anti_check = check.reverse_complement() | |
| 360 | |
| 361 | |
| 362 return {"synonims":synonims, "anti_synonims":anti_codons} |
