| 41 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 __author__= "Gianmarco Piccinno" | 
|  | 4 __version__ = "1.0.0" | 
|  | 5 | 
|  | 6 import Bio | 
|  | 7 from Bio import SeqIO | 
|  | 8 from Bio.Seq import Seq | 
|  | 9 from Bio.Alphabet import IUPAC | 
|  | 10 from Bio.Data import IUPACData | 
|  | 11 from Bio.Data import CodonTable | 
|  | 12 import re | 
|  | 13 import sre_yield | 
|  | 14 | 
|  | 15 import re | 
|  | 16 import itertools | 
|  | 17 from functools import reduce | 
|  | 18 | 
|  | 19 import Bio | 
|  | 20 from Bio import Data | 
|  | 21 from Bio.Data import IUPACData | 
|  | 22 from Bio.Data import CodonTable | 
|  | 23 | 
|  | 24 from pprint import pprint | 
|  | 25 | 
|  | 26 import pandas as pd | 
|  | 27 | 
|  | 28 def _check_bases(seq_string): | 
|  | 29     """ | 
|  | 30     Check characters in a string (PRIVATE). | 
|  | 31     Remove digits and white space present in string. Allows any valid ambiguous | 
|  | 32     IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted). | 
|  | 33 | 
|  | 34     Other characters (e.g. symbols) trigger a TypeError. | 
|  | 35 | 
|  | 36     Returns the string WITH A LEADING SPACE (!). This is for backwards | 
|  | 37     compatibility, and may in part be explained by the fact that | 
|  | 38     Bio.Restriction doesn't use zero based counting. | 
|  | 39     """ | 
|  | 40     # Remove white space and make upper case: | 
|  | 41     seq_string = "".join(seq_string.split()).upper() | 
|  | 42     # Remove digits | 
|  | 43     for c in "0123456789": | 
|  | 44         seq_string = seq_string.replace(c, "") | 
|  | 45     # Check only allowed IUPAC letters | 
|  | 46     if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")): | 
|  | 47         raise TypeError("Invalid character found in %s" % repr(seq_string)) | 
|  | 48     return " " + seq_string | 
|  | 49 | 
|  | 50     matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN', | 
|  | 51                 'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY', | 
|  | 52                 'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY', | 
|  | 53                 'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY', | 
|  | 54                 'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY', | 
|  | 55                 'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'} | 
|  | 56 | 
|  | 57 class pattern(object): | 
|  | 58 | 
|  | 59 | 
|  | 60     def __init__(self, pattern_input): | 
|  | 61         s = str(pattern_input) | 
|  | 62         self.upper = s.isupper() | 
|  | 63         self.data = _check_bases(s) | 
|  | 64         self.pattern = s | 
|  | 65 | 
|  | 66     def plan_ambiguity(self): | 
|  | 67         val = Bio.Data.IUPACData.ambiguous_dna_values | 
|  | 68         re_pattern = "" | 
|  | 69         for el in self.pattern: | 
|  | 70             re_pattern = re_pattern + "[" + val[el] + "]" | 
|  | 71         return re_pattern | 
|  | 72 | 
|  | 73 class annotated_genome(object): | 
|  | 74 | 
|  | 75     def __init__(self, seq): | 
|  | 76         s = str(seq) | 
|  | 77         self.upper = s.isupper() | 
|  | 78         self.data = _check_bases(s) | 
|  | 79         self.seq = s | 
|  | 80 | 
|  | 81     def codon_usage(self, codonTable): | 
|  | 82 | 
|  | 83         codon_usage = {} | 
|  | 84         tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""] | 
|  | 85 | 
|  | 86         b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table | 
|  | 87         aas = set(b_cod_table.values()) | 
|  | 88 | 
|  | 89         for aa in aas: | 
|  | 90             codon_usage[aa] = {} | 
|  | 91             for codon in b_cod_table.keys(): | 
|  | 92                 if b_cod_table[codon] == aa: | 
|  | 93                     codon_usage[aa][codon] = tmp.count(codon) | 
|  | 94 | 
|  | 95         tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} | 
|  | 96 | 
|  | 97         codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) | 
|  | 98         codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) | 
|  | 99         codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) | 
|  | 100 | 
|  | 101         codon_usage_.reset_index(inplace=True) | 
|  | 102         codon_usage_.index = codon_usage_["Codon"] | 
|  | 103 | 
|  | 104         return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} | 
|  | 105 | 
|  | 106 class plasmid(object): | 
|  | 107     """ | 
|  | 108     This class represents a circular plasmid | 
|  | 109     """ | 
|  | 110 | 
|  | 111     def __init__(self, seq = "", circular=True, features = None): | 
|  | 112 | 
|  | 113         if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]: | 
|  | 114             s = str(seq.seq) | 
|  | 115             self.features = seq.features | 
|  | 116         else: | 
|  | 117             s = str(seq) | 
|  | 118             if features != None: | 
|  | 119                 self.features = features | 
|  | 120         self.upper = s.isupper() | 
|  | 121         #self.data = _check_bases(s) | 
|  | 122         self.data = s | 
|  | 123         self.circular = circular | 
|  | 124         self.Class = s.__class__ | 
|  | 125         self.size = len(s) | 
|  | 126         self.seq = self.data | 
|  | 127 | 
|  | 128     def __len__(self): | 
|  | 129         return len(self.data) | 
|  | 130 | 
|  | 131     def __repr__(self): | 
|  | 132         return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular)) | 
|  | 133 | 
|  | 134     def __eq__(self, other): | 
|  | 135         if isinstance(other, plasmid): | 
|  | 136             if repr(self) == repr(other): | 
|  | 137                 return True | 
|  | 138             else: | 
|  | 139                 return False | 
|  | 140         return False | 
|  | 141 | 
|  | 142     def __getitem__(self, i): | 
|  | 143         if self.upper: | 
|  | 144             return self.Class((self.data[i]).upper()) | 
|  | 145         return self.Class(self.data[i]) | 
|  | 146 | 
|  | 147     def sequence(self): | 
|  | 148         return self.data | 
|  | 149 | 
|  | 150 | 
|  | 151     def extract(self, feature): | 
|  | 152         return self.data[feature.location.start:feature.location.end]#[::feature.strand] | 
|  | 153 | 
|  | 154 | 
|  | 155     def findpattern(self, pattern, ambiguous_pattern): | 
|  | 156         """ | 
|  | 157         Return a list of a given pattern which occurs in the sequence. | 
|  | 158         The list is made of tuple (location, pattern.group). | 
|  | 159         The latter is used with non palindromic sites. | 
|  | 160         Pattern is the regular expression pattern corresponding to the | 
|  | 161         enzyme restriction site. | 
|  | 162         Size is the size of the restriction enzyme recognition-site size. | 
|  | 163         """ | 
|  | 164         if not self.circular: | 
|  | 165             data = self.data | 
|  | 166         else: | 
|  | 167             data = self.data + self.data[:len(ambiguous_pattern)] | 
|  | 168             print(data) | 
|  | 169             print(pattern) | 
|  | 170         return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)] | 
|  | 171 | 
|  | 172     def findpatterns(self, patterns, ambiguous_patterns): | 
|  | 173         """ | 
|  | 174         Return a list of a given pattern which occurs in the sequence. | 
|  | 175         The list is made of tuple (location, pattern.group). | 
|  | 176         The latter is used with non palindromic sites. | 
|  | 177         Pattern is the regular expression pattern corresponding to the | 
|  | 178         enzyme restriction site. | 
|  | 179         Size is the size of the restriction enzyme recognition-site size. | 
|  | 180         """ | 
|  | 181         patts = {} | 
|  | 182         for i in range(len(patterns)): | 
|  | 183             #print(patterns[i]) | 
|  | 184             #print(ambiguous_patterns[i]) | 
|  | 185             if not self.circular: | 
|  | 186                 data = self.data | 
|  | 187             else: | 
|  | 188                 data = self.data + self.data[:len(ambiguous_patterns[i])] | 
|  | 189                 #print(data) | 
|  | 190             patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)] | 
|  | 191 | 
|  | 192         return patts | 
|  | 193 | 
|  | 194 | 
|  | 195     def codon_switch(self, patterns, annotation, codon_usage): | 
|  | 196 | 
|  | 197         table = {} | 
|  | 198 | 
|  | 199         for pattern in patterns: | 
|  | 200             if pattern[1] >= annotation & pattern[2] <= annotation: | 
|  | 201                 # the pattern is in a coding sequence | 
|  | 202                 # do codon switch | 
|  | 203                 print("Do codon switch!") | 
|  | 204             else: | 
|  | 205                 next | 
|  | 206         return table | 
|  | 207 | 
|  | 208     def transversion(self, patterns): | 
|  | 209         return | 
|  | 210 | 
|  | 211 def read_patterns(input): | 
|  | 212 | 
|  | 213     patterns = [] | 
|  | 214 | 
|  | 215     with open(input, "rU") as handle: | 
|  | 216         for pat in handle.readlines(): | 
|  | 217 | 
|  | 218             tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna) | 
|  | 219 | 
|  | 220             patterns.append(tmp) | 
|  | 221             #patterns.append(tmp.reverse_complement()) | 
|  | 222 | 
|  | 223 | 
|  | 224     return patterns | 
|  | 225 | 
|  | 226 | 
|  | 227 def codon_usage(seqs, codonTable): | 
|  | 228 | 
|  | 229     codon_usage = {} | 
|  | 230     tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] | 
|  | 231 | 
|  | 232     b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table | 
|  | 233 | 
|  | 234     for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: | 
|  | 235         b_cod_table[cod] = "_Stop" | 
|  | 236 | 
|  | 237     for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: | 
|  | 238         #print(cod) | 
|  | 239         b_cod_table[cod] = b_cod_table[cod] | 
|  | 240 | 
|  | 241     aas = set(b_cod_table.values()) | 
|  | 242 | 
|  | 243     for aa in aas: | 
|  | 244         #print(aa) | 
|  | 245         #codon_usage[aa] = {} | 
|  | 246         for codon in b_cod_table.keys(): | 
|  | 247             if b_cod_table[codon] == aa: | 
|  | 248                 codon_usage[codon] = tmp.count(codon.split(" ")[0]) | 
|  | 249 | 
|  | 250     return codon_usage | 
|  | 251 | 
|  | 252 | 
|  | 253 def read_annotated_genome(data="example.fna", type_="fasta"): | 
|  | 254     """ | 
|  | 255     Accepted formats: | 
|  | 256         - fasta (multifasta) | 
|  | 257         - gbk | 
|  | 258 | 
|  | 259     """ | 
|  | 260 | 
|  | 261     seqs = "" | 
|  | 262 | 
|  | 263     if type_ == "fasta": | 
|  | 264         with open(data, "rU") as handle: | 
|  | 265             for record in SeqIO.parse(handle, type_): | 
|  | 266                 seqs = seqs + str(record.seq) | 
|  | 267 | 
|  | 268     elif type_ == "genbank": | 
|  | 269         with open(data, "rU") as input_handle: | 
|  | 270             types = [] | 
|  | 271             for record in SeqIO.parse(input_handle, "genbank"): | 
|  | 272                 for feature in record.features: | 
|  | 273                     types.append(feature.type) | 
|  | 274                     if feature.type == "CDS": | 
|  | 275                         if feature.location.strand == +1: | 
|  | 276                             seq = record.seq[feature.location.start:feature.location.end] | 
|  | 277                             seqs = seqs + str(seq) | 
|  | 278                         elif feature.location.strand == -1: | 
|  | 279                             seq = record.seq[feature.location.start: | 
|  | 280                                              feature.location.end].reverse_complement() | 
|  | 281                             seqs = seqs + str(seq) | 
|  | 282     return seqs | 
|  | 283 | 
|  | 284 | 
|  | 285 def synonims_(table_name="Bacterial"): | 
|  | 286 | 
|  | 287     b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table | 
|  | 288 | 
|  | 289     print(b_cod_table) | 
|  | 290 | 
|  | 291     for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons: | 
|  | 292         b_cod_table[cod] = "_Stop" | 
|  | 293 | 
|  | 294     for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons: | 
|  | 295         b_cod_table[cod] = "_Start" | 
|  | 296 | 
|  | 297     #pprint(b_cod_table) | 
|  | 298     codons = {} | 
|  | 299 | 
|  | 300     aas = set(b_cod_table.values()) | 
|  | 301 | 
|  | 302     for aa in aas: | 
|  | 303         codons[aa] = [] | 
|  | 304         for codon in b_cod_table.keys(): | 
|  | 305             if b_cod_table[codon] == aa: | 
|  | 306                 codons[aa].append(codon) | 
|  | 307 | 
|  | 308         #break | 
|  | 309 | 
|  | 310     synonims = {} | 
|  | 311 | 
|  | 312     for el1 in codons.keys(): | 
|  | 313         print(el1) | 
|  | 314         for el2 in codons[el1]: | 
|  | 315             print(el2) | 
|  | 316             synonims[el2] = codons[el1] | 
|  | 317             #synonims[el2] = [] | 
|  | 318             #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}): | 
|  | 319             #    print(el3) | 
|  | 320             #    synonims[el2].append(el3) | 
|  | 321                 #break | 
|  | 322             #break | 
|  | 323         #break | 
|  | 324 | 
|  | 325 | 
|  | 326     anti_codons = {} | 
|  | 327 | 
|  | 328     for codon in synonims.keys(): | 
|  | 329             tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna) | 
|  | 330             tmp_anticodon = str(tmp_codon.reverse_complement()) | 
|  | 331 | 
|  | 332             anti_codons[tmp_anticodon] = [] | 
|  | 333 | 
|  | 334             for synonim in synonims[codon]: | 
|  | 335                     tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna) | 
|  | 336                     tmp_antisynonim = str(tmp_synonim.reverse_complement()) | 
|  | 337                     anti_codons[tmp_anticodon].append(tmp_antisynonim) | 
|  | 338 | 
|  | 339     check = Bio.Seq.Seq("CTT") | 
|  | 340     anti_check = check.reverse_complement() | 
|  | 341     print("\nCheck:\n" + str(check)) | 
|  | 342     print("\nCodons:\n") | 
|  | 343 | 
|  | 344     for key in codons.keys(): | 
|  | 345         if str(check) in codons[key]: | 
|  | 346             print(codons[key]) | 
|  | 347 | 
|  | 348     #pprint(codons) | 
|  | 349     print("\nSynonims:\n") | 
|  | 350     pprint(synonims[str(check)]) | 
|  | 351     print("\nAnti_Codons:\n") | 
|  | 352     pprint(anti_codons[str(anti_check)]) | 
|  | 353 | 
|  | 354     #i = synonims.keys() | 
|  | 355     #right = True | 
|  | 356     #while len(i) > 0: | 
|  | 357     #    tmp = i.pop() | 
|  | 358     #    check = Bio.Seq.Seq(tmp) | 
|  | 359     #    anti_check = check.reverse_complement() | 
|  | 360 | 
|  | 361 | 
|  | 362     return {"synonims":synonims, "anti_synonims":anti_codons} |