| 
41
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 __author__= "Gianmarco Piccinno"
 | 
| 
 | 
     4 __version__ = "1.0.0"
 | 
| 
 | 
     5 
 | 
| 
 | 
     6 import Bio
 | 
| 
 | 
     7 from Bio import SeqIO
 | 
| 
 | 
     8 from Bio.Seq import Seq
 | 
| 
 | 
     9 from Bio.Alphabet import IUPAC
 | 
| 
 | 
    10 from Bio.Data import IUPACData
 | 
| 
 | 
    11 from Bio.Data import CodonTable
 | 
| 
 | 
    12 import re
 | 
| 
 | 
    13 import sre_yield
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 import re
 | 
| 
 | 
    16 import itertools
 | 
| 
 | 
    17 from functools import reduce
 | 
| 
 | 
    18 
 | 
| 
 | 
    19 import Bio
 | 
| 
 | 
    20 from Bio import Data
 | 
| 
 | 
    21 from Bio.Data import IUPACData
 | 
| 
 | 
    22 from Bio.Data import CodonTable
 | 
| 
 | 
    23 
 | 
| 
 | 
    24 from pprint import pprint
 | 
| 
 | 
    25 
 | 
| 
 | 
    26 import pandas as pd
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 def _check_bases(seq_string):
 | 
| 
 | 
    29     """
 | 
| 
 | 
    30     Check characters in a string (PRIVATE).
 | 
| 
 | 
    31     Remove digits and white space present in string. Allows any valid ambiguous
 | 
| 
 | 
    32     IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted).
 | 
| 
 | 
    33 
 | 
| 
 | 
    34     Other characters (e.g. symbols) trigger a TypeError.
 | 
| 
 | 
    35 
 | 
| 
 | 
    36     Returns the string WITH A LEADING SPACE (!). This is for backwards
 | 
| 
 | 
    37     compatibility, and may in part be explained by the fact that
 | 
| 
 | 
    38     Bio.Restriction doesn't use zero based counting.
 | 
| 
 | 
    39     """
 | 
| 
 | 
    40     # Remove white space and make upper case:
 | 
| 
 | 
    41     seq_string = "".join(seq_string.split()).upper()
 | 
| 
 | 
    42     # Remove digits
 | 
| 
 | 
    43     for c in "0123456789":
 | 
| 
 | 
    44         seq_string = seq_string.replace(c, "")
 | 
| 
 | 
    45     # Check only allowed IUPAC letters
 | 
| 
 | 
    46     if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")):
 | 
| 
 | 
    47         raise TypeError("Invalid character found in %s" % repr(seq_string))
 | 
| 
 | 
    48     return " " + seq_string
 | 
| 
 | 
    49 
 | 
| 
 | 
    50     matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN',
 | 
| 
 | 
    51                 'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY',
 | 
| 
 | 
    52                 'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY',
 | 
| 
 | 
    53                 'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY',
 | 
| 
 | 
    54                 'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY',
 | 
| 
 | 
    55                 'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'}
 | 
| 
 | 
    56 
 | 
| 
 | 
    57 class pattern(object):
 | 
| 
 | 
    58 
 | 
| 
 | 
    59 
 | 
| 
 | 
    60     def __init__(self, pattern_input):
 | 
| 
 | 
    61         s = str(pattern_input)
 | 
| 
 | 
    62         self.upper = s.isupper()
 | 
| 
 | 
    63         self.data = _check_bases(s)
 | 
| 
 | 
    64         self.pattern = s
 | 
| 
 | 
    65 
 | 
| 
 | 
    66     def plan_ambiguity(self):
 | 
| 
 | 
    67         val = Bio.Data.IUPACData.ambiguous_dna_values
 | 
| 
 | 
    68         re_pattern = ""
 | 
| 
 | 
    69         for el in self.pattern:
 | 
| 
 | 
    70             re_pattern = re_pattern + "[" + val[el] + "]"
 | 
| 
 | 
    71         return re_pattern
 | 
| 
 | 
    72 
 | 
| 
 | 
    73 class annotated_genome(object):
 | 
| 
 | 
    74 
 | 
| 
 | 
    75     def __init__(self, seq):
 | 
| 
 | 
    76         s = str(seq)
 | 
| 
 | 
    77         self.upper = s.isupper()
 | 
| 
 | 
    78         self.data = _check_bases(s)
 | 
| 
 | 
    79         self.seq = s
 | 
| 
 | 
    80 
 | 
| 
 | 
    81     def codon_usage(self, codonTable):
 | 
| 
 | 
    82 
 | 
| 
 | 
    83         codon_usage = {}
 | 
| 
 | 
    84         tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""]
 | 
| 
 | 
    85 
 | 
| 
 | 
    86         b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table
 | 
| 
 | 
    87         aas = set(b_cod_table.values())
 | 
| 
 | 
    88 
 | 
| 
 | 
    89         for aa in aas:
 | 
| 
 | 
    90             codon_usage[aa] = {}
 | 
| 
 | 
    91             for codon in b_cod_table.keys():
 | 
| 
 | 
    92                 if b_cod_table[codon] == aa:
 | 
| 
 | 
    93                     codon_usage[aa][codon] = tmp.count(codon)
 | 
| 
 | 
    94 
 | 
| 
 | 
    95         tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
 | 
| 
 | 
    96 
 | 
| 
 | 
    97         codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
 | 
| 
 | 
    98         codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
 | 
| 
 | 
    99         codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
 | 
| 
 | 
   100 
 | 
| 
 | 
   101         codon_usage_.reset_index(inplace=True)
 | 
| 
 | 
   102         codon_usage_.index = codon_usage_["Codon"]
 | 
| 
 | 
   103 
 | 
| 
 | 
   104         return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
 | 
| 
 | 
   105 
 | 
| 
 | 
   106 class plasmid(object):
 | 
| 
 | 
   107     """
 | 
| 
 | 
   108     This class represents a circular plasmid
 | 
| 
 | 
   109     """
 | 
| 
 | 
   110 
 | 
| 
 | 
   111     def __init__(self, seq = "", circular=True, features = None):
 | 
| 
 | 
   112 
 | 
| 
 | 
   113         if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]:
 | 
| 
 | 
   114             s = str(seq.seq)
 | 
| 
 | 
   115             self.features = seq.features
 | 
| 
 | 
   116         else:
 | 
| 
 | 
   117             s = str(seq)
 | 
| 
 | 
   118             if features != None:
 | 
| 
 | 
   119                 self.features = features
 | 
| 
 | 
   120         self.upper = s.isupper()
 | 
| 
 | 
   121         #self.data = _check_bases(s)
 | 
| 
 | 
   122         self.data = s
 | 
| 
 | 
   123         self.circular = circular
 | 
| 
 | 
   124         self.Class = s.__class__
 | 
| 
 | 
   125         self.size = len(s)
 | 
| 
 | 
   126         self.seq = self.data
 | 
| 
 | 
   127 
 | 
| 
 | 
   128     def __len__(self):
 | 
| 
 | 
   129         return len(self.data)
 | 
| 
 | 
   130 
 | 
| 
 | 
   131     def __repr__(self):
 | 
| 
 | 
   132         return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular))
 | 
| 
 | 
   133 
 | 
| 
 | 
   134     def __eq__(self, other):
 | 
| 
 | 
   135         if isinstance(other, plasmid):
 | 
| 
 | 
   136             if repr(self) == repr(other):
 | 
| 
 | 
   137                 return True
 | 
| 
 | 
   138             else:
 | 
| 
 | 
   139                 return False
 | 
| 
 | 
   140         return False
 | 
| 
 | 
   141 
 | 
| 
 | 
   142     def __getitem__(self, i):
 | 
| 
 | 
   143         if self.upper:
 | 
| 
 | 
   144             return self.Class((self.data[i]).upper())
 | 
| 
 | 
   145         return self.Class(self.data[i])
 | 
| 
 | 
   146 
 | 
| 
 | 
   147     def sequence(self):
 | 
| 
 | 
   148         return self.data
 | 
| 
 | 
   149 
 | 
| 
 | 
   150 
 | 
| 
 | 
   151     def extract(self, feature):
 | 
| 
 | 
   152         return self.data[feature.location.start:feature.location.end]#[::feature.strand]
 | 
| 
 | 
   153 
 | 
| 
 | 
   154 
 | 
| 
 | 
   155     def findpattern(self, pattern, ambiguous_pattern):
 | 
| 
 | 
   156         """
 | 
| 
 | 
   157         Return a list of a given pattern which occurs in the sequence.
 | 
| 
 | 
   158         The list is made of tuple (location, pattern.group).
 | 
| 
 | 
   159         The latter is used with non palindromic sites.
 | 
| 
 | 
   160         Pattern is the regular expression pattern corresponding to the
 | 
| 
 | 
   161         enzyme restriction site.
 | 
| 
 | 
   162         Size is the size of the restriction enzyme recognition-site size.
 | 
| 
 | 
   163         """
 | 
| 
 | 
   164         if not self.circular:
 | 
| 
 | 
   165             data = self.data
 | 
| 
 | 
   166         else:
 | 
| 
 | 
   167             data = self.data + self.data[:len(ambiguous_pattern)]
 | 
| 
 | 
   168             print(data)
 | 
| 
 | 
   169             print(pattern)
 | 
| 
 | 
   170         return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)]
 | 
| 
 | 
   171 
 | 
| 
 | 
   172     def findpatterns(self, patterns, ambiguous_patterns):
 | 
| 
 | 
   173         """
 | 
| 
 | 
   174         Return a list of a given pattern which occurs in the sequence.
 | 
| 
 | 
   175         The list is made of tuple (location, pattern.group).
 | 
| 
 | 
   176         The latter is used with non palindromic sites.
 | 
| 
 | 
   177         Pattern is the regular expression pattern corresponding to the
 | 
| 
 | 
   178         enzyme restriction site.
 | 
| 
 | 
   179         Size is the size of the restriction enzyme recognition-site size.
 | 
| 
 | 
   180         """
 | 
| 
 | 
   181         patts = {}
 | 
| 
 | 
   182         for i in range(len(patterns)):
 | 
| 
 | 
   183             #print(patterns[i])
 | 
| 
 | 
   184             #print(ambiguous_patterns[i])
 | 
| 
 | 
   185             if not self.circular:
 | 
| 
 | 
   186                 data = self.data
 | 
| 
 | 
   187             else:
 | 
| 
 | 
   188                 data = self.data + self.data[:len(ambiguous_patterns[i])]
 | 
| 
 | 
   189                 #print(data)
 | 
| 
 | 
   190             patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)]
 | 
| 
 | 
   191 
 | 
| 
 | 
   192         return patts
 | 
| 
 | 
   193 
 | 
| 
 | 
   194 
 | 
| 
 | 
   195     def codon_switch(self, patterns, annotation, codon_usage):
 | 
| 
 | 
   196 
 | 
| 
 | 
   197         table = {}
 | 
| 
 | 
   198 
 | 
| 
 | 
   199         for pattern in patterns:
 | 
| 
 | 
   200             if pattern[1] >= annotation & pattern[2] <= annotation:
 | 
| 
 | 
   201                 # the pattern is in a coding sequence
 | 
| 
 | 
   202                 # do codon switch
 | 
| 
 | 
   203                 print("Do codon switch!")
 | 
| 
 | 
   204             else:
 | 
| 
 | 
   205                 next
 | 
| 
 | 
   206         return table
 | 
| 
 | 
   207 
 | 
| 
 | 
   208     def transversion(self, patterns):
 | 
| 
 | 
   209         return
 | 
| 
 | 
   210 
 | 
| 
 | 
   211 def read_patterns(input):
 | 
| 
 | 
   212 
 | 
| 
 | 
   213     patterns = []
 | 
| 
 | 
   214 
 | 
| 
 | 
   215     with open(input, "rU") as handle:
 | 
| 
 | 
   216         for pat in handle.readlines():
 | 
| 
 | 
   217 
 | 
| 
 | 
   218             tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna)
 | 
| 
 | 
   219 
 | 
| 
 | 
   220             patterns.append(tmp)
 | 
| 
 | 
   221             #patterns.append(tmp.reverse_complement())
 | 
| 
 | 
   222 
 | 
| 
 | 
   223 
 | 
| 
 | 
   224     return patterns
 | 
| 
 | 
   225 
 | 
| 
 | 
   226 
 | 
| 
 | 
   227 def codon_usage(seqs, codonTable):
 | 
| 
 | 
   228 
 | 
| 
 | 
   229     codon_usage = {}
 | 
| 
 | 
   230     tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
 | 
| 
 | 
   231 
 | 
| 
 | 
   232     b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
 | 
| 
 | 
   233 
 | 
| 
 | 
   234     for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
 | 
| 
 | 
   235         b_cod_table[cod] = "_Stop"
 | 
| 
 | 
   236 
 | 
| 
 | 
   237     for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
 | 
| 
 | 
   238         #print(cod)
 | 
| 
 | 
   239         b_cod_table[cod] = b_cod_table[cod]
 | 
| 
 | 
   240 
 | 
| 
 | 
   241     aas = set(b_cod_table.values())
 | 
| 
 | 
   242 
 | 
| 
 | 
   243     for aa in aas:
 | 
| 
 | 
   244         #print(aa)
 | 
| 
 | 
   245         #codon_usage[aa] = {}
 | 
| 
 | 
   246         for codon in b_cod_table.keys():
 | 
| 
 | 
   247             if b_cod_table[codon] == aa:
 | 
| 
 | 
   248                 codon_usage[codon] = tmp.count(codon.split(" ")[0])
 | 
| 
 | 
   249 
 | 
| 
 | 
   250     return codon_usage
 | 
| 
 | 
   251 
 | 
| 
 | 
   252 
 | 
| 
 | 
   253 def read_annotated_genome(data="example.fna", type_="fasta"):
 | 
| 
 | 
   254     """
 | 
| 
 | 
   255     Accepted formats:
 | 
| 
 | 
   256         - fasta (multifasta)
 | 
| 
 | 
   257         - gbk
 | 
| 
 | 
   258 
 | 
| 
 | 
   259     """
 | 
| 
 | 
   260 
 | 
| 
 | 
   261     seqs = ""
 | 
| 
 | 
   262 
 | 
| 
 | 
   263     if type_ == "fasta":
 | 
| 
 | 
   264         with open(data, "rU") as handle:
 | 
| 
 | 
   265             for record in SeqIO.parse(handle, type_):
 | 
| 
 | 
   266                 seqs = seqs + str(record.seq)
 | 
| 
 | 
   267 
 | 
| 
 | 
   268     elif type_ == "genbank":
 | 
| 
 | 
   269         with open(data, "rU") as input_handle:
 | 
| 
 | 
   270             types = []
 | 
| 
 | 
   271             for record in SeqIO.parse(input_handle, "genbank"):
 | 
| 
 | 
   272                 for feature in record.features:
 | 
| 
 | 
   273                     types.append(feature.type)
 | 
| 
 | 
   274                     if feature.type == "CDS":
 | 
| 
 | 
   275                         if feature.location.strand == +1:
 | 
| 
 | 
   276                             seq = record.seq[feature.location.start:feature.location.end]
 | 
| 
 | 
   277                             seqs = seqs + str(seq)
 | 
| 
 | 
   278                         elif feature.location.strand == -1:
 | 
| 
 | 
   279                             seq = record.seq[feature.location.start:
 | 
| 
 | 
   280                                              feature.location.end].reverse_complement()
 | 
| 
 | 
   281                             seqs = seqs + str(seq)
 | 
| 
 | 
   282     return seqs
 | 
| 
 | 
   283 
 | 
| 
 | 
   284 
 | 
| 
 | 
   285 def synonims_(table_name="Bacterial"):
 | 
| 
 | 
   286 
 | 
| 
 | 
   287     b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table
 | 
| 
 | 
   288 
 | 
| 
 | 
   289     print(b_cod_table)
 | 
| 
 | 
   290 
 | 
| 
 | 
   291     for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons:
 | 
| 
 | 
   292         b_cod_table[cod] = "_Stop"
 | 
| 
 | 
   293 
 | 
| 
 | 
   294     for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons:
 | 
| 
 | 
   295         b_cod_table[cod] = "_Start"
 | 
| 
 | 
   296 
 | 
| 
 | 
   297     #pprint(b_cod_table)
 | 
| 
 | 
   298     codons = {}
 | 
| 
 | 
   299 
 | 
| 
 | 
   300     aas = set(b_cod_table.values())
 | 
| 
 | 
   301 
 | 
| 
 | 
   302     for aa in aas:
 | 
| 
 | 
   303         codons[aa] = []
 | 
| 
 | 
   304         for codon in b_cod_table.keys():
 | 
| 
 | 
   305             if b_cod_table[codon] == aa:
 | 
| 
 | 
   306                 codons[aa].append(codon)
 | 
| 
 | 
   307 
 | 
| 
 | 
   308         #break
 | 
| 
 | 
   309 
 | 
| 
 | 
   310     synonims = {}
 | 
| 
 | 
   311 
 | 
| 
 | 
   312     for el1 in codons.keys():
 | 
| 
 | 
   313         print(el1)
 | 
| 
 | 
   314         for el2 in codons[el1]:
 | 
| 
 | 
   315             print(el2)
 | 
| 
 | 
   316             synonims[el2] = codons[el1]
 | 
| 
 | 
   317             #synonims[el2] = []
 | 
| 
 | 
   318             #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}):
 | 
| 
 | 
   319             #    print(el3)
 | 
| 
 | 
   320             #    synonims[el2].append(el3)
 | 
| 
 | 
   321                 #break
 | 
| 
 | 
   322             #break
 | 
| 
 | 
   323         #break
 | 
| 
 | 
   324 
 | 
| 
 | 
   325 
 | 
| 
 | 
   326     anti_codons = {}
 | 
| 
 | 
   327 
 | 
| 
 | 
   328     for codon in synonims.keys():
 | 
| 
 | 
   329             tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna)
 | 
| 
 | 
   330             tmp_anticodon = str(tmp_codon.reverse_complement())
 | 
| 
 | 
   331 
 | 
| 
 | 
   332             anti_codons[tmp_anticodon] = []
 | 
| 
 | 
   333 
 | 
| 
 | 
   334             for synonim in synonims[codon]:
 | 
| 
 | 
   335                     tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna)
 | 
| 
 | 
   336                     tmp_antisynonim = str(tmp_synonim.reverse_complement())
 | 
| 
 | 
   337                     anti_codons[tmp_anticodon].append(tmp_antisynonim)
 | 
| 
 | 
   338 
 | 
| 
 | 
   339     check = Bio.Seq.Seq("CTT")
 | 
| 
 | 
   340     anti_check = check.reverse_complement()
 | 
| 
 | 
   341     print("\nCheck:\n" + str(check))
 | 
| 
 | 
   342     print("\nCodons:\n")
 | 
| 
 | 
   343 
 | 
| 
 | 
   344     for key in codons.keys():
 | 
| 
 | 
   345         if str(check) in codons[key]:
 | 
| 
 | 
   346             print(codons[key])
 | 
| 
 | 
   347 
 | 
| 
 | 
   348     #pprint(codons)
 | 
| 
 | 
   349     print("\nSynonims:\n")
 | 
| 
 | 
   350     pprint(synonims[str(check)])
 | 
| 
 | 
   351     print("\nAnti_Codons:\n")
 | 
| 
 | 
   352     pprint(anti_codons[str(anti_check)])
 | 
| 
 | 
   353 
 | 
| 
 | 
   354     #i = synonims.keys()
 | 
| 
 | 
   355     #right = True
 | 
| 
 | 
   356     #while len(i) > 0:
 | 
| 
 | 
   357     #    tmp = i.pop()
 | 
| 
 | 
   358     #    check = Bio.Seq.Seq(tmp)
 | 
| 
 | 
   359     #    anti_check = check.reverse_complement()
 | 
| 
 | 
   360 
 | 
| 
 | 
   361 
 | 
| 
 | 
   362     return {"synonims":synonims, "anti_synonims":anti_codons}
 |