| 
28
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 import Bio as Bio
 | 
| 
 | 
     4 from Bio import SeqIO
 | 
| 
 | 
     5 from Bio.Data import CodonTable
 | 
| 
 | 
     6 import re
 | 
| 
 | 
     7 from pprint import pprint
 | 
| 
 | 
     8 import argparse as ap
 | 
| 
 | 
     9 import sys
 | 
| 
 | 
    10 import os
 | 
| 
 | 
    11 import pandas as pd
 | 
| 
 | 
    12 #from BCBio import GFF
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 def read_input(data = "example.fna", type_ = "fasta"):
 | 
| 
 | 
    16 
 | 
| 
 | 
    17     """
 | 
| 
 | 
    18     Accepted formats:
 | 
| 
 | 
    19         - fasta (multifasta)
 | 
| 
 | 
    20         - gff
 | 
| 
 | 
    21         - gbk
 | 
| 
 | 
    22 
 | 
| 
 | 
    23     """
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 
 | 
| 
 | 
    26     seqs = ""
 | 
| 
 | 
    27 
 | 
| 
 | 
    28     if type_ == "fasta":
 | 
| 
 | 
    29         with open(data, "rU") as handle:
 | 
| 
 | 
    30             for record in SeqIO.parse(handle, type_):
 | 
| 
 | 
    31                 seqs = seqs + str(record.seq)
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 
 | 
| 
 | 
    34     #elif type_ == "gff":
 | 
| 
 | 
    35     #    with open(data, "rU") as handle:
 | 
| 
 | 
    36     #        for record in GFF.parse(handle):
 | 
| 
 | 
    37     #            seqs = seqs + str(record.seq)
 | 
| 
 | 
    38 
 | 
| 
 | 
    39     elif type_ == "gbk":
 | 
| 
 | 
    40         with open(data, "rU") as input_handle:
 | 
| 
 | 
    41                 for record in SeqIO.parse(input_handle, "genbank"):
 | 
| 
 | 
    42                     seqs = seqs + str(record.seq)
 | 
| 
 | 
    43 
 | 
| 
 | 
    44 
 | 
| 
 | 
    45     return seqs
 | 
| 
 | 
    46 
 | 
| 
 | 
    47 def codon_usage(seqs, codonTable):
 | 
| 
 | 
    48 
 | 
| 
 | 
    49     codon_usage = {}
 | 
| 
 | 
    50     tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
 | 
| 
 | 
    51 
 | 
| 
 | 
    52     b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
 | 
| 
 | 
    53 
 | 
| 
 | 
    54 
 | 
| 
 | 
    55     for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
 | 
| 
 | 
    56         b_cod_table[cod] = "_Stop"
 | 
| 
 | 
    57 
 | 
| 
 | 
    58     for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
 | 
| 
 | 
    59             b_cod_table[cod + " Start"] = b_cod_table[cod]
 | 
| 
 | 
    60             b_cod_table.pop(cod)
 | 
| 
 | 
    61 
 | 
| 
 | 
    62     aas = set(b_cod_table.values())
 | 
| 
 | 
    63 
 | 
| 
 | 
    64 
 | 
| 
 | 
    65     for aa in aas:
 | 
| 
 | 
    66         codon_usage[aa] = {}
 | 
| 
 | 
    67         for codon in b_cod_table.keys():
 | 
| 
 | 
    68             if b_cod_table[codon] == aa:
 | 
| 
 | 
    69                 codon_usage[aa][codon] = tmp.count(codon.split(" ")[0])
 | 
| 
 | 
    70 
 | 
| 
 | 
    71 
 | 
| 
 | 
    72     tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
 | 
| 
 | 
    73 
 | 
| 
 | 
    74     codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
 | 
| 
 | 
    75     codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
 | 
| 
 | 
    76     codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
 | 
| 
 | 
    77 
 | 
| 
 | 
    78     return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
 | 
| 
 | 
    79 
 | 
| 
 | 
    80 if __name__ == '__main__':
 | 
| 
 | 
    81 
 | 
| 
 | 
    82     parser = ap.ArgumentParser(description=
 | 
| 
 | 
    83     'This script takes as input gff, gbk and single or multifasta files and \n'
 | 
| 
 | 
    84     'compute the codon usage for a specified codon table.\n'
 | 
| 
 | 
    85     'Usage:\n'
 | 
| 
 | 
    86     'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n'
 | 
| 
 | 
    87     'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n'
 | 
| 
 | 
    88     'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n',
 | 
| 
 | 
    89     formatter_class=ap.RawTextHelpFormatter)
 | 
| 
 | 
    90 
 | 
| 
 | 
    91     parser.add_argument('-i','--input', help='The path to the input file',required=True)
 | 
| 
 | 
    92     parser.add_argument('-t','--type', help=
 | 
| 
 | 
    93     'The format of the file [genebank, fasta, gff ...]', required=True)
 | 
| 
 | 
    94     parser.add_argument('-c','--codonTable', help=
 | 
| 
 | 
    95     'The codon table to be used [Standard, Bacterial, Archaeal ...]\n'
 | 
| 
 | 
    96     'Alternative Flatworm Mitochondrial,\\n'
 | 
| 
 | 
    97     'Alternative Yeast Nuclear,\n'
 | 
| 
 | 
    98     'Archaeal,\n'
 | 
| 
 | 
    99     'Ascidian Mitochondrial,\n'
 | 
| 
 | 
   100     'Bacterial,\n'
 | 
| 
 | 
   101     'Blastocrithidia Nuclear,\n'
 | 
| 
 | 
   102     'Blepharisma Macronuclear,\n'
 | 
| 
 | 
   103     'Candidate Division SR1,\n'
 | 
| 
 | 
   104     'Chlorophycean Mitochondrial,\n'
 | 
| 
 | 
   105     'Ciliate Nuclear,\n'
 | 
| 
 | 
   106     'Coelenterate Mitochondrial,\n'
 | 
| 
 | 
   107     'Condylostoma Nuclear,\n'
 | 
| 
 | 
   108     'Dasycladacean Nuclear,\n'
 | 
| 
 | 
   109     'Echinoderm Mitochondrial,\n'
 | 
| 
 | 
   110     'Euplotid Nuclear,\n'
 | 
| 
 | 
   111     'Flatworm Mitochondrial,\n'
 | 
| 
 | 
   112     'Gracilibacteria,\n'
 | 
| 
 | 
   113     'Hexamita Nuclear,\n'
 | 
| 
 | 
   114     'Invertebrate Mitochondrial,\n'
 | 
| 
 | 
   115     'Karyorelict Nuclear,\n'
 | 
| 
 | 
   116     'Mesodinium Nuclear,\n'
 | 
| 
 | 
   117     'Mold Mitochondrial,\n'
 | 
| 
 | 
   118     'Mycoplasma,\n'
 | 
| 
 | 
   119     'Pachysolen tannophilus Nuclear,\n'
 | 
| 
 | 
   120     'Peritrich Nuclear,\n'
 | 
| 
 | 
   121     'Plant Plastid,\n'
 | 
| 
 | 
   122     'Protozoan Mitochondrial,\n'
 | 
| 
 | 
   123     'Pterobranchia Mitochondrial,\n'
 | 
| 
 | 
   124     'SGC0,\n'
 | 
| 
 | 
   125     'SGC1,\n'
 | 
| 
 | 
   126     'SGC2,\n'
 | 
| 
 | 
   127     'SGC3,\n'
 | 
| 
 | 
   128     'SGC4,\n'
 | 
| 
 | 
   129     'SGC5,\n'
 | 
| 
 | 
   130     'SGC8,\n'
 | 
| 
 | 
   131     'SGC9,\n'
 | 
| 
 | 
   132     'Scenedesmus obliquus Mitochondrial,\n'
 | 
| 
 | 
   133     'Spiroplasma,\n'
 | 
| 
 | 
   134     'Standard,\n'
 | 
| 
 | 
   135     'Thraustochytrium Mitochondrial,\n'
 | 
| 
 | 
   136     'Trematode Mitochondrial,\n'
 | 
| 
 | 
   137     'Vertebrate Mitochondrial,\n'
 | 
| 
 | 
   138     'Yeast Mitochondrial\n', required=True)
 | 
| 
 | 
   139 
 | 
| 
 | 
   140     parser.add_argument('-o','--output', help='Description for bar argument', required=True)
 | 
| 
 | 
   141     args = vars(parser.parse_args())
 | 
| 
 | 
   142 
 | 
| 
 | 
   143     seqs = read_input(data=args['input'], type_=args['type'])
 | 
| 
 | 
   144     out = codon_usage(seqs, args['codonTable'])
 | 
| 
 | 
   145 
 | 
| 
 | 
   146     with open(args['output'], "w") as outf:
 | 
| 
 | 
   147         out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])
 | 
| 
 | 
   148 
 | 
| 
 | 
   149      |