| 28 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 import Bio as Bio | 
|  | 4 from Bio import SeqIO | 
|  | 5 from Bio.Data import CodonTable | 
|  | 6 import re | 
|  | 7 from pprint import pprint | 
|  | 8 import argparse as ap | 
|  | 9 import sys | 
|  | 10 import os | 
|  | 11 import pandas as pd | 
|  | 12 #from BCBio import GFF | 
|  | 13 | 
|  | 14 | 
|  | 15 def read_input(data = "example.fna", type_ = "fasta"): | 
|  | 16 | 
|  | 17     """ | 
|  | 18     Accepted formats: | 
|  | 19         - fasta (multifasta) | 
|  | 20         - gff | 
|  | 21         - gbk | 
|  | 22 | 
|  | 23     """ | 
|  | 24 | 
|  | 25 | 
|  | 26     seqs = "" | 
|  | 27 | 
|  | 28     if type_ == "fasta": | 
|  | 29         with open(data, "rU") as handle: | 
|  | 30             for record in SeqIO.parse(handle, type_): | 
|  | 31                 seqs = seqs + str(record.seq) | 
|  | 32 | 
|  | 33 | 
|  | 34     #elif type_ == "gff": | 
|  | 35     #    with open(data, "rU") as handle: | 
|  | 36     #        for record in GFF.parse(handle): | 
|  | 37     #            seqs = seqs + str(record.seq) | 
|  | 38 | 
|  | 39     elif type_ == "gbk": | 
|  | 40         with open(data, "rU") as input_handle: | 
|  | 41                 for record in SeqIO.parse(input_handle, "genbank"): | 
|  | 42                     seqs = seqs + str(record.seq) | 
|  | 43 | 
|  | 44 | 
|  | 45     return seqs | 
|  | 46 | 
|  | 47 def codon_usage(seqs, codonTable): | 
|  | 48 | 
|  | 49     codon_usage = {} | 
|  | 50     tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] | 
|  | 51 | 
|  | 52     b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table | 
|  | 53 | 
|  | 54 | 
|  | 55     for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: | 
|  | 56         b_cod_table[cod] = "_Stop" | 
|  | 57 | 
|  | 58     for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: | 
|  | 59             b_cod_table[cod + " Start"] = b_cod_table[cod] | 
|  | 60             b_cod_table.pop(cod) | 
|  | 61 | 
|  | 62     aas = set(b_cod_table.values()) | 
|  | 63 | 
|  | 64 | 
|  | 65     for aa in aas: | 
|  | 66         codon_usage[aa] = {} | 
|  | 67         for codon in b_cod_table.keys(): | 
|  | 68             if b_cod_table[codon] == aa: | 
|  | 69                 codon_usage[aa][codon] = tmp.count(codon.split(" ")[0]) | 
|  | 70 | 
|  | 71 | 
|  | 72     tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} | 
|  | 73 | 
|  | 74     codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) | 
|  | 75     codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) | 
|  | 76     codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) | 
|  | 77 | 
|  | 78     return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} | 
|  | 79 | 
|  | 80 if __name__ == '__main__': | 
|  | 81 | 
|  | 82     parser = ap.ArgumentParser(description= | 
|  | 83     'This script takes as input gff, gbk and single or multifasta files and \n' | 
|  | 84     'compute the codon usage for a specified codon table.\n' | 
|  | 85     'Usage:\n' | 
|  | 86     'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n' | 
|  | 87     'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n' | 
|  | 88     'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n', | 
|  | 89     formatter_class=ap.RawTextHelpFormatter) | 
|  | 90 | 
|  | 91     parser.add_argument('-i','--input', help='The path to the input file',required=True) | 
|  | 92     parser.add_argument('-t','--type', help= | 
|  | 93     'The format of the file [genebank, fasta, gff ...]', required=True) | 
|  | 94     parser.add_argument('-c','--codonTable', help= | 
|  | 95     'The codon table to be used [Standard, Bacterial, Archaeal ...]\n' | 
|  | 96     'Alternative Flatworm Mitochondrial,\\n' | 
|  | 97     'Alternative Yeast Nuclear,\n' | 
|  | 98     'Archaeal,\n' | 
|  | 99     'Ascidian Mitochondrial,\n' | 
|  | 100     'Bacterial,\n' | 
|  | 101     'Blastocrithidia Nuclear,\n' | 
|  | 102     'Blepharisma Macronuclear,\n' | 
|  | 103     'Candidate Division SR1,\n' | 
|  | 104     'Chlorophycean Mitochondrial,\n' | 
|  | 105     'Ciliate Nuclear,\n' | 
|  | 106     'Coelenterate Mitochondrial,\n' | 
|  | 107     'Condylostoma Nuclear,\n' | 
|  | 108     'Dasycladacean Nuclear,\n' | 
|  | 109     'Echinoderm Mitochondrial,\n' | 
|  | 110     'Euplotid Nuclear,\n' | 
|  | 111     'Flatworm Mitochondrial,\n' | 
|  | 112     'Gracilibacteria,\n' | 
|  | 113     'Hexamita Nuclear,\n' | 
|  | 114     'Invertebrate Mitochondrial,\n' | 
|  | 115     'Karyorelict Nuclear,\n' | 
|  | 116     'Mesodinium Nuclear,\n' | 
|  | 117     'Mold Mitochondrial,\n' | 
|  | 118     'Mycoplasma,\n' | 
|  | 119     'Pachysolen tannophilus Nuclear,\n' | 
|  | 120     'Peritrich Nuclear,\n' | 
|  | 121     'Plant Plastid,\n' | 
|  | 122     'Protozoan Mitochondrial,\n' | 
|  | 123     'Pterobranchia Mitochondrial,\n' | 
|  | 124     'SGC0,\n' | 
|  | 125     'SGC1,\n' | 
|  | 126     'SGC2,\n' | 
|  | 127     'SGC3,\n' | 
|  | 128     'SGC4,\n' | 
|  | 129     'SGC5,\n' | 
|  | 130     'SGC8,\n' | 
|  | 131     'SGC9,\n' | 
|  | 132     'Scenedesmus obliquus Mitochondrial,\n' | 
|  | 133     'Spiroplasma,\n' | 
|  | 134     'Standard,\n' | 
|  | 135     'Thraustochytrium Mitochondrial,\n' | 
|  | 136     'Trematode Mitochondrial,\n' | 
|  | 137     'Vertebrate Mitochondrial,\n' | 
|  | 138     'Yeast Mitochondrial\n', required=True) | 
|  | 139 | 
|  | 140     parser.add_argument('-o','--output', help='Description for bar argument', required=True) | 
|  | 141     args = vars(parser.parse_args()) | 
|  | 142 | 
|  | 143     seqs = read_input(data=args['input'], type_=args['type']) | 
|  | 144     out = codon_usage(seqs, args['codonTable']) | 
|  | 145 | 
|  | 146     with open(args['output'], "w") as outf: | 
|  | 147         out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"]) | 
|  | 148 | 
|  | 149 |