| 42 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 import Bio as Bio | 
|  | 4 from Bio import SeqIO | 
|  | 5 from Bio.Data import CodonTable | 
|  | 6 import re | 
|  | 7 from pprint import pprint | 
|  | 8 import argparse as ap | 
|  | 9 import sys | 
|  | 10 import os | 
|  | 11 import pandas as pd | 
|  | 12 | 
|  | 13 | 
|  | 14 def read_input(data = "example.fna", type_ = "fasta"): | 
|  | 15 | 
|  | 16     """ | 
|  | 17     Accepted formats: | 
|  | 18         - fasta (multifasta) | 
|  | 19         - gff | 
|  | 20         - gbk | 
|  | 21 | 
|  | 22     """ | 
|  | 23 | 
|  | 24     seqs = "" | 
|  | 25 | 
|  | 26     if type_ == "fasta": | 
|  | 27         with open(data, "rU") as handle: | 
|  | 28             for record in SeqIO.parse(handle, type_): | 
|  | 29                 seqs = seqs + str(record.seq) | 
|  | 30 | 
|  | 31     elif type_ == "gbk": | 
|  | 32         with open(data, "rU") as input_handle: | 
|  | 33             types = [] | 
|  | 34             for record in SeqIO.parse(input_handle, "genbank"): | 
|  | 35                 for feature in record.features: | 
|  | 36                     types.append(feature.type) | 
|  | 37                     if feature.type == "CDS": | 
|  | 38                         if feature.location.strand == +1: | 
|  | 39                             seq = record.seq[feature.location.start:feature.location.end] | 
|  | 40                             seqs = seqs + str(seq) | 
|  | 41                         elif feature.location.strand == -1: | 
|  | 42                             seq = record.seq[feature.location.start:feature.location.end].reverse_complement | 
|  | 43                             seqs = seqs + str(seq) | 
|  | 44     return seqs | 
|  | 45 | 
|  | 46 def codon_usage(seqs, codonTable): | 
|  | 47 | 
|  | 48     codon_usage = {} | 
|  | 49     tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] | 
|  | 50 | 
|  | 51     b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table | 
|  | 52 | 
|  | 53 | 
|  | 54     for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: | 
|  | 55         b_cod_table[cod] = "_Stop" | 
|  | 56 | 
|  | 57     for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: | 
|  | 58             b_cod_table[cod + " Start"] = b_cod_table[cod] | 
|  | 59             b_cod_table.pop(cod) | 
|  | 60 | 
|  | 61     aas = set(b_cod_table.values()) | 
|  | 62 | 
|  | 63 | 
|  | 64     for aa in aas: | 
|  | 65         codon_usage[aa] = {} | 
|  | 66         for codon in b_cod_table.keys(): | 
|  | 67             if b_cod_table[codon] == aa: | 
|  | 68                 codon_usage[aa][codon] = tmp.count(codon.split(" ")[0]) | 
|  | 69 | 
|  | 70 | 
|  | 71     tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} | 
|  | 72 | 
|  | 73     codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) | 
|  | 74     codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) | 
|  | 75     codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) | 
|  | 76 | 
|  | 77     return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} | 
|  | 78 | 
|  | 79 if __name__ == '__main__': | 
|  | 80 | 
|  | 81     parser = ap.ArgumentParser(description= | 
|  | 82     'This script takes as input gff, gbk and single or multifasta files and \n' | 
|  | 83     'compute the codon usage for a specified codon table.\n' | 
|  | 84     'Usage:\n' | 
|  | 85     'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n' | 
|  | 86     'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n' | 
|  | 87     'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n', | 
|  | 88     formatter_class=ap.RawTextHelpFormatter) | 
|  | 89 | 
|  | 90     parser.add_argument('-i','--input', help='The path to the input file',required=True) | 
|  | 91     parser.add_argument('-t','--type', help= | 
|  | 92     'The format of the file [genebank, fasta, gff ...]', required=True) | 
|  | 93     parser.add_argument('-c','--codonTable', help= | 
|  | 94     'The codon table to be used [Standard, Bacterial, Archaeal ...]\n' | 
|  | 95     'Alternative Flatworm Mitochondrial,\\n' | 
|  | 96     'Alternative Yeast Nuclear,\n' | 
|  | 97     'Archaeal,\n' | 
|  | 98     'Ascidian Mitochondrial,\n' | 
|  | 99     'Bacterial,\n' | 
|  | 100     'Blastocrithidia Nuclear,\n' | 
|  | 101     'Blepharisma Macronuclear,\n' | 
|  | 102     'Candidate Division SR1,\n' | 
|  | 103     'Chlorophycean Mitochondrial,\n' | 
|  | 104     'Ciliate Nuclear,\n' | 
|  | 105     'Coelenterate Mitochondrial,\n' | 
|  | 106     'Condylostoma Nuclear,\n' | 
|  | 107     'Dasycladacean Nuclear,\n' | 
|  | 108     'Echinoderm Mitochondrial,\n' | 
|  | 109     'Euplotid Nuclear,\n' | 
|  | 110     'Flatworm Mitochondrial,\n' | 
|  | 111     'Gracilibacteria,\n' | 
|  | 112     'Hexamita Nuclear,\n' | 
|  | 113     'Invertebrate Mitochondrial,\n' | 
|  | 114     'Karyorelict Nuclear,\n' | 
|  | 115     'Mesodinium Nuclear,\n' | 
|  | 116     'Mold Mitochondrial,\n' | 
|  | 117     'Mycoplasma,\n' | 
|  | 118     'Pachysolen tannophilus Nuclear,\n' | 
|  | 119     'Peritrich Nuclear,\n' | 
|  | 120     'Plant Plastid,\n' | 
|  | 121     'Protozoan Mitochondrial,\n' | 
|  | 122     'Pterobranchia Mitochondrial,\n' | 
|  | 123     'SGC0,\n' | 
|  | 124     'SGC1,\n' | 
|  | 125     'SGC2,\n' | 
|  | 126     'SGC3,\n' | 
|  | 127     'SGC4,\n' | 
|  | 128     'SGC5,\n' | 
|  | 129     'SGC8,\n' | 
|  | 130     'SGC9,\n' | 
|  | 131     'Scenedesmus obliquus Mitochondrial,\n' | 
|  | 132     'Spiroplasma,\n' | 
|  | 133     'Standard,\n' | 
|  | 134     'Thraustochytrium Mitochondrial,\n' | 
|  | 135     'Trematode Mitochondrial,\n' | 
|  | 136     'Vertebrate Mitochondrial,\n' | 
|  | 137     'Yeast Mitochondrial\n', required=True) | 
|  | 138 | 
|  | 139     parser.add_argument('-o','--output', help='Description for bar argument', required=True) | 
|  | 140     args = vars(parser.parse_args()) | 
|  | 141 | 
|  | 142     seqs = read_input(data=args['input'], type_=args['type']) | 
|  | 143     out = codon_usage(seqs, args['codonTable']) | 
|  | 144 | 
|  | 145     with open(args['output'], "w") as outf: | 
|  | 146         out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"]) | 
|  | 147 | 
|  | 148 |