Mercurial > repos > gianmarco_piccinno > project_rm
comparison codon_usage.py @ 28:e85a1feaaf38 draft
Uploaded
| author | fabio |
|---|---|
| date | Wed, 12 Dec 2018 08:12:25 -0500 |
| parents | |
| children | 3cb2af2435d3 |
comparison
equal
deleted
inserted
replaced
| 27:1eb19659ff16 | 28:e85a1feaaf38 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import Bio as Bio | |
| 4 from Bio import SeqIO | |
| 5 from Bio.Data import CodonTable | |
| 6 import re | |
| 7 from pprint import pprint | |
| 8 import argparse as ap | |
| 9 import sys | |
| 10 import os | |
| 11 import pandas as pd | |
| 12 #from BCBio import GFF | |
| 13 | |
| 14 | |
| 15 def read_input(data = "example.fna", type_ = "fasta"): | |
| 16 | |
| 17 """ | |
| 18 Accepted formats: | |
| 19 - fasta (multifasta) | |
| 20 - gff | |
| 21 - gbk | |
| 22 | |
| 23 """ | |
| 24 | |
| 25 | |
| 26 seqs = "" | |
| 27 | |
| 28 if type_ == "fasta": | |
| 29 with open(data, "rU") as handle: | |
| 30 for record in SeqIO.parse(handle, type_): | |
| 31 seqs = seqs + str(record.seq) | |
| 32 | |
| 33 | |
| 34 #elif type_ == "gff": | |
| 35 # with open(data, "rU") as handle: | |
| 36 # for record in GFF.parse(handle): | |
| 37 # seqs = seqs + str(record.seq) | |
| 38 | |
| 39 elif type_ == "gbk": | |
| 40 with open(data, "rU") as input_handle: | |
| 41 for record in SeqIO.parse(input_handle, "genbank"): | |
| 42 seqs = seqs + str(record.seq) | |
| 43 | |
| 44 | |
| 45 return seqs | |
| 46 | |
| 47 def codon_usage(seqs, codonTable): | |
| 48 | |
| 49 codon_usage = {} | |
| 50 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] | |
| 51 | |
| 52 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table | |
| 53 | |
| 54 | |
| 55 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: | |
| 56 b_cod_table[cod] = "_Stop" | |
| 57 | |
| 58 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: | |
| 59 b_cod_table[cod + " Start"] = b_cod_table[cod] | |
| 60 b_cod_table.pop(cod) | |
| 61 | |
| 62 aas = set(b_cod_table.values()) | |
| 63 | |
| 64 | |
| 65 for aa in aas: | |
| 66 codon_usage[aa] = {} | |
| 67 for codon in b_cod_table.keys(): | |
| 68 if b_cod_table[codon] == aa: | |
| 69 codon_usage[aa][codon] = tmp.count(codon.split(" ")[0]) | |
| 70 | |
| 71 | |
| 72 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} | |
| 73 | |
| 74 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) | |
| 75 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) | |
| 76 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) | |
| 77 | |
| 78 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} | |
| 79 | |
| 80 if __name__ == '__main__': | |
| 81 | |
| 82 parser = ap.ArgumentParser(description= | |
| 83 'This script takes as input gff, gbk and single or multifasta files and \n' | |
| 84 'compute the codon usage for a specified codon table.\n' | |
| 85 'Usage:\n' | |
| 86 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n' | |
| 87 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n' | |
| 88 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n', | |
| 89 formatter_class=ap.RawTextHelpFormatter) | |
| 90 | |
| 91 parser.add_argument('-i','--input', help='The path to the input file',required=True) | |
| 92 parser.add_argument('-t','--type', help= | |
| 93 'The format of the file [genebank, fasta, gff ...]', required=True) | |
| 94 parser.add_argument('-c','--codonTable', help= | |
| 95 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n' | |
| 96 'Alternative Flatworm Mitochondrial,\\n' | |
| 97 'Alternative Yeast Nuclear,\n' | |
| 98 'Archaeal,\n' | |
| 99 'Ascidian Mitochondrial,\n' | |
| 100 'Bacterial,\n' | |
| 101 'Blastocrithidia Nuclear,\n' | |
| 102 'Blepharisma Macronuclear,\n' | |
| 103 'Candidate Division SR1,\n' | |
| 104 'Chlorophycean Mitochondrial,\n' | |
| 105 'Ciliate Nuclear,\n' | |
| 106 'Coelenterate Mitochondrial,\n' | |
| 107 'Condylostoma Nuclear,\n' | |
| 108 'Dasycladacean Nuclear,\n' | |
| 109 'Echinoderm Mitochondrial,\n' | |
| 110 'Euplotid Nuclear,\n' | |
| 111 'Flatworm Mitochondrial,\n' | |
| 112 'Gracilibacteria,\n' | |
| 113 'Hexamita Nuclear,\n' | |
| 114 'Invertebrate Mitochondrial,\n' | |
| 115 'Karyorelict Nuclear,\n' | |
| 116 'Mesodinium Nuclear,\n' | |
| 117 'Mold Mitochondrial,\n' | |
| 118 'Mycoplasma,\n' | |
| 119 'Pachysolen tannophilus Nuclear,\n' | |
| 120 'Peritrich Nuclear,\n' | |
| 121 'Plant Plastid,\n' | |
| 122 'Protozoan Mitochondrial,\n' | |
| 123 'Pterobranchia Mitochondrial,\n' | |
| 124 'SGC0,\n' | |
| 125 'SGC1,\n' | |
| 126 'SGC2,\n' | |
| 127 'SGC3,\n' | |
| 128 'SGC4,\n' | |
| 129 'SGC5,\n' | |
| 130 'SGC8,\n' | |
| 131 'SGC9,\n' | |
| 132 'Scenedesmus obliquus Mitochondrial,\n' | |
| 133 'Spiroplasma,\n' | |
| 134 'Standard,\n' | |
| 135 'Thraustochytrium Mitochondrial,\n' | |
| 136 'Trematode Mitochondrial,\n' | |
| 137 'Vertebrate Mitochondrial,\n' | |
| 138 'Yeast Mitochondrial\n', required=True) | |
| 139 | |
| 140 parser.add_argument('-o','--output', help='Description for bar argument', required=True) | |
| 141 args = vars(parser.parse_args()) | |
| 142 | |
| 143 seqs = read_input(data=args['input'], type_=args['type']) | |
| 144 out = codon_usage(seqs, args['codonTable']) | |
| 145 | |
| 146 with open(args['output'], "w") as outf: | |
| 147 out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"]) | |
| 148 | |
| 149 |
