0
|
1 #!/home/gianmarco/galaxy-python/python
|
|
2
|
|
3 import Bio
|
|
4 from Bio import SeqIO
|
|
5 from Bio.Data import CodonTable
|
|
6 import re
|
|
7 import sys
|
|
8 import os
|
|
9 import pandas as pd
|
|
10
|
|
11 def read_input(data = "example.fna"):
|
|
12
|
|
13 seqs = ""
|
|
14 with open(data, "rU") as handle:
|
|
15 for record in SeqIO.parse(handle, "fasta"):
|
|
16 seqs = seqs + str(record.seq)
|
|
17
|
|
18 return seqs
|
|
19
|
|
20 def codon_usage(seqs, codonTable):
|
|
21
|
|
22 codon_usage = {}
|
|
23 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
|
|
24
|
|
25 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
|
|
26
|
|
27
|
|
28 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
|
|
29 b_cod_table[cod] = "_Stop"
|
|
30
|
|
31 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
|
|
32 b_cod_table[cod + " Start"] = b_cod_table[cod]
|
|
33 b_cod_table.pop(cod)
|
|
34
|
|
35 aas = set(b_cod_table.values())
|
|
36
|
|
37
|
|
38 for aa in aas:
|
|
39 codon_usage[aa] = {}
|
|
40 for codon in b_cod_table.keys():
|
|
41 if b_cod_table[codon] == aa:
|
|
42 codon_usage[aa][codon] = tmp.count(codon.split(" ")[0])
|
|
43
|
|
44
|
|
45 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
|
|
46
|
|
47 #aas_ = set(tups.keys())
|
|
48
|
|
49 #stops_ = {el for el in aas_ if el[0] == "Stop"}
|
|
50 #aas_ = list(aas_.difference(stops_))
|
|
51 #stops_ = list(stops_)
|
|
52 #aas_.sort()
|
|
53 #stops_.sort()
|
|
54
|
|
55 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
|
|
56 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
|
|
57 #codon_usage_.index.reindex(pd.MultiIndex.from_tuples([aas_, stops_], names=('AA', 'Codon')), level=[0,1])
|
|
58
|
|
59
|
|
60 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
|
|
61
|
|
62 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
|
|
63
|
|
64
|
|
65
|
|
66 if __name__ == '__main__':
|
|
67
|
|
68
|
|
69 seqs = read_input(data=sys.argv[1])
|
|
70 out = codon_usage(seqs,"Bacterial")
|
|
71
|
|
72
|
|
73 with open(sys.argv[2], "w") as outf:
|
|
74 out["Table"].to_csv(outf, sep="\t")
|
|
75 #sys.stdout.write(out['Table']) |