comparison galaxy/codon_usage.py @ 42:439b70949f8d draft

Uploaded
author gianmarco_piccinno
date Mon, 20 May 2019 16:44:00 -0400
parents
children
comparison
equal deleted inserted replaced
41:bd35b13fabfb 42:439b70949f8d
1 #!/usr/bin/env python
2
3 import Bio as Bio
4 from Bio import SeqIO
5 from Bio.Data import CodonTable
6 import re
7 from pprint import pprint
8 import argparse as ap
9 import sys
10 import os
11 import pandas as pd
12
13
14 def read_input(data = "example.fna", type_ = "fasta"):
15
16 """
17 Accepted formats:
18 - fasta (multifasta)
19 - gff
20 - gbk
21
22 """
23
24 seqs = ""
25
26 if type_ == "fasta":
27 with open(data, "rU") as handle:
28 for record in SeqIO.parse(handle, type_):
29 seqs = seqs + str(record.seq)
30
31 elif type_ == "gbk":
32 with open(data, "rU") as input_handle:
33 types = []
34 for record in SeqIO.parse(input_handle, "genbank"):
35 for feature in record.features:
36 types.append(feature.type)
37 if feature.type == "CDS":
38 if feature.location.strand == +1:
39 seq = record.seq[feature.location.start:feature.location.end]
40 seqs = seqs + str(seq)
41 elif feature.location.strand == -1:
42 seq = record.seq[feature.location.start:feature.location.end].reverse_complement
43 seqs = seqs + str(seq)
44 return seqs
45
46 def codon_usage(seqs, codonTable):
47
48 codon_usage = {}
49 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
50
51 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
52
53
54 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
55 b_cod_table[cod] = "_Stop"
56
57 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
58 b_cod_table[cod + " Start"] = b_cod_table[cod]
59 b_cod_table.pop(cod)
60
61 aas = set(b_cod_table.values())
62
63
64 for aa in aas:
65 codon_usage[aa] = {}
66 for codon in b_cod_table.keys():
67 if b_cod_table[codon] == aa:
68 codon_usage[aa][codon] = tmp.count(codon.split(" ")[0])
69
70
71 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
72
73 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
74 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
75 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
76
77 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
78
79 if __name__ == '__main__':
80
81 parser = ap.ArgumentParser(description=
82 'This script takes as input gff, gbk and single or multifasta files and \n'
83 'compute the codon usage for a specified codon table.\n'
84 'Usage:\n'
85 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n'
86 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n'
87 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n',
88 formatter_class=ap.RawTextHelpFormatter)
89
90 parser.add_argument('-i','--input', help='The path to the input file',required=True)
91 parser.add_argument('-t','--type', help=
92 'The format of the file [genebank, fasta, gff ...]', required=True)
93 parser.add_argument('-c','--codonTable', help=
94 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n'
95 'Alternative Flatworm Mitochondrial,\\n'
96 'Alternative Yeast Nuclear,\n'
97 'Archaeal,\n'
98 'Ascidian Mitochondrial,\n'
99 'Bacterial,\n'
100 'Blastocrithidia Nuclear,\n'
101 'Blepharisma Macronuclear,\n'
102 'Candidate Division SR1,\n'
103 'Chlorophycean Mitochondrial,\n'
104 'Ciliate Nuclear,\n'
105 'Coelenterate Mitochondrial,\n'
106 'Condylostoma Nuclear,\n'
107 'Dasycladacean Nuclear,\n'
108 'Echinoderm Mitochondrial,\n'
109 'Euplotid Nuclear,\n'
110 'Flatworm Mitochondrial,\n'
111 'Gracilibacteria,\n'
112 'Hexamita Nuclear,\n'
113 'Invertebrate Mitochondrial,\n'
114 'Karyorelict Nuclear,\n'
115 'Mesodinium Nuclear,\n'
116 'Mold Mitochondrial,\n'
117 'Mycoplasma,\n'
118 'Pachysolen tannophilus Nuclear,\n'
119 'Peritrich Nuclear,\n'
120 'Plant Plastid,\n'
121 'Protozoan Mitochondrial,\n'
122 'Pterobranchia Mitochondrial,\n'
123 'SGC0,\n'
124 'SGC1,\n'
125 'SGC2,\n'
126 'SGC3,\n'
127 'SGC4,\n'
128 'SGC5,\n'
129 'SGC8,\n'
130 'SGC9,\n'
131 'Scenedesmus obliquus Mitochondrial,\n'
132 'Spiroplasma,\n'
133 'Standard,\n'
134 'Thraustochytrium Mitochondrial,\n'
135 'Trematode Mitochondrial,\n'
136 'Vertebrate Mitochondrial,\n'
137 'Yeast Mitochondrial\n', required=True)
138
139 parser.add_argument('-o','--output', help='Description for bar argument', required=True)
140 args = vars(parser.parse_args())
141
142 seqs = read_input(data=args['input'], type_=args['type'])
143 out = codon_usage(seqs, args['codonTable'])
144
145 with open(args['output'], "w") as outf:
146 out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])
147
148