Mercurial > repos > gianmarco_piccinno > project_rm
comparison galaxy/codon_usage.py @ 42:439b70949f8d draft
Uploaded
author | gianmarco_piccinno |
---|---|
date | Mon, 20 May 2019 16:44:00 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
41:bd35b13fabfb | 42:439b70949f8d |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import Bio as Bio | |
4 from Bio import SeqIO | |
5 from Bio.Data import CodonTable | |
6 import re | |
7 from pprint import pprint | |
8 import argparse as ap | |
9 import sys | |
10 import os | |
11 import pandas as pd | |
12 | |
13 | |
14 def read_input(data = "example.fna", type_ = "fasta"): | |
15 | |
16 """ | |
17 Accepted formats: | |
18 - fasta (multifasta) | |
19 - gff | |
20 - gbk | |
21 | |
22 """ | |
23 | |
24 seqs = "" | |
25 | |
26 if type_ == "fasta": | |
27 with open(data, "rU") as handle: | |
28 for record in SeqIO.parse(handle, type_): | |
29 seqs = seqs + str(record.seq) | |
30 | |
31 elif type_ == "gbk": | |
32 with open(data, "rU") as input_handle: | |
33 types = [] | |
34 for record in SeqIO.parse(input_handle, "genbank"): | |
35 for feature in record.features: | |
36 types.append(feature.type) | |
37 if feature.type == "CDS": | |
38 if feature.location.strand == +1: | |
39 seq = record.seq[feature.location.start:feature.location.end] | |
40 seqs = seqs + str(seq) | |
41 elif feature.location.strand == -1: | |
42 seq = record.seq[feature.location.start:feature.location.end].reverse_complement | |
43 seqs = seqs + str(seq) | |
44 return seqs | |
45 | |
46 def codon_usage(seqs, codonTable): | |
47 | |
48 codon_usage = {} | |
49 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] | |
50 | |
51 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table | |
52 | |
53 | |
54 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: | |
55 b_cod_table[cod] = "_Stop" | |
56 | |
57 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: | |
58 b_cod_table[cod + " Start"] = b_cod_table[cod] | |
59 b_cod_table.pop(cod) | |
60 | |
61 aas = set(b_cod_table.values()) | |
62 | |
63 | |
64 for aa in aas: | |
65 codon_usage[aa] = {} | |
66 for codon in b_cod_table.keys(): | |
67 if b_cod_table[codon] == aa: | |
68 codon_usage[aa][codon] = tmp.count(codon.split(" ")[0]) | |
69 | |
70 | |
71 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} | |
72 | |
73 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) | |
74 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) | |
75 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) | |
76 | |
77 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} | |
78 | |
79 if __name__ == '__main__': | |
80 | |
81 parser = ap.ArgumentParser(description= | |
82 'This script takes as input gff, gbk and single or multifasta files and \n' | |
83 'compute the codon usage for a specified codon table.\n' | |
84 'Usage:\n' | |
85 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n' | |
86 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n' | |
87 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n', | |
88 formatter_class=ap.RawTextHelpFormatter) | |
89 | |
90 parser.add_argument('-i','--input', help='The path to the input file',required=True) | |
91 parser.add_argument('-t','--type', help= | |
92 'The format of the file [genebank, fasta, gff ...]', required=True) | |
93 parser.add_argument('-c','--codonTable', help= | |
94 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n' | |
95 'Alternative Flatworm Mitochondrial,\\n' | |
96 'Alternative Yeast Nuclear,\n' | |
97 'Archaeal,\n' | |
98 'Ascidian Mitochondrial,\n' | |
99 'Bacterial,\n' | |
100 'Blastocrithidia Nuclear,\n' | |
101 'Blepharisma Macronuclear,\n' | |
102 'Candidate Division SR1,\n' | |
103 'Chlorophycean Mitochondrial,\n' | |
104 'Ciliate Nuclear,\n' | |
105 'Coelenterate Mitochondrial,\n' | |
106 'Condylostoma Nuclear,\n' | |
107 'Dasycladacean Nuclear,\n' | |
108 'Echinoderm Mitochondrial,\n' | |
109 'Euplotid Nuclear,\n' | |
110 'Flatworm Mitochondrial,\n' | |
111 'Gracilibacteria,\n' | |
112 'Hexamita Nuclear,\n' | |
113 'Invertebrate Mitochondrial,\n' | |
114 'Karyorelict Nuclear,\n' | |
115 'Mesodinium Nuclear,\n' | |
116 'Mold Mitochondrial,\n' | |
117 'Mycoplasma,\n' | |
118 'Pachysolen tannophilus Nuclear,\n' | |
119 'Peritrich Nuclear,\n' | |
120 'Plant Plastid,\n' | |
121 'Protozoan Mitochondrial,\n' | |
122 'Pterobranchia Mitochondrial,\n' | |
123 'SGC0,\n' | |
124 'SGC1,\n' | |
125 'SGC2,\n' | |
126 'SGC3,\n' | |
127 'SGC4,\n' | |
128 'SGC5,\n' | |
129 'SGC8,\n' | |
130 'SGC9,\n' | |
131 'Scenedesmus obliquus Mitochondrial,\n' | |
132 'Spiroplasma,\n' | |
133 'Standard,\n' | |
134 'Thraustochytrium Mitochondrial,\n' | |
135 'Trematode Mitochondrial,\n' | |
136 'Vertebrate Mitochondrial,\n' | |
137 'Yeast Mitochondrial\n', required=True) | |
138 | |
139 parser.add_argument('-o','--output', help='Description for bar argument', required=True) | |
140 args = vars(parser.parse_args()) | |
141 | |
142 seqs = read_input(data=args['input'], type_=args['type']) | |
143 out = codon_usage(seqs, args['codonTable']) | |
144 | |
145 with open(args['output'], "w") as outf: | |
146 out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"]) | |
147 | |
148 |