Mercurial > repos > gianmarco_piccinno > project_rm
comparison galaxy/codon_usage.py @ 42:439b70949f8d draft
Uploaded
| author | gianmarco_piccinno |
|---|---|
| date | Mon, 20 May 2019 16:44:00 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 41:bd35b13fabfb | 42:439b70949f8d |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import Bio as Bio | |
| 4 from Bio import SeqIO | |
| 5 from Bio.Data import CodonTable | |
| 6 import re | |
| 7 from pprint import pprint | |
| 8 import argparse as ap | |
| 9 import sys | |
| 10 import os | |
| 11 import pandas as pd | |
| 12 | |
| 13 | |
| 14 def read_input(data = "example.fna", type_ = "fasta"): | |
| 15 | |
| 16 """ | |
| 17 Accepted formats: | |
| 18 - fasta (multifasta) | |
| 19 - gff | |
| 20 - gbk | |
| 21 | |
| 22 """ | |
| 23 | |
| 24 seqs = "" | |
| 25 | |
| 26 if type_ == "fasta": | |
| 27 with open(data, "rU") as handle: | |
| 28 for record in SeqIO.parse(handle, type_): | |
| 29 seqs = seqs + str(record.seq) | |
| 30 | |
| 31 elif type_ == "gbk": | |
| 32 with open(data, "rU") as input_handle: | |
| 33 types = [] | |
| 34 for record in SeqIO.parse(input_handle, "genbank"): | |
| 35 for feature in record.features: | |
| 36 types.append(feature.type) | |
| 37 if feature.type == "CDS": | |
| 38 if feature.location.strand == +1: | |
| 39 seq = record.seq[feature.location.start:feature.location.end] | |
| 40 seqs = seqs + str(seq) | |
| 41 elif feature.location.strand == -1: | |
| 42 seq = record.seq[feature.location.start:feature.location.end].reverse_complement | |
| 43 seqs = seqs + str(seq) | |
| 44 return seqs | |
| 45 | |
| 46 def codon_usage(seqs, codonTable): | |
| 47 | |
| 48 codon_usage = {} | |
| 49 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] | |
| 50 | |
| 51 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table | |
| 52 | |
| 53 | |
| 54 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: | |
| 55 b_cod_table[cod] = "_Stop" | |
| 56 | |
| 57 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: | |
| 58 b_cod_table[cod + " Start"] = b_cod_table[cod] | |
| 59 b_cod_table.pop(cod) | |
| 60 | |
| 61 aas = set(b_cod_table.values()) | |
| 62 | |
| 63 | |
| 64 for aa in aas: | |
| 65 codon_usage[aa] = {} | |
| 66 for codon in b_cod_table.keys(): | |
| 67 if b_cod_table[codon] == aa: | |
| 68 codon_usage[aa][codon] = tmp.count(codon.split(" ")[0]) | |
| 69 | |
| 70 | |
| 71 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} | |
| 72 | |
| 73 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) | |
| 74 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) | |
| 75 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) | |
| 76 | |
| 77 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} | |
| 78 | |
| 79 if __name__ == '__main__': | |
| 80 | |
| 81 parser = ap.ArgumentParser(description= | |
| 82 'This script takes as input gff, gbk and single or multifasta files and \n' | |
| 83 'compute the codon usage for a specified codon table.\n' | |
| 84 'Usage:\n' | |
| 85 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n' | |
| 86 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n' | |
| 87 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n', | |
| 88 formatter_class=ap.RawTextHelpFormatter) | |
| 89 | |
| 90 parser.add_argument('-i','--input', help='The path to the input file',required=True) | |
| 91 parser.add_argument('-t','--type', help= | |
| 92 'The format of the file [genebank, fasta, gff ...]', required=True) | |
| 93 parser.add_argument('-c','--codonTable', help= | |
| 94 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n' | |
| 95 'Alternative Flatworm Mitochondrial,\\n' | |
| 96 'Alternative Yeast Nuclear,\n' | |
| 97 'Archaeal,\n' | |
| 98 'Ascidian Mitochondrial,\n' | |
| 99 'Bacterial,\n' | |
| 100 'Blastocrithidia Nuclear,\n' | |
| 101 'Blepharisma Macronuclear,\n' | |
| 102 'Candidate Division SR1,\n' | |
| 103 'Chlorophycean Mitochondrial,\n' | |
| 104 'Ciliate Nuclear,\n' | |
| 105 'Coelenterate Mitochondrial,\n' | |
| 106 'Condylostoma Nuclear,\n' | |
| 107 'Dasycladacean Nuclear,\n' | |
| 108 'Echinoderm Mitochondrial,\n' | |
| 109 'Euplotid Nuclear,\n' | |
| 110 'Flatworm Mitochondrial,\n' | |
| 111 'Gracilibacteria,\n' | |
| 112 'Hexamita Nuclear,\n' | |
| 113 'Invertebrate Mitochondrial,\n' | |
| 114 'Karyorelict Nuclear,\n' | |
| 115 'Mesodinium Nuclear,\n' | |
| 116 'Mold Mitochondrial,\n' | |
| 117 'Mycoplasma,\n' | |
| 118 'Pachysolen tannophilus Nuclear,\n' | |
| 119 'Peritrich Nuclear,\n' | |
| 120 'Plant Plastid,\n' | |
| 121 'Protozoan Mitochondrial,\n' | |
| 122 'Pterobranchia Mitochondrial,\n' | |
| 123 'SGC0,\n' | |
| 124 'SGC1,\n' | |
| 125 'SGC2,\n' | |
| 126 'SGC3,\n' | |
| 127 'SGC4,\n' | |
| 128 'SGC5,\n' | |
| 129 'SGC8,\n' | |
| 130 'SGC9,\n' | |
| 131 'Scenedesmus obliquus Mitochondrial,\n' | |
| 132 'Spiroplasma,\n' | |
| 133 'Standard,\n' | |
| 134 'Thraustochytrium Mitochondrial,\n' | |
| 135 'Trematode Mitochondrial,\n' | |
| 136 'Vertebrate Mitochondrial,\n' | |
| 137 'Yeast Mitochondrial\n', required=True) | |
| 138 | |
| 139 parser.add_argument('-o','--output', help='Description for bar argument', required=True) | |
| 140 args = vars(parser.parse_args()) | |
| 141 | |
| 142 seqs = read_input(data=args['input'], type_=args['type']) | |
| 143 out = codon_usage(seqs, args['codonTable']) | |
| 144 | |
| 145 with open(args['output'], "w") as outf: | |
| 146 out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"]) | |
| 147 | |
| 148 |
