Mercurial > repos > gianmarco_piccinno > project_rm
changeset 37:9225a7649585 draft
Uploaded
| author | gianmarco_piccinno | 
|---|---|
| date | Mon, 11 Feb 2019 04:09:02 -0500 | 
| parents | b52b91e34d15 | 
| children | 18e79593c628 | 
| files | codon_usage.py codon_usage.xml project_rm/codon_usage.py project_rm/codon_usage.xml | 
| diffstat | 4 files changed, 188 insertions(+), 188 deletions(-) [+] | 
line wrap: on
 line diff
--- a/codon_usage.py Wed Dec 12 09:10:57 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,148 +0,0 @@ -#!/usr/bin/env python - -import Bio as Bio -from Bio import SeqIO -from Bio.Data import CodonTable -import re -from pprint import pprint -import argparse as ap -import sys -import os -import pandas as pd - - -def read_input(data = "example.fna", type_ = "fasta"): - - """ - Accepted formats: - - fasta (multifasta) - - gff - - gbk - - """ - - - seqs = "" - - if type_ == "fasta": - with open(data, "rU") as handle: - for record in SeqIO.parse(handle, type_): - seqs = seqs + str(record.seq) - - - #elif type_ == "gff": - # with open(data, "rU") as handle: - # for record in GFF.parse(handle): - # seqs = seqs + str(record.seq) - - elif type_ == "gbk": - with open(data, "rU") as input_handle: - for record in SeqIO.parse(input_handle, "genbank"): - seqs = seqs + str(record.seq) - - - return seqs - -def codon_usage(seqs, codonTable): - - codon_usage = {} - tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] - - b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table - - - for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: - b_cod_table[cod] = "_Stop" - - for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: - b_cod_table[cod + " Start"] = b_cod_table[cod] - b_cod_table.pop(cod) - - aas = set(b_cod_table.values()) - - - for aa in aas: - codon_usage[aa] = {} - for codon in b_cod_table.keys(): - if b_cod_table[codon] == aa: - codon_usage[aa][codon] = tmp.count(codon.split(" ")[0]) - - - tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} - - codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) - codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) - codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) - - return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} - -if __name__ == '__main__': - - parser = ap.ArgumentParser(description= - 'This script takes as input gff, gbk and single or multifasta files and \n' - 'compute the codon usage for a specified codon table.\n' - 'Usage:\n' - 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n' - 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n' - 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n', - formatter_class=ap.RawTextHelpFormatter) - - parser.add_argument('-i','--input', help='The path to the input file',required=True) - parser.add_argument('-t','--type', help= - 'The format of the file [genebank, fasta, gff ...]', required=True) - parser.add_argument('-c','--codonTable', help= - 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n' - 'Alternative Flatworm Mitochondrial,\\n' - 'Alternative Yeast Nuclear,\n' - 'Archaeal,\n' - 'Ascidian Mitochondrial,\n' - 'Bacterial,\n' - 'Blastocrithidia Nuclear,\n' - 'Blepharisma Macronuclear,\n' - 'Candidate Division SR1,\n' - 'Chlorophycean Mitochondrial,\n' - 'Ciliate Nuclear,\n' - 'Coelenterate Mitochondrial,\n' - 'Condylostoma Nuclear,\n' - 'Dasycladacean Nuclear,\n' - 'Echinoderm Mitochondrial,\n' - 'Euplotid Nuclear,\n' - 'Flatworm Mitochondrial,\n' - 'Gracilibacteria,\n' - 'Hexamita Nuclear,\n' - 'Invertebrate Mitochondrial,\n' - 'Karyorelict Nuclear,\n' - 'Mesodinium Nuclear,\n' - 'Mold Mitochondrial,\n' - 'Mycoplasma,\n' - 'Pachysolen tannophilus Nuclear,\n' - 'Peritrich Nuclear,\n' - 'Plant Plastid,\n' - 'Protozoan Mitochondrial,\n' - 'Pterobranchia Mitochondrial,\n' - 'SGC0,\n' - 'SGC1,\n' - 'SGC2,\n' - 'SGC3,\n' - 'SGC4,\n' - 'SGC5,\n' - 'SGC8,\n' - 'SGC9,\n' - 'Scenedesmus obliquus Mitochondrial,\n' - 'Spiroplasma,\n' - 'Standard,\n' - 'Thraustochytrium Mitochondrial,\n' - 'Trematode Mitochondrial,\n' - 'Vertebrate Mitochondrial,\n' - 'Yeast Mitochondrial\n', required=True) - - parser.add_argument('-o','--output', help='Description for bar argument', required=True) - args = vars(parser.parse_args()) - - seqs = read_input(data=args['input'], type_=args['type']) - out = codon_usage(seqs, args['codonTable']) - - with open(args['output'], "w") as outf: - out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"]) - - \ No newline at end of file
--- a/codon_usage.xml Wed Dec 12 09:10:57 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -<?xml version="1.0"?> -<tool name="Codon Usage" id="codon_usage" version="0.2.3"> - <description>for each sequence in a file</description> - <requirements> - <requirement type="package" version="2.7.15">python</requirement> - <requirement type="package" version="1.72">biopython</requirement> - <requirement type="package" version="0.23.4">pandas</requirement> - <requirement type="package" version="1.15.3">numpy</requirement> - </requirements> - - <command> -<![CDATA[ - python '$__tool_directory__/codon_usage.py' -i $input -t $input_type -o $output -c $codon_table -]]> - </command> - <inputs> - <param name="input" format="fasta" type="data" label="Source file"/> - - <param name="input_type" type="select" format="text"> - <label>Indicate the input file format</label> - <option value="fasta">Fasta</option> - <option value="gbk">gbk</option> - </param> - - <param name="codon_table" type="select" format="text"> - <label>Choose the proper codon table for your organism)</label> - <option value="Archaeal"> Archaeal</option> - <option value="Bacterial">Bacterial</option> - <option value="Standard">Standard</option> - </param> - </inputs> - - <outputs> - <data format="tabular" name="output" /> - </outputs> - - <help> -This tool compute codon usage of an annotated genome [preferably Prokaryotes]. - </help> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/project_rm/codon_usage.py Mon Feb 11 04:09:02 2019 -0500 @@ -0,0 +1,148 @@ +#!/usr/bin/env python + +import Bio as Bio +from Bio import SeqIO +from Bio.Data import CodonTable +import re +from pprint import pprint +import argparse as ap +import sys +import os +import pandas as pd + + +def read_input(data = "example.fna", type_ = "fasta"): + + """ + Accepted formats: + - fasta (multifasta) + - gff + - gbk + + """ + + seqs = "" + + if type_ == "fasta": + with open(data, "rU") as handle: + for record in SeqIO.parse(handle, type_): + seqs = seqs + str(record.seq) + + elif type_ == "gbk": + with open(data, "rU") as input_handle: + types = [] + for record in SeqIO.parse(input_handle, "genbank"): + for feature in record.features: + types.append(feature.type) + if feature.type == "CDS": + if feature.location.strand == +1: + seq = record.seq[feature.location.start:feature.location.end] + seqs = seqs + str(seq) + elif feature.location.strand == -1: + seq = record.seq[feature.location.start:feature.location.end].reverse_complement + seqs = seqs + str(seq) + return seqs + +def codon_usage(seqs, codonTable): + + codon_usage = {} + tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] + + b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table + + + for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: + b_cod_table[cod] = "_Stop" + + for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: + b_cod_table[cod + " Start"] = b_cod_table[cod] + b_cod_table.pop(cod) + + aas = set(b_cod_table.values()) + + + for aa in aas: + codon_usage[aa] = {} + for codon in b_cod_table.keys(): + if b_cod_table[codon] == aa: + codon_usage[aa][codon] = tmp.count(codon.split(" ")[0]) + + + tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} + + codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) + codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) + codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) + + return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} + +if __name__ == '__main__': + + parser = ap.ArgumentParser(description= + 'This script takes as input gff, gbk and single or multifasta files and \n' + 'compute the codon usage for a specified codon table.\n' + 'Usage:\n' + 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n' + 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n' + 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n', + formatter_class=ap.RawTextHelpFormatter) + + parser.add_argument('-i','--input', help='The path to the input file',required=True) + parser.add_argument('-t','--type', help= + 'The format of the file [genebank, fasta, gff ...]', required=True) + parser.add_argument('-c','--codonTable', help= + 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n' + 'Alternative Flatworm Mitochondrial,\\n' + 'Alternative Yeast Nuclear,\n' + 'Archaeal,\n' + 'Ascidian Mitochondrial,\n' + 'Bacterial,\n' + 'Blastocrithidia Nuclear,\n' + 'Blepharisma Macronuclear,\n' + 'Candidate Division SR1,\n' + 'Chlorophycean Mitochondrial,\n' + 'Ciliate Nuclear,\n' + 'Coelenterate Mitochondrial,\n' + 'Condylostoma Nuclear,\n' + 'Dasycladacean Nuclear,\n' + 'Echinoderm Mitochondrial,\n' + 'Euplotid Nuclear,\n' + 'Flatworm Mitochondrial,\n' + 'Gracilibacteria,\n' + 'Hexamita Nuclear,\n' + 'Invertebrate Mitochondrial,\n' + 'Karyorelict Nuclear,\n' + 'Mesodinium Nuclear,\n' + 'Mold Mitochondrial,\n' + 'Mycoplasma,\n' + 'Pachysolen tannophilus Nuclear,\n' + 'Peritrich Nuclear,\n' + 'Plant Plastid,\n' + 'Protozoan Mitochondrial,\n' + 'Pterobranchia Mitochondrial,\n' + 'SGC0,\n' + 'SGC1,\n' + 'SGC2,\n' + 'SGC3,\n' + 'SGC4,\n' + 'SGC5,\n' + 'SGC8,\n' + 'SGC9,\n' + 'Scenedesmus obliquus Mitochondrial,\n' + 'Spiroplasma,\n' + 'Standard,\n' + 'Thraustochytrium Mitochondrial,\n' + 'Trematode Mitochondrial,\n' + 'Vertebrate Mitochondrial,\n' + 'Yeast Mitochondrial\n', required=True) + + parser.add_argument('-o','--output', help='Description for bar argument', required=True) + args = vars(parser.parse_args()) + + seqs = read_input(data=args['input'], type_=args['type']) + out = codon_usage(seqs, args['codonTable']) + + with open(args['output'], "w") as outf: + out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"]) + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/project_rm/codon_usage.xml Mon Feb 11 04:09:02 2019 -0500 @@ -0,0 +1,40 @@ +<?xml version="1.0"?> +<tool name="Codon Usage" id="codon_usage" version="0.2.3"> + <description>for each sequence in a file</description> + <requirements> + <requirement type="package" version="2.7.15">python</requirement> + <requirement type="package" version="1.72">biopython</requirement> + <requirement type="package" version="0.23.4">pandas</requirement> + <requirement type="package" version="1.15.3">numpy</requirement> + </requirements> + + <command> +<![CDATA[ + python '$__tool_directory__/codon_usage.py' -i $input -t $input_type -o $output -c $codon_table +]]> + </command> + <inputs> + <param name="input" format="fasta" type="data" label="Source file"/> + + <param name="input_type" type="select" format="text"> + <label>Indicate the input file format</label> + <option value="fasta">Fasta</option> + <option value="gbk">gbk</option> + </param> + + <param name="codon_table" type="select" format="text"> + <label>Choose the proper codon table for your organism)</label> + <option value="Archaeal"> Archaeal</option> + <option value="Bacterial">Bacterial</option> + <option value="Standard">Standard</option> + </param> + </inputs> + + <outputs> + <data format="tabular" name="output" /> + </outputs> + + <help> +This tool compute codon usage of an annotated genome [preferably Prokaryotes]. + </help> +</tool>
