Mercurial > repos > bgruening > glimmer_hmm
diff glimmerHMM/glimmerhmm_tabular_to_sequence.py @ 0:c9699375fcf6 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/glimmer_hmm commit 0dc67759bcbdf5a8a285ded9ba751340d741fe63
author | bgruening |
---|---|
date | Wed, 13 Jul 2016 10:55:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmerHMM/glimmerhmm_tabular_to_sequence.py Wed Jul 13 10:55:48 2016 -0400 @@ -0,0 +1,92 @@ +#!/usr/bin/env python +"""Convert GlimmerHMM gene predictions into protein sequences. + +This works with the GlimmerHMM specific output format: + + 12 1 + Initial 10748 10762 15 + 12 2 + Internal 10940 10971 32 + 12 3 + Internal 11035 11039 5 + 12 4 + Internal 11072 11110 39 + 12 5 + Internal 11146 11221 76 + 12 6 + Terminal 11265 11388 124 + +http://www.cbcb.umd.edu/software/GlimmerHMM/ + +Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmer_to_proteins.py + +Usage: + glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True> +""" + +import sys +import os +import operator + +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord + +def main(glimmer_file, ref_file, out_file, to_protein): + with open(ref_file) as in_handle: + ref_rec = SeqIO.read(in_handle, "fasta") + + base, ext = os.path.splitext(glimmer_file) + + with open(out_file, "w") as out_handle: + SeqIO.write(protein_recs(glimmer_file, ref_rec, to_protein), out_handle, "fasta") + +def protein_recs(glimmer_file, ref_rec, to_protein): + """Generate protein records + """ + with open(glimmer_file) as in_handle: + for gene_num, exons, strand in glimmer_predictions(in_handle): + seq_exons = [] + for start, end in exons: + seq_exons.append(ref_rec.seq[start:end]) + gene_seq = reduce(operator.add, seq_exons) + if strand == '-': + gene_seq = gene_seq.reverse_complement() + if to_protein: + yield SeqRecord(gene_seq.translate(), gene_num, "", "") + else: + yield SeqRecord(gene_seq, gene_num, "", "") + +def glimmer_predictions(in_handle): + """Parse Glimmer output, generating a exons and strand for each prediction. + """ + # read the header + while 1: + line = in_handle.readline() + if line.startswith(" # #"): + break + in_handle.readline() + # read gene predictions one at a time + cur_exons, cur_gene_num, cur_strand = ([], None, None) + while 1: + line = in_handle.readline() + if not line: + break + parts = line.strip().split() + # new exon + if len(parts) == 0: + yield cur_gene_num, cur_exons, cur_strand + cur_exons, cur_gene_num, cur_strand = ([], None, None) + else: + this_gene_num = parts[0] + this_strand = parts[2] + this_start = int(parts[4]) - 1 # 1 based + this_end = int(parts[5]) + if cur_gene_num is None: + cur_gene_num = this_gene_num + cur_strand = this_strand + else: + assert cur_gene_num == this_gene_num + assert cur_strand == this_strand + cur_exons.append((this_start, this_end)) + if len(cur_exons) > 0: + yield cur_gene_num, cur_exons, cur_strand + +if __name__ == "__main__": + if len(sys.argv) != 5: + print __doc__ + sys.exit() + main(*sys.argv[1:])