Mercurial > repos > bgruening > glimmer_hmm
view glimmerHMM/glimmerhmm_tabular_to_sequence.py @ 3:5e14355f802a draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/glimmer_hmm commit 2effed877a778e455c63a76e994a0f2bb8f4dba0
author | rmarenco |
---|---|
date | Thu, 14 Jul 2016 15:20:33 -0400 |
parents | c9699375fcf6 |
children |
line wrap: on
line source
#!/usr/bin/env python """Convert GlimmerHMM gene predictions into protein sequences. This works with the GlimmerHMM specific output format: 12 1 + Initial 10748 10762 15 12 2 + Internal 10940 10971 32 12 3 + Internal 11035 11039 5 12 4 + Internal 11072 11110 39 12 5 + Internal 11146 11221 76 12 6 + Terminal 11265 11388 124 http://www.cbcb.umd.edu/software/GlimmerHMM/ Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmer_to_proteins.py Usage: glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True> """ import sys import os import operator from Bio import SeqIO from Bio.SeqRecord import SeqRecord def main(glimmer_file, ref_file, out_file, to_protein): with open(ref_file) as in_handle: ref_rec = SeqIO.read(in_handle, "fasta") base, ext = os.path.splitext(glimmer_file) with open(out_file, "w") as out_handle: SeqIO.write(protein_recs(glimmer_file, ref_rec, to_protein), out_handle, "fasta") def protein_recs(glimmer_file, ref_rec, to_protein): """Generate protein records """ with open(glimmer_file) as in_handle: for gene_num, exons, strand in glimmer_predictions(in_handle): seq_exons = [] for start, end in exons: seq_exons.append(ref_rec.seq[start:end]) gene_seq = reduce(operator.add, seq_exons) if strand == '-': gene_seq = gene_seq.reverse_complement() if to_protein: yield SeqRecord(gene_seq.translate(), gene_num, "", "") else: yield SeqRecord(gene_seq, gene_num, "", "") def glimmer_predictions(in_handle): """Parse Glimmer output, generating a exons and strand for each prediction. """ # read the header while 1: line = in_handle.readline() if line.startswith(" # #"): break in_handle.readline() # read gene predictions one at a time cur_exons, cur_gene_num, cur_strand = ([], None, None) while 1: line = in_handle.readline() if not line: break parts = line.strip().split() # new exon if len(parts) == 0: yield cur_gene_num, cur_exons, cur_strand cur_exons, cur_gene_num, cur_strand = ([], None, None) else: this_gene_num = parts[0] this_strand = parts[2] this_start = int(parts[4]) - 1 # 1 based this_end = int(parts[5]) if cur_gene_num is None: cur_gene_num = this_gene_num cur_strand = this_strand else: assert cur_gene_num == this_gene_num assert cur_strand == this_strand cur_exons.append((this_start, this_end)) if len(cur_exons) > 0: yield cur_gene_num, cur_exons, cur_strand if __name__ == "__main__": if len(sys.argv) != 5: print __doc__ sys.exit() main(*sys.argv[1:])