Mercurial > repos > bgruening > glimmer_hmm
diff glimmerHMM/glimmerhmm_gff_to_sequence.py @ 0:c9699375fcf6 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/glimmer_hmm commit 0dc67759bcbdf5a8a285ded9ba751340d741fe63
author | bgruening |
---|---|
date | Wed, 13 Jul 2016 10:55:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmerHMM/glimmerhmm_gff_to_sequence.py Wed Jul 13 10:55:48 2016 -0400 @@ -0,0 +1,66 @@ +#!/usr/bin/env python +"""Convert GlimmerHMM GFF3 gene predictions into protein sequences. + +This works with the GlimmerHMM GFF3 output format: + +##gff-version 3 +##sequence-region Contig5.15 1 47390 +Contig5.15 GlimmerHMM mRNA 323 325 . + . ID=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1 +Contig5.15 GlimmerHMM CDS 323 325 . + 0 ID=Contig5.15.cds1.1;Parent=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1;Note=final-exon + +http://www.cbcb.umd.edu/software/GlimmerHMM/ + +Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmergff_to_proteins.py + +Usage: + glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True> +""" +import sys +import os +import operator + +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord + +from BCBio import GFF + +def main(glimmer_file, ref_file, out_file, to_protein): + with open(ref_file) as in_handle: + ref_recs = SeqIO.to_dict(SeqIO.parse(in_handle, "fasta")) + + base, ext = os.path.splitext(glimmer_file) + + with open(out_file, "w") as out_handle: + SeqIO.write(protein_recs(glimmer_file, ref_recs, to_protein), out_handle, "fasta") + +def protein_recs(glimmer_file, ref_recs, to_protein): + """Generate protein records from GlimmerHMM gene predictions. + """ + with open(glimmer_file) as in_handle: + for rec in glimmer_predictions(in_handle, ref_recs): + for feature in rec.features: + seq_exons = [] + for cds in feature.sub_features: + seq_exons.append(rec.seq[ + cds.location.nofuzzy_start: + cds.location.nofuzzy_end]) + gene_seq = reduce(operator.add, seq_exons) + if feature.strand == -1: + gene_seq = gene_seq.reverse_complement() + + if to_protein: + yield SeqRecord(gene_seq.translate(), feature.qualifiers["ID"][0], "", "") + else: + yield SeqRecord(gene_seq, feature.qualifiers["ID"][0], "", "") + +def glimmer_predictions(in_handle, ref_recs): + """Parse Glimmer output, generating SeqRecord and SeqFeatures for predictions + """ + for rec in GFF.parse(in_handle, target_lines=1000, base_dict=ref_recs): + yield rec + +if __name__ == "__main__": + if len(sys.argv) != 3: + print __doc__ + sys.exit() + main(*sys.argv[1:])