Mercurial > repos > bgruening > glimmer_hmm
comparison glimmerHMM/glimmerhmm_tabular_to_sequence.py @ 0:c9699375fcf6 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/glimmer_hmm commit 0dc67759bcbdf5a8a285ded9ba751340d741fe63
| author | bgruening |
|---|---|
| date | Wed, 13 Jul 2016 10:55:48 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c9699375fcf6 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """Convert GlimmerHMM gene predictions into protein sequences. | |
| 3 | |
| 4 This works with the GlimmerHMM specific output format: | |
| 5 | |
| 6 12 1 + Initial 10748 10762 15 | |
| 7 12 2 + Internal 10940 10971 32 | |
| 8 12 3 + Internal 11035 11039 5 | |
| 9 12 4 + Internal 11072 11110 39 | |
| 10 12 5 + Internal 11146 11221 76 | |
| 11 12 6 + Terminal 11265 11388 124 | |
| 12 | |
| 13 http://www.cbcb.umd.edu/software/GlimmerHMM/ | |
| 14 | |
| 15 Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmer_to_proteins.py | |
| 16 | |
| 17 Usage: | |
| 18 glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True> | |
| 19 """ | |
| 20 | |
| 21 import sys | |
| 22 import os | |
| 23 import operator | |
| 24 | |
| 25 from Bio import SeqIO | |
| 26 from Bio.SeqRecord import SeqRecord | |
| 27 | |
| 28 def main(glimmer_file, ref_file, out_file, to_protein): | |
| 29 with open(ref_file) as in_handle: | |
| 30 ref_rec = SeqIO.read(in_handle, "fasta") | |
| 31 | |
| 32 base, ext = os.path.splitext(glimmer_file) | |
| 33 | |
| 34 with open(out_file, "w") as out_handle: | |
| 35 SeqIO.write(protein_recs(glimmer_file, ref_rec, to_protein), out_handle, "fasta") | |
| 36 | |
| 37 def protein_recs(glimmer_file, ref_rec, to_protein): | |
| 38 """Generate protein records | |
| 39 """ | |
| 40 with open(glimmer_file) as in_handle: | |
| 41 for gene_num, exons, strand in glimmer_predictions(in_handle): | |
| 42 seq_exons = [] | |
| 43 for start, end in exons: | |
| 44 seq_exons.append(ref_rec.seq[start:end]) | |
| 45 gene_seq = reduce(operator.add, seq_exons) | |
| 46 if strand == '-': | |
| 47 gene_seq = gene_seq.reverse_complement() | |
| 48 if to_protein: | |
| 49 yield SeqRecord(gene_seq.translate(), gene_num, "", "") | |
| 50 else: | |
| 51 yield SeqRecord(gene_seq, gene_num, "", "") | |
| 52 | |
| 53 def glimmer_predictions(in_handle): | |
| 54 """Parse Glimmer output, generating a exons and strand for each prediction. | |
| 55 """ | |
| 56 # read the header | |
| 57 while 1: | |
| 58 line = in_handle.readline() | |
| 59 if line.startswith(" # #"): | |
| 60 break | |
| 61 in_handle.readline() | |
| 62 # read gene predictions one at a time | |
| 63 cur_exons, cur_gene_num, cur_strand = ([], None, None) | |
| 64 while 1: | |
| 65 line = in_handle.readline() | |
| 66 if not line: | |
| 67 break | |
| 68 parts = line.strip().split() | |
| 69 # new exon | |
| 70 if len(parts) == 0: | |
| 71 yield cur_gene_num, cur_exons, cur_strand | |
| 72 cur_exons, cur_gene_num, cur_strand = ([], None, None) | |
| 73 else: | |
| 74 this_gene_num = parts[0] | |
| 75 this_strand = parts[2] | |
| 76 this_start = int(parts[4]) - 1 # 1 based | |
| 77 this_end = int(parts[5]) | |
| 78 if cur_gene_num is None: | |
| 79 cur_gene_num = this_gene_num | |
| 80 cur_strand = this_strand | |
| 81 else: | |
| 82 assert cur_gene_num == this_gene_num | |
| 83 assert cur_strand == this_strand | |
| 84 cur_exons.append((this_start, this_end)) | |
| 85 if len(cur_exons) > 0: | |
| 86 yield cur_gene_num, cur_exons, cur_strand | |
| 87 | |
| 88 if __name__ == "__main__": | |
| 89 if len(sys.argv) != 5: | |
| 90 print __doc__ | |
| 91 sys.exit() | |
| 92 main(*sys.argv[1:]) |
