| 0 | 1 #!/usr/bin/env python | 
|  | 2 """Convert GlimmerHMM gene predictions into protein sequences. | 
|  | 3 | 
|  | 4 This works with the GlimmerHMM specific output format: | 
|  | 5 | 
|  | 6   12    1  +  Initial       10748      10762       15 | 
|  | 7   12    2  +  Internal      10940      10971       32 | 
|  | 8   12    3  +  Internal      11035      11039        5 | 
|  | 9   12    4  +  Internal      11072      11110       39 | 
|  | 10   12    5  +  Internal      11146      11221       76 | 
|  | 11   12    6  +  Terminal      11265      11388      124 | 
|  | 12 | 
|  | 13 http://www.cbcb.umd.edu/software/GlimmerHMM/ | 
|  | 14 | 
|  | 15 Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmer_to_proteins.py | 
|  | 16 | 
|  | 17 Usage: | 
|  | 18     glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True> | 
|  | 19 """ | 
|  | 20 | 
|  | 21 import sys | 
|  | 22 import os | 
|  | 23 import operator | 
|  | 24 | 
|  | 25 from Bio import SeqIO | 
|  | 26 from Bio.SeqRecord import SeqRecord | 
|  | 27 | 
|  | 28 def main(glimmer_file, ref_file, out_file, to_protein): | 
|  | 29     with open(ref_file) as in_handle: | 
|  | 30         ref_rec = SeqIO.read(in_handle, "fasta") | 
|  | 31 | 
|  | 32     base, ext = os.path.splitext(glimmer_file) | 
|  | 33 | 
|  | 34     with open(out_file, "w") as out_handle: | 
|  | 35         SeqIO.write(protein_recs(glimmer_file, ref_rec, to_protein), out_handle, "fasta") | 
|  | 36 | 
|  | 37 def protein_recs(glimmer_file, ref_rec, to_protein): | 
|  | 38     """Generate protein records | 
|  | 39     """ | 
|  | 40     with open(glimmer_file) as in_handle: | 
|  | 41         for gene_num, exons, strand in glimmer_predictions(in_handle): | 
|  | 42             seq_exons = [] | 
|  | 43             for start, end in exons: | 
|  | 44                 seq_exons.append(ref_rec.seq[start:end]) | 
|  | 45             gene_seq = reduce(operator.add, seq_exons) | 
|  | 46             if strand == '-': | 
|  | 47                 gene_seq = gene_seq.reverse_complement() | 
|  | 48             if to_protein: | 
|  | 49                 yield SeqRecord(gene_seq.translate(), gene_num, "", "") | 
|  | 50             else: | 
|  | 51                 yield SeqRecord(gene_seq, gene_num, "", "") | 
|  | 52 | 
|  | 53 def glimmer_predictions(in_handle): | 
|  | 54     """Parse Glimmer output, generating a exons and strand for each prediction. | 
|  | 55     """ | 
|  | 56     # read the header | 
|  | 57     while 1: | 
|  | 58         line = in_handle.readline() | 
|  | 59         if line.startswith("   #    #"): | 
|  | 60             break | 
|  | 61     in_handle.readline() | 
|  | 62     # read gene predictions one at a time | 
|  | 63     cur_exons, cur_gene_num, cur_strand = ([], None, None) | 
|  | 64     while 1: | 
|  | 65         line = in_handle.readline() | 
|  | 66         if not line: | 
|  | 67             break | 
|  | 68         parts = line.strip().split() | 
|  | 69         # new exon | 
|  | 70         if len(parts) == 0: | 
|  | 71             yield cur_gene_num, cur_exons, cur_strand | 
|  | 72             cur_exons, cur_gene_num, cur_strand = ([], None, None) | 
|  | 73         else: | 
|  | 74             this_gene_num = parts[0] | 
|  | 75             this_strand = parts[2] | 
|  | 76             this_start = int(parts[4]) - 1 # 1 based | 
|  | 77             this_end = int(parts[5]) | 
|  | 78             if cur_gene_num is None: | 
|  | 79                 cur_gene_num = this_gene_num | 
|  | 80                 cur_strand = this_strand | 
|  | 81             else: | 
|  | 82                 assert cur_gene_num == this_gene_num | 
|  | 83                 assert cur_strand == this_strand | 
|  | 84             cur_exons.append((this_start, this_end)) | 
|  | 85     if len(cur_exons) > 0: | 
|  | 86         yield cur_gene_num, cur_exons, cur_strand | 
|  | 87 | 
|  | 88 if __name__ == "__main__": | 
|  | 89     if len(sys.argv) != 5: | 
|  | 90         print __doc__ | 
|  | 91         sys.exit() | 
|  | 92     main(*sys.argv[1:]) |