| 
0
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 """Convert GlimmerHMM gene predictions into protein sequences.
 | 
| 
 | 
     3 
 | 
| 
 | 
     4 This works with the GlimmerHMM specific output format:
 | 
| 
 | 
     5 
 | 
| 
 | 
     6   12    1  +  Initial       10748      10762       15
 | 
| 
 | 
     7   12    2  +  Internal      10940      10971       32
 | 
| 
 | 
     8   12    3  +  Internal      11035      11039        5
 | 
| 
 | 
     9   12    4  +  Internal      11072      11110       39
 | 
| 
 | 
    10   12    5  +  Internal      11146      11221       76
 | 
| 
 | 
    11   12    6  +  Terminal      11265      11388      124
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 http://www.cbcb.umd.edu/software/GlimmerHMM/
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmer_to_proteins.py
 | 
| 
 | 
    16 
 | 
| 
 | 
    17 Usage:
 | 
| 
 | 
    18     glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True>
 | 
| 
 | 
    19 """
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 import sys
 | 
| 
 | 
    22 import os
 | 
| 
 | 
    23 import operator
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 from Bio import SeqIO
 | 
| 
 | 
    26 from Bio.SeqRecord import SeqRecord
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 def main(glimmer_file, ref_file, out_file, to_protein):
 | 
| 
 | 
    29     with open(ref_file) as in_handle:
 | 
| 
 | 
    30         ref_rec = SeqIO.read(in_handle, "fasta")
 | 
| 
 | 
    31 
 | 
| 
 | 
    32     base, ext = os.path.splitext(glimmer_file)
 | 
| 
 | 
    33 
 | 
| 
 | 
    34     with open(out_file, "w") as out_handle:
 | 
| 
 | 
    35         SeqIO.write(protein_recs(glimmer_file, ref_rec, to_protein), out_handle, "fasta")
 | 
| 
 | 
    36 
 | 
| 
 | 
    37 def protein_recs(glimmer_file, ref_rec, to_protein):
 | 
| 
 | 
    38     """Generate protein records
 | 
| 
 | 
    39     """
 | 
| 
 | 
    40     with open(glimmer_file) as in_handle:
 | 
| 
 | 
    41         for gene_num, exons, strand in glimmer_predictions(in_handle):
 | 
| 
 | 
    42             seq_exons = []
 | 
| 
 | 
    43             for start, end in exons:
 | 
| 
 | 
    44                 seq_exons.append(ref_rec.seq[start:end])
 | 
| 
 | 
    45             gene_seq = reduce(operator.add, seq_exons)
 | 
| 
 | 
    46             if strand == '-':
 | 
| 
 | 
    47                 gene_seq = gene_seq.reverse_complement()
 | 
| 
 | 
    48             if to_protein:
 | 
| 
 | 
    49                 yield SeqRecord(gene_seq.translate(), gene_num, "", "")
 | 
| 
 | 
    50             else:
 | 
| 
 | 
    51                 yield SeqRecord(gene_seq, gene_num, "", "")
 | 
| 
 | 
    52 
 | 
| 
 | 
    53 def glimmer_predictions(in_handle):
 | 
| 
 | 
    54     """Parse Glimmer output, generating a exons and strand for each prediction.
 | 
| 
 | 
    55     """
 | 
| 
 | 
    56     # read the header
 | 
| 
 | 
    57     while 1:
 | 
| 
 | 
    58         line = in_handle.readline()
 | 
| 
 | 
    59         if line.startswith("   #    #"):
 | 
| 
 | 
    60             break
 | 
| 
 | 
    61     in_handle.readline()
 | 
| 
 | 
    62     # read gene predictions one at a time
 | 
| 
 | 
    63     cur_exons, cur_gene_num, cur_strand = ([], None, None)
 | 
| 
 | 
    64     while 1:
 | 
| 
 | 
    65         line = in_handle.readline()
 | 
| 
 | 
    66         if not line:
 | 
| 
 | 
    67             break
 | 
| 
 | 
    68         parts = line.strip().split()
 | 
| 
 | 
    69         # new exon
 | 
| 
 | 
    70         if len(parts) == 0:
 | 
| 
 | 
    71             yield cur_gene_num, cur_exons, cur_strand
 | 
| 
 | 
    72             cur_exons, cur_gene_num, cur_strand = ([], None, None)
 | 
| 
 | 
    73         else:
 | 
| 
 | 
    74             this_gene_num = parts[0]
 | 
| 
 | 
    75             this_strand = parts[2]
 | 
| 
 | 
    76             this_start = int(parts[4]) - 1 # 1 based
 | 
| 
 | 
    77             this_end = int(parts[5])
 | 
| 
 | 
    78             if cur_gene_num is None:
 | 
| 
 | 
    79                 cur_gene_num = this_gene_num
 | 
| 
 | 
    80                 cur_strand = this_strand
 | 
| 
 | 
    81             else:
 | 
| 
 | 
    82                 assert cur_gene_num == this_gene_num
 | 
| 
 | 
    83                 assert cur_strand == this_strand
 | 
| 
 | 
    84             cur_exons.append((this_start, this_end))
 | 
| 
 | 
    85     if len(cur_exons) > 0:
 | 
| 
 | 
    86         yield cur_gene_num, cur_exons, cur_strand
 | 
| 
 | 
    87 
 | 
| 
 | 
    88 if __name__ == "__main__":
 | 
| 
 | 
    89     if len(sys.argv) != 5:
 | 
| 
 | 
    90         print __doc__
 | 
| 
 | 
    91         sys.exit()
 | 
| 
 | 
    92     main(*sys.argv[1:])
 |