comparison cpt_convert_glimmer_to_gff3.py @ 7:843ea2c82e9a draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:40:30 +0000
parents
children 14306e57683d
comparison
equal deleted inserted replaced
6:3aac15f277c5 7:843ea2c82e9a
1 #!/usr/bin/env python
2 import sys
3 import argparse
4 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
5 from Bio import SeqIO
6 from Bio.SeqFeature import SeqFeature
7 from Bio.SeqFeature import FeatureLocation
8 import logging
9
10 logging.basicConfig(level=logging.INFO)
11
12
13 def glimmer3_to_gff3(glimmer, genome):
14 seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))
15
16 current_record = None
17 for line in glimmer:
18 if line.startswith(">"):
19 chromId = line.strip().replace(">", "")
20 if chromId in seq_dict:
21 if current_record is not None:
22 yield current_record
23 current_record = seq_dict[chromId]
24 else:
25 raise Exception(
26 "Found results for sequence %s which was not in fasta file sequences (%s)"
27 % (chromId, ", ".join(seq_dict.keys()))
28 )
29
30 if not line.startswith(">"):
31 (gene_id, gstart, gend, phase, score) = line.strip().split()
32 gstart = int(gstart)
33 gend = int(gend)
34
35 if "+" in phase:
36 strand = 1
37 start = gstart
38 end = gend
39 else:
40 strand = -1
41 start = gend
42 end = gstart
43
44 # Correct for gff3
45 start -= 1
46
47 if start > end:
48 # gene found on boundary (ex [4000, 200]) from glimmer assuming circular genome
49 # -------------start<=======|sequence end|========>end------
50 if strand > 0:
51 end = len(current_record)
52 else:
53 start = 0
54 gene_id += "_truncated"
55
56 cds_feat = gffSeqFeature(
57 FeatureLocation(start, end),
58 type="CDS",
59 strand=strand,
60 id="%s.%s" % (current_record.id, gene_id),
61 qualifiers={
62 "source": "Glimmer3",
63 "ID": "%s.cds_%s" % (current_record.id, gene_id),
64 },
65 source="Glimmer3",
66 )
67
68 gene = gffSeqFeature(
69 FeatureLocation(start, end),
70 type="gene",
71 strand=strand,
72 id="%s.%s" % (current_record.id, gene_id),
73 qualifiers={
74 "source": "Glimmer3",
75 "ID": "%s.%s" % (current_record.id, gene_id),
76 },
77 source="Glimmer3",
78 )
79 gene.sub_features = [cds_feat]
80 current_record.features.append(gene)
81 yield current_record
82
83
84 if __name__ == "__main__":
85 parser = argparse.ArgumentParser(description="Convert Glimmer to GFF3")
86 parser.add_argument("glimmer", type=argparse.FileType("r"), help="Glimmer3 Output")
87 parser.add_argument("genome", type=argparse.FileType("r"), help="Fasta Genome")
88 args = parser.parse_args()
89
90 for result in glimmer3_to_gff3(**vars(args)):
91 gffWrite([result], sys.stdout)