Mercurial > repos > cpt > cpt_convert_glimmer
comparison cpt_convert_glimmer_to_gff3.py @ 7:843ea2c82e9a draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:40:30 +0000 |
parents | |
children | 14306e57683d |
comparison
equal
deleted
inserted
replaced
6:3aac15f277c5 | 7:843ea2c82e9a |
---|---|
1 #!/usr/bin/env python | |
2 import sys | |
3 import argparse | |
4 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature | |
5 from Bio import SeqIO | |
6 from Bio.SeqFeature import SeqFeature | |
7 from Bio.SeqFeature import FeatureLocation | |
8 import logging | |
9 | |
10 logging.basicConfig(level=logging.INFO) | |
11 | |
12 | |
13 def glimmer3_to_gff3(glimmer, genome): | |
14 seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta")) | |
15 | |
16 current_record = None | |
17 for line in glimmer: | |
18 if line.startswith(">"): | |
19 chromId = line.strip().replace(">", "") | |
20 if chromId in seq_dict: | |
21 if current_record is not None: | |
22 yield current_record | |
23 current_record = seq_dict[chromId] | |
24 else: | |
25 raise Exception( | |
26 "Found results for sequence %s which was not in fasta file sequences (%s)" | |
27 % (chromId, ", ".join(seq_dict.keys())) | |
28 ) | |
29 | |
30 if not line.startswith(">"): | |
31 (gene_id, gstart, gend, phase, score) = line.strip().split() | |
32 gstart = int(gstart) | |
33 gend = int(gend) | |
34 | |
35 if "+" in phase: | |
36 strand = 1 | |
37 start = gstart | |
38 end = gend | |
39 else: | |
40 strand = -1 | |
41 start = gend | |
42 end = gstart | |
43 | |
44 # Correct for gff3 | |
45 start -= 1 | |
46 | |
47 if start > end: | |
48 # gene found on boundary (ex [4000, 200]) from glimmer assuming circular genome | |
49 # -------------start<=======|sequence end|========>end------ | |
50 if strand > 0: | |
51 end = len(current_record) | |
52 else: | |
53 start = 0 | |
54 gene_id += "_truncated" | |
55 | |
56 cds_feat = gffSeqFeature( | |
57 FeatureLocation(start, end), | |
58 type="CDS", | |
59 strand=strand, | |
60 id="%s.%s" % (current_record.id, gene_id), | |
61 qualifiers={ | |
62 "source": "Glimmer3", | |
63 "ID": "%s.cds_%s" % (current_record.id, gene_id), | |
64 }, | |
65 source="Glimmer3", | |
66 ) | |
67 | |
68 gene = gffSeqFeature( | |
69 FeatureLocation(start, end), | |
70 type="gene", | |
71 strand=strand, | |
72 id="%s.%s" % (current_record.id, gene_id), | |
73 qualifiers={ | |
74 "source": "Glimmer3", | |
75 "ID": "%s.%s" % (current_record.id, gene_id), | |
76 }, | |
77 source="Glimmer3", | |
78 ) | |
79 gene.sub_features = [cds_feat] | |
80 current_record.features.append(gene) | |
81 yield current_record | |
82 | |
83 | |
84 if __name__ == "__main__": | |
85 parser = argparse.ArgumentParser(description="Convert Glimmer to GFF3") | |
86 parser.add_argument("glimmer", type=argparse.FileType("r"), help="Glimmer3 Output") | |
87 parser.add_argument("genome", type=argparse.FileType("r"), help="Fasta Genome") | |
88 args = parser.parse_args() | |
89 | |
90 for result in glimmer3_to_gff3(**vars(args)): | |
91 gffWrite([result], sys.stdout) |