Mercurial > repos > cpt > cpt_convert_glimmer
comparison cpt_convert_glimmer_to_gff3.py @ 7:843ea2c82e9a draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt |
|---|---|
| date | Mon, 05 Jun 2023 02:40:30 +0000 |
| parents | |
| children | 14306e57683d |
comparison
equal
deleted
inserted
replaced
| 6:3aac15f277c5 | 7:843ea2c82e9a |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import sys | |
| 3 import argparse | |
| 4 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature | |
| 5 from Bio import SeqIO | |
| 6 from Bio.SeqFeature import SeqFeature | |
| 7 from Bio.SeqFeature import FeatureLocation | |
| 8 import logging | |
| 9 | |
| 10 logging.basicConfig(level=logging.INFO) | |
| 11 | |
| 12 | |
| 13 def glimmer3_to_gff3(glimmer, genome): | |
| 14 seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta")) | |
| 15 | |
| 16 current_record = None | |
| 17 for line in glimmer: | |
| 18 if line.startswith(">"): | |
| 19 chromId = line.strip().replace(">", "") | |
| 20 if chromId in seq_dict: | |
| 21 if current_record is not None: | |
| 22 yield current_record | |
| 23 current_record = seq_dict[chromId] | |
| 24 else: | |
| 25 raise Exception( | |
| 26 "Found results for sequence %s which was not in fasta file sequences (%s)" | |
| 27 % (chromId, ", ".join(seq_dict.keys())) | |
| 28 ) | |
| 29 | |
| 30 if not line.startswith(">"): | |
| 31 (gene_id, gstart, gend, phase, score) = line.strip().split() | |
| 32 gstart = int(gstart) | |
| 33 gend = int(gend) | |
| 34 | |
| 35 if "+" in phase: | |
| 36 strand = 1 | |
| 37 start = gstart | |
| 38 end = gend | |
| 39 else: | |
| 40 strand = -1 | |
| 41 start = gend | |
| 42 end = gstart | |
| 43 | |
| 44 # Correct for gff3 | |
| 45 start -= 1 | |
| 46 | |
| 47 if start > end: | |
| 48 # gene found on boundary (ex [4000, 200]) from glimmer assuming circular genome | |
| 49 # -------------start<=======|sequence end|========>end------ | |
| 50 if strand > 0: | |
| 51 end = len(current_record) | |
| 52 else: | |
| 53 start = 0 | |
| 54 gene_id += "_truncated" | |
| 55 | |
| 56 cds_feat = gffSeqFeature( | |
| 57 FeatureLocation(start, end), | |
| 58 type="CDS", | |
| 59 strand=strand, | |
| 60 id="%s.%s" % (current_record.id, gene_id), | |
| 61 qualifiers={ | |
| 62 "source": "Glimmer3", | |
| 63 "ID": "%s.cds_%s" % (current_record.id, gene_id), | |
| 64 }, | |
| 65 source="Glimmer3", | |
| 66 ) | |
| 67 | |
| 68 gene = gffSeqFeature( | |
| 69 FeatureLocation(start, end), | |
| 70 type="gene", | |
| 71 strand=strand, | |
| 72 id="%s.%s" % (current_record.id, gene_id), | |
| 73 qualifiers={ | |
| 74 "source": "Glimmer3", | |
| 75 "ID": "%s.%s" % (current_record.id, gene_id), | |
| 76 }, | |
| 77 source="Glimmer3", | |
| 78 ) | |
| 79 gene.sub_features = [cds_feat] | |
| 80 current_record.features.append(gene) | |
| 81 yield current_record | |
| 82 | |
| 83 | |
| 84 if __name__ == "__main__": | |
| 85 parser = argparse.ArgumentParser(description="Convert Glimmer to GFF3") | |
| 86 parser.add_argument("glimmer", type=argparse.FileType("r"), help="Glimmer3 Output") | |
| 87 parser.add_argument("genome", type=argparse.FileType("r"), help="Fasta Genome") | |
| 88 args = parser.parse_args() | |
| 89 | |
| 90 for result in glimmer3_to_gff3(**vars(args)): | |
| 91 gffWrite([result], sys.stdout) |
