Mercurial > repos > cpt > cpt_convert_glimmer
diff cpt_convert_glimmer_to_gff3.py @ 7:843ea2c82e9a draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:40:30 +0000 |
parents | |
children | 14306e57683d |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_convert_glimmer_to_gff3.py Mon Jun 05 02:40:30 2023 +0000 @@ -0,0 +1,91 @@ +#!/usr/bin/env python +import sys +import argparse +from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature +from Bio import SeqIO +from Bio.SeqFeature import SeqFeature +from Bio.SeqFeature import FeatureLocation +import logging + +logging.basicConfig(level=logging.INFO) + + +def glimmer3_to_gff3(glimmer, genome): + seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta")) + + current_record = None + for line in glimmer: + if line.startswith(">"): + chromId = line.strip().replace(">", "") + if chromId in seq_dict: + if current_record is not None: + yield current_record + current_record = seq_dict[chromId] + else: + raise Exception( + "Found results for sequence %s which was not in fasta file sequences (%s)" + % (chromId, ", ".join(seq_dict.keys())) + ) + + if not line.startswith(">"): + (gene_id, gstart, gend, phase, score) = line.strip().split() + gstart = int(gstart) + gend = int(gend) + + if "+" in phase: + strand = 1 + start = gstart + end = gend + else: + strand = -1 + start = gend + end = gstart + + # Correct for gff3 + start -= 1 + + if start > end: + # gene found on boundary (ex [4000, 200]) from glimmer assuming circular genome + # -------------start<=======|sequence end|========>end------ + if strand > 0: + end = len(current_record) + else: + start = 0 + gene_id += "_truncated" + + cds_feat = gffSeqFeature( + FeatureLocation(start, end), + type="CDS", + strand=strand, + id="%s.%s" % (current_record.id, gene_id), + qualifiers={ + "source": "Glimmer3", + "ID": "%s.cds_%s" % (current_record.id, gene_id), + }, + source="Glimmer3", + ) + + gene = gffSeqFeature( + FeatureLocation(start, end), + type="gene", + strand=strand, + id="%s.%s" % (current_record.id, gene_id), + qualifiers={ + "source": "Glimmer3", + "ID": "%s.%s" % (current_record.id, gene_id), + }, + source="Glimmer3", + ) + gene.sub_features = [cds_feat] + current_record.features.append(gene) + yield current_record + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert Glimmer to GFF3") + parser.add_argument("glimmer", type=argparse.FileType("r"), help="Glimmer3 Output") + parser.add_argument("genome", type=argparse.FileType("r"), help="Fasta Genome") + args = parser.parse_args() + + for result in glimmer3_to_gff3(**vars(args)): + gffWrite([result], sys.stdout)