view gff3_require_phage_start.py @ 7:4962f275bcaa draft default tip

planemo upload commit f33bdf952d796c5d7a240b132af3c4cbd102decc
author cpt
date Fri, 05 Jan 2024 05:57:00 +0000
parents 5de686a70681
children
line wrap: on
line source

#!/usr/bin/env python
import sys
import argparse
import logging
from Bio import SeqIO
from CPT_GFFParser import gffParse, gffWrite
from gff3 import feature_lambda, feature_test_type

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def require_shinefind(gff3, fasta):
    # Load up sequence(s) for GFF3 data
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
    # Parse GFF3 records
    for record in gffParse(gff3, base_dict=seq_dict):
        # Reopen
        genes = list(
            feature_lambda(
                record.features, feature_test_type, {"type": "gene"}, subfeatures=True
            )
        )
        good_genes = []
        for gene in genes:
            cdss = list(
                feature_lambda(
                    gene.sub_features,
                    feature_test_type,
                    {"type": "CDS"},
                    subfeatures=False,
                )
            )
            if len(cdss) == 0:
                continue

            one_good_cds = False
            for cds in cdss:
                if cds.extract(record).seq[0:3].upper() in ("GTG", "ATG", "TTG"):
                    one_good_cds = True

            if one_good_cds:
                good_genes.append(gene)
        record.features = good_genes
        record.annotations = {}
        yield record


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Require specific start codons")
    parser.add_argument("fasta", type=argparse.FileType("r"), help="Fasta Genome")
    parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations")
    args = parser.parse_args()

    for rec in require_shinefind(**vars(args)):
        gffWrite([rec], sys.stdout)