diff gff3_require_phage_start.py @ 4:5de686a70681 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:52:41 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff3_require_phage_start.py	Mon Jun 05 02:52:41 2023 +0000
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+import sys
+import argparse
+import logging
+from Bio import SeqIO
+from CPT_GFFParser import gffParse, gffWrite
+from gff3 import feature_lambda, feature_test_type
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+
+def require_shinefind(gff3, fasta):
+    # Load up sequence(s) for GFF3 data
+    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
+    # Parse GFF3 records
+    for record in gffParse(gff3, base_dict=seq_dict):
+        # Reopen
+        genes = list(
+            feature_lambda(
+                record.features, feature_test_type, {"type": "gene"}, subfeatures=True
+            )
+        )
+        good_genes = []
+        for gene in genes:
+            cdss = list(
+                feature_lambda(
+                    gene.sub_features,
+                    feature_test_type,
+                    {"type": "CDS"},
+                    subfeatures=False,
+                )
+            )
+            if len(cdss) == 0:
+                continue
+
+            one_good_cds = False
+            for cds in cdss:
+                if cds.extract(record).seq[0:3].upper() in ("GTG", "ATG", "TTG"):
+                    one_good_cds = True
+
+            if one_good_cds:
+                good_genes.append(gene)
+        record.features = good_genes
+        record.annotations = {}
+        yield record
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Require specific start codons")
+    parser.add_argument("fasta", type=argparse.FileType("r"), help="Fasta Genome")
+    parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations")
+    args = parser.parse_args()
+
+    for rec in require_shinefind(**vars(args)):
+        gffWrite([rec], sys.stdout)