diff gbk_to_five_col.py @ 1:1bdd481d5c25 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:42:57 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk_to_five_col.py	Mon Jun 05 02:42:57 2023 +0000
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+import BIO_FIX_TOPO  # NOQA
+import argparse
+import logging
+from Bio import SeqIO
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger()
+
+
+# Read in Genbank file and parse features
+# Output features into Five Column format
+
+"""
+>Feature SeqID
+Line 1
+    Column 1: Start location (first nucleotide) of a feature
+    Column 2: Stop location (last nucleotide) of a feature
+    Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')
+Line2:
+    Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
+    Column 5: Qualifier value
+
+Repeat for each feature in a seq
+Repeat Line 2 for each qualifier in a feature
+"""
+
+
+def gbk_to_5col(genbank):
+    """Converts genbank to BankIt five column format"""
+    for record in SeqIO.parse(genbank, "genbank"):
+        print(">Feature %s" % record.id)
+        for feature in record.features:
+            if feature.type == "source":
+                continue
+            else:
+                for index, part in enumerate(feature.location.parts):
+                    if part.strand > 0:
+                        start = int(part.start) + 1
+                        end = int(part.end)
+                    else:
+                        start = int(part.end)
+                        end = int(part.start) + 1
+                    if index == 0:
+                        name = feature.type
+                        print("%d\t%d\t%s" % (start, end, name))
+                    else:
+                        print("%d\t%d" % (start, end))
+                for (qualifier, values) in feature.qualifiers.items():
+                    for value in values:
+                        print("\t\t\t%s\t%s" % (qualifier, value))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Convert a Genbank file into five column format"
+    )
+    parser.add_argument("genbank", type=argparse.FileType("r"), help="Genbank file")
+
+    args = vars(parser.parse_args())
+    gbk_to_5col(**args)