diff cpt_fasta_translate/fasta_translate.py @ 0:cb42bee49abb draft

Uploaded
author cpt
date Fri, 10 Jun 2022 08:47:31 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_fasta_translate/fasta_translate.py	Fri Jun 10 08:47:31 2022 +0000
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+import sys
+import logging
+import argparse
+from Bio import SeqIO
+from Bio.Data import CodonTable
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger()
+
+
+def translate(fasta_file, target="protein", table=11, strip_stops=False, met=False):
+    records = list(SeqIO.parse(fasta_file, "fasta"))
+
+    for record in records:
+        if target == "protein":
+            mod = len(record.seq) % 3
+            if mod != 0:
+                record.seq = record.seq[0:-mod]
+
+            # Read http://biopython.org/DIST/docs/api/Bio.Seq.Seq-class.html#transcribe
+            # for valid CDS conditions.
+
+            # Will first try to translate sequence as a CDS,
+            # then just as a sequence if this fails.
+
+            try:
+                tmpseq = record.seq.translate(table=table, cds=True)
+            except CodonTable.TranslationError as cte:
+                log.info("Translation issue at %s: %s", record.id, cte)
+                tmpseq = record.seq.translate(table=table, cds=False)
+
+            # check if stop in middle of protein
+            if "*" in tmpseq:
+                log.info(
+                    "Trimming %s from %s to %s due to stop codons",
+                    record.id,
+                    len(record.seq),
+                    3 * len(tmpseq) - 3,
+                )
+                tmpseq = tmpseq[0 : str(tmpseq).index("*")]
+
+            # add stop to end if strip_stops=False
+            if not strip_stops:
+                tmpseq = tmpseq + "*"
+
+            if met:
+                tmpseq = "M" + tmpseq[1:]
+
+            record.seq = tmpseq
+            if len(record.seq) > 0:
+                SeqIO.write(record, sys.stdout, "fasta")
+        else:
+            record.seq = record.seq.transcribe()
+            SeqIO.write(record, sys.stdout, "fasta")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Translate fasta file")
+    parser.add_argument("fasta_file", type=argparse.FileType("r"), help="Fasta file")
+    parser.add_argument("--target", choices=["protein", "rna"])
+    parser.add_argument(
+        "--table",
+        type=int,
+        default=11,
+        help="Translation table to use",
+        choices=range(1, 23),
+    )
+    parser.add_argument(
+        "--strip_stops", action="store_true", help="Remove stop characters"
+    )
+    parser.add_argument(
+        "--met", action="store_true", help="Convert first residue to Met"
+    )
+
+    args = parser.parse_args()
+    translate(**vars(args))