diff cpt_blasttab_dice_filter/blasttab_dice_filter.py @ 0:f430415c668f draft

Uploaded
author cpt
date Fri, 13 May 2022 04:46:32 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_blasttab_dice_filter/blasttab_dice_filter.py	Fri May 13 04:46:32 2022 +0000
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+import argparse
+import logging
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(name="blasttab2gff3")
+
+__doc__ = """
+Filter Blast TSV hits by Dice coefficient: keep rows where
+2 * identical matches / (query length + subject length) >= --min_dice.
+"""
+
+
+def blasttsv2gff3(blasttsv, min_dice=50):
+    """Yield the BLAST TSV rows whose Dice coefficient is >= ``min_dice``.
+
+    Dice is 2 * identical matches (column 15) divided by the sum of the
+    query and subject sequence lengths (columns 23 and 24).
+
+    NOTE(review): the default ``min_dice=50`` is kept for compatibility but is
+    not on the ratio scale computed here (the CLI default is 0.5).
+
+    Columns (1-based): 01 query Seq-id, 02 subject Seq-id, 03 % identical
+    matches, 04 alignment length, 05 mismatches, 06 gap openings, 07/08
+    alignment start/end in query, 09/10 alignment start/end in subject,
+    11 E-value, 12 bit score, 13 all subject Seq-id(s) (';'-separated),
+    14 raw score, 15 identical matches, 16 positive-scoring matches,
+    17 total gaps, 18 % positive-scoring matches, 19 query frame,
+    20 subject frame, 21 aligned part of query, 22 aligned part of subject,
+    23 query length, 24 subject length, 25 all subject title(s) ('<>').
+
+    Args:
+        blasttsv: Iterable of tab-separated lines in the layout above.
+        min_dice: Lowest Dice value a row may have and still be yielded.
+
+    Yields:
+        Passing lines without their trailing newline. Blank lines and
+        lines starting with '#' are skipped rather than raising IndexError.
+    """
+    for line in blasttsv:
+        line = line.strip("\n")
+        if not line.strip() or line.startswith("#"):
+            continue  # nothing to score on blank or comment lines
+        data = line.split("\t")
+        dice = 2 * float(data[14]) / (float(data[22]) + float(data[23]))
+        if dice >= min_dice:
+            yield line
+
+
+if __name__ == "__main__":
+    # CLI: print the Blast TSV rows whose Dice score is at least --min_dice.
+    parser = argparse.ArgumentParser(description="Filter Blast TSV hits by Dice score")
+    parser.add_argument(
+        "blasttsv", type=argparse.FileType("r"), help="Blast TSV Output"
+    )
+    parser.add_argument(
+        "--min_dice", type=float, help="Minimum dice score", default=0.5
+    )
+    args = parser.parse_args()
+    for line in blasttsv2gff3(**vars(args)):  # arg names match the parameters
+        print(line)