diff filter_multihit_paf.py @ 127:fbabf7498471 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 116b1a4bbd62251ad552306df2dc8aa8f46c6721
author fubar
date Mon, 07 Oct 2024 02:11:55 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_multihit_paf.py	Mon Oct 07 02:11:55 2024 +0000
@@ -0,0 +1,75 @@
+# bed for each multimatch paf
+# idea from https://github.com/marbl/MashMap/blob/master/scripts/denovo_repeat_annotation.py
+# adds filter for >1 match and #matches as a score
+# ross lazarus october 6 2024
+
+from os import sys
+
+CHROMOSOMECOL1 = 0
+STARTCOL1 = 2
+ENDCOL1 = 3
+STRAND = 4
+CHROMOSOMECOL2 = 5
+STARTCOL2 = 7
+ENDCOL2 = 8
+IDENTITY = 9
+
+hitTable1 = {}
+hitTable2 = {}
+hitTable1_lens = {}
+repeatList = []
+filterLen = 1
+
+with open(sys.argv[1]) as f:
+    for line in f:
+        rowElements = line.split()
+        chromosome1 = rowElements[CHROMOSOMECOL1]
+        start1 = int(rowElements[STARTCOL1])
+        end1 = int(rowElements[ENDCOL1])
+        strand = rowElements[STRAND]
+        chromosome2 = rowElements[CHROMOSOMECOL2]
+        start2 = int(rowElements[STARTCOL2])
+        end2 = int(rowElements[ENDCOL2])
+        identity = float(rowElements[IDENTITY])
+
+        if chromosome1 != chromosome2 or (
+            abs(start1 - start2) >= 1.5 * int(sys.argv[2])
+            and abs(end1 - end2) >= 1.5 * int(sys.argv[2])
+        ):
+            if end1 - start1 + 1 >= int(sys.argv[2]):  ##  and identity + 1 >= float(sys.argv[2]):  # added one to identity for sensitivity
+                h1key = "%s~%d" % (chromosome1, start1)
+                h2key = "%s~%d" % (chromosome2, start2)
+                if hitTable1.get(h1key, None):
+                    hitTable1[h1key].append(h2key)
+                    hitTable1_lens[h1key] = abs(end1 - start1)
+                else:
+                    hitTable1[h1key] = [
+                        h2key,
+                    ]
+                if hitTable2.get(h2key, None):
+                    hitTable2[h2key].append(h1key)
+                else:
+                    hitTable2[h2key] = [
+                        h1key,
+                    ]
+        else:
+            print(line)
+for k in hitTable1.keys():
+    print(k)
+    nk1 = len(hitTable1[k])
+    nk2 = 0
+    l2 = []
+    for i, k2 in enumerate(hitTable1[k]):
+        k2l = hitTable2.get(k2,[])
+        if len(k2l) > 1:
+            nk2 += len(k2l)
+            l2.append(','.join(k2l))
+    if nk1 > 1 or nk2 > 1:
+        print(hitTable1[k], '->', ','.join(l2))
+        (chr, start) = k.split("~")
+        end = int(start) + hitTable1_lens.get(k,0)
+        repeatList.append((chr, start, "%d" % end, k, "%d" % (nk1 + nk2)))
+with open(sys.argv[3], 'w') as f:
+    for row in repeatList:
+        f.write("\t".join(row))
+        f.write("\n")