comparison filter_multihit_paf.py @ 127:fbabf7498471 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 116b1a4bbd62251ad552306df2dc8aa8f46c6721
author fubar
date Mon, 07 Oct 2024 02:11:55 +0000
parents
children
comparison
equal deleted inserted replaced
126:fd0fc6fdc7c5 127:fbabf7498471
1 # bed for each multimatch paf
2 # idea from https://github.com/marbl/MashMap/blob/master/scripts/denovo_repeat_annotation.py
3 # adds filter for >1 match and #matches as a score
4 # ross lazarus october 6 2024
5
6 from os import sys
7
8 CHROMOSOMECOL1 = 0
9 STARTCOL1 = 2
10 ENDCOL1 = 3
11 STRAND = 4
12 CHROMOSOMECOL2 = 5
13 STARTCOL2 = 7
14 ENDCOL2 = 8
15 IDENTITY = 9
16
17 hitTable1 = {}
18 hitTable2 = {}
19 hitTable1_lens = {}
20 repeatList = []
21 filterLen = 1
22
23 with open(sys.argv[1]) as f:
24 for line in f:
25 rowElements = line.split()
26 chromosome1 = rowElements[CHROMOSOMECOL1]
27 start1 = int(rowElements[STARTCOL1])
28 end1 = int(rowElements[ENDCOL1])
29 strand = rowElements[STRAND]
30 chromosome2 = rowElements[CHROMOSOMECOL2]
31 start2 = int(rowElements[STARTCOL2])
32 end2 = int(rowElements[ENDCOL2])
33 identity = float(rowElements[IDENTITY])
34
35 if chromosome1 != chromosome2 or (
36 abs(start1 - start2) >= 1.5 * int(sys.argv[2])
37 and abs(end1 - end2) >= 1.5 * int(sys.argv[2])
38 ):
39 if end1 - start1 + 1 >= int(sys.argv[2]): ## and identity + 1 >= float(sys.argv[2]): # added one to identity for sensitivity
40 h1key = "%s~%d" % (chromosome1, start1)
41 h2key = "%s~%d" % (chromosome2, start2)
42 if hitTable1.get(h1key, None):
43 hitTable1[h1key].append(h2key)
44 hitTable1_lens[h1key] = abs(end1 - start1)
45 else:
46 hitTable1[h1key] = [
47 h2key,
48 ]
49 if hitTable2.get(h2key, None):
50 hitTable2[h2key].append(h1key)
51 else:
52 hitTable2[h2key] = [
53 h1key,
54 ]
55 else:
56 print(line)
57 for k in hitTable1.keys():
58 print(k)
59 nk1 = len(hitTable1[k])
60 nk2 = 0
61 l2 = []
62 for i, k2 in enumerate(hitTable1[k]):
63 k2l = hitTable2.get(k2,[])
64 if len(k2l) > 1:
65 nk2 += len(k2l)
66 l2.append(','.join(k2l))
67 if nk1 > 1 or nk2 > 1:
68 print(hitTable1[k], '->', ','.join(l2))
69 (chr, start) = k.split("~")
70 end = int(start) + hitTable1_lens.get(k,0)
71 repeatList.append((chr, start, "%d" % end, k, "%d" % (nk1 + nk2)))
72 with open(sys.argv[3], 'w') as f:
73 for row in repeatList:
74 f.write("\t".join(row))
75 f.write("\n")