Mercurial > repos > fubar > jbrowse2
comparison filter_multihit_paf.py @ 129:d08080933718 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 6d8a6a308c219c112dbfc09fe48ad462746d6fb0
author | fubar |
---|---|
date | Mon, 07 Oct 2024 08:55:19 +0000 |
parents | fbabf7498471 |
children |
comparison
equal
deleted
inserted
replaced
128:fce4ed3b1702 | 129:d08080933718 |
---|---|
1 # bed for each multimatch paf | |
2 # idea from https://github.com/marbl/MashMap/blob/master/scripts/denovo_repeat_annotation.py | |
3 # adds filter for >1 match and #matches as a score | |
4 # ross lazarus october 6 2024 | |
5 | |
6 from os import sys | |
7 | |
8 CHROMOSOMECOL1 = 0 | |
9 STARTCOL1 = 2 | |
10 ENDCOL1 = 3 | |
11 STRAND = 4 | |
12 CHROMOSOMECOL2 = 5 | |
13 STARTCOL2 = 7 | |
14 ENDCOL2 = 8 | |
15 IDENTITY = 9 | |
16 | |
17 hitTable1 = {} | |
18 hitTable2 = {} | |
19 hitTable1_lens = {} | |
20 repeatList = [] | |
21 filterLen = 1 | |
22 | |
23 with open(sys.argv[1]) as f: | |
24 for line in f: | |
25 rowElements = line.split() | |
26 chromosome1 = rowElements[CHROMOSOMECOL1] | |
27 start1 = int(rowElements[STARTCOL1]) | |
28 end1 = int(rowElements[ENDCOL1]) | |
29 strand = rowElements[STRAND] | |
30 chromosome2 = rowElements[CHROMOSOMECOL2] | |
31 start2 = int(rowElements[STARTCOL2]) | |
32 end2 = int(rowElements[ENDCOL2]) | |
33 identity = float(rowElements[IDENTITY]) | |
34 | |
35 if chromosome1 != chromosome2 or ( | |
36 abs(start1 - start2) >= 1.5 * int(sys.argv[2]) | |
37 and abs(end1 - end2) >= 1.5 * int(sys.argv[2]) | |
38 ): | |
39 if end1 - start1 + 1 >= int(sys.argv[2]): ## and identity + 1 >= float(sys.argv[2]): # added one to identity for sensitivity | |
40 h1key = "%s~%d" % (chromosome1, start1) | |
41 h2key = "%s~%d" % (chromosome2, start2) | |
42 if hitTable1.get(h1key, None): | |
43 hitTable1[h1key].append(h2key) | |
44 hitTable1_lens[h1key] = abs(end1 - start1) | |
45 else: | |
46 hitTable1[h1key] = [ | |
47 h2key, | |
48 ] | |
49 if hitTable2.get(h2key, None): | |
50 hitTable2[h2key].append(h1key) | |
51 else: | |
52 hitTable2[h2key] = [ | |
53 h1key, | |
54 ] | |
55 else: | |
56 print(line) | |
57 for k in hitTable1.keys(): | |
58 print(k) | |
59 nk1 = len(hitTable1[k]) | |
60 nk2 = 0 | |
61 l2 = [] | |
62 for i, k2 in enumerate(hitTable1[k]): | |
63 k2l = hitTable2.get(k2,[]) | |
64 if len(k2l) > 1: | |
65 nk2 += len(k2l) | |
66 l2.append(','.join(k2l)) | |
67 if nk1 > 1 or nk2 > 1: | |
68 print(hitTable1[k], '->', ','.join(l2)) | |
69 (chr, start) = k.split("~") | |
70 end = int(start) + hitTable1_lens.get(k,0) | |
71 repeatList.append((chr, start, "%d" % end, k, "%d" % (nk1 + nk2))) | |
72 with open(sys.argv[3], 'w') as f: | |
73 for row in repeatList: | |
74 f.write("\t".join(row)) | |
75 f.write("\n") |