Mercurial > repos > fubar > jbrowse2
view filter_multihit_paf.py @ 127:fbabf7498471 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 116b1a4bbd62251ad552306df2dc8aa8f46c6721
author | fubar |
---|---|
date | Mon, 07 Oct 2024 02:11:55 +0000 |
parents | |
children |
line wrap: on
line source
# bed for each multimatch paf # idea from https://github.com/marbl/MashMap/blob/master/scripts/denovo_repeat_annotation.py # adds filter for >1 match and #matches as a score # ross lazarus october 6 2024 from os import sys CHROMOSOMECOL1 = 0 STARTCOL1 = 2 ENDCOL1 = 3 STRAND = 4 CHROMOSOMECOL2 = 5 STARTCOL2 = 7 ENDCOL2 = 8 IDENTITY = 9 hitTable1 = {} hitTable2 = {} hitTable1_lens = {} repeatList = [] filterLen = 1 with open(sys.argv[1]) as f: for line in f: rowElements = line.split() chromosome1 = rowElements[CHROMOSOMECOL1] start1 = int(rowElements[STARTCOL1]) end1 = int(rowElements[ENDCOL1]) strand = rowElements[STRAND] chromosome2 = rowElements[CHROMOSOMECOL2] start2 = int(rowElements[STARTCOL2]) end2 = int(rowElements[ENDCOL2]) identity = float(rowElements[IDENTITY]) if chromosome1 != chromosome2 or ( abs(start1 - start2) >= 1.5 * int(sys.argv[2]) and abs(end1 - end2) >= 1.5 * int(sys.argv[2]) ): if end1 - start1 + 1 >= int(sys.argv[2]): ## and identity + 1 >= float(sys.argv[2]): # added one to identity for sensitivity h1key = "%s~%d" % (chromosome1, start1) h2key = "%s~%d" % (chromosome2, start2) if hitTable1.get(h1key, None): hitTable1[h1key].append(h2key) hitTable1_lens[h1key] = abs(end1 - start1) else: hitTable1[h1key] = [ h2key, ] if hitTable2.get(h2key, None): hitTable2[h2key].append(h1key) else: hitTable2[h2key] = [ h1key, ] else: print(line) for k in hitTable1.keys(): print(k) nk1 = len(hitTable1[k]) nk2 = 0 l2 = [] for i, k2 in enumerate(hitTable1[k]): k2l = hitTable2.get(k2,[]) if len(k2l) > 1: nk2 += len(k2l) l2.append(','.join(k2l)) if nk1 > 1 or nk2 > 1: print(hitTable1[k], '->', ','.join(l2)) (chr, start) = k.split("~") end = int(start) + hitTable1_lens.get(k,0) repeatList.append((chr, start, "%d" % end, k, "%d" % (nk1 + nk2))) with open(sys.argv[3], 'w') as f: for row in repeatList: f.write("\t".join(row)) f.write("\n")