view filter_multihit_paf.py @ 129:d08080933718 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 6d8a6a308c219c112dbfc09fe48ad462746d6fb0
author fubar
date Mon, 07 Oct 2024 08:55:19 +0000
parents fbabf7498471
children
line wrap: on
line source

# bed for each multimatch paf
# idea from https://github.com/marbl/MashMap/blob/master/scripts/denovo_repeat_annotation.py
# adds filter for >1 match and #matches as a score
# ross lazarus october 6 2024

from os import sys

CHROMOSOMECOL1 = 0
STARTCOL1 = 2
ENDCOL1 = 3
STRAND = 4
CHROMOSOMECOL2 = 5
STARTCOL2 = 7
ENDCOL2 = 8
IDENTITY = 9

hitTable1 = {}
hitTable2 = {}
hitTable1_lens = {}
repeatList = []
filterLen = 1

with open(sys.argv[1]) as f:
    for line in f:
        rowElements = line.split()
        chromosome1 = rowElements[CHROMOSOMECOL1]
        start1 = int(rowElements[STARTCOL1])
        end1 = int(rowElements[ENDCOL1])
        strand = rowElements[STRAND]
        chromosome2 = rowElements[CHROMOSOMECOL2]
        start2 = int(rowElements[STARTCOL2])
        end2 = int(rowElements[ENDCOL2])
        identity = float(rowElements[IDENTITY])

        if chromosome1 != chromosome2 or (
            abs(start1 - start2) >= 1.5 * int(sys.argv[2])
            and abs(end1 - end2) >= 1.5 * int(sys.argv[2])
        ):
            if end1 - start1 + 1 >= int(sys.argv[2]):  ##  and identity + 1 >= float(sys.argv[2]):  # added one to identity for sensitivity
                h1key = "%s~%d" % (chromosome1, start1)
                h2key = "%s~%d" % (chromosome2, start2)
                if hitTable1.get(h1key, None):
                    hitTable1[h1key].append(h2key)
                    hitTable1_lens[h1key] = abs(end1 - start1)
                else:
                    hitTable1[h1key] = [
                        h2key,
                    ]
                if hitTable2.get(h2key, None):
                    hitTable2[h2key].append(h1key)
                else:
                    hitTable2[h2key] = [
                        h1key,
                    ]
        else:
            print(line)
for k in hitTable1.keys():
    print(k)
    nk1 = len(hitTable1[k])
    nk2 = 0
    l2 = []
    for i, k2 in enumerate(hitTable1[k]):
        k2l = hitTable2.get(k2,[])
        if len(k2l) > 1:
            nk2 += len(k2l)
            l2.append(','.join(k2l))
    if nk1 > 1 or nk2 > 1:
        print(hitTable1[k], '->', ','.join(l2))
        (chr, start) = k.split("~")
        end = int(start) + hitTable1_lens.get(k,0)
        repeatList.append((chr, start, "%d" % end, k, "%d" % (nk1 + nk2)))
with open(sys.argv[3], 'w') as f:
    for row in repeatList:
        f.write("\t".join(row))
        f.write("\n")