Mercurial > repos > fubar > jbrowse2
diff filter_multihit_paf.py @ 127:fbabf7498471 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 116b1a4bbd62251ad552306df2dc8aa8f46c6721
author | fubar |
---|---|
date | Mon, 07 Oct 2024 02:11:55 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_multihit_paf.py Mon Oct 07 02:11:55 2024 +0000 @@ -0,0 +1,75 @@ +# bed for each multimatch paf +# idea from https://github.com/marbl/MashMap/blob/master/scripts/denovo_repeat_annotation.py +# adds filter for >1 match and #matches as a score +# ross lazarus october 6 2024 + +from os import sys + +CHROMOSOMECOL1 = 0 +STARTCOL1 = 2 +ENDCOL1 = 3 +STRAND = 4 +CHROMOSOMECOL2 = 5 +STARTCOL2 = 7 +ENDCOL2 = 8 +IDENTITY = 9 + +hitTable1 = {} +hitTable2 = {} +hitTable1_lens = {} +repeatList = [] +filterLen = 1 + +with open(sys.argv[1]) as f: + for line in f: + rowElements = line.split() + chromosome1 = rowElements[CHROMOSOMECOL1] + start1 = int(rowElements[STARTCOL1]) + end1 = int(rowElements[ENDCOL1]) + strand = rowElements[STRAND] + chromosome2 = rowElements[CHROMOSOMECOL2] + start2 = int(rowElements[STARTCOL2]) + end2 = int(rowElements[ENDCOL2]) + identity = float(rowElements[IDENTITY]) + + if chromosome1 != chromosome2 or ( + abs(start1 - start2) >= 1.5 * int(sys.argv[2]) + and abs(end1 - end2) >= 1.5 * int(sys.argv[2]) + ): + if end1 - start1 + 1 >= int(sys.argv[2]): ## and identity + 1 >= float(sys.argv[2]): # added one to identity for sensitivity + h1key = "%s~%d" % (chromosome1, start1) + h2key = "%s~%d" % (chromosome2, start2) + if hitTable1.get(h1key, None): + hitTable1[h1key].append(h2key) + hitTable1_lens[h1key] = abs(end1 - start1) + else: + hitTable1[h1key] = [ + h2key, + ] + if hitTable2.get(h2key, None): + hitTable2[h2key].append(h1key) + else: + hitTable2[h2key] = [ + h1key, + ] + else: + print(line) +for k in hitTable1.keys(): + print(k) + nk1 = len(hitTable1[k]) + nk2 = 0 + l2 = [] + for i, k2 in enumerate(hitTable1[k]): + k2l = hitTable2.get(k2,[]) + if len(k2l) > 1: + nk2 += len(k2l) + l2.append(','.join(k2l)) + if nk1 > 1 or nk2 > 1: + print(hitTable1[k], '->', ','.join(l2)) + (chr, start) = k.split("~") + end = int(start) + hitTable1_lens.get(k,0) + repeatList.append((chr, start, "%d" % end, k, "%d" % (nk1 + nk2))) +with open(sys.argv[3], 'w') as f: + for row in repeatList: + f.write("\t".join(row)) + f.write("\n")