Mercurial > repos > fubar > jbrowse2
diff filter_multihit_paf.py @ 135:21bb464c1d53 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 78bf7abb931bf3d348837c7211cd3cff32486691
author | fubar |
---|---|
date | Sun, 15 Dec 2024 23:47:40 +0000 |
parents | 69c6ea16c148 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_multihit_paf.py Sun Dec 15 23:47:40 2024 +0000 @@ -0,0 +1,94 @@ +# idea from https://github.com/marbl/MashMap/blob/master/scripts/denovo_repeat_annotation.py +# adds filter for more than minMatch denovo repeated paf read locations: >1 gives ~13%, >3 gives 3% of all hits in testing a big and small paf. +# A haplotype paf filtered of denovo repeats might give cleaner dotplots, because they add visual noise and repeats are less informative for sequence similarity. +# Alternative to use repeatmasked haplotype as the mashmap reference might also be useful. +# Also outputs a bed with the multimatch regions filtered from the paf with their repeat count as the score +# ross lazarus october 6 2024 + +import argparse + +from collections import OrderedDict + + +def pafDeDupe(inPaf, outDeNovoBed, outPaf, minMatch): + """ + Usage notes: + if a segment appears more than once on the left of a paf row, it can be considered a denovo repeat! + Contig and start offset are used as the id so approximate-ish. + Seems a consistent 13% 1+ denovo repeats with different levels of identity and match length + 3+ finds a much lower 3% of hits + + With a 181217 row squirrelhaps1k.paf at 1k 99%, get about 13% denovo repeat rows + (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ python filter_multihit_paf.py --inpaf squirrelhaps1k.paf --outbed sh1k.bed --outpaf sh1kdedupe.paf --minMatch 1 + (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l sh1kdedupe.paf + 109179 sh1kdedupe.paf + (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l sh1k.bed + 23015 sh1k.bed + + venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ python filter_multihit_paf.py --inpaf squirrelhaps1k.paf --outbed sh1k.bed --outpaf sh1kdedupe.paf --minMatch 3 + (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l sh1k.bed + 7996 sh1k.bed + + with a smaller 26610 10k 95%, get about + (venv3row 11) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l squirrelhaps.paf + 26610 squirrelhaps.paf + (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ python filter_multihit_paf.py squirrelhaps.paf sh.bed shdedupe.paf + (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l shdedupe.paf + 18332 shdedupe.paf + (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l sh.bed + 3663 sh.bed + """ + + CHROMOSOMECOL1 = 0 + STARTCOL1 = 2 + ENDCOL1 = 3 + hitTable1 = OrderedDict() + hitTable1_lens = {} + filterLen = minMatch + minMatch = int(minMatch) + filterMe = {} + + with open(inPaf) as f: + for i, line in enumerate(f): + rowElements = line.split() + chromosome1 = rowElements[CHROMOSOMECOL1] + start1 = int(rowElements[STARTCOL1]) + end1 = int(rowElements[ENDCOL1]) + h1key = "%s~%d" % (chromosome1, start1) + if hitTable1.get(h1key, None): + hitTable1[h1key].append(i) + if len(hitTable1[h1key]) > minMatch: + filterMe[i] = i + else: + hitTable1[h1key] = [ + i, + ] + hitTable1_lens[h1key] = abs(end1 - start1) + with open(outDeNovoBed, "w") as f: + for k in hitTable1.keys(): + # OrderedDict so input paf order preserved - if that's wrong so is the bed. + nk = len(hitTable1[k]) + if nk > minMatch: + (chr, start) = k.split("~") + end = int(start) + hitTable1_lens.get(k, 0) + name= '_'.join([chr,start]) + row = (chr, start, "%d" % end, name, "%d" % nk) + f.write("\t".join(row)) + f.write("\n") + with open(outPaf, "w") as f: + f.writelines( + line for i, line in enumerate(open(inPaf)) if not filterMe.get(i, None) + ) + + +if __name__ == "__main__": + VERS = 0.01 + parser = argparse.ArgumentParser() + a = parser.add_argument + a("--inpaf", default=None) + a("--outbed", default=None) + a("--outpaf", default=None) + a("--minmatch", default=1, type=int) + a('--version', action='version', version='%(prog)s 0.1') + args = parser.parse_args() + pafDeDupe(args.inpaf, args.outbed, args.outpaf, args.minmatch)