comparison bigwig_outlier_bed.py @ 0:ebcd48f183b3 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/bigwig_outlier_bed commit 091caba3c5b066b293745ccee5cd31132fec3b4b
author iuc
date Fri, 05 Jul 2024 06:00:15 +0000
parents
children 8377a6abb4da
comparison
equal deleted inserted replaced
-1:000000000000 0:ebcd48f183b3
1 """
2 Ross Lazarus June 2024 for VGP
3 Bigwigs are great, but hard to reliably "see" small low coverage or small very high coverage regions.
4 Colouring in JB2 tracks will need a new plugin, so this code will find bigwig regions above and below a chosen percentile point.
5 0.99 and 0.01 work well in testing with a minimum span of 10 bp.
6 Multiple bigwigs **with the same reference** can be combined - bed segments will be named appropriately
7 Combining multiple references works but is silly because only display will rely on one reference so others will not be shown...
8 Tricksy numpy method from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html
9 takes about 95 seconds for a 17MB test wiggle
10 JBrowse2 bed normally displays ignore the score, so could provide separate low/high bed file outputs as an option.
11 Update june 30 2024: wrote a 'no-build' plugin for beds to display red/blue if >0/<0 so those are used for scores
12 Bed interval naming must be short for JB2 but needs input bigwig name and (lo or hi).
13 """
14
15 import argparse
16 import copy
17 import os
18 import sys
19 from pathlib import Path
20
21 import numpy as np
22 import pybigtools
23
24
25 class findOut:
26
27 def __init__(self, args):
28 self.bwnames = args.bigwig
29 self.bwlabels = args.bigwiglabels
30 self.bedwin = args.minwin
31 self.qlo = args.qlo
32 self.qhi = args.qhi
33 self.outbeds = args.outbeds
34 self.bedouthi = args.bedouthi
35 self.bedoutlo = args.bedoutlo
36 self.bedouthilo = args.bedouthilo
37 self.tableoutfile = args.tableoutfile
38 self.bedwin = args.minwin
39 self.qhi = args.qhi
40 self.qlo = args.qlo
41 nbw = len(args.bigwig)
42 nlab = len(args.bigwiglabels)
43 if nlab < nbw:
44 self.bwlabels += ["Nolabel"] * (nbw - nlab)
45 self.makeBed()
46
47 def processVals(self, bw, isTop):
48 """
49 idea from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html
50 Fast segmentation into regions by taking np.diff on the boolean array of over (under) cutpoint indicators in bwex.
51 This only gives non-zero values at the segment boundaries where there's a change, so those zeros are all removed in bwexdnz
52 leaving an array of segment start/end positions. That's twisted around into an array of start/end coordinates.
53 Magical. Fast. Could do the same for means or medians over windows for sparse bigwigs like repeat regions.
54 """
55 if isTop:
56 bwex = np.r_[False, bw >= self.bwtop, False] # extend with 0s
57 else:
58 bwex = np.r_[False, bw <= self.bwbot, False]
59 bwexd = np.diff(bwex)
60 bwexdnz = bwexd.nonzero()[0]
61 bwregions = np.reshape(bwexdnz, (-1, 2))
62 return bwregions
63
64 def writeBed(self, bed, bedfname):
65 """
66 potentially multiple
67 """
68 bed.sort()
69 beds = ["%s\t%d\t%d\t%s\t%d" % x for x in bed]
70 with open(bedfname, "w") as bedf:
71 bedf.write("\n".join(beds))
72 bedf.write("\n")
73
74 def makeTableRow(self, bw, bwlabel, chr):
75 """
76 called for every contig, but messy inline
77 """
78 bwmean = np.mean(bw)
79 bwstd = np.std(bw)
80 bwmax = np.max(bw)
81 nrow = np.size(bw)
82 bwmin = np.min(bw)
83 row = "%s\t%s\t%d\t%f\t%f\t%f\t%f" % (
84 bwlabel,
85 chr,
86 nrow,
87 bwmean,
88 bwstd,
89 bwmin,
90 bwmax,
91 )
92 if self.qhi is not None:
93 row += "\t%f" % self.bwtop
94 else:
95 row += "\t"
96 if self.qlo is not None:
97 row += "\t%f" % self.bwbot
98 else:
99 row += "\t"
100 return row
101
102 def makeBed(self):
103 bedhi = []
104 bedlo = []
105 bwlabels = self.bwlabels
106 bwnames = self.bwnames
107 if self.tableoutfile:
108 restab = ["bigwig\tcontig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot"]
109 for i, bwname in enumerate(bwnames):
110 bwlabel = bwlabels[i].replace(" ", "")
111 fakepath = "in%d.bw" % i
112 if os.path.isfile(fakepath):
113 os.remove(fakepath)
114 p = Path(fakepath)
115 p.symlink_to(bwname) # required by pybigtools (!)
116 bwf = pybigtools.open(fakepath)
117 chrlist = bwf.chroms()
118 chrs = list(chrlist.keys())
119 chrs.sort()
120 for chr in chrs:
121 bw = bwf.values(chr)
122 bw = bw[~np.isnan(bw)] # some have NaN if parts of a contig not covered
123 if self.qhi is not None:
124 self.bwtop = np.quantile(bw, self.qhi)
125 bwhi = self.processVals(bw, isTop=True)
126 for j, seg in enumerate(bwhi):
127 if seg[1] - seg[0] >= self.bedwin:
128 bedhi.append((chr, seg[0], seg[1], "%s_hi" % (bwlabel), 1))
129 if self.qlo is not None:
130 self.bwbot = np.quantile(bw, self.qlo)
131 bwlo = self.processVals(bw, isTop=False)
132 for j, seg in enumerate(bwlo):
133 if seg[1] - seg[0] >= self.bedwin:
134 bedlo.append((chr, seg[0], seg[1], "%s_lo" % (bwlabel), -1))
135 if self.tableoutfile:
136 row = self.makeTableRow(bw, bwlabel, chr)
137 restab.append(copy.copy(row))
138 if self.tableoutfile:
139 stable = "\n".join(restab)
140 with open(self.tableoutfile, "w") as t:
141 t.write(stable)
142 t.write("\n")
143 some = False
144 if self.qlo:
145 if self.outbeds in ["outall", "outlo", "outlohi"]:
146 self.writeBed(bedlo, self.bedoutlo)
147 some = True
148 if self.qhi:
149 if self.outbeds in ["outall", "outlohi", "outhi"]:
150 self.writeBed(bedhi, self.bedouthi)
151 some = True
152 if self.outbeds in ["outall", "outhilo"]:
153 allbed = bedlo + bedhi
154 self.writeBed(allbed, self.bedouthilo)
155 some = True
156 if not some:
157 sys.stderr.write(
158 "Invalid configuration - no output could be created. Was qlo missing and only low output requested for example?"
159 )
160 sys.exit(2)
161
162
163 if __name__ == "__main__":
164 parser = argparse.ArgumentParser()
165 a = parser.add_argument
166 a("-m", "--minwin", default=10, type=int)
167 a("-l", "--qlo", default=None, type=float)
168 a("-i", "--qhi", default=None, type=float)
169 a("--bedouthi", default=None)
170 a("--bedoutlo", default=None)
171 a("--bedouthilo", default=None)
172 a("-w", "--bigwig", nargs="+")
173 a("-n", "--bigwiglabels", nargs="+")
174 a("-o", "--outbeds", default="outhilo", help="optional high and low combined bed")
175 a("-t", "--tableoutfile", default=None)
176 args = parser.parse_args()
177 findOut(args)