comparison bigwig_outlier_bed.py @ 6:eb17eb8a3658 draft

planemo upload commit 1baff96e75def9248afdcf21edec9bdc7ed42b1f-dirty
author fubar
date Tue, 23 Jul 2024 23:12:23 +0000
parents c71db540eb38
children c8e22efcaeda
comparison
equal deleted inserted replaced
5:68cb8e7e266b 6:eb17eb8a3658
1 """ 1 """
2 Ross Lazarus June 2024 for VGP 2 Ross Lazarus June 2024 for VGP
3
4 Bigwigs are great, but hard to reliably "see" small low coverage or small very high coverage regions. 3 Bigwigs are great, but hard to reliably "see" small low coverage or small very high coverage regions.
5 Colouring in JB2 tracks will need a new plugin, so this code will find bigwig regions above and below a chosen percentile point. 4 Colouring in JB2 tracks will need a new plugin, so this code will find bigwig regions above and below a chosen percentile point.
6 0.99 and 0.01 work well in testing with a minimum span of 10 bp. 5 0.99 and 0.01 work well in testing with a minimum span of 10 bp.
7
8 Multiple bigwigs **with the same reference** can be combined - bed segments will be named appropriately 6 Multiple bigwigs **with the same reference** can be combined - bed segments will be named appropriately
9 Combining multiple references works but is silly because only display will rely on one reference so others will not be shown... 7 Combining multiple references works but is silly because only display will rely on one reference so others will not be shown...
10
11 Tricksy numpy method from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html 8 Tricksy numpy method from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html
12 takes about 95 seconds for a 17MB test wiggle 9 takes about 95 seconds for a 17MB test wiggle
13
14 JBrowse2 bed normally displays ignore the score, so could provide separate low/high bed file outputs as an option. 10 JBrowse2 bed normally displays ignore the score, so could provide separate low/high bed file outputs as an option.
15
16 Update june 30 2024: wrote a 'no-build' plugin for beds to display red/blue if >0/<0 so those are used for scores 11 Update june 30 2024: wrote a 'no-build' plugin for beds to display red/blue if >0/<0 so those are used for scores
17 Bed interval naming must be short for JB2 but needs input bigwig name and (lo or hi). 12 Bed interval naming must be short for JB2 but needs input bigwig name and (lo or hi).
18
19 """ 13 """
20 14
21
22 import argparse 15 import argparse
16 import os
17 import sys
18 from pathlib import Path
19
23 import numpy as np 20 import numpy as np
24 import pybigtools 21 import pybigtools
25 import sys 22
26 23
27 24 class asciihist:
28 from pathlib import Path 25
29 26 def __init__(
30 class findOut(): 27 self,
28 data,
29 bins=10,
30 minmax=None,
31 str_tag="",
32 scale_output=80,
33 generate_only=True,
34 ):
35 """
36 https://gist.github.com/bgbg/608d9ef4fd75032731651257fe67fc81
37 Create an ASCII histogram from an interable of numbers.
38 Author: Boris Gorelik boris@gorelik.net. based on http://econpy.googlecode.com/svn/trunk/pytrix/pytrix.py
39 License: MIT
40 """
41 self.data = data
42 self.minmax = minmax
43 self.str_tag = str_tag
44 self.bins = bins
45 self.generate_only = generate_only
46 self.scale_output = scale_output
47 self.itarray = np.asanyarray(self.data)
48 if self.minmax == "auto":
49 self.minmax = np.percentile(data, [5, 95])
50 if self.minmax[0] == self.minmax[1]:
51 # for very ugly distributions
52 self.minmax = None
53 if self.minmax is not None:
54 # discard values that are outside minmax range
55 mn = self.minmax[0]
56 mx = self.minmax[1]
57 self.itarray = self.itarray[self.itarray >= mn]
58 self.itarray = self.itarray[self.itarray <= mx]
59
60 def draw(self):
61 values, counts = np.unique(self.data, return_counts=True)
62 if len(values) <= 20:
63 self.bins = len(values)
64 ret = []
65 if self.itarray.size:
66 total = len(self.itarray)
67 counts, cutoffs = np.histogram(self.itarray, bins=self.bins)
68 cutoffs = cutoffs[1:]
69 if self.str_tag:
70 self.str_tag = "%s " % self.str_tag
71 else:
72 self.str_tag = ""
73 if self.scale_output is not None:
74 scaled_counts = counts.astype(float) / counts.sum() * self.scale_output
75 else:
76 scaled_counts = counts
77 footerbar = "{:s}{:s} |{:s} |".format(self.str_tag, "-" * 12, "-" * 12,)
78 if self.minmax is not None:
79 ret.append(
80 "Trimmed to range (%s - %s)"
81 % (str(self.minmax[0]), str(self.minmax[1]))
82 )
83 for cutoff, original_count, scaled_count in zip(
84 cutoffs, counts, scaled_counts
85 ):
86 ret.append(
87 "{:s}{:>12.2f} |{:>12,d} | {:s}".format(
88 self.str_tag, cutoff, original_count, "*" * int(scaled_count)
89 )
90 )
91 ret.append(footerbar)
92 ret.append("{:s}{:>12s} |{:>12,d} |".format(self.str_tag, "N=", total))
93 ret.append(footerbar)
94 ret.append('')
95 else:
96 ret = []
97 if not self.generate_only:
98 for line in ret:
99 print(line)
100 ret = "\n".join(ret)
101 return ret
102
103
104 class findOut:
31 105
32 def __init__(self, args): 106 def __init__(self, args):
33 self.bwnames=args.bigwig 107 self.bwnames = args.bigwig
34 self.bwlabels=args.bigwiglabels 108 self.bwlabels = args.bigwiglabels
35 self.bedwin=args.minwin
36 self.qlo=args.qlo
37 self.qhi=args.qhi
38 self.bedouthilo=args.bedouthilo
39 self.bedouthi=args.bedouthi
40 self.bedoutlo=args.bedoutlo
41 self.tableout = args.tableout
42 self.bedwin = args.minwin 109 self.bedwin = args.minwin
43 self.qhi = args.qhi 110 self.outbeds = args.outbeds
44 self.qlo = args.qlo 111 self.bedouthi = args.bedouthi
112 self.bedoutlo = args.bedoutlo
113 self.bedouthilo = args.bedouthilo
114 self.tableoutfile = args.tableoutfile
115 self.bedwin = args.minwin
116 self.qlo = None
117 self.qhi = None
118 if args.outbeds != "outtab":
119 self.qhi = args.qhi
120 if args.qlo:
121 try:
122 f = float(args.qlo)
123 self.qlo = f
124 except Exception:
125 print('qlo not provided')
126 nbw = len(args.bigwig)
127 nlab = len(args.bigwiglabels)
128 if nlab < nbw:
129 self.bwlabels += ["Nolabel"] * (nbw - nlab)
45 self.makeBed() 130 self.makeBed()
46 131
47 def processVals(self, bw, isTop): 132 def processVals(self, bw, isTop):
48 # http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html 133 """
134 idea from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html
135 Fast segmentation into regions by taking np.diff on the boolean array of over (under) cutpoint indicators in bwex.
136 This only gives non-zero values at the segment boundaries where there's a change, so those zeros are all removed in bwexdnz
137 leaving an array of segment start/end positions. That's twisted around into an array of start/end coordinates.
138 Magical. Fast. Could do the same for means or medians over windows for sparse bigwigs like repeat regions.
139 """
49 if isTop: 140 if isTop:
50 bwex = np.r_[False, bw >= self.bwtop, False] # extend with 0s 141 bwex = np.r_[False, bw >= self.bwtop, False] # extend with 0s
51 else: 142 else:
52 bwex = np.r_[False, bw <= self.bwbot, False] 143 bwex = np.r_[False, bw <= self.bwbot, False]
53 bwexd = np.diff(bwex) 144 bwexd = np.diff(bwex)
54 bwexdnz = bwexd.nonzero()[0] 145 bwexdnz = bwexd.nonzero()[0]
55 bwregions = np.reshape(bwexdnz, (-1,2)) 146 bwregions = np.reshape(bwexdnz, (-1, 2))
56 return bwregions 147 return bwregions
57 148
58 def writeBed(self, bed, bedfname): 149 def writeBed(self, bed, bedfname):
59 """ 150 """
60 potentially multiple 151 potentially multiple
61 """ 152 """
62 bed.sort() 153 bed.sort()
63 beds = ['%s\t%d\t%d\t%s\t%d' % x for x in bed] 154 beds = ["%s\t%d\t%d\t%s\t%d" % x for x in bed]
64 with open(bedfname, "w") as bedf: 155 with open(bedfname, "w") as bedf:
65 bedf.write('\n'.join(beds)) 156 bedf.write("\n".join(beds))
66 bedf.write('\n') 157 bedf.write("\n")
67 print('Wrote %d bed regions to %s' % (len(bed), bedfname)) 158
68 159 def makeTableRow(self, bw, bwlabel, chr):
160 """
161 called for every contig, but messy inline
162 """
163 bwmean = np.mean(bw)
164 bwstd = np.std(bw)
165 bwmax = np.max(bw)
166 nrow = np.size(bw)
167 bwmin = np.min(bw)
168 row = "%s\t%s\t%d\t%f\t%f\t%f\t%f" % (
169 bwlabel,
170 chr,
171 nrow,
172 bwmean,
173 bwstd,
174 bwmin,
175 bwmax,
176 )
177 if self.qhi is not None:
178 row += "\t%.2f" % self.bwtop
179 else:
180 row += "\tnoqhi"
181 if self.qlo is not None:
182 row += "\t%.2f" % self.bwbot
183 else:
184 row += "\tnoqlo"
185 return row
186
69 def makeBed(self): 187 def makeBed(self):
70 bedhi = [] 188 bedhi = []
71 bedlo = [] 189 bedlo = []
190 restab = []
72 bwlabels = self.bwlabels 191 bwlabels = self.bwlabels
73 bwnames = self.bwnames 192 bwnames = self.bwnames
74 print('bwnames=', bwnames, "bwlabs=", bwlabels) 193 reshead = "bigwig\tcontig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot"
75 for i, bwname in enumerate(bwnames): 194 for i, bwname in enumerate(bwnames):
76 bwlabel = bwlabels[i].replace(" ",'') 195 bwlabel = bwlabels[i].replace(" ", "")
77 p = Path('in.bw') 196 fakepath = "in%d.bw" % i
78 p.symlink_to( bwname ) # required by pybigtools (!) 197 if os.path.isfile(fakepath):
79 bwf = pybigtools.open('in.bw') 198 os.remove(fakepath)
199 p = Path(fakepath)
200 p.symlink_to(bwname) # required by pybigtools (!)
201 bwf = pybigtools.open(fakepath)
80 chrlist = bwf.chroms() 202 chrlist = bwf.chroms()
81 chrs = list(chrlist.keys()) 203 chrs = list(chrlist.keys())
82 chrs.sort()
83 restab = ["contig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot"]
84 for chr in chrs: 204 for chr in chrs:
205 first_few = None
85 bw = bwf.values(chr) 206 bw = bwf.values(chr)
86 bw = bw[~np.isnan(bw)] # some have NaN if parts of a contig not covered 207 values, counts = np.unique(bw, return_counts=True)
208 nvalues = len(values)
209 if nvalues <= 20:
210 histo = '\n'.join(['%s: %f occurs %d times' % (chr, values[x], counts[x]) for x in range(len(values))])
211 else:
212 last10 = range(nvalues-10, nvalues)
213 first_few = ['%.2f\t%d' % (values[x],counts[x]) for x in range(10)]
214 first_few += ['%.2f\t%d' % (values[x],counts[x]) for x in last10]
215 first_few.insert(0,'First/Last 10 value counts\nValue\tCount')
216 ha = asciihist(data=bw, bins=20, str_tag=chr)
217 histo = ha.draw()
218 histo = '\n'.join(first_few) + '\nHistogram of bigwig values\n' + histo
219 bw = bw[~np.isnan(bw)] # some have NaN if parts of a contig not covered
87 if self.qhi is not None: 220 if self.qhi is not None:
88 self.bwtop = np.quantile(bw, self.qhi) 221 self.bwtop = np.quantile(bw, self.qhi)
89 bwhi = self.processVals(bw, isTop=True) 222 bwhi = self.processVals(bw, isTop=True)
90 for i, seg in enumerate(bwhi): 223 for j, seg in enumerate(bwhi):
91 if seg[1] - seg[0] >= self.bedwin: 224 seglen = seg[1] - seg[0]
92 bedhi.append((chr, seg[0], seg[1], '%s_hi' % (bwlabel), 1)) 225 if seglen >= self.bedwin:
226 score = np.sum(bw[seg[0]:seg[1]])/float(seglen)
227 bedhi.append(
228 (
229 chr,
230 seg[0],
231 seg[1],
232 "%s_%d" % (bwlabel, score),
233 score,
234 )
235 )
93 if self.qlo is not None: 236 if self.qlo is not None:
94 self.bwbot = np.quantile(bw, self.qlo) 237 self.bwbot = np.quantile(bw, self.qlo)
95 bwlo = self.processVals(bw, isTop=False) 238 bwlo = self.processVals(bw, isTop=False)
96 for i, seg in enumerate(bwlo): 239 for j, seg in enumerate(bwlo):
97 if seg[1] - seg[0] >= self.bedwin: 240 if seg[1] - seg[0] >= self.bedwin:
98 bedlo.append((chr, seg[0], seg[1], '%s_lo' % (bwlabel), -1)) 241 score = -1 * np.sum(bw[seg[0]:seg[1]])/float(seglen)
99 bwmean = np.mean(bw) 242 bedlo.append(
100 bwstd = np.std(bw) 243 (
101 bwmax = np.max(bw) 244 chr,
102 nrow = np.size(bw) 245 seg[0],
103 bwmin = np.min(bw) 246 seg[1],
104 restab.append('%s\t%d\t%f\t%f\t%f\t%f\t%f\t%f' % (chr,nrow,bwmean,bwstd,bwmin,bwmax,self.bwtop,self.bwbot)) 247 "%s_%d" % (bwlabel, score),
105 print('\n'.join(restab), '\n') 248 score,
106 if self.tableout: 249 )
107 with open(self.tableout) as t: 250 )
108 t.write('\n'.join(restab)) 251 if self.tableoutfile:
109 t.write('\n') 252 row = self.makeTableRow(bw, bwlabel, chr)
110 if self.bedoutlo: 253 resheadl = reshead.split('\t')
111 if self.qlo: 254 rowl = row.split()
255 desc = ['%s\t%s' % (resheadl[x], rowl[x]) for x in range(len(rowl))]
256 desc.insert(0, 'Descriptive measures')
257 descn = '\n'.join(desc)
258 restab.append(descn)
259 restab.append(histo)
260 if os.path.isfile(fakepath):
261 os.remove(fakepath)
262 if self.tableoutfile:
263 stable = "\n".join(restab)
264 with open(self.tableoutfile, "w") as t:
265 t.write(stable)
266 t.write("\n")
267 some = False
268 if self.qlo:
269 if self.outbeds in ["outall", "outlo", "outlohi"]:
112 self.writeBed(bedlo, self.bedoutlo) 270 self.writeBed(bedlo, self.bedoutlo)
113 if self.bedouthi: 271 some = True
114 if self.qhi: 272 if self.qhi:
273 if self.outbeds in ["outall", "outlohi", "outhi"]:
115 self.writeBed(bedhi, self.bedouthi) 274 self.writeBed(bedhi, self.bedouthi)
116 if self.bedouthilo: 275 some = True
276 if self.outbeds in ["outall", "outhilo"]:
117 allbed = bedlo + bedhi 277 allbed = bedlo + bedhi
118 self.writeBed(allbed, self.bedouthilo) 278 self.writeBed(allbed, self.bedouthilo)
119 return restab 279 some = True
120 280 if not ((self.outbeds == 'outtab') or some):
281 sys.stderr.write(
282 "Invalid configuration - no output could be created. Was qlo missing and only low output requested for example?"
283 )
284 sys.exit(2)
285
121 286
122 if __name__ == "__main__": 287 if __name__ == "__main__":
123 parser = argparse.ArgumentParser() 288 parser = argparse.ArgumentParser()
124 a = parser.add_argument 289 a = parser.add_argument
125 a('-m', '--minwin',default=10, type=int) 290 a("-m", "--minwin", default=10, type=int)
126 a('-l', '--qlo',default=None, type=float) 291 a("-l", "--qlo", default=None)
127 a('-i', '--qhi',default=None, type=float) 292 a("-i", "--qhi", default=None, type=float)
128 a('-w', '--bigwig', nargs='+') 293 a("--bedouthi", default=None)
129 a('-n', '--bigwiglabels', nargs='+') 294 a("--bedoutlo", default=None)
130 a('-o', '--bedouthilo', default=None, help="optional high and low combined bed") 295 a("--bedouthilo", default=None)
131 a('-u', '--bedouthi', default=None, help="optional high only bed") 296 a("-w", "--bigwig", nargs="+")
132 a('-b', '--bedoutlo', default=None, help="optional low only bed") 297 a("-n", "--bigwiglabels", nargs="+")
133 a('-t', '--tableout', default=None) 298 a("-o", "--outbeds", default="outhilo", help="optional high and low combined bed")
299 a("-t", "--tableoutfile", default=None)
134 args = parser.parse_args() 300 args = parser.parse_args()
135 print('args=', args) 301 findOut(args)
136 if not (args.bedouthilo or args.bedouthi or args.bedoutlo):
137 sys.stderr.write("bigwig_outlier_bed.py cannot usefully run - need a bed output choice - must be one of low only, high only or both combined")
138 sys.exit(2)
139 if not (args.qlo or args.qhi):
140 sys.stderr.write("bigwig_outlier_bed.py cannot usefully run - need one or both of quantile cutpoints qhi and qlo")
141 sys.exit(2)
142 restab = findOut(args)
143 if args.tableout:
144 with open(args.tableout, 'w') as tout:
145 tout.write('\n'.join(restab))
146 tout.write('\n')
147
148