Mercurial > repos > fubar > bigwig_outlier_bed
comparison bigwig_outlier_bed.py @ 6:eb17eb8a3658 draft
planemo upload commit 1baff96e75def9248afdcf21edec9bdc7ed42b1f-dirty
author | fubar |
---|---|
date | Tue, 23 Jul 2024 23:12:23 +0000 |
parents | c71db540eb38 |
children | c8e22efcaeda |
comparison
equal
deleted
inserted
replaced
5:68cb8e7e266b | 6:eb17eb8a3658 |
---|---|
1 """ | 1 """ |
2 Ross Lazarus June 2024 for VGP | 2 Ross Lazarus June 2024 for VGP |
3 | |
4 Bigwigs are great, but hard to reliably "see" small low coverage or small very high coverage regions. | 3 Bigwigs are great, but hard to reliably "see" small low coverage or small very high coverage regions. |
5 Colouring in JB2 tracks will need a new plugin, so this code will find bigwig regions above and below a chosen percentile point. | 4 Colouring in JB2 tracks will need a new plugin, so this code will find bigwig regions above and below a chosen percentile point. |
6 0.99 and 0.01 work well in testing with a minimum span of 10 bp. | 5 0.99 and 0.01 work well in testing with a minimum span of 10 bp. |
7 | |
8 Multiple bigwigs **with the same reference** can be combined - bed segments will be named appropriately | 6 Multiple bigwigs **with the same reference** can be combined - bed segments will be named appropriately |
9 Combining multiple references works but is silly because only display will rely on one reference so others will not be shown... | 7 Combining multiple references works but is silly because only display will rely on one reference so others will not be shown... |
10 | |
11 Tricksy numpy method from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html | 8 Tricksy numpy method from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html |
12 takes about 95 seconds for a 17MB test wiggle | 9 takes about 95 seconds for a 17MB test wiggle |
13 | |
14 JBrowse2 bed normally displays ignore the score, so could provide separate low/high bed file outputs as an option. | 10 JBrowse2 bed normally displays ignore the score, so could provide separate low/high bed file outputs as an option. |
15 | |
16 Update june 30 2024: wrote a 'no-build' plugin for beds to display red/blue if >0/<0 so those are used for scores | 11 Update june 30 2024: wrote a 'no-build' plugin for beds to display red/blue if >0/<0 so those are used for scores |
17 Bed interval naming must be short for JB2 but needs input bigwig name and (lo or hi). | 12 Bed interval naming must be short for JB2 but needs input bigwig name and (lo or hi). |
18 | |
19 """ | 13 """ |
20 | 14 |
21 | |
22 import argparse | 15 import argparse |
16 import os | |
17 import sys | |
18 from pathlib import Path | |
19 | |
23 import numpy as np | 20 import numpy as np |
24 import pybigtools | 21 import pybigtools |
25 import sys | 22 |
26 | 23 |
27 | 24 class asciihist: |
28 from pathlib import Path | 25 |
29 | 26 def __init__( |
30 class findOut(): | 27 self, |
28 data, | |
29 bins=10, | |
30 minmax=None, | |
31 str_tag="", | |
32 scale_output=80, | |
33 generate_only=True, | |
34 ): | |
35 """ | |
36 https://gist.github.com/bgbg/608d9ef4fd75032731651257fe67fc81 | |
37 Create an ASCII histogram from an interable of numbers. | |
38 Author: Boris Gorelik boris@gorelik.net. based on http://econpy.googlecode.com/svn/trunk/pytrix/pytrix.py | |
39 License: MIT | |
40 """ | |
41 self.data = data | |
42 self.minmax = minmax | |
43 self.str_tag = str_tag | |
44 self.bins = bins | |
45 self.generate_only = generate_only | |
46 self.scale_output = scale_output | |
47 self.itarray = np.asanyarray(self.data) | |
48 if self.minmax == "auto": | |
49 self.minmax = np.percentile(data, [5, 95]) | |
50 if self.minmax[0] == self.minmax[1]: | |
51 # for very ugly distributions | |
52 self.minmax = None | |
53 if self.minmax is not None: | |
54 # discard values that are outside minmax range | |
55 mn = self.minmax[0] | |
56 mx = self.minmax[1] | |
57 self.itarray = self.itarray[self.itarray >= mn] | |
58 self.itarray = self.itarray[self.itarray <= mx] | |
59 | |
60 def draw(self): | |
61 values, counts = np.unique(self.data, return_counts=True) | |
62 if len(values) <= 20: | |
63 self.bins = len(values) | |
64 ret = [] | |
65 if self.itarray.size: | |
66 total = len(self.itarray) | |
67 counts, cutoffs = np.histogram(self.itarray, bins=self.bins) | |
68 cutoffs = cutoffs[1:] | |
69 if self.str_tag: | |
70 self.str_tag = "%s " % self.str_tag | |
71 else: | |
72 self.str_tag = "" | |
73 if self.scale_output is not None: | |
74 scaled_counts = counts.astype(float) / counts.sum() * self.scale_output | |
75 else: | |
76 scaled_counts = counts | |
77 footerbar = "{:s}{:s} |{:s} |".format(self.str_tag, "-" * 12, "-" * 12,) | |
78 if self.minmax is not None: | |
79 ret.append( | |
80 "Trimmed to range (%s - %s)" | |
81 % (str(self.minmax[0]), str(self.minmax[1])) | |
82 ) | |
83 for cutoff, original_count, scaled_count in zip( | |
84 cutoffs, counts, scaled_counts | |
85 ): | |
86 ret.append( | |
87 "{:s}{:>12.2f} |{:>12,d} | {:s}".format( | |
88 self.str_tag, cutoff, original_count, "*" * int(scaled_count) | |
89 ) | |
90 ) | |
91 ret.append(footerbar) | |
92 ret.append("{:s}{:>12s} |{:>12,d} |".format(self.str_tag, "N=", total)) | |
93 ret.append(footerbar) | |
94 ret.append('') | |
95 else: | |
96 ret = [] | |
97 if not self.generate_only: | |
98 for line in ret: | |
99 print(line) | |
100 ret = "\n".join(ret) | |
101 return ret | |
102 | |
103 | |
104 class findOut: | |
31 | 105 |
32 def __init__(self, args): | 106 def __init__(self, args): |
33 self.bwnames=args.bigwig | 107 self.bwnames = args.bigwig |
34 self.bwlabels=args.bigwiglabels | 108 self.bwlabels = args.bigwiglabels |
35 self.bedwin=args.minwin | |
36 self.qlo=args.qlo | |
37 self.qhi=args.qhi | |
38 self.bedouthilo=args.bedouthilo | |
39 self.bedouthi=args.bedouthi | |
40 self.bedoutlo=args.bedoutlo | |
41 self.tableout = args.tableout | |
42 self.bedwin = args.minwin | 109 self.bedwin = args.minwin |
43 self.qhi = args.qhi | 110 self.outbeds = args.outbeds |
44 self.qlo = args.qlo | 111 self.bedouthi = args.bedouthi |
112 self.bedoutlo = args.bedoutlo | |
113 self.bedouthilo = args.bedouthilo | |
114 self.tableoutfile = args.tableoutfile | |
115 self.bedwin = args.minwin | |
116 self.qlo = None | |
117 self.qhi = None | |
118 if args.outbeds != "outtab": | |
119 self.qhi = args.qhi | |
120 if args.qlo: | |
121 try: | |
122 f = float(args.qlo) | |
123 self.qlo = f | |
124 except Exception: | |
125 print('qlo not provided') | |
126 nbw = len(args.bigwig) | |
127 nlab = len(args.bigwiglabels) | |
128 if nlab < nbw: | |
129 self.bwlabels += ["Nolabel"] * (nbw - nlab) | |
45 self.makeBed() | 130 self.makeBed() |
46 | 131 |
47 def processVals(self, bw, isTop): | 132 def processVals(self, bw, isTop): |
48 # http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html | 133 """ |
134 idea from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html | |
135 Fast segmentation into regions by taking np.diff on the boolean array of over (under) cutpoint indicators in bwex. | |
136 This only gives non-zero values at the segment boundaries where there's a change, so those zeros are all removed in bwexdnz | |
137 leaving an array of segment start/end positions. That's twisted around into an array of start/end coordinates. | |
138 Magical. Fast. Could do the same for means or medians over windows for sparse bigwigs like repeat regions. | |
139 """ | |
49 if isTop: | 140 if isTop: |
50 bwex = np.r_[False, bw >= self.bwtop, False] # extend with 0s | 141 bwex = np.r_[False, bw >= self.bwtop, False] # extend with 0s |
51 else: | 142 else: |
52 bwex = np.r_[False, bw <= self.bwbot, False] | 143 bwex = np.r_[False, bw <= self.bwbot, False] |
53 bwexd = np.diff(bwex) | 144 bwexd = np.diff(bwex) |
54 bwexdnz = bwexd.nonzero()[0] | 145 bwexdnz = bwexd.nonzero()[0] |
55 bwregions = np.reshape(bwexdnz, (-1,2)) | 146 bwregions = np.reshape(bwexdnz, (-1, 2)) |
56 return bwregions | 147 return bwregions |
57 | 148 |
58 def writeBed(self, bed, bedfname): | 149 def writeBed(self, bed, bedfname): |
59 """ | 150 """ |
60 potentially multiple | 151 potentially multiple |
61 """ | 152 """ |
62 bed.sort() | 153 bed.sort() |
63 beds = ['%s\t%d\t%d\t%s\t%d' % x for x in bed] | 154 beds = ["%s\t%d\t%d\t%s\t%d" % x for x in bed] |
64 with open(bedfname, "w") as bedf: | 155 with open(bedfname, "w") as bedf: |
65 bedf.write('\n'.join(beds)) | 156 bedf.write("\n".join(beds)) |
66 bedf.write('\n') | 157 bedf.write("\n") |
67 print('Wrote %d bed regions to %s' % (len(bed), bedfname)) | 158 |
68 | 159 def makeTableRow(self, bw, bwlabel, chr): |
160 """ | |
161 called for every contig, but messy inline | |
162 """ | |
163 bwmean = np.mean(bw) | |
164 bwstd = np.std(bw) | |
165 bwmax = np.max(bw) | |
166 nrow = np.size(bw) | |
167 bwmin = np.min(bw) | |
168 row = "%s\t%s\t%d\t%f\t%f\t%f\t%f" % ( | |
169 bwlabel, | |
170 chr, | |
171 nrow, | |
172 bwmean, | |
173 bwstd, | |
174 bwmin, | |
175 bwmax, | |
176 ) | |
177 if self.qhi is not None: | |
178 row += "\t%.2f" % self.bwtop | |
179 else: | |
180 row += "\tnoqhi" | |
181 if self.qlo is not None: | |
182 row += "\t%.2f" % self.bwbot | |
183 else: | |
184 row += "\tnoqlo" | |
185 return row | |
186 | |
69 def makeBed(self): | 187 def makeBed(self): |
70 bedhi = [] | 188 bedhi = [] |
71 bedlo = [] | 189 bedlo = [] |
190 restab = [] | |
72 bwlabels = self.bwlabels | 191 bwlabels = self.bwlabels |
73 bwnames = self.bwnames | 192 bwnames = self.bwnames |
74 print('bwnames=', bwnames, "bwlabs=", bwlabels) | 193 reshead = "bigwig\tcontig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot" |
75 for i, bwname in enumerate(bwnames): | 194 for i, bwname in enumerate(bwnames): |
76 bwlabel = bwlabels[i].replace(" ",'') | 195 bwlabel = bwlabels[i].replace(" ", "") |
77 p = Path('in.bw') | 196 fakepath = "in%d.bw" % i |
78 p.symlink_to( bwname ) # required by pybigtools (!) | 197 if os.path.isfile(fakepath): |
79 bwf = pybigtools.open('in.bw') | 198 os.remove(fakepath) |
199 p = Path(fakepath) | |
200 p.symlink_to(bwname) # required by pybigtools (!) | |
201 bwf = pybigtools.open(fakepath) | |
80 chrlist = bwf.chroms() | 202 chrlist = bwf.chroms() |
81 chrs = list(chrlist.keys()) | 203 chrs = list(chrlist.keys()) |
82 chrs.sort() | |
83 restab = ["contig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot"] | |
84 for chr in chrs: | 204 for chr in chrs: |
205 first_few = None | |
85 bw = bwf.values(chr) | 206 bw = bwf.values(chr) |
86 bw = bw[~np.isnan(bw)] # some have NaN if parts of a contig not covered | 207 values, counts = np.unique(bw, return_counts=True) |
208 nvalues = len(values) | |
209 if nvalues <= 20: | |
210 histo = '\n'.join(['%s: %f occurs %d times' % (chr, values[x], counts[x]) for x in range(len(values))]) | |
211 else: | |
212 last10 = range(nvalues-10, nvalues) | |
213 first_few = ['%.2f\t%d' % (values[x],counts[x]) for x in range(10)] | |
214 first_few += ['%.2f\t%d' % (values[x],counts[x]) for x in last10] | |
215 first_few.insert(0,'First/Last 10 value counts\nValue\tCount') | |
216 ha = asciihist(data=bw, bins=20, str_tag=chr) | |
217 histo = ha.draw() | |
218 histo = '\n'.join(first_few) + '\nHistogram of bigwig values\n' + histo | |
219 bw = bw[~np.isnan(bw)] # some have NaN if parts of a contig not covered | |
87 if self.qhi is not None: | 220 if self.qhi is not None: |
88 self.bwtop = np.quantile(bw, self.qhi) | 221 self.bwtop = np.quantile(bw, self.qhi) |
89 bwhi = self.processVals(bw, isTop=True) | 222 bwhi = self.processVals(bw, isTop=True) |
90 for i, seg in enumerate(bwhi): | 223 for j, seg in enumerate(bwhi): |
91 if seg[1] - seg[0] >= self.bedwin: | 224 seglen = seg[1] - seg[0] |
92 bedhi.append((chr, seg[0], seg[1], '%s_hi' % (bwlabel), 1)) | 225 if seglen >= self.bedwin: |
226 score = np.sum(bw[seg[0]:seg[1]])/float(seglen) | |
227 bedhi.append( | |
228 ( | |
229 chr, | |
230 seg[0], | |
231 seg[1], | |
232 "%s_%d" % (bwlabel, score), | |
233 score, | |
234 ) | |
235 ) | |
93 if self.qlo is not None: | 236 if self.qlo is not None: |
94 self.bwbot = np.quantile(bw, self.qlo) | 237 self.bwbot = np.quantile(bw, self.qlo) |
95 bwlo = self.processVals(bw, isTop=False) | 238 bwlo = self.processVals(bw, isTop=False) |
96 for i, seg in enumerate(bwlo): | 239 for j, seg in enumerate(bwlo): |
97 if seg[1] - seg[0] >= self.bedwin: | 240 if seg[1] - seg[0] >= self.bedwin: |
98 bedlo.append((chr, seg[0], seg[1], '%s_lo' % (bwlabel), -1)) | 241 score = -1 * np.sum(bw[seg[0]:seg[1]])/float(seglen) |
99 bwmean = np.mean(bw) | 242 bedlo.append( |
100 bwstd = np.std(bw) | 243 ( |
101 bwmax = np.max(bw) | 244 chr, |
102 nrow = np.size(bw) | 245 seg[0], |
103 bwmin = np.min(bw) | 246 seg[1], |
104 restab.append('%s\t%d\t%f\t%f\t%f\t%f\t%f\t%f' % (chr,nrow,bwmean,bwstd,bwmin,bwmax,self.bwtop,self.bwbot)) | 247 "%s_%d" % (bwlabel, score), |
105 print('\n'.join(restab), '\n') | 248 score, |
106 if self.tableout: | 249 ) |
107 with open(self.tableout) as t: | 250 ) |
108 t.write('\n'.join(restab)) | 251 if self.tableoutfile: |
109 t.write('\n') | 252 row = self.makeTableRow(bw, bwlabel, chr) |
110 if self.bedoutlo: | 253 resheadl = reshead.split('\t') |
111 if self.qlo: | 254 rowl = row.split() |
255 desc = ['%s\t%s' % (resheadl[x], rowl[x]) for x in range(len(rowl))] | |
256 desc.insert(0, 'Descriptive measures') | |
257 descn = '\n'.join(desc) | |
258 restab.append(descn) | |
259 restab.append(histo) | |
260 if os.path.isfile(fakepath): | |
261 os.remove(fakepath) | |
262 if self.tableoutfile: | |
263 stable = "\n".join(restab) | |
264 with open(self.tableoutfile, "w") as t: | |
265 t.write(stable) | |
266 t.write("\n") | |
267 some = False | |
268 if self.qlo: | |
269 if self.outbeds in ["outall", "outlo", "outlohi"]: | |
112 self.writeBed(bedlo, self.bedoutlo) | 270 self.writeBed(bedlo, self.bedoutlo) |
113 if self.bedouthi: | 271 some = True |
114 if self.qhi: | 272 if self.qhi: |
273 if self.outbeds in ["outall", "outlohi", "outhi"]: | |
115 self.writeBed(bedhi, self.bedouthi) | 274 self.writeBed(bedhi, self.bedouthi) |
116 if self.bedouthilo: | 275 some = True |
276 if self.outbeds in ["outall", "outhilo"]: | |
117 allbed = bedlo + bedhi | 277 allbed = bedlo + bedhi |
118 self.writeBed(allbed, self.bedouthilo) | 278 self.writeBed(allbed, self.bedouthilo) |
119 return restab | 279 some = True |
120 | 280 if not ((self.outbeds == 'outtab') or some): |
281 sys.stderr.write( | |
282 "Invalid configuration - no output could be created. Was qlo missing and only low output requested for example?" | |
283 ) | |
284 sys.exit(2) | |
285 | |
121 | 286 |
122 if __name__ == "__main__": | 287 if __name__ == "__main__": |
123 parser = argparse.ArgumentParser() | 288 parser = argparse.ArgumentParser() |
124 a = parser.add_argument | 289 a = parser.add_argument |
125 a('-m', '--minwin',default=10, type=int) | 290 a("-m", "--minwin", default=10, type=int) |
126 a('-l', '--qlo',default=None, type=float) | 291 a("-l", "--qlo", default=None) |
127 a('-i', '--qhi',default=None, type=float) | 292 a("-i", "--qhi", default=None, type=float) |
128 a('-w', '--bigwig', nargs='+') | 293 a("--bedouthi", default=None) |
129 a('-n', '--bigwiglabels', nargs='+') | 294 a("--bedoutlo", default=None) |
130 a('-o', '--bedouthilo', default=None, help="optional high and low combined bed") | 295 a("--bedouthilo", default=None) |
131 a('-u', '--bedouthi', default=None, help="optional high only bed") | 296 a("-w", "--bigwig", nargs="+") |
132 a('-b', '--bedoutlo', default=None, help="optional low only bed") | 297 a("-n", "--bigwiglabels", nargs="+") |
133 a('-t', '--tableout', default=None) | 298 a("-o", "--outbeds", default="outhilo", help="optional high and low combined bed") |
299 a("-t", "--tableoutfile", default=None) | |
134 args = parser.parse_args() | 300 args = parser.parse_args() |
135 print('args=', args) | 301 findOut(args) |
136 if not (args.bedouthilo or args.bedouthi or args.bedoutlo): | |
137 sys.stderr.write("bigwig_outlier_bed.py cannot usefully run - need a bed output choice - must be one of low only, high only or both combined") | |
138 sys.exit(2) | |
139 if not (args.qlo or args.qhi): | |
140 sys.stderr.write("bigwig_outlier_bed.py cannot usefully run - need one or both of quantile cutpoints qhi and qlo") | |
141 sys.exit(2) | |
142 restab = findOut(args) | |
143 if args.tableout: | |
144 with open(args.tableout, 'w') as tout: | |
145 tout.write('\n'.join(restab)) | |
146 tout.write('\n') | |
147 | |
148 |