Mercurial > repos > fubar > bigwig_outlier_bed
diff bigwig_outlier_bed.py @ 6:eb17eb8a3658 draft
planemo upload commit 1baff96e75def9248afdcf21edec9bdc7ed42b1f-dirty
| author | fubar | 
|---|---|
| date | Tue, 23 Jul 2024 23:12:23 +0000 | 
| parents | c71db540eb38 | 
| children | c8e22efcaeda | 
line wrap: on
 line diff
--- a/bigwig_outlier_bed.py Mon Jul 01 05:02:05 2024 +0000 +++ b/bigwig_outlier_bed.py Tue Jul 23 23:12:23 2024 +0000 @@ -1,58 +1,149 @@ """ -Ross Lazarus June 2024 for VGP - +Ross Lazarus June 2024 for VGP Bigwigs are great, but hard to reliably "see" small low coverage or small very high coverage regions. Colouring in JB2 tracks will need a new plugin, so this code will find bigwig regions above and below a chosen percentile point. 0.99 and 0.01 work well in testing with a minimum span of 10 bp. - Multiple bigwigs **with the same reference** can be combined - bed segments will be named appropriately Combining multiple references works but is silly because only display will rely on one reference so others will not be shown... - Tricksy numpy method from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html takes about 95 seconds for a 17MB test wiggle - JBrowse2 bed normally displays ignore the score, so could provide separate low/high bed file outputs as an option. - Update june 30 2024: wrote a 'no-build' plugin for beds to display red/blue if >0/<0 so those are used for scores Bed interval naming must be short for JB2 but needs input bigwig name and (lo or hi). - """ +import argparse +import os +import sys +from pathlib import Path -import argparse import numpy as np import pybigtools -import sys -from pathlib import Path +class asciihist: + + def __init__( + self, + data, + bins=10, + minmax=None, + str_tag="", + scale_output=80, + generate_only=True, + ): + """ + https://gist.github.com/bgbg/608d9ef4fd75032731651257fe67fc81 + Create an ASCII histogram from an interable of numbers. + Author: Boris Gorelik boris@gorelik.net. based on http://econpy.googlecode.com/svn/trunk/pytrix/pytrix.py + License: MIT + """ + self.data = data + self.minmax = minmax + self.str_tag = str_tag + self.bins = bins + self.generate_only = generate_only + self.scale_output = scale_output + self.itarray = np.asanyarray(self.data) + if self.minmax == "auto": + self.minmax = np.percentile(data, [5, 95]) + if self.minmax[0] == self.minmax[1]: + # for very ugly distributions + self.minmax = None + if self.minmax is not None: + # discard values that are outside minmax range + mn = self.minmax[0] + mx = self.minmax[1] + self.itarray = self.itarray[self.itarray >= mn] + self.itarray = self.itarray[self.itarray <= mx] -class findOut(): + def draw(self): + values, counts = np.unique(self.data, return_counts=True) + if len(values) <= 20: + self.bins = len(values) + ret = [] + if self.itarray.size: + total = len(self.itarray) + counts, cutoffs = np.histogram(self.itarray, bins=self.bins) + cutoffs = cutoffs[1:] + if self.str_tag: + self.str_tag = "%s " % self.str_tag + else: + self.str_tag = "" + if self.scale_output is not None: + scaled_counts = counts.astype(float) / counts.sum() * self.scale_output + else: + scaled_counts = counts + footerbar = "{:s}{:s} |{:s} |".format(self.str_tag, "-" * 12, "-" * 12,) + if self.minmax is not None: + ret.append( + "Trimmed to range (%s - %s)" + % (str(self.minmax[0]), str(self.minmax[1])) + ) + for cutoff, original_count, scaled_count in zip( + cutoffs, counts, scaled_counts + ): + ret.append( + "{:s}{:>12.2f} |{:>12,d} | {:s}".format( + self.str_tag, cutoff, original_count, "*" * int(scaled_count) + ) + ) + ret.append(footerbar) + ret.append("{:s}{:>12s} |{:>12,d} |".format(self.str_tag, "N=", total)) + ret.append(footerbar) + ret.append('') + else: + ret = [] + if not self.generate_only: + for line in ret: + print(line) + ret = "\n".join(ret) + return ret + + +class findOut: def __init__(self, args): - self.bwnames=args.bigwig - self.bwlabels=args.bigwiglabels - self.bedwin=args.minwin - self.qlo=args.qlo - self.qhi=args.qhi - self.bedouthilo=args.bedouthilo - self.bedouthi=args.bedouthi - self.bedoutlo=args.bedoutlo - self.tableout = args.tableout + self.bwnames = args.bigwig + self.bwlabels = args.bigwiglabels + self.bedwin = args.minwin + self.outbeds = args.outbeds + self.bedouthi = args.bedouthi + self.bedoutlo = args.bedoutlo + self.bedouthilo = args.bedouthilo + self.tableoutfile = args.tableoutfile self.bedwin = args.minwin - self.qhi = args.qhi - self.qlo = args.qlo + self.qlo = None + self.qhi = None + if args.outbeds != "outtab": + self.qhi = args.qhi + if args.qlo: + try: + f = float(args.qlo) + self.qlo = f + except Exception: + print('qlo not provided') + nbw = len(args.bigwig) + nlab = len(args.bigwiglabels) + if nlab < nbw: + self.bwlabels += ["Nolabel"] * (nbw - nlab) self.makeBed() def processVals(self, bw, isTop): - # http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html + """ + idea from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html + Fast segmentation into regions by taking np.diff on the boolean array of over (under) cutpoint indicators in bwex. + This only gives non-zero values at the segment boundaries where there's a change, so those zeros are all removed in bwexdnz + leaving an array of segment start/end positions. That's twisted around into an array of start/end coordinates. + Magical. Fast. Could do the same for means or medians over windows for sparse bigwigs like repeat regions. + """ if isTop: - bwex = np.r_[False, bw >= self.bwtop, False] # extend with 0s + bwex = np.r_[False, bw >= self.bwtop, False] # extend with 0s else: bwex = np.r_[False, bw <= self.bwbot, False] bwexd = np.diff(bwex) bwexdnz = bwexd.nonzero()[0] - bwregions = np.reshape(bwexdnz, (-1,2)) + bwregions = np.reshape(bwexdnz, (-1, 2)) return bwregions def writeBed(self, bed, bedfname): @@ -60,89 +151,151 @@ potentially multiple """ bed.sort() - beds = ['%s\t%d\t%d\t%s\t%d' % x for x in bed] + beds = ["%s\t%d\t%d\t%s\t%d" % x for x in bed] with open(bedfname, "w") as bedf: - bedf.write('\n'.join(beds)) - bedf.write('\n') - print('Wrote %d bed regions to %s' % (len(bed), bedfname)) - + bedf.write("\n".join(beds)) + bedf.write("\n") + + def makeTableRow(self, bw, bwlabel, chr): + """ + called for every contig, but messy inline + """ + bwmean = np.mean(bw) + bwstd = np.std(bw) + bwmax = np.max(bw) + nrow = np.size(bw) + bwmin = np.min(bw) + row = "%s\t%s\t%d\t%f\t%f\t%f\t%f" % ( + bwlabel, + chr, + nrow, + bwmean, + bwstd, + bwmin, + bwmax, + ) + if self.qhi is not None: + row += "\t%.2f" % self.bwtop + else: + row += "\tnoqhi" + if self.qlo is not None: + row += "\t%.2f" % self.bwbot + else: + row += "\tnoqlo" + return row + def makeBed(self): bedhi = [] bedlo = [] + restab = [] bwlabels = self.bwlabels bwnames = self.bwnames - print('bwnames=', bwnames, "bwlabs=", bwlabels) + reshead = "bigwig\tcontig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot" for i, bwname in enumerate(bwnames): - bwlabel = bwlabels[i].replace(" ",'') - p = Path('in.bw') - p.symlink_to( bwname ) # required by pybigtools (!) - bwf = pybigtools.open('in.bw') + bwlabel = bwlabels[i].replace(" ", "") + fakepath = "in%d.bw" % i + if os.path.isfile(fakepath): + os.remove(fakepath) + p = Path(fakepath) + p.symlink_to(bwname) # required by pybigtools (!) + bwf = pybigtools.open(fakepath) chrlist = bwf.chroms() chrs = list(chrlist.keys()) - chrs.sort() - restab = ["contig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot"] for chr in chrs: + first_few = None bw = bwf.values(chr) - bw = bw[~np.isnan(bw)] # some have NaN if parts of a contig not covered + values, counts = np.unique(bw, return_counts=True) + nvalues = len(values) + if nvalues <= 20: + histo = '\n'.join(['%s: %f occurs %d times' % (chr, values[x], counts[x]) for x in range(len(values))]) + else: + last10 = range(nvalues-10, nvalues) + first_few = ['%.2f\t%d' % (values[x],counts[x]) for x in range(10)] + first_few += ['%.2f\t%d' % (values[x],counts[x]) for x in last10] + first_few.insert(0,'First/Last 10 value counts\nValue\tCount') + ha = asciihist(data=bw, bins=20, str_tag=chr) + histo = ha.draw() + histo = '\n'.join(first_few) + '\nHistogram of bigwig values\n' + histo + bw = bw[~np.isnan(bw)] # some have NaN if parts of a contig not covered if self.qhi is not None: self.bwtop = np.quantile(bw, self.qhi) bwhi = self.processVals(bw, isTop=True) - for i, seg in enumerate(bwhi): - if seg[1] - seg[0] >= self.bedwin: - bedhi.append((chr, seg[0], seg[1], '%s_hi' % (bwlabel), 1)) + for j, seg in enumerate(bwhi): + seglen = seg[1] - seg[0] + if seglen >= self.bedwin: + score = np.sum(bw[seg[0]:seg[1]])/float(seglen) + bedhi.append( + ( + chr, + seg[0], + seg[1], + "%s_%d" % (bwlabel, score), + score, + ) + ) if self.qlo is not None: self.bwbot = np.quantile(bw, self.qlo) - bwlo = self.processVals(bw, isTop=False) - for i, seg in enumerate(bwlo): + bwlo = self.processVals(bw, isTop=False) + for j, seg in enumerate(bwlo): if seg[1] - seg[0] >= self.bedwin: - bedlo.append((chr, seg[0], seg[1], '%s_lo' % (bwlabel), -1)) - bwmean = np.mean(bw) - bwstd = np.std(bw) - bwmax = np.max(bw) - nrow = np.size(bw) - bwmin = np.min(bw) - restab.append('%s\t%d\t%f\t%f\t%f\t%f\t%f\t%f' % (chr,nrow,bwmean,bwstd,bwmin,bwmax,self.bwtop,self.bwbot)) - print('\n'.join(restab), '\n') - if self.tableout: - with open(self.tableout) as t: - t.write('\n'.join(restab)) - t.write('\n') - if self.bedoutlo: - if self.qlo: + score = -1 * np.sum(bw[seg[0]:seg[1]])/float(seglen) + bedlo.append( + ( + chr, + seg[0], + seg[1], + "%s_%d" % (bwlabel, score), + score, + ) + ) + if self.tableoutfile: + row = self.makeTableRow(bw, bwlabel, chr) + resheadl = reshead.split('\t') + rowl = row.split() + desc = ['%s\t%s' % (resheadl[x], rowl[x]) for x in range(len(rowl))] + desc.insert(0, 'Descriptive measures') + descn = '\n'.join(desc) + restab.append(descn) + restab.append(histo) + if os.path.isfile(fakepath): + os.remove(fakepath) + if self.tableoutfile: + stable = "\n".join(restab) + with open(self.tableoutfile, "w") as t: + t.write(stable) + t.write("\n") + some = False + if self.qlo: + if self.outbeds in ["outall", "outlo", "outlohi"]: self.writeBed(bedlo, self.bedoutlo) - if self.bedouthi: - if self.qhi: + some = True + if self.qhi: + if self.outbeds in ["outall", "outlohi", "outhi"]: self.writeBed(bedhi, self.bedouthi) - if self.bedouthilo: + some = True + if self.outbeds in ["outall", "outhilo"]: allbed = bedlo + bedhi self.writeBed(allbed, self.bedouthilo) - return restab - + some = True + if not ((self.outbeds == 'outtab') or some): + sys.stderr.write( + "Invalid configuration - no output could be created. Was qlo missing and only low output requested for example?" + ) + sys.exit(2) + if __name__ == "__main__": parser = argparse.ArgumentParser() a = parser.add_argument - a('-m', '--minwin',default=10, type=int) - a('-l', '--qlo',default=None, type=float) - a('-i', '--qhi',default=None, type=float) - a('-w', '--bigwig', nargs='+') - a('-n', '--bigwiglabels', nargs='+') - a('-o', '--bedouthilo', default=None, help="optional high and low combined bed") - a('-u', '--bedouthi', default=None, help="optional high only bed") - a('-b', '--bedoutlo', default=None, help="optional low only bed") - a('-t', '--tableout', default=None) + a("-m", "--minwin", default=10, type=int) + a("-l", "--qlo", default=None) + a("-i", "--qhi", default=None, type=float) + a("--bedouthi", default=None) + a("--bedoutlo", default=None) + a("--bedouthilo", default=None) + a("-w", "--bigwig", nargs="+") + a("-n", "--bigwiglabels", nargs="+") + a("-o", "--outbeds", default="outhilo", help="optional high and low combined bed") + a("-t", "--tableoutfile", default=None) args = parser.parse_args() - print('args=', args) - if not (args.bedouthilo or args.bedouthi or args.bedoutlo): - sys.stderr.write("bigwig_outlier_bed.py cannot usefully run - need a bed output choice - must be one of low only, high only or both combined") - sys.exit(2) - if not (args.qlo or args.qhi): - sys.stderr.write("bigwig_outlier_bed.py cannot usefully run - need one or both of quantile cutpoints qhi and qlo") - sys.exit(2) - restab = findOut(args) - if args.tableout: - with open(args.tableout, 'w') as tout: - tout.write('\n'.join(restab)) - tout.write('\n') - - + findOut(args)
