diff bigwig_outlier_bed.py @ 6:eb17eb8a3658 draft

planemo upload commit 1baff96e75def9248afdcf21edec9bdc7ed42b1f-dirty
author fubar
date Tue, 23 Jul 2024 23:12:23 +0000
parents c71db540eb38
children c8e22efcaeda
line wrap: on
line diff
--- a/bigwig_outlier_bed.py	Mon Jul 01 05:02:05 2024 +0000
+++ b/bigwig_outlier_bed.py	Tue Jul 23 23:12:23 2024 +0000
@@ -1,58 +1,149 @@
 """
-Ross Lazarus June 2024 for VGP 
-
+Ross Lazarus June 2024 for VGP
 Bigwigs are great, but hard to reliably "see" small low coverage or small very high coverage regions.
 Colouring in JB2 tracks will need a new plugin, so this code will find bigwig regions above and below a chosen percentile point.
 0.99 and 0.01 work well in testing with a minimum span of 10 bp.
-
 Multiple bigwigs **with the same reference** can be combined - bed segments will be named appropriately
 Combining multiple references works but is silly because only display will rely on one reference so others will not be shown...
-
 Tricksy numpy method from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html
 takes about 95 seconds for a 17MB test wiggle
-
 JBrowse2 bed normally displays ignore the score, so could provide separate low/high bed file outputs as an option.
-
 Update june 30 2024: wrote a 'no-build' plugin for beds to display red/blue if >0/<0 so those are used for scores
 Bed interval naming must be short for JB2 but needs input bigwig name and (lo or hi).
-
 """
 
+import argparse
+import os
+import sys
+from pathlib import Path
 
-import argparse
 import numpy as np
 import pybigtools
-import sys
 
 
-from pathlib import Path
+class asciihist:
+
+    def __init__(
+        self,
+        data,
+        bins=10,
+        minmax=None,
+        str_tag="",
+        scale_output=80,
+        generate_only=True,
+    ):
+        """
+        https://gist.github.com/bgbg/608d9ef4fd75032731651257fe67fc81
+        Create an ASCII histogram from an interable of numbers.
+        Author: Boris Gorelik boris@gorelik.net. based on  http://econpy.googlecode.com/svn/trunk/pytrix/pytrix.py
+        License: MIT
+        """
+        self.data = data
+        self.minmax = minmax
+        self.str_tag = str_tag
+        self.bins = bins
+        self.generate_only = generate_only
+        self.scale_output = scale_output
+        self.itarray = np.asanyarray(self.data)
+        if self.minmax == "auto":
+            self.minmax = np.percentile(data, [5, 95])
+            if self.minmax[0] == self.minmax[1]:
+                # for very ugly distributions
+                self.minmax = None
+        if self.minmax is not None:
+            # discard values that are outside minmax range
+            mn = self.minmax[0]
+            mx = self.minmax[1]
+            self.itarray = self.itarray[self.itarray >= mn]
+            self.itarray = self.itarray[self.itarray <= mx]
 
-class findOut():
+    def draw(self):
+        values, counts = np.unique(self.data, return_counts=True)
+        if len(values) <= 20:
+            self.bins = len(values)
+        ret = []
+        if self.itarray.size:
+            total = len(self.itarray)
+            counts, cutoffs = np.histogram(self.itarray, bins=self.bins)
+            cutoffs = cutoffs[1:]
+            if self.str_tag:
+                self.str_tag = "%s " % self.str_tag
+            else:
+                self.str_tag = ""
+            if self.scale_output is not None:
+                scaled_counts = counts.astype(float) / counts.sum() * self.scale_output
+            else:
+                scaled_counts = counts
+            footerbar =  "{:s}{:s} |{:s} |".format(self.str_tag, "-" * 12, "-" * 12,)
+            if self.minmax is not None:
+                ret.append(
+                    "Trimmed to range (%s - %s)"
+                    % (str(self.minmax[0]), str(self.minmax[1]))
+                )
+            for cutoff, original_count, scaled_count in zip(
+                cutoffs, counts, scaled_counts
+            ):
+                ret.append(
+                    "{:s}{:>12.2f} |{:>12,d} | {:s}".format(
+                        self.str_tag, cutoff, original_count, "*" * int(scaled_count)
+                    )
+                )
+            ret.append(footerbar)
+            ret.append("{:s}{:>12s} |{:>12,d} |".format(self.str_tag, "N=", total))
+            ret.append(footerbar)
+            ret.append('')
+        else:
+            ret = []
+        if not self.generate_only:
+            for line in ret:
+                print(line)
+        ret = "\n".join(ret)
+        return ret
+
+
+class findOut:
 
     def __init__(self, args):
-        self.bwnames=args.bigwig
-        self.bwlabels=args.bigwiglabels
-        self.bedwin=args.minwin
-        self.qlo=args.qlo
-        self.qhi=args.qhi
-        self.bedouthilo=args.bedouthilo
-        self.bedouthi=args.bedouthi
-        self.bedoutlo=args.bedoutlo
-        self.tableout = args.tableout
+        self.bwnames = args.bigwig
+        self.bwlabels = args.bigwiglabels
+        self.bedwin = args.minwin
+        self.outbeds = args.outbeds
+        self.bedouthi = args.bedouthi
+        self.bedoutlo = args.bedoutlo
+        self.bedouthilo = args.bedouthilo
+        self.tableoutfile = args.tableoutfile
         self.bedwin = args.minwin
-        self.qhi = args.qhi
-        self.qlo = args.qlo
+        self.qlo = None
+        self.qhi = None
+        if args.outbeds != "outtab":
+            self.qhi = args.qhi
+            if args.qlo:
+                try:
+                    f = float(args.qlo)
+                    self.qlo = f
+                except Exception:
+                    print('qlo not provided')
+        nbw = len(args.bigwig)
+        nlab = len(args.bigwiglabels)
+        if nlab < nbw:
+            self.bwlabels += ["Nolabel"] * (nbw - nlab)
         self.makeBed()
 
     def processVals(self, bw, isTop):
-        # http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html
+        """
+        idea from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html
+        Fast segmentation into regions by taking np.diff on the boolean array of over (under) cutpoint indicators in bwex.
+        This only gives non-zero values at the segment boundaries where there's a change, so those zeros are all removed in bwexdnz
+        leaving an array of segment start/end positions. That's twisted around into an array of start/end coordinates.
+        Magical. Fast. Could do the same for means or medians over windows for sparse bigwigs like repeat regions.
+        """
         if isTop:
-            bwex = np.r_[False, bw >= self.bwtop, False] # extend with 0s
+            bwex = np.r_[False, bw >= self.bwtop, False]  # extend with 0s
         else:
             bwex = np.r_[False, bw <= self.bwbot, False]
         bwexd = np.diff(bwex)
         bwexdnz = bwexd.nonzero()[0]
-        bwregions = np.reshape(bwexdnz, (-1,2))
+        bwregions = np.reshape(bwexdnz, (-1, 2))
         return bwregions
 
     def writeBed(self, bed, bedfname):
@@ -60,89 +151,151 @@
         potentially multiple
         """
         bed.sort()
-        beds = ['%s\t%d\t%d\t%s\t%d' % x for x in bed]
+        beds = ["%s\t%d\t%d\t%s\t%d" % x for x in bed]
         with open(bedfname, "w") as bedf:
-            bedf.write('\n'.join(beds))
-            bedf.write('\n')
-        print('Wrote %d bed regions to %s' % (len(bed), bedfname))
-        
+            bedf.write("\n".join(beds))
+            bedf.write("\n")
+
+    def makeTableRow(self, bw, bwlabel, chr):
+        """
+        called for every contig, but messy inline
+        """
+        bwmean = np.mean(bw)
+        bwstd = np.std(bw)
+        bwmax = np.max(bw)
+        nrow = np.size(bw)
+        bwmin = np.min(bw)
+        row = "%s\t%s\t%d\t%f\t%f\t%f\t%f" % (
+            bwlabel,
+            chr,
+            nrow,
+            bwmean,
+            bwstd,
+            bwmin,
+            bwmax,
+        )
+        if self.qhi is not None:
+            row += "\t%.2f" % self.bwtop
+        else:
+            row += "\tnoqhi"
+        if self.qlo is not None:
+            row += "\t%.2f" % self.bwbot
+        else:
+            row += "\tnoqlo"
+        return row
+
     def makeBed(self):
         bedhi = []
         bedlo = []
+        restab = []
         bwlabels = self.bwlabels
         bwnames = self.bwnames
-        print('bwnames=', bwnames, "bwlabs=", bwlabels)
+        reshead =  "bigwig\tcontig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot"
         for i, bwname in enumerate(bwnames):
-            bwlabel = bwlabels[i].replace(" ",'')
-            p = Path('in.bw')
-            p.symlink_to( bwname ) # required by pybigtools (!)
-            bwf = pybigtools.open('in.bw')
+            bwlabel = bwlabels[i].replace(" ", "")
+            fakepath = "in%d.bw" % i
+            if os.path.isfile(fakepath):
+                os.remove(fakepath)
+            p = Path(fakepath)
+            p.symlink_to(bwname)  # required by pybigtools (!)
+            bwf = pybigtools.open(fakepath)
             chrlist = bwf.chroms()
             chrs = list(chrlist.keys())
-            chrs.sort()
-            restab = ["contig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot"]
             for chr in chrs:
+                first_few = None
                 bw = bwf.values(chr)
-                bw = bw[~np.isnan(bw)] # some have NaN if parts of a contig not covered
+                values, counts = np.unique(bw, return_counts=True)
+                nvalues = len(values)
+                if nvalues <= 20:
+                    histo = '\n'.join(['%s: %f occurs %d times' % (chr, values[x], counts[x]) for x in range(len(values))])
+                else:
+                    last10 = range(nvalues-10, nvalues)
+                    first_few = ['%.2f\t%d' % (values[x],counts[x]) for x in range(10)]
+                    first_few += ['%.2f\t%d' % (values[x],counts[x]) for x in last10]
+                    first_few.insert(0,'First/Last 10 value counts\nValue\tCount')
+                    ha = asciihist(data=bw, bins=20, str_tag=chr)
+                    histo = ha.draw()
+                    histo = '\n'.join(first_few) + '\nHistogram of bigwig values\n' + histo
+                bw = bw[~np.isnan(bw)]  # some have NaN if parts of a contig not covered
                 if self.qhi is not None:
                     self.bwtop = np.quantile(bw, self.qhi)
                     bwhi = self.processVals(bw, isTop=True)
-                    for i, seg in enumerate(bwhi):
-                        if seg[1] - seg[0] >= self.bedwin:
-                            bedhi.append((chr, seg[0], seg[1], '%s_hi' % (bwlabel), 1))
+                    for j, seg in enumerate(bwhi):
+                        seglen = seg[1] - seg[0]
+                        if seglen >= self.bedwin:
+                            score = np.sum(bw[seg[0]:seg[1]])/float(seglen)
+                            bedhi.append(
+                                (
+                                    chr,
+                                    seg[0],
+                                    seg[1],
+                                    "%s_%d" % (bwlabel, score),
+                                    score,
+                                )
+                            )
                 if self.qlo is not None:
                     self.bwbot = np.quantile(bw, self.qlo)
-                    bwlo = self.processVals(bw, isTop=False)            
-                    for i, seg in enumerate(bwlo):
+                    bwlo = self.processVals(bw, isTop=False)
+                    for j, seg in enumerate(bwlo):
                         if seg[1] - seg[0] >= self.bedwin:
-                            bedlo.append((chr, seg[0], seg[1], '%s_lo' % (bwlabel), -1))
-                bwmean = np.mean(bw)
-                bwstd = np.std(bw)
-                bwmax = np.max(bw)
-                nrow = np.size(bw)
-                bwmin = np.min(bw)
-                restab.append('%s\t%d\t%f\t%f\t%f\t%f\t%f\t%f' % (chr,nrow,bwmean,bwstd,bwmin,bwmax,self.bwtop,self.bwbot))        
-        print('\n'.join(restab), '\n')
-        if self.tableout:
-            with open(self.tableout) as t:
-                t.write('\n'.join(restab))
-                t.write('\n')
-        if self.bedoutlo:
-            if self.qlo:
+                            score = -1 * np.sum(bw[seg[0]:seg[1]])/float(seglen)
+                            bedlo.append(
+                                (
+                                    chr,
+                                    seg[0],
+                                    seg[1],
+                                    "%s_%d" % (bwlabel, score),
+                                    score,
+                                )
+                            )
+                if self.tableoutfile:
+                    row = self.makeTableRow(bw, bwlabel, chr)
+                    resheadl = reshead.split('\t')
+                    rowl = row.split()
+                    desc = ['%s\t%s' % (resheadl[x], rowl[x]) for x in range(len(rowl))]
+                    desc.insert(0, 'Descriptive measures')
+                    descn = '\n'.join(desc)
+                    restab.append(descn)
+                    restab.append(histo)
+        if os.path.isfile(fakepath):
+            os.remove(fakepath)
+        if self.tableoutfile:
+            stable = "\n".join(restab)
+            with open(self.tableoutfile, "w") as t:
+                t.write(stable)
+                t.write("\n")
+        some = False
+        if self.qlo:
+            if self.outbeds in ["outall", "outlo", "outlohi"]:
                 self.writeBed(bedlo, self.bedoutlo)
-        if self.bedouthi:
-            if self.qhi:
+                some = True
+        if self.qhi:
+            if self.outbeds in ["outall", "outlohi", "outhi"]:
                 self.writeBed(bedhi, self.bedouthi)
-        if self.bedouthilo:
+                some = True
+        if self.outbeds in ["outall", "outhilo"]:
             allbed = bedlo + bedhi
             self.writeBed(allbed, self.bedouthilo)
-        return restab
-    
+            some = True
+        if not ((self.outbeds == 'outtab') or some):
+            sys.stderr.write(
+                "Invalid configuration - no output could be created. Was qlo missing and only low output requested for example?"
+            )
+            sys.exit(2)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     a = parser.add_argument
-    a('-m', '--minwin',default=10, type=int)
-    a('-l', '--qlo',default=None, type=float)
-    a('-i', '--qhi',default=None, type=float)
-    a('-w', '--bigwig', nargs='+')
-    a('-n', '--bigwiglabels', nargs='+')
-    a('-o', '--bedouthilo', default=None, help="optional high and low combined bed")
-    a('-u', '--bedouthi', default=None, help="optional high only bed")
-    a('-b', '--bedoutlo', default=None, help="optional low only bed")
-    a('-t', '--tableout', default=None)
+    a("-m", "--minwin", default=10, type=int)
+    a("-l", "--qlo", default=None)
+    a("-i", "--qhi", default=None, type=float)
+    a("--bedouthi", default=None)
+    a("--bedoutlo", default=None)
+    a("--bedouthilo", default=None)
+    a("-w", "--bigwig", nargs="+")
+    a("-n", "--bigwiglabels", nargs="+")
+    a("-o", "--outbeds", default="outhilo", help="optional high and low combined bed")
+    a("-t", "--tableoutfile", default=None)
     args = parser.parse_args()
-    print('args=', args)
-    if not (args.bedouthilo or args.bedouthi or args.bedoutlo):
-        sys.stderr.write("bigwig_outlier_bed.py cannot usefully run - need a bed output choice - must be one of low only, high only or both combined")
-        sys.exit(2)
-    if not (args.qlo or args.qhi):
-        sys.stderr.write("bigwig_outlier_bed.py cannot usefully run - need one or both of quantile cutpoints qhi and qlo")
-        sys.exit(2)
-    restab = findOut(args)
-    if args.tableout:
-        with open(args.tableout, 'w') as tout:
-            tout.write('\n'.join(restab))
-            tout.write('\n')
-    
-
+    findOut(args)