diff ngsutils/bed/__init__.py @ 0:4e4e4093d65d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author iuc
date Wed, 11 Nov 2015 13:04:07 -0500
parents
children 7a68005de299
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/bed/__init__.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,282 @@
+import os
+import ngsutils.support.ngs_utils
+import pysam
+
+
+class BedStreamer(object):
+    '''
+    Streams BedRegions from a BED file
+
+    Note - this can only be used once! There is no mechanism to seek the stream.
+    '''
+
+    def __init__(self, fname=None, fileobj=None, quiet=False):
+        if not fname and not fileobj:
+            raise ValueError("You must specify either fname or fileobj!")
+
+        self.reader = ngsutils.support.gzip_reader(fname=fname, quiet=quiet, fileobj=fileobj)
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        try:
+            while True:
+                line = self.reader.next().strip()
+                if line and line[0] != '#':
+                    cols = line.split('\t')
+                    while len(cols) < 6:
+                        cols.append('')
+
+                    return BedRegion(*cols)
+        except:
+            raise StopIteration
+
+
+
+class BedFile(object):
+    '''
+    BED files are read in their entirety memory, in a series of bins. Each bin
+    is ~100kb in size. Each bin can then be iterated over.
+
+    This is less efficient than using a proper index, but in reality, this
+    usually isn't an issue. However, if the BED file has been Tabix indexed,
+    that index will be used for random access.
+
+    NOTE: This isn't very efficient, so perhaps this can be remodeled into a BedFile
+    and a BedFileIndex where the file is indexed only if random access is requested.
+    '''
+
+    _bin_const = 100000
+
+    def __init__(self, fname=None, fileobj=None, region=None):
+        self._bins = {}
+        self._bin_list = []
+        self._cur_bin_idx = 0
+        self._cur_bin_pos = 0
+        self._tellpos = 0
+        self._total = 0
+        self._length = 0
+        self.__tabix = None
+
+        self.filename = fname
+
+        if os.path.exists('%s.tbi' % fname):
+            self.__tabix = pysam.Tabixfile(fname)
+
+        if fileobj:
+            self.__readfile(fileobj)
+        elif fname:
+            with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj:
+                self.__readfile(fobj)
+        elif region:
+            chrom, startend = region.split(':')
+            if '-' in startend:
+                start, end = [int(x) for x in startend.split('-')]
+            else:
+                start = int(startend)
+                end = start
+            start -= 1
+
+            self.__add_region(BedRegion(chrom, start, end))
+        else:
+            raise ValueError("Must specify either filename, fileobj, or region")
+
+    def __readfile(self, fobj):
+        for line in fobj:
+            line = line.strip()
+            if line and line[0] != '#':
+                cols = line.split('\t')
+                while len(cols) < 6:
+                    cols.append('')
+
+                region = BedRegion(*cols)
+                self.__add_region(region)
+
+        self._bin_list.sort()
+        for bin in self._bins:
+            self._bins[bin].sort()
+
+    def __add_region(self, region):
+        self._total += region.end - region.start
+        self._length += 1
+
+        startbin = region.start / BedFile._bin_const
+        endbin = region.end / BedFile._bin_const
+
+        for bin in xrange(startbin, endbin + 1):
+            if not (region.chrom, bin) in self._bins:
+                self._bin_list.append((region.chrom, bin))
+                self._bins[(region.chrom, bin)] = []
+            self._bins[(region.chrom, bin)].append(region)
+
+    def fetch(self, chrom, start, end, strand=None):
+        '''
+        For TABIX indexed BED files, find all regions w/in a range
+
+        For non-TABIX index BED files, use the calculated bins, and
+        output matching regions
+        '''
+
+        if self.__tabix:
+            for match in self.__tabix.fetch(chrom, start, end):
+                region = BedRegion(*match.split('\t'))
+                if not strand or (strand and region.strand == strand):
+                    yield region
+        else:
+            startbin = start / BedFile._bin_const
+            endbin = end / BedFile._bin_const
+
+            buf = set()
+
+            for bin in xrange(startbin, endbin + 1):
+                if (chrom, bin) in self._bins:
+                    for region in self._bins[(chrom, bin)]:
+                        if strand and strand != region.strand:
+                            continue
+                        if start <= region.start <= end or start <= region.end <= end:
+                            if not region in buf:
+                                yield region
+                                buf.add(region)
+                        elif region.start < start and region.end > end:
+                            if not region in buf:
+                                yield region
+                                buf.add(region)
+
+    def tell(self):
+        return self._tellpos
+
+    def close(self):
+        pass
+
+    @property
+    def length(self):
+        return self._length
+
+    @property
+    def total(self):
+        return self._total
+
+    def __iter__(self):
+        self._cur_bin_idx = 0
+        self._cur_bin_pos = 0
+        self._tellpos = 0
+        return self
+
+    def next(self):
+        if self._cur_bin_idx >= len(self._bin_list):
+            raise StopIteration
+
+        binvals = self._bins[self._bin_list[self._cur_bin_idx]]
+        while self._cur_bin_pos < len(binvals):
+            val = binvals[self._cur_bin_pos]
+            self._cur_bin_pos += 1
+
+            startbin = (val.chrom, val.start / BedFile._bin_const)
+            if startbin == self._bin_list[self._cur_bin_idx]:
+                self._tellpos += 1
+                return val
+
+        self._cur_bin_idx += 1
+        self._cur_bin_pos = 0
+        return self.next()
+
+
+class BedRegion(object):
+    def __init__(self, chrom, start, end, name='', score='', strand='', thickStart='', thickEnd='', rgb='', *args):
+        self.chrom = chrom
+        self.start = int(start)
+        self.end = int(end)
+        self.name = name
+
+        if score == '':
+            self.score = 0
+        else:
+            self.score = float(score)
+
+        if strand == '':
+            self.strand = None
+        else:
+            self.strand = strand
+
+        if thickStart == '':
+            self.thickStart = None
+        else:
+            self.thickStart = thickStart
+
+        if thickEnd == '':
+            self.thickEnd = None
+        else:
+            self.thickEnd = thickEnd
+
+        if rgb == '':
+            self.rgb = None
+        else:
+            self.rgb = rgb
+
+        self.extras = args
+
+    def clone(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, rgb=None, *args):
+        cols = []
+        cols.append(self.chrom if chrom is None else chrom)
+        cols.append(self.start if start is None else start)
+        cols.append(self.end if end is None else end)
+        cols.append(self.name if name is None else name)
+        cols.append(self.score if score is None else score)
+        cols.append(self.strand if strand is None else strand)
+        cols.append(self.thickStart if thickStart is None else thickStart)
+        cols.append(self.thickEnd if thickEnd is None else thickEnd)
+        cols.append(self.rgb if rgb is None else rgb)
+
+        for i, val in enumerate(self.extras):
+            if len(args) > i:
+                cols.append(args[i])
+            else:
+                cols.append(val)
+
+        return BedRegion(*cols)
+
+    @property
+    def score_int(self):
+        score = str(self.score)
+        if score[-2:] == '.0':
+            score = score[:-2]
+
+        return score
+
+    def __key(self):
+        return (self.chrom, self.start, self.end, self.strand, self.name)
+
+    def __lt__(self, other):
+        return self.__key() < other.__key()
+
+    def __gt__(self, other):
+        return self.__key() > other.__key()
+
+    def __eq__(self, other):
+        return self.__key() == other.__key()
+
+    def write(self, out):
+        out.write('%s\n' % self)
+
+    def __repr__(self):
+        outcols = []
+
+        if self.rgb:
+            outcols.append(self.rgb)
+        if self.thickEnd or outcols:
+            outcols.append(self.thickEnd if self.thickEnd else self.end)
+        if self.thickStart or outcols:
+            outcols.append(self.thickStart if self.thickStart else self.start)
+        if self.strand or outcols:
+            outcols.append(self.strand)
+        if self.score_int != '' or outcols:
+            outcols.append(self.score_int)
+        if self.name or outcols:
+            outcols.append(self.name)
+
+        outcols.append(self.end)
+        outcols.append(self.start)
+        outcols.append(self.chrom)
+
+        return '\t'. join([str(x) for x in outcols[::-1]])