comparison ngsutils/bed/__init__.py @ 0:4e4e4093d65d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author iuc
date Wed, 11 Nov 2015 13:04:07 -0500
parents
children 7a68005de299
comparison
equal deleted inserted replaced
-1:000000000000 0:4e4e4093d65d
1 import os
2 import ngsutils.support.ngs_utils
3 import pysam
4
5
6 class BedStreamer(object):
7 '''
8 Streams BedRegions from a BED file
9
10 Note - this can only be used once! There is no mechanism to seek the stream.
11 '''
12
13 def __init__(self, fname=None, fileobj=None, quiet=False):
14 if not fname and not fileobj:
15 raise ValueError("You must specify either fname or fileobj!")
16
17 self.reader = ngsutils.support.gzip_reader(fname=fname, quiet=quiet, fileobj=fileobj)
18
19 def __iter__(self):
20 return self
21
22 def next(self):
23 try:
24 while True:
25 line = self.reader.next().strip()
26 if line and line[0] != '#':
27 cols = line.split('\t')
28 while len(cols) < 6:
29 cols.append('')
30
31 return BedRegion(*cols)
32 except:
33 raise StopIteration
34
35
36
37 class BedFile(object):
38 '''
39 BED files are read in their entirety memory, in a series of bins. Each bin
40 is ~100kb in size. Each bin can then be iterated over.
41
42 This is less efficient than using a proper index, but in reality, this
43 usually isn't an issue. However, if the BED file has been Tabix indexed,
44 that index will be used for random access.
45
46 NOTE: This isn't very efficient, so perhaps this can be remodeled into a BedFile
47 and a BedFileIndex where the file is indexed only if random access is requested.
48 '''
49
50 _bin_const = 100000
51
52 def __init__(self, fname=None, fileobj=None, region=None):
53 self._bins = {}
54 self._bin_list = []
55 self._cur_bin_idx = 0
56 self._cur_bin_pos = 0
57 self._tellpos = 0
58 self._total = 0
59 self._length = 0
60 self.__tabix = None
61
62 self.filename = fname
63
64 if os.path.exists('%s.tbi' % fname):
65 self.__tabix = pysam.Tabixfile(fname)
66
67 if fileobj:
68 self.__readfile(fileobj)
69 elif fname:
70 with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj:
71 self.__readfile(fobj)
72 elif region:
73 chrom, startend = region.split(':')
74 if '-' in startend:
75 start, end = [int(x) for x in startend.split('-')]
76 else:
77 start = int(startend)
78 end = start
79 start -= 1
80
81 self.__add_region(BedRegion(chrom, start, end))
82 else:
83 raise ValueError("Must specify either filename, fileobj, or region")
84
85 def __readfile(self, fobj):
86 for line in fobj:
87 line = line.strip()
88 if line and line[0] != '#':
89 cols = line.split('\t')
90 while len(cols) < 6:
91 cols.append('')
92
93 region = BedRegion(*cols)
94 self.__add_region(region)
95
96 self._bin_list.sort()
97 for bin in self._bins:
98 self._bins[bin].sort()
99
100 def __add_region(self, region):
101 self._total += region.end - region.start
102 self._length += 1
103
104 startbin = region.start / BedFile._bin_const
105 endbin = region.end / BedFile._bin_const
106
107 for bin in xrange(startbin, endbin + 1):
108 if not (region.chrom, bin) in self._bins:
109 self._bin_list.append((region.chrom, bin))
110 self._bins[(region.chrom, bin)] = []
111 self._bins[(region.chrom, bin)].append(region)
112
113 def fetch(self, chrom, start, end, strand=None):
114 '''
115 For TABIX indexed BED files, find all regions w/in a range
116
117 For non-TABIX index BED files, use the calculated bins, and
118 output matching regions
119 '''
120
121 if self.__tabix:
122 for match in self.__tabix.fetch(chrom, start, end):
123 region = BedRegion(*match.split('\t'))
124 if not strand or (strand and region.strand == strand):
125 yield region
126 else:
127 startbin = start / BedFile._bin_const
128 endbin = end / BedFile._bin_const
129
130 buf = set()
131
132 for bin in xrange(startbin, endbin + 1):
133 if (chrom, bin) in self._bins:
134 for region in self._bins[(chrom, bin)]:
135 if strand and strand != region.strand:
136 continue
137 if start <= region.start <= end or start <= region.end <= end:
138 if not region in buf:
139 yield region
140 buf.add(region)
141 elif region.start < start and region.end > end:
142 if not region in buf:
143 yield region
144 buf.add(region)
145
146 def tell(self):
147 return self._tellpos
148
149 def close(self):
150 pass
151
152 @property
153 def length(self):
154 return self._length
155
156 @property
157 def total(self):
158 return self._total
159
160 def __iter__(self):
161 self._cur_bin_idx = 0
162 self._cur_bin_pos = 0
163 self._tellpos = 0
164 return self
165
166 def next(self):
167 if self._cur_bin_idx >= len(self._bin_list):
168 raise StopIteration
169
170 binvals = self._bins[self._bin_list[self._cur_bin_idx]]
171 while self._cur_bin_pos < len(binvals):
172 val = binvals[self._cur_bin_pos]
173 self._cur_bin_pos += 1
174
175 startbin = (val.chrom, val.start / BedFile._bin_const)
176 if startbin == self._bin_list[self._cur_bin_idx]:
177 self._tellpos += 1
178 return val
179
180 self._cur_bin_idx += 1
181 self._cur_bin_pos = 0
182 return self.next()
183
184
185 class BedRegion(object):
186 def __init__(self, chrom, start, end, name='', score='', strand='', thickStart='', thickEnd='', rgb='', *args):
187 self.chrom = chrom
188 self.start = int(start)
189 self.end = int(end)
190 self.name = name
191
192 if score == '':
193 self.score = 0
194 else:
195 self.score = float(score)
196
197 if strand == '':
198 self.strand = None
199 else:
200 self.strand = strand
201
202 if thickStart == '':
203 self.thickStart = None
204 else:
205 self.thickStart = thickStart
206
207 if thickEnd == '':
208 self.thickEnd = None
209 else:
210 self.thickEnd = thickEnd
211
212 if rgb == '':
213 self.rgb = None
214 else:
215 self.rgb = rgb
216
217 self.extras = args
218
219 def clone(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, rgb=None, *args):
220 cols = []
221 cols.append(self.chrom if chrom is None else chrom)
222 cols.append(self.start if start is None else start)
223 cols.append(self.end if end is None else end)
224 cols.append(self.name if name is None else name)
225 cols.append(self.score if score is None else score)
226 cols.append(self.strand if strand is None else strand)
227 cols.append(self.thickStart if thickStart is None else thickStart)
228 cols.append(self.thickEnd if thickEnd is None else thickEnd)
229 cols.append(self.rgb if rgb is None else rgb)
230
231 for i, val in enumerate(self.extras):
232 if len(args) > i:
233 cols.append(args[i])
234 else:
235 cols.append(val)
236
237 return BedRegion(*cols)
238
239 @property
240 def score_int(self):
241 score = str(self.score)
242 if score[-2:] == '.0':
243 score = score[:-2]
244
245 return score
246
247 def __key(self):
248 return (self.chrom, self.start, self.end, self.strand, self.name)
249
250 def __lt__(self, other):
251 return self.__key() < other.__key()
252
253 def __gt__(self, other):
254 return self.__key() > other.__key()
255
256 def __eq__(self, other):
257 return self.__key() == other.__key()
258
259 def write(self, out):
260 out.write('%s\n' % self)
261
262 def __repr__(self):
263 outcols = []
264
265 if self.rgb:
266 outcols.append(self.rgb)
267 if self.thickEnd or outcols:
268 outcols.append(self.thickEnd if self.thickEnd else self.end)
269 if self.thickStart or outcols:
270 outcols.append(self.thickStart if self.thickStart else self.start)
271 if self.strand or outcols:
272 outcols.append(self.strand)
273 if self.score_int != '' or outcols:
274 outcols.append(self.score_int)
275 if self.name or outcols:
276 outcols.append(self.name)
277
278 outcols.append(self.end)
279 outcols.append(self.start)
280 outcols.append(self.chrom)
281
282 return '\t'. join([str(x) for x in outcols[::-1]])