Mercurial > repos > iuc > ngsutils_bam_filter
comparison ngsutils/bed/__init__.py @ 0:4e4e4093d65d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author | iuc |
---|---|
date | Wed, 11 Nov 2015 13:04:07 -0500 |
parents | |
children | 7a68005de299 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4e4e4093d65d |
---|---|
1 import os | |
2 import ngsutils.support.ngs_utils | |
3 import pysam | |
4 | |
5 | |
6 class BedStreamer(object): | |
7 ''' | |
8 Streams BedRegions from a BED file | |
9 | |
10 Note - this can only be used once! There is no mechanism to seek the stream. | |
11 ''' | |
12 | |
13 def __init__(self, fname=None, fileobj=None, quiet=False): | |
14 if not fname and not fileobj: | |
15 raise ValueError("You must specify either fname or fileobj!") | |
16 | |
17 self.reader = ngsutils.support.gzip_reader(fname=fname, quiet=quiet, fileobj=fileobj) | |
18 | |
19 def __iter__(self): | |
20 return self | |
21 | |
22 def next(self): | |
23 try: | |
24 while True: | |
25 line = self.reader.next().strip() | |
26 if line and line[0] != '#': | |
27 cols = line.split('\t') | |
28 while len(cols) < 6: | |
29 cols.append('') | |
30 | |
31 return BedRegion(*cols) | |
32 except: | |
33 raise StopIteration | |
34 | |
35 | |
36 | |
37 class BedFile(object): | |
38 ''' | |
39 BED files are read in their entirety memory, in a series of bins. Each bin | |
40 is ~100kb in size. Each bin can then be iterated over. | |
41 | |
42 This is less efficient than using a proper index, but in reality, this | |
43 usually isn't an issue. However, if the BED file has been Tabix indexed, | |
44 that index will be used for random access. | |
45 | |
46 NOTE: This isn't very efficient, so perhaps this can be remodeled into a BedFile | |
47 and a BedFileIndex where the file is indexed only if random access is requested. | |
48 ''' | |
49 | |
50 _bin_const = 100000 | |
51 | |
52 def __init__(self, fname=None, fileobj=None, region=None): | |
53 self._bins = {} | |
54 self._bin_list = [] | |
55 self._cur_bin_idx = 0 | |
56 self._cur_bin_pos = 0 | |
57 self._tellpos = 0 | |
58 self._total = 0 | |
59 self._length = 0 | |
60 self.__tabix = None | |
61 | |
62 self.filename = fname | |
63 | |
64 if os.path.exists('%s.tbi' % fname): | |
65 self.__tabix = pysam.Tabixfile(fname) | |
66 | |
67 if fileobj: | |
68 self.__readfile(fileobj) | |
69 elif fname: | |
70 with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj: | |
71 self.__readfile(fobj) | |
72 elif region: | |
73 chrom, startend = region.split(':') | |
74 if '-' in startend: | |
75 start, end = [int(x) for x in startend.split('-')] | |
76 else: | |
77 start = int(startend) | |
78 end = start | |
79 start -= 1 | |
80 | |
81 self.__add_region(BedRegion(chrom, start, end)) | |
82 else: | |
83 raise ValueError("Must specify either filename, fileobj, or region") | |
84 | |
85 def __readfile(self, fobj): | |
86 for line in fobj: | |
87 line = line.strip() | |
88 if line and line[0] != '#': | |
89 cols = line.split('\t') | |
90 while len(cols) < 6: | |
91 cols.append('') | |
92 | |
93 region = BedRegion(*cols) | |
94 self.__add_region(region) | |
95 | |
96 self._bin_list.sort() | |
97 for bin in self._bins: | |
98 self._bins[bin].sort() | |
99 | |
100 def __add_region(self, region): | |
101 self._total += region.end - region.start | |
102 self._length += 1 | |
103 | |
104 startbin = region.start / BedFile._bin_const | |
105 endbin = region.end / BedFile._bin_const | |
106 | |
107 for bin in xrange(startbin, endbin + 1): | |
108 if not (region.chrom, bin) in self._bins: | |
109 self._bin_list.append((region.chrom, bin)) | |
110 self._bins[(region.chrom, bin)] = [] | |
111 self._bins[(region.chrom, bin)].append(region) | |
112 | |
113 def fetch(self, chrom, start, end, strand=None): | |
114 ''' | |
115 For TABIX indexed BED files, find all regions w/in a range | |
116 | |
117 For non-TABIX index BED files, use the calculated bins, and | |
118 output matching regions | |
119 ''' | |
120 | |
121 if self.__tabix: | |
122 for match in self.__tabix.fetch(chrom, start, end): | |
123 region = BedRegion(*match.split('\t')) | |
124 if not strand or (strand and region.strand == strand): | |
125 yield region | |
126 else: | |
127 startbin = start / BedFile._bin_const | |
128 endbin = end / BedFile._bin_const | |
129 | |
130 buf = set() | |
131 | |
132 for bin in xrange(startbin, endbin + 1): | |
133 if (chrom, bin) in self._bins: | |
134 for region in self._bins[(chrom, bin)]: | |
135 if strand and strand != region.strand: | |
136 continue | |
137 if start <= region.start <= end or start <= region.end <= end: | |
138 if not region in buf: | |
139 yield region | |
140 buf.add(region) | |
141 elif region.start < start and region.end > end: | |
142 if not region in buf: | |
143 yield region | |
144 buf.add(region) | |
145 | |
146 def tell(self): | |
147 return self._tellpos | |
148 | |
149 def close(self): | |
150 pass | |
151 | |
152 @property | |
153 def length(self): | |
154 return self._length | |
155 | |
156 @property | |
157 def total(self): | |
158 return self._total | |
159 | |
160 def __iter__(self): | |
161 self._cur_bin_idx = 0 | |
162 self._cur_bin_pos = 0 | |
163 self._tellpos = 0 | |
164 return self | |
165 | |
166 def next(self): | |
167 if self._cur_bin_idx >= len(self._bin_list): | |
168 raise StopIteration | |
169 | |
170 binvals = self._bins[self._bin_list[self._cur_bin_idx]] | |
171 while self._cur_bin_pos < len(binvals): | |
172 val = binvals[self._cur_bin_pos] | |
173 self._cur_bin_pos += 1 | |
174 | |
175 startbin = (val.chrom, val.start / BedFile._bin_const) | |
176 if startbin == self._bin_list[self._cur_bin_idx]: | |
177 self._tellpos += 1 | |
178 return val | |
179 | |
180 self._cur_bin_idx += 1 | |
181 self._cur_bin_pos = 0 | |
182 return self.next() | |
183 | |
184 | |
185 class BedRegion(object): | |
186 def __init__(self, chrom, start, end, name='', score='', strand='', thickStart='', thickEnd='', rgb='', *args): | |
187 self.chrom = chrom | |
188 self.start = int(start) | |
189 self.end = int(end) | |
190 self.name = name | |
191 | |
192 if score == '': | |
193 self.score = 0 | |
194 else: | |
195 self.score = float(score) | |
196 | |
197 if strand == '': | |
198 self.strand = None | |
199 else: | |
200 self.strand = strand | |
201 | |
202 if thickStart == '': | |
203 self.thickStart = None | |
204 else: | |
205 self.thickStart = thickStart | |
206 | |
207 if thickEnd == '': | |
208 self.thickEnd = None | |
209 else: | |
210 self.thickEnd = thickEnd | |
211 | |
212 if rgb == '': | |
213 self.rgb = None | |
214 else: | |
215 self.rgb = rgb | |
216 | |
217 self.extras = args | |
218 | |
219 def clone(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, rgb=None, *args): | |
220 cols = [] | |
221 cols.append(self.chrom if chrom is None else chrom) | |
222 cols.append(self.start if start is None else start) | |
223 cols.append(self.end if end is None else end) | |
224 cols.append(self.name if name is None else name) | |
225 cols.append(self.score if score is None else score) | |
226 cols.append(self.strand if strand is None else strand) | |
227 cols.append(self.thickStart if thickStart is None else thickStart) | |
228 cols.append(self.thickEnd if thickEnd is None else thickEnd) | |
229 cols.append(self.rgb if rgb is None else rgb) | |
230 | |
231 for i, val in enumerate(self.extras): | |
232 if len(args) > i: | |
233 cols.append(args[i]) | |
234 else: | |
235 cols.append(val) | |
236 | |
237 return BedRegion(*cols) | |
238 | |
239 @property | |
240 def score_int(self): | |
241 score = str(self.score) | |
242 if score[-2:] == '.0': | |
243 score = score[:-2] | |
244 | |
245 return score | |
246 | |
247 def __key(self): | |
248 return (self.chrom, self.start, self.end, self.strand, self.name) | |
249 | |
250 def __lt__(self, other): | |
251 return self.__key() < other.__key() | |
252 | |
253 def __gt__(self, other): | |
254 return self.__key() > other.__key() | |
255 | |
256 def __eq__(self, other): | |
257 return self.__key() == other.__key() | |
258 | |
259 def write(self, out): | |
260 out.write('%s\n' % self) | |
261 | |
262 def __repr__(self): | |
263 outcols = [] | |
264 | |
265 if self.rgb: | |
266 outcols.append(self.rgb) | |
267 if self.thickEnd or outcols: | |
268 outcols.append(self.thickEnd if self.thickEnd else self.end) | |
269 if self.thickStart or outcols: | |
270 outcols.append(self.thickStart if self.thickStart else self.start) | |
271 if self.strand or outcols: | |
272 outcols.append(self.strand) | |
273 if self.score_int != '' or outcols: | |
274 outcols.append(self.score_int) | |
275 if self.name or outcols: | |
276 outcols.append(self.name) | |
277 | |
278 outcols.append(self.end) | |
279 outcols.append(self.start) | |
280 outcols.append(self.chrom) | |
281 | |
282 return '\t'. join([str(x) for x in outcols[::-1]]) |