Mercurial > repos > iuc > genetrack
comparison genetrack_util.py @ 0:25cd59a002d9 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/genetrack commit e96df94dba60050fa28aaf55b5bb095717a5f260
author | iuc |
---|---|
date | Tue, 22 Dec 2015 17:03:27 -0500 |
parents | |
children | df7ac50ade5d |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:25cd59a002d9 |
---|---|
1 import bisect | |
2 import math | |
3 import numpy | |
4 import re | |
5 import subprocess | |
6 import sys | |
7 import tempfile | |
8 | |
9 GFF_EXT = 'gff' | |
10 SCIDX_EXT = 'scidx' | |
11 | |
12 | |
13 def noop(data): | |
14 return data | |
15 | |
16 | |
17 def zeropad_to_numeric(data): | |
18 return re.sub(r'chr0(\d)', r'chr\1', data) | |
19 | |
20 | |
21 def numeric_to_zeropad(data): | |
22 return re.sub(r'chr(\d([^\d]|$))', r'chr0\1', data) | |
23 | |
24 | |
25 FORMATS = ['zeropad', 'numeric'] | |
26 IN_CONVERT = {'zeropad': zeropad_to_numeric, 'numeric': noop} | |
27 OUT_CONVERT = {'zeropad': numeric_to_zeropad, 'numeric': noop} | |
28 | |
29 | |
30 def conversion_functions(in_fmt, out_fmt): | |
31 """ | |
32 Returns the proper list of functions to apply to perform a conversion | |
33 """ | |
34 return [IN_CONVERT[in_fmt], OUT_CONVERT[out_fmt]] | |
35 | |
36 | |
37 def convert_data(data, in_fmt, out_fmt): | |
38 for fn in conversion_functions(in_fmt, out_fmt): | |
39 data = fn(data) | |
40 return data | |
41 | |
42 | |
43 class ChromosomeManager(object): | |
44 """ | |
45 Manages a CSV reader of an index file to only load one chrom at a time | |
46 """ | |
47 | |
48 def __init__(self, reader): | |
49 self.done = False | |
50 self.reader = reader | |
51 self.processed_chromosomes = [] | |
52 self.current_index = 0 | |
53 self.next_valid() | |
54 | |
55 def next(self): | |
56 self.line = self.reader.next() | |
57 | |
58 def is_valid(self, line): | |
59 if len(line) not in [4, 5, 9]: | |
60 return False | |
61 try: | |
62 [int(i) for i in line[1:]] | |
63 self.format = SCIDX_EXT | |
64 return True | |
65 except ValueError: | |
66 try: | |
67 if len(line) < 6: | |
68 return False | |
69 [int(line[4]), int(line[5])] | |
70 self.format = GFF_EXT | |
71 return True | |
72 except ValueError: | |
73 return False | |
74 | |
75 def next_valid(self): | |
76 """ | |
77 Advance to the next valid line in the reader | |
78 """ | |
79 self.line = self.reader.next() | |
80 s = 0 | |
81 while not self.is_valid(self.line): | |
82 self.line = self.reader.next() | |
83 s += 1 | |
84 if s > 0: | |
85 # Skip initial line(s) of file | |
86 pass | |
87 | |
88 def parse_line(self, line): | |
89 if self.format == SCIDX_EXT: | |
90 return [int(line[1]), int(line[2]), int(line[3])] | |
91 else: | |
92 return [int(line[3]), line[6], line[5]] | |
93 | |
94 def chromosome_name(self): | |
95 """ | |
96 Return the name of the chromosome about to be loaded | |
97 """ | |
98 return self.line[0] | |
99 | |
100 def load_chromosome(self, collect_data=True): | |
101 """ | |
102 Load the current chromosome into an array and return it | |
103 """ | |
104 cname = self.chromosome_name() | |
105 if cname in self.processed_chromosomes: | |
106 stop_err('File is not grouped by chromosome') | |
107 self.data = [] | |
108 while self.line[0] == cname: | |
109 if collect_data: | |
110 read = self.parse_line(self.line) | |
111 if read[0] < self.current_index: | |
112 msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (cname, self.current_index) | |
113 stop_err(msg) | |
114 self.current_index = read[0] | |
115 self.add_read(read) | |
116 try: | |
117 self.next() | |
118 except StopIteration: | |
119 self.done = True | |
120 break | |
121 self.processed_chromosomes.append(cname) | |
122 self.current_index = 0 | |
123 data = self.data | |
124 # Don't retain reference anymore to save memory | |
125 del self.data | |
126 return data | |
127 | |
128 def add_read(self, read): | |
129 if self.format == SCIDX_EXT: | |
130 self.data.append(read) | |
131 else: | |
132 index, strand, value = read | |
133 if value == '' or value == '.': | |
134 value = 1 | |
135 else: | |
136 value = int(value) | |
137 if not self.data: | |
138 self.data.append([index, 0, 0]) | |
139 current_read = self.data[-1] | |
140 if self.data[-1][0] == index: | |
141 current_read = self.data[-1] | |
142 elif self.data[-1][0] < index: | |
143 self.data.append([index, 0, 0]) | |
144 current_read = self.data[-1] | |
145 else: | |
146 msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (self.chromosome_name(), index) | |
147 stop_err(msg) | |
148 if strand == '+': | |
149 current_read[1] += value | |
150 elif strand == '-': | |
151 current_read[2] += value | |
152 else: | |
153 msg = 'Strand "%s" at chromosome "%s" index %d is not valid.' % (strand, self.chromosome_name(), index) | |
154 stop_err(msg) | |
155 | |
156 def skip_chromosome(self): | |
157 """ | |
158 Skip the current chromosome, discarding data | |
159 """ | |
160 self.load_chromosome(collect_data=False) | |
161 | |
162 | |
163 class Peak(object): | |
164 def __init__(self, index, pos_width, neg_width): | |
165 self.index = index | |
166 self.start = index - neg_width | |
167 self.end = index + pos_width | |
168 self.value = 0 | |
169 self.deleted = False | |
170 self.safe = False | |
171 | |
172 def __repr__(self): | |
173 return '[%d] %d' % (self.index, self.value) | |
174 | |
175 | |
176 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): | |
177 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) | |
178 | |
179 | |
180 def gff_attrs(d): | |
181 if not d: | |
182 return '.' | |
183 return ';'.join('%s=%s' % item for item in d.items()) | |
184 | |
185 | |
186 def stop_err(msg): | |
187 sys.stderr.write(msg) | |
188 sys.exit(1) | |
189 | |
190 | |
191 def is_int(i): | |
192 try: | |
193 int(i) | |
194 return True | |
195 except ValueError: | |
196 return False | |
197 | |
198 | |
199 def make_keys(data): | |
200 return [read[0] for read in data] | |
201 | |
202 | |
203 def make_peak_keys(peaks): | |
204 return [peak.index for peak in peaks] | |
205 | |
206 | |
207 def get_window(data, start, end, keys): | |
208 """ | |
209 Returns all reads from the data set with index between the two indexes | |
210 """ | |
211 start_index = bisect.bisect_left(keys, start) | |
212 end_index = bisect.bisect_right(keys, end) | |
213 return data[start_index:end_index] | |
214 | |
215 | |
216 def get_index(value, keys): | |
217 """ | |
218 Returns the index of the value in the keys using bisect | |
219 """ | |
220 return bisect.bisect_left(keys, value) | |
221 | |
222 | |
223 def get_range(data): | |
224 lo = min([item[0] for item in data]) | |
225 hi = max([item[0] for item in data]) | |
226 return lo, hi | |
227 | |
228 | |
229 def get_chunks(lo, hi, size, overlap=500): | |
230 """ | |
231 Divides a range into chunks of maximum size size. Returns a list of | |
232 2-tuples (slice_range, process_range), each a 2-tuple (start, end). | |
233 process_range has zero overlap and should be given to process_chromosome | |
234 as-is, and slice_range is overlapped and should be used to slice the | |
235 data (using get_window) to be given to process_chromosome. | |
236 """ | |
237 chunks = [] | |
238 for start_index in range(lo, hi, size): | |
239 process_start = start_index | |
240 # Don't go over upper bound | |
241 process_end = min(start_index + size, hi) | |
242 # Don't go under lower bound | |
243 slice_start = max(process_start - overlap, lo) | |
244 # Don't go over upper bound | |
245 slice_end = min(process_end + overlap, hi) | |
246 chunks.append(((slice_start, slice_end), (process_start, process_end))) | |
247 return chunks | |
248 | |
249 | |
250 def allocate_array(data, width): | |
251 """ | |
252 Allocates a new array with the dimensions required to fit all reads in | |
253 the argument. The new array is totally empty. Returns the array and the | |
254 shift (number to add to a read index to get the position in the array it | |
255 should be at). | |
256 """ | |
257 lo, hi = get_range(data) | |
258 rng = hi - lo | |
259 shift = width - lo | |
260 return numpy.zeros(rng+width*2, numpy.float), shift | |
261 | |
262 | |
263 def normal_array(width, sigma, normalize=True): | |
264 """ | |
265 Returns an array of the normal distribution of the specified width | |
266 """ | |
267 sigma2 = float(sigma)**2 | |
268 | |
269 def normal_func(x): | |
270 return math.exp(-x * x / (2 * sigma2)) | |
271 | |
272 # width is the half of the distribution | |
273 values = map(normal_func, range(-width, width)) | |
274 values = numpy.array(values, numpy.float) | |
275 # normalization | |
276 if normalize: | |
277 values = 1.0/math.sqrt(2 * numpy.pi * sigma2) * values | |
278 return values | |
279 | |
280 | |
281 def call_peaks(array, shift, data, keys, direction, down_width, up_width, exclusion): | |
282 peaks = [] | |
283 | |
284 def find_peaks(): | |
285 # Go through the array and call each peak | |
286 results = (array > numpy.roll(array, 1)) & (array > numpy.roll(array, -1)) | |
287 indexes = numpy.where(results) | |
288 for index in indexes[0]: | |
289 pos = down_width or exclusion // 2 | |
290 neg = up_width or exclusion // 2 | |
291 # Reverse strand | |
292 if direction == 2: | |
293 # Swap positive and negative widths | |
294 pos, neg = neg, pos | |
295 peaks.append(Peak(int(index)-shift, pos, neg)) | |
296 find_peaks() | |
297 | |
298 def calculate_reads(): | |
299 # Calculate the number of reads in each peak | |
300 for peak in peaks: | |
301 reads = get_window(data, peak.start, peak.end, keys) | |
302 peak.value = sum([read[direction] for read in reads]) | |
303 # Flat list of indexes with frequency | |
304 indexes = [r for read in reads for r in [read[0]] * read[direction]] | |
305 peak.stddev = numpy.std(indexes) | |
306 calculate_reads() | |
307 | |
308 def perform_exclusion(): | |
309 # Process the exclusion zone | |
310 peak_keys = make_peak_keys(peaks) | |
311 peaks_by_value = peaks[:] | |
312 peaks_by_value.sort(key=lambda peak: -peak.value) | |
313 for peak in peaks_by_value: | |
314 peak.safe = True | |
315 window = get_window(peaks, | |
316 peak.index-exclusion//2, | |
317 peak.index+exclusion//2, | |
318 peak_keys) | |
319 for excluded in window: | |
320 if excluded.safe: | |
321 continue | |
322 i = get_index(excluded.index, peak_keys) | |
323 del peak_keys[i] | |
324 del peaks[i] | |
325 perform_exclusion() | |
326 return peaks | |
327 | |
328 | |
329 def process_chromosome(cname, data, writer, process_bounds, width, sigma, down_width, up_width, exclusion, filter): | |
330 """ | |
331 Process a chromosome. Takes the chromosome name, list of reads, a CSV | |
332 writer to write processes results to, the bounds (2-tuple) to write | |
333 results in, and options. | |
334 """ | |
335 if not data: | |
336 return | |
337 keys = make_keys(data) | |
338 # Create the arrays that hold the sum of the normals | |
339 forward_array, forward_shift = allocate_array(data, width) | |
340 reverse_array, reverse_shift = allocate_array(data, width) | |
341 normal = normal_array(width, sigma) | |
342 | |
343 def populate_array(): | |
344 # Add each read's normal to the array | |
345 for read in data: | |
346 index, forward, reverse = read | |
347 # Add the normals to the appropriate regions | |
348 if forward: | |
349 forward_array[index+forward_shift-width:index+forward_shift+width] += normal * forward | |
350 if reverse: | |
351 reverse_array[index+reverse_shift-width:index+reverse_shift+width] += normal * reverse | |
352 populate_array() | |
353 forward_peaks = call_peaks(forward_array, forward_shift, data, keys, 1, down_width, up_width, exclusion) | |
354 reverse_peaks = call_peaks(reverse_array, reverse_shift, data, keys, 2, down_width, up_width, exclusion) | |
355 # Convert chromosome name in preparation for writing output | |
356 cname = convert_data(cname, 'zeropad', 'numeric') | |
357 | |
358 def write(cname, strand, peak): | |
359 start = max(peak.start, 1) | |
360 end = peak.end | |
361 value = peak.value | |
362 stddev = peak.stddev | |
363 if value > filter: | |
364 # This version of genetrack outputs only gff files. | |
365 writer.writerow(gff_row(cname=cname, | |
366 source='genetrack', | |
367 start=start, | |
368 end=end, | |
369 score=value, | |
370 strand=strand, | |
371 attrs={'stddev': stddev})) | |
372 | |
373 for peak in forward_peaks: | |
374 if process_bounds[0] < peak.index < process_bounds[1]: | |
375 write(cname, '+', peak) | |
376 for peak in reverse_peaks: | |
377 if process_bounds[0] < peak.index < process_bounds[1]: | |
378 write(cname, '-', peak) | |
379 | |
380 | |
381 def sort_chromosome_reads_by_index(input_path): | |
382 """ | |
383 Return a gff file with chromosome reads sorted by index. | |
384 """ | |
385 # Will this sort produce different results across platforms? | |
386 output_path = tempfile.NamedTemporaryFile(delete=False).name | |
387 command = 'sort -k 1,1 -k 4,4n "%s" > "%s"' % (input_path, output_path) | |
388 p = subprocess.Popen(command, shell=True) | |
389 p.wait() | |
390 return output_path |