comparison genetrack_util.py @ 0:25cd59a002d9 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/genetrack commit e96df94dba60050fa28aaf55b5bb095717a5f260
author iuc
date Tue, 22 Dec 2015 17:03:27 -0500
parents
children df7ac50ade5d
comparison
equal deleted inserted replaced
-1:000000000000 0:25cd59a002d9
1 import bisect
2 import math
3 import numpy
4 import re
5 import subprocess
6 import sys
7 import tempfile
8
9 GFF_EXT = 'gff'
10 SCIDX_EXT = 'scidx'
11
12
13 def noop(data):
14 return data
15
16
17 def zeropad_to_numeric(data):
18 return re.sub(r'chr0(\d)', r'chr\1', data)
19
20
21 def numeric_to_zeropad(data):
22 return re.sub(r'chr(\d([^\d]|$))', r'chr0\1', data)
23
24
25 FORMATS = ['zeropad', 'numeric']
26 IN_CONVERT = {'zeropad': zeropad_to_numeric, 'numeric': noop}
27 OUT_CONVERT = {'zeropad': numeric_to_zeropad, 'numeric': noop}
28
29
30 def conversion_functions(in_fmt, out_fmt):
31 """
32 Returns the proper list of functions to apply to perform a conversion
33 """
34 return [IN_CONVERT[in_fmt], OUT_CONVERT[out_fmt]]
35
36
37 def convert_data(data, in_fmt, out_fmt):
38 for fn in conversion_functions(in_fmt, out_fmt):
39 data = fn(data)
40 return data
41
42
43 class ChromosomeManager(object):
44 """
45 Manages a CSV reader of an index file to only load one chrom at a time
46 """
47
48 def __init__(self, reader):
49 self.done = False
50 self.reader = reader
51 self.processed_chromosomes = []
52 self.current_index = 0
53 self.next_valid()
54
55 def next(self):
56 self.line = self.reader.next()
57
58 def is_valid(self, line):
59 if len(line) not in [4, 5, 9]:
60 return False
61 try:
62 [int(i) for i in line[1:]]
63 self.format = SCIDX_EXT
64 return True
65 except ValueError:
66 try:
67 if len(line) < 6:
68 return False
69 [int(line[4]), int(line[5])]
70 self.format = GFF_EXT
71 return True
72 except ValueError:
73 return False
74
75 def next_valid(self):
76 """
77 Advance to the next valid line in the reader
78 """
79 self.line = self.reader.next()
80 s = 0
81 while not self.is_valid(self.line):
82 self.line = self.reader.next()
83 s += 1
84 if s > 0:
85 # Skip initial line(s) of file
86 pass
87
88 def parse_line(self, line):
89 if self.format == SCIDX_EXT:
90 return [int(line[1]), int(line[2]), int(line[3])]
91 else:
92 return [int(line[3]), line[6], line[5]]
93
94 def chromosome_name(self):
95 """
96 Return the name of the chromosome about to be loaded
97 """
98 return self.line[0]
99
100 def load_chromosome(self, collect_data=True):
101 """
102 Load the current chromosome into an array and return it
103 """
104 cname = self.chromosome_name()
105 if cname in self.processed_chromosomes:
106 stop_err('File is not grouped by chromosome')
107 self.data = []
108 while self.line[0] == cname:
109 if collect_data:
110 read = self.parse_line(self.line)
111 if read[0] < self.current_index:
112 msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (cname, self.current_index)
113 stop_err(msg)
114 self.current_index = read[0]
115 self.add_read(read)
116 try:
117 self.next()
118 except StopIteration:
119 self.done = True
120 break
121 self.processed_chromosomes.append(cname)
122 self.current_index = 0
123 data = self.data
124 # Don't retain reference anymore to save memory
125 del self.data
126 return data
127
128 def add_read(self, read):
129 if self.format == SCIDX_EXT:
130 self.data.append(read)
131 else:
132 index, strand, value = read
133 if value == '' or value == '.':
134 value = 1
135 else:
136 value = int(value)
137 if not self.data:
138 self.data.append([index, 0, 0])
139 current_read = self.data[-1]
140 if self.data[-1][0] == index:
141 current_read = self.data[-1]
142 elif self.data[-1][0] < index:
143 self.data.append([index, 0, 0])
144 current_read = self.data[-1]
145 else:
146 msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (self.chromosome_name(), index)
147 stop_err(msg)
148 if strand == '+':
149 current_read[1] += value
150 elif strand == '-':
151 current_read[2] += value
152 else:
153 msg = 'Strand "%s" at chromosome "%s" index %d is not valid.' % (strand, self.chromosome_name(), index)
154 stop_err(msg)
155
156 def skip_chromosome(self):
157 """
158 Skip the current chromosome, discarding data
159 """
160 self.load_chromosome(collect_data=False)
161
162
163 class Peak(object):
164 def __init__(self, index, pos_width, neg_width):
165 self.index = index
166 self.start = index - neg_width
167 self.end = index + pos_width
168 self.value = 0
169 self.deleted = False
170 self.safe = False
171
172 def __repr__(self):
173 return '[%d] %d' % (self.index, self.value)
174
175
176 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
177 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
178
179
180 def gff_attrs(d):
181 if not d:
182 return '.'
183 return ';'.join('%s=%s' % item for item in d.items())
184
185
186 def stop_err(msg):
187 sys.stderr.write(msg)
188 sys.exit(1)
189
190
191 def is_int(i):
192 try:
193 int(i)
194 return True
195 except ValueError:
196 return False
197
198
199 def make_keys(data):
200 return [read[0] for read in data]
201
202
203 def make_peak_keys(peaks):
204 return [peak.index for peak in peaks]
205
206
207 def get_window(data, start, end, keys):
208 """
209 Returns all reads from the data set with index between the two indexes
210 """
211 start_index = bisect.bisect_left(keys, start)
212 end_index = bisect.bisect_right(keys, end)
213 return data[start_index:end_index]
214
215
216 def get_index(value, keys):
217 """
218 Returns the index of the value in the keys using bisect
219 """
220 return bisect.bisect_left(keys, value)
221
222
223 def get_range(data):
224 lo = min([item[0] for item in data])
225 hi = max([item[0] for item in data])
226 return lo, hi
227
228
229 def get_chunks(lo, hi, size, overlap=500):
230 """
231 Divides a range into chunks of maximum size size. Returns a list of
232 2-tuples (slice_range, process_range), each a 2-tuple (start, end).
233 process_range has zero overlap and should be given to process_chromosome
234 as-is, and slice_range is overlapped and should be used to slice the
235 data (using get_window) to be given to process_chromosome.
236 """
237 chunks = []
238 for start_index in range(lo, hi, size):
239 process_start = start_index
240 # Don't go over upper bound
241 process_end = min(start_index + size, hi)
242 # Don't go under lower bound
243 slice_start = max(process_start - overlap, lo)
244 # Don't go over upper bound
245 slice_end = min(process_end + overlap, hi)
246 chunks.append(((slice_start, slice_end), (process_start, process_end)))
247 return chunks
248
249
250 def allocate_array(data, width):
251 """
252 Allocates a new array with the dimensions required to fit all reads in
253 the argument. The new array is totally empty. Returns the array and the
254 shift (number to add to a read index to get the position in the array it
255 should be at).
256 """
257 lo, hi = get_range(data)
258 rng = hi - lo
259 shift = width - lo
260 return numpy.zeros(rng+width*2, numpy.float), shift
261
262
263 def normal_array(width, sigma, normalize=True):
264 """
265 Returns an array of the normal distribution of the specified width
266 """
267 sigma2 = float(sigma)**2
268
269 def normal_func(x):
270 return math.exp(-x * x / (2 * sigma2))
271
272 # width is the half of the distribution
273 values = map(normal_func, range(-width, width))
274 values = numpy.array(values, numpy.float)
275 # normalization
276 if normalize:
277 values = 1.0/math.sqrt(2 * numpy.pi * sigma2) * values
278 return values
279
280
281 def call_peaks(array, shift, data, keys, direction, down_width, up_width, exclusion):
282 peaks = []
283
284 def find_peaks():
285 # Go through the array and call each peak
286 results = (array > numpy.roll(array, 1)) & (array > numpy.roll(array, -1))
287 indexes = numpy.where(results)
288 for index in indexes[0]:
289 pos = down_width or exclusion // 2
290 neg = up_width or exclusion // 2
291 # Reverse strand
292 if direction == 2:
293 # Swap positive and negative widths
294 pos, neg = neg, pos
295 peaks.append(Peak(int(index)-shift, pos, neg))
296 find_peaks()
297
298 def calculate_reads():
299 # Calculate the number of reads in each peak
300 for peak in peaks:
301 reads = get_window(data, peak.start, peak.end, keys)
302 peak.value = sum([read[direction] for read in reads])
303 # Flat list of indexes with frequency
304 indexes = [r for read in reads for r in [read[0]] * read[direction]]
305 peak.stddev = numpy.std(indexes)
306 calculate_reads()
307
308 def perform_exclusion():
309 # Process the exclusion zone
310 peak_keys = make_peak_keys(peaks)
311 peaks_by_value = peaks[:]
312 peaks_by_value.sort(key=lambda peak: -peak.value)
313 for peak in peaks_by_value:
314 peak.safe = True
315 window = get_window(peaks,
316 peak.index-exclusion//2,
317 peak.index+exclusion//2,
318 peak_keys)
319 for excluded in window:
320 if excluded.safe:
321 continue
322 i = get_index(excluded.index, peak_keys)
323 del peak_keys[i]
324 del peaks[i]
325 perform_exclusion()
326 return peaks
327
328
329 def process_chromosome(cname, data, writer, process_bounds, width, sigma, down_width, up_width, exclusion, filter):
330 """
331 Process a chromosome. Takes the chromosome name, list of reads, a CSV
332 writer to write processes results to, the bounds (2-tuple) to write
333 results in, and options.
334 """
335 if not data:
336 return
337 keys = make_keys(data)
338 # Create the arrays that hold the sum of the normals
339 forward_array, forward_shift = allocate_array(data, width)
340 reverse_array, reverse_shift = allocate_array(data, width)
341 normal = normal_array(width, sigma)
342
343 def populate_array():
344 # Add each read's normal to the array
345 for read in data:
346 index, forward, reverse = read
347 # Add the normals to the appropriate regions
348 if forward:
349 forward_array[index+forward_shift-width:index+forward_shift+width] += normal * forward
350 if reverse:
351 reverse_array[index+reverse_shift-width:index+reverse_shift+width] += normal * reverse
352 populate_array()
353 forward_peaks = call_peaks(forward_array, forward_shift, data, keys, 1, down_width, up_width, exclusion)
354 reverse_peaks = call_peaks(reverse_array, reverse_shift, data, keys, 2, down_width, up_width, exclusion)
355 # Convert chromosome name in preparation for writing output
356 cname = convert_data(cname, 'zeropad', 'numeric')
357
358 def write(cname, strand, peak):
359 start = max(peak.start, 1)
360 end = peak.end
361 value = peak.value
362 stddev = peak.stddev
363 if value > filter:
364 # This version of genetrack outputs only gff files.
365 writer.writerow(gff_row(cname=cname,
366 source='genetrack',
367 start=start,
368 end=end,
369 score=value,
370 strand=strand,
371 attrs={'stddev': stddev}))
372
373 for peak in forward_peaks:
374 if process_bounds[0] < peak.index < process_bounds[1]:
375 write(cname, '+', peak)
376 for peak in reverse_peaks:
377 if process_bounds[0] < peak.index < process_bounds[1]:
378 write(cname, '-', peak)
379
380
381 def sort_chromosome_reads_by_index(input_path):
382 """
383 Return a gff file with chromosome reads sorted by index.
384 """
385 # Will this sort produce different results across platforms?
386 output_path = tempfile.NamedTemporaryFile(delete=False).name
387 command = 'sort -k 1,1 -k 4,4n "%s" > "%s"' % (input_path, output_path)
388 p = subprocess.Popen(command, shell=True)
389 p.wait()
390 return output_path