comparison repmatch_gff3_util.py @ 0:a072f0f30ea3 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/repmatch_gff3 commit 0e04a4c237677c1f5be1950babcf8591097996a9
author iuc
date Wed, 23 Dec 2015 09:25:42 -0500
parents
children e5c7fffdc078
comparison
equal deleted inserted replaced
-1:000000000000 0:a072f0f30ea3
1 import bisect
2 import csv
3 import os
4 import shutil
5 import sys
6 import tempfile
7 import matplotlib
8 matplotlib.use('Agg')
9 from matplotlib import pyplot
10
11 # Graph settings
12 Y_LABEL = 'Counts'
13 X_LABEL = 'Number of matched replicates'
14 TICK_WIDTH = 3
15 # Amount to shift the graph to make labels fit, [left, right, top, bottom]
16 ADJUST = [0.180, 0.9, 0.9, 0.1]
17 # Length of tick marks, use TICK_WIDTH for width
18 pyplot.rc('xtick.major', size=10.00)
19 pyplot.rc('ytick.major', size=10.00)
20 pyplot.rc('lines', linewidth=4.00)
21 pyplot.rc('axes', linewidth=3.00)
22 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0)
23
24 COLORS = 'krb'
25
26
27 class Replicate(object):
28
29 def __init__(self, id, dataset_path):
30 self.id = id
31 self.dataset_path = dataset_path
32 self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t'))
33
34 def parse(self, reader):
35 self.chromosomes = {}
36 for line in reader:
37 if line[0].startswith("#") or line[0].startswith('"'):
38 continue
39 cname, junk, junk, mid, midplus, value, strand, junk, attrs = line
40 attrs = parse_gff_attrs(attrs)
41 distance = attrs['cw_distance']
42 mid = int(mid)
43 midplus = int(midplus)
44 value = float(value)
45 distance = int(distance)
46 if cname not in self.chromosomes:
47 self.chromosomes[cname] = Chromosome(cname)
48 chrom = self.chromosomes[cname]
49 chrom.add_peak(Peak(cname, mid, value, distance, self))
50 for chrom in self.chromosomes.values():
51 chrom.sort_by_index()
52
53 def filter(self, up_limit, low_limit):
54 for chrom in self.chromosomes.values():
55 chrom.filter(up_limit, low_limit)
56
57 def size(self):
58 return sum([len(c.peaks) for c in self.chromosomes.values()])
59
60
61 class Chromosome(object):
62
63 def __init__(self, name):
64 self.name = name
65 self.peaks = []
66
67 def add_peak(self, peak):
68 self.peaks.append(peak)
69
70 def sort_by_index(self):
71 self.peaks.sort(key=lambda peak: peak.midpoint)
72 self.keys = make_keys(self.peaks)
73
74 def remove_peak(self, peak):
75 i = bisect.bisect_left(self.keys, peak.midpoint)
76 # If the peak was actually found
77 if i < len(self.peaks) and self.peaks[i].midpoint == peak.midpoint:
78 del self.keys[i]
79 del self.peaks[i]
80
81 def filter(self, up_limit, low_limit):
82 self.peaks = [p for p in self.peaks if low_limit <= p.distance <= up_limit]
83 self.keys = make_keys(self.peaks)
84
85
86 class Peak(object):
87
88 def __init__(self, chrom, midpoint, value, distance, replicate):
89 self.chrom = chrom
90 self.value = value
91 self.midpoint = midpoint
92 self.distance = distance
93 self.replicate = replicate
94
95 def normalized_value(self, med):
96 return self.value * med / self.replicate.median
97
98
99 class PeakGroup(object):
100
101 def __init__(self):
102 self.peaks = {}
103
104 def add_peak(self, repid, peak):
105 self.peaks[repid] = peak
106
107 @property
108 def chrom(self):
109 return self.peaks.values()[0].chrom
110
111 @property
112 def midpoint(self):
113 return median([peak.midpoint for peak in self.peaks.values()])
114
115 @property
116 def num_replicates(self):
117 return len(self.peaks)
118
119 @property
120 def median_distance(self):
121 return median([peak.distance for peak in self.peaks.values()])
122
123 @property
124 def value_sum(self):
125 return sum([peak.value for peak in self.peaks.values()])
126
127 def normalized_value(self, med):
128 values = []
129 for peak in self.peaks.values():
130 values.append(peak.normalized_value(med))
131 return median(values)
132
133 @property
134 def peakpeak_distance(self):
135 keys = self.peaks.keys()
136 return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint)
137
138
139 class FrequencyDistribution(object):
140
141 def __init__(self, d=None):
142 self.dist = d or {}
143
144 def add(self, x):
145 self.dist[x] = self.dist.get(x, 0) + 1
146
147 def graph_series(self):
148 x = []
149 y = []
150 for key, val in self.dist.items():
151 x.append(key)
152 y.append(val)
153 return x, y
154
155 def mode(self):
156 return max(self.dist.items(), key=lambda data: data[1])[0]
157
158 def size(self):
159 return sum(self.dist.values())
160
161
162 def stop_err(msg):
163 sys.stderr.write(msg)
164 sys.exit(1)
165
166
167 def median(data):
168 """
169 Find the integer median of the data set.
170 """
171 if not data:
172 return 0
173 sdata = sorted(data)
174 if len(data) % 2 == 0:
175 return (sdata[len(data)//2] + sdata[len(data)//2-1]) / 2
176 else:
177 return sdata[len(data)//2]
178
179
180 def make_keys(peaks):
181 return [data.midpoint for data in peaks]
182
183
184 def get_window(chromosome, target_peaks, distance):
185 """
186 Returns a window of all peaks from a replicate within a certain distance of
187 a peak from another replicate.
188 """
189 lower = target_peaks[0].midpoint
190 upper = target_peaks[0].midpoint
191 for peak in target_peaks:
192 lower = min(lower, peak.midpoint - distance)
193 upper = max(upper, peak.midpoint + distance)
194 start_index = bisect.bisect_left(chromosome.keys, lower)
195 end_index = bisect.bisect_right(chromosome.keys, upper)
196 return (chromosome.peaks[start_index: end_index], chromosome.name)
197
198
199 def match_largest(window, peak, chrum):
200 if not window:
201 return None
202 if peak.chrom != chrum:
203 return None
204 return max(window, key=lambda cpeak: cpeak.value)
205
206
207 def match_closest(window, peak, chrum):
208 if not window:
209 return None
210 if peak.chrom != chrum:
211 return None
212 return min(window, key=lambda match: abs(match.midpoint - peak.midpoint))
213
214
215 def frequency_histogram(freqs, dataset_path, labels=[], title=''):
216 pyplot.clf()
217 pyplot.figure(figsize=(10, 10))
218 for i, freq in enumerate(freqs):
219 xvals, yvals = freq.graph_series()
220 # Go from high to low
221 xvals.reverse()
222 pyplot.bar([x-0.4 + 0.8/len(freqs)*i for x in xvals], yvals, width=0.8/len(freqs), color=COLORS[i])
223 pyplot.xticks(range(min(xvals), max(xvals)+1), map(str, reversed(range(min(xvals), max(xvals)+1))))
224 pyplot.xlabel(X_LABEL)
225 pyplot.ylabel(Y_LABEL)
226 pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3])
227 ax = pyplot.gca()
228 for l in ax.get_xticklines() + ax.get_yticklines():
229 l.set_markeredgewidth(TICK_WIDTH)
230 pyplot.savefig(dataset_path)
231
232
233 METHODS = {'closest': match_closest, 'largest': match_largest}
234
235
236 def gff_attrs(d):
237 if not d:
238 return '.'
239 return ';'.join('%s=%s' % item for item in d.items())
240
241
242 def parse_gff_attrs(s):
243 d = {}
244 if s == '.':
245 return d
246 for item in s.split(';'):
247 key, val = item.split('=')
248 d[key] = val
249 return d
250
251
252 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
253 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
254
255
256 def get_temporary_plot_path():
257 """
258 Return the path to a temporary file with a valid image format
259 file extension that can be used with bioformats.
260 """
261 tmp_dir = tempfile.mkdtemp(prefix='tmp-repmatch-')
262 fd, name = tempfile.mkstemp(suffix='.pdf', dir=tmp_dir)
263 os.close(fd)
264 return name
265
266
267 def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files,
268 output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram):
269 output_statistics_histogram_file = output_files in ["all"] and method in ["all"]
270 if len(dataset_paths) < 2:
271 return
272 if method == 'all':
273 match_methods = METHODS.keys()
274 else:
275 match_methods = [method]
276 for match_method in match_methods:
277 statistics = perform_process(dataset_paths,
278 galaxy_hids,
279 match_method,
280 distance,
281 step,
282 replicates,
283 up_limit,
284 low_limit,
285 output_files,
286 output_matched_peaks,
287 output_unmatched_peaks,
288 output_detail,
289 output_statistics_table,
290 output_statistics_histogram)
291 if output_statistics_histogram_file:
292 tmp_statistics_histogram_path = get_temporary_plot_path()
293 frequency_histogram([stat['distribution'] for stat in [statistics]],
294 tmp_statistics_histogram_path,
295 METHODS.keys())
296 shutil.move(tmp_statistics_histogram_path, output_statistics_histogram)
297
298
299 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files,
300 output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram):
301 output_detail_file = output_files in ["all"] and output_detail is not None
302 output_statistics_table_file = output_files in ["all"] and output_statistics_table is not None
303 output_unmatched_peaks_file = output_files in ["all", "matched_peaks_unmatched_peaks"] and output_unmatched_peaks is not None
304 output_statistics_histogram_file = output_files in ["all"] and output_statistics_histogram is not None
305 replicates = []
306 for i, dataset_path in enumerate(dataset_paths):
307 try:
308 galaxy_hid = galaxy_hids[i]
309 r = Replicate(galaxy_hid, dataset_path)
310 replicates.append(r)
311 except Exception, e:
312 stop_err('Unable to parse file "%s", exception: %s' % (dataset_path, str(e)))
313 attrs = 'd%sr%s' % (distance, num_required)
314 if up_limit != 1000:
315 attrs += 'u%d' % up_limit
316 if low_limit != -1000:
317 attrs += 'l%d' % low_limit
318 if step != 0:
319 attrs += 's%d' % step
320
321 def td_writer(file_path):
322 # Returns a tab-delimited writer for a certain output
323 return csv.writer(open(file_path, 'wt'), delimiter='\t')
324
325 labels = ('chrom',
326 'median midpoint',
327 'median midpoint+1',
328 'median normalized reads',
329 'replicates',
330 'median c-w distance',
331 'reads sum')
332 for replicate in replicates:
333 labels += ('chrom',
334 'median midpoint',
335 'median midpoint+1',
336 'c-w sum',
337 'c-w distance',
338 'replicate id')
339 matched_peaks_output = td_writer(output_matched_peaks)
340 if output_statistics_table_file:
341 statistics_table_output = td_writer(output_statistics_table)
342 statistics_table_output.writerow(('data', 'median read count'))
343 if output_detail_file:
344 detail_output = td_writer(output_detail)
345 detail_output.writerow(labels)
346 if output_unmatched_peaks_file:
347 unmatched_peaks_output = td_writer(output_unmatched_peaks)
348 unmatched_peaks_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id'))
349 # Perform filtering
350 if up_limit < 1000 or low_limit > -1000:
351 for replicate in replicates:
352 replicate.filter(up_limit, low_limit)
353 # Actually merge the peaks
354 peak_groups = []
355 unmatched_peaks = []
356 freq = FrequencyDistribution()
357
358 def do_match(reps, distance):
359 # Copy list because we will mutate it, but keep replicate references.
360 reps = reps[:]
361 while len(reps) > 1:
362 # Iterate over each replicate as "main"
363 main = reps[0]
364 reps.remove(main)
365 for chromosome in main.chromosomes.values():
366 peaks_by_value = chromosome.peaks[:]
367 # Sort main replicate by value
368 peaks_by_value.sort(key=lambda peak: -peak.value)
369
370 def search_for_matches(group):
371 # Here we use multiple passes, expanding the window to be
372 # +- distance from any previously matched peak.
373 while True:
374 new_match = False
375 for replicate in reps:
376 if replicate.id in group.peaks:
377 # Stop if match already found for this replicate
378 continue
379 try:
380 # Lines changed to remove a major bug by Rohit Reja.
381 window, chrum = get_window(replicate.chromosomes[chromosome.name],
382 group.peaks.values(),
383 distance)
384 match = METHODS[method](window, peak, chrum)
385 except KeyError:
386 continue
387 if match:
388 group.add_peak(replicate.id, match)
389 new_match = True
390 if not new_match:
391 break
392 # Attempt to enlarge existing peak groups
393 for group in peak_groups:
394 old_peaks = group.peaks.values()[:]
395 search_for_matches(group)
396 for peak in group.peaks.values():
397 if peak not in old_peaks:
398 peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
399 # Attempt to find new peaks groups. For each peak in the
400 # main replicate, search for matches in the other replicates
401 for peak in peaks_by_value:
402 matches = PeakGroup()
403 matches.add_peak(main.id, peak)
404 search_for_matches(matches)
405 # Were enough replicates matched?
406 if matches.num_replicates >= num_required:
407 for peak in matches.peaks.values():
408 peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
409 peak_groups.append(matches)
410 # Zero or less = no stepping
411 if step <= 0:
412 do_match(replicates, distance)
413 else:
414 for d in range(0, distance, step):
415 do_match(replicates, d)
416 for group in peak_groups:
417 freq.add(group.num_replicates)
418 # Collect together the remaining unmatched_peaks
419 for replicate in replicates:
420 for chromosome in replicate.chromosomes.values():
421 for peak in chromosome.peaks:
422 freq.add(1)
423 unmatched_peaks.append(peak)
424 # Average the unmatched_peaks count in the graph by # replicates
425 med = median([peak.value for group in peak_groups for peak in group.peaks.values()])
426 for replicate in replicates:
427 replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate])
428 statistics_table_output.writerow((replicate.id, replicate.median))
429 for group in peak_groups:
430 # Output matched_peaks (matched pairs).
431 matched_peaks_output.writerow(gff_row(cname=group.chrom,
432 start=group.midpoint,
433 end=group.midpoint+1,
434 source='repmatch',
435 score=group.normalized_value(med),
436 attrs={'median_distance': group.median_distance,
437 'replicates': group.num_replicates,
438 'value_sum': group.value_sum}))
439 if output_detail_file:
440 matched_peaks = (group.chrom,
441 group.midpoint,
442 group.midpoint+1,
443 group.normalized_value(med),
444 group.num_replicates,
445 group.median_distance,
446 group.value_sum)
447 for peak in group.peaks.values():
448 matched_peaks += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id)
449 detail_output.writerow(matched_peaks)
450 if output_unmatched_peaks_file:
451 for unmatched_peak in unmatched_peaks:
452 unmatched_peaks_output.writerow((unmatched_peak.chrom,
453 unmatched_peak.midpoint,
454 unmatched_peak.midpoint+1,
455 unmatched_peak.value,
456 unmatched_peak.distance,
457 unmatched_peak.replicate.id))
458 if output_statistics_histogram_file:
459 tmp_statistics_histogram_path = get_temporary_plot_path()
460 frequency_histogram([freq], tmp_statistics_histogram_path)
461 shutil.move(tmp_statistics_histogram_path, output_statistics_histogram)
462 return {'distribution': freq}