comparison repmatch_gff3_util.py @ 4:6acaa2c93f47 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/repmatch_gff3 commit 315c3ddcdbf38a27d43753aae3b6d379306be5a9
author iuc
date Wed, 12 Jul 2017 10:11:53 -0400
parents e5c7fffdc078
children 2365720de36d
comparison
equal deleted inserted replaced
3:f7608d0363bf 4:6acaa2c93f47
21 pyplot.rc('lines', linewidth=4.00) 21 pyplot.rc('lines', linewidth=4.00)
22 pyplot.rc('axes', linewidth=3.00) 22 pyplot.rc('axes', linewidth=3.00)
23 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0) 23 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0)
24 24
25 COLORS = 'krb' 25 COLORS = 'krb'
26 ISPY2 = sys.version_info[0] == 2
26 27
27 28
28 class Replicate(object): 29 class Replicate(object):
29 30
30 def __init__(self, id, dataset_path): 31 def __init__(self, id, dataset_path):
31 self.id = id 32 self.id = id
32 self.dataset_path = dataset_path 33 self.dataset_path = dataset_path
33 self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t')) 34 if ISPY2:
35 fh = open(dataset_path, 'rb')
36 else:
37 fh = open(dataset_path, 'r', newline='')
38 self.parse(csv.reader(fh, delimiter='\t'))
34 39
35 def parse(self, reader): 40 def parse(self, reader):
36 self.chromosomes = {} 41 self.chromosomes = {}
37 for line in reader: 42 for line in reader:
38 if line[0].startswith("#") or line[0].startswith('"'): 43 if line[0].startswith("#") or line[0].startswith('"'):
39 continue 44 continue
40 cname, junk, junk, mid, midplus, value, strand, junk, attrs = line 45 cname, junk, junk, mid, midplus, value, strand, junk, attrs = line
41 attrs = parse_gff_attrs(attrs) 46 attrs = parse_gff_attrs(attrs)
42 distance = attrs['cw_distance'] 47 distance = int(attrs['cw_distance'])
43 mid = int(mid) 48 mid = int(mid)
44 midplus = int(midplus) 49 midplus = int(midplus)
45 value = float(value) 50 value = float(value)
46 distance = int(distance)
47 if cname not in self.chromosomes: 51 if cname not in self.chromosomes:
48 self.chromosomes[cname] = Chromosome(cname) 52 self.chromosomes[cname] = Chromosome(cname)
49 chrom = self.chromosomes[cname] 53 chrom = self.chromosomes[cname]
50 chrom.add_peak(Peak(cname, mid, value, distance, self)) 54 chrom.add_peak(Peak(cname, mid, value, distance, self))
51 for chrom in self.chromosomes.values(): 55 for chrom in self.chromosomes.values():
105 def add_peak(self, repid, peak): 109 def add_peak(self, repid, peak):
106 self.peaks[repid] = peak 110 self.peaks[repid] = peak
107 111
108 @property 112 @property
109 def chrom(self): 113 def chrom(self):
110 return self.peaks.values()[0].chrom 114 return list(self.peaks.values())[0].chrom
111 115
112 @property 116 @property
113 def midpoint(self): 117 def midpoint(self):
114 return median([peak.midpoint for peak in self.peaks.values()]) 118 return int(median([peak.midpoint for peak in self.peaks.values()]))
115 119
116 @property 120 @property
117 def num_replicates(self): 121 def num_replicates(self):
118 return len(self.peaks) 122 return len(self.peaks)
119 123
120 @property 124 @property
121 def median_distance(self): 125 def median_distance(self):
122 return median([peak.distance for peak in self.peaks.values()]) 126 return int(median([peak.distance for peak in self.peaks.values()]))
123 127
124 @property 128 @property
125 def value_sum(self): 129 def value_sum(self):
126 return sum([peak.value for peak in self.peaks.values()]) 130 return sum([peak.value for peak in self.peaks.values()])
127 131
131 values.append(peak.normalized_value(med)) 135 values.append(peak.normalized_value(med))
132 return median(values) 136 return median(values)
133 137
134 @property 138 @property
135 def peakpeak_distance(self): 139 def peakpeak_distance(self):
136 keys = self.peaks.keys() 140 keys = list(self.peaks.keys())
137 return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint) 141 return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint)
138 142
139 143
140 class FrequencyDistribution(object): 144 class FrequencyDistribution(object):
141 145
185 def get_window(chromosome, target_peaks, distance): 189 def get_window(chromosome, target_peaks, distance):
186 """ 190 """
187 Returns a window of all peaks from a replicate within a certain distance of 191 Returns a window of all peaks from a replicate within a certain distance of
188 a peak from another replicate. 192 a peak from another replicate.
189 """ 193 """
190 lower = target_peaks[0].midpoint 194 lower = list(target_peaks)[0].midpoint
191 upper = target_peaks[0].midpoint 195 upper = list(target_peaks)[0].midpoint
192 for peak in target_peaks: 196 for peak in target_peaks:
193 lower = min(lower, peak.midpoint - distance) 197 lower = min(lower, peak.midpoint - distance)
194 upper = max(upper, peak.midpoint + distance) 198 upper = max(upper, peak.midpoint + distance)
195 start_index = bisect.bisect_left(chromosome.keys, lower) 199 start_index = bisect.bisect_left(chromosome.keys, lower)
196 end_index = bisect.bisect_right(chromosome.keys, upper) 200 end_index = bisect.bisect_right(chromosome.keys, upper)
232 236
233 237
234 METHODS = {'closest': match_closest, 'largest': match_largest} 238 METHODS = {'closest': match_closest, 'largest': match_largest}
235 239
236 240
237 def gff_attrs(d): 241 def gff_attrs(l):
238 if not d: 242 if len(l) == 0:
239 return '.' 243 return '.'
240 return ';'.join('%s=%s' % item for item in d.items()) 244 return ';'.join('%s=%s' % (tup[0], tup[1]) for tup in l)
241 245
242 246
243 def parse_gff_attrs(s): 247 def parse_gff_attrs(s):
244 d = {} 248 d = {}
245 if s == '.': 249 if s == '.':
248 key, val = item.split('=') 252 key, val = item.split('=')
249 d[key] = val 253 d[key] = val
250 return d 254 return d
251 255
252 256
253 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): 257 def gff_row(cname, start, end, score, source, stype='.', strand='.', phase='.', attrs=None):
254 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) 258 return (cname, source, stype, start, end, score, strand, phase, gff_attrs(attrs or []))
255 259
256 260
257 def get_temporary_plot_path(): 261 def get_temporary_plot_path():
258 """ 262 """
259 Return the path to a temporary file with a valid image format 263 Return the path to a temporary file with a valid image format
319 if step != 0: 323 if step != 0:
320 attrs += 's%d' % step 324 attrs += 's%d' % step
321 325
322 def td_writer(file_path): 326 def td_writer(file_path):
323 # Returns a tab-delimited writer for a certain output 327 # Returns a tab-delimited writer for a certain output
324 return csv.writer(open(file_path, 'wt'), delimiter='\t') 328 if ISPY2:
329 fh = open(file_path, 'wb')
330 return csv.writer(fh, delimiter='\t')
331 else:
332 fh = open(file_path, 'w', newline='')
333 return csv.writer(fh, delimiter='\t', quoting=csv.QUOTE_NONE)
325 334
326 labels = ('chrom', 335 labels = ('chrom',
327 'median midpoint', 336 'median midpoint',
328 'median midpoint+1', 337 'median midpoint+1',
329 'median normalized reads', 338 'median normalized reads',
361 reps = reps[:] 370 reps = reps[:]
362 while len(reps) > 1: 371 while len(reps) > 1:
363 # Iterate over each replicate as "main" 372 # Iterate over each replicate as "main"
364 main = reps[0] 373 main = reps[0]
365 reps.remove(main) 374 reps.remove(main)
366 for chromosome in main.chromosomes.values(): 375 for chromosome in list(main.chromosomes.values()):
367 peaks_by_value = chromosome.peaks[:] 376 peaks_by_value = chromosome.peaks[:]
368 # Sort main replicate by value 377 # Sort main replicate by value
369 peaks_by_value.sort(key=lambda peak: -peak.value) 378 peaks_by_value.sort(key=lambda peak: -peak.value)
370 379
371 def search_for_matches(group): 380 def search_for_matches(group):
377 if replicate.id in group.peaks: 386 if replicate.id in group.peaks:
378 # Stop if match already found for this replicate 387 # Stop if match already found for this replicate
379 continue 388 continue
380 try: 389 try:
381 # Lines changed to remove a major bug by Rohit Reja. 390 # Lines changed to remove a major bug by Rohit Reja.
382 window, chrum = get_window(replicate.chromosomes[chromosome.name], 391 window, chrum = get_window(replicate.chromosomes[chromosome.name], list(group.peaks.values()), distance)
383 group.peaks.values(),
384 distance)
385 match = METHODS[method](window, peak, chrum) 392 match = METHODS[method](window, peak, chrum)
386 except KeyError: 393 except KeyError:
387 continue 394 continue
388 if match: 395 if match:
389 group.add_peak(replicate.id, match) 396 group.add_peak(replicate.id, match)
390 new_match = True 397 new_match = True
391 if not new_match: 398 if not new_match:
392 break 399 break
393 # Attempt to enlarge existing peak groups 400 # Attempt to enlarge existing peak groups
394 for group in peak_groups: 401 for group in peak_groups:
395 old_peaks = group.peaks.values()[:] 402 old_peaks = list(group.peaks.values())
396 search_for_matches(group) 403 search_for_matches(group)
397 for peak in group.peaks.values(): 404 for peak in list(group.peaks.values()):
398 if peak not in old_peaks: 405 if peak not in old_peaks:
399 peak.replicate.chromosomes[chromosome.name].remove_peak(peak) 406 peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
400 # Attempt to find new peaks groups. For each peak in the 407 # Attempt to find new peaks groups. For each peak in the
401 # main replicate, search for matches in the other replicates 408 # main replicate, search for matches in the other replicates
402 for peak in peaks_by_value: 409 for peak in peaks_by_value:
403 matches = PeakGroup() 410 matches = PeakGroup()
404 matches.add_peak(main.id, peak) 411 matches.add_peak(main.id, peak)
405 search_for_matches(matches) 412 search_for_matches(matches)
406 # Were enough replicates matched? 413 # Were enough replicates matched?
407 if matches.num_replicates >= num_required: 414 if matches.num_replicates >= num_required:
408 for peak in matches.peaks.values(): 415 for peak in list(matches.peaks.values()):
409 peak.replicate.chromosomes[chromosome.name].remove_peak(peak) 416 peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
410 peak_groups.append(matches) 417 peak_groups.append(matches)
411 # Zero or less = no stepping 418 # Zero or less = no stepping
412 if step <= 0: 419 if step <= 0:
413 do_match(replicates, distance) 420 do_match(replicates, distance)
430 for group in peak_groups: 437 for group in peak_groups:
431 # Output matched_peaks (matched pairs). 438 # Output matched_peaks (matched pairs).
432 matched_peaks_output.writerow(gff_row(cname=group.chrom, 439 matched_peaks_output.writerow(gff_row(cname=group.chrom,
433 start=group.midpoint, 440 start=group.midpoint,
434 end=group.midpoint + 1, 441 end=group.midpoint + 1,
442 score=group.normalized_value(med),
435 source='repmatch', 443 source='repmatch',
436 score=group.normalized_value(med), 444 stype='.',
437 attrs={'median_distance': group.median_distance, 445 strand='.',
438 'replicates': group.num_replicates, 446 phase='.',
439 'value_sum': group.value_sum})) 447 attrs=[('median_distance', group.median_distance),
448 ('value_sum', group.value_sum),
449 ('replicates', group.num_replicates)]))
440 if output_detail_file: 450 if output_detail_file:
441 matched_peaks = (group.chrom, 451 matched_peaks = (group.chrom,
442 group.midpoint, 452 group.midpoint,
443 group.midpoint + 1, 453 group.midpoint + 1,
444 group.normalized_value(med), 454 group.normalized_value(med),