Mercurial > repos > iuc > cwpair2
comparison cwpair2_util.py @ 0:8600bfe7ed52 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/cwpair2 commit e96df94dba60050fa28aaf55b5bb095717a5f260
| author | iuc |
|---|---|
| date | Tue, 22 Dec 2015 17:03:46 -0500 |
| parents | |
| children | abc464ca7260 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8600bfe7ed52 |
|---|---|
| 1 import bisect | |
| 2 import csv | |
| 3 import os | |
| 4 import sys | |
| 5 import traceback | |
| 6 import matplotlib | |
| 7 matplotlib.use('Agg') | |
| 8 from matplotlib import pyplot | |
| 9 | |
| 10 # Data outputs | |
| 11 DETAILS = 'D' | |
| 12 MATCHED_PAIRS = 'MP' | |
| 13 ORPHANS = 'O' | |
| 14 # Data output formats | |
| 15 GFF_EXT = 'gff' | |
| 16 TABULAR_EXT = 'tabular' | |
| 17 # Statistics historgrams output directory. | |
| 18 HISTOGRAM = 'H' | |
| 19 # Statistics outputs | |
| 20 FINAL_PLOTS = 'F' | |
| 21 PREVIEW_PLOTS = 'P' | |
| 22 STATS_GRAPH = 'C' | |
| 23 | |
| 24 # Graph settings. | |
| 25 COLORS = 'krg' | |
| 26 Y_LABEL = 'Peak-pair counts' | |
| 27 X_LABEL = 'Peak-pair distance (bp)' | |
| 28 TICK_WIDTH = 3 | |
| 29 ADJUST = [0.140, 0.9, 0.9, 0.1] | |
| 30 PLOT_FORMAT = 'pdf' | |
| 31 pyplot.rc('xtick.major', size=10.00) | |
| 32 pyplot.rc('ytick.major', size=10.00) | |
| 33 pyplot.rc('lines', linewidth=4.00) | |
| 34 pyplot.rc('axes', linewidth=3.00) | |
| 35 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0) | |
| 36 | |
| 37 | |
| 38 class FrequencyDistribution(object): | |
| 39 | |
| 40 def __init__(self, start, end, binsize=10, d=None): | |
| 41 self.start = start | |
| 42 self.end = end | |
| 43 self.dist = d or {} | |
| 44 self.binsize = binsize | |
| 45 | |
| 46 def get_bin(self, x): | |
| 47 """ | |
| 48 Returns the bin in which a data point falls | |
| 49 """ | |
| 50 return self.start + (x-self.start) // self.binsize * self.binsize + self.binsize/2.0 | |
| 51 | |
| 52 def add(self, x): | |
| 53 x = self.get_bin(x) | |
| 54 self.dist[x] = self.dist.get(x, 0) + 1 | |
| 55 | |
| 56 def graph_series(self): | |
| 57 x = [] | |
| 58 y = [] | |
| 59 for i in range(self.start, self.end, self.binsize): | |
| 60 center = self.get_bin(i) | |
| 61 x.append(center) | |
| 62 y.append(self.dist.get(center, 0)) | |
| 63 return x, y | |
| 64 | |
| 65 def mode(self): | |
| 66 return max(self.dist.items(), key=lambda data: data[1])[0] | |
| 67 | |
| 68 def size(self): | |
| 69 return sum(self.dist.values()) | |
| 70 | |
| 71 | |
| 72 def stop_err(msg): | |
| 73 sys.stderr.write(msg) | |
| 74 sys.exit(1) | |
| 75 | |
| 76 | |
| 77 def distance(peak1, peak2): | |
| 78 return (peak2[1]+peak2[2])/2 - (peak1[1]+peak1[2])/2 | |
| 79 | |
| 80 | |
| 81 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): | |
| 82 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) | |
| 83 | |
| 84 | |
| 85 def gff_attrs(d): | |
| 86 if not d: | |
| 87 return '.' | |
| 88 return ';'.join('%s=%s' % item for item in d.items()) | |
| 89 | |
| 90 | |
| 91 def parse_chromosomes(reader): | |
| 92 # This version of cwpair2 accepts only gff format as input. | |
| 93 chromosomes = {} | |
| 94 reader.next() | |
| 95 for line in reader: | |
| 96 cname, junk, junk, start, end, value, strand, junk, junk = line | |
| 97 start = int(start) | |
| 98 end = int(end) | |
| 99 value = float(value) | |
| 100 if cname not in chromosomes: | |
| 101 chromosomes[cname] = [] | |
| 102 peaks = chromosomes[cname] | |
| 103 peaks.append((strand, start, end, value)) | |
| 104 return chromosomes | |
| 105 | |
| 106 | |
| 107 def perc95(chromosomes): | |
| 108 """ | |
| 109 Returns the 95th percentile value of the given chromosomes. | |
| 110 """ | |
| 111 values = [] | |
| 112 for peaks in chromosomes.values(): | |
| 113 for peak in peaks: | |
| 114 values.append(peak[3]) | |
| 115 values.sort() | |
| 116 # Get 95% value | |
| 117 return values[int(len(values)*0.95)] | |
| 118 | |
| 119 | |
| 120 def filter(chromosomes, threshold=0.05): | |
| 121 """ | |
| 122 Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted | |
| 123 as a proportion of the maximum, >=1.0 as an absolute value. | |
| 124 """ | |
| 125 if threshold < 1: | |
| 126 p95 = perc95(chromosomes) | |
| 127 threshold = p95 * threshold | |
| 128 # Make the threshold a proportion of the | |
| 129 for cname, peaks in chromosomes.items(): | |
| 130 chromosomes[cname] = [peak for peak in peaks if peak[3] > threshold] | |
| 131 | |
| 132 | |
| 133 def split_strands(chromosome): | |
| 134 watson = [peak for peak in chromosome if peak[0] == '+'] | |
| 135 crick = [peak for peak in chromosome if peak[0] == '-'] | |
| 136 return watson, crick | |
| 137 | |
| 138 | |
| 139 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize): | |
| 140 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize) | |
| 141 for cname, data in chromosomes.items(): | |
| 142 watson, crick = split_strands(data) | |
| 143 crick.sort(key=lambda data: float(data[1])) | |
| 144 keys = make_keys(crick) | |
| 145 for peak in watson: | |
| 146 for cpeak in get_window(crick, peak, up_distance, down_distance, keys): | |
| 147 dist.add(distance(peak, cpeak)) | |
| 148 return dist | |
| 149 | |
| 150 | |
| 151 def make_keys(crick): | |
| 152 return [(data[1] + data[2])//2 for data in crick] | |
| 153 | |
| 154 | |
| 155 def get_window(crick, peak, up_distance, down_distance, keys=None): | |
| 156 """ | |
| 157 Returns a window of all crick peaks within a distance of a watson peak. | |
| 158 crick strand MUST be sorted by distance | |
| 159 """ | |
| 160 strand, start, end, value = peak | |
| 161 midpoint = (start + end) // 2 | |
| 162 lower = midpoint - up_distance | |
| 163 upper = midpoint + down_distance | |
| 164 keys = keys or make_keys(crick) | |
| 165 start_index = bisect.bisect_left(keys, lower) | |
| 166 end_index = bisect.bisect_right(keys, upper) | |
| 167 return [cpeak for cpeak in crick[start_index:end_index]] | |
| 168 | |
| 169 | |
| 170 def match_largest(window, peak): | |
| 171 if not window: | |
| 172 return None | |
| 173 return max(window, key=lambda cpeak: cpeak[3]) | |
| 174 | |
| 175 | |
| 176 def match_closest(window, peak): | |
| 177 if not window: | |
| 178 return None | |
| 179 | |
| 180 def key(cpeak): | |
| 181 d = distance(peak, cpeak) | |
| 182 # Search negative distances last | |
| 183 if d < 0: | |
| 184 # And then prefer less negative distances | |
| 185 d = 10000 - d | |
| 186 return d | |
| 187 return min(window, key=key) | |
| 188 | |
| 189 | |
| 190 def match_mode(window, peak, mode): | |
| 191 if not window: | |
| 192 return None | |
| 193 return min(window, key=lambda cpeak: abs(distance(peak, cpeak)-mode)) | |
| 194 | |
| 195 METHODS = {'mode': match_mode, 'closest': match_closest, 'largest': match_largest} | |
| 196 | |
| 197 | |
| 198 def frequency_plot(freqs, fname, labels=[], title=''): | |
| 199 pyplot.clf() | |
| 200 pyplot.figure(figsize=(10, 10)) | |
| 201 for i, freq in enumerate(freqs): | |
| 202 x, y = freq.graph_series() | |
| 203 pyplot.plot(x, y, '%s-' % COLORS[i]) | |
| 204 if len(freqs) > 1: | |
| 205 pyplot.legend(labels) | |
| 206 pyplot.xlim(freq.start, freq.end) | |
| 207 pyplot.ylim(ymin=0) | |
| 208 pyplot.ylabel(Y_LABEL) | |
| 209 pyplot.xlabel(X_LABEL) | |
| 210 pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3]) | |
| 211 # Get the current axes | |
| 212 ax = pyplot.gca() | |
| 213 for l in ax.get_xticklines() + ax.get_yticklines(): | |
| 214 l.set_markeredgewidth(TICK_WIDTH) | |
| 215 pyplot.savefig(fname) | |
| 216 | |
| 217 | |
| 218 def create_directories(): | |
| 219 # Output histograms in pdf. | |
| 220 os.mkdir(HISTOGRAM) | |
| 221 os.mkdir('data_%s' % DETAILS) | |
| 222 os.mkdir('data_%s' % ORPHANS) | |
| 223 os.mkdir('data_%s' % MATCHED_PAIRS) | |
| 224 | |
| 225 | |
| 226 def process_file(dataset_path, galaxy_hid, method, threshold, up_distance, | |
| 227 down_distance, binsize, output_files): | |
| 228 if method == 'all': | |
| 229 match_methods = METHODS.keys() | |
| 230 else: | |
| 231 match_methods = [method] | |
| 232 statistics = [] | |
| 233 for match_method in match_methods: | |
| 234 stats = perform_process(dataset_path, | |
| 235 galaxy_hid, | |
| 236 match_method, | |
| 237 threshold, | |
| 238 up_distance, | |
| 239 down_distance, | |
| 240 binsize, | |
| 241 output_files) | |
| 242 statistics.append(stats) | |
| 243 if output_files == 'all' and method == 'all': | |
| 244 frequency_plot([s['dist'] for s in statistics], | |
| 245 statistics[0]['graph_path'], | |
| 246 labels=METHODS.keys()) | |
| 247 return statistics | |
| 248 | |
| 249 | |
| 250 def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance, | |
| 251 down_distance, binsize, output_files): | |
| 252 output_details = output_files in ["all", "matched_pair_orphan_detail"] | |
| 253 output_plots = output_files in ["all"] | |
| 254 output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"] | |
| 255 # Keep track of statistics for the output file | |
| 256 statistics = {} | |
| 257 input = csv.reader(open(dataset_path, 'rt'), delimiter='\t') | |
| 258 fpath, fname = os.path.split(dataset_path) | |
| 259 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid)) | |
| 260 statistics['dir'] = fpath | |
| 261 if threshold >= 1: | |
| 262 filter_string = 'fa%d' % threshold | |
| 263 else: | |
| 264 filter_string = 'f%d' % (threshold * 100) | |
| 265 fname = '%s_%su%dd%d_on_data_%s' % (method, filter_string, up_distance, down_distance, galaxy_hid) | |
| 266 | |
| 267 def make_histogram_path(output_type, fname): | |
| 268 return os.path.join(HISTOGRAM, 'histogram_%s_%s.%s' % (output_type, fname, PLOT_FORMAT)) | |
| 269 | |
| 270 def make_path(output_type, extension, fname): | |
| 271 # Returns the full path for an output. | |
| 272 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension)) | |
| 273 | |
| 274 def td_writer(output_type, extension, fname): | |
| 275 # Returns a tab-delimited writer for a specified output. | |
| 276 output_file_path = make_path(output_type, extension, fname) | |
| 277 return csv.writer(open(output_file_path, 'wt'), delimiter='\t') | |
| 278 | |
| 279 try: | |
| 280 chromosomes = parse_chromosomes(input) | |
| 281 except Exception: | |
| 282 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc())) | |
| 283 if output_details: | |
| 284 # Details | |
| 285 detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname) | |
| 286 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)')) | |
| 287 if output_plots: | |
| 288 # Final Plot | |
| 289 final_plot_path = make_histogram_path(FINAL_PLOTS, fname) | |
| 290 if output_orphans: | |
| 291 # Orphans | |
| 292 orphan_output = td_writer('data_%s' % ORPHANS, TABULAR_EXT, fname) | |
| 293 orphan_output.writerow(('chrom', 'strand', 'start', 'end', 'value')) | |
| 294 if output_plots: | |
| 295 # Preview Plot | |
| 296 preview_plot_path = make_histogram_path(PREVIEW_PLOTS, fname) | |
| 297 # Matched Pairs. | |
| 298 matched_pairs_output = td_writer('data_%s' % MATCHED_PAIRS, GFF_EXT, fname) | |
| 299 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT | |
| 300 if output_plots: | |
| 301 statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname) | |
| 302 statistics['perc95'] = perc95(chromosomes) | |
| 303 if threshold > 0: | |
| 304 # Apply filter | |
| 305 filter(chromosomes, threshold) | |
| 306 if method == 'mode': | |
| 307 freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize) | |
| 308 mode = freq.mode() | |
| 309 statistics['preview_mode'] = mode | |
| 310 if output_plots: | |
| 311 frequency_plot([freq], preview_plot_path, title='Preview frequency plot') | |
| 312 else: | |
| 313 statistics['preview_mode'] = 'NA' | |
| 314 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize) | |
| 315 orphans = 0 | |
| 316 # x will be used to archive the summary dataset | |
| 317 x = [] | |
| 318 for cname, chromosome in chromosomes.items(): | |
| 319 # Each peak is (strand, start, end, value) | |
| 320 watson, crick = split_strands(chromosome) | |
| 321 # Sort by value of each peak | |
| 322 watson.sort(key=lambda data: -float(data[3])) | |
| 323 # Sort by position to facilitate binary search | |
| 324 crick.sort(key=lambda data: float(data[1])) | |
| 325 keys = make_keys(crick) | |
| 326 for peak in watson: | |
| 327 window = get_window(crick, peak, up_distance, down_distance, keys) | |
| 328 if method == 'mode': | |
| 329 match = match_mode(window, peak, mode) | |
| 330 else: | |
| 331 match = METHODS[method](window, peak) | |
| 332 if match: | |
| 333 midpoint = (match[1] + match[2] + peak[1] + peak[2]) // 4 | |
| 334 d = distance(peak, match) | |
| 335 dist.add(d) | |
| 336 # Simple output in gff format. | |
| 337 x.append(gff_row(cname, | |
| 338 source='cwpair', | |
| 339 start=midpoint, | |
| 340 end=midpoint + 1, | |
| 341 score=peak[3] + match[3], | |
| 342 attrs={'cw_distance': d})) | |
| 343 if output_details: | |
| 344 detailed_output.writerow((cname, | |
| 345 peak[1], | |
| 346 peak[2], | |
| 347 peak[3], | |
| 348 '+', | |
| 349 cname, | |
| 350 match[1], | |
| 351 match[2], | |
| 352 match[3], '-', | |
| 353 midpoint, | |
| 354 peak[3]+match[3], | |
| 355 d)) | |
| 356 i = bisect.bisect_left(keys, (match[1]+match[2])/2) | |
| 357 del crick[i] | |
| 358 del keys[i] | |
| 359 else: | |
| 360 if output_orphans: | |
| 361 orphan_output.writerow((cname, peak[0], peak[1], peak[2], peak[3])) | |
| 362 # Keep track of orphans for statistics. | |
| 363 orphans += 1 | |
| 364 # Remaining crick peaks are orphans | |
| 365 if output_orphans: | |
| 366 for cpeak in crick: | |
| 367 orphan_output.writerow((cname, cpeak[0], cpeak[1], cpeak[2], cpeak[3])) | |
| 368 # Keep track of orphans for statistics. | |
| 369 orphans += len(crick) | |
| 370 # Sort output descending by score. | |
| 371 x.sort(key=lambda data: float(data[5]), reverse=True) | |
| 372 # Writing a summary to gff format file | |
| 373 for row in x: | |
| 374 row_tmp = list(row) | |
| 375 # Dataset in tuple cannot be modified in Python, so row will | |
| 376 # be converted to list format to add 'chr'. | |
| 377 if row_tmp[0] == "999": | |
| 378 row_tmp[0] = 'chrM' | |
| 379 elif row_tmp[0] == "998": | |
| 380 row_tmp[0] = 'chrY' | |
| 381 elif row_tmp[0] == "997": | |
| 382 row_tmp[0] = 'chrX' | |
| 383 else: | |
| 384 row_tmp[0] = row_tmp[0] | |
| 385 # Print row_tmp. | |
| 386 matched_pairs_output.writerow(row_tmp) | |
| 387 statistics['paired'] = dist.size() * 2 | |
| 388 statistics['orphans'] = orphans | |
| 389 statistics['final_mode'] = dist.mode() | |
| 390 if output_plots: | |
| 391 frequency_plot([dist], final_plot_path, title='Frequency distribution') | |
| 392 statistics['dist'] = dist | |
| 393 return statistics |
