comparison hexagram-6ae12361157c/hexagram/hexagram.py @ 0:1407e3634bcf draft default tip

Uploaded r11 from test tool shed.
author adam-novak
date Tue, 22 Oct 2013 14:17:59 -0400
1 #!/usr/bin/env python2.7
2 """
3 hexagram.py: Given a matrix of similarities, produce a hexagram visualization.
4
5 This script takes in the filename of a tab-separated value file containing a
6 sparse similarity matrix (with string labels) and several matrices of
7 layer/score data. It produces an HTML file (and several support files) that
8 provide an interactive visualization of the items clustered on a hexagonal grid.
9
10 This script depends on the DrL graph layout package, binaries for which must be
11 present in your PATH.
12
13 Re-uses sample code and documentation from
14 <http://users.soe.ucsc.edu/~karplus/bme205/f12/Scaffold.html>
15 """
16
17 import argparse, sys, os, itertools, math, numpy, subprocess, shutil, tempfile
18 import collections, multiprocessing, traceback
19 import scipy.stats, scipy.linalg
20 import os.path
21 import tsv
22
23 # Global variable to hold opened matrices files
24 matrices = []
25
26
27 def parse_args(args):
28 """
29 Takes in the command-line arguments list (args), and returns a nice argparse
30 result with fields for all the options.
31 Borrows heavily from the argparse documentation examples:
32 <http://docs.python.org/library/argparse.html>
33 """
34
35 # The command line arguments start with the program name, which we don't
36 # want to treat as an argument for argparse. So we remove it.
37 args = args[1:]
38
39 # Construct the parser (which is stored in parser)
40 # Module docstring lives in __doc__
41 # See http://python-forum.com/pythonforum/viewtopic.php?f=3&t=36847
42 # And a formatter class so our examples in the docstring look good. Isn't it
43 # convenient how we already wrapped it to 80 characters?
44 # See http://docs.python.org/library/argparse.html#formatter-class
45 parser = argparse.ArgumentParser(description=__doc__,
46 formatter_class=argparse.RawDescriptionHelpFormatter)
47
48 # Now add all the options to it
49 # Options match the ctdHeatmap tool options as much as possible.
50 parser.add_argument("similarity", type=str, nargs='+',
51 help="the unopened files of similarity matrices")
52 parser.add_argument("--names", type=str, action="append", default=[],
53 help="the unopened files of similarity matrices")
54 parser.add_argument("--scores", type=str,
55 action="append", default=[],
56 help="a TSV to read scores for each signature from")
57 parser.add_argument("--colormaps", type=argparse.FileType("r"),
58 default=None,
59 help="a TSV defining coloring and value names for discrete scores")
60 parser.add_argument("--html", "-H", type=str,
61 default="index.html",
62 help="where to write HTML report")
63 parser.add_argument("--directory", "-d", type=str, default=".",
64 help="directory in which to create other output files")
65 parser.add_argument("--query", type=str, default=None,
66 help="Galaxy-escaped name of the query signature")
67 parser.add_argument("--window_size", type=int, default=20,
68 help="size of the window to use when looking for clusters")
69 parser.add_argument("--truncation_edges", type=int, default=10,
70 help="number of edges for DrL truncate to pass per node")
71 parser.add_argument("--no-stats", dest="stats", action="store_false",
72 default=True,
73 help="disable cluster-finding statistics")
74 parser.add_argument("--include-singletons", dest="singletons",
75 action="store_true", default=False,
76 help="add self-edges to retain unconnected points")
77
78 return parser.parse_args(args)
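# A typical invocation might look like this (all file names hypothetical):
#
#   hexagram.py similarity.tab --names "mRNA expression" --scores scores.tab \
#       --colormaps colormaps.tab --directory out --html out/index.html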
79
80 def hexagon_center(x, y, scale=1.0):
81 """
82 Given a coordinate on a grid of hexagons (using wiggly rows in x), what is
83 the 2d Euclidean coordinate of its center?
84
85 x and y are integer column and row coordinates of the hexagon in the grid.
86
87 scale is a float specifying hexagon side length.
88
89 The origin in coordinate space is defined as the upper left corner of the
90 bounding box of the hexagon with indices x=0 and y=0.
91
92 Returns a tuple of floats.
93 """
94 # The grid looks like this:
95 #
96 # /-\ /-\ /-\ /-\
97 # /-\-/-\-/-\-/-\-/-\
98 # \-/-\-/-\-/-\-/-\-/
99 # /-\-/-\-/-\-/-\-/-\
100 # \-/-\-/-\-/-\-/-\-/
101 # /-\-/-\-/-\-/-\-/-\
102 # \-/ \-/ \-/ \-/ \-/
103 #
104 # Say a hexagon side has length 1
105 # It's 2 across corner to corner (x), and sqrt(3) across side to side (y)
106 # X coordinates are 1.5 per column
107 # Y coordinates (down from top) are sqrt(3) per row, -1/2 sqrt(3) if you're
108 # in an odd column.
109
110 center_y = math.sqrt(3) * y
111 if x % 2 == 1:
112 # Odd column: shift up
113 center_y -= 0.5 * math.sqrt(3)
114
115 return (1.5 * x * scale + scale, center_y * scale + math.sqrt(3.0) / 2.0 *
116 scale)
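# A quick numeric check of the geometry above, at the default scale=1.0:
# hexagon_center(0, 0) == (1.0, math.sqrt(3) / 2.0), and
# hexagon_center(1, 0) == (2.5, 0.0), since odd columns are shifted up by
# half a row.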
117
118 def hexagon_pick(x, y, scale=1.0):
119 """
120 Given floats x and y specifying coordinates in the plane, determine which
121 hexagon grid cell that point is in.
122
123 scale is a float specifying hexagon side length.
124
125 See http://blog.ruslans.com/2011/02/hexagonal-grid-math.html
126 But we flip the direction of the wiggle. Odd rows are up (-y)
127 """
128
129 # How high is a hex?
130 hex_height = math.sqrt(3) * scale
131
132 # First we pick a rectangular tile, from the point of one side-triangle to
133 # the base of the other in width, and the whole hexagon height in height.
134
135 # How wide are these tiles? Corner to line-between-far-corners distance
136 tile_width = (3.0 / 2.0 * scale)
137
138 # Tile X index is floor(x / tile_width)
139 tile_x = int(math.floor(x / tile_width))
140
141 # We need this intermediate value for the Y index and for tile-internal
142 # picking
143 corrected_y = y + (tile_x % 2) * hex_height / 2.0
144
145 # Tile Y index is floor((y + (x index mod 2) * hex height/2) / hex height)
146 tile_y = int(math.floor(corrected_y / hex_height))
147
148 # Find coordinates within the tile
149 internal_x = x - tile_x * tile_width
150 internal_y = corrected_y - tile_y * hex_height
151
152 # Do tile-scale picking
153 # Are we in the one corner, the other corner, or the bulk of the tile?
154 if internal_x > scale * abs(0.5 - internal_y / hex_height):
155 # We're in the bulk of the tile
156 # This is the column (x) of the picked hexagon
157 hexagon_x = tile_x
158
159 # This is the row (y) of the picked hexagon
160 hexagon_y = tile_y
161 else:
162 # We're in a corner.
163 # In an even column, the lower left is part of the next row, and the
164 # upper left is part of the same row. In an odd column, the lower left
165 # is part of the same row, and the upper left is part of the previous
166 # row.
167 if internal_y > hex_height / 2.0:
168 # It's the lower left corner
169 # This is the offset in row (y) that being in this corner gives us
170 # The lower left corner is always 1 row below the upper left corner.
171 corner_y_offset = 1
172 else:
173 corner_y_offset = 0
174
175 # TODO: verify this for correctness. It seems to be right, but I want a
176 # unit test to be sure.
177 # This is the row (y) of the picked hexagon
178 hexagon_y = tile_y - tile_x % 2 + corner_y_offset
179
180 # This is the column (x) of the picked hexagon
181 hexagon_x = tile_x - 1
182
183 # Now we've picked the hexagon
184 return (hexagon_x, hexagon_y)
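
# A minimal round-trip test sketch for the picking math, answering the TODO
# above. This helper is our own addition (nothing calls it automatically;
# main() below only spot-checks hexagon (0, 0)).
def test_hexagon_pick(scale=1.0):
    """
    Check that picking the center of each hexagon in a small grid returns
    that hexagon's own indices. Raises AssertionError on the first mismatch.
    """
    for x in xrange(-5, 6):
        for y in xrange(-5, 6):
            center_x, center_y = hexagon_center(x, y, scale=scale)
            picked = hexagon_pick(center_x, center_y, scale=scale)
            assert picked == (x, y), "Picked {} for {}".format(picked, (x, y))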
185
186 def radial_search(center_x, center_y):
187 """
188 An iterator that yields coordinate tuples (x, y) in order of increasing
189 hex-grid distance from the specified center position.
190 """
191
192 # A hexagon has neighbors at the following relative coordinates:
193 # (-1, 0), (1, 0), (0, -1), (0, 1)
194 # and ((-1, 1) and (1, 1) if in an even column)
195 # or ((-1, -1) and (1, -1) if in an odd column)
196
197 # We're going to go outwards using breadth-first search, so we need a queue
198 # of hexes to visit and a set of already visited hexes.
199
200 # This holds a queue (really a deque) of hexes waiting to be visited.
201 # A list has O(n) pop/insert at left.
202 queue = collections.deque()
203 # This holds a set of the (x, y) coordinate tuples of already-seen hexes, so
204 # we don't enqueue them again. The center starts out seen, so the BFS can't
205 seen = set([(center_x, center_y)])
206
207 # First place to visit is the center.
208 queue.append((center_x, center_y))
209
210 while len(queue) > 0:
211 # We should in theory never run out of items in the queue.
212 # Get the current x and y to visit.
213 x, y = queue.popleft()
214
215 # Yield the location we're visiting
216 yield (x, y)
217
218 # This holds a list of all relative neighbor positions as (x, y) tuples.
219 neighbor_offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
220 if x % 2 == 0:
221 # An even-column hex also has these neighbors
222 neighbor_offsets += [(-1, 1), (1, 1)]
223 else:
224 # An odd-column hex also has these neighbors
225 neighbor_offsets += [(-1, -1), (1, -1)]
226
227 for x_offset, y_offset in neighbor_offsets:
228 # First calculate the absolute position of the neighbor in x
229 neighbor_x = x + x_offset
230 # And in y
231 neighbor_y = y + y_offset
232
233 if (neighbor_x, neighbor_y) not in seen:
234 # This is a hex that has never been in the queue. Add it.
235 queue.append((neighbor_x, neighbor_y))
236
237 # Record that it has ever been enqueued
238 seen.add((neighbor_x, neighbor_y))
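# For example, radial_search(0, 0) yields (0, 0) first, then its six
# neighbors in enqueue order: (-1, 0), (1, 0), (0, -1), (0, 1), (-1, 1),
# (1, 1), and then continues outward through farther and farther hexes.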
239
240
241
242
243 def assign_hexagon(hexagons, node_x, node_y, node, scale=1.0):
244 """
245 This function assigns the given node to a hexagon in hexagons. hexagons is a
246 defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
247 or None if a hexagon is free. node_x and node_y are the x and y coordinates
248 of the node, adapted so that the seed node lands in the 0, 0 hexagon, and
249 re-scaled to reduce hexagon conflicts. node is the node to be assigned.
250 scale, if specified, is the hexagon side length in node space units.
251
252 This function assigns nodes to their closest hexagon, reprobing outwards if
253 already occupied.
254
255 When the function completes, node is stored in hexagons under some (x, y)
256 tuple.
257
258 Returns the distance this hexagon is from its ideal location.
259 """
260
261 # These hold the hexagon that the point falls in, which may be taken.
262 best_x, best_y = hexagon_pick(node_x, node_y, scale=scale)
263
264 for x, y in radial_search(best_x, best_y):
265 # These hexes are enumerated in order of increasing distance from the
266 # best one, starting with the best hex itself.
267
268 if hexagons[(x, y)] is None:
269 # This is the closest free hex. Break out of the loop, leaving x and
270 # y pointing here.
271 break
272
273 # Assign the node to the hexagon
274 hexagons[(x, y)] = node
275
276 return math.sqrt((x - best_x) ** 2 + (y - best_y) ** 2)
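# A sketch of how the assignment functions are driven (this mirrors
# compute_hexagram_assignments below):
#
#   hexagons = collections.defaultdict(lambda: None)
#   badness = assign_hexagon(hexagons, node_x, node_y, node, scale=1.0)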
277
278
279
280 def assign_hexagon_local_radial(hexagons, node_x, node_y, node, scale=1.0):
281 """
282 This function assigns the given node to a hexagon in hexagons. hexagons is a
283 defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
284 or None if a hexagon is free. node_x and node_y are the x and y coordinates
285 of the node, adapted so that the seed node lands in the 0, 0 hexagon, and
286 re-scaled to reduce hexagon conflicts. node is the node to be assigned.
287 scale, if specified, is the hexagon side length in node space units.
288
289 This function assigns nodes to their closest hexagon. If that hexagon is
290 full, it re-probes in the direction that the node is from the closest
291 hexagon's center.
292
293 When the function completes, node is stored in hexagons under some (x, y)
294 tuple.
295
296 Returns the distance this hexagon is from its ideal location.
297 """
298
299 # These hold the hexagon that the point falls in, which may be taken.
300 best_x, best_y = hexagon_pick(node_x, node_y, scale=scale)
301
302 # These hold the center of that hexagon in float space
303 center_x, center_y = hexagon_center(best_x, best_y, scale=scale)
304
305 # This holds the distance from this point to the center of that hexagon
306 node_distance = math.sqrt((node_x - center_x) ** 2 + (node_y - center_y) **
307 2)
308
309 # These hold the normalized direction of this point, relative to the center
310 # of its best hexagon
311 direction_x = (node_x - center_x) / node_distance
312 direction_y = (node_y - center_y) / node_distance
313
314 # Do a search in that direction, starting at the best hex.
315
316 # These are the hexagon indices we're considering
317 x, y = best_x, best_y
318
319 # These are the Cartesian coordinates we're probing. Must be in the x, y hex
320 # as a loop invariant.
321 test_x, test_y = center_x, center_y
322
323 while hexagons[(x, y)] is not None:
324 # Re-probe outwards from the best hex in scale-sized steps
325 # TODO: is that the right step size? scale/2-sized steps would probe more
326 # densely, at the cost of re-picking more often.
327 test_x += direction_x * scale
328 test_y += direction_y * scale
329
330 # Re-pick x and y for the hex containing our test point
331 x, y = hexagon_pick(test_x, test_y, scale=scale)
332
333 # We've finally reached the edge of the cluster.
334 # Drop our hexagon
335 hexagons[(x, y)] = node
336
337 return math.sqrt((x - best_x) ** 2 + (y - best_y) ** 2)
338
339 def assign_hexagon_radial(hexagons, node_x, node_y, node, scale=1.0):
340 """
341 This function assigns the given node to a hexagon in hexagons. hexagons is a
342 defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
343 or None if a hexagon is free. node_x and node_y are the x and y coordinates
344 of the node, adapted so that the seed node lands in the 0, 0 hexagon, and
345 re-scaled to reduce hexagon conflicts. node is the node to be assigned.
346 scale, if specified, is the hexagon side length in node space units.
347
348 This function assigns nodes to hexagons based on radial distance from 0, 0.
349 This makes hexagon assignment much more dense, but can lose spatial
350 structure.
351
352 When the function completes, node is stored in hexagons under some (x, y)
353 tuple.
354
355 Returns the distance this hexagon is from its ideal location. Unfortunately,
356 this doesn't really make sense for this assignment scheme, so it is always
357 0.
358 """
359
360 # Compute node's distance from the origin
361 node_distance = math.sqrt(node_x ** 2 + node_y ** 2)
362
363 # Compute normalized direction from the origin for this node
364 direction_x = node_x / node_distance
365 direction_y = node_y / node_distance
366
367 # These are the coordinates we are testing
368 test_x = 0
369 test_y = 0
370
371 # These are the hexagon indices that correspond to that point
372 x, y = hexagon_pick(test_x, test_y, scale=scale)
373
374 while hexagons[(x, y)] is not None:
375 # Re-probe outwards from the origin in scale-sized steps
376 # TODO: is that the right step size?
377 test_x += direction_x * scale
378 test_y += direction_y * scale
379
380 # Re-pick
381 x, y = hexagon_pick(test_x, test_y, scale=scale)
382
383 # We've finally reached the edge of the cluster.
384 # Drop our hexagon
385 # TODO: this has to be N^2 if we line them all up in a line
386 hexagons[(x, y)] = node
387
388 return 0
389
390 def hexagons_in_window(hexagons, x, y, width, height):
391 """
392 Given a dict from (x, y) position to signature names, return the list of all
393 signatures in the window starting at hexagon x, y and extending width in the
394 x direction and height in the y direction on the hexagon grid.
395 """
396
397 # This holds the list of hexagons we've found
398 found = []
399
400 for i in xrange(x, x + width):
401 for j in xrange(y, y + height):
402 if hexagons.has_key((i, j)):
403 # This position in the window has a hex.
404 found.append(hexagons[(i, j)])
405
406 return found
407
408 class ClusterFinder(object):
409 """
410 A class that can be invoked to find the p value of the best cluster in its
411 layer. Instances are pickleable.
412 """
413
414 def __init__(self, hexagons, layer, window_size=5):
415 """
416 Keep the given hexagons dict (from (x, y) to signature name) and the
417 given layer (a dict from signature name to a value), and the given
418 window size, in a ClusterFinder object.
419 """
420
421 # TODO: This should probably all operate on numpy arrays that we can
422 # slice efficiently.
423
424 # Store the hexagon assignments
425 self.hexagons = hexagons
426 # Store the layer
427 self.layer = layer
428
429 # Store the window size
430 self.window_size = window_size
431
432 @staticmethod
433 def continuous_p(in_values, out_values):
434 """
435 Get the p value for in_values and out_values being distinct continuous
436 distributions.
437
438 in_values and out_values are both Numpy arrays. Returns the p value, or
439 raises a ValueError if the statistical test cannot be run for some
440 reason.
441
442 Uses the Mann-Whitney U test.
443 """
444
445 # Do a Mann-Whitney U test to see how different the data
446 # sets are.
447 u_statistic, p_value = scipy.stats.mannwhitneyu(in_values,
448 out_values)
449
450 return p_value
451
452 @staticmethod
453 def dichotomous_p(in_values, out_values):
454 """
455 Given two one-dimensional Numpy arrays of 0s and 1s, compute a p value
456 for the in_values having a different probability of being 1 than the
457 frequency of 1s in the out_values.
458
459 This test uses the scipy.stats.binom_test function, which is an exact test
460 and does not rely on the normal approximation. Therefore, it should be
461 valid for arbitrarily small frequencies of either 0s or 1s in in_values.
462
463 TODO: What if out_values is shorter than in_values?
464 """
465
466 if len(out_values) == 0:
467 raise ValueError("Background group is empty!")
468
469 # This holds the observed frequency of 1s in out_values
470 frequency = numpy.sum(out_values) / float(len(out_values))
471
472 # This holds the number of 1s in in_values
473 successes = numpy.sum(in_values)
474
475 # This holds the number of "trials" we got that many successes in
476 trials = len(in_values)
477
478 # Return how significantly the frequency inside differs from that
479 # outside.
480 return scipy.stats.binom_test(successes, trials, frequency)
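# For example, if in_values is four 1s and out_values holds one 1 and nine
# 0s, the background frequency is 0.1, and binom_test(4, 4, 0.1) returns
# p = 0.1 ** 4 = 0.0001: four hits in a row is very surprising at that rate.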
481
482 @staticmethod
483 def categorical_p(in_values, out_values):
484 """
485 Given two one-dimensional Numpy arrays of integers (which may be stored
486 as floats), which represent items being assigned to different
487 categories, return a p value for the distribution of categories observed
488 in in_values differing from that observed in out_values.
489
490 The normal way to do this is with a chi-squared goodness of fit test.
491 However, that test has invalid assumptions when there are fewer than 5
492 expected and 5 observed observations in every category.
493 See http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
495
496 However, we will use it anyway, because the tests that don't break down
497 are prohibitively slow.
498 """
499
500 # Convert our inputs to integer arrays
501 in_values = in_values.astype(int)
502 out_values = out_values.astype(int)
503
504 # How many categories are there (count 0 to the maximum value)
505 num_categories = max(numpy.max(in_values), numpy.max(out_values)) + 1
506
507 # Count the number of in_values and out_values in each category
508 in_counts = numpy.array([len(in_values[in_values == i]) for i in
509 xrange(num_categories)])
510 out_counts = numpy.array([len(out_values[out_values == i]) for i in
511 xrange(num_categories)])
512
513 # Get the p value for the window being from the distribution estimated
514 # from the outside counts. chisquare needs the expected counts to sum to
515 # the same total as the observed counts, so rescale them here.
516 expected = out_counts * numpy.sum(in_counts) / float(numpy.sum(out_counts))
517 _, p_value = scipy.stats.chisquare(in_counts, expected)
517
518 return p_value
519
520 def __call__(self):
521 """
522 Find the best p value for any window of size window_size. Return it.
523 """
524
525 # Calculate the bounding box where we want to look for windows.
526 # TODO: This would just be all of a numpy array
527 min_x = min(coords[0] for coords in self.hexagons.iterkeys())
528 min_y = min(coords[1] for coords in self.hexagons.iterkeys())
529 max_x = max(coords[0] for coords in self.hexagons.iterkeys())
530 max_y = max(coords[1] for coords in self.hexagons.iterkeys())
531
532 # This holds a Numpy array of all the data by x, y
533 layer_data = numpy.empty((max_x - min_x + 1, max_y - min_y + 1))
534
535 # Fill it with NaN so we can mask those out later
536 layer_data[:] = numpy.NAN
537
538 for (hex_x, hex_y), name in self.hexagons.iteritems():
539 # Copy the layer values into the Numpy array
540 if self.layer.has_key(name):
541 layer_data[hex_x - min_x, hex_y - min_y] = self.layer[name]
542
543 # This holds a masked version of the layer data
544 layer_data_masked = numpy.ma.masked_invalid(layer_data, copy=False)
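# For example, masked_invalid on [[0.5, nan], [nan, 2.0]] masks both nan
# cells, so ~layer_data_masked.mask is True exactly where a hexagon
# contributed a score above.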
545
546 # This holds the smallest p value we have found for this layer
547 best_p = float("+inf")
548
549 # This holds the statistical test to use (a function from two Numpy
550 # arrays to a p value)
551 # The most specific test is the dichotomous test (0 or 1)
552 statistical_test = self.dichotomous_p
553
554 if numpy.sum(~layer_data_masked.mask) == 0:
555 # There is actually no data in this layer at all.
556 # nditer complains if we try to iterate over an empty thing.
557 # So quit early and say we couldn't find anything.
558 return best_p
559
560 for value in numpy.nditer(layer_data_masked[~layer_data_masked.mask]):
561 # Check all the values in the layer.
562 # If this value is out of the domain of the current statistical
563 # test, upgrade to a more general test.
564
565 if statistical_test == self.dichotomous_p and (value > 1 or
566 value < 0):
567
568 # We can't use a dichotomous test on things outside 0 to 1
569 # But we haven't yet detected any non-integers
570 # Use categorical
571 statistical_test = self.categorical_p
572
573 if value % 1 != 0:
574 # This is not an integer value
575 # So, we must use a continuous statistical test
576 statistical_test = self.continuous_p
577
578 # This is the least specific test, so we can stop now
579 break
580
581
582 for i in xrange(layer_data_masked.shape[0] - self.window_size + 1):
583 for j in xrange(layer_data_masked.shape[1] - self.window_size + 1):
584
585 # Get the layer values for hexes in the window, as a Numpy
586 # masked array.
587 in_region = layer_data_masked[i:i + self.window_size,
588 j:j + self.window_size]
589
590 # And as a 1d Numpy array
591 in_values = numpy.reshape(in_region[~in_region.mask], -1).data
592
593 # And out of the window (all the other hexes) as a masked array
594 out_region = numpy.ma.copy(layer_data_masked)
595 # We get this by masking out everything in the region
596 out_region.mask[i:i + self.window_size,
597 j:j + self.window_size] = True
598
599 # And as a 1d Numpy array
600 out_values = numpy.reshape(out_region[~out_region.mask],
601 -1).data
602
603
604 if len(in_values) == 0 or len(out_values) == 0:
605 # Can't do any stats on this window
606 continue
607
608 if len(in_values) < 0.5 * self.window_size ** 2:
609 # The window is less than half full. Skip it.
610 # TODO: Make this threshold configurable.
611 continue
612
613 try:
614
615 # Get the p value for this window under the selected
616 # statistical test
617 p_value = statistical_test(in_values, out_values)
618
619 # If this is the best p value so far, record it
620 best_p = min(best_p, p_value)
621 except ValueError:
622 # Probably an all-zero layer, or something else the test
623 # can't handle.
624 # But let's try all the other windows to be safe.
625 # Maybe one will work.
626 pass
627
628
629
630 # We have now found the best p for any window for this layer.
631 print "Best p found: {}".format(best_p)
632 sys.stdout.flush()
633
634 return best_p
635
636 def run_functor(functor):
637 """
638 Given a no-argument functor (like a ClusterFinder), run it and return its
639 result. We can use this with multiprocessing.map and map it over a list of
640 job functors to do them.
641
642 Handles getting more than multiprocessing's pitiful exception output
643 """
644
645 try:
646 return functor()
647 except:
648 # Put all exception text into an exception and raise that
649 raise Exception(traceback.format_exc())
650
651 def open_matrices(names):
652 """
653 The argument parser now takes multiple similarity matrices as input and
654 saves their file names as strings. We want to store the names of these
655 matrices for display later in hexagram.js, in order to allow the user to
656 navigate and know what type of visualization map they are looking at -
657 gene expression, copy number, etc.
658
659 Since the parser no longer opens the files automatically, we must do it
660 in this function.
661 """
662
663 # For each file name, open the file and add it to the matrices list
664 # 'r' is the argument stating that the file will be read-only
665 print "Opening matrices..."
666 for similarity_filename in names:
667 matrix_file = tsv.TsvReader(open(similarity_filename, "r"))
668 matrices.append(matrix_file)
669
670 def compute_beta(coords, matrix, axis, index, options):
671 """
672 Compute and return a beta matrix from coords * matrix.
673 TODO: also print the matrix to a file to be read clientside; that writing
674 function still needs to be added.
675 """
676 beta = coords * matrix
677 return beta
678
679 def drl_similarity_functions(matrix, index, options):
680 """
681 Performs all the functions needed to format a similarity matrix into a
682 TSV format that DrL can consume. Then the full DrL pipeline (truncate,
683 layout, recoord) is run on the similarity matrix.
684
685 Options is passed to access options.singletons and other required aspects
686 of the parsed args.
687 """
688
689 # Work in a temporary directory, which mkdtemp creates for us.
690 # (It is deleted again at the end of this function.)
691 drl_directory = tempfile.mkdtemp()
692
693 # This is the base name for all the files that DrL uses to do the layout
694 # We're going to put it in a temporary directory.
695 # index added to extension in order to keep track of
696 # respective layouts
697 drl_basename = os.path.join(drl_directory, "layout" + str(index))
698
699 # We can just pass our similarity matrix to DrL's truncate
700 # But we want to run it through our tsv parser to strip comments and ensure
701 # it's valid
702
703 # This holds a reader for the similarity matrix
704 sim_reader = matrix
705
706 # This holds a writer for the sim file
707 sim_writer = tsv.TsvWriter(open(drl_basename + ".sim", "w"))
708
709 print "Regularizing similarity matrix..."
710 sys.stdout.flush()
711
712 # This holds a list of all unique signature names in the similarity matrix.
713 # We can use it to add edges to keep singletons.
714 signatures = set()
715
716 print "Reach for parts in sim_reader"
717 for parts in sim_reader:
718 # Keep the signature names used
719 signatures.add(parts[0])
720 signatures.add(parts[1])
721
722 # Save the line to the regularized file
723 sim_writer.list_line(parts)
724
725 if options.singletons:
726 # Now add a self-edge on every node, so we don't drop nodes with no
727 # other strictly positive edges
728 for signature in signatures:
729 sim_writer.line(signature, signature, 1)
730
731 sim_reader.close()
732 sim_writer.close()
733
734 # Now our input for DrL is prepared!
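# The regularized .sim file is plain three-column, tab-separated text that
# DrL's truncate step reads directly, e.g. (hypothetical signature names):
#
#   sample-1    sample-2    0.83
#   sample-1    sample-3    0.27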
735
736 # Do DrL truncate.
737 # TODO: pass a truncation level
738 print "DrL: Truncating..."
739 sys.stdout.flush()
740 subprocess.check_call(["truncate", "-t", str(options.truncation_edges),
741 drl_basename])
742
743 # Run the DrL layout engine.
744 print "DrL: Doing layout..."
745 sys.stdout.flush()
746 subprocess.check_call(["layout", drl_basename])
747
748 # Put the string names back
749 print "DrL: Restoring names..."
750 sys.stdout.flush()
751 subprocess.check_call(["recoord", drl_basename])
752
753 # Now DrL has saved its coordinates as <signature name>\t<x>\t<y> rows in
754 # <basename>.coord
755
756 # We want to read that.
757 # This holds a reader for the DrL output
758 coord_reader = tsv.TsvReader(open(drl_basename + ".coord", "r"))
759
760 # This holds a dict from signature name string to (x, y) float tuple. It is
761 # also our official collection of node names that made it through DrL, and
762 # therefore need their score data sent to the client.
763 nodes = {}
764
765 print "Reading DrL output..."
766 sys.stdout.flush()
767 for parts in coord_reader:
768 nodes[parts[0]] = (float(parts[1]), float(parts[2]))
769
770 coord_reader.close()
771
772 # Save the DrL coordinates in our bundle, to be displayed client-side for
773 # debugging.
774
775 # index added to drl.tab extension in order to keep track of
776 # respective drl.tabs
777 coord_writer = tsv.TsvWriter(open(
778 os.path.join(options.directory, "drl" + str(index) + ".tab"), "w"))
779
780 for signature_name, (x, y) in nodes.iteritems():
781 # Write a tsv with names instead of numbers, like what DrL recoord would
782 # have written. This is what the Javascript on the client side wants.
783 coord_writer.line(signature_name, x, y)
784
785 coord_writer.close()
786
787 # Delete our temporary directory.
788 shutil.rmtree(drl_directory)
789
790 # Return nodes dict back to main method for further processes
791 return nodes
792
793 def compute_hexagram_assignments (nodes, index, options):
794 """
795 Now that we are taking multiple similarity matrices as inputs, we must
796 compute hexagram assignments for each similarity matrix. These assignments
797 are based upon the nodes output provided by the DrL function.
798
799 Index relates each matrix name with its drl output, nodes, assignments, etc.
800 Options contains the parsed arguments that are present in the main method.
801 """
802 # Do the hexagon layout
803 # We do the squiggly rows setup, so express everything as integer x, y
804
805 # This is a defaultdict from (x, y) integer tuple to id that goes there, or
806 # None if it's free.
807 global hexagons
808 hexagons = collections.defaultdict(lambda: None)
809
810 # This holds the side length that we use
811 side_length = 1.0
812
813 # This holds what will be a layer of how badly placed each hexagon is
814 # A dict from node name to layer value
815 placement_badnesses = {}
816
817 for node, (node_x, node_y) in nodes.iteritems():
818 # Assign each node to a hexagon
819 # This holds the resulting placement badness for that hexagon (i.e.
820 # distance from ideal location)
821 badness = assign_hexagon(hexagons, node_x, node_y, node,
822 scale=side_length)
823
824 # Put the badness in the layer
825 placement_badnesses[node] = float(badness)
826
827 # Normalize the placement badness layer
828 # This holds the max placement badness
829 max_placement_badness = max(placement_badnesses.itervalues())
830 print "Max placement badness: {}".format(max_placement_badness)
831
832 if max_placement_badness != 0:
833 # Normalize by the max if possible.
834 placement_badnesses = {node: value / max_placement_badness for node,
835 value in placement_badnesses.iteritems()}
836
837 # The hexagons have been assigned. Make hexagons be a dict instead of a
838 # defaultdict, so it pickles.
839 # TODO: I should change it so I don't need to do this.
840 hexagons = dict(hexagons)
841
842 # Now dump the hexagon assignments as an id, x, y tsv. This will be read by
843 # the JavaScript on the static page and be used to produce the
844 # visualization.
845 hexagon_writer = tsv.TsvWriter(open(os.path.join(options.directory,
846 "assignments"+ str(index) + ".tab"), "w"))
847
848 # First find the x and y offsets needed to make all hexagon positions
849 # positive
850 min_x = min(coords[0] for coords in hexagons.iterkeys())
851 min_y = min(coords[1] for coords in hexagons.iterkeys())
852
853 for coords, name in hexagons.iteritems():
854 # Write this hexagon assignment, converted to all-positive coordinates.
855 hexagon_writer.line(name, coords[0] - min_x, coords[1] - min_y)
856 hexagon_writer.close()
857
858 # Hand the placement_badnesses dict back to the main method so that it can
859 # be used elsewhere.
860 return placement_badnesses
861
862 def write_matrix_names (options):
863 """
864 Write the names of the similarity matrices so that hexagram.js can
865 process the names and create the toggle layout GUI.
866 We pass options to access the parsed args and thus the matrix names.
867 """
868 name_writer = tsv.TsvWriter(open(os.path.join(options.directory,
869 "matrixnames.tab"), "w"))
870 for i in options.names:
871 name_writer.line(i)
872
873 name_writer.close()
874
875 def main(args):
876 """
877 Parses command line arguments, and makes visualization.
878 "args" specifies the program arguments, with args[0] being the executable
879 name. The return value should be used as the program's exit code.
880 """
881
882 options = parse_args(args) # This holds the nicely-parsed options object
883
884 print "Created Options"
885
886 # Test our picking
887 x, y = hexagon_center(0, 0)
888 if hexagon_pick(x, y) != (0, 0):
889 raise Exception("Picking is broken!")
890
891 # First bit of stdout becomes annotation in Galaxy
892 # Make sure our output directory exists.
893 if not os.path.exists(options.directory):
894 # makedirs is the right thing to use here: recursive
895 os.makedirs(options.directory)
896
897 print "Writing matrix names..."
898 # We must write the file names for hexagram.js to access.
899 write_matrix_names(options)
900
901 print "About to open matrices..."
902
903 # We have file names stored in options.similarity
904 # We must open the files and store them in matrices list for access
905 open_matrices(options.similarity)
906
907 print "Opened matrices..."
908
909 # The nodes list stores the list of nodes for each matrix
910 # We must keep track of each set of nodes
911 nodes_multiple = []
912
913 print "Created nodes_multiple list..."
914
915 # Index for drl.tab and drl.layout file naming. With indexes we can match
916 # file names, to matrices, to drl output files.
917 for index, i in enumerate(matrices):
918 nodes_multiple.append(drl_similarity_functions(i, index, options))
919
920 # Compute Hexagram Assignments for each similarity matrix's drl output,
921 # which is found in nodes_multiple.
922
923 # placement_badnesses_multiple list is required to store the placement
924 # badness dicts that are returned by the compute_hexagram_assignments
925 # function.
926 placement_badnesses_multiple = []
927 for index, i in enumerate(nodes_multiple):
928 placement_badnesses_multiple.append(compute_hexagram_assignments(i, index, options))
929
930 # Now that we have hex assignments, compute layers.
931
932 # In addition to making per-layer files, we're going to copy all the score
933 matrices to our output directory. That way, the client can download layers
934 # in big chunks when it wants all layer data for statistics. We need to
935 # write a list of matrices that the client can read, which is written by
936 # this TSV writer.
937 matrix_index_writer = tsv.TsvWriter(open(os.path.join(options.directory,
938 "matrices.tab"), "w"))
939
940 # Read in all the layer data at once
941 # TODO: Don't read in all the layer data at once
942
943 # This holds a dict from layer name to a dict from signature name to
944 # score.
945 layers = {}
946
947 # This holds the names of all layers
948 layer_names = []
949
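# Each scores file is TSV with a header row naming the layers, then one row
# per signature, e.g. (hypothetical names and values):
#
#   (ignored)   Tumor Stage   Smoker
#   sample-1    2             1
#   sample-2    0             0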
950 for matrix_number, score_filename in enumerate(options.scores):
951 # First, copy the whole matrix into our output. This holds its filename.
952 output_filename = "matrix_{}.tab".format(matrix_number)
953 shutil.copy2(score_filename, os.path.join(options.directory,
954 output_filename))
955
956 # Record where we put it
957 matrix_index_writer.line(output_filename)
958
959 # This holds a reader for the scores TSV
960 scores_reader = tsv.TsvReader(open(score_filename, "r"))
961
962 # This holds an iterator over lines in that file
963 # TODO: Write a proper header/data API
964 scores_iterator = scores_reader.__iter__()
965
966 try:
967 # This holds the names of the columns (except the first, which is
968 # labels). They also happen to be layer names
969 file_layer_names = scores_iterator.next()[1:]
970
971 # Add all the layers in this file to the complete list of layers.
972 layer_names += file_layer_names
973
974 # Ensure that we have a dict for every layer mentioned in the file
975 # (even the ones that have no data below). Doing it this way means
976 # all score matrices need disjoint columns, or the last one takes
977 # precedence.
978 for name in file_layer_names:
979 layers[name] = {}
980
981 for parts in scores_iterator:
982 # This is the signature that this line is about
983 signature_name = parts[0]
984
985 if signature_name not in nodes_multiple[0]:
986 # This signature wasn't in our DrL output. Don't bother
987 # putting its layer data in our visualization. This saves
988 # space and makes the client-side layer counts accurate for
989 # the data actually displayable.
990 continue
991
992 # These are the scores for all the layers for this signature
993 layer_scores = parts[1:]
994
995 for (layer_name, score) in itertools.izip(file_layer_names,
996 layer_scores):
997
998 # Store all the layer scores in the appropriate
999 # dictionaries.
1000 try:
1001 layers[layer_name][signature_name] = float(score)
1002 except ValueError:
1003 # This is not a float.
1004 # Don't set that entry for this layer.
1005 # TODO: possibly ought to complain to the user? But then
1006 # things like "N/A" won't be handled properly.
1007 continue
1008
1009 except StopIteration:
1010 # We don't have any real data here. Couldn't read the header line.
1011 # Skip to the next file
1012 pass
1013
1014 # We're done with this score file now
1015 scores_reader.close()
1016
1017 # We're done with all the input score matrices, so our index is done too.
1018 matrix_index_writer.close()
1019
1020 # We have now loaded all layer data into memory as Python objects. What
1021 # could possibly go wrong?
1022
1023 # Stick our placement badness layer on the end
1024 layer_names.append("Placement Badness")
1025 layers["Placement Badness"] = placement_badnesses_multiple[0]
1026
1027 # Now we need to write layer files.
1028
1029 # Generate some filenames for layers that we can look up by layer name.
1030 # We do this because layer names may not be valid filenames.
1031 layer_files = {name: os.path.join(options.directory,
1032 "layer_{}.tab".format(number)) for (name, number) in itertools.izip(
1033 layer_names, itertools.count())}
1034
1035 for layer_name, layer in layers.iteritems():
1036 # Write out all the individual layer files
1037 # This holds the writer for this layer file
1038 scores_writer = tsv.TsvWriter(open(layer_files[layer_name], "w"))
1039 for signature_name, score in layer.iteritems():
1040 # Write the score for this signature in this layer
1041 scores_writer.line(signature_name, score)
1042 scores_writer.close()
1043
1044 # We need something to sort layers by. We have "priority" (lower is
1045 # better)
1046
1047 if len(layer_names) > 0 and options.stats:
1048 # We want to do this fancy parallel stats thing.
1049 # We skip it when there are no layers, so we don't try to join a
1050 # never-used pool, which seems to hang.
1051
1052 print "Running statistics..."
1053
1054 # This holds a list of ClusterFinders, one for each of our layers
1055 cluster_finders = [ClusterFinder(hexagons, layers[layer_name],
1056 window_size=options.window_size) for layer_name in layer_names]
1057
1058 print "{} jobs to do.".format(len(cluster_finders))
1059
1060 # This holds a multiprocessing pool for parallelization
1061 pool = multiprocessing.Pool()
1062
1063 # This holds all the best p values in the same order
1064 best_p_values = pool.map(run_functor, cluster_finders)
1065
1066 # Close down the pool so multiprocessing won't die sillily at the end
1067 pool.close()
1068 pool.join()
1069
1070 # This holds a dict from layer name to priority (best p value).
1071 # pool.map returns results in input order, so we can zip with layer_names.
1072 layer_priorities = {layer_name: best_p_value for layer_name,
1073 best_p_value in itertools.izip(layer_names, best_p_values)}
1074 else:
1075 # We aren't doing any stats.
1076
1077 print "Skipping statistics."
1078
1079 # Make up priorities.
1080 layer_priorities = {name: float("+inf") for name in layer_names}
1081
1082 # Count how many layer entries are greater than 0 for each binary layer, and
1083 # store that number in this dict by layer name. Things with the default
1084 # empty string instead of a number aren't binary layers, but they can use
1085 # the empty string as their TSV field value, so we can safely pull any layer
1086 # out of this by name.
1087 layer_positives = collections.defaultdict(str)
1088
1089 for layer_name in layer_names:
1090 # Assume it's a binary layer until proven otherwise
1091 layer_positives[layer_name] = 0
1092 for value in layers[layer_name].itervalues():
1093 if value == 1:
1094 # Count up all the 1s in the layer
1095 layer_positives[layer_name] += 1
1096 elif value != 0:
1097 # It has something that isn't 1 or 0, so it can't be a binary
1098 # layer. Throw it out and try the next layer.
1099 layer_positives[layer_name] = ""
1100 break
1101
1102 # Write an index of all the layers we have, in the form:
1103 # <layer>\t<file>\t<priority>\t<number of signatures with data>\t<number of
1104 # signatures that are 1 for binary layers, or empty>
1105 # This is the writer to use.
1106 index_writer = tsv.TsvWriter(open(os.path.join(options.directory,
1107 "layers.tab"), "w"))
1108
1109 for layer_name, layer_file in layer_files.iteritems():
1110 # Write the index entry for this layer
1111 index_writer.line(layer_name, os.path.basename(layer_file),
1112 layer_priorities[layer_name], len(layers[layer_name]),
1113 layer_positives[layer_name])
1114
1115 index_writer.close()
1116
1117 # TODO: Sahil will implement linear regression code here
1118
1119 # We must create an m * n matrix of samples * genes.
1120 # In order to create this matrix we first must know the number of hexes
1121 # and maintain them in a certain order. The order is important so that
1122 # we populate the matrix with the data values in the proper row (sample).
1123
1124 # Copy over the user-specified colormaps file, or make an empty TSV if it's
1125 # not specified.
1126
1127 # This holds a writer for the colormaps file. Creating it creates the file.
1128 colormaps_writer = tsv.TsvWriter(open(os.path.join(options.directory,
1129 "colormaps.tab"), "w"))
1130
1131 if options.colormaps is not None:
1132 # The user specified colormap data, so copy it over
1133 # This holds a reader for the colormaps file
1134 colormaps_reader = tsv.TsvReader(options.colormaps)
1135
1136 print "Regularizing colormaps file..."
1137 sys.stdout.flush()
1138
1139 for parts in colormaps_reader:
1140 colormaps_writer.list_line(parts)
1141
1142 colormaps_reader.close()
1143
1144 # Close the colormaps file we wrote. It may have gotten data, or it may
1145 # still be empty.
1146 colormaps_writer.close()
1147
1148 # Now copy any static files from where they live next to this Python file
1149 # into the web page bundle.
1150 # This holds the directory where this script lives, which also contains
1151 # static files.
1152 tool_root = os.path.dirname(os.path.realpath(__file__))
1153
1154 # Copy over all the static files we need for the web page
1155 # This holds a list of them
1156 static_files = [
1157 # Static images
1158 "drag.svg",
1159 "filter.svg",
1160 "statistics.svg",
1161 "right.svg",
1162 "set.svg",
1163 "save.svg",
1164 "inflate.svg",
1165 "throbber.svg",
1166
1167 # jQuery itself is pulled from a CDN.
1168 # We can't take everything offline since Google Maps needs to be sourced
1169 # from Google, so we might as well use CDN jQuery.
1170
1171 # Select2 scripts and resources:
1172 "select2.css",
1173 "select2.js",
1174 "select2.png",
1175 "select2-spinner.gif",
1176 "select2x2.png",
1177
1178 # The jQuery.tsv plugin
1179 "jquery.tsv.js",
1180 # The color library
1181 "color-0.4.1.js",
1182 # The jStat statistics library
1183 "jstat-1.0.0.js",
1184 # The Google Maps MapLabel library
1185 "maplabel-compiled.js",
1186 # The main CSS file
1187 "hexagram.css",
1188 # The main JavaScript file that runs the page
1189 "hexagram.js",
1190 # Web Worker for statistics
1191 "statistics.js",
1192 # File with all the tool code
1193 "tools.js"
1194 ]
1195
1196 # We'd just use a directory of static files, but Galaxy needs single-level
1197 # output.
1198 for filename in static_files:
1199 shutil.copy2(os.path.join(tool_root, filename), options.directory)
1200
1201 # Copy the HTML file to our output file. It automatically knows to read
1202 # assignments.tab, and does its own TSV parsing
1203 shutil.copy2(os.path.join(tool_root, "hexagram.html"), options.html)
1204
1205 print "Visualization generation complete!"
1206
1207 return 0
1208
1209 if __name__ == "__main__":
1210 try:
1211 # Get the return code to return
1212 # Don't just exit with it because sys.exit works by exceptions.
1213 return_code = main(sys.argv)
1214 except:
1215 traceback.print_exc()
1216 # Return a definite number and not some unspecified error code.
1217 return_code = 1
1218
1219 sys.exit(return_code)