hexagram-6ae12361157c/hexagram/hexagram.py @ 0:1407e3634bcf (draft, default, tip)

Uploaded r11 from test tool shed.
author: adam-novak
date:   Tue, 22 Oct 2013 14:17:59 -0400
#!/usr/bin/env python2.7
"""
hexagram.py: Given a matrix of similarities, produce a hexagram visualization.

This script takes in the filename of a tab-separated value file containing a
sparse similarity matrix (with string labels) and several matrices of
layer/score data. It produces an HTML file (and several support files) that
provide an interactive visualization of the items clustered on a hexagonal grid.

This script depends on the DrL graph layout package, binaries for which must be
present in your PATH.

Re-uses sample code and documentation from
<http://users.soe.ucsc.edu/~karplus/bme205/f12/Scaffold.html>
"""
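
# Example invocation (an illustrative sketch; the input file names here are
# hypothetical, but the options are the ones defined in parse_args below):
#
#   python2.7 hexagram.py similarities.tsv --scores scores.tsv \
#       --colormaps colormaps.tsv --html index.html --directory output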

import argparse, sys, os, itertools, math, numpy, subprocess, shutil, tempfile
import collections, scipy.stats, multiprocessing, traceback, numpy.ma
import os.path
import tsv

def parse_args(args):
    """
    Takes in the command-line arguments list (args), and returns a nice argparse
    result with fields for all the options.
    Borrows heavily from the argparse documentation examples:
    <http://docs.python.org/library/argparse.html>
    """

    # The command line arguments start with the program name, which we don't
    # want to treat as an argument for argparse. So we remove it.
    args = args[1:]

    # Construct the parser (which is stored in parser)
    # Module docstring lives in __doc__
    # See http://python-forum.com/pythonforum/viewtopic.php?f=3&t=36847
    # And a formatter class so our examples in the docstring look good. Isn't it
    # convenient how we already wrapped it to 80 characters?
    # See http://docs.python.org/library/argparse.html#formatter-class
    parser = argparse.ArgumentParser(description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Now add all the options to it
    # Options match the ctdHeatmap tool options as much as possible.
    parser.add_argument("similarities", type=argparse.FileType("r"),
        help="the TSV file with the similarities for signatures we're using")
    parser.add_argument("--scores", type=str,
        action="append", default=[],
        help="a TSV to read scores for each signature from")
    parser.add_argument("--colormaps", type=argparse.FileType("r"),
        default=None,
        help="a TSV defining coloring and value names for discrete scores")
    parser.add_argument("--html", "-H", type=str,
        default="index.html",
        help="where to write HTML report")
    parser.add_argument("--directory", "-d", type=str, default=".",
        help="directory in which to create other output files")
    parser.add_argument("--query", type=str, default=None,
        help="Galaxy-escaped name of the query signature")
    parser.add_argument("--window_size", type=int, default=20,
        help="size of the window to use when looking for clusters")
    parser.add_argument("--no-stats", dest="stats", action="store_false",
        default=True,
        help="disable cluster-finding statistics")

    return parser.parse_args(args)

def hexagon_center(x, y, scale=1.0):
    """
    Given a coordinate on a grid of hexagons (using wiggly rows in x), what is
    the 2d Euclidean coordinate of its center?

    x and y are integer column and row coordinates of the hexagon in the grid.

    scale is a float specifying hexagon side length.

    The origin in coordinate space is defined as the upper left corner of the
    bounding box of the hexagon with indices x=0 and y=0.

    Returns a tuple of floats.
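
    Example (an illustrative sketch; the printed values assume CPython 2.7
    float repr, and the odd column at x=1 is shifted up by half a hex height):

    >>> hexagon_center(0, 0)
    (1.0, 0.8660254037844386)
    >>> hexagon_center(1, 0)
    (2.5, 0.0)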
    """
    # The grid looks like this:
    #
    #   /-\ /-\ /-\ /-\
    # /-\-/-\-/-\-/-\-/-\
    # \-/-\-/-\-/-\-/-\-/
    # /-\-/-\-/-\-/-\-/-\
    # \-/-\-/-\-/-\-/-\-/
    # /-\-/-\-/-\-/-\-/-\
    # \-/ \-/ \-/ \-/ \-/
    #
    # Say a hexagon side has length 1
    # It's 2 across corner to corner (x), and sqrt(3) across side to side (y)
    # X coordinates are 1.5 per column
    # Y coordinates (down from top) are sqrt(3) per row, -1/2 sqrt(3) if you're
    # in an odd column.

    center_y = math.sqrt(3) * y
    if x % 2 == 1:
        # Odd column: shift up
        center_y -= 0.5 * math.sqrt(3)

    return (1.5 * x * scale + scale, center_y * scale + math.sqrt(3.0) / 2.0 *
        scale)

def hexagon_pick(x, y, scale=1.0):
    """
    Given floats x and y specifying coordinates in the plane, determine which
    hexagon grid cell that point is in.

    scale is a float specifying hexagon side length.

    See http://blog.ruslans.com/2011/02/hexagonal-grid-math.html
    But we flip the direction of the wiggle. Odd columns are up (-y).
    """

    # How high is a hex?
    hex_height = math.sqrt(3) * scale

    # First we pick a rectangular tile, from the point of one side-triangle to
    # the base of the other in width, and the whole hexagon height in height.

    # How wide are these tiles? Corner to line-between-far-corners distance
    tile_width = (3.0 / 2.0 * scale)

    # Tile X index is floor(x / tile_width)
    tile_x = int(math.floor(x / tile_width))

    # We need this intermediate value for the Y index and for tile-internal
    # picking
    corrected_y = y + (tile_x % 2) * hex_height / 2.0

    # Tile Y index is floor((y + (x index mod 2) * hex height/2) / hex height)
    tile_y = int(math.floor(corrected_y / hex_height))

    # Find coordinates within the tile
    internal_x = x - tile_x * tile_width
    internal_y = corrected_y - tile_y * hex_height

    # Do tile-scale picking
    # Are we in the one corner, the other corner, or the bulk of the tile?
    if internal_x > scale * abs(0.5 - internal_y / hex_height):
        # We're in the bulk of the tile
        # This is the column (x) of the picked hexagon
        hexagon_x = tile_x

        # This is the row (y) of the picked hexagon
        hexagon_y = tile_y
    else:
        # We're in a corner.
        # In an even column, the lower left is part of the next row, and the
        # upper left is part of the same row. In an odd column, the lower left
        # is part of the same row, and the upper left is part of the previous
        # row.
        if internal_y > hex_height / 2.0:
            # It's the lower left corner
            # This is the offset in row (y) that being in this corner gives us
            # The lower left corner is always 1 row below the upper left
            # corner.
            corner_y_offset = 1
        else:
            corner_y_offset = 0

        # TODO: verify this for correctness. It seems to be right, but I want a
        # unit test to be sure.
        # This is the row (y) of the picked hexagon
        hexagon_y = tile_y - tile_x % 2 + corner_y_offset

        # This is the column (x) of the picked hexagon
        hexagon_x = tile_x - 1

    # Now we've picked the hexagon
    return (hexagon_x, hexagon_y)

def radial_search(center_x, center_y):
    """
    An iterator that yields coordinate tuples (x, y) in order of increasing
    hex-grid distance from the specified center position.
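
    Example (illustrative; the center is yielded first, then its neighbors in
    breadth-first order):

    >>> import itertools
    >>> list(itertools.islice(radial_search(0, 0), 3))
    [(0, 0), (-1, 0), (1, 0)]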
    """

    # A hexagon has neighbors at the following relative coordinates:
    # (-1, 0), (1, 0), (0, -1), (0, 1)
    # and ((-1, 1) and (1, 1) if in an even column)
    # or ((-1, -1) and (1, -1) if in an odd column)

    # We're going to go outwards using breadth-first search, so we need a queue
    # of hexes to visit and a set of already visited hexes.

    # This holds a queue (really a deque) of hexes waiting to be visited.
    # A list has O(n) pop/insert at left.
    queue = collections.deque()
    # This holds a set of the (x, y) coordinate tuples of already-seen hexes,
    # so we don't enqueue them again.
    seen = set()

    # First place to visit is the center.
    queue.append((center_x, center_y))
    # Mark the center as seen, so its own neighbors can't enqueue it again.
    seen.add((center_x, center_y))

    while len(queue) > 0:
        # We should in theory never run out of items in the queue.
        # Get the current x and y to visit.
        x, y = queue.popleft()

        # Yield the location we're visiting
        yield (x, y)

        # This holds a list of all relative neighbor positions as (x, y)
        # tuples.
        neighbor_offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        if x % 2 == 0:
            # An even-column hex also has these neighbors. (Column parity is
            # determined by x, since it's the columns that wiggle.)
            neighbor_offsets += [(-1, 1), (1, 1)]
        else:
            # An odd-column hex also has these neighbors
            neighbor_offsets += [(-1, -1), (1, -1)]

        for x_offset, y_offset in neighbor_offsets:
            # First calculate the absolute position of the neighbor in x
            neighbor_x = x + x_offset
            # And in y
            neighbor_y = y + y_offset

            if (neighbor_x, neighbor_y) not in seen:
                # This is a hex that has never been in the queue. Add it.
                queue.append((neighbor_x, neighbor_y))

                # Record that it has ever been enqueued
                seen.add((neighbor_x, neighbor_y))

def assign_hexagon(hexagons, node_x, node_y, node, scale=1.0):
    """
    This function assigns the given node to a hexagon in hexagons. hexagons is a
    defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
    or None if a hexagon is free. node_x and node_y are the x and y coordinates
    of the node, adapted so that the seed node lands in the 0, 0 hexagon, and
    re-scaled to reduce hexagon conflicts. node is the node to be assigned.
    scale, if specified, is the hexagon side length in node space units.

    This function assigns nodes to their closest hexagon, reprobing outwards if
    already occupied.

    When the function completes, node is stored in hexagons under some (x, y)
    tuple.

    Returns the distance this hexagon is from its ideal location.
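
    Example (illustrative): two nodes at the same point contend for the same
    hexagon, so the second is reprobed to an adjacent free hex at distance 1.

    >>> hexes = collections.defaultdict(lambda: None)
    >>> assign_hexagon(hexes, 0.0, 0.0, "first")
    0.0
    >>> assign_hexagon(hexes, 0.0, 0.0, "second")
    1.0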
    """

    # These hold the hexagon that the point falls in, which may be taken.
    best_x, best_y = hexagon_pick(node_x, node_y, scale=scale)

    for x, y in radial_search(best_x, best_y):
        # These hexes are enumerated in order of increasing distance from the
        # best one, starting with the best hex itself.

        if hexagons[(x, y)] is None:
            # This is the closest free hex. Break out of the loop, leaving x and
            # y pointing here.
            break

    # Assign the node to the hexagon
    hexagons[(x, y)] = node

    return math.sqrt((x - best_x) ** 2 + (y - best_y) ** 2)

def assign_hexagon_local_radial(hexagons, node_x, node_y, node, scale=1.0):
    """
    This function assigns the given node to a hexagon in hexagons. hexagons is a
    defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
    or None if a hexagon is free. node_x and node_y are the x and y coordinates
    of the node, adapted so that the seed node lands in the 0, 0 hexagon, and
    re-scaled to reduce hexagon conflicts. node is the node to be assigned.
    scale, if specified, is the hexagon side length in node space units.

    This function assigns nodes to their closest hexagon. If that hexagon is
    full, it re-probes in the direction that the node is from the closest
    hexagon's center.

    When the function completes, node is stored in hexagons under some (x, y)
    tuple.

    Returns the distance this hexagon is from its ideal location.
    """

    # These hold the hexagon that the point falls in, which may be taken.
    best_x, best_y = hexagon_pick(node_x, node_y, scale=scale)

    # These hold the center of that hexagon in float space
    center_x, center_y = hexagon_center(best_x, best_y, scale=scale)

    # This holds the distance from this point to the center of that hexagon
    node_distance = math.sqrt((node_x - center_x) ** 2 + (node_y - center_y) **
        2)

    # These hold the normalized direction of this point, relative to the center
    # of its best hexagon
    direction_x = (node_x - center_x) / node_distance
    direction_y = (node_y - center_y) / node_distance

    # Do a search in that direction, starting at the best hex.

    # These are the hexagon indices we're considering
    x, y = best_x, best_y

    # These are the Cartesian coordinates we're probing. Must be in the x, y hex
    # as a loop invariant.
    test_x, test_y = center_x, center_y

    while hexagons[(x, y)] is not None:
        # Re-probe outwards from the best hex in scale-sized steps
        # TODO: is that the right step size? Scale-sized steps seemed slightly
        # large; scale/2 might be safer.
        test_x += direction_x * scale
        test_y += direction_y * scale

        # Re-pick x and y for the hex containing our test point
        x, y = hexagon_pick(test_x, test_y, scale=scale)

    # We've finally reached the edge of the cluster.
    # Drop our hexagon
    hexagons[(x, y)] = node

    return math.sqrt((x - best_x) ** 2 + (y - best_y) ** 2)

def assign_hexagon_radial(hexagons, node_x, node_y, node, scale=1.0):
    """
    This function assigns the given node to a hexagon in hexagons. hexagons is a
    defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
    or None if a hexagon is free. node_x and node_y are the x and y coordinates
    of the node, adapted so that the seed node lands in the 0, 0 hexagon, and
    re-scaled to reduce hexagon conflicts. node is the node to be assigned.
    scale, if specified, is the hexagon side length in node space units.

    This function assigns nodes to hexagons based on radial distance from 0, 0.
    This makes hexagon assignment much more dense, but can lose spatial
    structure.

    When the function completes, node is stored in hexagons under some (x, y)
    tuple.

    Returns the distance this hexagon is from its ideal location. Unfortunately,
    this doesn't really make sense for this assignment scheme, so it is always
    0.
    """

    # Compute node's distance from the origin
    node_distance = math.sqrt(node_x ** 2 + node_y ** 2)

    # Compute normalized direction from the origin for this node
    direction_x = node_x / node_distance
    direction_y = node_y / node_distance

    # These are the coordinates we are testing
    test_x = 0
    test_y = 0

    # These are the hexagon indices that correspond to that point
    x, y = hexagon_pick(test_x, test_y, scale=scale)

    while hexagons[(x, y)] is not None:
        # Re-probe outwards from the origin in scale-sized steps
        # TODO: is that the right step size?
        test_x += direction_x * scale
        test_y += direction_y * scale

        # Re-pick
        x, y = hexagon_pick(test_x, test_y, scale=scale)

    # We've finally reached the edge of the cluster.
    # Drop our hexagon
    # TODO: this has to be N^2 if we line them all up in a line
    hexagons[(x, y)] = node

    return 0

def hexagons_in_window(hexagons, x, y, width, height):
    """
    Given a dict from (x, y) position to signature names, return the list of all
    signatures in the window starting at hexagon x, y and extending width in the
    x direction and height in the y direction on the hexagon grid.
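
    Example (illustrative, with a hypothetical two-hex assignment dict):

    >>> hexagons_in_window({(0, 0): "a", (1, 2): "b"}, 0, 0, 2, 2)
    ['a']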
    """

    # This holds the list of hexagons we've found
    found = []

    for i in xrange(x, x + width):
        for j in xrange(y, y + height):
            if (i, j) in hexagons:
                # This position in the window has a hex.
                found.append(hexagons[(i, j)])

    return found

class ClusterFinder(object):
    """
    A class that can be invoked to find the p value of the best cluster in its
    layer. Instances are pickleable.
    """

    def __init__(self, hexagons, layer, window_size=5):
        """
        Keep the given hexagons dict (from (x, y) to signature name) and the
        given layer (a dict from signature name to a value), and the given
        window size, in a ClusterFinder object.
        """

        # TODO: This should probably all operate on numpy arrays that we can
        # slice efficiently.

        # Store the hexagon assignments
        self.hexagons = hexagons
        # Store the layer
        self.layer = layer

        # Store the window size
        self.window_size = window_size

    @staticmethod
    def continuous_p(in_values, out_values):
        """
        Get the p value for in_values and out_values being distinct continuous
        distributions.

        in_values and out_values are both Numpy arrays. Returns the p value, or
        raises a ValueError if the statistical test cannot be run for some
        reason.

        Uses the Mann-Whitney U test.
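
        Example (an illustrative sketch; older scipy versions return a
        one-sided p value from mannwhitneyu, so we only check the range):

        >>> p = ClusterFinder.continuous_p(numpy.array([1.5, 2.5, 3.5]),
        ...     numpy.array([10.5, 11.5, 12.5, 13.5]))
        >>> 0.0 < p < 1.0
        True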
        """

        # Do a Mann-Whitney U test to see how different the data
        # sets are.
        u_statistic, p_value = scipy.stats.mannwhitneyu(in_values,
            out_values)

        return p_value

    @staticmethod
    def dichotomous_p(in_values, out_values):
        """
        Given two one-dimensional Numpy arrays of 0s and 1s, compute a p value
        for the in_values having a different probability of being 1 than the
        frequency of 1s in the out_values.

        This test uses the scipy.stats.binom_test function, which is an exact
        test that does not use the normal approximation. Therefore, this test
        should be valid for arbitrarily small frequencies of either 0s or 1s in
        in_values.

        TODO: What if out_values is shorter than in_values?
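
        Example (an illustrative sketch: four 1s against a background where 1s
        appear half the time; the exact two-sided p value is 2 * 0.5 ** 4):

        >>> p = ClusterFinder.dichotomous_p(numpy.array([1., 1., 1., 1.]),
        ...     numpy.array([0., 1., 0., 1.]))
        >>> round(p, 3)
        0.125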
        """

        if len(out_values) == 0:
            raise ValueError("Background group is empty!")

        # This holds the observed frequency of 1s in out_values. Divide by a
        # float so that integer inputs can't trigger truncating division.
        frequency = numpy.sum(out_values) / float(len(out_values))

        # This holds the number of 1s in in_values
        successes = numpy.sum(in_values)

        # This holds the number of "trials" we got that many successes in
        trials = len(in_values)

        # Return how significantly the frequency inside differs from that
        # outside.
        return scipy.stats.binom_test(successes, trials, frequency)

    @staticmethod
    def categorical_p(in_values, out_values):
        """
        Given two one-dimensional Numpy arrays of integers (which may be stored
        as floats), which represent items being assigned to different
        categories, return a p value for the distribution of categories observed
        in in_values differing from that observed in out_values.

        The normal way to do this is with a chi-squared goodness of fit test.
        However, that test has invalid assumptions when there are fewer than 5
        expected and 5 observed observations in every category. See
        <http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html>

        However, we will use it anyway, because the tests that don't break down
        are prohibitively slow.
        """

        # Convert our inputs to integer arrays
        in_values = in_values.astype(int)
        out_values = out_values.astype(int)

        # How many categories are there (count 0 to the maximum value)
        num_categories = max(numpy.max(in_values), numpy.max(out_values)) + 1

        # Count the number of in_values and out_values in each category
        in_counts = numpy.array([len(in_values[in_values == i]) for i in
            xrange(num_categories)])
        out_counts = numpy.array([len(out_values[out_values == i]) for i in
            xrange(num_categories)])

        # Scale the out counts so they sum to the same total as the in counts,
        # since chisquare needs expected frequencies with the same total as the
        # observed ones.
        expected_counts = out_counts * (numpy.sum(in_counts) /
            float(numpy.sum(out_counts)))

        # Get the p value for the window being from the estimated distribution
        # None of the distribution parameters count as "estimated from data"
        # because they aren't estimated from the data under test.
        _, p_value = scipy.stats.chisquare(in_counts, expected_counts)

        return p_value

    def __call__(self):
        """
        Find the best p value for any window of size window_size. Return it.
        """

        # Calculate the bounding box where we want to look for windows.
        # TODO: This would just be all of a numpy array
        min_x = min(coords[0] for coords in self.hexagons.iterkeys())
        min_y = min(coords[1] for coords in self.hexagons.iterkeys())
        max_x = max(coords[0] for coords in self.hexagons.iterkeys())
        max_y = max(coords[1] for coords in self.hexagons.iterkeys())

        # This holds a Numpy array of all the data by x, y
        layer_data = numpy.empty((max_x - min_x + 1, max_y - min_y + 1))

        # Fill it with NaN so we can mask those out later
        layer_data[:] = numpy.NAN

        for (hex_x, hex_y), name in self.hexagons.iteritems():
            # Copy the layer values into the Numpy array
            if name in self.layer:
                layer_data[hex_x - min_x, hex_y - min_y] = self.layer[name]

        # This holds a masked version of the layer data
        layer_data_masked = numpy.ma.masked_invalid(layer_data, copy=False)

        # This holds the smallest p value we have found for this layer
        best_p = float("+inf")

        # This holds the statistical test to use (a function from two Numpy
        # arrays to a p value)
        # The most specific test is the dichotomous test (0 or 1)
        statistical_test = self.dichotomous_p

        if numpy.sum(~layer_data_masked.mask) == 0:
            # There is actually no data in this layer at all.
            # nditer complains if we try to iterate over an empty thing.
            # So quit early and say we couldn't find anything.
            return best_p

        for value in numpy.nditer(layer_data_masked[~layer_data_masked.mask]):
            # Check all the values in the layer.
            # If this value is out of the domain of the current statistical
            # test, upgrade to a more general test.

            if statistical_test == self.dichotomous_p and (value > 1 or
                value < 0):

                # We can't use a dichotomous test on things outside 0 to 1
                # But we haven't yet detected any non-integers
                # Use categorical
                statistical_test = self.categorical_p

            if value % 1 != 0:
                # This is not an integer value
                # So, we must use a continuous statistical test
                statistical_test = self.continuous_p

                # This is the least specific test, so we can stop now
                break

        # Iterate window positions as indices into layer_data_masked, which
        # starts at (min_x, min_y) in hex space, so the indices run from 0 (not
        # from min_x and min_y, which may be negative).
        for i in xrange(layer_data_masked.shape[0] - self.window_size + 1):
            for j in xrange(layer_data_masked.shape[1] - self.window_size + 1):

                # Get the layer values for hexes in the window, as a Numpy
                # masked array.
                in_region = layer_data_masked[i:i + self.window_size,
                    j:j + self.window_size]

                # And as a 1d Numpy array
                in_values = numpy.reshape(in_region[~in_region.mask], -1).data

                # And out of the window (all the other hexes) as a masked array
                out_region = numpy.ma.copy(layer_data_masked)
                # We get this by masking out everything in the region
                out_region.mask[i:i + self.window_size,
                    j:j + self.window_size] = True

                # And as a 1d Numpy array
                out_values = numpy.reshape(out_region[~out_region.mask],
                    -1).data

                if len(in_values) == 0 or len(out_values) == 0:
                    # Can't do any stats on this window
                    continue

                if len(in_values) < 0.5 * self.window_size ** 2:
                    # The window is less than half full. Skip it.
                    # TODO: Make this threshold configurable.
                    continue

                try:
                    # Get the p value for this window under the selected
                    # statistical test
                    p_value = statistical_test(in_values, out_values)

                    # If this is the best p value so far, record it
                    best_p = min(best_p, p_value)
                except ValueError:
                    # Probably an all-zero layer, or something else the test
                    # can't handle.
                    # But let's try all the other windows to be safe.
                    # Maybe one will work.
                    pass

        # We have now found the best p for any window for this layer.
        print "Best p found: {}".format(best_p)
        sys.stdout.flush()

        return best_p

def run_functor(functor):
    """
    Given a no-argument functor (like a ClusterFinder), run it and return its
    result. We can use this with multiprocessing's Pool.map and map it over a
    list of job functors to do them.

    Handles getting more than multiprocessing's pitiful exception output
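
    Example (illustrative; any no-argument callable will do):

    >>> run_functor(lambda: 42)
    42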
    """

    try:
        return functor()
    except:
        # Put all exception text into an exception and raise that
        raise Exception(traceback.format_exc())

def main(args):
    """
    Parses command line arguments, and makes visualization.
    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.
    """

    options = parse_args(args)  # This holds the nicely-parsed options object

    # Test our picking
    x, y = hexagon_center(0, 0)
    if hexagon_pick(x, y) != (0, 0):
        raise Exception("Picking is broken!")

    # First bit of stdout becomes annotation in Galaxy

    # Make sure our output directory exists.
    if not os.path.exists(options.directory):
        # makedirs is the right thing to use here: recursive
        os.makedirs(options.directory)

    # Work in a temporary directory
    drl_directory = tempfile.mkdtemp()

    # This is the base name for all the files that DrL uses to do the layout
    # We're going to put it in a temporary directory.
    drl_basename = os.path.join(drl_directory, "layout")

    # We can just pass our similarity matrix to DrL's truncate
    # But we want to run it through our tsv parser to strip comments and ensure
    # it's valid

    # This holds a reader for the similarity matrix
    sim_reader = tsv.TsvReader(options.similarities)

    # This holds a writer for the sim file
    sim_writer = tsv.TsvWriter(open(drl_basename + ".sim", "w"))

    print "Regularizing similarity matrix..."
    sys.stdout.flush()

    for parts in sim_reader:
        sim_writer.list_line(parts)

    sim_reader.close()
    sim_writer.close()

    # Now our input for DrL is prepared!

    # Do DrL truncate.
    # TODO: pass a truncation level
    print "DrL: Truncating..."
    sys.stdout.flush()
    subprocess.check_call(["truncate", drl_basename])

    # Run the DrL layout engine.
    print "DrL: Doing layout..."
    sys.stdout.flush()
    subprocess.check_call(["layout", drl_basename])

    # Put the string names back
    print "DrL: Restoring names..."
    sys.stdout.flush()
    subprocess.check_call(["recoord", drl_basename])

    # Now DrL has saved its coordinates as <signature name>\t<x>\t<y> rows in
    # <basename>.coord

    # We want to read that.
    # This holds a reader for the DrL output
    coord_reader = tsv.TsvReader(open(drl_basename + ".coord", "r"))

    # This holds a dict from signature name string to (x, y) float tuple
    nodes = {}

    print "Reading DrL output..."
    sys.stdout.flush()
    for parts in coord_reader:
        nodes[parts[0]] = (float(parts[1]), float(parts[2]))

    coord_reader.close()

    # Save the DrL coordinates in our bundle, to be displayed client-side for
    # debugging.
    coord_writer = tsv.TsvWriter(open(
        os.path.join(options.directory, "drl.tab"), "w"))

    for signature_name, (x, y) in nodes.iteritems():
        # Write a tsv with names instead of numbers, like what DrL recoord would
        # have written. This is what the Javascript on the client side wants.
        coord_writer.line(signature_name, x, y)

    coord_writer.close()

    # Do the hexagon layout
    # We do the squiggly rows setup, so express everything as integer x, y

    # This is a defaultdict from (x, y) integer tuple to id that goes there, or
    # None if it's free.
    hexagons = collections.defaultdict(lambda: None)

    # This holds the side length that we use
    side_length = 1.0

    # This holds what will be a layer of how badly placed each hexagon is
    # A dict from node name to layer value
    placement_badnesses = {}

    for node, (node_x, node_y) in nodes.iteritems():
        # Assign each node to a hexagon
        # This holds the resulting placement badness for that hexagon (i.e.
        # distance from ideal location)
        badness = assign_hexagon(hexagons, node_x, node_y, node,
            scale=side_length)

        # Put the badness in the layer
        placement_badnesses[node] = float(badness)

    # Normalize the placement badness layer
    # This holds the max placement badness
    max_placement_badness = max(placement_badnesses.itervalues())
    print "Max placement badness: {}".format(max_placement_badness)

    if max_placement_badness != 0:
        # Normalize by the max if possible.
        placement_badnesses = {node: value / max_placement_badness for node,
            value in placement_badnesses.iteritems()}

    # The hexagons have been assigned. Make hexagons be a dict instead of a
    # defaultdict, so it pickles.
    # TODO: I should change it so I don't need to do this.
    hexagons = dict(hexagons)

    # Now dump the hexagon assignments as an id, x, y tsv. This will be read by
    # the JavaScript on the static page and be used to produce the
    # visualization.
    hexagon_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "assignments.tab"), "w"))

    # First find the x and y offsets needed to make all hexagon positions
    # positive
    min_x = min(coords[0] for coords in hexagons.iterkeys())
    min_y = min(coords[1] for coords in hexagons.iterkeys())

    for coords, name in hexagons.iteritems():
        # Write this hexagon assignment, converted to all-positive coordinates.
        hexagon_writer.line(name, coords[0] - min_x, coords[1] - min_y)
    hexagon_writer.close()

    # Now that we have hex assignments, compute layers.

    # In addition to making per-layer files, we're going to copy all the score
    # matrices to our output directory. That way, the client can download layers
    # in big chunks when it wants all layer data for statistics. We need to
    # write a list of matrices that the client can read, which is written by
    # this TSV writer.
    matrix_index_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "matrices.tab"), "w"))

    # Read in all the layer data at once
    # TODO: Don't read in all the layer data at once

    # This holds a dict from layer name to a dict from signature name to
    # score.
    layers = {}

    # This holds the names of all layers
    layer_names = []

    for matrix_number, score_filename in enumerate(options.scores):
        # First, copy the whole matrix into our output. This holds its filename.
        output_filename = "matrix_{}.tab".format(matrix_number)
        shutil.copy2(score_filename, os.path.join(options.directory,
            output_filename))

        # Record where we put it
        matrix_index_writer.line(output_filename)

        # This holds a reader for the scores TSV
        scores_reader = tsv.TsvReader(open(score_filename, "r"))

        # This holds an iterator over lines in that file
        # TODO: Write a proper header/data API
        scores_iterator = iter(scores_reader)

        try:
            # This holds the names of the columns (except the first, which is
            # labels). They also happen to be layer names
            file_layer_names = scores_iterator.next()[1:]

            # Add all the layers in this file to the complete list of layers.
            layer_names += file_layer_names

            # Ensure that we have a dict for every layer mentioned in the file
            # (even the ones that have no data below). Doing it this way means
            # all score matrices need disjoint columns, or the last one takes
            # precedence.
            for name in file_layer_names:
                layers[name] = {}

            for parts in scores_iterator:
                # This is the signature that this line is about
                signature_name = parts[0]

                # These are the scores for all the layers for this signature
                layer_scores = parts[1:]

                for (layer_name, score) in itertools.izip(file_layer_names,
                    layer_scores):

                    # Store all the layer scores in the appropriate
                    # dictionaries.
                    try:
                        layers[layer_name][signature_name] = float(score)
                    except ValueError:
                        # This is not a float.
                        # Don't set that entry for this layer.
                        # TODO: possibly ought to complain to the user? But then
                        # things like "N/A" won't be handled properly.
                        continue

        except StopIteration:
            # We don't have any real data here. Couldn't read the header line.
            # Skip to the next file
            pass

        # We're done with this score file now
        scores_reader.close()

    # We're done with all the input score matrices, so our index is done too.
    matrix_index_writer.close()

    # We have now loaded all layer data into memory as Python objects. What
    # could possibly go wrong?

    # Stick our placement badness layer on the end
    layer_names.append("Placement Badness")
    layers["Placement Badness"] = placement_badnesses

    # Now we need to write layer files.

    # Generate some filenames for layers that we can look up by layer name.
    # We do this because layer names may not be valid filenames.
    layer_files = {name: os.path.join(options.directory,
        "layer_{}.tab".format(number)) for (name, number) in itertools.izip(
        layer_names, itertools.count())}

    for layer_name, layer in layers.iteritems():
        # Write out all the individual layer files
        # This holds the writer for this layer file
        scores_writer = tsv.TsvWriter(open(layer_files[layer_name], "w"))
        for signature_name, score in layer.iteritems():
            # Write the score for this signature in this layer
            scores_writer.line(signature_name, score)
        scores_writer.close()

    # We need something to sort layers by. We have "priority" (lower is
    # better)

    if len(layer_names) > 0 and options.stats:
        # We want to do this fancy parallel stats thing.
        # We skip it when there are no layers, so we don't try to join a
        # never-used pool, which seems to hang.

        print "Running statistics..."

        # This holds a ClusterFinder for each of our layers
        cluster_finders = [ClusterFinder(hexagons, layers[layer_name],
            window_size=options.window_size) for layer_name in layer_names]

        print "{} jobs to do.".format(len(cluster_finders))

        # This holds a multiprocessing pool for parallelization
        pool = multiprocessing.Pool()

        # This holds all the best p values in the same order
        best_p_values = pool.map(run_functor, cluster_finders)

        # Close down the pool so multiprocessing won't die sillily at the end
        pool.close()
        pool.join()

        # This holds a dict from layer name to priority (best p value). The p
        # values come back from the pool in the same order as the layer names
        # their ClusterFinders were made from.
        layer_priorities = {layer_name: best_p_value for layer_name,
            best_p_value in itertools.izip(layer_names, best_p_values)}
    else:
        # We aren't doing any stats.

        print "Skipping statistics."

        # Make up priorities.
        layer_priorities = {name: float("+inf") for name in layer_names}

    # Count how many layer entries are 1 for each binary layer, and store that
    # number in this dict by layer name. Things with the default empty string
    # instead of a number aren't binary layers, but they can use the empty
    # string as their TSV field value, so we can safely pull any layer out of
    # this by name.
    layer_positives = collections.defaultdict(str)

    for layer_name in layer_names:
        # Assume it's a binary layer until proven otherwise
        layer_positives[layer_name] = 0
        for value in layers[layer_name].itervalues():
            if value == 1:
                # Count up all the 1s in the layer
                layer_positives[layer_name] += 1
            elif value != 0:
                # It has something that isn't 1 or 0, so it can't be a binary
                # layer. Throw it out and try the next layer.
                layer_positives[layer_name] = ""
                break

    # Write an index of all the layers we have, in the form:
    # <layer>\t<file>\t<priority>\t<number of signatures with data>\t<number of
    # signatures that are 1 for binary layers>
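    # For example, an index row for a hypothetical binary layer might look like
    # (tab-separated):
    #
    #   Some Layer    layer_0.tab    0.0023    150    42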
    # This is the writer to use.
    index_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "layers.tab"), "w"))

    for layer_name, layer_file in layer_files.iteritems():
        # Write the index entry for this layer
        index_writer.line(layer_name, os.path.basename(layer_file),
            layer_priorities[layer_name], len(layers[layer_name]),
            layer_positives[layer_name])

    index_writer.close()

    # Copy over the user-specified colormaps file, or make an empty TSV if it's
    # not specified.

    # This holds a writer for the colormaps file. Creating it creates the file.
    colormaps_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "colormaps.tab"), "w"))

    if options.colormaps is not None:
        # The user specified colormap data, so copy it over
        # This holds a reader for the colormaps file
        colormaps_reader = tsv.TsvReader(options.colormaps)

        print "Regularizing colormaps file..."
        sys.stdout.flush()

        for parts in colormaps_reader:
            colormaps_writer.list_line(parts)

        colormaps_reader.close()

    # Close the colormaps file we wrote. It may have gotten data, or it may
    # still be empty.
    colormaps_writer.close()

    # Now copy any static files from where they live next to this Python file
    # into the web page bundle.
    # This holds the directory where this script lives, which also contains
    # static files.
    tool_root = os.path.dirname(os.path.realpath(__file__))

    # Copy over all the static files we need for the web page
    # This holds a list of them
    static_files = [
        # Static images
        "drag.svg",
        "filter.svg",
        "statistics.svg",
        "right.svg",
        "throbber.svg",

        # jQuery itself is pulled from a CDN.
        # We can't take everything offline since Google Maps needs to be sourced
        # from Google, so we might as well use CDN jQuery.

        # Select2 scripts and resources:
        "select2.css",
        "select2.js",
        "select2.png",
        "select2-spinner.gif",
        "select2x2.png",

        # The jQuery.tsv plugin
        "jquery.tsv.js",
        # The color library
        "color-0.4.1.js",
        # The jStat statistics library
        "jstat-1.0.0.js",
        # The Google Maps MapLabel library
        "maplabel-compiled.js",
        # The main CSS file
        "hexagram.css",
        # The main JavaScript file that runs the page
        "hexagram.js",
        # Web Worker for statistics
        "statistics.js",
        # File with all the tool code
        "tools.js"
    ]

    # We'd just use a directory of static files, but Galaxy needs single-level
    # output.
    for filename in static_files:
        shutil.copy2(os.path.join(tool_root, filename), options.directory)

    # Copy the HTML file to our output file. It automatically knows to read
    # assignments.tab, and does its own TSV parsing
    shutil.copy2(os.path.join(tool_root, "hexagram.html"), options.html)

    # Delete our temporary directory.
    shutil.rmtree(drl_directory)

    print "Visualization generation complete!"

    return 0

if __name__ == "__main__":
    try:
        # Get the return code to return
        # Don't just exit with it because sys.exit works by exceptions.
        return_code = main(sys.argv)
    except:
        traceback.print_exc()
        # Return a definite number and not some unspecified error code.
        return_code = 1

    sys.exit(return_code)