0
|
1 #!/usr/bin/env python2.7
|
|
2 """
|
|
3 hexagram.py: Given a matrix of similarities, produce a hexagram visualization.
|
|
4
|
|
5 This script takes in the filename of a tab-separated value file containing a
|
|
6 sparse similarity matrix (with string labels) and several matrices of
|
|
7 layer/score data. It produces an HTML file (and several support files) that
|
|
8 provide an interactive visualization of the items clustered on a hexagonal grid.
|
|
9
|
|
This script depends on the DrL graph layout package, binaries for which must be
|
|
11 present in your PATH.
|
|
12
|
|
13 Re-uses sample code and documentation from
|
|
14 <http://users.soe.ucsc.edu/~karplus/bme205/f12/Scaffold.html>
|
|
15 """
|
|
16
|
|
17 import argparse, sys, os, itertools, math, numpy, subprocess, shutil, tempfile
|
|
18 import collections, scipy.stats, multiprocessing, traceback, numpy.ma
|
|
19 import os.path
|
|
20 import tsv
|
|
21
|
|
def parse_args(args):
    """
    Takes in the command-line arguments list (args), and returns a nice argparse
    result with fields for all the options.

    Borrows heavily from the argparse documentation examples:
    <http://docs.python.org/library/argparse.html>
    """

    # Drop the program name (args[0]); argparse wants only the real arguments.
    args = args[1:]

    # Build the parser. The module docstring doubles as the --help
    # description, and RawDescriptionHelpFormatter keeps its 80-character
    # wrapping intact in the help output.
    # See http://docs.python.org/library/argparse.html#formatter-class
    arg_parser = argparse.ArgumentParser(description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Register all the options. They match the ctdHeatmap tool options as
    # much as possible.

    # The one required positional: the similarity matrix TSV.
    arg_parser.add_argument("similarities", type=argparse.FileType("r"),
        help="the TSV file with the similarities for signatures we're using")
    # Repeatable: each use appends another score matrix filename.
    arg_parser.add_argument("--scores", type=str,
        action="append", default=[],
        help="a TSV to read scores for each signature from")
    arg_parser.add_argument("--colormaps", type=argparse.FileType("r"),
        default=None,
        help="a TSV defining coloring and value names for discrete scores")
    arg_parser.add_argument("--html", "-H", type=str,
        default="index.html",
        help="where to write HTML report")
    arg_parser.add_argument("--directory", "-d", type=str, default=".",
        help="directory in which to create other output files")
    arg_parser.add_argument("--query", type=str, default=None,
        help="Galaxy-escaped name of the query signature")
    arg_parser.add_argument("--window_size", type=int, default=20,
        help="size of the window to use when looking for clusters")
    # Note the inverted flag: --no-stats stores False into options.stats.
    arg_parser.add_argument("--no-stats", dest="stats", action="store_false",
        default=True,
        help="disable cluster-finding statistics")

    return arg_parser.parse_args(args)
|
|
67
|
|
def hexagon_center(x, y, scale=1.0):
    """
    Map integer hexagon grid indices (x, y) to the 2d Euclidian coordinates of
    that hexagon's center, returned as a tuple of floats.

    The grid uses wiggly rows in x: odd columns are shifted up by half a
    hexagon height. x and y are the integer column and row indices of the
    hexagon, and scale is a float giving the hexagon side length.

    The origin of coordinate space is the upper left corner of the bounding
    box of the hexagon with indices x=0 and y=0.
    """
    # The grid looks like this:
    #
    #   /-\ /-\ /-\ /-\
    # /-\-/-\-/-\-/-\-/-\
    # \-/-\-/-\-/-\-/-\-/
    # /-\-/-\-/-\-/-\-/-\
    # \-/-\-/-\-/-\-/-\-/
    # /-\-/-\-/-\-/-\-/-\
    # \-/ \-/ \-/ \-/ \-/
    #
    # With side length 1, a hexagon is 2 units corner-to-corner (x) and
    # sqrt(3) units side-to-side (y). Columns advance x by 1.5; rows advance
    # y by sqrt(3), shifted up by half a hexagon height in odd columns.

    # Vertical position in unscaled units, before the odd-column shift.
    center_y = math.sqrt(3) * y

    if x % 2 == 1:
        # Odd column: shift up half a hexagon height.
        center_y -= 0.5 * math.sqrt(3)

    # Scale, and offset so hexagon (0, 0)'s bounding box corner is at the
    # origin: one side length in x, half a hexagon height in y.
    return (1.5 * x * scale + scale,
        center_y * scale + math.sqrt(3.0) / 2.0 * scale)
|
|
105
|
|
def hexagon_pick(x, y, scale=1.0):
    """
    Map a point given by floats x and y in the plane to the integer
    (column, row) indices of the hexagon grid cell containing it.

    scale is a float specifying hexagon side length.

    See http://blog.ruslans.com/2011/02/hexagonal-grid-math.html
    But we flip the direction of the wiggle. Odd rows are up (-y)
    """

    # Height of one hexagon, flat side to flat side.
    hex_height = math.sqrt(3) * scale

    # Partition the plane into rectangular tiles, each running from the point
    # of one side-triangle to the base of the other in width, and one whole
    # hexagon tall in height.

    # Tile width: corner to line-between-far-corners distance.
    tile_width = (3.0 / 2.0 * scale)

    # Which tile column is the point in?
    tile_x = int(math.floor(x / tile_width))

    # Shift y down by half a hexagon in odd tile columns so tile rows line
    # up. This intermediate is needed both for the row index and for picking
    # within the tile.
    corrected_y = y + (tile_x % 2) * hex_height / 2.0

    # Which tile row is the point in?
    tile_y = int(math.floor(corrected_y / hex_height))

    # The point's coordinates within its tile.
    internal_x = x - tile_x * tile_width
    internal_y = corrected_y - tile_y * hex_height

    # Tile-scale picking: the bulk of the tile belongs to one hexagon, while
    # the two left corner triangles belong to neighboring hexagons.
    if internal_x > scale * abs(0.5 - internal_y / hex_height):
        # Bulk of the tile: the hexagon indices are just the tile indices.
        return (tile_x, tile_y)

    # We're in one of the two left corners.
    # In an even column, the lower left is part of the next row, and the
    # upper left is part of the same row. In an odd column, the lower left
    # is part of the same row, and the upper left is part of the previous
    # row. Either way the lower left corner is always 1 row below the upper
    # left corner.
    corner_y_offset = 1 if internal_y > hex_height / 2.0 else 0

    # TODO: verify this for correctness. It seems to be right, but I want a
    # unit test to be sure.
    # Picked hexagon: one column left, with the parity-corrected row.
    return (tile_x - 1, tile_y - tile_x % 2 + corner_y_offset)
|
|
173
|
|
def radial_search(center_x, center_y):
    """
    An iterator that yields coordinate tuples (x, y) in order of increasing
    hex-grid (breadth-first) distance from the specified center position.

    Each position is yielded exactly once; the iterator never terminates on
    its own.
    """

    # A hexagon has neighbors at the following relative coordinates:
    # (-1, 0), (1, 0), (0, -1), (0, 1)
    # and ((-1, 1) and (1, 1) if in an even column)
    # or ((-1, -1) and (1, -1) if in an odd column)
    # NOTE(review): the parity test below is on y (the row), although the
    # comments here and the layout elsewhere (odd *columns* shifted) suggest
    # column parity -- confirm which is intended.

    # We're going to go outwards using breadth-first search, so we need a queue
    # of hexes to visit and a set of already-enqueued hexes.

    # This holds a queue (really a deque) of hexes waiting to be visited.
    # A list has O(n) pop/insert at left.
    queue = collections.deque()
    # This holds a set of the (x, y) coordinate tuples of already-enqueued
    # hexes, so we don't enqueue (and thus yield) them twice.
    seen = set()

    # First place to visit is the center. Mark it seen immediately; otherwise
    # a neighbor's neighbor would re-enqueue the center and it would be
    # yielded a second time.
    queue.append((center_x, center_y))
    seen.add((center_x, center_y))

    while len(queue) > 0:
        # We should in theory never run out of items in the queue.
        # Get the current x and y to visit.
        x, y = queue.popleft()

        # Yield the location we're visiting
        yield (x, y)

        # This holds a list of all relative neighbor positions as (x, y)
        # tuples.
        neighbor_offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        if y % 2 == 0:
            # An even-column hex also has these neighbors
            neighbor_offsets += [(-1, 1), (1, 1)]
        else:
            # An odd-column hex also has these neighbors
            neighbor_offsets += [(-1, -1), (1, -1)]

        for x_offset, y_offset in neighbor_offsets:
            # Absolute position of this neighbor.
            neighbor = (x + x_offset, y + y_offset)

            if neighbor not in seen:
                # This is a hex that has never been in the queue. Add it,
                # and record that it has been enqueued.
                queue.append(neighbor)
                seen.add(neighbor)
|
|
227
|
|
228
|
|
229
|
|
230
|
|
def assign_hexagon(hexagons, node_x, node_y, node, scale=1.0):
    """
    Place node into the hexagons dict. hexagons is a defaultdict from tuples
    of hexagon (x, y) integer indices to assigned nodes, or None if a hexagon
    is free. node_x and node_y are the x and y coordinates of the node,
    adapted so that the seed node lands in the 0, 0 hexagon, and re-scaled to
    reduce hexagon conflicts. node is the node to be assigned. scale, if
    specified, is the hexagon side length in node space units.

    The node goes in its closest hexagon; if that one is occupied we re-probe
    outwards until a free hexagon turns up. When the function completes, node
    is stored in hexagons under some (x, y) tuple.

    Returns the distance this hexagon is from its ideal location.
    """

    # The hexagon the point actually falls in -- which may already be taken.
    ideal_x, ideal_y = hexagon_pick(node_x, node_y, scale=scale)

    # Walk hexes in order of increasing distance from the ideal one (the
    # ideal hex itself comes first) and claim the first free one.
    for col, row in radial_search(ideal_x, ideal_y):
        if hexagons[(col, row)] is None:
            # Closest free hex: assign the node here and stop probing.
            hexagons[(col, row)] = node
            break

    # Report how far from its ideal location the node ended up.
    return math.sqrt((col - ideal_x) ** 2 + (row - ideal_y) ** 2)
|
|
265
|
|
266
|
|
267
|
|
def assign_hexagon_local_radial(hexagons, node_x, node_y, node, scale=1.0):
    """
    This function assigns the given node to a hexagon in hexagons. hexagons is a
    defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
    or None if a hexagon is free. node_x and node_y are the x and y coordinates
    of the node, adapted so that the seed node lands in the 0, 0 hexagon, and
    re-scaled to reduce hexagon conflicts. node is the node to be assigned.
    scale, if specified, is the hexagon side length in node space units.

    This function assigns nodes to their closest hexagon. If that hexagon is
    full, it re-probes in the direction that the node is from the closest
    hexagon's center.

    When the function completes, node is stored in hexagons under some (x, y)
    tuple.

    Returns the distance this hexagon is from its ideal location.
    """

    # These hold the hexagon that the point falls in, which may be taken.
    best_x, best_y = hexagon_pick(node_x, node_y, scale=scale)

    # These hold the center of that hexagon in float space
    center_x, center_y = hexagon_center(best_x, best_y, scale=scale)

    # This holds the distance from this point to the center of that hexagon
    node_distance = math.sqrt((node_x - center_x) ** 2 + (node_y - center_y) **
        2)

    if node_distance == 0:
        # The node sits exactly on its best hexagon's center, so there is no
        # meaningful direction to probe in (and normalizing would divide by
        # zero). Probe in an arbitrary fixed direction instead.
        direction_x, direction_y = 1.0, 0.0
    else:
        # These hold the normalized direction of this point, relative to the
        # center of its best hexagon
        direction_x = (node_x - center_x) / node_distance
        direction_y = (node_y - center_y) / node_distance

    # Do a search in that direction, starting at the best hex.

    # These are the hexagon indices we're considering
    x, y = best_x, best_y

    # These are the Cartesian coordinates we're probing. Must be in the x, y hex
    # as a loop invariant.
    test_x, test_y = center_x, center_y

    while hexagons[(x, y)] is not None:
        # Re-probe outwards from the best hex in scale-sized steps
        # TODO: is that the right step size? Scale-sized steps seemed slightly
        # large.
        test_x += direction_x * scale
        test_y += direction_y * scale

        # Re-pick x and y for the hex containing our test point
        x, y = hexagon_pick(test_x, test_y, scale=scale)

    # We've finally reached the edge of the cluster.
    # Drop our hexagon
    hexagons[(x, y)] = node

    return math.sqrt((x - best_x) ** 2 + (y - best_y) ** 2)
|
|
326
|
|
def assign_hexagon_radial(hexagons, node_x, node_y, node, scale=1.0):
    """
    This function assigns the given node to a hexagon in hexagons. hexagons is a
    defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
    or None if a hexagon is free. node_x and node_y are the x and y coordinates
    of the node, adapted so that the seed node lands in the 0, 0 hexagon, and
    re-scaled to reduce hexagon conflicts. node is the node to be assigned.
    scale, if specified, is the hexagon side length in node space units.

    This function assigns nodes to hexagons based on radial distance from 0, 0.
    This makes hexagon assignment much more dense, but can lose spatial
    structure.

    When the function completes, node is stored in hexagons under some (x, y)
    tuple.

    Returns the distance this hexagon is from its ideal location. Unfortunately,
    this doesn't really make sense for this assignment scheme, so it is always
    0.
    """

    # Compute node's distance from the origin
    node_distance = math.sqrt(node_x ** 2 + node_y ** 2)

    if node_distance == 0:
        # The node is exactly at the origin (where the docstring says the
        # seed node lands), so it has no direction from the origin and
        # normalizing would divide by zero. Probe in an arbitrary fixed
        # direction instead.
        direction_x, direction_y = 1.0, 0.0
    else:
        # Compute normalized direction from the origin for this node
        direction_x = node_x / node_distance
        direction_y = node_y / node_distance

    # These are the coordinates we are testing
    test_x = 0
    test_y = 0

    # These are the hexagon indices that correspond to that point
    x, y = hexagon_pick(test_x, test_y, scale=scale)

    while hexagons[(x, y)] is not None:
        # Re-probe outwards from the origin in scale-sized steps
        # TODO: is that the right step size?
        test_x += direction_x * scale
        test_y += direction_y * scale

        # Re-pick
        x, y = hexagon_pick(test_x, test_y, scale=scale)

    # We've finally reached the edge of the cluster.
    # Drop our hexagon
    # TODO: this has to be N^2 if we line them all up in a line
    hexagons[(x, y)] = node

    return 0
|
|
377
|
|
def hexagons_in_window(hexagons, x, y, width, height):
    """
    Given a dict from (x, y) position to signature names, return the list of all
    signatures in the window starting at hexagon x, y and extending width in the
    x direction and height in the y direction on the hexagon grid.

    Signatures are returned in column-major order (scanning each column of the
    window top to bottom, left to right).
    """

    # This holds the list of signatures we've found
    found = []

    for i in range(x, x + width):
        for j in range(y, y + height):
            # Membership test instead of the deprecated dict.has_key, and
            # range instead of the Python-2-only xrange, so this also runs
            # under Python 3.
            if (i, j) in hexagons:
                # This position in the window has a hex.
                found.append(hexagons[(i, j)])

    return found
|
|
395
|
|
class ClusterFinder(object):
    """
    A class that can be invoked to find the p value of the best cluster in its
    layer. Instances are pickleable (so they can be shipped to worker
    processes as multiprocessing jobs).
    """
    
    def __init__(self, hexagons, layer, window_size=5):
        """
        Keep the given hexagons dict (from (x, y) to signature name) and the
        given layer (a dict from signature name to a value), and the given
        window size, in a ClusterFinder object.
        """
        
        # TODO: This should probably all operate on numpy arrays that we can
        # slice efficiently.
        
        # Store the hexagon assignments (dict from (x, y) to signature name)
        self.hexagons = hexagons
        # Store the layer (dict from signature name to score value)
        self.layer = layer
        
        # Store the window size
        self.window_size = window_size
        
    @staticmethod
    def continuous_p(in_values, out_values):
        """
        Get the p value for in_values and out_values being distinct continuous
        distributions.
        
        in_values and out_values are both Numpy arrays. Returns the p value, or
        raises a ValueError if the statistical test cannot be run for some
        reason.
        
        Uses the Mann-Whitney U test.
        """
        
        # Do a Mann-Whitney U test to see how different the data
        # sets are.
        u_statistic, p_value = scipy.stats.mannwhitneyu(in_values, 
            out_values)
        
        return p_value
    
    @staticmethod
    def dichotomous_p(in_values, out_values):
        """
        Given two one-dimensional Numpy arrays of 0s and 1s, compute a p value
        for the in_values having a different probability of being 1 than the
        frequency of 1s in the out_values.
        
        This test uses the scipy.stats.binom_test function, which does not claim
        to use the normal approximation. Therefore, this test should be valid
        for arbitrarily small frequencies of either 0s or 1s in in_values.
        
        Raises ValueError if out_values is empty (no background to compare
        against).
        
        TODO: What if out_values is shorter than in_values?
        """
        
        if len(out_values) == 0:
            raise ValueError("Background group is empty!")
        
        # This holds the observed frequency of 1s in out_values
        # NOTE(review): this relies on out_values holding floats (as produced
        # by __call__ below) -- with an integer array this division would
        # truncate under Python 2. Confirm if called from elsewhere.
        frequency = numpy.sum(out_values) / len(out_values)
        
        # This holds the number of 1s in in_values
        successes = numpy.sum(in_values)
        
        # This holds the number of "trials" we got that many successes in
        trials = len(in_values)
        
        # Return how significantly the frequency inside differs from that
        # outside.
        return scipy.stats.binom_test(successes, trials, frequency)
    
    @staticmethod
    def categorical_p(in_values, out_values):
        """
        Given two one-dimensional Numpy arrays of integers (which may be stored
        as floats), which represent items being assigned to different
        categories, return a p value for the distribution of categories observed
        in in_values differing from that observed in out_values.
        
        The normal way to do this is with a chi-squared goodness of fit test.
        However, that test has invalid assumptions when there are fewer than 5
        expected and 5 observed observations in every category.
        See http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chis
        quare.html
        
        However, we will use it anyway, because the tests that don't break down
        are prohibitively slow.
        """
        
        # Convert our inputs to integer arrays
        in_values = in_values.astype(int)
        out_values = out_values.astype(int)
        
        # How many categories are there (count 0 to the maximum value)
        num_categories = max(numpy.max(in_values), numpy.max(out_values)) + 1
        
        # Count the number of in_values and out_values in each category
        in_counts = numpy.array([len(in_values[in_values == i]) for i in 
            xrange(num_categories)])
        out_counts = numpy.array([len(out_values[out_values == i]) for i in 
            xrange(num_categories)])
        
        # Get the p value for the window being from the estimated distribution
        # None of the distribution parameters count as "estimated from data"
        # because they aren't estimated from the data under test.
        _, p_value = scipy.stats.chisquare(in_counts, out_counts)
        
        return p_value
    
    def __call__(self):
        """
        Find the best p value for any window of size window_size. Return it.
        
        Returns +inf when the layer has no data at all, or when no window
        yields a usable test result.
        """
        
        # Calculate the bounding box where we want to look for windows.
        # TODO: This would just be all of a numpy array
        min_x = min(coords[0] for coords in self.hexagons.iterkeys())
        min_y = min(coords[1] for coords in self.hexagons.iterkeys())
        max_x = max(coords[0] for coords in self.hexagons.iterkeys())
        max_y = max(coords[1] for coords in self.hexagons.iterkeys())
        
        # This holds a Numpy array of all the data by x, y
        # (index [0, 0] corresponds to hexagon (min_x, min_y))
        layer_data = numpy.empty((max_x - min_x + 1, max_y - min_y + 1))
        
        # Fill it with NaN so we can mask those out later
        layer_data[:] = numpy.NAN
        
        for (hex_x, hex_y), name in self.hexagons.iteritems():
            # Copy the layer values into the Numpy array
            if self.layer.has_key(name):
                layer_data[hex_x - min_x, hex_y - min_y] = self.layer[name]
        
        # This holds a masked version of the layer data
        # (NaN entries -- hexes with no score -- are masked out)
        layer_data_masked = numpy.ma.masked_invalid(layer_data, copy=False) 
        
        # This holds the smallest p value we have found for this layer
        best_p = float("+inf")
        
        # This holds the statistical test to use (a function from two Numpy
        # arrays to a p value)
        # The most specific test is the dichotomous test (0 or 1)
        statistical_test = self.dichotomous_p
        
        if numpy.sum(~layer_data_masked.mask) == 0:
            # There is actually no data in this layer at all.
            # nditer complains if we try to iterate over an empty thing.
            # So quit early and say we couldn't find anything.
            return best_p
        
        # Scan the unmasked values once to pick the weakest test whose domain
        # covers them: dichotomous (0/1) -> categorical (other integers) ->
        # continuous (any non-integer).
        for value in numpy.nditer(layer_data_masked[~layer_data_masked.mask]):
            # Check all the values in the layer.
            # If this value is out of the domain of the current statistical
            # test, upgrade to a more general test.
            
            if statistical_test == self.dichotomous_p and (value > 1 or 
                value < 0):
                
                # We can't use a dichotomous test on things outside 0 to 1
                # But we haven't yet detected any non-integers
                # Use categorical
                statistical_test = self.categorical_p
            
            if value % 1 != 0:
                # This is not an integer value
                # So, we must use a continuous statistical test
                statistical_test = self.continuous_p
                
                # This is the least specific test, so we can stop now
                break
        
        
        # Slide the window over the layer, testing each position.
        # NOTE(review): layer_data is indexed from 0 (offset by min_x/min_y),
        # but i and j below start at min_x/min_y and are used directly as
        # array indices -- whenever min_x or min_y is nonzero (e.g. negative
        # hex coordinates) the slices don't correspond to the intended
        # windows. It looks like the loops should run over
        # xrange(0, max_x - min_x - self.window_size + 1) (and likewise for
        # y) -- confirm before changing.
        for i in xrange(min_x, max_x - self.window_size):
            for j in xrange(min_y, max_y - self.window_size):
                
                # Get the layer values for hexes in the window, as a Numpy
                # masked array.
                in_region = layer_data_masked[i:i + self.window_size, 
                    j:j + self.window_size]
                    
                # And as a 1d Numpy array
                in_values = numpy.reshape(in_region[~in_region.mask], -1).data
                
                # And out of the window (all the other hexes) as a masked array
                out_region = numpy.ma.copy(layer_data_masked)
                # We get this by masking out everything in the region
                out_region.mask[i:i + self.window_size, 
                    j:j + self.window_size] = True
                
                # And as a 1d Numpy array
                out_values = numpy.reshape(out_region[~out_region.mask], 
                    -1).data
                
                
                if len(in_values) == 0 or len(out_values) == 0:
                    # Can't do any stats on this window
                    continue
                    
                if len(in_values) < 0.5 * self.window_size ** 2:
                    # The window is less than half full. Skip it.
                    # TODO: Make this threshold configurable.
                    continue
                
                try:
                    
                    # Get the p value for this window under the selected 
                    # statistical test
                    p_value = statistical_test(in_values, out_values)
                        
                    # If this is the best p value so far, record it
                    best_p = min(best_p, p_value)
                except ValueError:
                    # Probably an all-zero layer, or something else the test
                    # can't handle.
                    # But let's try all the other windows to be safe.
                    # Maybe one will work.
                    pass
                
                
                
        # We have now found the best p for any window for this layer.
        print "Best p found: {}".format(best_p)
        sys.stdout.flush()
        
        return best_p
|
|
623
|
|
def run_functor(functor):
    """
    Given a no-argument functor (like a ClusterFinder), run it and return its
    result. We can use this with multiprocessing.map and map it over a list of
    job functors to do them.

    Handles getting more than multiprocessing's pitiful exception output: any
    exception from the functor is re-raised as a plain Exception carrying the
    full formatted traceback text, which survives the trip back to the parent
    process intact.
    """

    try:
        return functor()
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate normally and can abort the worker.
        # Put all exception text into an exception and raise that.
        raise Exception(traceback.format_exc())
|
|
638
|
|
639 def main(args):
|
|
640 """
|
|
641 Parses command line arguments, and makes visualization.
|
|
642 "args" specifies the program arguments, with args[0] being the executable
|
|
643 name. The return value should be used as the program's exit code.
|
|
644 """
|
|
645
|
|
646 options = parse_args(args) # This holds the nicely-parsed options object
|
|
647
|
|
648 # Test our picking
|
|
649 x, y = hexagon_center(0, 0)
|
|
650 if hexagon_pick(x, y) != (0, 0):
|
|
651 raise Exception("Picking is broken!")
|
|
652
|
|
653 # First bit of stdout becomes annotation in Galaxy
|
|
654
|
|
655 # Make sure our output directory exists.
|
|
656 if not os.path.exists(options.directory):
|
|
657 # makedirs is the right thing to use here: recursive
|
|
658 os.makedirs(options.directory)
|
|
659
|
|
660 # Work in a temporary directory
|
|
661 drl_directory = tempfile.mkdtemp()
|
|
662
|
|
663 # This is the base name for all the files that DrL uses to do the layout
|
|
664 # We're going to put it in a temporary directory.
|
|
665 drl_basename = os.path.join(drl_directory, "layout")
|
|
666
|
|
667 # We can just pass our similarity matrix to DrL's truncate
|
|
668 # But we want to run it through our tsv parser to strip comments and ensure
|
|
669 # it's valid
|
|
670
|
|
671 # This holds a reader for the similarity matrix
|
|
672 sim_reader = tsv.TsvReader(options.similarities)
|
|
673
|
|
674 # This holds a writer for the sim file
|
|
675 sim_writer = tsv.TsvWriter(open(drl_basename + ".sim", "w"))
|
|
676
|
|
677 print "Regularizing similarity matrix..."
|
|
678 sys.stdout.flush()
|
|
679
|
|
680 for parts in sim_reader:
|
|
681 sim_writer.list_line(parts)
|
|
682
|
|
683 sim_reader.close()
|
|
684 sim_writer.close()
|
|
685
|
|
686 # Now our input for DrL is prepared!
|
|
687
|
|
688 # Do DrL truncate.
|
|
689 # TODO: pass a truncation level
|
|
690 print "DrL: Truncating..."
|
|
691 sys.stdout.flush()
|
|
692 subprocess.check_call(["truncate", drl_basename])
|
|
693
|
|
694 # Run the DrL layout engine.
|
|
695 print "DrL: Doing layout..."
|
|
696 sys.stdout.flush()
|
|
697 subprocess.check_call(["layout", drl_basename])
|
|
698
|
|
699 # Put the string names back
|
|
700 print "DrL: Restoring names..."
|
|
701 sys.stdout.flush()
|
|
702 subprocess.check_call(["recoord", drl_basename])
|
|
703
|
|
704 # Now DrL has saved its coordinates as <signature name>\t<x>\t<y> rows in
|
|
705 # <basename>.coord
|
|
706
|
|
707 # We want to read that.
|
|
708 # This holds a reader for the DrL output
|
|
709 coord_reader = tsv.TsvReader(open(drl_basename + ".coord", "r"))
|
|
710
|
|
711 # This holds a dict from signature name string to (x, y) float tuple
|
|
712 nodes = {}
|
|
713
|
|
714 print "Reading DrL output..."
|
|
715 sys.stdout.flush()
|
|
716 for parts in coord_reader:
|
|
717 nodes[parts[0]] = (float(parts[1]), float(parts[2]))
|
|
718
|
|
719 coord_reader.close()
|
|
720
|
|
721 # Save the DrL coordinates in our bundle, to be displayed client-side for
|
|
722 # debugging.
|
|
723 coord_writer = tsv.TsvWriter(open(
|
|
724 os.path.join(options.directory, "drl.tab"), "w"))
|
|
725
|
|
726 for signature_name, (x, y) in nodes.iteritems():
|
|
727 # Write a tsv with names instead of numbers, like what DrL recoord would
|
|
728 # have written. This is what the Javascript on the client side wants.
|
|
729 coord_writer.line(signature_name, x, y)
|
|
730
|
|
731 coord_writer.close()
|
|
732
|
|
733 # Do the hexagon layout
|
|
734 # We do the squiggly rows setup, so express everything as integer x, y
|
|
735
|
|
736 # This is a defaultdict from (x, y) integer tuple to id that goes there, or
|
|
737 # None if it's free.
|
|
738 hexagons = collections.defaultdict(lambda: None)
|
|
739
|
|
740 # This holds the side length that we use
|
|
741 side_length = 1.0
|
|
742
|
|
743 # This holds what will be a layer of how badly placed each hexagon is
|
|
744 # A dict from node name to layer value
|
|
745 placement_badnesses = {}
|
|
746
|
|
747 for node, (node_x, node_y) in nodes.iteritems():
|
|
748 # Assign each node to a hexagon
|
|
749 # This holds the resulting placement badness for that hexagon (i.e.
|
|
750 # distance from ideal location)
|
|
751 badness = assign_hexagon(hexagons, node_x, node_y, node,
|
|
752 scale=side_length)
|
|
753
|
|
754 # Put the badness in the layer
|
|
755 placement_badnesses[node] = float(badness)
|
|
756
|
|
757 # Normalize the placement badness layer
|
|
758 # This holds the max placement badness
|
|
759 max_placement_badness = max(placement_badnesses.itervalues())
|
|
760 print "Max placement badness: {}".format(max_placement_badness)
|
|
761
|
|
762 if max_placement_badness != 0:
|
|
763 # Normalize by the max if possible.
|
|
764 placement_badnesses = {node: value / max_placement_badness for node,
|
|
765 value in placement_badnesses.iteritems()}
|
|
766
|
|
767 # The hexagons have been assigned. Make hexagons be a dict instead of a
|
|
768 # defaultdict, so it pickles.
|
|
769 # TODO: I should change it so I don't need to do this.
|
|
770 hexagons = dict(hexagons)
|
|
771
|
|
772 # Now dump the hexagon assignments as an id, x, y tsv. This will be read by
|
|
773 # the JavaScript on the static page and be used to produce the
|
|
774 # visualization.
|
|
775 hexagon_writer = tsv.TsvWriter(open(os.path.join(options.directory,
|
|
776 "assignments.tab"), "w"))
|
|
777
|
|
778 # First find the x and y offsets needed to make all hexagon positions
|
|
779 # positive
|
|
780 min_x = min(coords[0] for coords in hexagons.iterkeys())
|
|
781 min_y = min(coords[1] for coords in hexagons.iterkeys())
|
|
782
|
|
783 for coords, name in hexagons.iteritems():
|
|
784 # Write this hexagon assignment, converted to all-positive coordinates.
|
|
785 hexagon_writer.line(name, coords[0] - min_x, coords[1] - min_y)
|
|
786 hexagon_writer.close()
|
|
787
|
|
788 # Now that we have hex assignments, compute layers.
|
|
789
|
|
790 # In addition to making per-layer files, we're going to copy all the score
|
|
791 # matrices to our output directoy. That way, the client can download layers
|
|
792 # in big chunks when it wants all layer data for statistics. We need to
|
|
793 # write a list of matrices that the client can read, which is written by
|
|
794 # this TSV writer.
|
|
795 matrix_index_writer = tsv.TsvWriter(open(os.path.join(options.directory,
|
|
796 "matrices.tab"), "w"))
|
|
797
|
|
798 # Read in all the layer data at once
|
|
799 # TODO: Don't read in all the layer data at once
|
|
800
|
|
801 # This holds a dict from layer name to a dict from signature name to
|
|
802 # score.
|
|
803 layers = {}
|
|
804
|
|
805 # This holds the names of all layers
|
|
806 layer_names = []
|
|
807
|
|
808 for matrix_number, score_filename in enumerate(options.scores):
|
|
809 # First, copy the whole matrix into our output. This holds its filename.
|
|
810 output_filename = "matrix_{}.tab".format(matrix_number)
|
|
811 shutil.copy2(score_filename, os.path.join(options.directory,
|
|
812 output_filename))
|
|
813
|
|
814 # Record were we put it
|
|
815 matrix_index_writer.line(output_filename)
|
|
816
|
|
817 # This holds a reader for the scores TSV
|
|
818 scores_reader = tsv.TsvReader(open(score_filename, "r"))
|
|
819
|
|
820 # This holds an iterator over lines in that file
|
|
821 # TODO: Write a proper header/data API
|
|
822 scores_iterator = scores_reader.__iter__()
|
|
823
|
|
824 try:
|
|
825 # This holds the names of the columns (except the first, which is
|
|
826 # labels). They also happen to be layer names
|
|
827 file_layer_names = scores_iterator.next()[1:]
|
|
828
|
|
829 # Add all the layers in this file to the complete list of layers.
|
|
830 layer_names += file_layer_names
|
|
831
|
|
832 # Ensure that we have a dict for every layer mentioned in the file
|
|
833 # (even the ones that have no data below). Doing it this way means
|
|
834 # all score matrices need disjoint columns, or the last one takes
|
|
835 # precedence.
|
|
836 for name in file_layer_names:
|
|
837 layers[name] = {}
|
|
838
|
|
839 for parts in scores_iterator:
|
|
840 # This is the signature that this line is about
|
|
841 signature_name = parts[0]
|
|
842
|
|
843 # These are the scores for all the layers for this signature
|
|
844 layer_scores = parts[1:]
|
|
845
|
|
846 for (layer_name, score) in itertools.izip(file_layer_names,
|
|
847 layer_scores):
|
|
848
|
|
849 # Store all the layer scores in the appropriate
|
|
850 # dictionaries.
|
|
851 try:
|
|
852 layers[layer_name][signature_name] = float(score)
|
|
853 except ValueError:
|
|
854 # This is not a float.
|
|
855 # Don't set that entry for this layer.
|
|
856 # TODO: possibly ought to complain to the user? But then
|
|
857 # things like "N/A" won't be handled properly.
|
|
858 continue
|
|
859
|
|
860 except StopIteration:
|
|
861 # We don't have any real data here. Couldn't read the header line.
|
|
862 # Skip to the next file
|
|
863 pass
|
|
864
|
|
865 # We're done with this score file now
|
|
866 scores_reader.close()
|
|
867
|
|
868 # We're done with all the input score matrices, so our index is done too.
|
|
869 matrix_index_writer.close()
|
|
870
|
|
871 # We have now loaded all layer data into memory as Python objects. What
|
|
872 # could possibly go wrong?
|
|
873
|
|
874 # Stick our placement badness layer on the end
|
|
875 layer_names.append("Placement Badness")
|
|
876 layers["Placement Badness"] = placement_badnesses
|
|
877
|
|
878 # Now we need to write layer files.
|
|
879
|
|
880 # Generate some filenames for layers that we can look up by layer name.
|
|
881 # We do this because layer names may not be valid filenames.
|
|
882 layer_files = {name: os.path.join(options.directory,
|
|
883 "layer_{}.tab".format(number)) for (name, number) in itertools.izip(
|
|
884 layer_names, itertools.count())}
|
|
885
|
|
886 for layer_name, layer in layers.iteritems():
|
|
887 # Write out all the individual layer files
|
|
888 # This holds the writer for this layer file
|
|
889 scores_writer = tsv.TsvWriter(open(layer_files[layer_name], "w"))
|
|
890 for signature_name, score in layer.iteritems():
|
|
891 # Write the score for this signature in this layer
|
|
892 scores_writer.line(signature_name, score)
|
|
893 scores_writer.close()
|
|
894
|
|
895 # We need something to sort layers by. We have "priority" (lower is
|
|
896 # better)
|
|
897
|
|
898 if len(layer_names) > 0 and options.stats:
|
|
899 # We want to do this fancy parallel stats thing.
|
|
900 # We skip it when there are no layers, so we don't try to join a
|
|
901 # never-used pool, which seems to hang.
|
|
902
|
|
903 print "Running statistics..."
|
|
904
|
|
905 # This holds an iterator that makes ClusterFinders for all out layers
|
|
906 cluster_finders = [ClusterFinder(hexagons, layers[layer_name],
|
|
907 window_size=options.window_size) for layer_name in layer_names]
|
|
908
|
|
909 print "{} jobs to do.".format(len(cluster_finders))
|
|
910
|
|
911 # This holds a multiprocessing pool for parallelization
|
|
912 pool = multiprocessing.Pool()
|
|
913
|
|
914 # This holds all the best p values in the same order
|
|
915 best_p_values = pool.map(run_functor, cluster_finders)
|
|
916
|
|
917 # Close down the pool so multiprocessing won't die sillily at the end
|
|
918 pool.close()
|
|
919 pool.join()
|
|
920
|
|
921 # This holds a dict from layer name to priority (best p value)
|
|
922 # We hope the order of the dict items has not changed
|
|
923 layer_priorities = {layer_name: best_p_value for layer_name,
|
|
924 best_p_value in itertools.izip(layer_names, best_p_values)}
|
|
925 else:
|
|
926 # We aren't doing any stats.
|
|
927
|
|
928 print "Skipping statistics."
|
|
929
|
|
930 # Make up priorities.
|
|
931 layer_priorities = {name: float("+inf") for name in layer_names}
|
|
932
|
|
933 # Count how many layer entries are greater than 0 for each binary layer, and
|
|
934 # store that number in this dict by layer name. Things with the default
|
|
935 # empty string instead of a number aren't binary layers, but they can use
|
|
936 # the empty string as their TSV field value, so we can safely pull any layer
|
|
937 # out of this by name.
|
|
938 layer_positives = collections.defaultdict(str)
|
|
939
|
|
940 for layer_name in layer_names:
|
|
941 # Assume it's a binary layer until proven otherwise
|
|
942 layer_positives[layer_name] = 0
|
|
943 for value in layers[layer_name].itervalues():
|
|
944 if value == 1:
|
|
945 # Count up all the 1s in the layer
|
|
946 layer_positives[layer_name] += 1
|
|
947 elif value != 0:
|
|
948 # It has something that isn't 1 or 0, so it can't be a binary
|
|
949 # layer. Throw it out and try the next layer.
|
|
950 layer_positives[layer_name] = ""
|
|
951 continue
|
|
952
|
|
953 # Write an index of all the layers we have, in the form:
|
|
954 # <layer>\t<file>\t<priority>\t<number of signatures with data>\t<number of
|
|
955 # signatures that are 1 for binary layers>
|
|
956 # This is the writer to use.
|
|
957 index_writer = tsv.TsvWriter(open(os.path.join(options.directory,
|
|
958 "layers.tab"), "w"))
|
|
959
|
|
960 for layer_name, layer_file in layer_files.iteritems():
|
|
961 # Write the index entry for this layer
|
|
962 index_writer.line(layer_name, os.path.basename(layer_file),
|
|
963 layer_priorities[layer_name], len(layers[layer_name]),
|
|
964 layer_positives[layer_name])
|
|
965
|
|
966 index_writer.close()
|
|
967
|
|
968 # Copy over the user-specified colormaps file, or make an empty TSV if it's
|
|
969 # not specified.
|
|
970
|
|
971
|
|
972
|
|
973 # This holds a writer for the sim file. Creating it creates the file.
|
|
974 colormaps_writer = tsv.TsvWriter(open(os.path.join(options.directory,
|
|
975 "colormaps.tab"), "w"))
|
|
976
|
|
977 if options.colormaps is not None:
|
|
978 # The user specified colormap data, so copy it over
|
|
979 # This holds a reader for the colormaps file
|
|
980 colormaps_reader = tsv.TsvReader(options.colormaps)
|
|
981
|
|
982 print "Regularizing colormaps file..."
|
|
983 sys.stdout.flush()
|
|
984
|
|
985 for parts in colormaps_reader:
|
|
986 colormaps_writer.list_line(parts)
|
|
987
|
|
988 colormaps_reader.close()
|
|
989
|
|
990 # Close the colormaps file we wrote. It may have gotten data, or it may
|
|
991 # still be empty.
|
|
992 colormaps_writer.close()
|
|
993
|
|
994 # Now copy any static files from where they live next to this Python file
|
|
995 # into the web page bundle.
|
|
996 # This holds the directory where this script lives, which also contains
|
|
997 # static files.
|
|
998 tool_root = os.path.dirname(os.path.realpath(__file__))
|
|
999
|
|
1000 # Copy over all the static files we need for the web page
|
|
1001 # This holds a list of them
|
|
1002 static_files = [
|
|
1003 # Static images
|
|
1004 "drag.svg",
|
|
1005 "filter.svg",
|
|
1006 "statistics.svg",
|
|
1007 "right.svg",
|
|
1008 "throbber.svg",
|
|
1009
|
|
1010 # jQuery itself is pulled from a CDN.
|
|
1011 # We can't take everything offline since Google Maps needs to be sourced
|
|
1012 # from Google, so we might as well use CDN jQuery.
|
|
1013
|
|
1014 # Select2 scripts and resources:
|
|
1015 "select2.css",
|
|
1016 "select2.js",
|
|
1017 "select2.png",
|
|
1018 "select2-spinner.gif",
|
|
1019 "select2x2.png",
|
|
1020
|
|
1021 # The jQuery.tsv plugin
|
|
1022 "jquery.tsv.js",
|
|
1023 # The color library
|
|
1024 "color-0.4.1.js",
|
|
1025 # The jStat statistics library
|
|
1026 "jstat-1.0.0.js",
|
|
1027 # The Google Maps MapLabel library
|
|
1028 "maplabel-compiled.js",
|
|
1029 # The main CSS file
|
|
1030 "hexagram.css",
|
|
1031 # The main JavaScript file that runs the page
|
|
1032 "hexagram.js",
|
|
1033 # Web Worker for statistics
|
|
1034 "statistics.js",
|
|
1035 # File with all the tool code
|
|
1036 "tools.js"
|
|
1037 ]
|
|
1038
|
|
1039 # We'd just use a directory of static files, but Galaxy needs single-level
|
|
1040 # output.
|
|
1041 for filename in static_files:
|
|
1042 shutil.copy2(os.path.join(tool_root, filename), options.directory)
|
|
1043
|
|
1044 # Copy the HTML file to our output file. It automatically knows to read
|
|
1045 # assignments.tab, and does its own TSV parsing
|
|
1046 shutil.copy2(os.path.join(tool_root, "hexagram.html"), options.html)
|
|
1047
|
|
1048 # Delete our temporary directory.
|
|
1049 shutil.rmtree(drl_directory)
|
|
1050
|
|
1051 print "Visualization generation complete!"
|
|
1052
|
|
1053 return 0
|
|
1054
|
|
if __name__ == "__main__":
    try:
        # Run the program and capture its return code rather than calling
        # sys.exit inside main, because sys.exit works by raising SystemExit.
        return_code = main(sys.argv)
    except Exception:
        # Catch only ordinary errors. A bare "except:" here would also catch
        # SystemExit and KeyboardInterrupt, wrongly converting a deliberate
        # sys.exit(n) inside main() (or a Ctrl-C) into exit code 1; letting
        # those propagate preserves their proper exit semantics.
        traceback.print_exc()
        # Return a definite number and not some unspecified error code.
        return_code = 1

    sys.exit(return_code)
|