Mercurial > repos > adam-novak > hexagram
diff hexagram-6ae12361157c/hexagram/hexagram.py @ 0:1407e3634bcf draft default tip
Uploaded r11 from test tool shed.
author | adam-novak |
---|---|
date | Tue, 22 Oct 2013 14:17:59 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python2.7
"""
hexagram.py: Given a matrix of similarities, produce a hexagram visualization.

This script takes in the filename of a tab-separated value file containing a
sparse similarity matrix (with string labels) and several matrices of
layer/score data. It produces an HTML file (and several support files) that
provide an interactive visualization of the items clustered on a hexagonal grid.

This script depends on the DrL graph layout package, binaries for which must be
present in your PATH.

Re-uses sample code and documentation from
<http://users.soe.ucsc.edu/~karplus/bme205/f12/Scaffold.html>
"""

# numpy was imported twice in the original; once is enough.
import argparse, sys, os, itertools, math, numpy, subprocess, shutil, tempfile
import collections, multiprocessing, traceback
import scipy.stats, scipy.linalg
import os.path
import tsv

# Global list holding the opened similarity-matrix TsvReaders
# (filled in by open_matrices).
matrices = []


def parse_args(args):
    """
    Takes in the command-line arguments list (args), and returns a nice
    argparse result with fields for all the options.

    Borrows heavily from the argparse documentation examples:
    <http://docs.python.org/library/argparse.html>
    """

    # The command line arguments start with the program name, which we don't
    # want to treat as an argument for argparse. So we remove it.
    args = args[1:]

    # RawDescriptionHelpFormatter keeps the module docstring's own 80-column
    # wrapping intact in the --help output.
    # See http://docs.python.org/library/argparse.html#formatter-class
    parser = argparse.ArgumentParser(description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Now add all the options to it.
    # Options match the ctdHeatmap tool options as much as possible.
    parser.add_argument("similarity", type=str, nargs='+',
        help="the unopened files of similarity matrices")
    # The original help string here was a copy-paste of the one above.
    parser.add_argument("--names", type=str, action="append", default=[],
        help="human-readable names for the similarity matrices")
    parser.add_argument("--scores", type=str,
        action="append", default=[],
        help="a TSV to read scores for each signature from")
    parser.add_argument("--colormaps", type=argparse.FileType("r"),
        default=None,
        help="a TSV defining coloring and value names for discrete scores")
    parser.add_argument("--html", "-H", type=str,
        default="index.html",
        help="where to write HTML report")
    parser.add_argument("--directory", "-d", type=str, default=".",
        help="directory in which to create other output files")
    parser.add_argument("--query", type=str, default=None,
        help="Galaxy-escaped name of the query signature")
    parser.add_argument("--window_size", type=int, default=20,
        help="size of the window to use when looking for clusters")
    parser.add_argument("--truncation_edges", type=int, default=10,
        help="number of edges for DrL truncate to pass per node")
    parser.add_argument("--no-stats", dest="stats", action="store_false",
        default=True,
        help="disable cluster-finding statistics")
    parser.add_argument("--include-singletons", dest="singletons",
        action="store_true", default=False,
        help="add self-edges to retain unconnected points")

    return parser.parse_args(args)
def hexagon_center(x, y, scale=1.0):
    """
    Return the 2D Euclidean center of the hexagon at integer grid column x,
    row y, on a grid of hexagons laid out with "wiggly" rows in x (odd
    columns are shifted up by half a row).

    scale is a float giving the hexagon side length. The origin of
    coordinate space is the upper left corner of the bounding box of the
    hexagon at x=0, y=0.

    Returns a tuple of floats.
    """
    # Geometry: a hexagon with side 1 is 2 across corner-to-corner (x) and
    # sqrt(3) across side-to-side (y). Columns advance by 1.5 in x; rows
    # advance by sqrt(3) in y, minus half a row in odd columns.
    shift = 0.5 * math.sqrt(3) if x % 2 == 1 else 0.0
    unscaled_y = math.sqrt(3) * y - shift

    center_x = 1.5 * x * scale + scale
    center_y = unscaled_y * scale + math.sqrt(3.0) / 2.0 * scale
    return (center_x, center_y)
def hexagon_pick(x, y, scale=1.0):
    """
    Map a point (x, y) in the plane to the (column, row) integer indices of
    the hexagon grid cell containing it.

    scale is a float giving the hexagon side length.

    Method follows http://blog.ruslans.com/2011/02/hexagonal-grid-math.html,
    except that our grid wiggles the other way: odd columns go up (-y).
    """
    # Height of one hexagon (and of one rectangular picking tile).
    hex_height = math.sqrt(3) * scale

    # We first pick a rectangular tile: from the point of one side-triangle
    # to the base of the other in width (corner to line-between-far-corners),
    # and a full hexagon in height.
    tile_width = (3.0 / 2.0 * scale)

    # Column index of the tile the point falls in.
    tile_x = int(math.floor(x / tile_width))

    # Shift y so odd tile columns line up with even ones; this feeds both the
    # tile row index and the tile-internal pick below.
    corrected_y = y + (tile_x % 2) * hex_height / 2.0
    tile_y = int(math.floor(corrected_y / hex_height))

    # Coordinates of the point relative to its tile's corner.
    internal_x = x - tile_x * tile_width
    internal_y = corrected_y - tile_y * hex_height

    # The tile is split between the bulk of one hexagon and two corner
    # slivers that belong to the previous column.
    if internal_x > scale * abs(0.5 - internal_y / hex_height):
        # We're in the bulk of the tile: column and row are the tile's own.
        return (tile_x, tile_y)

    # We're in a corner sliver, so the column is the previous one. Which row
    # depends on which corner and on the column's parity: in an even column
    # the lower left belongs to the next row and the upper left to the same
    # row; in an odd column, the same row and the previous row respectively.
    if internal_y > hex_height / 2.0:
        # Lower-left corner: always one row below the upper-left corner.
        corner_row_offset = 1
    else:
        corner_row_offset = 0

    # TODO: verify this for correctness. It seems to be right, but it
    # deserves a unit test to be sure.
    return (tile_x - 1, tile_y - tile_x % 2 + corner_row_offset)
def radial_search(center_x, center_y):
    """
    An iterator that yields hexagon coordinate tuples (x, y) in order of
    increasing hex-grid distance from the specified center position, starting
    with the center itself. The iterator never terminates.
    """
    # Breadth-first search outwards over the grid. A deque gives O(1)
    # pops at the left; the set records every cell ever enqueued so nothing
    # is enqueued twice.
    queue = collections.deque()
    seen = set()

    # First place to visit is the center. Seeding `seen` with it fixes a bug
    # in the original, which never marked the center as seen, so a neighbor
    # re-enqueued it and it was yielded a second time.
    queue.append((center_x, center_y))
    seen.add((center_x, center_y))

    while len(queue) > 0:
        # In theory we never run out: each cell enqueues new neighbors.
        x, y = queue.popleft()
        yield (x, y)

        # Every hexagon has the four orthogonal neighbors; the two diagonal
        # neighbors depend on parity.
        # NOTE(review): the original comments describe even/odd *columns*,
        # but the code tests y (the row) — confirm which is intended before
        # changing this.
        neighbor_offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        if y % 2 == 0:
            neighbor_offsets += [(-1, 1), (1, 1)]
        else:
            neighbor_offsets += [(-1, -1), (1, -1)]

        for x_offset, y_offset in neighbor_offsets:
            neighbor = (x + x_offset, y + y_offset)
            if neighbor not in seen:
                # Never been in the queue: enqueue and mark it.
                queue.append(neighbor)
                seen.add(neighbor)
+ """ + + # A hexagon has neighbors at the following relative coordinates: + # (-1, 0), (1, 0), (0, -1), (0, 1) + # and ((-1, 1) and (1, 1) if in an even column) + # or ((-1, -1) and (1, -1) if in an odd column) + + # We're going to go outwards using breadth-first search, so we need a queue + # of hexes to visit and a set of already visited hexes. + + # This holds a queue (really a deque) of hexes waiting to be visited. + # A list has O(n) pop/insert at left. + queue = collections.deque() + # This holds a set of the (x, y) coordinate tuples of already-seen hexes, + # so we don't enqueue them again. + seen = set() + + # First place to visit is the center. + queue.append((center_x, center_y)) + + while len(queue) > 0: + # We should in theory never run out of items in the queue. + # Get the current x and y to visit. + x, y = queue.popleft() + + # Yield the location we're visiting + yield (x, y) + + # This holds a list of all relative neighbor positions as (x, y) tuples. + neighbor_offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)] + if y % 2 == 0: + # An even-column hex also has these neighbors + neighbor_offsets += [(-1, 1), (1, 1)] + else: + # An odd-column hex also has these neighbors + neighbor_offsets += [(-1, -1), (1, -1)] + + for x_offset, y_offset in neighbor_offsets: + # First calculate the absolute position of the neighbor in x + neighbor_x = x + x_offset + # And in y + neighbor_y = y + y_offset + + if (neighbor_x, neighbor_y) not in seen: + # This is a hex that has never been in the queue. Add it. + queue.append((neighbor_x, neighbor_y)) + + # Record that it has ever been enqueued + seen.add((neighbor_x, neighbor_y)) + + + + +def assign_hexagon(hexagons, node_x, node_y, node, scale=1.0): + """ + This function assigns the given node to a hexagon in hexagons. hexagons is a + defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes, + or None if a hexagon is free. 
def assign_hexagon_local_radial(hexagons, node_x, node_y, node, scale=1.0):
    """
    Assign the given node to a hexagon in hexagons. hexagons is a defaultdict
    from (x, y) integer hexagon indices to the node assigned there, or None
    if the hexagon is free. node_x and node_y are the node's coordinates,
    already adapted so the seed node lands in the (0, 0) hexagon and
    re-scaled to reduce hexagon conflicts. node is the node to be assigned.
    scale, if specified, is the hexagon side length in node space units.

    The node goes into its closest hexagon. If that hexagon is full, we
    re-probe outward along the direction from that hexagon's center to the
    node. On return, node is stored in hexagons under some (x, y) tuple.

    Returns the hex-grid distance between the hexagon used and the ideal one.
    """
    # The hexagon this point falls in, which may already be taken, and its
    # center in node space.
    ideal_x, ideal_y = hexagon_pick(node_x, node_y, scale=scale)
    center_x, center_y = hexagon_center(ideal_x, ideal_y, scale=scale)

    # Distance from the node to that center, and the unit direction from the
    # center towards the node.
    # NOTE(review): a node sitting exactly on the center divides by zero
    # here — confirm whether that can happen upstream.
    node_distance = math.sqrt((node_x - center_x) ** 2 +
        (node_y - center_y) ** 2)
    direction_x = (node_x - center_x) / node_distance
    direction_y = (node_y - center_y) / node_distance

    # Probe along that direction in scale-sized steps, starting at the ideal
    # hexagon's center. Loop invariant: (test_x, test_y) lies in hex (x, y).
    # TODO: is scale the right step size? scale/2 was also considered.
    x, y = ideal_x, ideal_y
    test_x, test_y = center_x, center_y
    while hexagons[(x, y)] is not None:
        test_x += direction_x * scale
        test_y += direction_y * scale
        x, y = hexagon_pick(test_x, test_y, scale=scale)

    # We've reached the edge of the occupied cluster; claim this hexagon.
    hexagons[(x, y)] = node

    return math.sqrt((x - ideal_x) ** 2 + (y - ideal_y) ** 2)
+ """ + + # These hold the hexagon that the point falls in, which may be taken. + best_x, best_y = hexagon_pick(node_x, node_y, scale=scale) + + # These hold the center of that hexagon in float space + center_x, center_y = hexagon_center(best_x, best_y, scale=scale) + + # This holds the distance from this point to the center of that hexagon + node_distance = math.sqrt((node_x - center_x) ** 2 + (node_y - center_y) ** + 2) + + # These hold the normalized direction of this point, relative to the center + # of its best hexagon + direction_x = (node_x - center_x) / node_distance + direction_y = (node_y - center_y) / node_distance + + # Do a search in that direction, starting at the best hex. + + # These are the hexagon indices we're considering + x, y = best_x, best_y + + # These are the Cartesian coordinates we're probing. Must be in the x, y hex + # as a loop invariant. + test_x, test_y = center_x, center_y + + while hexagons[(x, y)] is not None: + # Re-probe outwards from the best hex in scale/2-sized steps + # TODO: is that the right step size? Scale-sized steps seemed slightly + # large. + test_x += direction_x * scale + test_y += direction_y * scale + + # Re-pick x and y for the hex containing our test point + x, y = hexagon_pick(test_x, test_y, scale=scale) + + # We've finally reached the edge of the cluster. + # Drop our hexagon + hexagons[(x, y)] = node + + return math.sqrt((x - best_x) ** 2 + (y - best_y) ** 2) + +def assign_hexagon_radial(hexagons, node_x, node_y, node, scale=1.0): + """ + This function assigns the given node to a hexagon in hexagons. hexagons is a + defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes, + or None if a hexagon is free. node_x and node_y are the x and y coordinates + of the node, adapted so that the seed node lands in the 0, 0 hexagon, and + re-scaled to reduce hexagon conflicts. node is the node to be assigned. + scale, if specified, is the hexagon side length in node space units. 
def hexagons_in_window(hexagons, x, y, width, height):
    """
    Given a dict from (x, y) position to signature names, return the list of
    all signatures in the window starting at hexagon x, y and extending width
    in the x direction and height in the y direction on the hexagon grid.
    """
    # This holds the list of signatures we've found.
    found = []
    for i in range(x, x + width):
        for j in range(y, y + height):
            # `in` replaces the long-deprecated dict.has_key.
            if (i, j) in hexagons:
                # This position in the window has a hex.
                found.append(hexagons[(i, j)])
    return found

class ClusterFinder(object):
    """
    A class that can be invoked to find the p value of the best cluster in
    its layer. Instances are pickleable.
    """

    def __init__(self, hexagons, layer, window_size=5):
        """
        Keep the given hexagons dict (from (x, y) to signature name), the
        given layer (a dict from signature name to a value), and the given
        window size, in a ClusterFinder object.
        """
        # TODO: This should probably all operate on numpy arrays that we can
        # slice efficiently.

        # Store the hexagon assignments (the original's comments for these
        # two assignments were swapped).
        self.hexagons = hexagons
        # Store the layer
        self.layer = layer
        # Store the window size
        self.window_size = window_size

    @staticmethod
    def continuous_p(in_values, out_values):
        """
        Get the p value for in_values and out_values being distinct
        continuous distributions, using the Mann-Whitney U test.

        in_values and out_values are both Numpy arrays. Returns the p value,
        or raises a ValueError if the statistical test cannot be run for
        some reason.
        """
        u_statistic, p_value = scipy.stats.mannwhitneyu(in_values,
            out_values)
        return p_value

    @staticmethod
    def dichotomous_p(in_values, out_values):
        """
        Given two one-dimensional Numpy arrays of 0s and 1s, compute a p
        value for the in_values having a different probability of being 1
        than the frequency of 1s in the out_values.

        Uses an exact binomial test, which does not rely on the normal
        approximation, so it stays valid for arbitrarily small frequencies
        of either 0s or 1s in in_values.

        Raises ValueError if out_values is empty.

        TODO: What if out_values is shorter than in_values?
        """
        if len(out_values) == 0:
            raise ValueError("Background group is empty!")

        # Observed frequency of 1s in out_values. Force float division: with
        # integer input arrays the original's integer division truncated
        # this to 0.
        frequency = numpy.sum(out_values) / float(len(out_values))

        # Number of 1s in in_values, and the number of "trials" they
        # occurred in.
        successes = numpy.sum(in_values)
        trials = len(in_values)

        # How significantly does the frequency inside differ from outside?
        # scipy.stats.binom_test was removed in SciPy 1.12; fall back to its
        # replacement, binomtest, when it is gone.
        if hasattr(scipy.stats, "binom_test"):
            return scipy.stats.binom_test(successes, trials, frequency)
        return scipy.stats.binomtest(int(successes), int(trials),
            frequency).pvalue

    @staticmethod
    def categorical_p(in_values, out_values):
        """
        Given two one-dimensional Numpy arrays of integers (which may be
        stored as floats), representing items assigned to different
        categories, return a p value for the distribution of categories
        observed in in_values differing from that observed in out_values.

        Uses a chi-squared goodness of fit test. That test has invalid
        assumptions when any category has fewer than 5 expected and 5
        observed observations (see the scipy.stats.chisquare docs), but the
        tests that don't break down are prohibitively slow, so we use it
        anyway.
        """
        # Convert our inputs to integer arrays.
        in_values = in_values.astype(int)
        out_values = out_values.astype(int)

        # How many categories are there (count 0 to the maximum value).
        num_categories = max(numpy.max(in_values),
            numpy.max(out_values)) + 1

        # Count the number of in_values and out_values in each category.
        in_counts = numpy.array([len(in_values[in_values == i]) for i in
            range(num_categories)])
        out_counts = numpy.array([len(out_values[out_values == i]) for i in
            range(num_categories)])

        # Get the p value for the window being from the estimated
        # distribution. None of the distribution parameters count as
        # "estimated from data" because they aren't estimated from the data
        # under test.
        _, p_value = scipy.stats.chisquare(in_counts, out_counts)
        return p_value

    def __call__(self):
        """
        Find the best p value for any window of size window_size. Return it.
        """
        # Calculate the bounding box where we want to look for windows.
        # TODO: This would just be all of a numpy array.
        min_x = min(coords[0] for coords in self.hexagons.keys())
        min_y = min(coords[1] for coords in self.hexagons.keys())
        max_x = max(coords[0] for coords in self.hexagons.keys())
        max_y = max(coords[1] for coords in self.hexagons.keys())

        # Layer values laid out by (x, y). Fill with NaN so missing hexes
        # can be masked out later. (numpy.NAN was removed in NumPy 2.0;
        # numpy.nan is the portable spelling.)
        layer_data = numpy.empty((max_x - min_x + 1, max_y - min_y + 1))
        layer_data[:] = numpy.nan

        for (hex_x, hex_y), name in self.hexagons.items():
            # Copy the layer values into the Numpy array.
            if name in self.layer:
                layer_data[hex_x - min_x, hex_y - min_y] = self.layer[name]

        # Masked version of the layer data (NaNs masked out).
        layer_data_masked = numpy.ma.masked_invalid(layer_data, copy=False)

        # Smallest p value found so far for this layer.
        best_p = float("+inf")

        # The statistical test to use: a function from two Numpy arrays to a
        # p value. Start with the most specific (dichotomous: 0 or 1) and
        # upgrade below when the data doesn't fit its domain.
        statistical_test = self.dichotomous_p

        if numpy.sum(~layer_data_masked.mask) == 0:
            # There is actually no data in this layer at all. nditer
            # complains if we try to iterate over an empty thing, so quit
            # early and say we couldn't find anything.
            return best_p

        for value in numpy.nditer(
                layer_data_masked[~layer_data_masked.mask]):
            # Check all the values in the layer; if one is outside the
            # current test's domain, upgrade to a more general test.
            if statistical_test == self.dichotomous_p and (value > 1 or
                value < 0):
                # Outside 0..1, but no non-integers seen yet: categorical.
                statistical_test = self.categorical_p

            if value % 1 != 0:
                # A non-integer forces the continuous test. It's the least
                # specific, so we can stop scanning now.
                statistical_test = self.continuous_p
                break

        for i in range(min_x, max_x - self.window_size):
            for j in range(min_y, max_y - self.window_size):

                # Layer values for hexes in the window, as a masked array.
                in_region = layer_data_masked[i:i + self.window_size,
                    j:j + self.window_size]

                # And as a 1d Numpy array.
                in_values = numpy.reshape(in_region[~in_region.mask],
                    -1).data

                # Everything outside the window, by masking the window out
                # of a copy of the whole layer.
                out_region = numpy.ma.copy(layer_data_masked)
                out_region.mask[i:i + self.window_size,
                    j:j + self.window_size] = True

                # And as a 1d Numpy array.
                out_values = numpy.reshape(out_region[~out_region.mask],
                    -1).data

                if len(in_values) == 0 or len(out_values) == 0:
                    # Can't do any stats on this window.
                    continue

                if len(in_values) < 0.5 * self.window_size ** 2:
                    # The window is less than half full. Skip it.
                    # TODO: Make this threshold configurable.
                    continue

                try:
                    # p value for this window under the selected test.
                    p_value = statistical_test(in_values, out_values)

                    # If this is the best p value so far, record it.
                    best_p = min(best_p, p_value)
                except ValueError:
                    # Probably an all-zero layer, or something else the test
                    # can't handle. Try the remaining windows to be safe;
                    # maybe one will work.
                    pass

        # We have now found the best p for any window for this layer.
        print("Best p found: {}".format(best_p))
        sys.stdout.flush()

        return best_p
+ print "Best p found: {}".format(best_p) + sys.stdout.flush() + + return best_p + +def run_functor(functor): + """ + Given a no-argument functor (like a ClusterFinder), run it and return its + result. We can use this with multiprocessing.map and map it over a list of + job functors to do them. + + Handles getting more than multiprocessing's pitiful exception output + """ + + try: + return functor() + except: + # Put all exception text into an exception and raise that + raise Exception(traceback.format_exc()) + +def open_matrices(names): + """ + The argument parser now take multiple similarity matrices as input and + saves their file name as strings. We want to store the names of these + strings for display later in hexagram.js in order to allow the user to + navigate and know what type of visualization map they are looking at - + gene expression, copy number, etc. + + Since, the parser no longer opens the files automatically we must, do it + in this function. + """ + + # For each file name, open the file and add it to the matrices list + # 'r' is the argument stating that the file will be read-only + for similarity_filename in names: + print "Opening Matrices..." + matrix_file = tsv.TsvReader(open(similarity_filename, "r")) + matrices.append(matrix_file) + +def compute_beta (coords, matrix, axis, index, options): + """ + Compute and return a beta matrix from coords * matrix. + Then print the matrix to a file to be read on clientside. + """ + beta = coords * matrix + return beta + # Must add writing function + +def drl_similarity_functions(matrix, index, options): + """ + Performs all the functions needed to format a similarity matrix into a + tsv format whereby the DrL can take the values. Then all of the DrL + functions are performed on the similarity matrix. + + Options is passed to access options.singletons and other required apsects + of the parsed args. + """ + + # Work in a temporary directory + # If not available, create the directory. 
def drl_similarity_functions(matrix, index, options):
    """
    Format a similarity matrix into the tsv form that DrL accepts, then run
    the whole DrL pipeline (truncate, layout, recoord) on it.

    matrix is a TsvReader over the similarity matrix. index distinguishes
    this layout's files from those of the other matrices. options supplies
    options.singletons, options.truncation_edges and options.directory.

    Returns a dict from signature name to an (x, y) float coordinate tuple
    for every node that made it through DrL (and therefore needs its score
    data sent to the client).
    """
    # DrL does all of its work on files in a temporary scratch directory.
    drl_directory = tempfile.mkdtemp()

    # Base name for every file DrL uses for this layout; index keeps the
    # respective layouts' files apart.
    drl_basename = os.path.join(drl_directory, "layout" + str(index))

    # Run the similarity matrix through our tsv parser (strips comments and
    # ensures validity) into the .sim file DrL truncate reads.
    sim_writer = tsv.TsvWriter(open(drl_basename + ".sim", "w"))

    print("Regularizing similarity matrix...")
    sys.stdout.flush()

    # Every unique signature name in the matrix, for adding self-edges.
    signatures = set()

    print("Reach for parts in sim_reader")
    for parts in matrix:
        # Keep the signature names used.
        signatures.add(parts[0])
        signatures.add(parts[1])
        # Save the line to the regularized file.
        sim_writer.list_line(parts)

    if options.singletons:
        # Self-edge every node so nodes with no other strictly positive
        # edges don't get dropped.
        for signature in signatures:
            sim_writer.line(signature, signature, 1)

    matrix.close()
    sim_writer.close()

    # Our input for DrL is prepared. Run the pipeline.
    # TODO: pass a truncation level.
    print("DrL: Truncating...")
    sys.stdout.flush()
    subprocess.check_call(["truncate", "-t", str(options.truncation_edges),
        drl_basename])

    print("DrL: Doing layout...")
    sys.stdout.flush()
    subprocess.check_call(["layout", drl_basename])

    # Put the string signature names back on the coordinates.
    print("DrL: Restoring names...")
    sys.stdout.flush()
    subprocess.check_call(["recoord", drl_basename])

    # DrL has saved <signature name>\t<x>\t<y> rows in <basename>.coord;
    # read them into a dict from signature name to (x, y).
    nodes = {}

    print("Reading DrL output...")
    sys.stdout.flush()
    coord_reader = tsv.TsvReader(open(drl_basename + ".coord", "r"))
    for parts in coord_reader:
        nodes[parts[0]] = (float(parts[1]), float(parts[2]))
    coord_reader.close()

    # Save the DrL coordinates in our bundle as drl<index>.tab, with names
    # instead of numbers like DrL recoord would have written — this is what
    # the client-side Javascript wants, and it helps debugging.
    coord_writer = tsv.TsvWriter(open(
        os.path.join(options.directory, "drl" + str(index) + ".tab"), "w"))
    for signature_name, (x, y) in nodes.items():
        coord_writer.line(signature_name, x, y)
    coord_writer.close()

    # Clean up the scratch directory.
    shutil.rmtree(drl_directory)

    return nodes

def compute_hexagram_assignments(nodes, index, options):
    """
    Assign every node from one matrix's DrL output to a hexagon.

    nodes is the dict from signature name to (x, y) produced by
    drl_similarity_functions; index relates this matrix's output files to
    each other; options supplies options.directory.

    Writes the assignments to assignments<index>.tab as id, x, y rows (read
    by the JavaScript on the static page to produce the visualization), and
    returns a dict from node name to normalized placement badness for use
    elsewhere.
    """
    # Squiggly-rows hexagon layout: everything is integer x, y.
    # Defaultdict from (x, y) integer tuple to the id that goes there, or
    # None if it's free.
    global hexagons
    hexagons = collections.defaultdict(lambda: None)

    # Hexagon side length used for assignment.
    side_length = 1.0

    # What will become a layer of how badly placed each hexagon is:
    # a dict from node name to distance from its ideal location.
    placement_badnesses = {}

    for node, (node_x, node_y) in nodes.items():
        # Assign each node to a hexagon, recording the resulting badness.
        badness = assign_hexagon(hexagons, node_x, node_y, node,
            scale=side_length)
        placement_badnesses[node] = float(badness)

    # Normalize the placement badness layer by its maximum, when possible.
    max_placement_badness = max(placement_badnesses.values())
    print("Max placement badness: {}".format(max_placement_badness))
    if max_placement_badness != 0:
        placement_badnesses = {node: value / max_placement_badness for node,
            value in placement_badnesses.items()}

    # A plain dict pickles; a defaultdict with a lambda does not.
    # TODO: restructure so this conversion isn't needed.
    hexagons = dict(hexagons)

    # Dump the assignments, shifted so every coordinate is positive.
    hexagon_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "assignments" + str(index) + ".tab"), "w"))

    min_x = min(coords[0] for coords in hexagons.keys())
    min_y = min(coords[1] for coords in hexagons.keys())

    for coords, name in hexagons.items():
        hexagon_writer.line(name, coords[0] - min_x, coords[1] - min_y)
    hexagon_writer.close()

    return placement_badnesses
def write_matrix_names(options):
    """
    Write the names of the similarity matrices to matrixnames.tab so that
    hexagram.js can process the names and build the toggle layout GUI.

    options supplies the parsed args: options.names (the matrix names) and
    options.directory (where to write the file).
    """
    name_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "matrixnames.tab"), "w"))
    for matrix_name in options.names:
        name_writer.line(matrix_name)
    name_writer.close()
    # Run the DrL graph layout on each opened similarity matrix; each call
    # returns the laid-out nodes for that matrix.
    for index, i in enumerate (matrices):
        nodes_multiple.append (drl_similarity_functions(i, index, options))

    # Compute Hexagram Assignments for each similarity matrix's drl output,
    # which is found in nodes_multiple.

    # placement_badnesses_multiple list is required to store the placement
    # badness dicts that are returned by the compute_hexagram_assignments
    # function.
    # NOTE(review): only placement_badnesses_multiple[0] (the first matrix) is
    # used below when building the "Placement Badness" layer.
    placement_badnesses_multiple = []
    for index, i in enumerate (nodes_multiple):
        placement_badnesses_multiple.append (compute_hexagram_assignments (i, index, options))

    # Now that we have hex assignments, compute layers.

    # In addition to making per-layer files, we're going to copy all the score
    # matrices to our output directory. That way, the client can download layers
    # in big chunks when it wants all layer data for statistics. We need to
    # write a list of matrices that the client can read, which is written by
    # this TSV writer.
    matrix_index_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "matrices.tab"), "w"))

    # Read in all the layer data at once
    # TODO: Don't read in all the layer data at once

    # This holds a dict from layer name to a dict from signature name to
    # score.
    layers = {}

    # This holds the names of all layers
    layer_names = []

    for matrix_number, score_filename in enumerate(options.scores):
        # First, copy the whole matrix into our output. This holds its filename.
        output_filename = "matrix_{}.tab".format(matrix_number)
        shutil.copy2(score_filename, os.path.join(options.directory,
            output_filename))

        # Record where we put it
        matrix_index_writer.line(output_filename)

        # This holds a reader for the scores TSV
        scores_reader = tsv.TsvReader(open(score_filename, "r"))

        # This holds an iterator over lines in that file.
        # TODO: Write a proper header/data API
        scores_iterator = scores_reader.__iter__()

        try:
            # This holds the names of the columns (except the first, which is
            # labels).
            # They also happen to be layer names.
            # scores_iterator.next() raises StopIteration on an empty file,
            # which the except clause below treats as "no data, skip".
            file_layer_names = scores_iterator.next()[1:]

            # Add all the layers in this file to the complete list of layers.
            layer_names += file_layer_names

            # Ensure that we have a dict for every layer mentioned in the file
            # (even the ones that have no data below). Doing it this way means
            # all score matrices need disjoint columns, or the last one takes
            # precedence.
            for name in file_layer_names:
                layers[name] = {}

            for parts in scores_iterator:
                # This is the signature that this line is about
                signature_name = parts[0]

                # NOTE(review): membership is only checked against the FIRST
                # matrix's node set (nodes_multiple[0]).
                if signature_name not in nodes_multiple[0]:
                    # This signature wasn't in our DrL output. Don't bother
                    # putting its layer data in our visualization. This saves
                    # space and makes the client-side layer counts accurate for
                    # the data actually displayable.
                    continue

                # These are the scores for all the layers for this signature
                layer_scores = parts[1:]

                for (layer_name, score) in itertools.izip(file_layer_names,
                    layer_scores):

                    # Store all the layer scores in the appropriate
                    # dictionaries.
                    try:
                        layers[layer_name][signature_name] = float(score)
                    except ValueError:
                        # This is not a float.
                        # Don't set that entry for this layer.
                        # TODO: possibly ought to complain to the user? But then
                        # things like "N/A" won't be handled properly.
                        continue

        except StopIteration:
            # We don't have any real data here. Couldn't read the header line.
            # Skip to the next file
            pass

        # We're done with this score file now
        scores_reader.close()

    # We're done with all the input score matrices, so our index is done too.
    matrix_index_writer.close()

    # We have now loaded all layer data into memory as Python objects. What
    # could possibly go wrong?

    # Stick our placement badness layer on the end (computed earlier from the
    # first similarity matrix's hexagon assignment).
    layer_names.append("Placement Badness")
    layers["Placement Badness"] = placement_badnesses_multiple[0]

    # Now we need to write layer files.

    # Generate some filenames for layers that we can look up by layer name.
    # We do this because layer names may not be valid filenames.
    layer_files = {name: os.path.join(options.directory,
        "layer_{}.tab".format(number)) for (name, number) in itertools.izip(
        layer_names, itertools.count())}

    for layer_name, layer in layers.iteritems():
        # Write out all the individual layer files.
        # This holds the writer for this layer file.
        scores_writer = tsv.TsvWriter(open(layer_files[layer_name], "w"))
        for signature_name, score in layer.iteritems():
            # Write the score for this signature in this layer
            scores_writer.line(signature_name, score)
        scores_writer.close()

    # We need something to sort layers by. We have "priority" (lower is
    # better)

    if len(layer_names) > 0 and options.stats:
        # We want to do this fancy parallel stats thing.
        # We skip it when there are no layers, so we don't try to join a
        # never-used pool, which seems to hang.

        print "Running statistics..."

        # This holds a list of ClusterFinders, one per layer, built from the
        # (now picklable) hexagons dict. ClusterFinder is defined elsewhere
        # in this file; instances must pickle to cross the pool boundary.
        cluster_finders = [ClusterFinder(hexagons, layers[layer_name],
            window_size=options.window_size) for layer_name in layer_names]

        print "{} jobs to do.".format(len(cluster_finders))

        # This holds a multiprocessing pool for parallelization
        pool = multiprocessing.Pool()

        # This holds all the best p values in the same order.
        # pool.map preserves input order, so results line up with layer_names.
        best_p_values = pool.map(run_functor, cluster_finders)

        # Close down the pool so multiprocessing won't die sillily at the end
        pool.close()
        pool.join()

        # This holds a dict from layer name to priority (best p value)
        layer_priorities = {layer_name: best_p_value for layer_name,
            best_p_value in itertools.izip(layer_names, best_p_values)}
    else:
        # We aren't doing any stats.

        print "Skipping statistics."

        # Make up priorities.
        # Every layer gets the worst possible priority (lower is better).
        layer_priorities = {name: float("+inf") for name in layer_names}

    # Count how many layer entries are greater than 0 for each binary layer, and
    # store that number in this dict by layer name. Things with the default
    # empty string instead of a number aren't binary layers, but they can use
    # the empty string as their TSV field value, so we can safely pull any layer
    # out of this by name.
    layer_positives = collections.defaultdict(str)

    for layer_name in layer_names:
        # Assume it's a binary layer until proven otherwise
        layer_positives[layer_name] = 0
        for value in layers[layer_name].itervalues():
            if value == 1:
                # Count up all the 1s in the layer
                layer_positives[layer_name] += 1
            elif value != 0:
                # It has something that isn't 1 or 0, so it can't be a binary
                # layer. Throw it out and try the next layer.
                layer_positives[layer_name] = ""
                break

    # Write an index of all the layers we have, in the form:
    # <layer>\t<file>\t<priority>\t<number of signatures with data>\t<number of
    # signatures that are 1 for binary layers, or empty>
    # This is the writer to use.
    index_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "layers.tab"), "w"))

    for layer_name, layer_file in layer_files.iteritems():
        # Write the index entry for this layer (basename only, since the
        # client resolves layer files relative to the output directory).
        index_writer.line(layer_name, os.path.basename(layer_file),
            layer_priorities[layer_name], len(layers[layer_name]),
            layer_positives[layer_name])

    index_writer.close()

    # Sahil will implement linear regression code here

    # We must create a m * n matrix of samples * genes.
    # In order to create this matrix we first must know the number of hexes
    # and maintain them in a certain order. The order is important so that
    # we populate the matrix with the data values in the proper row (sample).

    # Copy over the user-specified colormaps file, or make an empty TSV if it's
    # not specified.

    # This holds a writer for the colormaps file. Creating it creates the file.
    # Created unconditionally so colormaps.tab always exists for the client,
    # even when the user supplied no colormap data.
    colormaps_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "colormaps.tab"), "w"))

    if options.colormaps is not None:
        # The user specified colormap data, so copy it over.
        # This holds a reader for the colormaps file.
        # NOTE(review): options.colormaps appears to already be an open file
        # object (it is passed straight to TsvReader, unlike the score
        # filenames which are open()ed) — confirm against parse_args.
        colormaps_reader = tsv.TsvReader(options.colormaps)

        print "Regularizing colormaps file..."
        sys.stdout.flush()

        for parts in colormaps_reader:
            # Re-emit each row so the output is uniformly TSV-formatted.
            colormaps_writer.list_line(parts)

        colormaps_reader.close()

    # Close the colormaps file we wrote. It may have gotten data, or it may
    # still be empty.
    colormaps_writer.close()

    # Now copy any static files from where they live next to this Python file
    # into the web page bundle.
    # This holds the directory where this script lives, which also contains
    # static files.
    tool_root = os.path.dirname(os.path.realpath(__file__))

    # Copy over all the static files we need for the web page.
    # This holds a list of them.
    static_files = [
        # Static images
        "drag.svg",
        "filter.svg",
        "statistics.svg",
        "right.svg",
        "set.svg",
        "save.svg",
        "inflate.svg",
        "throbber.svg",

        # jQuery itself is pulled from a CDN.
        # We can't take everything offline since Google Maps needs to be sourced
        # from Google, so we might as well use CDN jQuery.

        # Select2 scripts and resources:
        "select2.css",
        "select2.js",
        "select2.png",
        "select2-spinner.gif",
        "select2x2.png",

        # The jQuery.tsv plugin
        "jquery.tsv.js",
        # The color library
        "color-0.4.1.js",
        # The jStat statistics library
        "jstat-1.0.0.js",
        # The Google Maps MapLabel library
        "maplabel-compiled.js",
        # The main CSS file
        "hexagram.css",
        # The main JavaScript file that runs the page
        "hexagram.js",
        # Web Worker for statistics
        "statistics.js",
        # File with all the tool code
        "tools.js"
        ]

    # We'd just use a directory of static files, but Galaxy needs single-level
    # output.
    # Copy each static asset flat into the output directory.
    for filename in static_files:
        shutil.copy2(os.path.join(tool_root, filename), options.directory)

    # Copy the HTML file to our output file. It automatically knows to read
    # assignments.tab, and does its own TSV parsing.
    shutil.copy2(os.path.join(tool_root, "hexagram.html"), options.html)

    print "Visualization generation complete!"

    # 0 = success; used as the process exit code by the __main__ guard below.
    return 0

if __name__ == "__main__" :
    try:
        # Get the return code to return.
        # Don't just exit with it because sys.exit works by exceptions.
        return_code = main(sys.argv)
    except:
        # Bare except is deliberate here: this is the top-level boundary, and
        # the traceback is printed before converting any failure into a
        # nonzero exit status.
        traceback.print_exc()
        # Return a definite number and not some unspecified error code.
        return_code = 1

    sys.exit(return_code)