Mercurial > repos > adam-novak > hexagram
diff hexagram-6ae12361157c/hexagram/hexagram.py @ 0:1407e3634bcf draft default tip
Uploaded r11 from test tool shed.
author | adam-novak |
---|---|
date | Tue, 22 Oct 2013 14:17:59 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python2.7
"""
hexagram.py: Given a matrix of similarities, produce a hexagram visualization.

This script takes in the filename of a tab-separated value file containing a
sparse similarity matrix (with string labels) and several matrices of
layer/score data. It produces an HTML file (and several support files) that
provide an interactive visualization of the items clustered on a hexagonal grid.

This script depends on the DrL graph layout package, binaries for which must be
present in your PATH.

Re-uses sample code and documentation from
<http://users.soe.ucsc.edu/~karplus/bme205/f12/Scaffold.html>
"""

# numpy was imported twice in the original; once is enough.
import argparse, sys, os, itertools, math, numpy, subprocess, shutil, tempfile
import collections, multiprocessing, traceback
import scipy.stats, scipy.linalg
import os.path
import tsv

# Global list holding the opened similarity-matrix TsvReaders
# (filled in by open_matrices).
matrices = []


def parse_args(args):
    """
    Takes in the command-line arguments list (args), and returns a nice
    argparse result with fields for all the options.

    Borrows heavily from the argparse documentation examples:
    <http://docs.python.org/library/argparse.html>
    """

    # The command line arguments start with the program name, which we don't
    # want to treat as an argument for argparse. So we remove it.
    args = args[1:]

    # RawDescriptionHelpFormatter keeps the module docstring's own 80-column
    # wrapping intact in the --help output.
    # See http://docs.python.org/library/argparse.html#formatter-class
    parser = argparse.ArgumentParser(description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Now add all the options to it.
    # Options match the ctdHeatmap tool options as much as possible.
    parser.add_argument("similarity", type=str, nargs='+',
        help="the unopened files of similarity matrices")
    # The original help string here was a copy-paste of the one above.
    parser.add_argument("--names", type=str, action="append", default=[],
        help="human-readable names for the similarity matrices")
    parser.add_argument("--scores", type=str,
        action="append", default=[],
        help="a TSV to read scores for each signature from")
    parser.add_argument("--colormaps", type=argparse.FileType("r"),
        default=None,
        help="a TSV defining coloring and value names for discrete scores")
    parser.add_argument("--html", "-H", type=str,
        default="index.html",
        help="where to write HTML report")
    parser.add_argument("--directory", "-d", type=str, default=".",
        help="directory in which to create other output files")
    parser.add_argument("--query", type=str, default=None,
        help="Galaxy-escaped name of the query signature")
    parser.add_argument("--window_size", type=int, default=20,
        help="size of the window to use when looking for clusters")
    parser.add_argument("--truncation_edges", type=int, default=10,
        help="number of edges for DrL truncate to pass per node")
    parser.add_argument("--no-stats", dest="stats", action="store_false",
        default=True,
        help="disable cluster-finding statistics")
    parser.add_argument("--include-singletons", dest="singletons",
        action="store_true", default=False,
        help="add self-edges to retain unconnected points")

    return parser.parse_args(args)
def hexagon_center(x, y, scale=1.0):
    """
    Return the 2D Euclidean center of the hexagon at integer grid column x,
    row y, on a grid of hexagons laid out with "wiggly" rows in x (odd
    columns are shifted up by half a row).

    scale is a float giving the hexagon side length. The origin of
    coordinate space is the upper left corner of the bounding box of the
    hexagon at x=0, y=0.

    Returns a tuple of floats.
    """
    # Geometry: a hexagon with side 1 is 2 across corner-to-corner (x) and
    # sqrt(3) across side-to-side (y). Columns advance by 1.5 in x; rows
    # advance by sqrt(3) in y, minus half a row in odd columns.
    shift = 0.5 * math.sqrt(3) if x % 2 == 1 else 0.0
    unscaled_y = math.sqrt(3) * y - shift

    center_x = 1.5 * x * scale + scale
    center_y = unscaled_y * scale + math.sqrt(3.0) / 2.0 * scale
    return (center_x, center_y)
def hexagon_pick(x, y, scale=1.0):
    """
    Map a point (x, y) in the plane to the (column, row) integer indices of
    the hexagon grid cell containing it.

    scale is a float giving the hexagon side length.

    Method follows http://blog.ruslans.com/2011/02/hexagonal-grid-math.html,
    except that our grid wiggles the other way: odd columns go up (-y).
    """
    # Height of one hexagon (and of one rectangular picking tile).
    hex_height = math.sqrt(3) * scale

    # We first pick a rectangular tile: from the point of one side-triangle
    # to the base of the other in width (corner to line-between-far-corners),
    # and a full hexagon in height.
    tile_width = (3.0 / 2.0 * scale)

    # Column index of the tile the point falls in.
    tile_x = int(math.floor(x / tile_width))

    # Shift y so odd tile columns line up with even ones; this feeds both the
    # tile row index and the tile-internal pick below.
    corrected_y = y + (tile_x % 2) * hex_height / 2.0
    tile_y = int(math.floor(corrected_y / hex_height))

    # Coordinates of the point relative to its tile's corner.
    internal_x = x - tile_x * tile_width
    internal_y = corrected_y - tile_y * hex_height

    # The tile is split between the bulk of one hexagon and two corner
    # slivers that belong to the previous column.
    if internal_x > scale * abs(0.5 - internal_y / hex_height):
        # We're in the bulk of the tile: column and row are the tile's own.
        return (tile_x, tile_y)

    # We're in a corner sliver, so the column is the previous one. Which row
    # depends on which corner and on the column's parity: in an even column
    # the lower left belongs to the next row and the upper left to the same
    # row; in an odd column, the same row and the previous row respectively.
    if internal_y > hex_height / 2.0:
        # Lower-left corner: always one row below the upper-left corner.
        corner_row_offset = 1
    else:
        corner_row_offset = 0

    # TODO: verify this for correctness. It seems to be right, but it
    # deserves a unit test to be sure.
    return (tile_x - 1, tile_y - tile_x % 2 + corner_row_offset)
def radial_search(center_x, center_y):
    """
    An iterator that yields hexagon coordinate tuples (x, y) in order of
    increasing hex-grid distance from the specified center position, starting
    with the center itself. The iterator never terminates.
    """
    # Breadth-first search outwards over the grid. A deque gives O(1)
    # pops at the left; the set records every cell ever enqueued so nothing
    # is enqueued twice.
    queue = collections.deque()
    seen = set()

    # First place to visit is the center. Seeding `seen` with it fixes a bug
    # in the original, which never marked the center as seen, so a neighbor
    # re-enqueued it and it was yielded a second time.
    queue.append((center_x, center_y))
    seen.add((center_x, center_y))

    while len(queue) > 0:
        # In theory we never run out: each cell enqueues new neighbors.
        x, y = queue.popleft()
        yield (x, y)

        # Every hexagon has the four orthogonal neighbors; the two diagonal
        # neighbors depend on parity.
        # NOTE(review): the original comments describe even/odd *columns*,
        # but the code tests y (the row) — confirm which is intended before
        # changing this.
        neighbor_offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        if y % 2 == 0:
            neighbor_offsets += [(-1, 1), (1, 1)]
        else:
            neighbor_offsets += [(-1, -1), (1, -1)]

        for x_offset, y_offset in neighbor_offsets:
            neighbor = (x + x_offset, y + y_offset)
            if neighbor not in seen:
                # Never been in the queue: enqueue and mark it.
                queue.append(neighbor)
                seen.add(neighbor)
+ """ + + # A hexagon has neighbors at the following relative coordinates: + # (-1, 0), (1, 0), (0, -1), (0, 1) + # and ((-1, 1) and (1, 1) if in an even column) + # or ((-1, -1) and (1, -1) if in an odd column) + + # We're going to go outwards using breadth-first search, so we need a queue + # of hexes to visit and a set of already visited hexes. + + # This holds a queue (really a deque) of hexes waiting to be visited. + # A list has O(n) pop/insert at left. + queue = collections.deque() + # This holds a set of the (x, y) coordinate tuples of already-seen hexes, + # so we don't enqueue them again. + seen = set() + + # First place to visit is the center. + queue.append((center_x, center_y)) + + while len(queue) > 0: + # We should in theory never run out of items in the queue. + # Get the current x and y to visit. + x, y = queue.popleft() + + # Yield the location we're visiting + yield (x, y) + + # This holds a list of all relative neighbor positions as (x, y) tuples. + neighbor_offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)] + if y % 2 == 0: + # An even-column hex also has these neighbors + neighbor_offsets += [(-1, 1), (1, 1)] + else: + # An odd-column hex also has these neighbors + neighbor_offsets += [(-1, -1), (1, -1)] + + for x_offset, y_offset in neighbor_offsets: + # First calculate the absolute position of the neighbor in x + neighbor_x = x + x_offset + # And in y + neighbor_y = y + y_offset + + if (neighbor_x, neighbor_y) not in seen: + # This is a hex that has never been in the queue. Add it. + queue.append((neighbor_x, neighbor_y)) + + # Record that it has ever been enqueued + seen.add((neighbor_x, neighbor_y)) + + + + +def assign_hexagon(hexagons, node_x, node_y, node, scale=1.0): + """ + This function assigns the given node to a hexagon in hexagons. hexagons is a + defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes, + or None if a hexagon is free. 
def assign_hexagon_local_radial(hexagons, node_x, node_y, node, scale=1.0):
    """
    Assign the given node to a hexagon in hexagons. hexagons is a defaultdict
    from (x, y) integer hexagon indices to the node assigned there, or None
    if the hexagon is free. node_x and node_y are the node's coordinates,
    already adapted so the seed node lands in the (0, 0) hexagon and
    re-scaled to reduce hexagon conflicts. node is the node to be assigned.
    scale, if specified, is the hexagon side length in node space units.

    The node goes into its closest hexagon. If that hexagon is full, we
    re-probe outward along the direction from that hexagon's center to the
    node. On return, node is stored in hexagons under some (x, y) tuple.

    Returns the hex-grid distance between the hexagon used and the ideal one.
    """
    # The hexagon this point falls in, which may already be taken, and its
    # center in node space.
    ideal_x, ideal_y = hexagon_pick(node_x, node_y, scale=scale)
    center_x, center_y = hexagon_center(ideal_x, ideal_y, scale=scale)

    # Distance from the node to that center, and the unit direction from the
    # center towards the node.
    # NOTE(review): a node sitting exactly on the center divides by zero
    # here — confirm whether that can happen upstream.
    node_distance = math.sqrt((node_x - center_x) ** 2 +
        (node_y - center_y) ** 2)
    direction_x = (node_x - center_x) / node_distance
    direction_y = (node_y - center_y) / node_distance

    # Probe along that direction in scale-sized steps, starting at the ideal
    # hexagon's center. Loop invariant: (test_x, test_y) lies in hex (x, y).
    # TODO: is scale the right step size? scale/2 was also considered.
    x, y = ideal_x, ideal_y
    test_x, test_y = center_x, center_y
    while hexagons[(x, y)] is not None:
        test_x += direction_x * scale
        test_y += direction_y * scale
        x, y = hexagon_pick(test_x, test_y, scale=scale)

    # We've reached the edge of the occupied cluster; claim this hexagon.
    hexagons[(x, y)] = node

    return math.sqrt((x - ideal_x) ** 2 + (y - ideal_y) ** 2)
+ """ + + # These hold the hexagon that the point falls in, which may be taken. + best_x, best_y = hexagon_pick(node_x, node_y, scale=scale) + + # These hold the center of that hexagon in float space + center_x, center_y = hexagon_center(best_x, best_y, scale=scale) + + # This holds the distance from this point to the center of that hexagon + node_distance = math.sqrt((node_x - center_x) ** 2 + (node_y - center_y) ** + 2) + + # These hold the normalized direction of this point, relative to the center + # of its best hexagon + direction_x = (node_x - center_x) / node_distance + direction_y = (node_y - center_y) / node_distance + + # Do a search in that direction, starting at the best hex. + + # These are the hexagon indices we're considering + x, y = best_x, best_y + + # These are the Cartesian coordinates we're probing. Must be in the x, y hex + # as a loop invariant. + test_x, test_y = center_x, center_y + + while hexagons[(x, y)] is not None: + # Re-probe outwards from the best hex in scale/2-sized steps + # TODO: is that the right step size? Scale-sized steps seemed slightly + # large. + test_x += direction_x * scale + test_y += direction_y * scale + + # Re-pick x and y for the hex containing our test point + x, y = hexagon_pick(test_x, test_y, scale=scale) + + # We've finally reached the edge of the cluster. + # Drop our hexagon + hexagons[(x, y)] = node + + return math.sqrt((x - best_x) ** 2 + (y - best_y) ** 2) + +def assign_hexagon_radial(hexagons, node_x, node_y, node, scale=1.0): + """ + This function assigns the given node to a hexagon in hexagons. hexagons is a + defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes, + or None if a hexagon is free. node_x and node_y are the x and y coordinates + of the node, adapted so that the seed node lands in the 0, 0 hexagon, and + re-scaled to reduce hexagon conflicts. node is the node to be assigned. + scale, if specified, is the hexagon side length in node space units. 
def hexagons_in_window(hexagons, x, y, width, height):
    """
    Given a dict from (x, y) position to signature names, return the list of
    all signatures in the window starting at hexagon x, y and extending width
    in the x direction and height in the y direction on the hexagon grid.
    """
    # This holds the list of signatures we've found.
    found = []
    for i in range(x, x + width):
        for j in range(y, y + height):
            # `in` replaces the long-deprecated dict.has_key.
            if (i, j) in hexagons:
                # This position in the window has a hex.
                found.append(hexagons[(i, j)])
    return found

class ClusterFinder(object):
    """
    A class that can be invoked to find the p value of the best cluster in
    its layer. Instances are pickleable.
    """

    def __init__(self, hexagons, layer, window_size=5):
        """
        Keep the given hexagons dict (from (x, y) to signature name), the
        given layer (a dict from signature name to a value), and the given
        window size, in a ClusterFinder object.
        """
        # TODO: This should probably all operate on numpy arrays that we can
        # slice efficiently.

        # Store the hexagon assignments (the original's comments for these
        # two assignments were swapped).
        self.hexagons = hexagons
        # Store the layer
        self.layer = layer
        # Store the window size
        self.window_size = window_size

    @staticmethod
    def continuous_p(in_values, out_values):
        """
        Get the p value for in_values and out_values being distinct
        continuous distributions, using the Mann-Whitney U test.

        in_values and out_values are both Numpy arrays. Returns the p value,
        or raises a ValueError if the statistical test cannot be run for
        some reason.
        """
        u_statistic, p_value = scipy.stats.mannwhitneyu(in_values,
            out_values)
        return p_value

    @staticmethod
    def dichotomous_p(in_values, out_values):
        """
        Given two one-dimensional Numpy arrays of 0s and 1s, compute a p
        value for the in_values having a different probability of being 1
        than the frequency of 1s in the out_values.

        Uses an exact binomial test, which does not rely on the normal
        approximation, so it stays valid for arbitrarily small frequencies
        of either 0s or 1s in in_values.

        Raises ValueError if out_values is empty.

        TODO: What if out_values is shorter than in_values?
        """
        if len(out_values) == 0:
            raise ValueError("Background group is empty!")

        # Observed frequency of 1s in out_values. Force float division: with
        # integer input arrays the original's integer division truncated
        # this to 0.
        frequency = numpy.sum(out_values) / float(len(out_values))

        # Number of 1s in in_values, and the number of "trials" they
        # occurred in.
        successes = numpy.sum(in_values)
        trials = len(in_values)

        # How significantly does the frequency inside differ from outside?
        # scipy.stats.binom_test was removed in SciPy 1.12; fall back to its
        # replacement, binomtest, when it is gone.
        if hasattr(scipy.stats, "binom_test"):
            return scipy.stats.binom_test(successes, trials, frequency)
        return scipy.stats.binomtest(int(successes), int(trials),
            frequency).pvalue

    @staticmethod
    def categorical_p(in_values, out_values):
        """
        Given two one-dimensional Numpy arrays of integers (which may be
        stored as floats), representing items assigned to different
        categories, return a p value for the distribution of categories
        observed in in_values differing from that observed in out_values.

        Uses a chi-squared goodness of fit test. That test has invalid
        assumptions when any category has fewer than 5 expected and 5
        observed observations (see the scipy.stats.chisquare docs), but the
        tests that don't break down are prohibitively slow, so we use it
        anyway.
        """
        # Convert our inputs to integer arrays.
        in_values = in_values.astype(int)
        out_values = out_values.astype(int)

        # How many categories are there (count 0 to the maximum value).
        num_categories = max(numpy.max(in_values),
            numpy.max(out_values)) + 1

        # Count the number of in_values and out_values in each category.
        in_counts = numpy.array([len(in_values[in_values == i]) for i in
            range(num_categories)])
        out_counts = numpy.array([len(out_values[out_values == i]) for i in
            range(num_categories)])

        # Get the p value for the window being from the estimated
        # distribution. None of the distribution parameters count as
        # "estimated from data" because they aren't estimated from the data
        # under test.
        _, p_value = scipy.stats.chisquare(in_counts, out_counts)
        return p_value

    def __call__(self):
        """
        Find the best p value for any window of size window_size. Return it.
        """
        # Calculate the bounding box where we want to look for windows.
        # TODO: This would just be all of a numpy array.
        min_x = min(coords[0] for coords in self.hexagons.keys())
        min_y = min(coords[1] for coords in self.hexagons.keys())
        max_x = max(coords[0] for coords in self.hexagons.keys())
        max_y = max(coords[1] for coords in self.hexagons.keys())

        # Layer values laid out by (x, y). Fill with NaN so missing hexes
        # can be masked out later. (numpy.NAN was removed in NumPy 2.0;
        # numpy.nan is the portable spelling.)
        layer_data = numpy.empty((max_x - min_x + 1, max_y - min_y + 1))
        layer_data[:] = numpy.nan

        for (hex_x, hex_y), name in self.hexagons.items():
            # Copy the layer values into the Numpy array.
            if name in self.layer:
                layer_data[hex_x - min_x, hex_y - min_y] = self.layer[name]

        # Masked version of the layer data (NaNs masked out).
        layer_data_masked = numpy.ma.masked_invalid(layer_data, copy=False)

        # Smallest p value found so far for this layer.
        best_p = float("+inf")

        # The statistical test to use: a function from two Numpy arrays to a
        # p value. Start with the most specific (dichotomous: 0 or 1) and
        # upgrade below when the data doesn't fit its domain.
        statistical_test = self.dichotomous_p

        if numpy.sum(~layer_data_masked.mask) == 0:
            # There is actually no data in this layer at all. nditer
            # complains if we try to iterate over an empty thing, so quit
            # early and say we couldn't find anything.
            return best_p

        for value in numpy.nditer(
                layer_data_masked[~layer_data_masked.mask]):
            # Check all the values in the layer; if one is outside the
            # current test's domain, upgrade to a more general test.
            if statistical_test == self.dichotomous_p and (value > 1 or
                value < 0):
                # Outside 0..1, but no non-integers seen yet: categorical.
                statistical_test = self.categorical_p

            if value % 1 != 0:
                # A non-integer forces the continuous test. It's the least
                # specific, so we can stop scanning now.
                statistical_test = self.continuous_p
                break

        for i in range(min_x, max_x - self.window_size):
            for j in range(min_y, max_y - self.window_size):

                # Layer values for hexes in the window, as a masked array.
                in_region = layer_data_masked[i:i + self.window_size,
                    j:j + self.window_size]

                # And as a 1d Numpy array.
                in_values = numpy.reshape(in_region[~in_region.mask],
                    -1).data

                # Everything outside the window, by masking the window out
                # of a copy of the whole layer.
                out_region = numpy.ma.copy(layer_data_masked)
                out_region.mask[i:i + self.window_size,
                    j:j + self.window_size] = True

                # And as a 1d Numpy array.
                out_values = numpy.reshape(out_region[~out_region.mask],
                    -1).data

                if len(in_values) == 0 or len(out_values) == 0:
                    # Can't do any stats on this window.
                    continue

                if len(in_values) < 0.5 * self.window_size ** 2:
                    # The window is less than half full. Skip it.
                    # TODO: Make this threshold configurable.
                    continue

                try:
                    # p value for this window under the selected test.
                    p_value = statistical_test(in_values, out_values)

                    # If this is the best p value so far, record it.
                    best_p = min(best_p, p_value)
                except ValueError:
                    # Probably an all-zero layer, or something else the test
                    # can't handle. Try the remaining windows to be safe;
                    # maybe one will work.
                    pass

        # We have now found the best p for any window for this layer.
        print("Best p found: {}".format(best_p))
        sys.stdout.flush()

        return best_p
+ print "Best p found: {}".format(best_p) + sys.stdout.flush() + + return best_p + +def run_functor(functor): + """ + Given a no-argument functor (like a ClusterFinder), run it and return its + result. We can use this with multiprocessing.map and map it over a list of + job functors to do them. + + Handles getting more than multiprocessing's pitiful exception output + """ + + try: + return functor() + except: + # Put all exception text into an exception and raise that + raise Exception(traceback.format_exc()) + +def open_matrices(names): + """ + The argument parser now take multiple similarity matrices as input and + saves their file name as strings. We want to store the names of these + strings for display later in hexagram.js in order to allow the user to + navigate and know what type of visualization map they are looking at - + gene expression, copy number, etc. + + Since, the parser no longer opens the files automatically we must, do it + in this function. + """ + + # For each file name, open the file and add it to the matrices list + # 'r' is the argument stating that the file will be read-only + for similarity_filename in names: + print "Opening Matrices..." + matrix_file = tsv.TsvReader(open(similarity_filename, "r")) + matrices.append(matrix_file) + +def compute_beta (coords, matrix, axis, index, options): + """ + Compute and return a beta matrix from coords * matrix. + Then print the matrix to a file to be read on clientside. + """ + beta = coords * matrix + return beta + # Must add writing function + +def drl_similarity_functions(matrix, index, options): + """ + Performs all the functions needed to format a similarity matrix into a + tsv format whereby the DrL can take the values. Then all of the DrL + functions are performed on the similarity matrix. + + Options is passed to access options.singletons and other required apsects + of the parsed args. + """ + + # Work in a temporary directory + # If not available, create the directory. 
def drl_similarity_functions(matrix, index, options):
    """
    Format a similarity matrix into the tsv form that DrL accepts, then run
    the whole DrL pipeline (truncate, layout, recoord) on it.

    matrix is a TsvReader over the similarity matrix. index distinguishes
    this layout's files from those of the other matrices. options supplies
    options.singletons, options.truncation_edges and options.directory.

    Returns a dict from signature name to an (x, y) float coordinate tuple
    for every node that made it through DrL (and therefore needs its score
    data sent to the client).
    """
    # DrL does all of its work on files in a temporary scratch directory.
    drl_directory = tempfile.mkdtemp()

    # Base name for every file DrL uses for this layout; index keeps the
    # respective layouts' files apart.
    drl_basename = os.path.join(drl_directory, "layout" + str(index))

    # Run the similarity matrix through our tsv parser (strips comments and
    # ensures validity) into the .sim file DrL truncate reads.
    sim_writer = tsv.TsvWriter(open(drl_basename + ".sim", "w"))

    print("Regularizing similarity matrix...")
    sys.stdout.flush()

    # Every unique signature name in the matrix, for adding self-edges.
    signatures = set()

    print("Reach for parts in sim_reader")
    for parts in matrix:
        # Keep the signature names used.
        signatures.add(parts[0])
        signatures.add(parts[1])
        # Save the line to the regularized file.
        sim_writer.list_line(parts)

    if options.singletons:
        # Self-edge every node so nodes with no other strictly positive
        # edges don't get dropped.
        for signature in signatures:
            sim_writer.line(signature, signature, 1)

    matrix.close()
    sim_writer.close()

    # Our input for DrL is prepared. Run the pipeline.
    # TODO: pass a truncation level.
    print("DrL: Truncating...")
    sys.stdout.flush()
    subprocess.check_call(["truncate", "-t", str(options.truncation_edges),
        drl_basename])

    print("DrL: Doing layout...")
    sys.stdout.flush()
    subprocess.check_call(["layout", drl_basename])

    # Put the string signature names back on the coordinates.
    print("DrL: Restoring names...")
    sys.stdout.flush()
    subprocess.check_call(["recoord", drl_basename])

    # DrL has saved <signature name>\t<x>\t<y> rows in <basename>.coord;
    # read them into a dict from signature name to (x, y).
    nodes = {}

    print("Reading DrL output...")
    sys.stdout.flush()
    coord_reader = tsv.TsvReader(open(drl_basename + ".coord", "r"))
    for parts in coord_reader:
        nodes[parts[0]] = (float(parts[1]), float(parts[2]))
    coord_reader.close()

    # Save the DrL coordinates in our bundle as drl<index>.tab, with names
    # instead of numbers like DrL recoord would have written — this is what
    # the client-side Javascript wants, and it helps debugging.
    coord_writer = tsv.TsvWriter(open(
        os.path.join(options.directory, "drl" + str(index) + ".tab"), "w"))
    for signature_name, (x, y) in nodes.items():
        coord_writer.line(signature_name, x, y)
    coord_writer.close()

    # Clean up the scratch directory.
    shutil.rmtree(drl_directory)

    return nodes

def compute_hexagram_assignments(nodes, index, options):
    """
    Assign every node from one matrix's DrL output to a hexagon.

    nodes is the dict from signature name to (x, y) produced by
    drl_similarity_functions; index relates this matrix's output files to
    each other; options supplies options.directory.

    Writes the assignments to assignments<index>.tab as id, x, y rows (read
    by the JavaScript on the static page to produce the visualization), and
    returns a dict from node name to normalized placement badness for use
    elsewhere.
    """
    # Squiggly-rows hexagon layout: everything is integer x, y.
    # Defaultdict from (x, y) integer tuple to the id that goes there, or
    # None if it's free.
    global hexagons
    hexagons = collections.defaultdict(lambda: None)

    # Hexagon side length used for assignment.
    side_length = 1.0

    # What will become a layer of how badly placed each hexagon is:
    # a dict from node name to distance from its ideal location.
    placement_badnesses = {}

    for node, (node_x, node_y) in nodes.items():
        # Assign each node to a hexagon, recording the resulting badness.
        badness = assign_hexagon(hexagons, node_x, node_y, node,
            scale=side_length)
        placement_badnesses[node] = float(badness)

    # Normalize the placement badness layer by its maximum, when possible.
    max_placement_badness = max(placement_badnesses.values())
    print("Max placement badness: {}".format(max_placement_badness))
    if max_placement_badness != 0:
        placement_badnesses = {node: value / max_placement_badness for node,
            value in placement_badnesses.items()}

    # A plain dict pickles; a defaultdict with a lambda does not.
    # TODO: restructure so this conversion isn't needed.
    hexagons = dict(hexagons)

    # Dump the assignments, shifted so every coordinate is positive.
    hexagon_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "assignments" + str(index) + ".tab"), "w"))

    min_x = min(coords[0] for coords in hexagons.keys())
    min_y = min(coords[1] for coords in hexagons.keys())

    for coords, name in hexagons.items():
        hexagon_writer.line(name, coords[0] - min_x, coords[1] - min_y)
    hexagon_writer.close()

    return placement_badnesses
def write_matrix_names(options):
    """
    Write the names of the similarity matrices to matrixnames.tab so that
    hexagram.js can process the names and build the toggle layout GUI.

    options supplies the parsed args: options.names (the matrix names) and
    options.directory (where to write the file).
    """
    name_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "matrixnames.tab"), "w"))
    for matrix_name in options.names:
        name_writer.line(matrix_name)
    name_writer.close()
    # Run the DrL graph layout on each opened similarity matrix; each call
    # returns the laid-out nodes for that matrix.
    for index, i in enumerate (matrices):
        nodes_multiple.append (drl_similarity_functions(i, index, options))

    # Compute Hexagram Assignments for each similarity matrix's drl output,
    # which is found in nodes_multiple.

    # placement_badnesses_multiple list is required to store the placement
    # badness dicts that are returned by the compute_hexagram_assignments
    # function.
    # NOTE(review): only placement_badnesses_multiple[0] (the first matrix) is
    # used below when building the "Placement Badness" layer.
    placement_badnesses_multiple = []
    for index, i in enumerate (nodes_multiple):
        placement_badnesses_multiple.append (compute_hexagram_assignments (i, index, options))

    # Now that we have hex assignments, compute layers.

    # In addition to making per-layer files, we're going to copy all the score
    # matrices to our output directory. That way, the client can download layers
    # in big chunks when it wants all layer data for statistics. We need to
    # write a list of matrices that the client can read, which is written by
    # this TSV writer.
    matrix_index_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "matrices.tab"), "w"))

    # Read in all the layer data at once
    # TODO: Don't read in all the layer data at once

    # This holds a dict from layer name to a dict from signature name to
    # score.
    layers = {}

    # This holds the names of all layers
    layer_names = []

    for matrix_number, score_filename in enumerate(options.scores):
        # First, copy the whole matrix into our output. This holds its filename.
        output_filename = "matrix_{}.tab".format(matrix_number)
        shutil.copy2(score_filename, os.path.join(options.directory,
            output_filename))

        # Record where we put it
        matrix_index_writer.line(output_filename)

        # This holds a reader for the scores TSV
        scores_reader = tsv.TsvReader(open(score_filename, "r"))

        # This holds an iterator over lines in that file.
        # TODO: Write a proper header/data API
        scores_iterator = scores_reader.__iter__()

        try:
            # This holds the names of the columns (except the first, which is
            # labels).
            # They also happen to be layer names.
            # scores_iterator.next() raises StopIteration on an empty file,
            # which the except clause below treats as "no data, skip".
            file_layer_names = scores_iterator.next()[1:]

            # Add all the layers in this file to the complete list of layers.
            layer_names += file_layer_names

            # Ensure that we have a dict for every layer mentioned in the file
            # (even the ones that have no data below). Doing it this way means
            # all score matrices need disjoint columns, or the last one takes
            # precedence.
            for name in file_layer_names:
                layers[name] = {}

            for parts in scores_iterator:
                # This is the signature that this line is about
                signature_name = parts[0]

                # NOTE(review): membership is only checked against the FIRST
                # matrix's node set (nodes_multiple[0]).
                if signature_name not in nodes_multiple[0]:
                    # This signature wasn't in our DrL output. Don't bother
                    # putting its layer data in our visualization. This saves
                    # space and makes the client-side layer counts accurate for
                    # the data actually displayable.
                    continue

                # These are the scores for all the layers for this signature
                layer_scores = parts[1:]

                for (layer_name, score) in itertools.izip(file_layer_names,
                    layer_scores):

                    # Store all the layer scores in the appropriate
                    # dictionaries.
                    try:
                        layers[layer_name][signature_name] = float(score)
                    except ValueError:
                        # This is not a float.
                        # Don't set that entry for this layer.
                        # TODO: possibly ought to complain to the user? But then
                        # things like "N/A" won't be handled properly.
                        continue

        except StopIteration:
            # We don't have any real data here. Couldn't read the header line.
            # Skip to the next file
            pass

        # We're done with this score file now
        scores_reader.close()

    # We're done with all the input score matrices, so our index is done too.
    matrix_index_writer.close()

    # We have now loaded all layer data into memory as Python objects. What
    # could possibly go wrong?

    # Stick our placement badness layer on the end (computed earlier from the
    # first similarity matrix's hexagon assignment).
    layer_names.append("Placement Badness")
    layers["Placement Badness"] = placement_badnesses_multiple[0]

    # Now we need to write layer files.

    # Generate some filenames for layers that we can look up by layer name.
    # We do this because layer names may not be valid filenames.
    layer_files = {name: os.path.join(options.directory,
        "layer_{}.tab".format(number)) for (name, number) in itertools.izip(
        layer_names, itertools.count())}

    for layer_name, layer in layers.iteritems():
        # Write out all the individual layer files.
        # This holds the writer for this layer file.
        scores_writer = tsv.TsvWriter(open(layer_files[layer_name], "w"))
        for signature_name, score in layer.iteritems():
            # Write the score for this signature in this layer
            scores_writer.line(signature_name, score)
        scores_writer.close()

    # We need something to sort layers by. We have "priority" (lower is
    # better)

    if len(layer_names) > 0 and options.stats:
        # We want to do this fancy parallel stats thing.
        # We skip it when there are no layers, so we don't try to join a
        # never-used pool, which seems to hang.

        print "Running statistics..."

        # This holds a list of ClusterFinders, one per layer, built from the
        # (now picklable) hexagons dict. ClusterFinder is defined elsewhere
        # in this file; instances must pickle to cross the pool boundary.
        cluster_finders = [ClusterFinder(hexagons, layers[layer_name],
            window_size=options.window_size) for layer_name in layer_names]

        print "{} jobs to do.".format(len(cluster_finders))

        # This holds a multiprocessing pool for parallelization
        pool = multiprocessing.Pool()

        # This holds all the best p values in the same order.
        # pool.map preserves input order, so results line up with layer_names.
        best_p_values = pool.map(run_functor, cluster_finders)

        # Close down the pool so multiprocessing won't die sillily at the end
        pool.close()
        pool.join()

        # This holds a dict from layer name to priority (best p value)
        layer_priorities = {layer_name: best_p_value for layer_name,
            best_p_value in itertools.izip(layer_names, best_p_values)}
    else:
        # We aren't doing any stats.

        print "Skipping statistics."

        # Make up priorities.
        # Every layer gets the worst possible priority (lower is better).
        layer_priorities = {name: float("+inf") for name in layer_names}

    # Count how many layer entries are greater than 0 for each binary layer, and
    # store that number in this dict by layer name. Things with the default
    # empty string instead of a number aren't binary layers, but they can use
    # the empty string as their TSV field value, so we can safely pull any layer
    # out of this by name.
    layer_positives = collections.defaultdict(str)

    for layer_name in layer_names:
        # Assume it's a binary layer until proven otherwise
        layer_positives[layer_name] = 0
        for value in layers[layer_name].itervalues():
            if value == 1:
                # Count up all the 1s in the layer
                layer_positives[layer_name] += 1
            elif value != 0:
                # It has something that isn't 1 or 0, so it can't be a binary
                # layer. Throw it out and try the next layer.
                layer_positives[layer_name] = ""
                break

    # Write an index of all the layers we have, in the form:
    # <layer>\t<file>\t<priority>\t<number of signatures with data>\t<number of
    # signatures that are 1 for binary layers, or empty>
    # This is the writer to use.
    index_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "layers.tab"), "w"))

    for layer_name, layer_file in layer_files.iteritems():
        # Write the index entry for this layer (basename only, since the
        # client resolves layer files relative to the output directory).
        index_writer.line(layer_name, os.path.basename(layer_file),
            layer_priorities[layer_name], len(layers[layer_name]),
            layer_positives[layer_name])

    index_writer.close()

    # Sahil will implement linear regression code here

    # We must create a m * n matrix of samples * genes.
    # In order to create this matrix we first must know the number of hexes
    # and maintain them in a certain order. The order is important so that
    # we populate the matrix with the data values in the proper row (sample).

    # Copy over the user-specified colormaps file, or make an empty TSV if it's
    # not specified.

    # This holds a writer for the colormaps file. Creating it creates the file.
    # Created unconditionally so colormaps.tab always exists for the client,
    # even when the user supplied no colormap data.
    colormaps_writer = tsv.TsvWriter(open(os.path.join(options.directory,
        "colormaps.tab"), "w"))

    if options.colormaps is not None:
        # The user specified colormap data, so copy it over.
        # This holds a reader for the colormaps file.
        # NOTE(review): options.colormaps appears to already be an open file
        # object (it is passed straight to TsvReader, unlike the score
        # filenames which are open()ed) — confirm against parse_args.
        colormaps_reader = tsv.TsvReader(options.colormaps)

        print "Regularizing colormaps file..."
        sys.stdout.flush()

        for parts in colormaps_reader:
            # Re-emit each row so the output is uniformly TSV-formatted.
            colormaps_writer.list_line(parts)

        colormaps_reader.close()

    # Close the colormaps file we wrote. It may have gotten data, or it may
    # still be empty.
    colormaps_writer.close()

    # Now copy any static files from where they live next to this Python file
    # into the web page bundle.
    # This holds the directory where this script lives, which also contains
    # static files.
    tool_root = os.path.dirname(os.path.realpath(__file__))

    # Copy over all the static files we need for the web page.
    # This holds a list of them.
    static_files = [
        # Static images
        "drag.svg",
        "filter.svg",
        "statistics.svg",
        "right.svg",
        "set.svg",
        "save.svg",
        "inflate.svg",
        "throbber.svg",

        # jQuery itself is pulled from a CDN.
        # We can't take everything offline since Google Maps needs to be sourced
        # from Google, so we might as well use CDN jQuery.

        # Select2 scripts and resources:
        "select2.css",
        "select2.js",
        "select2.png",
        "select2-spinner.gif",
        "select2x2.png",

        # The jQuery.tsv plugin
        "jquery.tsv.js",
        # The color library
        "color-0.4.1.js",
        # The jStat statistics library
        "jstat-1.0.0.js",
        # The Google Maps MapLabel library
        "maplabel-compiled.js",
        # The main CSS file
        "hexagram.css",
        # The main JavaScript file that runs the page
        "hexagram.js",
        # Web Worker for statistics
        "statistics.js",
        # File with all the tool code
        "tools.js"
        ]

    # We'd just use a directory of static files, but Galaxy needs single-level
    # output.
    # Copy each static asset flat into the output directory.
    for filename in static_files:
        shutil.copy2(os.path.join(tool_root, filename), options.directory)

    # Copy the HTML file to our output file. It automatically knows to read
    # assignments.tab, and does its own TSV parsing.
    shutil.copy2(os.path.join(tool_root, "hexagram.html"), options.html)

    print "Visualization generation complete!"

    # 0 = success; used as the process exit code by the __main__ guard below.
    return 0

if __name__ == "__main__" :
    try:
        # Get the return code to return.
        # Don't just exit with it because sys.exit works by exceptions.
        return_code = main(sys.argv)
    except:
        # Bare except is deliberate here: this is the top-level boundary, and
        # the traceback is printed before converting any failure into a
        # nonzero exit status.
        traceback.print_exc()
        # Return a definite number and not some unspecified error code.
        return_code = 1

    sys.exit(return_code)