#!/usr/bin/env python2.7
"""
hexagram.py: Given a matrix of similarities, produce a hexagram visualization.

This script takes in the filename of a tab-separated value file containing a
sparse similarity matrix (with string labels) and several matrices of
layer/score data. It produces an HTML file (and several support files) that
provide an interactive visualization of the items clustered on a hexagonal grid.

This script depends on the DrL graph layout package, binaries for which must be
present in your PATH.

Re-uses sample code and documentation from 
<http://users.soe.ucsc.edu/~karplus/bme205/f12/Scaffold.html>
"""

import argparse, sys, os, itertools, math, numpy, subprocess, shutil, tempfile
import collections, multiprocessing, traceback
import scipy.stats, scipy.linalg
import os.path
import tsv

# Global list holding the opened similarity matrix readers
matrices = []


def parse_args(args):
    """
    Takes in the command-line arguments list (args), and returns a nice argparse
    result with fields for all the options.
    Borrows heavily from the argparse documentation examples:
    <http://docs.python.org/library/argparse.html>
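    
    A doctest-style sanity check (the filename is hypothetical; nothing is
    opened at parse time, since the positional arguments are plain strings):
    
    >>> parse_args(["hexagram.py", "sim.tab"]).window_size
    20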
    """
    
    # The command line arguments start with the program name, which we don't
    # want to treat as an argument for argparse. So we remove it.
    args = args[1:]
    
    # Construct the parser (which is stored in parser)
    # Module docstring lives in __doc__
    # See http://python-forum.com/pythonforum/viewtopic.php?f=3&t=36847
    # And a formatter class so our examples in the docstring look good. Isn't it
    # convenient how we already wrapped it to 80 characters?
    # See http://docs.python.org/library/argparse.html#formatter-class
    parser = argparse.ArgumentParser(description=__doc__, 
        formatter_class=argparse.RawDescriptionHelpFormatter)
    
    # Now add all the options to it
    # Options match the ctdHeatmap tool options as much as possible.
    parser.add_argument("similarity", type=str, nargs='+',
        help="the unopened files of similarity matrices")
    parser.add_argument("--names", type=str, action="append", default=[],
        help="the unopened files of similarity matrices")
    parser.add_argument("--scores", type=str,
        action="append", default=[],
        help="a TSV to read scores for each signature from")
    parser.add_argument("--colormaps", type=argparse.FileType("r"), 
        default=None,
        help="a TSV defining coloring and value names for discrete scores")
    parser.add_argument("--html", "-H", type=str, 
        default="index.html",
        help="where to write HTML report")
    parser.add_argument("--directory", "-d", type=str, default=".",
        help="directory in which to create other output files")
    parser.add_argument("--query", type=str, default=None,
        help="Galaxy-escaped name of the query signature")
    parser.add_argument("--window_size", type=int, default=20,
        help="size of the window to use when looking for clusters")
    parser.add_argument("--truncation_edges", type=int, default=10,
        help="number of edges for DrL truncate to pass per node")
    parser.add_argument("--no-stats", dest="stats", action="store_false", 
        default=True,
        help="disable cluster-finding statistics")
    parser.add_argument("--include-singletons", dest="singletons", 
        action="store_true", default=False,
        help="add self-edges to retain unconnected points")
        
    return parser.parse_args(args)

def hexagon_center(x, y, scale=1.0):
    """
    Given a coordinate on a grid of hexagons (using wiggly rows in x), what is 
    the 2d Euclidean coordinate of its center?
    
    x and y are integer column and row coordinates of the hexagon in the grid.
    
    scale is a float specifying hexagon side length.
    
    The origin in coordinate space is defined as the upper left corner of the 
    bounding box of the hexagon with indices x=0 and y=0.
    
    Returns a tuple of floats.
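    
    Two worked examples, derived from the geometry notes below (doctest-style
    sanity checks, not a spec):
    
    >>> hexagon_center(0, 0)
    (1.0, 0.8660254037844386)
    >>> hexagon_center(1, 0)
    (2.5, 0.0)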
    """
    # The grid looks like this:
    #
    #   /-\ /-\ /-\ /-\ 
    # /-\-/-\-/-\-/-\-/-\
    # \-/-\-/-\-/-\-/-\-/
    # /-\-/-\-/-\-/-\-/-\
    # \-/-\-/-\-/-\-/-\-/
    # /-\-/-\-/-\-/-\-/-\
    # \-/ \-/ \-/ \-/ \-/
    #   
    # Say a hexagon side has length 1
    # It's 2 across corner to corner (x), and sqrt(3) across side to side (y)
    # X coordinates are 1.5 per column
    # Y coordinates (down from top) are sqrt(3) per row, -1/2 sqrt(3) if you're 
    # in an odd column.
    
    center_y = math.sqrt(3) * y
    if x % 2 == 1:
        # Odd column: shift up
        center_y -= 0.5 * math.sqrt(3)
        
    return (1.5 * x * scale + scale, center_y * scale + math.sqrt(3.0) / 2.0 * 
        scale)

def hexagon_pick(x, y, scale=1.0):
    """
    Given floats x and y specifying coordinates in the plane, determine which 
    hexagon grid cell that point is in.
    
    scale is a float specifying hexagon side length.
    
    See http://blog.ruslans.com/2011/02/hexagonal-grid-math.html
    But we flip the direction of the wiggle: odd columns are shifted up (-y).
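    
    Picking the center of a hexagon should round-trip back to that hexagon's
    indices (the same sanity check main() performs at startup):
    
    >>> hexagon_pick(*hexagon_center(3, 5))
    (3, 5)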
    """
    
    # How high is a hex?
    hex_height = math.sqrt(3) * scale
    
    # First we pick a rectangular tile, from the point of one side-triangle to 
    # the base of the other in width, and the whole hexagon height in height.
    
    # How wide are these tiles? Corner to line-between-far-corners distance
    tile_width = (3.0 / 2.0 * scale)
    
    # Tile X index is floor(x / tile_width)
    tile_x = int(math.floor(x / tile_width))
    
    # We need this intermediate value for the Y index and for tile-internal
    # picking
    corrected_y = y + (tile_x % 2) * hex_height / 2.0
    
    # Tile Y index is floor((y + (x index mod 2) * hex height/2) / hex height)
    tile_y = int(math.floor(corrected_y / hex_height))
    
    # Find coordinates within the tile
    internal_x = x - tile_x * tile_width
    internal_y = corrected_y - tile_y * hex_height
    
    # Do tile-scale picking
    # Are we in the one corner, the other corner, or the bulk of the tile?
    if internal_x > scale * abs(0.5 - internal_y / hex_height):
        # We're in the bulk of the tile
        # This is the column (x) of the picked hexagon
        hexagon_x = tile_x
        
        # This is the row (y) of the picked hexagon
        hexagon_y = tile_y
    else:
        # We're in a corner.
        # In an even column, the lower left is part of the next row, and the 
        # upper left is part of the same row. In an odd column, the lower left 
        # is part of the same row, and the upper left is part of the previous 
        # row.
        if internal_y > hex_height / 2.0:
            # It's the lower left corner
            # This is the offset in row (y) that being in this corner gives us
            # The lower left corner is always 1 row below the upper left corner.
            corner_y_offset = 1
        else:
            corner_y_offset = 0
            
        # TODO: verify this for correctness. It seems to be right, but I want a
        # unit test to be sure.
        # This is the row (y) of the picked hexagon
        hexagon_y = tile_y - tile_x % 2 + corner_y_offset
        
        # This is the column (x) of the picked hexagon
        hexagon_x = tile_x - 1
    
    # Now we've picked the hexagon
    return (hexagon_x, hexagon_y)    

def radial_search(center_x, center_y):
    """
    An iterator that yields coordinate tuples (x, y) in order of increasing 
    hex-grid distance from the specified center position.
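    
    For example (the exact order within each distance ring is an
    implementation detail of the queue, so treat this as illustrative):
    
    >>> list(itertools.islice(radial_search(0, 0), 5))
    [(0, 0), (-1, 0), (1, 0), (0, -1), (0, 1)]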
    """
    
    # A hexagon has neighbors at the following relative coordinates:
    # (-1, 0), (1, 0), (0, -1), (0, 1)
    # and ((-1, 1) and (1, 1) if in an even column)
    # or ((-1, -1) and (1, -1) if in an odd column)
    
    # We're going to go outwards using breadth-first search, so we need a queue 
    # of hexes to visit and a set of already visited hexes.
    
    # This holds a queue (really a deque) of hexes waiting to be visited.
    # A list has O(n) pop/insert at left.
    queue = collections.deque()
    # This holds a set of the (x, y) coordinate tuples of already-seen hexes,
    # so we don't enqueue them again.
    seen = set()
    
    # First place to visit is the center. Mark it as seen immediately, so that
    # its neighbors can't re-enqueue it (and make it yield twice) later.
    queue.append((center_x, center_y))
    seen.add((center_x, center_y))
    
    while len(queue) > 0:
        # We should in theory never run out of items in the queue.
        # Get the current x and y to visit.
        x, y = queue.popleft()
        
        # Yield the location we're visiting
        yield (x, y)
        
        # This holds a list of all relative neighbor positions as (x, y) tuples.
        neighbor_offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        if x % 2 == 0:
            # An even-column hex also has these neighbors. Note that the
            # parity test is on x: columns are what wiggle, matching
            # hexagon_center above.
            neighbor_offsets += [(-1, 1), (1, 1)]
        else:
            # An odd-column hex also has these neighbors
            neighbor_offsets += [(-1, -1), (1, -1)]
    
        for x_offset, y_offset in neighbor_offsets:
            # First calculate the absolute position of the neighbor in x
            neighbor_x = x + x_offset
            # And in y
            neighbor_y = y + y_offset
            
            if (neighbor_x, neighbor_y) not in seen:
                # This is a hex that has never been in the queue. Add it.
                queue.append((neighbor_x, neighbor_y))
                
                # Record that it has ever been enqueued
                seen.add((neighbor_x, neighbor_y))
    
    
    

def assign_hexagon(hexagons, node_x, node_y, node, scale=1.0):
    """
    This function assigns the given node to a hexagon in hexagons. hexagons is a
    defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
    or None if a hexagon is free. node_x and node_y are the x and y coordinates 
    of the node, adapted so that the seed node lands in the 0, 0 hexagon, and 
    re-scaled to reduce hexagon conflicts. node is the node to be assigned. 
    scale, if specified, is the hexagon side length in node space units.
    
    This function assigns nodes to their closest hexagon, reprobing outwards if 
    already occupied.
    
    When the function completes, node is stored in hexagons under some (x, y) 
    tuple.
    
    Returns the distance this hexagon is from its ideal location.
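    
    A minimal example: two nodes contending for the same spot, where the
    second gets pushed to an adjacent hexagon at grid distance 1:
    
    >>> hexes = collections.defaultdict(lambda: None)
    >>> assign_hexagon(hexes, 0.0, 0.0, "first")
    0.0
    >>> assign_hexagon(hexes, 0.0, 0.0, "second")
    1.0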
    """
    
    # These hold the hexagon that the point falls in, which may be taken.
    best_x, best_y = hexagon_pick(node_x, node_y, scale=scale)
    
    for x, y in radial_search(best_x, best_y):
        # These hexes are enumerated in order of increasing distance from the 
        # best one, starting with the best hex itself.
        
        if hexagons[(x, y)] is None:
            # This is the closest free hex. Break out of the loop, leaving x and
            # y pointing here.
            break
    
    # Assign the node to the hexagon
    hexagons[(x, y)] = node
    
    return math.sqrt((x - best_x) ** 2 + (y - best_y) ** 2)
    
    

def assign_hexagon_local_radial(hexagons, node_x, node_y, node, scale=1.0):
    """
    This function assigns the given node to a hexagon in hexagons. hexagons is a
    defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
    or None if a hexagon is free. node_x and node_y are the x and y coordinates 
    of the node, adapted so that the seed node lands in the 0, 0 hexagon, and 
    re-scaled to reduce hexagon conflicts. node is the node to be assigned. 
    scale, if specified, is the hexagon side length in node space units.
    
    This function assigns nodes to their closest hexagon. If that hexagon is 
    full, it re-probes in the direction that the node is from the closest 
    hexagon's center.
    
    When the function completes, node is stored in hexagons under some (x, y) 
    tuple.
    
    Returns the distance this hexagon is from its ideal location.
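    
    A minimal example on an empty grid (the node lands in its best hexagon,
    so the reported displacement is 0; note that the direction computation
    below assumes the node is not exactly at its hexagon's center):
    
    >>> hexes = collections.defaultdict(lambda: None)
    >>> assign_hexagon_local_radial(hexes, 1.0, 1.0, "node")
    0.0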
    """
    
    # These hold the hexagon that the point falls in, which may be taken.
    best_x, best_y = hexagon_pick(node_x, node_y, scale=scale)
    
    # These hold the center of that hexagon in float space
    center_x, center_y = hexagon_center(best_x, best_y, scale=scale)
    
    # This holds the distance from this point to the center of that hexagon
    node_distance = math.sqrt((node_x - center_x) ** 2 + (node_y - center_y) **
        2)
    
    # These hold the normalized direction of this point, relative to the center 
    # of its best hexagon
    direction_x = (node_x - center_x) / node_distance
    direction_y = (node_y - center_y) / node_distance
    
    # Do a search in that direction, starting at the best hex.

    # These are the hexagon indices we're considering
    x, y = best_x, best_y
    
    # These are the Cartesian coordinates we're probing. Must be in the x, y hex
    # as a loop invariant.
    test_x, test_y = center_x, center_y
    
    while hexagons[(x, y)] is not None:
        # Re-probe outwards from the best hex in scale-sized steps
        # TODO: is that the right step size? scale/2 would probe more densely,
        # at the cost of more picks.
        test_x += direction_x * scale
        test_y += direction_y * scale
        
        # Re-pick x and y for the hex containing our test point
        x, y = hexagon_pick(test_x, test_y, scale=scale)
        
    # We've finally reached the edge of the cluster.
    # Drop our hexagon
    hexagons[(x, y)] = node
    
    return math.sqrt((x - best_x) ** 2 + (y - best_y) ** 2)

def assign_hexagon_radial(hexagons, node_x, node_y, node, scale=1.0):
    """
    This function assigns the given node to a hexagon in hexagons. hexagons is a
    defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
    or None if a hexagon is free. node_x and node_y are the x and y coordinates 
    of the node, adapted so that the seed node lands in the 0, 0 hexagon, and 
    re-scaled to reduce hexagon conflicts. node is the node to be assigned. 
    scale, if specified, is the hexagon side length in node space units.
    
    This function assigns nodes to hexagons based on radial distance from 0, 0.
    This makes hexagon assignment much more dense, but can lose spatial 
    structure.
    
    When the function completes, node is stored in hexagons under some (x, y) 
    tuple.
    
    Returns the distance this hexagon is from its ideal location. Unfortunately,
    this doesn't really make sense for this assignment scheme, so it is always
    0.
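    
    A minimal example on an empty grid (the node takes the first free hexagon
    probed outward from the origin; the node must not sit exactly at the
    origin, or the direction computation divides by zero):
    
    >>> hexes = collections.defaultdict(lambda: None)
    >>> assign_hexagon_radial(hexes, 1.0, 1.0, "node")
    0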
    """
    
    # Compute node's distance from the origin
    node_distance = math.sqrt(node_x ** 2 + node_y ** 2)
    
    # Compute normalized direction from the origin for this node
    direction_x = node_x / node_distance
    direction_y = node_y / node_distance
    
    # These are the coordinates we are testing
    test_x = 0
    test_y = 0
    
    # These are the hexagon indices that correspond to that point
    x, y = hexagon_pick(test_x, test_y, scale=scale)
    
    while hexagons[(x, y)] is not None:
        # Re-probe outwards from the origin in scale-sized steps
        # TODO: is that the right step size?
        test_x += direction_x * scale
        test_y += direction_y * scale
        
        # Re-pick
        x, y = hexagon_pick(test_x, test_y, scale=scale)
        
    # We've finally reached the edge of the cluster.
    # Drop our hexagon
    # TODO: this has to be N^2 if we line them all up in a line
    hexagons[(x, y)] = node
    
    return 0

def hexagons_in_window(hexagons, x, y, width, height):
    """
    Given a dict from (x, y) position to signature names, return the list of all
    signatures in the window starting at hexagon x, y and extending width in the
    x direction and height in the y direction on the hexagon grid.
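    
    For example, with a tiny hand-built assignment dict:
    
    >>> hexagons_in_window({(0, 0): "a", (2, 3): "b"}, 0, 0, 2, 2)
    ['a']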
    """        
    
    # This holds the list of hexagons we've found
    found = []
    
    for i in xrange(x, x + width):
        for j in xrange(y, y + height):
            if (i, j) in hexagons:
                # This position in the window has a hex.
                found.append(hexagons[(i, j)])
                
    return found

class ClusterFinder(object):
    """
    A class that can be invoked to find the p value of the best cluster in its 
    layer. Instances are pickleable.
    """
    
    def __init__(self, hexagons, layer, window_size=5):
        """
        Keep the given hexagons dict (from (x, y) to signature name) and the 
        given layer (a dict from signature name to a value), and the given 
        window size, in a ClusterFinder object.
        """
        
        # TODO: This should probably all operate on numpy arrays that we can 
        # slice efficiently.
        
        # Store the layer
        self.hexagons = hexagons
        # Store the hexagon assignments
        self.layer = layer
        
        # Store the window size
        self.window_size = window_size
    
    @staticmethod
    def continuous_p(in_values, out_values):
        """
        Get the p value for in_values and out_values being distinct continuous 
        distributions.
        
        in_values and out_values are both Numpy arrays. Returns the p value, or 
        raises a ValueError if the statistical test cannot be run for some
        reason.
        
        Uses the Mann-Whitney U test.
        """
    
        # Do a Mann-Whitney U test to see how different the data sets are.
        # Note that older versions of scipy report a one-sided p value here.
        u_statistic, p_value = scipy.stats.mannwhitneyu(in_values, 
            out_values)
            
        return p_value
    
    @staticmethod    
    def dichotomous_p(in_values, out_values):
        """
        Given two one-dimensional Numpy arrays of 0s and 1s, compute a p value 
        for the in_values having a different probability of being 1 than the 
        frequency of 1s in the out_values.
        
        This test uses the scipy.stats.binom_test function, which does not claim
        to use the normal approximation. Therefore, this test should be valid
        for arbitrarily small frequencies of either 0s or 1s in in_values.
        
        TODO: What if out_values is shorter than in_values?
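        
        For example, with made-up data (three 1s against a background that is
        half 1s; rounded to keep the doctest stable across scipy versions):
        
        >>> round(ClusterFinder.dichotomous_p(numpy.array([1., 1., 1.]),
        ...     numpy.array([0.] * 50 + [1.] * 50)), 6)
        0.25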
        """
        
        if len(out_values) == 0:
            raise ValueError("Background group is empty!")
        
        # This holds the observed frequency of 1s in out_values.
        # Use float division so integer input arrays don't floor to 0 under
        # Python 2.
        frequency = numpy.sum(out_values) / float(len(out_values))
        
        # This holds the number of 1s in in_values
        successes = numpy.sum(in_values)
        
        # This holds the number of "trials" we got that many successes in
        trials = len(in_values)
        
        # Return how significantly the frequency inside differs from that 
        # outside.
        return scipy.stats.binom_test(successes, trials, frequency)
    
    @staticmethod    
    def categorical_p(in_values, out_values):
        """
        Given two one-dimensional Numpy arrays of integers (which may be stored
        as floats), which represent items being assigned to different 
        categories, return a p value for the distribution of categories observed
        in in_values differing from that observed in out_values.
        
        The normal way to do this is with a chi-squared goodness of fit test. 
        However, that test has invalid assumptions when there are fewer than 5 
        expected and 5 observed observations in every category. 
        See
        <http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html>
        
        However, we will use it anyway, because the tests that don't break down
        are prohibitively slow.
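        
        For example, with made-up data whose category frequencies are
        identical inside and out (which should give p = 1):
        
        >>> round(ClusterFinder.categorical_p(numpy.array([0., 0., 1., 1.]),
        ...     numpy.array([0.] * 5 + [1.] * 5)), 6)
        1.0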
        """
        
        # Convert our inputs to integer arrays
        in_values = in_values.astype(int)
        out_values = out_values.astype(int)
        
        # How many categories are there (count 0 to the maximum value)
        num_categories = max(numpy.max(in_values), numpy.max(out_values)) + 1
        
        # Count the number of in_values and out_values in each category
        in_counts = numpy.array([len(in_values[in_values == i]) for i in 
            xrange(num_categories)])
        out_counts = numpy.array([len(out_values[out_values == i]) for i in 
            xrange(num_categories)])
        
        # Scale the out counts into expected counts that sum to the same total
        # as the in counts, since the chi-squared test requires observed and
        # expected counts to have equal totals.
        expected = out_counts * (float(numpy.sum(in_counts)) / 
            numpy.sum(out_counts))
        
        # Get the p value for the window being from the estimated distribution
        # None of the distribution parameters count as "estimated from data" 
        # because they aren't estimated from the data under test.
        _, p_value = scipy.stats.chisquare(in_counts, expected)
        
        return p_value
        
    def __call__(self):
        """
        Find the best p value for any window of size window_size. Return it.
        """

        # Calculate the bounding box where we want to look for windows.
        # TODO: This would just be all of a numpy array
        min_x = min(coords[0] for coords in self.hexagons.iterkeys())
        min_y = min(coords[1] for coords in self.hexagons.iterkeys()) 
        max_x = max(coords[0] for coords in self.hexagons.iterkeys())
        max_y = max(coords[1] for coords in self.hexagons.iterkeys()) 
        
        # This holds a Numpy array of all the data by x, y
        layer_data = numpy.empty((max_x - min_x + 1, max_y - min_y + 1))
        
        # Fill it with NaN so we can mask those out later
        layer_data[:] = numpy.NAN
        
        for (hex_x, hex_y), name in self.hexagons.iteritems():
            # Copy the layer values into the Numpy array
            if name in self.layer:
                layer_data[hex_x - min_x, hex_y - min_y] = self.layer[name]
        
        # This holds a masked version of the layer data
        layer_data_masked = numpy.ma.masked_invalid(layer_data, copy=False) 
        
        # This holds the smallest p value we have found for this layer
        best_p = float("+inf")
        
        # This holds the statistical test to use (a function from two Numpy 
        # arrays to a p value)
        # The most specific test is the dichotomous test (0 or 1)
        statistical_test = self.dichotomous_p
        
        if numpy.sum(~layer_data_masked.mask) == 0: 
            # There is actually no data in this layer at all.
            # nditer complains if we try to iterate over an empty thing.
            # So quit early and say we couldn't find anything.
            return best_p
 
        for value in numpy.nditer(layer_data_masked[~layer_data_masked.mask]):
            # Check all the values in the layer.
            # If this value is out of the domain of the current statistical 
            # test, upgrade to a more general test.
            
            if statistical_test == self.dichotomous_p and (value > 1 or 
                value < 0):
                
                # We can't use a dichotomous test on things outside 0 to 1
                # But we haven't yet detected any non-integers
                # Use categorical
                statistical_test = self.categorical_p
            
            if value % 1 != 0:
                # This is not an integer value
                # So, we must use a continuous statistical test
                statistical_test = self.continuous_p
                
                # This is the least specific test, so we can stop now
                break
        
                
        # Iterate over window positions by array index; layer_data is already
        # shifted so that index (0, 0) corresponds to hex (min_x, min_y), and
        # hex coordinates may be negative, which would wrap a direct slice.
        for i in xrange(layer_data.shape[0] - self.window_size + 1):
            for j in xrange(layer_data.shape[1] - self.window_size + 1):

                # Get the layer values for hexes in the window, as a Numpy
                # masked array.
                in_region = layer_data_masked[i:i + self.window_size, 
                    j:j + self.window_size]
                    
                # And as a 1d Numpy array
                in_values = numpy.reshape(in_region[~in_region.mask], -1).data
                
                # And out of the window (all the other hexes) as a masked array
                out_region = numpy.ma.copy(layer_data_masked)
                # We get this by masking out everything in the region
                out_region.mask[i:i + self.window_size, 
                    j:j + self.window_size] = True
                
                # And as a 1d Numpy array
                out_values = numpy.reshape(out_region[~out_region.mask], 
                    -1).data
                 
                    
                if len(in_values) == 0 or len(out_values) == 0:
                    # Can't do any stats on this window
                    continue
                    
                if len(in_values) < 0.5 * self.window_size ** 2:
                    # The window is less than half full. Skip it.
                    # TODO: Make this threshold configurable.
                    continue
                
                try:    
                    
                    # Get the p value for this window under the selected 
                    # statistical test
                    p_value = statistical_test(in_values, out_values)
                        
                    # If this is the best p value so far, record it
                    best_p = min(best_p, p_value)
                except ValueError:
                    # Probably an all-zero layer, or something else the test 
                    # can't handle.
                    # But let's try all the other windows to be safe. 
                    # Maybe one will work.
                    pass
                    
                
                
        # We have now found the best p for any window for this layer.
        print "Best p found: {}".format(best_p)
        sys.stdout.flush()
        
        return best_p                

def run_functor(functor):
    """
    Given a no-argument functor (like a ClusterFinder), run it and return its 
    result. We can use this with multiprocessing.map and map it over a list of 
    job functors to do them.
    
    Handles getting more than multiprocessing's pitiful exception output
    """
    
    try:
        return functor()
    except:
        # Put all exception text into an exception and raise that
        raise Exception(traceback.format_exc())

def open_matrices(names):
    """
    The argument parser now takes multiple similarity matrices as input and
    saves their file names as strings. We want to store these names for
    display later in hexagram.js, in order to allow the user to navigate and
    know what type of visualization map they are looking at - gene
    expression, copy number, etc.

    Since the parser no longer opens the files automatically, we must do it
    in this function.
    """

    print "Opening matrices..."

    # For each file name, open the file and add it to the matrices list.
    # "r" opens the file read-only.
    for similarity_filename in names:
        matrix_file = tsv.TsvReader(open(similarity_filename, "r"))
        matrices.append(matrix_file)

def compute_beta(coords, matrix, axis, index, options):
    """ 
    Compute and return a beta matrix from coords * matrix.
    Then print the matrix to a file to be read on the client side.
    """
    # TODO: add the function that writes the matrix to a file
    beta = coords * matrix
    return beta

def drl_similarity_functions(matrix, index, options):
    """
    Performs all the functions needed to format a similarity matrix into a
    tsv format whereby DrL can take the values. Then all of the DrL
    functions are performed on the similarity matrix.

    Options is passed to access options.singletons and other required aspects
    of the parsed args.
    """

    # Work in a temporary directory
    # If not available, create the directory.
    drl_directory = tempfile.mkdtemp()

    # This is the base name for all the files that DrL uses to do the layout
    # We're going to put it in a temporary directory.
    # index added to extension in order to keep track of
    # respective layouts
    drl_basename = os.path.join(drl_directory, "layout" + str(index))

    # We can just pass our similarity matrix to DrL's truncate
    # But we want to run it through our tsv parser to strip comments and ensure
    # it's valid

    # This holds a reader for the similarity matrix
    sim_reader = matrix

    # This holds a writer for the sim file
    sim_writer = tsv.TsvWriter(open(drl_basename + ".sim", "w"))

    print "Regularizing similarity matrix..."
    sys.stdout.flush()

    # This holds a set of all unique signature names in the similarity matrix.
    # We can use it to add edges to keep singletons.
    signatures = set()

    for parts in sim_reader:
        # Keep the signature names used
        signatures.add(parts[0])
        signatures.add(parts[1])

        # Save the line to the regularized file
        sim_writer.list_line(parts)

    if options.singletons:
        # Now add a self-edge on every node, so we don't drop nodes with no
        # other strictly positive edges
        for signature in signatures:
            sim_writer.line(signature, signature, 1)

    sim_reader.close()
    sim_writer.close()

    # Now our input for DrL is prepared!

    # Do DrL truncate.
    # TODO: pass a truncation level
    print "DrL: Truncating..."
    sys.stdout.flush()
    subprocess.check_call(["truncate", "-t", str(options.truncation_edges), 
        drl_basename]) 

    # Run the DrL layout engine.
    print "DrL: Doing layout..."
    sys.stdout.flush()
    subprocess.check_call(["layout", drl_basename]) 

    # Put the string names back
    print "DrL: Restoring names..."
    sys.stdout.flush()
    subprocess.check_call(["recoord", drl_basename]) 

    # Now DrL has saved its coordinates as <signature name>\t<x>\t<y> rows in 
    # <basename>.coord

    # We want to read that.
    # This holds a reader for the DrL output
    coord_reader = tsv.TsvReader(open(drl_basename + ".coord", "r"))

    # This holds a dict from signature name string to (x, y) float tuple. It is
    # also our official collection of node names that made it through DrL, and
    # therefore need their score data sent to the client.
    nodes = {}

    print "Reading DrL output..."
    sys.stdout.flush()
    for parts in coord_reader:
        nodes[parts[0]] = (float(parts[1]), float(parts[2]))

    coord_reader.close()

    # Save the DrL coordinates in our bundle, to be displayed client-side for 
    # debugging.

    # index added to drl.tab extension in order to keep track of
    # respective drl.tabs
    coord_writer = tsv.TsvWriter(open(
        os.path.join(options.directory, "drl" + str(index) + ".tab"), "w"))

    for signature_name, (x, y) in nodes.iteritems():
        # Write a tsv with names instead of numbers, like what DrL recoord would
        # have written. This is what the Javascript on the client side wants.
        coord_writer.line(signature_name, x, y)

    coord_writer.close()

    # Delete our temporary directory.
    shutil.rmtree(drl_directory)

    # Return the nodes dict back to the main method for further processing
    return nodes

def compute_hexagram_assignments(nodes, index, options):
    """
    Now that we are taking multiple similarity matrices as inputs, we must
    compute hexagram assignments for each similarity matrix. These assignments 
    are based upon the nodes output provided by the DrL function. 

    Index relates each matrix name with its drl output, nodes, assignments, etc.
    Options contains the parsed arguments that are present in the main method.
    """
    # Do the hexagon layout
    # We do the squiggly rows setup, so express everything as integer x, y
    
    # This is a defaultdict from (x, y) integer tuple to id that goes there, or
    # None if it's free. It is a module-level global so that main() can hand
    # the most recently computed assignments to the ClusterFinders.
    global hexagons
    hexagons = collections.defaultdict(lambda: None)
    
    # This holds the side length that we use
    side_length = 1.0
    
    # This holds what will be a layer of how badly placed each hexagon is
    # A dict from node name to layer value
    placement_badnesses = {}
    
    for node, (node_x, node_y) in nodes.iteritems():
        # Assign each node to a hexagon
        # This holds the resulting placement badness for that hexagon (i.e. 
        # distance from ideal location)
        badness = assign_hexagon(hexagons, node_x, node_y, node,
            scale=side_length)
            
        # Put the badness in the layer
        placement_badnesses[node] = float(badness)
   
    # Normalize the placement badness layer
    # This holds the max placement badness
    max_placement_badness = max(placement_badnesses.itervalues())
    print "Max placement badness: {}".format(max_placement_badness)

    if max_placement_badness != 0:
        # Normalize by the max if possible.
        placement_badnesses = {node: value / max_placement_badness for node, 
            value in placement_badnesses.iteritems()}
   
    # The hexagons have been assigned. Make hexagons be a dict instead of a 
    # defaultdict, so it pickles.
    # TODO: I should change it so I don't need to do this.
    hexagons = dict(hexagons) 

    # Now dump the hexagon assignments as an id, x, y tsv. This will be read by
    # the JavaScript on the static page and be used to produce the 
    # visualization.        
    hexagon_writer = tsv.TsvWriter(open(os.path.join(options.directory, 
        "assignments"+ str(index) + ".tab"), "w"))

    # First find the x and y offsets needed to make all hexagon positions 
    # positive
    min_x = min(coords[0] for coords in hexagons.iterkeys())
    min_y = min(coords[1] for coords in hexagons.iterkeys())  
        
    for coords, name in hexagons.iteritems():
        # Write this hexagon assignment, converted to all-positive coordinates.
        hexagon_writer.line(name, coords[0] - min_x, coords[1] - min_y)    
    hexagon_writer.close()

    # Hand the placement badness dict back to the main method so that it can
    # be used elsewhere.
    return placement_badnesses
                
def write_matrix_names(options):
    """
    Write the names of the similarity matrices so that hexagram.js can
    process the names and create the toggle layout GUI.
    We pass options to access the parsed args and thus the matrix names.
    """
    name_writer = tsv.TsvWriter(open(os.path.join(options.directory, 
        "matrixnames.tab"), "w"))
    for i in options.names:
        name_writer.line(i)

    name_writer.close()

def main(args):
    """
    Parses command line arguments, and makes visualization.
    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.
    """
    
    options = parse_args(args) # This holds the nicely-parsed options object

    print "Created Options"
    
    # Test our picking
    x, y = hexagon_center(0, 0)
    if hexagon_pick(x, y) != (0, 0):
        raise Exception("Picking is broken!")
    
    # First bit of stdout becomes annotation in Galaxy 
    # Make sure our output directory exists.
    if not os.path.exists(options.directory):
        # makedirs is the right thing to use here: recursive
        os.makedirs(options.directory)

    print "Writing matrix names..."
    # We must write the file names for hexagram.js to access.
    write_matrix_names(options)

    print "About to open matrices..."

    # We have file names stored in options.similarity
    # We must open the files and store them in matrices list for access
    open_matrices(options.similarity)

    print "Opened matrices..."

    # The nodes list stores the list of nodes for each matrix
    # We must keep track of each set of nodes
    nodes_multiple = []

    print "Created nodes_multiple list..."

    # Index for drl.tab and drl.layout file naming. With indexes we can match
    # file names to matrices and to DrL output files.
    for index, matrix in enumerate(matrices):
        nodes_multiple.append(drl_similarity_functions(matrix, index, options))
    
    # Compute hexagram assignments for each similarity matrix's DrL output,
    # which is found in nodes_multiple.

    # placement_badnesses_multiple list is required to store the placement
    # badness dicts that are returned by the compute_hexagram_assignments
    # function.
    placement_badnesses_multiple = []
    for index, nodes in enumerate(nodes_multiple):
        placement_badnesses_multiple.append(
            compute_hexagram_assignments(nodes, index, options))

    # Now that we have hex assignments, compute layers.
    
    # In addition to making per-layer files, we're going to copy all the score
    # matrices to our output directory. That way, the client can download layers
    # in big chunks when it wants all layer data for statistics. We need to
    # write a list of matrices that the client can read, which is written by
    # this TSV writer.
    matrix_index_writer = tsv.TsvWriter(open(os.path.join(options.directory, 
        "matrices.tab"), "w"))
        
    # Read in all the layer data at once
    # TODO: Don't read in all the layer data at once
    
    # This holds a dict from layer name to a dict from signature name to 
    # score.
    layers = {}
    
    # This holds the names of all layers
    layer_names = []
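    
    # Each scores file is expected to look like this hypothetical example: a
    # header row naming the layers (the first cell is ignored), then one row
    # per signature with its score in each layer. The loop below parses this
    # shape.
    #
    #   (ignored)      Gene Expression    Mutated
    #   signature_1    0.533              1
    #   signature_2    -0.271             0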
    
    for matrix_number, score_filename in enumerate(options.scores):
        # First, copy the whole matrix into our output. This holds its filename.
        output_filename = "matrix_{}.tab".format(matrix_number)
        shutil.copy2(score_filename, os.path.join(options.directory, 
            output_filename))
            
        # Record where we put it
        matrix_index_writer.line(output_filename)
    
        # This holds a reader for the scores TSV
        scores_reader = tsv.TsvReader(open(score_filename, "r"))
        
        # This holds an iterator over lines in that file
        # TODO: Write a proper header/data API
        scores_iterator = scores_reader.__iter__()
        
        try:
            # This holds the names of the columns (except the first, which is 
            # labels). They also happen to be layer names
            file_layer_names = scores_iterator.next()[1:]
            
            # Add all the layers in this file to the complete list of layers.
            layer_names += file_layer_names
            
            # Ensure that we have a dict for every layer mentioned in the file
            # (even the ones that have no data below). Doing it this way means
            # all score matrices need disjoint columns, or the last one takes
            # precedence.
            for name in file_layer_names:
                layers[name] = {} 
            
            for parts in scores_iterator:
                # This is the signature that this line is about
                signature_name = parts[0]
                
                if signature_name not in nodes_multiple[0]:
                    # This signature wasn't in our DrL output. Don't bother
                    # putting its layer data in our visualization. This saves
                    # space and makes the client-side layer counts accurate for
                    # the data actually displayable.
                    continue
                
                # These are the scores for all the layers for this signature
                layer_scores = parts[1:]
                
                for (layer_name, score) in itertools.izip(file_layer_names, 
                    layer_scores):
                    
                    # Store all the layer scores in the appropriate
                    # dictionaries.
                    try:
                        layers[layer_name][signature_name] = float(score)
                    except ValueError:
                        # This is not a float.
                        # Don't set that entry for this layer.
                        # TODO: possibly ought to complain to the user? But then
                        # things like "N/A" won't be handled properly.
                        continue
                    
        except StopIteration:
            # We don't have any real data here. Couldn't read the header line.
            # Skip to the next file
            pass
            
        # We're done with this score file now
        scores_reader.close()
    
    # We're done with all the input score matrices, so our index is done too.
    matrix_index_writer.close()
    
    # We have now loaded all layer data into memory as Python objects. What
    # could possibly go wrong?
    
    # Stick our placement badness layer on the end
    # TODO: this keeps only the first similarity matrix's placement badness.
    layer_names.append("Placement Badness")
    layers["Placement Badness"] = placement_badnesses_multiple[0]
       
    # Now we need to write layer files.
        
    # Generate some filenames for layers that we can look up by layer name.
    # We do this because layer names may not be valid filenames.
    layer_files = {name: os.path.join(options.directory, 
        "layer_{}.tab".format(number)) for (name, number) in itertools.izip(
        layer_names, itertools.count())}
        
    for layer_name, layer in layers.iteritems():
        # Write out all the individual layer files
        # This holds the writer for this layer file
        scores_writer = tsv.TsvWriter(open(layer_files[layer_name], "w"))
        for signature_name, score in layer.iteritems():
            # Write the score for this signature in this layer
            scores_writer.line(signature_name, score)
        scores_writer.close()
    
    # We need something to sort layers by. We have "priority" (lower is 
    # better)
    
    if len(layer_names) > 0 and options.stats:
        # We want to do this fancy parallel stats thing.
        # We skip it when there are no layers, so we don't try to join a
        # never-used pool, which seems to hang.
        
        print "Running statistics..."
        
        # This holds a list of ClusterFinders, one per layer
        cluster_finders = [ClusterFinder(hexagons, layers[layer_name], 
            window_size=options.window_size) for layer_name in layer_names]
        
        print "{} jobs to do.".format(len(cluster_finders))
       
        # This holds a multiprocessing pool for parallelization
        pool = multiprocessing.Pool()
       
        # This holds all the best p values in the same order
        best_p_values = pool.map(run_functor, cluster_finders)
        
        # Close down the pool so multiprocessing won't die sillily at the end
        pool.close()
        pool.join()
        
        # This holds a dict from layer name to priority (best p value).
        # pool.map returns results in input order, so they line up with
        # layer_names.
        layer_priorities = {layer_name: best_p_value for layer_name, 
            best_p_value in itertools.izip(layer_names, best_p_values)}
    else:
        # We aren't doing any stats.
        
        print "Skipping statistics."
        
        # Make up priorities.
        layer_priorities = {name: float("+inf") for name in layer_names}
    
    # Count how many layer entries are greater than 0 for each binary layer, and
    # store that number in this dict by layer name. Things with the default
    # empty string instead of a number aren't binary layers, but they can use
    # the empty string as their TSV field value, so we can safely pull any layer
    # out of this by name.
    layer_positives = collections.defaultdict(str)
    
    for layer_name in layer_names:
        # Assume it's a binary layer until proven otherwise
        layer_positives[layer_name] = 0
        for value in layers[layer_name].itervalues():
            if value == 1:
                # Count up all the 1s in the layer
                layer_positives[layer_name] += 1
            elif value != 0:
                # It has something that isn't 1 or 0, so it can't be a binary
                # layer. Throw it out and try the next layer.
                layer_positives[layer_name] = ""
                break
    
    # Write an index of all the layers we have, in the form:
    # <layer>\t<file>\t<priority>\t<number of signatures with data>\t<number of 
    # signatures that are 1 for binary layers, or empty>
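    # For example, one row might look like (hypothetical values):
    #   Mutated  layer_3.tab  0.0004  1230  75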
    # This is the writer to use.
    index_writer = tsv.TsvWriter(open(os.path.join(options.directory, 
        "layers.tab"), "w"))
        
    for layer_name, layer_file in layer_files.iteritems():
        # Write the index entry for this layer
        index_writer.line(layer_name, os.path.basename(layer_file), 
            layer_priorities[layer_name], len(layers[layer_name]), 
            layer_positives[layer_name])
        
    index_writer.close()
    
    # Sahil will implement linear regression code here
   
    # We must create an m * n matrix of samples * genes.
    # In order to create this matrix we first must know the number of hexes
    # and maintain them in a certain order. The order is important so that
    # we populate the matrix with the data values in the proper row (sample).

    # Copy over the user-specified colormaps file, or make an empty TSV if it's
    # not specified.
    
    # This holds a writer for the colormaps file. Creating it creates the file.
    colormaps_writer = tsv.TsvWriter(open(os.path.join(options.directory, 
        "colormaps.tab"), "w"))
    
    if options.colormaps is not None:
        # The user specified colormap data, so copy it over
        # This holds a reader for the colormaps file
        colormaps_reader = tsv.TsvReader(options.colormaps)
        
        print "Regularizing colormaps file..."
        sys.stdout.flush()
        
        for parts in colormaps_reader:
            colormaps_writer.list_line(parts)
        
        colormaps_reader.close()
    
    # Close the colormaps file we wrote. It may have gotten data, or it may 
    # still be empty.
    colormaps_writer.close()
    
    # Now copy any static files from where they live next to this Python file 
    # into the web page bundle.
    # This holds the directory where this script lives, which also contains 
    # static files.
    tool_root = os.path.dirname(os.path.realpath(__file__))
    
    # Copy over all the static files we need for the web page
    # This holds a list of them
    static_files = [
        # Static images
        "drag.svg",
        "filter.svg",
        "statistics.svg",
        "right.svg",
		"set.svg",
		"save.svg",
        "inflate.svg",
        "throbber.svg",
        
        # jQuery itself is pulled from a CDN.
        # We can't take everything offline since Google Maps needs to be sourced
        # from Google, so we might as well use CDN jQuery.
        
        # Select2 scripts and resources:
        "select2.css",
        "select2.js",
        "select2.png",
        "select2-spinner.gif",
        "select2x2.png",
        
        # The jQuery.tsv plugin
        "jquery.tsv.js",
        # The color library
        "color-0.4.1.js",
        # The jStat statistics library
        "jstat-1.0.0.js",
        # The Google Maps MapLabel library
        "maplabel-compiled.js",
        # The main CSS file
        "hexagram.css",
        # The main JavaScript file that runs the page
        "hexagram.js",
        # Web Worker for statistics
        "statistics.js",
        # File with all the tool code
        "tools.js"
    ]
    
    # We'd just use a directory of static files, but Galaxy needs single-level
    # output.
    for filename in static_files:
        shutil.copy2(os.path.join(tool_root, filename), options.directory)
    
    # Copy the HTML file to our output file. It automatically knows to read
    # assignments.tab, and does its own TSV parsing
    shutil.copy2(os.path.join(tool_root, "hexagram.html"), options.html)
     
    print "Visualization generation complete!"
       
    return 0

if __name__ == "__main__" :
    try:
        # Get the return code to return
        # Don't just exit with it because sys.exit works by exceptions.
        return_code = main(sys.argv)
    except:
        traceback.print_exc()
        # Return a definite number and not some unspecified error code.
        return_code = 1
        
    sys.exit(return_code)