comparison hexagram-6ae12361157c/hexagram/hexagram.py @ 0:1407e3634bcf draft default tip

Uploaded r11 from test tool shed.
author adam-novak
date Tue, 22 Oct 2013 14:17:59 -0400
1 #!/usr/bin/env python2.7
2 """
3 hexagram.py: Given a matrix of similarities, produce a hexagram visualization.
4
5 This script takes in the filename of a tab-separated value file containing a
6 sparse similarity matrix (with string labels) and several matrices of
7 layer/score data. It produces an HTML file (and several support files) that
8 provide an interactive visualization of the items clustered on a hexagonal grid.
9
10 This script depends on the DrL graph layout package, binaries for which must be
11 present in your PATH.
12
13 Re-uses sample code and documentation from
14 <http://users.soe.ucsc.edu/~karplus/bme205/f12/Scaffold.html>
15 """
16
17 import argparse, sys, os, itertools, math, numpy, subprocess, shutil, tempfile
18 import collections, multiprocessing, traceback
19 import scipy.stats, scipy.linalg
20 import os.path
21 import tsv
22
23 # Global variable to hold opened matrices files
24 matrices = []
25
26
27 def parse_args(args):
28 """
29 Takes in the command-line arguments list (args), and returns a nice argparse
30 result with fields for all the options.
31 Borrows heavily from the argparse documentation examples:
32 <http://docs.python.org/library/argparse.html>
33 """
34
35 # The command line arguments start with the program name, which we don't
36 # want to treat as an argument for argparse. So we remove it.
37 args = args[1:]
38
39 # Construct the parser (which is stored in parser)
40 # Module docstring lives in __doc__
41 # See http://python-forum.com/pythonforum/viewtopic.php?f=3&t=36847
42 # And a formatter class so our examples in the docstring look good. Isn't it
43 # convenient how we already wrapped it to 80 characters?
44 # See http://docs.python.org/library/argparse.html#formatter-class
45 parser = argparse.ArgumentParser(description=__doc__,
46 formatter_class=argparse.RawDescriptionHelpFormatter)
47
48 # Now add all the options to it
49 # Options match the ctdHeatmap tool options as much as possible.
50 parser.add_argument("similarity", type=str, nargs='+',
51 help="the unopened files of similarity matrices")
52 parser.add_argument("--names", type=str, action="append", default=[],
53 help="the unopened files of similarity matrices")
54 parser.add_argument("--scores", type=str,
55 action="append", default=[],
56 help="a TSV to read scores for each signature from")
57 parser.add_argument("--colormaps", type=argparse.FileType("r"),
58 default=None,
59 help="a TSV defining coloring and value names for discrete scores")
60 parser.add_argument("--html", "-H", type=str,
61 default="index.html",
62 help="where to write HTML report")
63 parser.add_argument("--directory", "-d", type=str, default=".",
64 help="directory in which to create other output files")
65 parser.add_argument("--query", type=str, default=None,
66 help="Galaxy-escaped name of the query signature")
67 parser.add_argument("--window_size", type=int, default=20,
68 help="size of the window to use when looking for clusters")
69 parser.add_argument("--truncation_edges", type=int, default=10,
70 help="number of edges for DrL truncate to pass per node")
71 parser.add_argument("--no-stats", dest="stats", action="store_false",
72 default=True,
73 help="disable cluster-finding statistics")
74 parser.add_argument("--include-singletons", dest="singletons",
75 action="store_true", default=False,
76 help="add self-edges to retain unconnected points")
77
78 return parser.parse_args(args)
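# A typical invocation might look like this (all file names hypothetical):
#
#   hexagram.py similarity.tab --names "mRNA expression" --scores scores.tab \
#       --colormaps colormaps.tab --directory out --html out/index.html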
79
80 def hexagon_center(x, y, scale=1.0):
81 """
82 Given a coordinate on a grid of hexagons (using wiggly rows in x), what is
83 the 2d Euclidean coordinate of its center?
84
85 x and y are integer column and row coordinates of the hexagon in the grid.
86
87 scale is a float specifying hexagon side length.
88
89 The origin in coordinate space is defined as the upper left corner of the
90 bounding box of the hexagon with indices x=0 and y=0.
91
92 Returns a tuple of floats.
93 """
94 # The grid looks like this:
95 #
96 # /-\ /-\ /-\ /-\
97 # /-\-/-\-/-\-/-\-/-\
98 # \-/-\-/-\-/-\-/-\-/
99 # /-\-/-\-/-\-/-\-/-\
100 # \-/-\-/-\-/-\-/-\-/
101 # /-\-/-\-/-\-/-\-/-\
102 # \-/ \-/ \-/ \-/ \-/
103 #
104 # Say a hexagon side has length 1
105 # It's 2 across corner to corner (x), and sqrt(3) across side to side (y)
106 # X coordinates are 1.5 per column
107 # Y coordinates (down from top) are sqrt(3) per row, -1/2 sqrt(3) if you're
108 # in an odd column.
109
110 center_y = math.sqrt(3) * y
111 if x % 2 == 1:
112 # Odd column: shift up
113 center_y -= 0.5 * math.sqrt(3)
114
115 return (1.5 * x * scale + scale, center_y * scale + math.sqrt(3.0) / 2.0 *
116 scale)
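# A quick numeric check of the geometry above, at the default scale=1.0:
# hexagon_center(0, 0) == (1.0, math.sqrt(3) / 2.0), and
# hexagon_center(1, 0) == (2.5, 0.0), since odd columns are shifted up by
# half a row.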
117
118 def hexagon_pick(x, y, scale=1.0):
119 """
120 Given floats x and y specifying coordinates in the plane, determine which
121 hexagon grid cell that point is in.
122
123 scale is a float specifying hexagon side length.
124
125 See http://blog.ruslans.com/2011/02/hexagonal-grid-math.html
126 But we flip the direction of the wiggle. Odd rows are up (-y)
127 """
128
129 # How high is a hex?
130 hex_height = math.sqrt(3) * scale
131
132 # First we pick a rectangular tile, from the point of one side-triangle to
133 # the base of the other in width, and the whole hexagon height in height.
134
135 # How wide are these tiles? Corner to line-between-far-corners distance
136 tile_width = (3.0 / 2.0 * scale)
137
138 # Tile X index is floor(x / tile_width)
139 tile_x = int(math.floor(x / tile_width))
140
141 # We need this intermediate value for the Y index and for tile-internal
142 # picking
143 corrected_y = y + (tile_x % 2) * hex_height / 2.0
144
145 # Tile Y index is floor((y + (x index mod 2) * hex height/2) / hex height)
146 tile_y = int(math.floor(corrected_y / hex_height))
147
148 # Find coordinates within the tile
149 internal_x = x - tile_x * tile_width
150 internal_y = corrected_y - tile_y * hex_height
151
152 # Do tile-scale picking
153 # Are we in the one corner, the other corner, or the bulk of the tile?
154 if internal_x > scale * abs(0.5 - internal_y / hex_height):
155 # We're in the bulk of the tile
156 # This is the column (x) of the picked hexagon
157 hexagon_x = tile_x
158
159 # This is the row (y) of the picked hexagon
160 hexagon_y = tile_y
161 else:
162 # We're in a corner.
163 # In an even column, the lower left is part of the next row, and the
164 # upper left is part of the same row. In an odd column, the lower left
165 # is part of the same row, and the upper left is part of the previous
166 # row.
167 if internal_y > hex_height / 2.0:
168 # It's the lower left corner
169 # This is the offset in row (y) that being in this corner gives us
170 # The lower left corner is always 1 row below the upper left corner.
171 corner_y_offset = 1
172 else:
173 corner_y_offset = 0
174
175 # TODO: verify this for correctness. It seems to be right, but I want a
176 # unit test to be sure.
177 # This is the row (y) of the picked hexagon
178 hexagon_y = tile_y - tile_x % 2 + corner_y_offset
179
180 # This is the column (x) of the picked hexagon
181 hexagon_x = tile_x - 1
182
183 # Now we've picked the hexagon
184 return (hexagon_x, hexagon_y)
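
# A minimal round-trip test sketch for the picking math, answering the TODO
# above. This helper is our own addition (nothing calls it automatically;
# main() below only spot-checks hexagon (0, 0)).
def test_hexagon_pick(scale=1.0):
    """
    Check that picking the center of each hexagon in a small grid returns
    that hexagon's own indices. Raises AssertionError on the first mismatch.
    """
    for x in xrange(-5, 6):
        for y in xrange(-5, 6):
            center_x, center_y = hexagon_center(x, y, scale=scale)
            picked = hexagon_pick(center_x, center_y, scale=scale)
            assert picked == (x, y), "Picked {} for {}".format(picked, (x, y))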
185
186 def radial_search(center_x, center_y):
187 """
188 An iterator that yields coordinate tuples (x, y) in order of increasing
189 hex-grid distance from the specified center position.
190 """
191
192 # A hexagon has neighbors at the following relative coordinates:
193 # (-1, 0), (1, 0), (0, -1), (0, 1)
194 # and ((-1, 1) and (1, 1) if in an even column)
195 # or ((-1, -1) and (1, -1) if in an odd column)
196
197 # We're going to go outwards using breadth-first search, so we need a queue
198 # of hexes to visit and a set of already visited hexes.
199
200 # This holds a queue (really a deque) of hexes waiting to be visited.
201 # A list has O(n) pop/insert at left.
202 queue = collections.deque()
203 # This holds a set of the (x, y) coordinate tuples of already-seen hexes, so
204 # we don't enqueue them again. The center starts out seen, so the BFS can't
205 seen = set([(center_x, center_y)])
206
207 # First place to visit is the center.
208 queue.append((center_x, center_y))
209
210 while len(queue) > 0:
211 # We should in theory never run out of items in the queue.
212 # Get the current x and y to visit.
213 x, y = queue.popleft()
214
215 # Yield the location we're visiting
216 yield (x, y)
217
218 # This holds a list of all relative neighbor positions as (x, y) tuples.
219 neighbor_offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
220 if x % 2 == 0:
221 # An even-column hex also has these neighbors
222 neighbor_offsets += [(-1, 1), (1, 1)]
223 else:
224 # An odd-column hex also has these neighbors
225 neighbor_offsets += [(-1, -1), (1, -1)]
226
227 for x_offset, y_offset in neighbor_offsets:
228 # First calculate the absolute position of the neighbor in x
229 neighbor_x = x + x_offset
230 # And in y
231 neighbor_y = y + y_offset
232
233 if (neighbor_x, neighbor_y) not in seen:
234 # This is a hex that has never been in the queue. Add it.
235 queue.append((neighbor_x, neighbor_y))
236
237 # Record that it has ever been enqueued
238 seen.add((neighbor_x, neighbor_y))
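# For example, radial_search(0, 0) yields (0, 0) first, then its six
# neighbors in enqueue order: (-1, 0), (1, 0), (0, -1), (0, 1), (-1, 1),
# (1, 1), and then continues outward through farther and farther hexes.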
239
240
241
242
243 def assign_hexagon(hexagons, node_x, node_y, node, scale=1.0):
244 """
245 This function assigns the given node to a hexagon in hexagons. hexagons is a
246 defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
247 or None if a hexagon is free. node_x and node_y are the x and y coordinates
248 of the node, adapted so that the seed node lands in the 0, 0 hexagon, and
249 re-scaled to reduce hexagon conflicts. node is the node to be assigned.
250 scale, if specified, is the hexagon side length in node space units.
251
252 This function assigns nodes to their closest hexagon, reprobing outwards if
253 already occupied.
254
255 When the function completes, node is stored in hexagons under some (x, y)
256 tuple.
257
258 Returns the distance this hexagon is from its ideal location.
259 """
260
261 # These hold the hexagon that the point falls in, which may be taken.
262 best_x, best_y = hexagon_pick(node_x, node_y, scale=scale)
263
264 for x, y in radial_search(best_x, best_y):
265 # These hexes are enumerated in order of increasing distance from the
266 # best one, starting with the best hex itself.
267
268 if hexagons[(x, y)] is None:
269 # This is the closest free hex. Break out of the loop, leaving x and
270 # y pointing here.
271 break
272
273 # Assign the node to the hexagon
274 hexagons[(x, y)] = node
275
276 return math.sqrt((x - best_x) ** 2 + (y - best_y) ** 2)
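# A sketch of how the assignment functions are driven (this mirrors
# compute_hexagram_assignments below):
#
#   hexagons = collections.defaultdict(lambda: None)
#   badness = assign_hexagon(hexagons, node_x, node_y, node, scale=1.0)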
277
278
279
280 def assign_hexagon_local_radial(hexagons, node_x, node_y, node, scale=1.0):
281 """
282 This function assigns the given node to a hexagon in hexagons. hexagons is a
283 defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
284 or None if a hexagon is free. node_x and node_y are the x and y coordinates
285 of the node, adapted so that the seed node lands in the 0, 0 hexagon, and
286 re-scaled to reduce hexagon conflicts. node is the node to be assigned.
287 scale, if specified, is the hexagon side length in node space units.
288
289 This function assigns nodes to their closest hexagon. If that hexagon is
290 full, it re-probes in the direction that the node is from the closest
291 hexagon's center.
292
293 When the function completes, node is stored in hexagons under some (x, y)
294 tuple.
295
296 Returns the distance this hexagon is from its ideal location.
297 """
298
299 # These hold the hexagon that the point falls in, which may be taken.
300 best_x, best_y = hexagon_pick(node_x, node_y, scale=scale)
301
302 # These hold the center of that hexagon in float space
303 center_x, center_y = hexagon_center(best_x, best_y, scale=scale)
304
305 # This holds the distance from this point to the center of that hexagon
306 node_distance = math.sqrt((node_x - center_x) ** 2 + (node_y - center_y) **
307 2)
308
309 # These hold the normalized direction of this point, relative to the center
310 # of its best hexagon
311 direction_x = (node_x - center_x) / node_distance
312 direction_y = (node_y - center_y) / node_distance
313
314 # Do a search in that direction, starting at the best hex.
315
316 # These are the hexagon indices we're considering
317 x, y = best_x, best_y
318
319 # These are the Cartesian coordinates we're probing. Must be in the x, y hex
320 # as a loop invariant.
321 test_x, test_y = center_x, center_y
322
323 while hexagons[(x, y)] is not None:
324 # Re-probe outwards from the best hex in scale-sized steps
325 # TODO: is that the right step size? scale/2-sized steps would probe more
326 # densely, at the cost of re-picking more often.
327 test_x += direction_x * scale
328 test_y += direction_y * scale
329
330 # Re-pick x and y for the hex containing our test point
331 x, y = hexagon_pick(test_x, test_y, scale=scale)
332
333 # We've finally reached the edge of the cluster.
334 # Drop our hexagon
335 hexagons[(x, y)] = node
336
337 return math.sqrt((x - best_x) ** 2 + (y - best_y) ** 2)
338
339 def assign_hexagon_radial(hexagons, node_x, node_y, node, scale=1.0):
340 """
341 This function assigns the given node to a hexagon in hexagons. hexagons is a
342 defaultdict from tuples of hexagon (x, y) integer indices to assigned nodes,
343 or None if a hexagon is free. node_x and node_y are the x and y coordinates
344 of the node, adapted so that the seed node lands in the 0, 0 hexagon, and
345 re-scaled to reduce hexagon conflicts. node is the node to be assigned.
346 scale, if specified, is the hexagon side length in node space units.
347
348 This function assigns nodes to hexagons based on radial distance from 0, 0.
349 This makes hexagon assignment much more dense, but can lose spatial
350 structure.
351
352 When the function completes, node is stored in hexagons under some (x, y)
353 tuple.
354
355 Returns the distance this hexagon is from its ideal location. Unfortunately,
356 this doesn't really make sense for this assignment scheme, so it is always
357 0.
358 """
359
360 # Compute node's distance from the origin
361 node_distance = math.sqrt(node_x ** 2 + node_y ** 2)
362
363 # Compute normalized direction from the origin for this node
364 direction_x = node_x / node_distance
365 direction_y = node_y / node_distance
366
367 # These are the coordinates we are testing
368 test_x = 0
369 test_y = 0
370
371 # These are the hexagon indices that correspond to that point
372 x, y = hexagon_pick(test_x, test_y, scale=scale)
373
374 while hexagons[(x, y)] is not None:
375 # Re-probe outwards from the origin in scale-sized steps
376 # TODO: is that the right step size?
377 test_x += direction_x * scale
378 test_y += direction_y * scale
379
380 # Re-pick
381 x, y = hexagon_pick(test_x, test_y, scale=scale)
382
383 # We've finally reached the edge of the cluster.
384 # Drop our hexagon
385 # TODO: this has to be N^2 if we line them all up in a line
386 hexagons[(x, y)] = node
387
388 return 0
389
390 def hexagons_in_window(hexagons, x, y, width, height):
391 """
392 Given a dict from (x, y) position to signature names, return the list of all
393 signatures in the window starting at hexagon x, y and extending width in the
394 x direction and height in the y direction on the hexagon grid.
395 """
396
397 # This holds the list of hexagons we've found
398 found = []
399
400 for i in xrange(x, x + width):
401 for j in xrange(y, y + height):
402 if hexagons.has_key((i, j)):
403 # This position in the window has a hex.
404 found.append(hexagons[(i, j)])
405
406 return found
407
408 class ClusterFinder(object):
409 """
410 A class that can be invoked to find the p value of the best cluster in its
411 layer. Instances are pickleable.
412 """
413
414 def __init__(self, hexagons, layer, window_size=5):
415 """
416 Keep the given hexagons dict (from (x, y) to signature name) and the
417 given layer (a dict from signature name to a value), and the given
418 window size, in a ClusterFinder object.
419 """
420
421 # TODO: This should probably all operate on numpy arrays that we can
422 # slice efficiently.
423
424 # Store the hexagon assignments
425 self.hexagons = hexagons
426 # Store the layer
427 self.layer = layer
428
429 # Store the window size
430 self.window_size = window_size
431
432 @staticmethod
433 def continuous_p(in_values, out_values):
434 """
435 Get the p value for in_values and out_values being distinct continuous
436 distributions.
437
438 in_values and out_values are both Numpy arrays. Returns the p value, or
439 raises a ValueError if the statistical test cannot be run for some
440 reason.
441
442 Uses the Mann-Whitney U test.
443 """
444
445 # Do a Mann-Whitney U test to see how different the data
446 # sets are.
447 u_statistic, p_value = scipy.stats.mannwhitneyu(in_values,
448 out_values)
449
450 return p_value
451
452 @staticmethod
453 def dichotomous_p(in_values, out_values):
454 """
455 Given two one-dimensional Numpy arrays of 0s and 1s, compute a p value
456 for the in_values having a different probability of being 1 than the
457 frequency of 1s in the out_values.
458
459 This test uses the scipy.stats.binom_test function, which is an exact test
460 and does not rely on the normal approximation. Therefore, it should be
461 valid for arbitrarily small frequencies of either 0s or 1s in in_values.
462
463 TODO: What if out_values is shorter than in_values?
464 """
465
466 if len(out_values) == 0:
467 raise ValueError("Background group is empty!")
468
469 # This holds the observed frequency of 1s in out_values
470 frequency = numpy.sum(out_values) / float(len(out_values))
471
472 # This holds the number of 1s in in_values
473 successes = numpy.sum(in_values)
474
475 # This holds the number of "trials" we got that many successes in
476 trials = len(in_values)
477
478 # Return how significantly the frequency inside differs from that
479 # outside.
480 return scipy.stats.binom_test(successes, trials, frequency)
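# For example, if in_values is four 1s and out_values holds one 1 and nine
# 0s, the background frequency is 0.1, and binom_test(4, 4, 0.1) returns
# p = 0.1 ** 4 = 0.0001: four hits in a row is very surprising at that rate.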
481
482 @staticmethod
483 def categorical_p(in_values, out_values):
484 """
485 Given two one-dimensional Numpy arrays of integers (which may be stored
486 as floats), which represent items being assigned to different
487 categories, return a p value for the distribution of categories observed
488 in in_values differing from that observed in out_values.
489
490 The normal way to do this is with a chi-squared goodness of fit test.
491 However, that test has invalid assumptions when there are fewer than 5
492 expected and 5 observed observations in every category.
493 See http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
495
496 However, we will use it anyway, because the tests that don't break down
497 are prohibitively slow.
498 """
499
500 # Convert our inputs to integer arrays
501 in_values = in_values.astype(int)
502 out_values = out_values.astype(int)
503
504 # How many categories are there (count 0 to the maximum value)
505 num_categories = max(numpy.max(in_values), numpy.max(out_values)) + 1
506
507 # Count the number of in_values and out_values in each category
508 in_counts = numpy.array([len(in_values[in_values == i]) for i in
509 xrange(num_categories)])
510 out_counts = numpy.array([len(out_values[out_values == i]) for i in
511 xrange(num_categories)])
512
513 # Get the p value for the window being from the distribution estimated
514 # from the outside counts. chisquare needs the expected counts to sum to
515 # the same total as the observed counts, so rescale them here.
516 expected = out_counts * numpy.sum(in_counts) / float(numpy.sum(out_counts))
517 _, p_value = scipy.stats.chisquare(in_counts, expected)
517
518 return p_value
519
520 def __call__(self):
521 """
522 Find the best p value for any window of size window_size. Return it.
523 """
524
525 # Calculate the bounding box where we want to look for windows.
526 # TODO: This would just be all of a numpy array
527 min_x = min(coords[0] for coords in self.hexagons.iterkeys())
528 min_y = min(coords[1] for coords in self.hexagons.iterkeys())
529 max_x = max(coords[0] for coords in self.hexagons.iterkeys())
530 max_y = max(coords[1] for coords in self.hexagons.iterkeys())
531
532 # This holds a Numpy array of all the data by x, y
533 layer_data = numpy.empty((max_x - min_x + 1, max_y - min_y + 1))
534
535 # Fill it with NaN so we can mask those out later
536 layer_data[:] = numpy.NAN
537
538 for (hex_x, hex_y), name in self.hexagons.iteritems():
539 # Copy the layer values into the Numpy array
540 if self.layer.has_key(name):
541 layer_data[hex_x - min_x, hex_y - min_y] = self.layer[name]
542
543 # This holds a masked version of the layer data
544 layer_data_masked = numpy.ma.masked_invalid(layer_data, copy=False)
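# For example, masked_invalid on [[0.5, nan], [nan, 2.0]] masks both nan
# cells, so ~layer_data_masked.mask is True exactly where a hexagon
# contributed a score above.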
545
546 # This holds the smallest p value we have found for this layer
547 best_p = float("+inf")
548
549 # This holds the statistical test to use (a function from two Numpy
550 # arrays to a p value)
551 # The most specific test is the dichotomous test (0 or 1)
552 statistical_test = self.dichotomous_p
553
554 if numpy.sum(~layer_data_masked.mask) == 0:
555 # There is actually no data in this layer at all.
556 # nditer complains if we try to iterate over an empty thing.
557 # So quit early and say we couldn't find anything.
558 return best_p
559
560 for value in numpy.nditer(layer_data_masked[~layer_data_masked.mask]):
561 # Check all the values in the layer.
562 # If this value is out of the domain of the current statistical
563 # test, upgrade to a more general test.
564
565 if statistical_test == self.dichotomous_p and (value > 1 or
566 value < 0):
567
568 # We can't use a dichotomous test on things outside 0 to 1
569 # But we haven't yet detected any non-integers
570 # Use categorical
571 statistical_test = self.categorical_p
572
573 if value % 1 != 0:
574 # This is not an integer value
575 # So, we must use a continuous statistical test
576 statistical_test = self.continuous_p
577
578 # This is the least specific test, so we can stop now
579 break
580
581
582 for i in xrange(layer_data_masked.shape[0] - self.window_size + 1):
583 for j in xrange(layer_data_masked.shape[1] - self.window_size + 1):
584
585 # Get the layer values for hexes in the window, as a Numpy
586 # masked array.
587 in_region = layer_data_masked[i:i + self.window_size,
588 j:j + self.window_size]
589
590 # And as a 1d Numpy array
591 in_values = numpy.reshape(in_region[~in_region.mask], -1).data
592
593 # And out of the window (all the other hexes) as a masked array
594 out_region = numpy.ma.copy(layer_data_masked)
595 # We get this by masking out everything in the region
596 out_region.mask[i:i + self.window_size,
597 j:j + self.window_size] = True
598
599 # And as a 1d Numpy array
600 out_values = numpy.reshape(out_region[~out_region.mask],
601 -1).data
602
603
604 if len(in_values) == 0 or len(out_values) == 0:
605 # Can't do any stats on this window
606 continue
607
608 if len(in_values) < 0.5 * self.window_size ** 2:
609 # The window is less than half full. Skip it.
610 # TODO: Make this threshold configurable.
611 continue
612
613 try:
614
615 # Get the p value for this window under the selected
616 # statistical test
617 p_value = statistical_test(in_values, out_values)
618
619 # If this is the best p value so far, record it
620 best_p = min(best_p, p_value)
621 except ValueError:
622 # Probably an all-zero layer, or something else the test
623 # can't handle.
624 # But let's try all the other windows to be safe.
625 # Maybe one will work.
626 pass
627
628
629
630 # We have now found the best p for any window for this layer.
631 print "Best p found: {}".format(best_p)
632 sys.stdout.flush()
633
634 return best_p
635
636 def run_functor(functor):
637 """
638 Given a no-argument functor (like a ClusterFinder), run it and return its
639 result. We can use this with multiprocessing.map and map it over a list of
640 job functors to do them.
641
642 Handles getting more than multiprocessing's pitiful exception output
643 """
644
645 try:
646 return functor()
647 except:
648 # Put all exception text into an exception and raise that
649 raise Exception(traceback.format_exc())
650
651 def open_matrices(names):
652 """
653 The argument parser now takes multiple similarity matrices as input and
654 saves their file names as strings. We want to store the names of these
655 matrices for display later in hexagram.js, in order to allow the user to
656 navigate and know what type of visualization map they are looking at -
657 gene expression, copy number, etc.
658
659 Since the parser no longer opens the files automatically, we must do it
660 in this function.
661 """
662
663 # For each file name, open the file and add it to the matrices list
664 # 'r' is the argument stating that the file will be read-only
665 print "Opening matrices..."
666 for similarity_filename in names:
667 matrix_file = tsv.TsvReader(open(similarity_filename, "r"))
668 matrices.append(matrix_file)
669
670 def compute_beta(coords, matrix, axis, index, options):
671 """
672 Compute and return a beta matrix from coords * matrix.
673 TODO: also print the matrix to a file to be read clientside; that writing
674 function still needs to be added.
675 """
676 beta = coords * matrix
677 return beta
678
679 def drl_similarity_functions(matrix, index, options):
680 """
681 Performs all the functions needed to format a similarity matrix into a
682 TSV format that DrL can consume. Then the full DrL pipeline (truncate,
683 layout, recoord) is run on the similarity matrix.
684
685 Options is passed to access options.singletons and other required aspects
686 of the parsed args.
687 """
688
689 # Work in a temporary directory, which mkdtemp creates for us.
690 # (It is deleted again at the end of this function.)
691 drl_directory = tempfile.mkdtemp()
692
693 # This is the base name for all the files that DrL uses to do the layout
694 # We're going to put it in a temporary directory.
695 # index added to extension in order to keep track of
696 # respective layouts
697 drl_basename = os.path.join(drl_directory, "layout" + str(index))
698
699 # We can just pass our similarity matrix to DrL's truncate
700 # But we want to run it through our tsv parser to strip comments and ensure
701 # it's valid
702
703 # This holds a reader for the similarity matrix
704 sim_reader = matrix
705
706 # This holds a writer for the sim file
707 sim_writer = tsv.TsvWriter(open(drl_basename + ".sim", "w"))
708
709 print "Regularizing similarity matrix..."
710 sys.stdout.flush()
711
712 # This holds a list of all unique signature names in the similarity matrix.
713 # We can use it to add edges to keep singletons.
714 signatures = set()
715
716 print "Reach for parts in sim_reader"
717 for parts in sim_reader:
718 # Keep the signature names used
719 signatures.add(parts[0])
720 signatures.add(parts[1])
721
722 # Save the line to the regularized file
723 sim_writer.list_line(parts)
724
725 if options.singletons:
726 # Now add a self-edge on every node, so we don't drop nodes with no
727 # other strictly positive edges
728 for signature in signatures:
729 sim_writer.line(signature, signature, 1)
730
731 sim_reader.close()
732 sim_writer.close()
733
734 # Now our input for DrL is prepared!
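# The regularized .sim file is plain three-column, tab-separated text that
# DrL's truncate step reads directly, e.g. (hypothetical signature names):
#
#   sample-1    sample-2    0.83
#   sample-1    sample-3    0.27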
735
736 # Do DrL truncate.
737 # TODO: pass a truncation level
738 print "DrL: Truncating..."
739 sys.stdout.flush()
740 subprocess.check_call(["truncate", "-t", str(options.truncation_edges),
741 drl_basename])
742
743 # Run the DrL layout engine.
744 print "DrL: Doing layout..."
745 sys.stdout.flush()
746 subprocess.check_call(["layout", drl_basename])
747
748 # Put the string names back
749 print "DrL: Restoring names..."
750 sys.stdout.flush()
751 subprocess.check_call(["recoord", drl_basename])
752
753 # Now DrL has saved its coordinates as <signature name>\t<x>\t<y> rows in
754 # <basename>.coord
755
756 # We want to read that.
757 # This holds a reader for the DrL output
758 coord_reader = tsv.TsvReader(open(drl_basename + ".coord", "r"))
759
760 # This holds a dict from signature name string to (x, y) float tuple. It is
761 # also our official collection of node names that made it through DrL, and
762 # therefore need their score data sent to the client.
763 nodes = {}
764
765 print "Reading DrL output..."
766 sys.stdout.flush()
767 for parts in coord_reader:
768 nodes[parts[0]] = (float(parts[1]), float(parts[2]))
769
770 coord_reader.close()
771
772 # Save the DrL coordinates in our bundle, to be displayed client-side for
773 # debugging.
774
775 # index added to drl.tab extension in order to keep track of
776 # respective drl.tabs
777 coord_writer = tsv.TsvWriter(open(
778 os.path.join(options.directory, "drl" + str(index) + ".tab"), "w"))
779
780 for signature_name, (x, y) in nodes.iteritems():
781 # Write a tsv with names instead of numbers, like what DrL recoord would
782 # have written. This is what the Javascript on the client side wants.
783 coord_writer.line(signature_name, x, y)
784
785 coord_writer.close()
786
787 # Delete our temporary directory.
788 shutil.rmtree(drl_directory)
789
790 # Return nodes dict back to main method for further processes
791 return nodes
792
793 def compute_hexagram_assignments (nodes, index, options):
794 """
795 Now that we are taking multiple similarity matrices as inputs, we must
796 compute hexagram assignments for each similarity matrix. These assignments
797 are based upon the nodes output provided by the DrL function.
798
799 Index relates each matrix name with its drl output, nodes, assignments, etc.
800 Options contains the parsed arguments that are present in the main method.
801 """
802 # Do the hexagon layout
803 # We do the squiggly rows setup, so express everything as integer x, y
804
805 # This is a defaultdict from (x, y) integer tuple to id that goes there, or
806 # None if it's free.
807 global hexagons
808 hexagons = collections.defaultdict(lambda: None)
809
810 # This holds the side length that we use
811 side_length = 1.0
812
813 # This holds what will be a layer of how badly placed each hexagon is
814 # A dict from node name to layer value
815 placement_badnesses = {}
816
817 for node, (node_x, node_y) in nodes.iteritems():
818 # Assign each node to a hexagon
819 # This holds the resulting placement badness for that hexagon (i.e.
820 # distance from ideal location)
821 badness = assign_hexagon(hexagons, node_x, node_y, node,
822 scale=side_length)
823
824 # Put the badness in the layer
825 placement_badnesses[node] = float(badness)
826
827 # Normalize the placement badness layer
828 # This holds the max placement badness
829 max_placement_badness = max(placement_badnesses.itervalues())
830 print "Max placement badness: {}".format(max_placement_badness)
831
832 if max_placement_badness != 0:
833 # Normalize by the max if possible.
834 placement_badnesses = {node: value / max_placement_badness for node,
835 value in placement_badnesses.iteritems()}
836
837 # The hexagons have been assigned. Make hexagons be a dict instead of a
838 # defaultdict, so it pickles.
839 # TODO: I should change it so I don't need to do this.
840 hexagons = dict(hexagons)
841
842 # Now dump the hexagon assignments as an id, x, y tsv. This will be read by
843 # the JavaScript on the static page and be used to produce the
844 # visualization.
845 hexagon_writer = tsv.TsvWriter(open(os.path.join(options.directory,
846 "assignments"+ str(index) + ".tab"), "w"))
847
848 # First find the x and y offsets needed to make all hexagon positions
849 # positive
850 min_x = min(coords[0] for coords in hexagons.iterkeys())
851 min_y = min(coords[1] for coords in hexagons.iterkeys())
852
853 for coords, name in hexagons.iteritems():
854 # Write this hexagon assignment, converted to all-positive coordinates.
855 hexagon_writer.line(name, coords[0] - min_x, coords[1] - min_y)
856 hexagon_writer.close()
857
858 # Hand the placement_badnesses dict back to the main method so that it can
859 # be used elsewhere.
860 return placement_badnesses
861
862 def write_matrix_names (options):
863 """
864 Write the names of the similarity matrices so that hexagram.js can
865 process the names and create the toggle layout GUI.
866 We pass options to access the parsed args and thus the matrix names.
867 """
868 name_writer = tsv.TsvWriter(open(os.path.join(options.directory,
869 "matrixnames.tab"), "w"))
870 for i in options.names:
871 name_writer.line(i)
872
873 name_writer.close()
874
875 def main(args):
876 """
877 Parses command line arguments, and makes visualization.
878 "args" specifies the program arguments, with args[0] being the executable
879 name. The return value should be used as the program's exit code.
880 """
881
882 options = parse_args(args) # This holds the nicely-parsed options object
883
884 print "Created Options"
885
886 # Test our picking
887 x, y = hexagon_center(0, 0)
888 if hexagon_pick(x, y) != (0, 0):
889 raise Exception("Picking is broken!")
890
891 # First bit of stdout becomes annotation in Galaxy
892 # Make sure our output directory exists.
893 if not os.path.exists(options.directory):
894 # makedirs is the right thing to use here: recursive
895 os.makedirs(options.directory)
896
897 print "Writing matrix names..."
898 # We must write the file names for hexagram.js to access.
899 write_matrix_names(options)
900
901 print "About to open matrices..."
902
903 # We have file names stored in options.similarity
904 # We must open the files and store them in matrices list for access
905 open_matrices(options.similarity)
906
907 print "Opened matrices..."
908
909 # The nodes list stores the list of nodes for each matrix
910 # We must keep track of each set of nodes
911 nodes_multiple = []
912
913 print "Created nodes_multiple list..."
914
915 # Index for drl.tab and drl.layout file naming. With indexes we can match
916 # file names, to matrices, to drl output files.
917 for index, i in enumerate(matrices):
918 nodes_multiple.append(drl_similarity_functions(i, index, options))
919
920 # Compute Hexagram Assignments for each similarity matrix's drl output,
921 # which is found in nodes_multiple.
922
923 # placement_badnesses_multiple list is required to store the placement
924 # badness dicts that are returned by the compute_hexagram_assignments
925 # function.
926 placement_badnesses_multiple = []
927 for index, i in enumerate(nodes_multiple):
928 placement_badnesses_multiple.append(compute_hexagram_assignments(i, index, options))
929
930 # Now that we have hex assignments, compute layers.
931
932 # In addition to making per-layer files, we're going to copy all the score
933 matrices to our output directory. That way, the client can download layers
934 # in big chunks when it wants all layer data for statistics. We need to
935 # write a list of matrices that the client can read, which is written by
936 # this TSV writer.
937 matrix_index_writer = tsv.TsvWriter(open(os.path.join(options.directory,
938 "matrices.tab"), "w"))
939
940 # Read in all the layer data at once
941 # TODO: Don't read in all the layer data at once
942
943 # This holds a dict from layer name to a dict from signature name to
944 # score.
945 layers = {}
946
947 # This holds the names of all layers
948 layer_names = []
949
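# Each scores file is TSV with a header row naming the layers, then one row
# per signature, e.g. (hypothetical names and values):
#
#   (ignored)   Tumor Stage   Smoker
#   sample-1    2             1
#   sample-2    0             0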
950 for matrix_number, score_filename in enumerate(options.scores):
951 # First, copy the whole matrix into our output. This holds its filename.
952 output_filename = "matrix_{}.tab".format(matrix_number)
953 shutil.copy2(score_filename, os.path.join(options.directory,
954 output_filename))
955
956 # Record where we put it
957 matrix_index_writer.line(output_filename)
958
959 # This holds a reader for the scores TSV
960 scores_reader = tsv.TsvReader(open(score_filename, "r"))
961
962 # This holds an iterator over lines in that file
963 # TODO: Write a proper header/data API
964 scores_iterator = scores_reader.__iter__()
965
966 try:
967 # This holds the names of the columns (except the first, which is
968 # labels). They also happen to be layer names
969 file_layer_names = scores_iterator.next()[1:]
970
971 # Add all the layers in this file to the complete list of layers.
972 layer_names += file_layer_names
973
974 # Ensure that we have a dict for every layer mentioned in the file
975 # (even the ones that have no data below). Doing it this way means
976 # all score matrices need disjoint columns, or the last one takes
977 # precedence.
978 for name in file_layer_names:
979 layers[name] = {}
980
981 for parts in scores_iterator:
982 # This is the signature that this line is about
983 signature_name = parts[0]
984
985 if signature_name not in nodes_multiple[0]:
986 # This signature wasn't in our DrL output. Don't bother
987 # putting its layer data in our visualization. This saves
988 # space and makes the client-side layer counts accurate for
989 # the data actually displayable.
990 continue
991
992 # These are the scores for all the layers for this signature
993 layer_scores = parts[1:]
994
995 for (layer_name, score) in itertools.izip(file_layer_names,
996 layer_scores):
997
998 # Store all the layer scores in the appropriate
999 # dictionaries.
1000 try:
1001 layers[layer_name][signature_name] = float(score)
1002 except ValueError:
1003 # This is not a float.
1004 # Don't set that entry for this layer.
1005 # TODO: possibly ought to complain to the user? But then
1006 # things like "N/A" won't be handled properly.
1007 continue
1008
1009 except StopIteration:
1010 # We don't have any real data here. Couldn't read the header line.
1011 # Skip to the next file
1012 pass
1013
1014 # We're done with this score file now
1015 scores_reader.close()
1016
1017 # We're done with all the input score matrices, so our index is done too.
1018 matrix_index_writer.close()
1019
1020 # We have now loaded all layer data into memory as Python objects. What
1021 # could possibly go wrong?
1022
1023 # Stick our placement badness layer on the end
1024 layer_names.append("Placement Badness")
1025 layers["Placement Badness"] = placement_badnesses_multiple[0]
1026
1027 # Now we need to write layer files.
1028
1029 # Generate some filenames for layers that we can look up by layer name.
1030 # We do this because layer names may not be valid filenames.
1031 layer_files = {name: os.path.join(options.directory,
1032 "layer_{}.tab".format(number)) for (name, number) in itertools.izip(
1033 layer_names, itertools.count())}
1034
1035 for layer_name, layer in layers.iteritems():
1036 # Write out all the individual layer files
1037 # This holds the writer for this layer file
1038 scores_writer = tsv.TsvWriter(open(layer_files[layer_name], "w"))
1039 for signature_name, score in layer.iteritems():
1040 # Write the score for this signature in this layer
1041 scores_writer.line(signature_name, score)
1042 scores_writer.close()
1043
1044 # We need something to sort layers by. We have "priority" (lower is
1045 # better)
1046
1047 if len(layer_names) > 0 and options.stats:
1048 # We want to do this fancy parallel stats thing.
1049 # We skip it when there are no layers, so we don't try to join a
1050 # never-used pool, which seems to hang.
1051
1052 print "Running statistics..."
1053
1054 # This holds a list of ClusterFinders, one for each of our layers
1055 cluster_finders = [ClusterFinder(hexagons, layers[layer_name],
1056 window_size=options.window_size) for layer_name in layer_names]
1057
1058 print "{} jobs to do.".format(len(cluster_finders))
1059
1060 # This holds a multiprocessing pool for parallelization
1061 pool = multiprocessing.Pool()
1062
1063 # This holds all the best p values in the same order
1064 best_p_values = pool.map(run_functor, cluster_finders)
1065
1066 # Close down the pool so multiprocessing won't die sillily at the end
1067 pool.close()
1068 pool.join()
1069
1070 # This holds a dict from layer name to priority (best p value).
1071 # pool.map returns results in input order, so we can zip with layer_names.
1072 layer_priorities = {layer_name: best_p_value for layer_name,
1073 best_p_value in itertools.izip(layer_names, best_p_values)}
1074 else:
1075 # We aren't doing any stats.
1076
1077 print "Skipping statistics."
1078
1079 # Make up priorities.
1080 layer_priorities = {name: float("+inf") for name in layer_names}
1081
1082 # Count how many layer entries are greater than 0 for each binary layer, and
1083 # store that number in this dict by layer name. Things with the default
1084 # empty string instead of a number aren't binary layers, but they can use
1085 # the empty string as their TSV field value, so we can safely pull any layer
1086 # out of this by name.
1087 layer_positives = collections.defaultdict(str)
1088
1089 for layer_name in layer_names:
1090 # Assume it's a binary layer until proven otherwise
1091 layer_positives[layer_name] = 0
1092 for value in layers[layer_name].itervalues():
1093 if value == 1:
1094 # Count up all the 1s in the layer
1095 layer_positives[layer_name] += 1
1096 elif value != 0:
1097 # It has something that isn't 1 or 0, so it can't be a binary
1098 # layer. Throw it out and try the next layer.
1099 layer_positives[layer_name] = ""
1100 break
1101
1102 # Write an index of all the layers we have, in the form:
1103 # <layer>\t<file>\t<priority>\t<number of signatures with data>\t<number of
1104 # signatures that are 1 for binary layers, or empty>
1105 # This is the writer to use.
1106 index_writer = tsv.TsvWriter(open(os.path.join(options.directory,
1107 "layers.tab"), "w"))
1108
1109 for layer_name, layer_file in layer_files.iteritems():
1110 # Write the index entry for this layer
1111 index_writer.line(layer_name, os.path.basename(layer_file),
1112 layer_priorities[layer_name], len(layers[layer_name]),
1113 layer_positives[layer_name])
1114
1115 index_writer.close()
1116
1117 # TODO: Sahil will implement linear regression code here
1118
1119 # We must create an m * n matrix of samples * genes.
1120 # In order to create this matrix we first must know the number of hexes
1121 # and maintain them in a certain order. The order is important so that
1122 # we populate the matrix with the data values in the proper row (sample).
1123
1124 # Copy over the user-specified colormaps file, or make an empty TSV if it's
1125 # not specified.
1126
1127 # This holds a writer for the colormaps file. Creating it creates the file.
1128 colormaps_writer = tsv.TsvWriter(open(os.path.join(options.directory,
1129 "colormaps.tab"), "w"))
1130
1131 if options.colormaps is not None:
1132 # The user specified colormap data, so copy it over
1133 # This holds a reader for the colormaps file
1134 colormaps_reader = tsv.TsvReader(options.colormaps)
1135
1136 print "Regularizing colormaps file..."
1137 sys.stdout.flush()
1138
1139 for parts in colormaps_reader:
1140 colormaps_writer.list_line(parts)
1141
1142 colormaps_reader.close()
1143
1144 # Close the colormaps file we wrote. It may have gotten data, or it may
1145 # still be empty.
1146 colormaps_writer.close()
1147
1148 # Now copy any static files from where they live next to this Python file
1149 # into the web page bundle.
1150 # This holds the directory where this script lives, which also contains
1151 # static files.
1152 tool_root = os.path.dirname(os.path.realpath(__file__))
1153
1154 # Copy over all the static files we need for the web page
1155 # This holds a list of them
1156 static_files = [
1157 # Static images
1158 "drag.svg",
1159 "filter.svg",
1160 "statistics.svg",
1161 "right.svg",
1162 "set.svg",
1163 "save.svg",
1164 "inflate.svg",
1165 "throbber.svg",
1166
1167 # jQuery itself is pulled from a CDN.
1168 # We can't take everything offline since Google Maps needs to be sourced
1169 # from Google, so we might as well use CDN jQuery.
1170
1171 # Select2 scripts and resources:
1172 "select2.css",
1173 "select2.js",
1174 "select2.png",
1175 "select2-spinner.gif",
1176 "select2x2.png",
1177
1178 # The jQuery.tsv plugin
1179 "jquery.tsv.js",
1180 # The color library
1181 "color-0.4.1.js",
1182 # The jStat statistics library
1183 "jstat-1.0.0.js",
1184 # The Google Maps MapLabel library
1185 "maplabel-compiled.js",
1186 # The main CSS file
1187 "hexagram.css",
1188 # The main JavaScript file that runs the page
1189 "hexagram.js",
1190 # Web Worker for statistics
1191 "statistics.js",
1192 # File with all the tool code
1193 "tools.js"
1194 ]
1195
1196 # We'd just use a directory of static files, but Galaxy needs single-level
1197 # output.
1198 for filename in static_files:
1199 shutil.copy2(os.path.join(tool_root, filename), options.directory)
1200
1201 # Copy the HTML file to our output file. It automatically knows to read
1202 # assignments.tab, and does its own TSV parsing
1203 shutil.copy2(os.path.join(tool_root, "hexagram.html"), options.html)
1204
1205 print "Visualization generation complete!"
1206
1207 return 0
1208
1209 if __name__ == "__main__":
1210 try:
1211 # Get the return code to return
1212 # Don't just exit with it because sys.exit works by exceptions.
1213 return_code = main(sys.argv)
1214 except:
1215 traceback.print_exc()
1216 # Return a definite number and not some unspecified error code.
1217 return_code = 1
1218
1219 sys.exit(return_code)