Mercurial > repos > saket-choudhary > inchlib_clust
view inchlib_clust/inchlib_clust.py @ 0:60f93f839759 draft default tip
Uploaded
author | saket-choudhary |
---|---|
date | Tue, 07 Oct 2014 19:35:50 -0400 |
parents | |
children |
line wrap: on
line source
#coding: utf-8 from __future__ import print_function import csv, json, copy, re, argparse, os, urllib2 import numpy, scipy, fastcluster, sklearn import scipy.cluster.hierarchy as hcluster from sklearn import preprocessing from scipy import spatial LINKAGES = ["single", "complete", "average", "centroid", "ward", "median", "weighted"] RAW_LINKAGES = ["ward", "centroid"] DISTANCES = {"numeric": ["braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "euclidean", "mahalanobis", "minkowski", "seuclidean", "sqeuclidean"], "binary": ["dice","hamming","jaccard","kulsinski","matching","rogerstanimoto","russellrao","sokalmichener","sokalsneath","yule"]} class Dendrogram(): """Class which handles the generation of cluster heatmap format of clustered data. As an input it takes a Cluster instance with clustered data.""" def __init__(self, clustering): self.cluster_object = clustering self.data_type = clustering.data_type self.axis = clustering.clustering_axis self.clustering = clustering.clustering self.tree = hcluster.to_tree(self.clustering) self.data = clustering.data self.data_names = clustering.data_names self.header = clustering.header self.dendrogram = False def __get_cluster_heatmap__(self, write_data): root, nodes = hcluster.to_tree(self.clustering, rd=True) node_id2node = {} dendrogram = {"nodes":{}} for node in nodes: node_id = node.id if node.count == 1: node_id2node[node_id] = {"count":1, "distance":0} else: node_left_child = node.get_left().id node_right_child = node.get_right().id node_id2node[node_id] = {"count":node.count, "distance":round(node.dist, 3), "left_child": node_left_child, "right_child": node_right_child} for n in node_id2node: node = node_id2node[n] if node["count"] != 1: node_id2node[node["left_child"]]["parent"] = n node_id2node[node["right_child"]]["parent"] = n for n in node_id2node: node = node_id2node[n] if node["count"] == 1: data = self.data[n] node["objects"] = [self.data_names[n]] if node_id2node[node["parent"]]["left_child"] == n: node_id2node[node["parent"]]["left_child"] = n else: node_id2node[node["parent"]]["right_child"] = n if not write_data: data = [] node["features"] = data dendrogram["nodes"][n] = node for n in node_id2node: if node_id2node[n]["count"] != 1: dendrogram["nodes"][n] = node_id2node[n] return dendrogram def __get_column_dendrogram__(self): root, nodes = hcluster.to_tree(self.cluster_object.column_clustering, rd=True) node_id2node = {} dendrogram = {"nodes":{}} for node in nodes: node_id = node.id if node.count == 1: node_id2node[node_id] = {"count":1, "distance":0} else: node_left_child = node.get_left().id node_right_child = node.get_right().id node_id2node[node_id] = {"count":node.count, "distance":round(node.dist, 3), "left_child": node_left_child, "right_child": node_right_child} for n in node_id2node: node = node_id2node[n] if node["count"] != 1: node_id2node[node["left_child"]]["parent"] = n node_id2node[node["right_child"]]["parent"] = n for n in node_id2node: if not n in dendrogram["nodes"]: dendrogram["nodes"][n] = node_id2node[n] return dendrogram def create_cluster_heatmap(self, compress=False, compressed_value="median", write_data=True): """Creates cluster heatmap representation in inchlib format. By setting compress parameter to True you can cut the dendrogram in a distance to decrease the row size of the heatmap to specified count. When compressing the type of the resulted value of merged rows is given by the compressed_value parameter (median, mean). When the metadata are nominal (text values) the most frequent is the result after compression. By setting write_data to False the data features won't be present in the resulting format.""" self.dendrogram = {"data": self.__get_cluster_heatmap__(write_data)} self.compress = compress self.compressed_value = compressed_value self.compress_cluster_treshold = 0 if self.compress and self.compress >= 0: self.compress_cluster_treshold = self.__get_distance_treshold__(compress) print("Distance treshold for compression:", self.compress_cluster_treshold) if self.compress_cluster_treshold >= 0: self.__compress_data__() else: self.compress = False if self.header and write_data: self.dendrogram["data"]["feature_names"] = [h for h in self.header] elif self.header and not write_data: self.dendrogram["data"]["feature_names"] = [] if self.axis == "both" and len(self.cluster_object.column_clustering): column_dendrogram = hcluster.to_tree(self.cluster_object.column_clustering) self.dendrogram["column_dendrogram"] = self.__get_column_dendrogram__() return def __compress_data__(self): nodes = {} to_remove = set() compressed_value2fnc = { "median": lambda values: [round(numpy.median(value), 3) for value in values], "mean": lambda values: [round(numpy.average(value), 3) for value in values], } for n in self.dendrogram["data"]["nodes"]: node = self.dendrogram["data"]["nodes"][n] if node["count"] == 1: objects = node["objects"] data = node["features"] node_id = n while self.dendrogram["data"]["nodes"][node["parent"]]["distance"] <= self.compress_cluster_treshold: to_remove.add(node_id) node_id = node["parent"] node = self.dendrogram["data"]["nodes"][node_id] if node["count"] != 1: if not "objects" in self.dendrogram["data"]["nodes"][node_id]: self.dendrogram["data"]["nodes"][node_id]["objects"] = [] self.dendrogram["data"]["nodes"][node_id]["features"] = [] self.dendrogram["data"]["nodes"][node_id]["objects"].extend(objects) if data: self.dendrogram["data"]["nodes"][node_id]["features"].append(data) for node in to_remove: self.dendrogram["data"]["nodes"].pop(node) for k in self.dendrogram["data"]["nodes"]: node = self.dendrogram["data"]["nodes"][k] if "objects" in node and node["count"] != 1: self.dendrogram["data"]["nodes"][k]["distance"] = 0 self.dendrogram["data"]["nodes"][k]["count"] = 1 self.dendrogram["data"]["nodes"][k].pop("left_child") self.dendrogram["data"]["nodes"][k].pop("right_child") rows = zip(*self.dendrogram["data"]["nodes"][k]["features"]) self.dendrogram["data"]["nodes"][k]["features"] = compressed_value2fnc[self.compressed_value](rows) self.__adjust_node_counts__() return def __adjust_node_counts__(self): leaves = [] for n in self.dendrogram["data"]["nodes"]: if self.dendrogram["data"]["nodes"][n]["count"] > 1: self.dendrogram["data"]["nodes"][n]["count"] = 0 else: leaves.append(n) for n in leaves: node = self.dendrogram["data"]["nodes"][n] parent_id = node["parent"] while parent_id: node = self.dendrogram["data"]["nodes"][parent_id] self.dendrogram["data"]["nodes"][parent_id]["count"] += 1 parent_id = False if "parent" in node: parent_id = node["parent"] return def __get_distance_treshold__(self, cluster_count): print("Calculating distance treshold for cluster compression...") if cluster_count >= self.tree.count: return -1 i = 0 count = cluster_count + 1 test_step = self.tree.dist/2 while test_step >= 0.1: count = len(set([c for c in hcluster.fcluster(self.clustering, i, "distance")])) if count < cluster_count: if i == 0: return 0 i = i - test_step test_step = test_step/2 elif count == cluster_count: return i else: i += test_step return i+test_step*2 def export_cluster_heatmap_as_json(self, filename=None): """Returns cluster heatmap in a JSON format or exports it to the file specified by the filename parameter.""" dendrogram_json = json.dumps(self.dendrogram, indent=4) if filename: output = open(filename, "w") output.write(dendrogram_json) return dendrogram_json def export_cluster_heatmap_as_html(self, htmldir="."): """Export simple HTML page with embedded cluster heatmap and dependencies to given directory.""" if not os.path.exists(htmldir): os.makedirs(htmldir) dendrogram_json = json.dumps(self.dendrogram, indent=4) template = """<html> <head> <script src="jquery-2.0.3.min.js"></script> <script src="kinetic-v5.0.0.min.js"></script> <script src="inchlib-1.0.1.min.js"></script> <script> $(document).ready(function() {{ var data = {}; var inchlib = new InCHlib({{ target: "inchlib", max_height: 1200, width: 1000, }}); inchlib.read_data(data); inchlib.draw(); }}); </script> </head> <body> <div id="inchlib"></div> </body> </html>""".format(dendrogram_json) lib2url = {"inchlib-1.0.1.min.js": "http://openscreen.cz/software/inchlib/static/js/inchlib-1.0.1.min.js", "jquery-2.0.3.min.js": "http://openscreen.cz/software/inchlib/static/js/jquery-2.0.3.min.js", "kinetic-v5.0.0.min.js": "http://openscreen.cz/software/inchlib/static/js/kinetic-v5.0.0.min.js"} for lib, url in lib2url.items(): try: source = urllib2.urlopen(url) source_html = source.read() with open(os.path.join(htmldir, lib), "w") as output: output.write(source_html) except urllib2.URLError, e: raise Exception("\nCan't download file {}.\nPlease check your internet connection and try again.\nIf the error persists there can be something wrong with the InCHlib server.\n".format(url)) with open(os.path.join(htmdlir, "inchlib.html"), "w") as output: output.write(template) return def add_metadata_from_file(self, metadata_file, delimiter, header=True, metadata_compressed_value="median"): """Adds metadata from csv file. Metadata_compressed_value specifies the resulted value when the data are compressed (median/mean/frequency)""" self.metadata_compressed_value = metadata_compressed_value self.metadata, self.metadata_header = self.__read_metadata_file__(metadata_file, delimiter, header) self.__connect_metadata_to_data__() return def add_metadata(self, metadata, header=True, metadata_compressed_value="median"): """Adds metadata in a form of list of lists (tuples). Metadata_compressed_value specifies the resulted value when the data are compressed (median/mean/frequency)""" self.metadata_compressed_value = metadata_compressed_value self.metadata, self.metadata_header = self.__read_metadata__(metadata, header) self.__connect_metadata_to_data__() return def __connect_metadata_to_data__(self): if len(set(self.metadata.keys()) & set(self.data_names)) == 0: raise Exception("Metadata objects must correspond with original data objects.") if not self.dendrogram: raise Exception("You must create dendrogram before adding metadata.") self.dendrogram["metadata"] = {"nodes":{}} if self.metadata_header: self.dendrogram["metadata"]["feature_names"] = self.metadata_header leaves = {n:node for n, node in self.dendrogram["data"]["nodes"].items() if node["count"] == 1} if not self.compress: for leaf_id, leaf in leaves.items(): try: self.dendrogram["metadata"]["nodes"][leaf_id] = self.metadata[leaf["objects"][0]] except Exception, e: continue else: compressed_value2fnc = { "median": lambda values: round(numpy.median(col), 3), "mean": lambda values: round(numpy.average(col), 3) } for leaf in leaves: objects = [] for item in leaves[leaf]["objects"]: try: objects.append(self.metadata[item]) except Exception, e: continue cols = zip(*objects) row = [] cols = [list(c) for c in cols] for col in cols: if self.metadata_compressed_value in compressed_value2fnc: try: col = [float(c) for c in col] value = compressed_value2fnc[self.metadata_compressed_value](col) except ValueError: freq2val = {col.count(v):v for v in set(col)} value = freq2val[max(freq2val.keys())] elif self.metadata_compressed_value == "frequency": freq2val = {col.count(v):v for v in set(col)} value = freq2val[max(freq2val.keys())] else: raise Exception("Unkown type of metadata_compressed_value: {}. Possible values are: median, mean, frequency.".format(self.metadata_compressed_value)) row.append(value) self.dendrogram["metadata"]["nodes"][leaf] = row return def __read_metadata__(self, metadata, header): metadata_header = [] rows = metadata metadata = {} data_start = 0 if header: metadata_header = rows[0][1:] data_start = 1 for row in rows[data_start:]: metadata[str(row[0])] = [r for r in row[1:]] return metadata, metadata_header def __read_metadata_file__(self, metadata_file, delimiter, header): csv_reader = csv.reader(open(metadata_file, "r"), delimiter=delimiter) metadata_header = [] rows = [row for row in csv_reader] metadata = {} data_start = 0 if header: metadata_header = rows[0][1:] data_start = 1 for row in rows[data_start:]: metadata_id = str(row[0]) metadata[metadata_id] = [r for r in row[1:]] return metadata, metadata_header class Cluster(): """Class for data clustering""" def __init__(self): self.write_original = False def read_csv(self, filename, delimiter=",", header=False): """Reads data from the CSV file""" self.filename = filename csv_reader = csv.reader(open(self.filename, "r"), delimiter=delimiter) rows = [row for row in csv_reader] self.read_data(rows, header) def read_data(self, rows, header=False): """Reads data in a form of list of lists (tuples)""" self.header = header data_start = 0 if self.header: self.header = rows[0][1:] data_start = 1 self.data_names = [str(row[0]) for row in rows[data_start:]] self.data = [[round(float(value), 3) for value in row[1:]] for row in rows[data_start:]] self.original_data = copy.deepcopy(self.data) return def normalize_data(self, feature_range=(0,1), write_original=False): """Normalizes data to a scale from 0 to 1. When write_original is set to True, the normalized data will be clustered, but original data will be written to the heatmap.""" self.write_original = write_original min_max_scaler = preprocessing.MinMaxScaler(feature_range) self.data = min_max_scaler.fit_transform(self.data) self.data = [[round(v, 3) for v in row] for row in self.data] return def cluster_data(self, data_type="numeric", row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"): """Performs clustering according to the given parameters. @data_type - numeric/binary @row_distance/column_distance - see. DISTANCES variable @row_linkage/column_linkage - see. LINKAGES variable @axis - row/both """ print("Clustering rows:", row_distance, row_linkage) self.data_type = data_type self.clustering_axis = axis row_linkage = str(row_linkage) if row_linkage in RAW_LINKAGES: self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance) else: self.distance_vector = fastcluster.pdist(self.data, row_distance) if data_type in DISTANCES and not row_distance in DISTANCES[data_type]: raise Exception("".join(["When clustering" , data_type, "data you must choose from these distance measures: ", ", ".join(DISTANCES[data_type])])) elif not data_type in DISTANCES.keys(): raise Exception("".join(["You can choose only from data types: ", ", ".join(DISTANCES.keys())])) self.clustering = fastcluster.linkage(self.distance_vector, method=str(row_linkage)) self.column_clustering = [] if axis == "both" and len(self.data[0]) > 2: print("Clustering columns:", column_distance, column_linkage) self.__cluster_columns__(column_distance, column_linkage) if self.write_original: self.data = self.original_data return def __cluster_columns__(self, column_distance, column_linkage): columns = zip(*self.data) self.column_clustering = fastcluster.linkage(columns, method=column_linkage, metric=column_distance) self.data_order = hcluster.leaves_list(self.column_clustering) self.data = self.__reorder_data__(self.data, self.data_order) self.original_data = self.__reorder_data__(self.original_data, self.data_order) if self.header: self.header = self.__reorder_data__([self.header], self.data_order)[0] return def __reorder_data__(self, data, order): for i in xrange(len(data)): reordered_data = [] for j in order: reordered_data.append(data[i][j]) reordered_data.reverse() data[i] = reordered_data return data def _process_(arguments): c = Cluster() c.read_csv(arguments.data_file, arguments.data_delimiter, arguments.data_header) if arguments.normalize: c.normalize_data(feature_range=(0,1), write_original=arguments.write_original) c.cluster_data(data_type=arguments.datatype, row_distance=arguments.row_distance, row_linkage=arguments.row_linkage, axis=arguments.axis, column_distance=arguments.column_distance, column_linkage=arguments.column_linkage) d = Dendrogram(c) d.create_cluster_heatmap(compress=arguments.compress, compressed_value=arguments.compressed_value, write_data=not arguments.dont_write_data) if arguments.metadata: d.add_metadata_from_file(metadata_file=arguments.metadata, delimiter=arguments.metadata_delimiter, header=arguments.metadata_header, metadata_compressed_value=arguments.metadata_compressed_value) if arguments.output_file or arguments.html_dir: if arguments.output_file: d.export_cluster_heatmap_as_json(arguments.output_file) else: d.export_cluster_heatmap_as_html(arguments.html_dir) else: print(json.dumps(d.dendrogram, indent=4)) if __name__ == '__main__': parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("data_file", type=str, help="csv(text) data file with delimited values") parser.add_argument("-o", "--output_file", type=str, help="the name of output file") parser.add_argument("-html", "--html_dir", type=str, help="the directory to store HTML page with dependencies") parser.add_argument("-rd", "--row_distance", type=str, default="euclidean", help="set the distance to use for clustering rows") parser.add_argument("-rl", "--row_linkage", type=str, default="ward", help="set the linkage to use for clustering rows") parser.add_argument("-cd", "--column_distance", type=str, default="euclidean", help="set the distance to use for clustering columns (only when clustering by both axis -a parameter)") parser.add_argument("-cl", "--column_linkage", type=str, default="ward", help="set the linkage to use for clustering columns (only when clustering by both axis -a parameter)") parser.add_argument("-a", "--axis", type=str, default="row", help="define clustering axis (row/both)") parser.add_argument("-dt", "--datatype", type=str, default="numeric", help="specify the type of the data (numeric/binary)") parser.add_argument("-dd", "--data_delimiter", type=str, default=",", help="delimiter of values in data file") parser.add_argument("-m", "--metadata", type=str, default=None, help="csv(text) metadata file with delimited values") parser.add_argument("-md", "--metadata_delimiter", type=str, default=",", help="delimiter of values in metadata file") parser.add_argument("-dh", "--data_header", default=False, help="whether the first row of data file is a header", action="store_true") parser.add_argument("-mh", "--metadata_header", default=False, help="whether the first row of metadata file is a header", action="store_true") parser.add_argument("-c", "--compress", type=int, default=0, help="compress the data to contain maximum of specified count of rows") parser.add_argument("-cv", "--compressed_value", type=str, default="median", help="the resulted value from merged rows when the data are compressed (median/mean/frequency)") parser.add_argument("-mcv", "--metadata_compressed_value", type=str, default="median", help="the resulted value from merged rows of metadata when the data are compressed (median/mean/count)") parser.add_argument("-dwd", "--dont_write_data", default=False, help="don't write clustered data to the inchlib data format", action="store_true") parser.add_argument("-n", "--normalize", default=False, help="normalize data to [0, 1] range", action="store_true") parser.add_argument("-wo", "--write_original", default=False, help="cluster normalized data but write the original ones to the heatmap", action="store_true") args = parser.parse_args() _process_(args)