Mercurial > repos > bimib > marea
view Marea/marea_cluster.py @ 19:a7d52de8f7eb draft
Fix on clustering output
author | bimib |
---|---|
date | Wed, 02 Oct 2019 08:22:17 -0400 |
parents | c71ac0bb12de |
children | a8825e66c3a0 |
line wrap: on
line source
# -*- coding: utf-8 -*- """ Created on Mon Jun 3 19:51:00 2019 @author: Narger """ import sys import argparse import os from sklearn.datasets import make_blobs from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, cluster import matplotlib.pyplot as plt import scipy.cluster.hierarchy as shc import matplotlib.cm as cm import numpy as np import pandas as pd ################################# process args ############################### def process_args(args): parser = argparse.ArgumentParser(usage = '%(prog)s [options]', description = 'process some value\'s' + ' genes to create class.') parser.add_argument('-ol', '--out_log', help = "Output log") parser.add_argument('-in', '--input', type = str, help = 'input dataset') parser.add_argument('-cy', '--cluster_type', type = str, choices = ['kmeans', 'meanshift', 'dbscan', 'hierarchy'], default = 'kmeans', help = 'choose clustering algorythm') parser.add_argument('-k1', '--k_min', type = int, default = 2, help = 'choose minimun cluster number to be generated') parser.add_argument('-k2', '--k_max', type = int, default = 7, help = 'choose maximum cluster number to be generated') parser.add_argument('-el', '--elbow', type = str, default = 'false', choices = ['true', 'false'], help = 'choose if you want to generate an elbow plot for kmeans') parser.add_argument('-si', '--silhouette', type = str, default = 'false', choices = ['true', 'false'], help = 'choose if you want silhouette plots') parser.add_argument('-db', '--davies', type = str, default = 'false', choices = ['true', 'false'], help = 'choose if you want davies bouldin scores') parser.add_argument('-td', '--tool_dir', type = str, required = True, help = 'your tool directory') parser.add_argument('-ms', '--min_samples', type = int, help = 'min samples for dbscan (optional)') parser.add_argument('-ep', '--eps', type = int, help = 'eps for dbscan (optional)') args = parser.parse_args() return args ########################### warning ########################################### def warning(s): args = process_args(sys.argv) with open(args.out_log, 'a') as log: log.write(s + "\n\n") print(s) ########################## read dataset ###################################### def read_dataset(dataset): try: dataset = pd.read_csv(dataset, sep = '\t', header = 0) except pd.errors.EmptyDataError: sys.exit('Execution aborted: wrong format of dataset\n') if len(dataset.columns) < 2: sys.exit('Execution aborted: wrong format of dataset\n') return dataset ############################ rewrite_input ################################### def rewrite_input(dataset): #Riscrivo il dataset come dizionario di liste, #non come dizionario di dizionari for key, val in dataset.items(): l = [] for i in val: if i == 'None': l.append(None) else: l.append(float(i)) dataset[key] = l return dataset ############################## write to csv ################################## def write_to_csv (dataset, labels, name): list_labels = labels list_values = dataset list_values = list_values.tolist() d = {'Label' : list_labels, 'Value' : list_values} df = pd.DataFrame(d, columns=['Value','Label']) dest = name + '.tsv' df.to_csv(dest, sep = '\t', index = False, header = ['Value', 'Label']) ########################### trova il massimo in lista ######################## def max_index (lista): best = -1 best_index = 0 for i in range(len(lista)): if lista[i] > best: best = lista [i] best_index = i return best_index ################################ kmeans ##################################### def kmeans (k_min, k_max, dataset, elbow, silhouette, davies): if not os.path.exists('clustering/kmeans_output'): os.makedirs('clustering/kmeans_output') if elbow == 'true': elbow = True else: elbow = False if silhouette == 'true': silhouette = True else: silhouette = False if davies == 'true': davies = True else: davies = False range_n_clusters = [i for i in range(k_min, k_max+1)] distortions = [] scores = [] all_labels = [] for n_clusters in range_n_clusters: clusterer = KMeans(n_clusters=n_clusters, random_state=10) cluster_labels = clusterer.fit_predict(dataset) all_labels.append(cluster_labels) silhouette_avg = silhouette_score(dataset, cluster_labels) scores.append(silhouette_avg) distortions.append(clusterer.fit(dataset).inertia_) best = max_index(scores) + k_min for i in range(len(all_labels)): prefix = '' if (i + k_min == best): prefix = '_BEST' write_to_csv(dataset, all_labels[i], 'clustering/kmeans_output/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv') if davies: with np.errstate(divide='ignore', invalid='ignore'): davies_bouldin = davies_bouldin_score(dataset, all_labels[i]) warning("\nFor n_clusters = " + str(i + k_min) + " The average davies bouldin score is: " + str(davies_bouldin)) if silhouette: silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/kmeans_output/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') if elbow: elbow_plot(distortions, k_min,k_max) ############################## elbow_plot #################################### def elbow_plot (distortions, k_min, k_max): plt.figure(0) plt.plot(range(k_min, k_max+1), distortions, marker = 'o') plt.xlabel('Number of cluster') plt.ylabel('Distortion') s = 'clustering/kmeans_output/elbow_plot.png' fig = plt.gcf() fig.set_size_inches(18.5, 10.5, forward = True) fig.savefig(s, dpi=100) ############################## silhouette plot ############################### def silihouette_draw(dataset, labels, n_clusters, path): silhouette_avg = silhouette_score(dataset, labels) warning("For n_clusters = " + str(n_clusters) + " The average silhouette_score is: " + str(silhouette_avg)) plt.close('all') # Create a subplot with 1 row and 2 columns fig, (ax1) = plt.subplots(1, 1) fig.set_size_inches(18, 7) # The 1st subplot is the silhouette plot # The silhouette coefficient can range from -1, 1 but in this example all # lie within [-0.1, 1] ax1.set_xlim([-1, 1]) # The (n_clusters+1)*10 is for inserting blank space between silhouette # plots of individual clusters, to demarcate them clearly. ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10]) # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(dataset, labels) y_lower = 10 for i in range(n_clusters): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = \ sample_silhouette_values[labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = cm.nipy_spectral(float(i) / n_clusters) ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7) # Label the silhouette plots with their cluster numbers at the middle ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) # Compute the new y_lower for next plot y_lower = y_upper + 10 # 10 for the 0 samples ax1.set_title("The silhouette plot for the various clusters.") ax1.set_xlabel("The silhouette coefficient values") ax1.set_ylabel("Cluster label") # The vertical line for average silhouette score of all the values ax1.axvline(x=silhouette_avg, color="red", linestyle="--") ax1.set_yticks([]) # Clear the yaxis labels / ticks ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) plt.suptitle(("Silhouette analysis for clustering on sample data " "with n_clusters = " + str(n_clusters) + "\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight='bold') plt.savefig(path, bbox_inches='tight') ######################## dbscan ############################################## def dbscan(dataset, eps, min_samples): if not os.path.exists('clustering/dbscan_output'): os.makedirs('clustering/dbscan_output') if eps is not None: clusterer = DBSCAN(eps = eps, min_samples = min_samples) else: clusterer = DBSCAN() clustering = clusterer.fit(dataset) core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool) core_samples_mask[clustering.core_sample_indices_] = True labels = clustering.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) silhouette_avg = silhouette_score(dataset, labels) warning("For n_clusters =" + str(n_clusters_) + "The average silhouette_score is :" + str(silhouette_avg)) ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL # Black removed and is used for noise instead. unique_labels = set(labels) colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1] class_member_mask = (labels == k) xy = dataset[class_member_mask & core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=14) xy = dataset[class_member_mask & ~core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6) plt.title('Estimated number of clusters: %d' % n_clusters_) s = 'clustering/dbscan_output/dbscan_plot.png' fig = plt.gcf() fig.set_size_inches(18.5, 10.5, forward = True) fig.savefig(s, dpi=100) write_to_csv(dataset, labels, 'clustering/dbscan_output/dbscan_results.tsv') ########################## hierachical ####################################### def hierachical_agglomerative(dataset, k_min, k_max): if not os.path.exists('clustering/agglomerative_output'): os.makedirs('clustering/agglomerative_output') plt.figure(figsize=(10, 7)) plt.title("Customer Dendograms") shc.dendrogram(shc.linkage(dataset, method='ward')) fig = plt.gcf() fig.savefig('clustering/agglomerative_output/dendogram.png', dpi=200) range_n_clusters = [i for i in range(k_min, k_max+1)] for n_clusters in range_n_clusters: cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') cluster.fit_predict(dataset) cluster_labels = cluster.labels_ silhouette_avg = silhouette_score(dataset, cluster_labels) warning("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg) plt.clf() plt.figure(figsize=(10, 7)) plt.title("Agglomerative Hierarchical Clustering\nwith " + str(n_clusters) + " clusters and " + str(silhouette_avg) + " silhouette score") plt.scatter(dataset[:,0], dataset[:,1], c = cluster_labels, cmap='rainbow') s = 'clustering/agglomerative_output/hierachical_' + str(n_clusters) + '_clusters.png' fig = plt.gcf() fig.set_size_inches(10, 7, forward = True) fig.savefig(s, dpi=200) write_to_csv(dataset, cluster_labels, 'clustering/agglomerative_output/agglomerative_hierarchical_with_' + str(n_clusters) + '_clusters.tsv') ############################# main ########################################### def main(): if not os.path.exists('clustering'): os.makedirs('clustering') args = process_args(sys.argv) #Data read X = read_dataset(args.input) X = pd.DataFrame.to_dict(X, orient='list') X = rewrite_input(X) X = pd.DataFrame.from_dict(X, orient = 'index') for i in X.columns: tmp = X[i][0] if tmp == None: X = X.drop(columns=[i]) X = pd.DataFrame.to_numpy(X) if args.cluster_type == 'kmeans': kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies) if args.cluster_type == 'dbscan': dbscan(X, args.eps, args.min_samples) if args.cluster_type == 'hierarchy': hierachical_agglomerative(X, args.k_min, args.k_max) ############################################################################## if __name__ == "__main__": main()