# HG changeset patch
# User bimib
# Date 1570019033 14400
# Node ID a8825e66c3a05f6801e15382a5a3d4f66cdd459f
# Parent  ac70e60d57899cc3a40136922530f8cf66b2736b
Uploaded

diff -r ac70e60d5789 -r a8825e66c3a0 Marea/marea_cluster.py
--- a/Marea/marea_cluster.py	Wed Oct 02 08:23:29 2019 -0400
+++ b/Marea/marea_cluster.py	Wed Oct 02 08:23:53 2019 -0400
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 """
 Created on Mon Jun 3 19:51:00 2019
-
 @author: Narger
 """
 
@@ -122,17 +121,27 @@
 
 ############################## write to csv ##################################
 def write_to_csv (dataset, labels, name):
-    list_labels = labels
-    list_values = dataset
+    #labels = predict
+    predict = [x+1 for x in labels]
+
+    classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
 
-    list_values = list_values.tolist()
-    d = {'Label' : list_labels, 'Value' : list_values}
+    dest = name
+    classe.to_csv(dest, sep = '\t', index = False,
+                      header = ['Patient_ID', 'Class'])
 
-    df = pd.DataFrame(d, columns=['Value','Label'])
+
+    #list_labels = labels
+    #list_values = dataset
 
-    dest = name + '.tsv'
-    df.to_csv(dest, sep = '\t', index = False,
-              header = ['Value', 'Label'])
+    #list_values = list_values.tolist()
+    #d = {'Label' : list_labels, 'Value' : list_values}
+
+    #df = pd.DataFrame(d, columns=['Value','Label'])
+
+    #dest = name + '.tsv'
+    #df.to_csv(dest, sep = '\t', index = False,
+    #          header = ['Value', 'Label'])
 
 ########################### trova il massimo in lista ########################
 def max_index (lista):
@@ -148,8 +157,8 @@
 
 ################################ kmeans #####################################
 def kmeans (k_min, k_max, dataset, elbow, silhouette, davies):
-    if not os.path.exists('clustering/kmeans_output'):
-        os.makedirs('clustering/kmeans_output')
+    if not os.path.exists('clustering'):
+        os.makedirs('clustering')
 
 
     if elbow == 'true':
@@ -189,7 +198,7 @@
         if (i + k_min == best):
             prefix = '_BEST'
 
-        write_to_csv(dataset, all_labels[i], 'clustering/kmeans_output/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
+        write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
 
         if davies:
             with np.errstate(divide='ignore', invalid='ignore'):
@@ -199,7 +208,7 @@
 
 
         if silhouette:
-            silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/kmeans_output/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
+            silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
 
 
     if elbow:
@@ -216,7 +225,7 @@
         plt.plot(range(k_min, k_max+1), distortions, marker = 'o')
         plt.xlabel('Number of cluster')
         plt.ylabel('Distortion')
-        s = 'clustering/kmeans_output/elbow_plot.png'
+        s = 'clustering/elbow_plot.png'
         fig = plt.gcf()
         fig.set_size_inches(18.5, 10.5, forward = True)
         fig.savefig(s, dpi=100)
@@ -288,8 +297,8 @@
 
 ######################## dbscan ##############################################
 def dbscan(dataset, eps, min_samples):
-    if not os.path.exists('clustering/dbscan_output'):
-        os.makedirs('clustering/dbscan_output')
+    if not os.path.exists('clustering'):
+        os.makedirs('clustering')
 
     if eps is not None:
         clusterer = DBSCAN(eps = eps, min_samples = min_samples)
@@ -305,52 +314,24 @@
     # Number of clusters in labels, ignoring noise if present.
     n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
 
 
-    silhouette_avg = silhouette_score(dataset, labels)
-    warning("For n_clusters =" + str(n_clusters_) +
-            "The average silhouette_score is :" + str(silhouette_avg))
 
 
     ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL
-
-    # Black removed and is used for noise instead.
-    unique_labels = set(labels)
-    colors = [plt.cm.Spectral(each)
-              for each in np.linspace(0, 1, len(unique_labels))]
-    for k, col in zip(unique_labels, colors):
-        if k == -1:
-            # Black used for noise.
-            col = [0, 0, 0, 1]
-
-        class_member_mask = (labels == k)
-
-        xy = dataset[class_member_mask & core_samples_mask]
-        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
-                 markeredgecolor='k', markersize=14)
-
-        xy = dataset[class_member_mask & ~core_samples_mask]
-        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
-                 markeredgecolor='k', markersize=6)
-
-    plt.title('Estimated number of clusters: %d' % n_clusters_)
-    s = 'clustering/dbscan_output/dbscan_plot.png'
-    fig = plt.gcf()
-    fig.set_size_inches(18.5, 10.5, forward = True)
-    fig.savefig(s, dpi=100)
 
-    write_to_csv(dataset, labels, 'clustering/dbscan_output/dbscan_results.tsv')
+    write_to_csv(dataset, labels, 'clustering/dbscan_results.tsv')
 
 
 
 ########################## hierachical #######################################
 def hierachical_agglomerative(dataset, k_min, k_max):
-    if not os.path.exists('clustering/agglomerative_output'):
-        os.makedirs('clustering/agglomerative_output')
+    if not os.path.exists('clustering'):
+        os.makedirs('clustering')
 
     plt.figure(figsize=(10, 7))
     plt.title("Customer Dendograms")
     shc.dendrogram(shc.linkage(dataset, method='ward'))
     fig = plt.gcf()
-    fig.savefig('clustering/agglomerative_output/dendogram.png', dpi=200)
+    fig.savefig('clustering/dendogram.png', dpi=200)
 
     range_n_clusters = [i for i in range(k_min, k_max+1)]
@@ -361,19 +342,10 @@
         cluster_labels = cluster.labels_
 
         silhouette_avg = silhouette_score(dataset, cluster_labels)
-        warning("For n_clusters =", n_clusters,
-                "The average silhouette_score is :", silhouette_avg)
+        write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
+        #warning("For n_clusters =", n_clusters,
+        #"The average silhouette_score is :", silhouette_avg)
 
-        plt.clf()
-        plt.figure(figsize=(10, 7))
-        plt.title("Agglomerative Hierarchical Clustering\nwith " + str(n_clusters) + " clusters and " + str(silhouette_avg) + " silhouette score")
-        plt.scatter(dataset[:,0], dataset[:,1], c = cluster_labels, cmap='rainbow')
-        s = 'clustering/agglomerative_output/hierachical_' + str(n_clusters) + '_clusters.png'
-        fig = plt.gcf()
-        fig.set_size_inches(10, 7, forward = True)
-        fig.savefig(s, dpi=200)
-
-        write_to_csv(dataset, cluster_labels, 'clustering/agglomerative_output/agglomerative_hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
 
 
 
@@ -398,8 +370,6 @@
             tmp = X[i][0]
             if tmp == None:
                 X = X.drop(columns=[i])
-
-    X = pd.DataFrame.to_numpy(X)
 
 
     if args.cluster_type == 'kmeans':
@@ -414,4 +384,4 @@
 
 ##############################################################################
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
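
Below is a minimal, hypothetical sketch (not part of the patch) of what the reworked write_to_csv now produces: a tab-separated file with a Patient_ID column taken from the DataFrame index and a 1-based Class column derived from the cluster labels. The toy DataFrame, label list and output filename are illustrative assumptions; only the predict / classe / to_csv lines mirror the patched code.

import os
import pandas as pd

# Hypothetical inputs: a DataFrame indexed by patient ID and 0-based cluster
# labels (e.g. KMeans.labels_); these example values are not from the patch.
dataset = pd.DataFrame({'reaction_score': [0.1, 0.9, 0.4]},
                       index=['Patient_1', 'Patient_2', 'Patient_3'])
labels = [0, 1, 0]

os.makedirs('clustering', exist_ok=True)   # single output folder, as in the patch

predict = [x + 1 for x in labels]          # shift labels so classes start at 1
classe = pd.DataFrame(list(zip(dataset.index, predict))).astype(str)
classe.to_csv('clustering/kmeans_with_2_clusters.tsv', sep='\t', index=False,
              header=['Patient_ID', 'Class'])   # two-column TSV: Patient_ID, Class

The same writer is now shared by the k-means, DBSCAN and hierarchical branches, which is why every result file lands directly in the single clustering/ directory instead of the former kmeans_output, dbscan_output and agglomerative_output subfolders.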