diff Marea/marea_cluster.py @ 23:a8825e66c3a0 draft

Uploaded
author bimib
date Wed, 02 Oct 2019 08:23:53 -0400
parents c71ac0bb12de
children 9992eba50cfb
line wrap: on
line diff
--- a/Marea/marea_cluster.py	Wed Oct 02 08:23:29 2019 -0400
+++ b/Marea/marea_cluster.py	Wed Oct 02 08:23:53 2019 -0400
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 """
 Created on Mon Jun 3 19:51:00 2019
-
 @author: Narger
 """
 
@@ -122,17 +121,27 @@
 ############################## write to csv ##################################
     
 def write_to_csv (dataset, labels, name):
-    list_labels = labels
-    list_values = dataset
+    #labels = predict
+    predict = [x+1 for x in labels]
+  
+    classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
 
-    list_values = list_values.tolist()
-    d = {'Label' : list_labels, 'Value' : list_values}
+    dest = name
+    classe.to_csv(dest, sep = '\t', index = False,
+                      header = ['Patient_ID', 'Class'])
     
-    df = pd.DataFrame(d, columns=['Value','Label'])
+
+      #list_labels = labels
+    #list_values = dataset
 
-    dest = name + '.tsv'
-    df.to_csv(dest, sep = '\t', index = False,
-                      header = ['Value', 'Label'])
+    #list_values = list_values.tolist()
+    #d = {'Label' : list_labels, 'Value' : list_values}
+    
+    #df = pd.DataFrame(d, columns=['Value','Label'])
+
+    #dest = name + '.tsv'
+    #df.to_csv(dest, sep = '\t', index = False,
+     #                 header = ['Value', 'Label'])
     
 ########################### trova il massimo in lista ########################
 def max_index (lista):
@@ -148,8 +157,8 @@
 ################################ kmeans #####################################
     
 def kmeans (k_min, k_max, dataset, elbow, silhouette, davies):
-    if not os.path.exists('clustering/kmeans_output'):
-        os.makedirs('clustering/kmeans_output')
+    if not os.path.exists('clustering'):
+        os.makedirs('clustering')
     
         
     if elbow == 'true':
@@ -189,7 +198,7 @@
         if (i + k_min == best):
             prefix = '_BEST'
             
-        write_to_csv(dataset, all_labels[i], 'clustering/kmeans_output/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
+        write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
             
         if davies:
             with np.errstate(divide='ignore', invalid='ignore'):
@@ -199,7 +208,7 @@
         
        
         if silhouette:
-            silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/kmeans_output/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
+            silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
         
         
     if elbow:
@@ -216,7 +225,7 @@
     plt.plot(range(k_min, k_max+1), distortions, marker = 'o')
     plt.xlabel('Number of cluster')
     plt.ylabel('Distortion')
-    s = 'clustering/kmeans_output/elbow_plot.png'
+    s = 'clustering/elbow_plot.png'
     fig = plt.gcf()
     fig.set_size_inches(18.5, 10.5, forward = True)
     fig.savefig(s, dpi=100)
@@ -288,8 +297,8 @@
 ######################## dbscan ##############################################
     
 def dbscan(dataset, eps, min_samples):
-    if not os.path.exists('clustering/dbscan_output'):
-        os.makedirs('clustering/dbscan_output')
+    if not os.path.exists('clustering'):
+        os.makedirs('clustering')
         
     if eps is not None:
     	clusterer = DBSCAN(eps = eps, min_samples = min_samples)
@@ -305,52 +314,24 @@
     # Number of clusters in labels, ignoring noise if present.
     n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
     
-    silhouette_avg = silhouette_score(dataset, labels)
-    warning("For n_clusters =" + str(n_clusters_) + 
-              "The average silhouette_score is :" + str(silhouette_avg))
     
     ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL
-
-    # Black removed and is used for noise instead.
-    unique_labels = set(labels)
-    colors = [plt.cm.Spectral(each)
-          for each in np.linspace(0, 1, len(unique_labels))]
-    for k, col in zip(unique_labels, colors):
-        if k == -1:
-            # Black used for noise.
-            col = [0, 0, 0, 1]
-
-        class_member_mask = (labels == k)
-    
-        xy = dataset[class_member_mask & core_samples_mask]
-        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
-                 markeredgecolor='k', markersize=14)
-    
-        xy = dataset[class_member_mask & ~core_samples_mask]
-        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
-                 markeredgecolor='k', markersize=6)
-
-    plt.title('Estimated number of clusters: %d' % n_clusters_)
-    s = 'clustering/dbscan_output/dbscan_plot.png'
-    fig = plt.gcf()
-    fig.set_size_inches(18.5, 10.5, forward = True)
-    fig.savefig(s, dpi=100)
     
     
-    write_to_csv(dataset, labels, 'clustering/dbscan_output/dbscan_results.tsv')
+    write_to_csv(dataset, labels, 'clustering/dbscan_results.tsv')
     
 ########################## hierachical #######################################
     
 def hierachical_agglomerative(dataset, k_min, k_max):
 
-    if not os.path.exists('clustering/agglomerative_output'):
-        os.makedirs('clustering/agglomerative_output')
+    if not os.path.exists('clustering'):
+        os.makedirs('clustering')
     
     plt.figure(figsize=(10, 7))  
     plt.title("Customer Dendograms")  
     shc.dendrogram(shc.linkage(dataset, method='ward'))  
     fig = plt.gcf()
-    fig.savefig('clustering/agglomerative_output/dendogram.png', dpi=200)
+    fig.savefig('clustering/dendogram.png', dpi=200)
     
     range_n_clusters = [i for i in range(k_min, k_max+1)]
 
@@ -361,19 +342,10 @@
         cluster_labels = cluster.labels_
         
         silhouette_avg = silhouette_score(dataset, cluster_labels)
-        warning("For n_clusters =", n_clusters,
-              "The average silhouette_score is :", silhouette_avg)
+        write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
+        #warning("For n_clusters =", n_clusters,
+              #"The average silhouette_score is :", silhouette_avg)
         
-        plt.clf()
-        plt.figure(figsize=(10, 7))  
-        plt.title("Agglomerative Hierarchical Clustering\nwith " + str(n_clusters) + " clusters and " + str(silhouette_avg) + " silhouette score")
-        plt.scatter(dataset[:,0], dataset[:,1], c = cluster_labels, cmap='rainbow') 
-        s = 'clustering/agglomerative_output/hierachical_' + str(n_clusters) + '_clusters.png'
-        fig = plt.gcf()
-        fig.set_size_inches(10, 7, forward = True)
-        fig.savefig(s, dpi=200)
-        
-        write_to_csv(dataset, cluster_labels, 'clustering/agglomerative_output/agglomerative_hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
         
        
 
@@ -398,8 +370,6 @@
         tmp = X[i][0]
         if tmp == None:
             X = X.drop(columns=[i])
-                
-    X = pd.DataFrame.to_numpy(X)
     
     
     if args.cluster_type == 'kmeans':
@@ -414,4 +384,4 @@
 ##############################################################################
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file