comparison Marea/marea_cluster.py @ 42:b3f9e13bf15f draft

Uploaded
author bimib
date Tue, 03 Dec 2019 12:34:30 -0500
parents 2a082b4aed02
children
comparison
equal deleted inserted replaced
41:9e02d127887a 42:b3f9e13bf15f
7 import sys 7 import sys
8 import argparse 8 import argparse
9 import os 9 import os
10 from sklearn.datasets import make_blobs 10 from sklearn.datasets import make_blobs
11 from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering 11 from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
12 from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, cluster 12 from sklearn.metrics import silhouette_samples, silhouette_score, cluster
13 import matplotlib 13 import matplotlib
14 matplotlib.use('agg') 14 matplotlib.use('agg')
15 import matplotlib.pyplot as plt 15 import matplotlib.pyplot as plt
16 import scipy.cluster.hierarchy as shc 16 import scipy.cluster.hierarchy as shc
17 import matplotlib.cm as cm 17 import matplotlib.cm as cm
326 326
327 if not os.path.exists('clustering'): 327 if not os.path.exists('clustering'):
328 os.makedirs('clustering') 328 os.makedirs('clustering')
329 329
330 plt.figure(figsize=(10, 7)) 330 plt.figure(figsize=(10, 7))
331 plt.title("Classes Dendogram") 331 plt.title("Customer Dendograms")
332 shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist()) 332 shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist())
333 fig = plt.gcf() 333 fig = plt.gcf()
334 fig.savefig('clustering/dendogram.png', dpi=200) 334 fig.savefig('clustering/dendogram.png', dpi=200)
335 335
336 range_n_clusters = [i for i in range(k_min, k_max+1)] 336 range_n_clusters = [i for i in range(k_min, k_max+1)]
337 337
338 scores = [] 338 scores = []
339 labels = [] 339 labels = []
340 340
341 for n_clusters in range_n_clusters: 341 n_classi = dataset.shape[0]
342
343 for n_clusters in range_n_clusters:
342 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') 344 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
343 cluster.fit_predict(dataset) 345 cluster.fit_predict(dataset)
344 cluster_labels = cluster.labels_ 346 cluster_labels = cluster.labels_
345 labels.append(cluster_labels) 347 labels.append(cluster_labels)
346 write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv') 348 write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
347 349
348 best = max_index(scores) + k_min 350 best = max_index(scores) + k_min
349 351
350 for i in range(len(labels)): 352 for i in range(len(labels)):
351 prefix = '' 353 prefix = ''
352 if (i + k_min == best): 354 if (i + k_min == best):
380 382
381 for i in X.columns: 383 for i in X.columns:
382 tmp = X[i][0] 384 tmp = X[i][0]
383 if tmp == None: 385 if tmp == None:
384 X = X.drop(columns=[i]) 386 X = X.drop(columns=[i])
387
388 if args.k_max != None:
389 numero_classi = X.shape[0]
390 while args.k_max >= numero_classi:
391 err = 'Skipping k = ' + str(args.k_max) + ' since it is >= number of classes of dataset'
392 warning(err)
393 args.k_max = args.k_max - 1
385 394
386 395
387 if args.cluster_type == 'kmeans': 396 if args.cluster_type == 'kmeans':
388 kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster) 397 kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster)
389 398