Mercurial > repos > bimib > marea
comparison Marea/marea_cluster.py @ 42:b3f9e13bf15f draft
Uploaded
author | bimib |
---|---|
date | Tue, 03 Dec 2019 12:34:30 -0500 |
parents | 2a082b4aed02 |
children |
comparison
equal
deleted
inserted
replaced
41:9e02d127887a | 42:b3f9e13bf15f |
---|---|
7 import sys | 7 import sys |
8 import argparse | 8 import argparse |
9 import os | 9 import os |
10 from sklearn.datasets import make_blobs | 10 from sklearn.datasets import make_blobs |
11 from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering | 11 from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering |
12 from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, cluster | 12 from sklearn.metrics import silhouette_samples, silhouette_score, cluster |
13 import matplotlib | 13 import matplotlib |
14 matplotlib.use('agg') | 14 matplotlib.use('agg') |
15 import matplotlib.pyplot as plt | 15 import matplotlib.pyplot as plt |
16 import scipy.cluster.hierarchy as shc | 16 import scipy.cluster.hierarchy as shc |
17 import matplotlib.cm as cm | 17 import matplotlib.cm as cm |
326 | 326 |
327 if not os.path.exists('clustering'): | 327 if not os.path.exists('clustering'): |
328 os.makedirs('clustering') | 328 os.makedirs('clustering') |
329 | 329 |
330 plt.figure(figsize=(10, 7)) | 330 plt.figure(figsize=(10, 7)) |
331 plt.title("Classes Dendogram") | 331 plt.title("Customer Dendograms") |
332 shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist()) | 332 shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist()) |
333 fig = plt.gcf() | 333 fig = plt.gcf() |
334 fig.savefig('clustering/dendogram.png', dpi=200) | 334 fig.savefig('clustering/dendogram.png', dpi=200) |
335 | 335 |
336 range_n_clusters = [i for i in range(k_min, k_max+1)] | 336 range_n_clusters = [i for i in range(k_min, k_max+1)] |
337 | 337 |
338 scores = [] | 338 scores = [] |
339 labels = [] | 339 labels = [] |
340 | 340 |
341 for n_clusters in range_n_clusters: | 341 n_classi = dataset.shape[0] |
| | 342 |
| | 343 for n_clusters in range_n_clusters: |
342 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') | 344 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') |
343 cluster.fit_predict(dataset) | 345 cluster.fit_predict(dataset) |
344 cluster_labels = cluster.labels_ | 346 cluster_labels = cluster.labels_ |
345 labels.append(cluster_labels) | 347 labels.append(cluster_labels) |
346 write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv') | 348 write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv') |
347 | 349 |
348 best = max_index(scores) + k_min | 350 best = max_index(scores) + k_min |
349 | 351 |
350 for i in range(len(labels)): | 352 for i in range(len(labels)): |
351 prefix = '' | 353 prefix = '' |
352 if (i + k_min == best): | 354 if (i + k_min == best): |
380 | 382 |
381 for i in X.columns: | 383 for i in X.columns: |
382 tmp = X[i][0] | 384 tmp = X[i][0] |
383 if tmp == None: | 385 if tmp == None: |
384 X = X.drop(columns=[i]) | 386 X = X.drop(columns=[i]) |
| | 387 |
| | 388 if args.k_max != None: |
| | 389 numero_classi = X.shape[0] |
| | 390 while args.k_max >= numero_classi: |
| | 391 err = 'Skipping k = ' + str(args.k_max) + ' since it is >= number of classes of dataset' |
| | 392 warning(err) |
| | 393 args.k_max = args.k_max - 1 |
385 | 394 |
386 | 395 |
387 if args.cluster_type == 'kmeans': | 396 if args.cluster_type == 'kmeans': |
388 kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster) | 397 kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster) |
389 | 398 |