marea: Marea/marea_cluster.py comparison

comparison Marea/marea_cluster.py @ 28:e6831924df01 draft

small fixes (elbow plot and output management)

author	bimib
date	Mon, 14 Oct 2019 05:01:08 -0400
parents	9992eba50cfb
children	944e15aa970a

comparison

equal deleted inserted replaced

-:8c480c977a12
+:e6831924df01
 help = 'min samples for dbscan (optional)')
 parser.add_argument('-ep', '--eps',
 type = int,
 help = 'eps for dbscan (optional)')
+parser.add_argument('-bc', '--best_cluster',
+type = str,
+help = 'output of best cluster tsv')
 args = parser.parse_args()
 return args
 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
 dest = name
 classe.to_csv(dest, sep = '\t', index = False,
 header = ['Patient_ID', 'Class'])
-#list_labels = labels
-#list_values = dataset
-#list_values = list_values.tolist()
-#d = {'Label' : list_labels, 'Value' : list_values}
-#df = pd.DataFrame(d, columns=['Value','Label'])
-#dest = name + '.tsv'
-#df.to_csv(dest, sep = '\t', index = False,
-#                 header = ['Value', 'Label'])
 ########################### trova il massimo in lista ########################
 def max_index (lista):
 best = -1
 best_index = 0
 for i in range(len(lista)):
 return best_index
 ################################ kmeans #####################################
-def kmeans (k_min, k_max, dataset, elbow, silhouette, davies):
+def kmeans (k_min, k_max, dataset, elbow, silhouette, davies, best_cluster):
 if not os.path.exists('clustering'):
 os.makedirs('clustering')
 if elbow == 'true':
 for n_clusters in range_n_clusters:
 clusterer = KMeans(n_clusters=n_clusters, random_state=10)
 cluster_labels = clusterer.fit_predict(dataset)
 all_labels.append(cluster_labels)
-silhouette_avg = silhouette_score(dataset, cluster_labels)
+if n_clusters == 1:
+	silhouette_avg = 0
+else:
+silhouette_avg = silhouette_score(dataset, cluster_labels)
 scores.append(silhouette_avg)
 distortions.append(clusterer.fit(dataset).inertia_)
 best = max_index(scores) + k_min
 prefix = ''
 if (i + k_min == best):
 prefix = '_BEST'
 write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
+if (prefix == '_BEST'):
+labels = all_labels[i]
+predict = [x+1 for x in labels]
+classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
+classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
 if davies:
 with np.errstate(divide='ignore', invalid='ignore'):
 davies_bouldin = davies_bouldin_score(dataset, all_labels[i])
 warning("\nFor n_clusters = " + str(i + k_min) +
 fig.savefig(s, dpi=100)
 ############################## silhouette plot ###############################
 def silihouette_draw(dataset, labels, n_clusters, path):
+if n_clusters == 1:
+return None
 silhouette_avg = silhouette_score(dataset, labels)
 warning("For n_clusters = " + str(n_clusters) +
 " The average silhouette_score is: " + str(silhouette_avg))
 plt.close('all')
 if tmp == None:
 X = X.drop(columns=[i])
 if args.cluster_type == 'kmeans':
-kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies)
+kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies, args.best_cluster)
 if args.cluster_type == 'dbscan':
 dbscan(X, args.eps, args.min_samples)
 if args.cluster_type == 'hierarchy':

Mercurial > repos > bimib > marea

comparison Marea/marea_cluster.py @ 28:e6831924df01 draft