annotate COBRAxy/src/marea_cluster.py @ 546:01147e83f43c draft default tip

Uploaded
author luca_milaz
date Mon, 27 Oct 2025 12:33:08 +0000
parents 5d5583dc6082
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
539
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
1 # -*- coding: utf-8 -*-
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
2 """
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
3 Created on Mon Jun 3 19:51:00 2019
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
4 @author: Narger
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
5 """
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
6
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
7 import sys
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
8 import argparse
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
9 import os
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
10 import numpy as np
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
11 import pandas as pd
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
12 from sklearn.datasets import make_blobs
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
13 from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
14 from sklearn.metrics import silhouette_samples, silhouette_score, cluster
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
15 import matplotlib
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
16 matplotlib.use('agg')
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
17 import matplotlib.pyplot as plt
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
18 import scipy.cluster.hierarchy as shc
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
19 import matplotlib.cm as cm
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
20 from typing import Optional, Dict, List
543
5d5583dc6082 Uploaded
francesco_lapi
parents: 542
diff changeset
21 import os
539
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
22
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
23 ################################# process args ###############################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def process_args(args_in :Optional[List[str]] = None) -> argparse.Namespace:
    """
    Build the command-line parser for the clustering tool and parse the arguments.

    Args:
        args_in (list, optional): Argument strings to parse. When None,
            argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        Namespace: An object containing parsed arguments.
    """
    parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
                                     description = 'process some value\'s' +
                                     ' genes to create class.')

    # No default: the attribute is None when the flag is omitted.
    parser.add_argument('-ol', '--out_log',
                        help = "Output log")

    parser.add_argument('-in', '--input',
                        type = str,
                        help = 'input dataset')

    parser.add_argument('-cy', '--cluster_type',
                        type = str,
                        choices = ['kmeans', 'dbscan', 'hierarchy'],
                        default = 'kmeans',
                        help = 'choose clustering algorythm')

    # Boolean-like switches are passed as the strings 'true' / 'false'.
    parser.add_argument('-sc', '--scaling',
                        type = str,
                        choices = ['true', 'false'],
                        default = 'true',
                        help = 'choose if you want to scaling the data')

    parser.add_argument('-k1', '--k_min',
                        type = int,
                        default = 2,
                        help = 'choose minimun cluster number to be generated')

    parser.add_argument('-k2', '--k_max',
                        type = int,
                        default = 7,
                        help = 'choose maximum cluster number to be generated')

    parser.add_argument('-el', '--elbow',
                        type = str,
                        default = 'false',
                        choices = ['true', 'false'],
                        help = 'choose if you want to generate an elbow plot for kmeans')

    parser.add_argument('-si', '--silhouette',
                        type = str,
                        default = 'false',
                        choices = ['true', 'false'],
                        help = 'choose if you want silhouette plots')

    # Defaults to the directory that contains this module.
    parser.add_argument('-td', '--tool_dir',
                        type = str,
                        default = os.path.dirname(os.path.abspath(__file__)),
                        help = 'your tool directory (default: auto-detected package location)')

    parser.add_argument('-ms', '--min_samples',
                        type = int,
                        help = 'min samples for dbscan (optional)')

    parser.add_argument('-ep', '--eps',
                        type = float,
                        help = 'eps for dbscan (optional)')

    parser.add_argument('-bc', '--best_cluster',
                        type = str,
                        help = 'output of best cluster tsv')

    parser.add_argument(
        '-idop', '--output_path',
        type = str,
        default='clustering/',
        help = 'output path for maps')

    return parser.parse_args(args_in)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
104
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
105 ########################### warning ###########################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def warning(s :str) -> None:
    """
    Append a warning message to the output log file and print it to the console.

    The log path is read from ``args.out_log``. ``args`` is a module-level name
    that is not defined in this function; presumably it is the namespace
    returned by ``process_args`` (confirm against the script entry point).

    Args:
        s (str): The warning message to be logged and printed.

    Returns:
        None
    """
    # '--out_log' has no default, so the path may be None when the flag is
    # omitted; in that case skip the file and only print the message.
    log_path = args.out_log
    if log_path:
        with open(log_path, 'a') as log:
            log.write(s + "\n\n")
    print(s)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
120
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
121 ########################## read dataset ######################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def read_dataset(dataset :str) -> pd.DataFrame:
    """
    Read a tab-separated dataset file and return it as a Pandas DataFrame.

    The first line of the file is used as the header.

    Args:
        dataset (str): the path to the dataset to convert into a DataFrame

    Returns:
        pandas.DataFrame: The dataset loaded as a Pandas DataFrame.

    Raises:
        SystemExit: If the file is empty, cannot be parsed, or has fewer
            than 2 columns.
    """
    try:
        frame = pd.read_csv(dataset, sep = '\t', header = 0)
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        # Empty and malformed files are both reported as a format problem
        # instead of surfacing a raw pandas traceback.
        sys.exit('Execution aborted: wrong format of dataset\n')
    if len(frame.columns) < 2:
        sys.exit('Execution aborted: wrong format of dataset\n')
    return frame
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
143
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
144 ############################ rewrite_input ###################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def rewrite_input(dataset :Dict) -> Dict[str, List[Optional[float]]]:
    """
    Rewrite the dataset so that every key maps to a plain list of floats.

    Each value is iterated and converted element by element: missing entries
    (the string 'None' or an actual None) become None, everything else is
    converted with float(). The mapping is modified in place and also returned.

    Args:
        dataset (dict): Mapping from a key to an iterable of values.
            NOTE(review): the original docstring says a pandas.DataFrame is
            passed here; only ``.items()`` and item assignment are used.

    Returns:
        dict: The same object, with every value replaced by a list of
        floats / None.

    Raises:
        ValueError: If an element is neither missing nor convertible to float.
    """
    # Rewrite the dataset as a dictionary of lists,
    # not as a dictionary of dictionaries.
    for key, val in dataset.items():
        # Accept a real None as well as the 'None' marker so the function
        # can be applied to its own output without crashing in float(None).
        dataset[key] = [
            None if (i is None or i == 'None') else float(i)
            for i in val
        ]

    return dataset
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
170
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
171 ############################## write to csv ##################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def write_to_csv (dataset :pd.DataFrame, labels :List[int], name :str) -> None:
    """
    Write the predicted class of every row of the dataset to a tab-separated file.

    The output has two columns, 'Patient_ID' (taken from the dataset index)
    and 'Class' (the label shifted by one, so label 0 is written as class 1).

    Args:
        dataset (pandas.DataFrame): The dataset whose index identifies the rows.
        labels (list): The predicted numeric label for each data point.
        name (str): The path of the output file.

    Returns:
        None
    """
    # Shift the labels by one so the written classes start at 1.
    predict = [x+1 for x in labels]

    # Naming the columns up front keeps the frame two columns wide even when
    # there are no rows, so an empty input writes a header-only file instead
    # of failing on a header/column-count mismatch.
    classe = pd.DataFrame(list(zip(dataset.index, predict)),
                          columns = ['Patient_ID', 'Class']).astype(str)

    classe.to_csv(name, sep = '\t', index = False)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
192
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
193 ########################### find the maximum in a list ####################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def max_index (lista :List[float]) -> int:
    """
    Find the index of the maximum value in a list.

    When the maximum occurs more than once, the index of its first
    occurrence is returned. An empty list yields 0.

    Args:
        lista (list): The list in which we search for the index of the maximum value.

    Returns:
        int: The index of the maximum value in the list.
    """
    # Start from -inf rather than -1 so lists whose values are all <= -1
    # still report the position of their true maximum.
    best = float('-inf')
    best_index = 0
    for i, value in enumerate(lista):
        if value > best:
            best = value
            best_index = i

    return best_index
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
212
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
213 ################################ kmeans #####################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def kmeans (k_min: int, k_max: int, dataset: pd.DataFrame, elbow: str, silhouette: str, best_cluster: str) -> None:
    """
    Run k-means clustering for every k in [k_min, k_max] and export the results.

    For each k a KMeans model (random_state=10) is fitted and its labels are
    written to '<output_path>/kmeans_with_<k>[_BEST]_clusters.tsv'. The k with
    the highest average silhouette score is tagged '_BEST' and its labels are
    also written to ``best_cluster``. Silhouette plots and the elbow plot are
    produced on request.

    The output directory is read from the module-level ``args.output_path``
    (``args`` is not defined in this function).

    Args:
        k_min (int): The minimum number of clusters to consider.
        k_max (int): The maximum number of clusters to consider.
        dataset (pandas.DataFrame): The dataset to perform clustering on.
        elbow (str): 'true' to generate an elbow plot; any other value disables it.
        silhouette (str): 'true' to generate silhouette plots; any other value disables them.
        best_cluster (str): The file path to save the output of the best cluster.

    Returns:
        None
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.output_path, exist_ok=True)

    draw_elbow = elbow == 'true'
    draw_silhouette = silhouette == 'true'

    cluster_counts = range(k_min, k_max + 1)
    distortions = []
    scores = []
    all_labels = []

    # The elbow plot always starts from the k = 1 inertia, whatever k_min is.
    distortions.append(KMeans(n_clusters=1, random_state=10).fit(dataset).inertia_)

    for n_clusters in cluster_counts:
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(dataset)
        all_labels.append(cluster_labels)

        # The silhouette score needs at least two clusters; a single cluster
        # is scored 0.
        if n_clusters == 1:
            scores.append(0)
        else:
            scores.append(silhouette_score(dataset, cluster_labels))

        # fit_predict has already fitted the model, so inertia_ is available
        # without running the fit a second time.
        distortions.append(clusterer.inertia_)

    best = max_index(scores) + k_min

    for n_clusters, labels in zip(cluster_counts, all_labels):
        suffix = '_BEST' if n_clusters == best else ''

        write_to_csv(dataset, labels, f'{args.output_path}/kmeans_with_' + str(n_clusters) + suffix + '_clusters.tsv')

        if suffix == '_BEST':
            # Same table as above, written to the dedicated best-cluster path.
            write_to_csv(dataset, labels, best_cluster)

        if draw_silhouette:
            silhouette_draw(dataset, labels, n_clusters, f'{args.output_path}/silhouette_with_' + str(n_clusters) + suffix + '_clusters.png')

    if draw_elbow:
        elbow_plot(distortions, k_min, k_max)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
290
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
291
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
292
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
293
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
294
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
295 ############################## elbow_plot ####################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def elbow_plot (distortions: List[float], k_min: int, k_max: int) -> None:
    """
    Generate an elbow plot to visualize the distortion for different numbers of clusters.
    The elbow plot is a graphical tool used in clustering analysis to help identify the
    appropriate number of clusters: the "elbow" is the point after which adding clusters
    no longer reduces the distortion substantially.

    The plot is saved as 'elbow_plot.png' inside the module-level
    ``args.output_path`` (``args`` is not defined in this function).

    Args:
        distortions (list): Distortion values, one for k = 1 followed by one
            for every k from k_min to k_max.
        k_min (int): The minimum number of clusters considered.
        k_max (int): The maximum number of clusters considered.

    Returns:
        None
    """
    fig = plt.figure(0)
    try:
        # The x axis is k = 1 followed by k_min..k_max, matching distortions.
        x = [1] + list(range(k_min, k_max + 1))
        plt.plot(x, distortions, marker = 'o')
        plt.xlabel('Number of clusters (k)')
        plt.ylabel('Distortion')
        fig.set_size_inches(18.5, 10.5, forward = True)
        fig.savefig(f'{args.output_path}/elbow_plot.png', dpi=100)
    finally:
        # Close the figure so a later call does not draw on top of this curve
        # and the figure is not kept alive by pyplot.
        plt.close(fig)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
320
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
321
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
322 ############################## silhouette plot ###############################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def silhouette_draw(dataset: pd.DataFrame, labels: List[str], n_clusters: int, path:str) -> None:
    """
    Generate a silhouette plot for the clustering results and save it to disk.

    The silhouette coefficient is a measure used to evaluate the quality of clusters
    obtained from a clustering algorithm, and it quantifies how similar an object is
    to its own cluster compared to other clusters.
    The silhouette coefficient ranges from -1 to 1, where:
    - A value close to +1 indicates that the object is well matched to its own cluster
      and poorly matched to neighboring clusters.
    - A value close to 0 indicates that the object is close to the decision boundary
      between two neighboring clusters.
    - A value close to -1 indicates that the object may have been assigned to the wrong cluster.

    Args:
        dataset (pandas.DataFrame): The dataset used for clustering.
        labels (list): The cluster label assigned to each data point. Any array-like is
            accepted; clusters are looked up as the integers 0 .. n_clusters - 1.
        n_clusters (int): The number of clusters.
        path (str): The path to save the silhouette plot image.

    Returns:
        None. Nothing is drawn when n_clusters is 1.
    """
    if n_clusters == 1:
        return None

    # The per-cluster selection below relies on an element-wise ``labels == i``
    # mask, which only works on an array: with a plain list the comparison
    # collapses to a single False and every cluster would be drawn empty.
    labels = np.asarray(labels)

    silhouette_avg = silhouette_score(dataset, labels)
    warning("For n_clusters = " + str(n_clusters) +
            " The average silhouette_score is: " + str(silhouette_avg))

    # Drop any figure left open by previous plots before creating a new one.
    plt.close('all')
    # A single axes object holds the whole silhouette plot.
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    # The silhouette coefficient ranges over [-1, 1].
    ax1.set_xlim([-1, 1])
    # The (n_clusters + 1) * 10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])

    # Compute the silhouette scores for each sample.
    sample_silhouette_values = silhouette_samples(dataset, labels)

    y_lower = 10
    for i in range(n_clusters):
        # Silhouette scores of the samples belonging to cluster i, sorted ascending.
        ith_cluster_silhouette_values = np.sort(sample_silhouette_values[labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette band with its cluster number at the middle.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a 10-unit gap before the next cluster's band.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line marks the average silhouette score of all the values.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    # NOTE(review): ticks only cover [-0.1, 1] although the axis spans [-1, 1].
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(("Silhouette analysis for clustering on sample data "
                  "with n_clusters = " + str(n_clusters) + "\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight='bold')

    plt.savefig(path, bbox_inches='tight')
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
404
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
405 ######################## dbscan ##############################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def dbscan(dataset: pd.DataFrame, eps: float, min_samples: float, best_cluster: str) -> None:
    """
    Perform DBSCAN clustering on the given dataset and write the resulting classes to a file.

    DBSCAN is a clustering algorithm that groups together closely packed points
    based on the notion of density.

    Args:
        dataset (pandas.DataFrame): The dataset to be clustered; its index supplies the
            'Patient_ID' column of the output.
        eps (float): The maximum distance between two samples for one to be considered
            as in the neighborhood of the other. When None, the DBSCAN default is used.
        min_samples (float): The number of samples in a neighborhood for a point to be
            considered as a core point. When None, the DBSCAN default is used.
        best_cluster (str): The file path to save the tab-separated clustering output.

    Returns:
        None

    Note:
        The output directory is read from the module-level ``args`` set by ``main``.
    """
    os.makedirs(args.output_path, exist_ok=True)

    # Forward only the parameters that were actually supplied, so that each one
    # independently falls back to the DBSCAN default instead of min_samples
    # being dropped (eps missing) or passed as None (min_samples missing).
    params = {}
    if eps is not None:
        params['eps'] = eps
    if min_samples is not None:
        # presumably parsed as a float on the command line; recent scikit-learn
        # releases only accept an integer here — TODO confirm against the parser
        if isinstance(min_samples, float) and min_samples.is_integer():
            min_samples = int(min_samples)
        params['min_samples'] = min_samples

    labels = DBSCAN(**params).fit(dataset).labels_

    # Shift every label by one before writing; DBSCAN reports noise as -1,
    # so noise ends up in class 0 and the clusters start at class 1.
    predict = [x + 1 for x in labels]
    classe = pd.DataFrame(list(zip(dataset.index, predict))).astype(str)
    classe.to_csv(best_cluster, sep='\t', index=False, header=['Patient_ID', 'Class'])
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
441
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
442
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
443 ########################## hierarchical ######################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def hierachical_agglomerative(dataset: pd.DataFrame, k_min: int, k_max: int, best_cluster: str, silhouette: str) -> None:
    """
    Perform hierarchical agglomerative (Ward) clustering on the given dataset.

    A dendrogram of the whole dataset is saved first. Then one clustering is
    computed and written for every number of clusters in [k_min, k_max], each
    is scored with the average silhouette coefficient, and the best-scoring
    one is written to ``best_cluster``.

    Args:
        dataset (pandas.DataFrame): The dataset to be clustered; its index supplies the
            dendrogram leaf labels and the 'Patient_ID' column of the output.
        k_min (int): The minimum number of clusters to consider.
        k_max (int): The maximum number of clusters to consider (inclusive).
        best_cluster (str): The file path to save the output of the best cluster.
        silhouette (str): Silhouette plots are generated only when this is the string 'true'.

    Returns:
        None

    Note:
        The output directory is read from the module-level ``args`` set by ``main``.
    """
    os.makedirs(args.output_path, exist_ok=True)

    dendrogram_fig = plt.figure(figsize=(10, 7))
    plt.title("Customer Dendograms")
    shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist())
    dendrogram_fig.savefig(f'{args.output_path}/dendogram.png', dpi=200)

    scores = []
    labels = []

    for n_clusters in range(k_min, k_max + 1):
        # Ward linkage always works on Euclidean distances, so the distance
        # keyword (renamed across scikit-learn releases) is left at its default.
        model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        cluster_labels = model.fit_predict(dataset)
        labels.append(cluster_labels)

        # The silhouette score is only defined for 2 .. n_samples - 1 clusters;
        # anything else gets the worst possible value so it is never preferred.
        if 1 < n_clusters < len(dataset):
            scores.append(silhouette_score(dataset, cluster_labels))
        else:
            scores.append(-1.0)

        write_to_csv(dataset, cluster_labels, f'{args.output_path}/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')

    # max_index is defined elsewhere in this file; presumably it returns the
    # position of the largest score — verify against its definition.
    best = max_index(scores) + k_min

    for offset, cluster_labels in enumerate(labels):
        n_clusters = k_min + offset
        is_best = n_clusters == best

        if silhouette == 'true':
            prefix = '_BEST' if is_best else ''
            silhouette_draw(dataset, cluster_labels, n_clusters, f'{args.output_path}/silhouette_with_' + str(n_clusters) + prefix + '_clusters.png')

        if is_best:
            # Classes are written 1-based.
            predict = [x + 1 for x in cluster_labels]
            classe = pd.DataFrame(list(zip(dataset.index, predict))).astype(str)
            classe.to_csv(best_cluster, sep='\t', index=False, header=['Patient_ID', 'Class'])
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
496
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
497
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
498 ############################# main ###########################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def main(args_in:List[str] = None) -> None:
    """
    Initializes everything and sets the program in motion based on the frontend input arguments.

    The input dataset is loaded and reshaped, features containing missing
    values are dropped, the remaining features are optionally standardized,
    and the clustering routine selected by ``args.cluster_type`` is run.

    Args:
        args_in (List[str]): Argument list forwarded to ``process_args``.

    Returns:
        None
    """
    # The clustering routines read the parsed arguments from this global.
    global args
    args = process_args(args_in)

    os.makedirs(args.output_path, exist_ok=True)

    # Data read
    X = read_dataset(args.input)
    X = X.iloc[:, 1:]  # the first column is not part of the data to cluster
    X = pd.DataFrame.to_dict(X, orient='list')
    X = rewrite_input(X)
    X = pd.DataFrame.from_dict(X, orient='index')

    # Drop every feature (column) that contains at least one missing value.
    incomplete = [col for col in X.columns
                  if any(val is None or np.isnan(val) for val in X[col])]
    if incomplete:
        X = X.drop(columns=incomplete)

    if args.scaling == "true":
        list_to_remove = []
        toll_std = 1e-8
        for i in X.columns:
            mean_i = X[i].mean()
            std_i = X[i].std()
            if std_i > toll_std:
                # scaling with mean 0 and std 1
                X[i] = (X[i] - mean_i) / std_i
            else:
                # (near-)constant feature: it cannot be standardized, so remove it
                list_to_remove.append(i)
        if list_to_remove:
            X = X.drop(columns=list_to_remove)

    if args.k_max is not None:
        numero_classi = X.shape[0]  # number of rows left to cluster
        # Lower k_max until it is smaller than the number of rows,
        # reporting every value of k that gets skipped.
        while args.k_max >= numero_classi:
            err = 'Skipping k = ' + str(args.k_max) + ' since it is >= number of classes of dataset'
            warning(err)
            args.k_max = args.k_max - 1

    if args.cluster_type == 'kmeans':
        kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster)
    elif args.cluster_type == 'dbscan':
        dbscan(X, args.eps, args.min_samples, args.best_cluster)
    elif args.cluster_type == 'hierarchy':
        hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
555
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
556 ##############################################################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()