annotate COBRAxy/marea_cluster.py @ 214:6ca5758010b3 draft

Uploaded
author francesco_lapi
date Fri, 13 Dec 2024 11:05:47 +0000
parents 3ad3fb730b87
children 4a677fc67aeb
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
2 """
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
3 Created on Mon Jun 3 19:51:00 2019
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
4 @author: Narger
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
5 """
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
6
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
7 import sys
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
8 import argparse
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
9 import os
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
10 import numpy as np
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
11 import pandas as pd
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
12 from sklearn.datasets import make_blobs
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
13 from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
14 from sklearn.metrics import silhouette_samples, silhouette_score, cluster
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
15 import matplotlib
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
16 matplotlib.use('agg')
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
17 import matplotlib.pyplot as plt
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
18 import scipy.cluster.hierarchy as shc
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
19 import matplotlib.cm as cm
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
20 from typing import Optional, Dict, List
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
21
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
22 ################################# process args ###############################
155
3ad3fb730b87 Uploaded
bimib
parents: 154
diff changeset
def process_args(args_in :List[str] = None) -> argparse.Namespace:
    """
    Build the command-line parser for the clustering tool and parse the arguments.

    Args:
        args_in (list): Command-line arguments to parse. When None, argparse
            falls back to the arguments of the running process.

    Returns:
        Namespace: An object containing the parsed arguments.
    """
    # One (flags, keyword settings) entry per option, in registration order.
    option_table = [
        (('-ol', '--out_log'),
         dict(help = "Output log")),
        (('-in', '--input'),
         dict(type = str, help = 'input dataset')),
        (('-cy', '--cluster_type'),
         dict(type = str,
              choices = ['kmeans', 'dbscan', 'hierarchy'],
              default = 'kmeans',
              help = 'choose clustering algorythm')),
        (('-k1', '--k_min'),
         dict(type = int, default = 2,
              help = 'choose minimun cluster number to be generated')),
        (('-k2', '--k_max'),
         dict(type = int, default = 7,
              help = 'choose maximum cluster number to be generated')),
        (('-el', '--elbow'),
         dict(type = str, default = 'false', choices = ['true', 'false'],
              help = 'choose if you want to generate an elbow plot for kmeans')),
        (('-si', '--silhouette'),
         dict(type = str, default = 'false', choices = ['true', 'false'],
              help = 'choose if you want silhouette plots')),
        (('-td', '--tool_dir'),
         dict(type = str, required = True, help = 'your tool directory')),
        (('-ms', '--min_samples'),
         dict(type = float, help = 'min samples for dbscan (optional)')),
        (('-ep', '--eps'),
         dict(type = float, help = 'eps for dbscan (optional)')),
        (('-bc', '--best_cluster'),
         dict(type = str, help = 'output of best cluster tsv')),
        (('-idop', '--output_path'),
         dict(type = str, default = 'result', help = 'output path for maps')),
    ]

    parser = argparse.ArgumentParser(
        usage = '%(prog)s [options]',
        description = "process some value's genes to create class.")

    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    return parser.parse_args(args_in)
4
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
97
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
98 ########################### warning ###########################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def warning(s :str) -> None:
    """
    Append a warning message to the output log file (when one is configured)
    and print it to the console.

    Args:
        s (str): The warning message to be logged and printed.

    Returns:
        None
    """
    # Other functions in this file (kmeans, elbow_plot) read a module-level
    # `args` namespace; reuse it when it exists so the log destination matches
    # the current run. NOTE(review): where `args` is assigned is not visible
    # from here - confirm it is set before the first warning is emitted.
    parsed = globals().get('args')
    if parsed is None:
        # argv[0] is the program name and must not be handed to argparse,
        # otherwise parsing aborts with "unrecognized arguments".
        parsed = process_args(sys.argv[1:])

    log_path = getattr(parsed, 'out_log', None)
    if log_path is not None:
        # --out_log is optional: only write when a log file was requested.
        with open(log_path, 'a') as log:
            log.write(s + "\n\n")
    print(s)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
113
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
114 ########################## read dataset ######################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def read_dataset(dataset :str) -> pd.DataFrame:
    """
    Load a tab-separated dataset (first row used as header) into a DataFrame.

    Args:
        dataset (str): the path to the dataset to convert into a DataFrame

    Returns:
        pandas.DataFrame: The dataset loaded as a Pandas DataFrame.

    Note:
        The process is terminated through sys.exit, with an explanatory
        message, when the file is empty or has fewer than 2 columns.
    """
    abort_message = 'Execution aborted: wrong format of dataset\n'

    try:
        frame = pd.read_csv(dataset, sep = '\t', header = 0)
    except pd.errors.EmptyDataError:
        sys.exit(abort_message)

    # A usable dataset needs at least two columns.
    if len(frame.columns) >= 2:
        return frame

    sys.exit(abort_message)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
136
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
137 ############################ rewrite_input ###################################
154
49fb0556242f Uploaded
bimib
parents: 153
diff changeset
def rewrite_input(dataset :Dict) -> Dict[str, List[Optional[float]]]:
    """
    Convert every column of the dataset, in place, into a plain list of values.

    Each entry equal to the string 'None' becomes None; every other entry is
    converted with float().

    Args:
        dataset (pandas.DataFrame): The dataset to be rewritten.

    Returns:
        dict: The same dataset object, with each key now mapped to a list.
    """
    # Rewrite the dataset as a mapping of lists rather than a mapping of mappings.
    for column, values in dataset.items():
        dataset[column] = [None if entry == 'None' else float(entry)
                           for entry in values]

    return dataset
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
163
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
164 ############################## write to csv ##################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def write_to_csv (dataset :pd.DataFrame, labels :List[str], name :str) -> None:
    """
    Save the predicted cluster of every dataset row to a tab-separated file.

    The file has two columns, 'Patient_ID' (the dataset index) and 'Class'
    (the label shifted by one, so the classes written start from 1 when the
    labels start from 0).

    Args:
        dataset (pandas.DataFrame): The dataset whose index identifies the rows.
        labels (list): The predicted labels for each data point.
        name (str): The name of the output file.

    Returns:
        None
    """
    one_based = [label + 1 for label in labels]
    rows = list(zip(dataset.index, one_based))

    table = pd.DataFrame(rows).astype(str)
    table.to_csv(name, sep = '\t', index = False,
                 header = ['Patient_ID', 'Class'])
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
185
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
186 ########################### find the maximum in list #########################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def max_index (lista :List[int]) -> int:
    """
    Find the index of the maximum value in a list.

    Args:
        lista (list): The list in which we search for the index of the maximum value.

    Returns:
        int: The index of the first occurrence of the maximum value in the
            list, or 0 when the list is empty.
    """
    # Start below every comparable number: a fixed sentinel such as -1 would
    # make lists whose values are all lower than it always report index 0.
    best = float('-inf')
    best_index = 0
    for i, value in enumerate(lista):
        # Strict comparison keeps the first occurrence on ties.
        if value > best:
            best = value
            best_index = i

    return best_index
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
205
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
206 ################################ kmeans #####################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def kmeans (k_min: int, k_max: int, dataset: pd.DataFrame, elbow: str, silhouette: str, best_cluster: str) -> None:
    """
    Run k-means clustering on the dataset for every k in [k_min, k_max] and write the results.

    For each k the predicted classes are written to
    '<output_path>/kmeans_with_<k>[_BEST]_clusters.tsv'. The k with the highest
    average silhouette score (as selected by max_index) is tagged '_BEST' and
    its classes are also written to `best_cluster`. Silhouette plots and the
    elbow plot are produced on request.

    The output directory is read from the module-level `args.output_path`.
    NOTE(review): `args` is not assigned in this function - confirm it is set
    by the caller before kmeans runs.

    Args:
        k_min (int): The minimum number of clusters to consider.
        k_max (int): The maximum number of clusters to consider.
        dataset (pandas.DataFrame): The dataset to perform clustering on.
        elbow (str): Whether to generate an elbow plot for kmeans ('true' or 'false').
        silhouette (str): Whether to generate silhouette plots ('true' or 'false').
        best_cluster (str): The file path to save the output of the best cluster.

    Returns:
        None
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.output_path, exist_ok=True)

    # Only the exact string 'true' enables a plot.
    draw_elbow = elbow == 'true'
    draw_silhouette = silhouette == 'true'

    scores = []
    all_labels = []

    # The distortion for k = 1 always opens the list: elbow_plot puts 1 in
    # front of the k_min..k_max range on the x axis.
    distortions = [KMeans(n_clusters=1, random_state=10).fit(dataset).inertia_]

    for n_clusters in range(k_min, k_max + 1):
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(dataset)
        all_labels.append(cluster_labels)

        # The silhouette score needs at least two clusters.
        if n_clusters == 1:
            silhouette_avg = 0
        else:
            silhouette_avg = silhouette_score(dataset, cluster_labels)
        scores.append(silhouette_avg)

        # fit_predict already fitted the model: read its inertia instead of
        # fitting the same data a second time.
        distortions.append(clusterer.inertia_)

    best = max_index(scores) + k_min

    for offset, labels in enumerate(all_labels):
        n_clusters = k_min + offset
        is_best = n_clusters == best
        prefix = '_BEST' if is_best else ''

        write_to_csv(dataset, labels, f'{args.output_path}/kmeans_with_' + str(n_clusters) + prefix + '_clusters.tsv')

        if is_best:
            # Same table, written to the dedicated best-cluster output.
            write_to_csv(dataset, labels, best_cluster)

        if draw_silhouette:
            silhouette_draw(dataset, labels, n_clusters, f'{args.output_path}/silhouette_with_' + str(n_clusters) + prefix + '_clusters.png')

    if draw_elbow:
        elbow_plot(distortions, k_min, k_max)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
283
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
284
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
285
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
286
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
287
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
288 ############################## elbow_plot ####################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def elbow_plot (distortions: List[float], k_min: int, k_max: int) -> None:
    """
    Draw the elbow plot (distortion against number of clusters) and save it as a PNG.

    The x axis is made of k = 1 followed by every k from k_min to k_max, so
    `distortions` is expected to hold one value for each of those points, in
    that order. The image is saved as 'elbow_plot.png' inside the directory
    given by the module-level `args.output_path`.

    Args:
        distortions (list): List of distortion values for different numbers of clusters.
        k_min (int): The minimum number of clusters considered.
        k_max (int): The maximum number of clusters considered.

    Returns:
        None
    """
    plt.figure(0)

    # k = 1 comes first, then the requested range.
    cluster_counts = [1] + list(range(k_min, k_max + 1))

    plt.plot(cluster_counts, distortions, marker = 'o')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Distortion')

    destination = f'{args.output_path}/elbow_plot.png'
    figure = plt.gcf()
    figure.set_size_inches(18.5, 10.5, forward = True)
    figure.savefig(destination, dpi=100)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
313
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
314
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
315 ############################## silhouette plot ###############################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
316 def silhouette_draw(dataset: pd.DataFrame, labels: List[str], n_clusters: int, path:str) -> None:
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
317 """
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
318 Generate a silhouette plot for the clustering results.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
319 The silhouette coefficient is a measure used to evaluate the quality of clusters obtained from a clustering algorithmand it quantifies how similar an object is to its own cluster compared to other clusters.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
320 The silhouette coefficient ranges from -1 to 1, where:
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
321 - A value close to +1 indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. This implies that the object is in a dense, well-separated cluster.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
322 - A value close to 0 indicates that the object is close to the decision boundary between two neighboring clusters.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
323 - A value close to -1 indicates that the object may have been assigned to the wrong cluster.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
324
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
325 Args:
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
326 dataset (pandas.DataFrame): The dataset used for clustering.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
327 labels (list): The cluster labels assigned to each data point.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
328 n_clusters (int): The number of clusters.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
329 path (str): The path to save the silhouette plot image.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
330
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
331 Returns:
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
332 None
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
333 """
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
334 if n_clusters == 1:
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
335 return None
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
336
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
337 silhouette_avg = silhouette_score(dataset, labels)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
338 warning("For n_clusters = " + str(n_clusters) +
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
339 " The average silhouette_score is: " + str(silhouette_avg))
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
340
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
341 plt.close('all')
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
342 # Create a subplot with 1 row and 2 columns
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
343 fig, (ax1) = plt.subplots(1, 1)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
344
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
345 fig.set_size_inches(18, 7)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
346
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
347 # The 1st subplot is the silhouette plot
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
348 # The silhouette coefficient can range from -1, 1 but in this example all
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
349 # lie within [-0.1, 1]
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
350 ax1.set_xlim([-1, 1])
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
351 # The (n_clusters+1)*10 is for inserting blank space between silhouette
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
352 # plots of individual clusters, to demarcate them clearly.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
353 ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
354
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
355 # Compute the silhouette scores for each sample
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
356 sample_silhouette_values = silhouette_samples(dataset, labels)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
357
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
358 y_lower = 10
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
359 for i in range(n_clusters):
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
360 # Aggregate the silhouette scores for samples belonging to
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
361 # cluster i, and sort them
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
362 ith_cluster_silhouette_values = \
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
363 sample_silhouette_values[labels == i]
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
364
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
365 ith_cluster_silhouette_values.sort()
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
366
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
367 size_cluster_i = ith_cluster_silhouette_values.shape[0]
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
368 y_upper = y_lower + size_cluster_i
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
369
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
370 color = cm.nipy_spectral(float(i) / n_clusters)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
371 ax1.fill_betweenx(np.arange(y_lower, y_upper),
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
372 0, ith_cluster_silhouette_values,
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
373 facecolor=color, edgecolor=color, alpha=0.7)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
374
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
375 # Label the silhouette plots with their cluster numbers at the middle
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
376 ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
377
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
378 # Compute the new y_lower for next plot
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
379 y_lower = y_upper + 10 # 10 for the 0 samples
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
380
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
381 ax1.set_title("The silhouette plot for the various clusters.")
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
382 ax1.set_xlabel("The silhouette coefficient values")
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
383 ax1.set_ylabel("Cluster label")
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
384
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
385 # The vertical line for average silhouette score of all the values
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
386 ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
387
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
388 ax1.set_yticks([]) # Clear the yaxis labels / ticks
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
389 ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
390
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
391
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
392 plt.suptitle(("Silhouette analysis for clustering on sample data "
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
393 "with n_clusters = " + str(n_clusters) + "\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight='bold')
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
394
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
395
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
396 plt.savefig(path, bbox_inches='tight')
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
397
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
398 ######################## dbscan ##############################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def dbscan(dataset: pd.DataFrame, eps: float, min_samples: float, best_cluster: str) -> None:
    """
    Perform DBSCAN clustering on the given dataset and save the resulting classes.

    DBSCAN is a clustering algorithm that groups together closely packed points
    based on the notion of density. Every label is shifted by one before being
    written, so samples that DBSCAN marks as noise (label -1) end up in class 0
    and the real clusters are numbered from 1.

    Args:
        dataset (pandas.DataFrame): The dataset to be clustered. Its index supplies the 'Patient_ID' column of the output.
        eps (float): The maximum distance between two samples for one to be considered as in the neighborhood of the other. When None, scikit-learn's default is used.
        min_samples (float): The number of samples in a neighborhood for a point to be considered as a core point. When None, scikit-learn's default is used.
        best_cluster (str): The file path to save the output of the best cluster.

    Returns:
        None
    """
    # Relies on the module-level `args` (set by main()) for the output directory.
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(args.output_path, exist_ok=True)

    # Forward only the parameters that were actually supplied. Previously a
    # given min_samples was discarded whenever eps was None, and a None
    # min_samples was passed through whenever eps was set.
    dbscan_params = {}
    if eps is not None:
        dbscan_params['eps'] = eps
    if min_samples is not None:
        dbscan_params['min_samples'] = min_samples

    labels = DBSCAN(**dbscan_params).fit(dataset).labels_

    # Shift labels by one: noise (-1) becomes class 0, clusters start at 1.
    predict = [label + 1 for label in labels]
    classe = pd.DataFrame(list(zip(dataset.index, predict))).astype(str)
    classe.to_csv(best_cluster, sep='\t', index=False, header=['Patient_ID', 'Class'])
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
434
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
435
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
436 ########################## hierarchical ######################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def hierachical_agglomerative(dataset: pd.DataFrame, k_min: int, k_max: int, best_cluster: str, silhouette: str) -> None:
    """
    Perform hierarchical agglomerative clustering on the given dataset.

    A Ward-linkage dendrogram is saved to the output directory, then one
    clustering is computed and written for every k in [k_min, k_max]. The k
    with the highest average silhouette score is treated as the best one: its
    silhouette plot (if requested) is tagged '_BEST' and its classes, numbered
    from 1, are written to `best_cluster`.

    Args:
        dataset (pandas.DataFrame): The dataset to be clustered. Its index supplies the dendrogram labels and the 'Patient_ID' column.
        k_min (int): The minimum number of clusters to consider.
        k_max (int): The maximum number of clusters to consider.
        best_cluster (str): The file path to save the output of the best cluster.
        silhouette (str): Whether to generate silhouette plots ('true' or 'false').

    Returns:
        None

    Raises:
        ValueError: If the range [k_min, k_max] is empty.
    """
    # Relies on the module-level `args` (set by main()) for the output directory.
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(args.output_path, exist_ok=True)

    plt.figure(figsize=(10, 7))
    plt.title("Customer Dendograms")
    shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist())
    fig = plt.gcf()
    fig.savefig(f'{args.output_path}/dendogram.png', dpi=200)
    # Release the figure once it is on disk so it does not linger in pyplot.
    plt.close(fig)

    scores = []
    all_labels = []

    for n_clusters in range(k_min, k_max + 1):
        # Ward linkage only supports Euclidean distance, which is already the
        # default, so the deprecated `affinity` keyword is simply omitted.
        model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        cluster_labels = model.fit_predict(dataset)
        all_labels.append(cluster_labels)
        write_to_csv(dataset, cluster_labels, f'{args.output_path}/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')

        # The scores list used to stay empty, so the "best" k could never
        # depend on clustering quality. The silhouette score is undefined for
        # a single cluster; treat that case as a neutral 0.
        if n_clusters == 1:
            scores.append(0.0)
        else:
            scores.append(silhouette_score(dataset, cluster_labels))

    if not all_labels:
        raise ValueError('No clustering performed: k_min (' + str(k_min) + ') is greater than k_max (' + str(k_max) + ')')

    # max_index is defined elsewhere in this file; it is expected to return the
    # position of the highest score.
    best = max_index(scores) + k_min

    if silhouette == 'true':
        for offset, cluster_labels in enumerate(all_labels):
            n_clusters = offset + k_min
            prefix = '_BEST' if n_clusters == best else ''
            silhouette_draw(dataset, cluster_labels, n_clusters, f'{args.output_path}/silhouette_with_' + str(n_clusters) + prefix + '_clusters.png')

    # Shift labels by one so that classes are numbered from 1.
    best_labels = all_labels[best - k_min]
    predict = [label + 1 for label in best_labels]
    classe = pd.DataFrame(list(zip(dataset.index, predict))).astype(str)
    classe.to_csv(best_cluster, sep='\t', index=False, header=['Patient_ID', 'Class'])
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
489
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
490
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
491 ############################# main ###########################################
147
3fca9b568faf Uploaded
bimib
parents: 4
diff changeset
def main(args_in: Optional[List[str]] = None) -> None:
    """
    Initializes everything and sets the program in motion based on the frontend input arguments.

    Args:
        args_in (List[str], optional): Arguments forwarded to process_args. Presumably None means "use the process's own command line" -- confirm against process_args.

    Returns:
        None
    """
    # The clustering functions read the parsed arguments through this global.
    global args
    args = process_args(args_in)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(args.output_path, exist_ok=True)

    # Data read
    X = read_dataset(args.input)
    X = X.iloc[:, 1:]  # discard the first column
    X = pd.DataFrame.to_dict(X, orient='list')
    X = rewrite_input(X)
    X = pd.DataFrame.from_dict(X, orient='index')

    # Drop, in a single pass, every column that contains a missing (None) value.
    incomplete_columns = [col for col in X.columns if any(val is None for val in X[col])]
    X = X.drop(columns=incomplete_columns)

    if args.k_max is not None:
        numero_classi = X.shape[0]
        # Lower k_max until it is below the number of rows, warning once for
        # every k that gets skipped.
        while args.k_max >= numero_classi:
            err = 'Skipping k = ' + str(args.k_max) + ' since it is >= number of classes of dataset'
            warning(err)
            args.k_max = args.k_max - 1

    if args.cluster_type == 'kmeans':
        kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster)
    elif args.cluster_type == 'dbscan':
        dbscan(X, args.eps, args.min_samples, args.best_cluster)
    elif args.cluster_type == 'hierarchy':
        hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
533
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
534 ##############################################################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()