annotate COBRAxy/marea_cluster.py @ 152:7f3552eaf774 draft

Uploaded
author luca_milaz
date Wed, 06 Nov 2024 21:14:57 +0000
parents 3fca9b568faf
children a95ac9ae9648
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
2 """
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
3 Created on Mon Jun 3 19:51:00 2019
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
4 @author: Narger
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
5 """
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
6
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
7 import sys
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
8 import argparse
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
9 import os
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
10 import numpy as np
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
11 import pandas as pd
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
12 from sklearn.datasets import make_blobs
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
13 from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
14 from sklearn.metrics import silhouette_samples, silhouette_score, cluster
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
15 import matplotlib
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
16 matplotlib.use('agg')
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
17 import matplotlib.pyplot as plt
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
18 import scipy.cluster.hierarchy as shc
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
19 import matplotlib.cm as cm
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
20 from typing import Optional, Dict, List
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
21
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
22 ################################# process args ###############################
147
3fca9b568faf Uploaded
bimib
parents: 4
diff changeset
def process_args(args :Optional[List[str]] = None) -> argparse.Namespace:
    """
    Processes command-line arguments.

    Args:
        args (list, optional): List of command-line arguments, WITHOUT the
            program name. When None, argparse falls back to sys.argv[1:].

    Returns:
        Namespace: An object containing parsed arguments.
    """
    parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
                                     description = 'process some value\'s' +
                                     ' genes to create class.')

    parser.add_argument('-ol', '--out_log',
                        help = "Output log")

    parser.add_argument('-in', '--input',
                        type = str,
                        help = 'input dataset')

    parser.add_argument('-cy', '--cluster_type',
                        type = str,
                        choices = ['kmeans', 'dbscan', 'hierarchy'],
                        default = 'kmeans',
                        help = 'choose clustering algorithm')

    parser.add_argument('-k1', '--k_min',
                        type = int,
                        default = 2,
                        help = 'choose minimum cluster number to be generated')

    parser.add_argument('-k2', '--k_max',
                        type = int,
                        default = 7,
                        help = 'choose maximum cluster number to be generated')

    # Boolean switches are passed as the strings 'true' / 'false'.
    parser.add_argument('-el', '--elbow',
                        type = str,
                        default = 'false',
                        choices = ['true', 'false'],
                        help = 'choose if you want to generate an elbow plot for kmeans')

    parser.add_argument('-si', '--silhouette',
                        type = str,
                        default = 'false',
                        choices = ['true', 'false'],
                        help = 'choose if you want silhouette plots')

    parser.add_argument('-td', '--tool_dir',
                        type = str,
                        required = True,
                        help = 'your tool directory')

    parser.add_argument('-ms', '--min_samples',
                        type = float,
                        help = 'min samples for dbscan (optional)')

    parser.add_argument('-ep', '--eps',
                        type = float,
                        help = 'eps for dbscan (optional)')

    parser.add_argument('-bc', '--best_cluster',
                        type = str,
                        help = 'output of best cluster tsv')

    parser.add_argument(
        '-idop', '--output_path',
        type = str,
        default='result',
        help = 'output path for maps')

    return parser.parse_args(args)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
97
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
98 ########################### warning ###########################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def warning(s :str) -> None:
    """
    Log a warning message to an output log file and print it to the console.

    The log file path comes from the ``out_log`` command-line option. The
    message is appended to that file followed by a blank line; when no log
    path was given, the message is only printed.

    Args:
        s (str): The warning message to be logged and printed.

    Returns:
        None
    """
    # Prefer the already-parsed module-level namespace (the same `args` that
    # kmeans and elbow_plot read) instead of re-parsing the CLI on every call.
    cli_args = globals().get('args')
    if cli_args is None:
        # sys.argv[0] is the program name: handing it to parse_args makes
        # argparse abort with "unrecognized arguments", so it is stripped.
        cli_args = process_args(sys.argv[1:])

    if cli_args.out_log is not None:
        with open(cli_args.out_log, 'a') as log:
            log.write(s + "\n\n")
    print(s)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
113
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
114 ########################## read dataset ######################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def read_dataset(dataset :str) -> pd.DataFrame:
    """
    Load a tab-separated dataset (first row used as header) into a DataFrame.

    Args:
        dataset (str): the path to the dataset to convert into a DataFrame

    Returns:
        pandas.DataFrame: The dataset loaded as a Pandas DataFrame.

    Raises:
        SystemExit: If the file is empty or the parsed table has fewer
            than 2 columns.
    """
    abort_message = 'Execution aborted: wrong format of dataset\n'

    try:
        frame = pd.read_csv(dataset, sep = '\t', header = 0)
    except pd.errors.EmptyDataError:
        sys.exit(abort_message)

    # A usable table needs at least two columns.
    if len(frame.columns) >= 2:
        return frame
    sys.exit(abort_message)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
136
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
137 ############################ rewrite_input ###################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def rewrite_input(dataset :Dict[str, list]) -> Dict[str, List[Optional[float]]]:
    """
    Rewrite the dataset as a dictionary of lists of floats.

    The 'Reactions' entry, if present, is dropped. Every remaining value is
    converted with float(), except the literal string 'None', which becomes
    the None object.

    Args:
        dataset (dict): The dataset to be rewritten, mapping each column name
            to its list of values. A pandas.DataFrame is also accepted and is
            first converted to a dictionary of lists, one per column.

    Returns:
        dict: The rewritten dataset as a dictionary of lists. When a dict is
        passed in, it is modified in place and the same object is returned.
    """
    # DataFrame.pop() takes no default value, so a DataFrame is turned into
    # the dict-of-lists form that the rest of the function works on.
    if isinstance(dataset, pd.DataFrame):
        dataset = dataset.to_dict(orient = 'list')

    dataset.pop('Reactions', None)

    # Snapshot the keys so the mapping is not iterated while being updated.
    for key in list(dataset):
        dataset[key] = [None if value == 'None' else float(value)
                        for value in dataset[key]]

    return dataset
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
164
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
165 ############################## write to csv ##################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def write_to_csv (dataset :pd.DataFrame, labels :List[str], name :str) -> None:
    """
    Save the class assigned to each dataset row as a tab-separated file.

    The file has two columns, 'Patient_ID' (taken from the dataset index)
    and 'Class' (the label plus one).

    Args:
        dataset (pandas.DataFrame): The dataset whose index identifies the rows.
        labels (list): The predicted labels for each data point; they must
            support `+ 1`.
        name (str): The path of the output file.

    Returns:
        None
    """
    # Shift every label by one before pairing it with its row identifier.
    shifted = [label + 1 for label in labels]
    rows = [(row_id, class_id) for row_id, class_id in zip(dataset.index, shifted)]

    table = pd.DataFrame(rows).astype(str)
    table.to_csv(name, sep = '\t', index = False,
                 header = ['Patient_ID', 'Class'])
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
186
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
187 ########################### find the max in a list ########################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def max_index (lista :List[int]) -> int:
    """
    Find the index of the maximum value in a list.

    Args:
        lista (list): The list in which we search for the index of the maximum value.

    Returns:
        int: The index of the first occurrence of the maximum value,
        or 0 when the list is empty.
    """
    # Start below every possible value: a fixed starting point such as -1
    # would make lists whose values are all <= -1 always report index 0.
    best = float('-inf')
    best_index = 0
    for i, value in enumerate(lista):
        # Strict comparison keeps the first occurrence on ties.
        if value > best:
            best = value
            best_index = i

    return best_index
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
206
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
207 ################################ kmeans #####################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def kmeans (k_min: int, k_max: int, dataset: pd.DataFrame, elbow: str, silhouette: str, best_cluster: str) -> None:
    """
    Perform k-means clustering on the given dataset for every number of clusters in [k_min, k_max].

    For each k one labelling is written to
    '<output_path>/kmeans_with_<k>[_BEST]_clusters.tsv', where '_BEST' marks the k
    with the highest average silhouette score. The best labelling is also written
    to `best_cluster`. The output directory is read from the module-level `args`
    namespace and is created if missing.

    Args:
        k_min (int): The minimum number of clusters to consider.
        k_max (int): The maximum number of clusters to consider.
        dataset (pandas.DataFrame): The dataset to perform clustering on.
        elbow (str): Whether to generate an elbow plot for kmeans ('true' or 'false').
        silhouette (str): Whether to generate silhouette plots ('true' or 'false').
        best_cluster (str): The file path to save the output of the best cluster.

    Returns:
        None
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(args.output_path, exist_ok=True)

    # The flags arrive as the strings 'true' / 'false'.
    draw_elbow = elbow == 'true'
    draw_silhouette = silhouette == 'true'

    range_n_clusters = range(k_min, k_max + 1)
    scores = []
    all_labels = []

    # The distortion for k = 1 is always the first point of the elbow curve.
    distortions = [KMeans(n_clusters=1, random_state=10).fit(dataset).inertia_]

    for n_clusters in range_n_clusters:
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(dataset)
        all_labels.append(cluster_labels)

        # A single cluster gets a neutral silhouette score of 0.
        if n_clusters == 1:
            silhouette_avg = 0
        else:
            silhouette_avg = silhouette_score(dataset, cluster_labels)
        scores.append(silhouette_avg)

        # fit_predict already fitted the model: read its inertia instead of
        # fitting a second time.
        distortions.append(clusterer.inertia_)

    best = max_index(scores) + k_min

    for n_clusters, labels in zip(range_n_clusters, all_labels):
        prefix = '_BEST' if n_clusters == best else ''

        write_to_csv(dataset, labels, f'{args.output_path}/kmeans_with_{n_clusters}{prefix}_clusters.tsv')

        if prefix == '_BEST':
            # Same format as the per-k files, written to the dedicated output.
            write_to_csv(dataset, labels, best_cluster)

        if draw_silhouette:
            silhouette_draw(dataset, labels, n_clusters, f'{args.output_path}/silhouette_with_{n_clusters}{prefix}_clusters.png')

    if draw_elbow:
        elbow_plot(distortions, k_min, k_max)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
284
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
285
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
286
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
287
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
288
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
289 ############################## elbow_plot ####################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def elbow_plot (distortions: List[float], k_min: int, k_max: int) -> None:
    """
    Draw the distortion against the number of clusters and save it as an image.

    The x axis holds 1 followed by every value from k_min to k_max, so
    `distortions` is expected to hold one value for each of those points.
    The figure is saved as 'elbow_plot.png' inside the output directory read
    from the module-level `args` namespace.

    Args:
        distortions (list): List of distortion values for different numbers of clusters.
        k_min (int): The minimum number of clusters considered.
        k_max (int): The maximum number of clusters considered.

    Returns:
        None
    """
    figure = plt.figure(0)

    # The leading 1 is the x position of the first distortion value.
    cluster_counts = [1, *range(k_min, k_max + 1)]

    plt.plot(cluster_counts, distortions, marker = 'o')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Distortion')

    figure.set_size_inches(18.5, 10.5, forward = True)
    figure.savefig(f'{args.output_path}/elbow_plot.png', dpi=100)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
314
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
315
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
316 ############################## silhouette plot ###############################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
317 def silhouette_draw(dataset: pd.DataFrame, labels: List[str], n_clusters: int, path:str) -> None:
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
318 """
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
319 Generate a silhouette plot for the clustering results.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
320 The silhouette coefficient is a measure used to evaluate the quality of clusters obtained from a clustering algorithm, and it quantifies how similar an object is to its own cluster compared to other clusters.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
321 The silhouette coefficient ranges from -1 to 1, where:
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
322 - A value close to +1 indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. This implies that the object is in a dense, well-separated cluster.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
323 - A value close to 0 indicates that the object is close to the decision boundary between two neighboring clusters.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
324 - A value close to -1 indicates that the object may have been assigned to the wrong cluster.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
325
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
326 Args:
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
327 dataset (pandas.DataFrame): The dataset used for clustering.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
328 labels (list): The cluster labels assigned to each data point.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
329 n_clusters (int): The number of clusters.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
330 path (str): The path to save the silhouette plot image.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
331
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
332 Returns:
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
333 None
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
334 """
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
335 if n_clusters == 1:
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
336 return None
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
337
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
338 silhouette_avg = silhouette_score(dataset, labels)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
339 warning("For n_clusters = " + str(n_clusters) +
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
340 " The average silhouette_score is: " + str(silhouette_avg))
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
341
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
342 plt.close('all')
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
343 # Create a figure with a single subplot (1 row, 1 column)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
344 fig, (ax1) = plt.subplots(1, 1)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
345
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
346 fig.set_size_inches(18, 7)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
347
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
348 # The 1st subplot is the silhouette plot
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
349 # The silhouette coefficient can range from -1, 1 but in this example all
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
350 # lie within [-0.1, 1]
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
351 ax1.set_xlim([-1, 1])
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
352 # The (n_clusters+1)*10 is for inserting blank space between silhouette
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
353 # plots of individual clusters, to demarcate them clearly.
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
354 ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
355
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
356 # Compute the silhouette scores for each sample
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
357 sample_silhouette_values = silhouette_samples(dataset, labels)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
358
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
359 y_lower = 10
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
360 for i in range(n_clusters):
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
361 # Aggregate the silhouette scores for samples belonging to
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
362 # cluster i, and sort them
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
363 ith_cluster_silhouette_values = \
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
364 sample_silhouette_values[labels == i]
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
365
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
366 ith_cluster_silhouette_values.sort()
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
367
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
368 size_cluster_i = ith_cluster_silhouette_values.shape[0]
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
369 y_upper = y_lower + size_cluster_i
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
370
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
371 color = cm.nipy_spectral(float(i) / n_clusters)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
372 ax1.fill_betweenx(np.arange(y_lower, y_upper),
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
373 0, ith_cluster_silhouette_values,
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
374 facecolor=color, edgecolor=color, alpha=0.7)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
375
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
376 # Label the silhouette plots with their cluster numbers at the middle
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
377 ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
378
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
379 # Compute the new y_lower for next plot
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
380 y_lower = y_upper + 10 # 10 for the 0 samples
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
381
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
382 ax1.set_title("The silhouette plot for the various clusters.")
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
383 ax1.set_xlabel("The silhouette coefficient values")
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
384 ax1.set_ylabel("Cluster label")
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
385
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
386 # The vertical line for average silhouette score of all the values
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
387 ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
388
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
389 ax1.set_yticks([]) # Clear the yaxis labels / ticks
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
390 ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
391
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
392
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
393 plt.suptitle(("Silhouette analysis for clustering on sample data "
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
394 "with n_clusters = " + str(n_clusters) + "\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight='bold')
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
395
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
396
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
397 plt.savefig(path, bbox_inches='tight')
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
398
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
399 ######################## dbscan ##############################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def dbscan(dataset: pd.DataFrame, eps: float, min_samples: float, best_cluster: str) -> None:
    """
    Cluster the dataset with DBSCAN and write the resulting class of every row to a TSV file.

    Args:
        dataset (pandas.DataFrame): The dataset to be clustered; its index supplies the 'Patient_ID' column.
        eps (float): The maximum distance between two samples for one to be considered as in the neighborhood of the other. When None, scikit-learn's default is used.
        min_samples (float): The number of samples in a neighborhood for a point to be considered as a core point. When None, scikit-learn's default is used.
        best_cluster (str): The file path to save the output of the clustering.

    Returns:
        None
    """
    # 'args' is the module-level namespace assigned by main().
    os.makedirs(args.output_path, exist_ok=True)

    # Forward each parameter independently: a missing eps must not discard a
    # supplied min_samples, and a missing min_samples must not be passed as None.
    params = {}
    if eps is not None:
        params['eps'] = eps
    if min_samples is not None:
        # scikit-learn documents min_samples as an integer; normalise integral floats.
        if isinstance(min_samples, float) and min_samples.is_integer():
            min_samples = int(min_samples)
        params['min_samples'] = min_samples

    labels = DBSCAN(**params).fit(dataset).labels_

    # DBSCAN labels noise as -1, so after the +1 shift noise is class 0 and
    # the clusters are numbered from 1.
    predict = [x + 1 for x in labels]
    classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
    classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
435
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
436
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
437 ########################## hierachical #######################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def hierachical_agglomerative(dataset: pd.DataFrame, k_min: int, k_max: int, best_cluster: str, silhouette: str) -> None:
    """
    Perform hierarchical agglomerative (ward linkage) clustering on the given dataset.

    A dendrogram is saved to the output folder, one TSV is written for every
    number of clusters in [k_min, k_max], and the clustering with the highest
    average silhouette score is written to best_cluster.

    Args:
        dataset (pandas.DataFrame): The dataset to be clustered; its index supplies the 'Patient_ID' column.
        k_min (int): The minimum number of clusters to consider.
        k_max (int): The maximum number of clusters to consider.
        best_cluster (str): The file path to save the output of the best cluster.
        silhouette (str): Whether to generate silhouette plots ('true' or 'false').

    Returns:
        None

    Raises:
        ValueError: If the range [k_min, k_max] is empty.
    """
    # 'args' is the module-level namespace assigned by main().
    os.makedirs(args.output_path, exist_ok=True)

    fig = plt.figure(figsize=(10, 7))
    plt.title("Customer Dendograms")
    shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist())
    fig.savefig(f'{args.output_path}/dendogram.png', dpi=200)
    plt.close(fig)  # release the figure once it is on disk

    n_samples = dataset.shape[0]
    scores = []
    labels = []

    for n_clusters in range(k_min, k_max + 1):
        # Ward linkage only supports euclidean distances, which is also the
        # default, so the version-dependent 'affinity'/'metric' keyword is omitted.
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        cluster_labels = clusterer.fit_predict(dataset)
        labels.append(cluster_labels)
        write_to_csv(dataset, cluster_labels, f'{args.output_path}/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')

        # The silhouette score is only defined for 2 <= n_clusters <= n_samples - 1;
        # any other partition is ranked below every scorable one.
        if 1 < n_clusters < n_samples:
            scores.append(silhouette_score(dataset, cluster_labels))
        else:
            scores.append(float('-inf'))

    if not labels:
        raise ValueError('No clustering computed: k_min = ' + str(k_min) + ' is greater than k_max = ' + str(k_max))

    # First maximum wins, so ties resolve to the smaller number of clusters.
    best_index = int(np.argmax(scores))
    best = best_index + k_min

    if silhouette == 'true':
        for offset, cluster_labels in enumerate(labels):
            n_clusters = offset + k_min
            prefix = '_BEST' if n_clusters == best else ''
            silhouette_draw(dataset, cluster_labels, n_clusters, f'{args.output_path}/silhouette_with_' + str(n_clusters) + prefix + '_clusters.png')

    # Classes are written 1-based.
    predict = [x + 1 for x in labels[best_index]]
    classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
    classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
490
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
491
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
492 ############################# main ###########################################
147
3fca9b568faf Uploaded
bimib
parents: 4
diff changeset
def main(args_in: Optional[List[str]] = None) -> None:
    """
    Initializes everything and sets the program in motion based on the frontend input arguments.

    Args:
        args_in: Argument list forwarded to process_args; None by default.

    Returns:
        None
    """
    # The parsed arguments are published as a module-level global because the
    # clustering functions read args.output_path directly.
    global args
    args = process_args(args_in)

    os.makedirs(args.output_path, exist_ok=True)

    # Data read
    X = read_dataset(args.input)
    X = pd.DataFrame.to_dict(X, orient='list')
    X = rewrite_input(X)
    X = pd.DataFrame.from_dict(X, orient = 'index')

    # Discard every column that contains at least one missing (None) value.
    incomplete = [col for col in X.columns if any(val is None for val in X[col])]
    X = X.drop(columns=incomplete)

    if args.k_max is not None:
        numero_classi = X.shape[0]
        # Lower k_max until it is below the number of rows, warning once per skipped k.
        while args.k_max >= numero_classi:
            err = 'Skipping k = ' + str(args.k_max) + ' since it is >= number of classes of dataset'
            warning(err)
            args.k_max = args.k_max - 1

    if args.cluster_type == 'kmeans':
        kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster)
    elif args.cluster_type == 'dbscan':
        dbscan(X, args.eps, args.min_samples, args.best_cluster)
    elif args.cluster_type == 'hierarchy':
        hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
533
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
534 ##############################################################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
# Run the tool only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()