comparison COBRAxy/marea_cluster.py @ 309:38c9a958ea78 draft

Uploaded
author francesco_lapi
date Thu, 22 May 2025 16:03:37 +0000
parents 4a677fc67aeb
children 4599fb23f25b
comparison
equal deleted inserted replaced
308:797d0e002934 309:38c9a958ea78
57 default = 7, 57 default = 7,
58 help = 'choose maximum cluster number to be generated') 58 help = 'choose maximum cluster number to be generated')
59 59
60 parser.add_argument('-el', '--elbow', 60 parser.add_argument('-el', '--elbow',
61 type = str, 61 type = str,
62 default = 'false', 62 default = 'False',
63 choices = ['true', 'false'], 63 choices = ['True', 'False'],
64 help = 'choose if you want to generate an elbow plot for kmeans') 64 help = 'choose if you want to generate an elbow plot for kmeans')
65 65
66 parser.add_argument('-si', '--silhouette', 66 parser.add_argument('-si', '--silhouette',
67 type = str, 67 type = str,
68 default = 'false', 68 default = 'False',
69 choices = ['true', 'false'], 69 choices = ['True', 'False'],
70 help = 'choose if you want silhouette plots') 70 help = 'choose if you want silhouette plots')
71 71
72 parser.add_argument('-td', '--tool_dir', 72 parser.add_argument('-td', '--tool_dir',
73 type = str, 73 type = str,
74 required = True, 74 required = True,
75 help = 'your tool directory') 75 help = 'your tool directory')
76 76
77 parser.add_argument('-ms', '--min_samples', 77 parser.add_argument('-ms', '--min_samples',
78 type = float, 78 type = int,
79 help = 'min samples for dbscan (optional)') 79 help = 'min samples for dbscan (optional)')
80 80
81 parser.add_argument('-ep', '--eps', 81 parser.add_argument('-ep', '--eps',
82 type = float, 82 type = float,
83 help = 'eps for dbscan (optional)') 83 help = 'eps for dbscan (optional)')
104 s (str): The warning message to be logged and printed. 104 s (str): The warning message to be logged and printed.
105 105
106 Returns: 106 Returns:
107 None 107 None
108 """ 108 """
109 args = process_args(sys.argv) 109
110 with open(args.out_log, 'a') as log: 110 with open(args.out_log, 'a') as log:
111 log.write(s + "\n\n") 111 log.write(s + "\n\n")
112 print(s) 112 print(s)
113 113
114 ########################## read dataset ###################################### 114 ########################## read dataset ######################################
211 211
212 Args: 212 Args:
213 k_min (int): The minimum number of clusters to consider. 213 k_min (int): The minimum number of clusters to consider.
214 k_max (int): The maximum number of clusters to consider. 214 k_max (int): The maximum number of clusters to consider.
215 dataset (pandas.DataFrame): The dataset to perform clustering on. 215 dataset (pandas.DataFrame): The dataset to perform clustering on.
216 elbow (str): Whether to generate an elbow plot for kmeans ('true' or 'false'). 216 elbow (str): Whether to generate an elbow plot for kmeans ('True' or 'False').
217 silhouette (str): Whether to generate silhouette plots ('true' or 'false'). 217 silhouette (str): Whether to generate silhouette plots ('True' or 'False').
218 best_cluster (str): The file path to save the output of the best cluster. 218 best_cluster (str): The file path to save the output of the best cluster.
219 219
220 Returns: 220 Returns:
221 None 221 None
222 """ 222 """
223 if not os.path.exists(args.output_path): 223 if not os.path.exists(args.output_path):
224 os.makedirs(args.output_path) 224 os.makedirs(args.output_path)
225 225
226 226
227 if elbow == 'true': 227 if elbow == 'True':
228 elbow = True 228 elbow = True
229 else: 229 else:
230 elbow = False 230 elbow = False
231 231
232 if silhouette == 'true': 232 if silhouette == 'True':
233 silhouette = True 233 silhouette = True
234 else: 234 else:
235 silhouette = False 235 silhouette = False
236 236
237 range_n_clusters = [i for i in range(k_min, k_max+1)] 237 range_n_clusters = [i for i in range(k_min, k_max+1)]
441 Args: 441 Args:
442 dataset (pandas.DataFrame): The dataset to be clustered. 442 dataset (pandas.DataFrame): The dataset to be clustered.
443 k_min (int): The minimum number of clusters to consider. 443 k_min (int): The minimum number of clusters to consider.
444 k_max (int): The maximum number of clusters to consider. 444 k_max (int): The maximum number of clusters to consider.
445 best_cluster (str): The file path to save the output of the best cluster. 445 best_cluster (str): The file path to save the output of the best cluster.
446 silhouette (str): Whether to generate silhouette plots ('true' or 'false'). 446 silhouette (str): Whether to generate silhouette plots ('True' or 'False').
447 447
448 Returns: 448 Returns:
449 None 449 None
450 """ 450 """
451 if not os.path.exists(args.output_path): 451 if not os.path.exists(args.output_path):
475 475
476 for i in range(len(labels)): 476 for i in range(len(labels)):
477 prefix = '' 477 prefix = ''
478 if (i + k_min == best): 478 if (i + k_min == best):
479 prefix = '_BEST' 479 prefix = '_BEST'
480 if silhouette == 'true': 480 if silhouette == 'True':
481 silhouette_draw(dataset, labels[i], i + k_min, f'{args.output_path}/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') 481 silhouette_draw(dataset, labels[i], i + k_min, f'{args.output_path}/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
482 482
483 for i in range(len(labels)): 483 for i in range(len(labels)):
484 if (i + k_min == best): 484 if (i + k_min == best):
485 labels = labels[i] 485 labels = labels[i]