Mercurial > repos > bimib > cobraxy
view COBRAxy/marea_cluster.py @ 14:c10d8b38de84 draft
Uploaded
author | luca_milaz |
---|---|
date | Wed, 18 Sep 2024 12:37:06 +0000 |
parents | 41f35c2f0c7b |
children | 3fca9b568faf |
line wrap: on
line source
# -*- coding: utf-8 -*- """ Created on Mon Jun 3 19:51:00 2019 @author: Narger """ import sys import argparse import os import numpy as np import pandas as pd from sklearn.datasets import make_blobs from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering from sklearn.metrics import silhouette_samples, silhouette_score, cluster import matplotlib matplotlib.use('agg') import matplotlib.pyplot as plt import scipy.cluster.hierarchy as shc import matplotlib.cm as cm from typing import Optional, Dict, List ################################# process args ############################### def process_args(args :List[str]) -> argparse.Namespace: """ Processes command-line arguments. Args: args (list): List of command-line arguments. Returns: Namespace: An object containing parsed arguments. """ parser = argparse.ArgumentParser(usage = '%(prog)s [options]', description = 'process some value\'s' + ' genes to create class.') parser.add_argument('-ol', '--out_log', help = "Output log") parser.add_argument('-in', '--input', type = str, help = 'input dataset') parser.add_argument('-cy', '--cluster_type', type = str, choices = ['kmeans', 'dbscan', 'hierarchy'], default = 'kmeans', help = 'choose clustering algorythm') parser.add_argument('-k1', '--k_min', type = int, default = 2, help = 'choose minimun cluster number to be generated') parser.add_argument('-k2', '--k_max', type = int, default = 7, help = 'choose maximum cluster number to be generated') parser.add_argument('-el', '--elbow', type = str, default = 'false', choices = ['true', 'false'], help = 'choose if you want to generate an elbow plot for kmeans') parser.add_argument('-si', '--silhouette', type = str, default = 'false', choices = ['true', 'false'], help = 'choose if you want silhouette plots') parser.add_argument('-td', '--tool_dir', type = str, required = True, help = 'your tool directory') parser.add_argument('-ms', '--min_samples', type = float, help = 'min samples for dbscan (optional)') parser.add_argument('-ep', '--eps', type = float, help = 'eps for dbscan (optional)') parser.add_argument('-bc', '--best_cluster', type = str, help = 'output of best cluster tsv') args = parser.parse_args() return args ########################### warning ########################################### def warning(s :str) -> None: """ Log a warning message to an output log file and print it to the console. Args: s (str): The warning message to be logged and printed. Returns: None """ args = process_args(sys.argv) with open(args.out_log, 'a') as log: log.write(s + "\n\n") print(s) ########################## read dataset ###################################### def read_dataset(dataset :str) -> pd.DataFrame: """ Read dataset from a CSV file and return it as a Pandas DataFrame. Args: dataset (str): the path to the dataset to convert into a DataFrame Returns: pandas.DataFrame: The dataset loaded as a Pandas DataFrame. Raises: pandas.errors.EmptyDataError: If the dataset file is empty. sys.exit: If the dataset file has the wrong format (e.g., fewer than 2 columns) """ try: dataset = pd.read_csv(dataset, sep = '\t', header = 0) except pd.errors.EmptyDataError: sys.exit('Execution aborted: wrong format of dataset\n') if len(dataset.columns) < 2: sys.exit('Execution aborted: wrong format of dataset\n') return dataset ############################ rewrite_input ################################### def rewrite_input(dataset :pd.DataFrame) -> Dict[str, List[Optional[float]]]: """ Rewrite the dataset as a dictionary of lists instead of as a dictionary of dictionaries. Args: dataset (pandas.DataFrame): The dataset to be rewritten. Returns: dict: The rewritten dataset as a dictionary of lists. """ #Riscrivo il dataset come dizionario di liste, #non come dizionario di dizionari dataset.pop('Reactions', None) for key, val in dataset.items(): l = [] for i in val: if i == 'None': l.append(None) else: l.append(float(i)) dataset[key] = l return dataset ############################## write to csv ################################## def write_to_csv (dataset :pd.DataFrame, labels :List[str], name :str) -> None: """ Write dataset and predicted labels to a CSV file. Args: dataset (pandas.DataFrame): The dataset to be written. labels (list): The predicted labels for each data point. name (str): The name of the output CSV file. Returns: None """ #labels = predict predict = [x+1 for x in labels] classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) dest = name classe.to_csv(dest, sep = '\t', index = False, header = ['Patient_ID', 'Class']) ########################### trova il massimo in lista ######################## def max_index (lista :List[int]) -> int: """ Find the index of the maximum value in a list. Args: lista (list): The list in which we search for the index of the maximum value. Returns: int: The index of the maximum value in the list. """ best = -1 best_index = 0 for i in range(len(lista)): if lista[i] > best: best = lista [i] best_index = i return best_index ################################ kmeans ##################################### def kmeans (k_min: int, k_max: int, dataset: pd.DataFrame, elbow: str, silhouette: str, best_cluster: str) -> None: """ Perform k-means clustering on the given dataset, which is an algorithm used to partition a dataset into groups (clusters) based on their characteristics. The goal is to divide the data into homogeneous groups, where the elements within each group are similar to each other and different from the elements in other groups. Args: k_min (int): The minimum number of clusters to consider. k_max (int): The maximum number of clusters to consider. dataset (pandas.DataFrame): The dataset to perform clustering on. elbow (str): Whether to generate an elbow plot for kmeans ('true' or 'false'). silhouette (str): Whether to generate silhouette plots ('true' or 'false'). best_cluster (str): The file path to save the output of the best cluster. Returns: None """ if not os.path.exists('clustering'): os.makedirs('clustering') if elbow == 'true': elbow = True else: elbow = False if silhouette == 'true': silhouette = True else: silhouette = False range_n_clusters = [i for i in range(k_min, k_max+1)] distortions = [] scores = [] all_labels = [] clusterer = KMeans(n_clusters=1, random_state=10) distortions.append(clusterer.fit(dataset).inertia_) for n_clusters in range_n_clusters: clusterer = KMeans(n_clusters=n_clusters, random_state=10) cluster_labels = clusterer.fit_predict(dataset) all_labels.append(cluster_labels) if n_clusters == 1: silhouette_avg = 0 else: silhouette_avg = silhouette_score(dataset, cluster_labels) scores.append(silhouette_avg) distortions.append(clusterer.fit(dataset).inertia_) best = max_index(scores) + k_min for i in range(len(all_labels)): prefix = '' if (i + k_min == best): prefix = '_BEST' write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv') if (prefix == '_BEST'): labels = all_labels[i] predict = [x+1 for x in labels] classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) if silhouette: silhouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') if elbow: elbow_plot(distortions, k_min,k_max) ############################## elbow_plot #################################### def elbow_plot (distortions: List[float], k_min: int, k_max: int) -> None: """ Generate an elbow plot to visualize the distortion for different numbers of clusters. The elbow plot is a graphical tool used in clustering analysis to help identifying the appropriate number of clusters by looking for the point where the rate of decrease in distortion sharply decreases, indicating the optimal balance between model complexity and clustering quality. Args: distortions (list): List of distortion values for different numbers of clusters. k_min (int): The minimum number of clusters considered. k_max (int): The maximum number of clusters considered. Returns: None """ plt.figure(0) x = list(range(k_min, k_max + 1)) x.insert(0, 1) plt.plot(x, distortions, marker = 'o') plt.xlabel('Number of clusters (k)') plt.ylabel('Distortion') s = 'clustering/elbow_plot.png' fig = plt.gcf() fig.set_size_inches(18.5, 10.5, forward = True) fig.savefig(s, dpi=100) ############################## silhouette plot ############################### def silhouette_draw(dataset: pd.DataFrame, labels: List[str], n_clusters: int, path:str) -> None: """ Generate a silhouette plot for the clustering results. The silhouette coefficient is a measure used to evaluate the quality of clusters obtained from a clustering algorithmand it quantifies how similar an object is to its own cluster compared to other clusters. The silhouette coefficient ranges from -1 to 1, where: - A value close to +1 indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. This implies that the object is in a dense, well-separated cluster. - A value close to 0 indicates that the object is close to the decision boundary between two neighboring clusters. - A value close to -1 indicates that the object may have been assigned to the wrong cluster. Args: dataset (pandas.DataFrame): The dataset used for clustering. labels (list): The cluster labels assigned to each data point. n_clusters (int): The number of clusters. path (str): The path to save the silhouette plot image. Returns: None """ if n_clusters == 1: return None silhouette_avg = silhouette_score(dataset, labels) warning("For n_clusters = " + str(n_clusters) + " The average silhouette_score is: " + str(silhouette_avg)) plt.close('all') # Create a subplot with 1 row and 2 columns fig, (ax1) = plt.subplots(1, 1) fig.set_size_inches(18, 7) # The 1st subplot is the silhouette plot # The silhouette coefficient can range from -1, 1 but in this example all # lie within [-0.1, 1] ax1.set_xlim([-1, 1]) # The (n_clusters+1)*10 is for inserting blank space between silhouette # plots of individual clusters, to demarcate them clearly. ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10]) # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(dataset, labels) y_lower = 10 for i in range(n_clusters): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = \ sample_silhouette_values[labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = cm.nipy_spectral(float(i) / n_clusters) ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7) # Label the silhouette plots with their cluster numbers at the middle ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) # Compute the new y_lower for next plot y_lower = y_upper + 10 # 10 for the 0 samples ax1.set_title("The silhouette plot for the various clusters.") ax1.set_xlabel("The silhouette coefficient values") ax1.set_ylabel("Cluster label") # The vertical line for average silhouette score of all the values ax1.axvline(x=silhouette_avg, color="red", linestyle="--") ax1.set_yticks([]) # Clear the yaxis labels / ticks ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) plt.suptitle(("Silhouette analysis for clustering on sample data " "with n_clusters = " + str(n_clusters) + "\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight='bold') plt.savefig(path, bbox_inches='tight') ######################## dbscan ############################################## def dbscan(dataset: pd.DataFrame, eps: float, min_samples: float, best_cluster: str) -> None: """ Perform DBSCAN clustering on the given dataset, which is a clustering algorithm that groups together closely packed points based on the notion of density. Args: dataset (pandas.DataFrame): The dataset to be clustered. eps (float): The maximum distance between two samples for one to be considered as in the neighborhood of the other. min_samples (float): The number of samples in a neighborhood for a point to be considered as a core point. best_cluster (str): The file path to save the output of the best cluster. Returns: None """ if not os.path.exists('clustering'): os.makedirs('clustering') if eps is not None: clusterer = DBSCAN(eps = eps, min_samples = min_samples) else: clusterer = DBSCAN() clustering = clusterer.fit(dataset) core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool) core_samples_mask[clustering.core_sample_indices_] = True labels = clustering.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) labels = labels predict = [x+1 for x in labels] classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) ########################## hierachical ####################################### def hierachical_agglomerative(dataset: pd.DataFrame, k_min: int, k_max: int, best_cluster: str, silhouette: str) -> None: """ Perform hierarchical agglomerative clustering on the given dataset. Args: dataset (pandas.DataFrame): The dataset to be clustered. k_min (int): The minimum number of clusters to consider. k_max (int): The maximum number of clusters to consider. best_cluster (str): The file path to save the output of the best cluster. silhouette (str): Whether to generate silhouette plots ('true' or 'false'). Returns: None """ if not os.path.exists('clustering'): os.makedirs('clustering') plt.figure(figsize=(10, 7)) plt.title("Customer Dendograms") shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist()) fig = plt.gcf() fig.savefig('clustering/dendogram.png', dpi=200) range_n_clusters = [i for i in range(k_min, k_max+1)] scores = [] labels = [] n_classi = dataset.shape[0] for n_clusters in range_n_clusters: cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') cluster.fit_predict(dataset) cluster_labels = cluster.labels_ labels.append(cluster_labels) write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv') best = max_index(scores) + k_min for i in range(len(labels)): prefix = '' if (i + k_min == best): prefix = '_BEST' if silhouette == 'true': silhouette_draw(dataset, labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') for i in range(len(labels)): if (i + k_min == best): labels = labels[i] predict = [x+1 for x in labels] classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) ############################# main ########################################### def main() -> None: """ Initializes everything and sets the program in motion based on the fronted input arguments. Returns: None """ if not os.path.exists('clustering'): os.makedirs('clustering') args = process_args(sys.argv) #Data read X = read_dataset(args.input) X = pd.DataFrame.to_dict(X, orient='list') X = rewrite_input(X) X = pd.DataFrame.from_dict(X, orient = 'index') for i in X.columns: tmp = X[i][0] if tmp == None: X = X.drop(columns=[i]) ## NAN TO HANLDE if args.k_max != None: numero_classi = X.shape[0] while args.k_max >= numero_classi: err = 'Skipping k = ' + str(args.k_max) + ' since it is >= number of classes of dataset' warning(err) args.k_max = args.k_max - 1 if args.cluster_type == 'kmeans': kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster) if args.cluster_type == 'dbscan': dbscan(X, args.eps, args.min_samples, args.best_cluster) if args.cluster_type == 'hierarchy': hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette) ############################################################################## if __name__ == "__main__": main()