# -*- coding: utf-8 -*-
"""
Created on Mon Jun 3 19:51:00 2019
@author: Narger
"""

import sys
import argparse
import os
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score, cluster
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc
import matplotlib.cm as cm
from typing import Optional, Dict, List

################################# process args ###############################
def process_args(args_in :List[str] = None) -> argparse.Namespace:
    """
    Processes command-line arguments.

    Args:
        args_in (list): List of command-line arguments; when None, sys.argv is used.

    Returns:
        Namespace: An object containing parsed arguments.
    """
    parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
                                     description = 'cluster the dataset values to create classes.')

    parser.add_argument('-ol', '--out_log',
                        help = "Output log")

    parser.add_argument('-in', '--input',
                        type = str,
                        help = 'input dataset')

    parser.add_argument('-cy', '--cluster_type',
                        type = str,
                        choices = ['kmeans', 'dbscan', 'hierarchy'],
                        default = 'kmeans',
                        help = 'choose clustering algorithm')

    parser.add_argument('-sc', '--scaling',
                        type = str,
                        choices = ['true', 'false'],
                        default = 'true',
                        help = 'choose whether to scale the data')

    parser.add_argument('-k1', '--k_min',
                        type = int,
                        default = 2,
                        help = 'choose minimum number of clusters to be generated')

    parser.add_argument('-k2', '--k_max',
                        type = int,
                        default = 7,
                        help = 'choose maximum number of clusters to be generated')

    parser.add_argument('-el', '--elbow',
                        type = str,
                        default = 'false',
                        choices = ['true', 'false'],
                        help = 'choose whether to generate an elbow plot for kmeans')

    parser.add_argument('-si', '--silhouette',
                        type = str,
                        default = 'false',
                        choices = ['true', 'false'],
                        help = 'choose whether to generate silhouette plots')

    parser.add_argument('-td', '--tool_dir',
                        type = str,
                        default = os.path.dirname(os.path.abspath(__file__)),
                        help = 'your tool directory (default: auto-detected package location)')

    parser.add_argument('-ms', '--min_samples',
                        type = int,
                        help = 'min samples for dbscan (optional)')

    parser.add_argument('-ep', '--eps',
                        type = float,
                        help = 'eps for dbscan (optional)')

    parser.add_argument('-bc', '--best_cluster',
                        type = str,
                        help = 'output of best cluster tsv')

    parser.add_argument(
        '-idop', '--output_path',
        type = str,
        default = 'clustering/',
        help = 'output path for maps')

    args_in = parser.parse_args(args_in)
    return args_in

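# Illustrative only: process_args() can also be driven programmatically, for example
# from a test or a wrapper script, by passing the argument list explicitly. The file
# names below are placeholders, not files shipped with the tool.
#
#   opts = process_args(['-in', 'dataset.tsv', '-cy', 'kmeans', '-k1', '2', '-k2', '5',
#                        '-bc', 'best_cluster.tsv', '-ol', 'run.log'])
#   print(opts.cluster_type, opts.k_min, opts.k_max)
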
########################### warning ###########################################
def warning(s :str) -> None:
    """
    Log a warning message to an output log file and print it to the console.

    Args:
        s (str): The warning message to be logged and printed.

    Returns:
        None
    """
    # 'args' is the module-level Namespace set in main(); out_log is the log file path
    with open(args.out_log, 'a') as log:
        log.write(s + "\n\n")
    print(s)

########################## read dataset ######################################
def read_dataset(dataset :str) -> pd.DataFrame:
    """
    Read a tab-separated dataset file and return it as a Pandas DataFrame.

    Args:
        dataset (str): the path to the dataset to convert into a DataFrame

    Returns:
        pandas.DataFrame: The dataset loaded as a Pandas DataFrame.

    Raises:
        pandas.errors.EmptyDataError: If the dataset file is empty.
        sys.exit: If the dataset file has the wrong format (e.g., fewer than 2 columns)
    """
    try:
        dataset = pd.read_csv(dataset, sep = '\t', header = 0)
    except pd.errors.EmptyDataError:
        sys.exit('Execution aborted: wrong format of dataset\n')
    if len(dataset.columns) < 2:
        sys.exit('Execution aborted: wrong format of dataset\n')
    return dataset

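# For reference (inferred from main() below, which drops the first column and then
# transposes): the input is expected to have one row per feature and one column per
# sample, e.g. a made-up file like
#
#   Reactions    patient_1   patient_2   patient_3
#   reaction_A   1.20        0.98        1.05
#   reaction_B   0.05        None        0.11
#
# Missing values may appear as the literal string 'None' (handled by rewrite_input below).
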
############################ rewrite_input ###################################
def rewrite_input(dataset :Dict) -> Dict[str, List[Optional[float]]]:
    """
    Rewrite the dataset as a dictionary of lists instead of as a dictionary of dictionaries.

    Args:
        dataset (dict): The dataset to be rewritten, mapping each column name to its values.

    Returns:
        dict: The rewritten dataset as a dictionary of lists.
    """
    # Rewrite the dataset as a dictionary of lists,
    # not as a dictionary of dictionaries
    #dataset.pop('Reactions', None)

    for key, val in dataset.items():
        l = []
        for i in val:
            if i == 'None':
                l.append(None)
            else:
                l.append(float(i))

        dataset[key] = l

    return dataset

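# Illustrative example of the conversion performed by rewrite_input()
# (the key names are made up):
#
#   rewrite_input({'patient_1': ['1.2', 'None'], 'patient_2': ['0.98', '0.11']})
#   ->            {'patient_1': [1.2, None],     'patient_2': [0.98, 0.11]}
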
############################## write to csv ##################################
def write_to_csv (dataset :pd.DataFrame, labels :List[int], name :str) -> None:
    """
    Write the predicted cluster label of each sample to a tab-separated file.

    Args:
        dataset (pandas.DataFrame): The dataset that was clustered (its index provides the sample IDs).
        labels (list): The predicted labels for each data point.
        name (str): The name of the output file.

    Returns:
        None
    """
    # shift labels so that the classes written to file are numbered from 1
    predict = [x+1 for x in labels]

    classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)

    dest = name
    classe.to_csv(dest, sep = '\t', index = False,
                      header = ['Patient_ID', 'Class'])

########################## find the maximum in a list ########################
def max_index (lista :List[int]) -> int:
    """
    Find the index of the maximum value in a list.

    Args:
        lista (list): The list in which we search for the index of the maximum value.

    Returns:
        int: The index of the maximum value in the list.
    """
    best = -1
    best_index = 0
    for i in range(len(lista)):
        if lista[i] > best:
            best = lista[i]
            best_index = i

    return best_index

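# e.g. max_index([0.2, 0.41, 0.38]) returns 1; the callers below map that index back
# to a cluster count k via "max_index(scores) + k_min".
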
################################ kmeans #####################################
def kmeans (k_min: int, k_max: int, dataset: pd.DataFrame, elbow: str, silhouette: str, best_cluster: str) -> None:
    """
    Perform k-means clustering on the given dataset, an algorithm used to partition a dataset into groups (clusters) based on their characteristics.
    The goal is to divide the data into homogeneous groups, where the elements within each group are similar to each other and different from the elements in other groups.

    Args:
        k_min (int): The minimum number of clusters to consider.
        k_max (int): The maximum number of clusters to consider.
        dataset (pandas.DataFrame): The dataset to perform clustering on.
        elbow (str): Whether to generate an elbow plot for kmeans ('true' or 'false').
        silhouette (str): Whether to generate silhouette plots ('true' or 'false').
        best_cluster (str): The file path to save the output of the best cluster.

    Returns:
        None
    """
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    elbow = (elbow == 'true')
    silhouette = (silhouette == 'true')

    range_n_clusters = [i for i in range(k_min, k_max+1)]
    distortions = []
    scores = []
    all_labels = []

    # baseline distortion for k = 1, used only by the elbow plot
    clusterer = KMeans(n_clusters=1, random_state=10)
    distortions.append(clusterer.fit(dataset).inertia_)

    for n_clusters in range_n_clusters:
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(dataset)

        all_labels.append(cluster_labels)
        if n_clusters == 1:
            silhouette_avg = 0
        else:
            silhouette_avg = silhouette_score(dataset, cluster_labels)
        scores.append(silhouette_avg)
        # inertia_ is already available after fit_predict, no need to refit
        distortions.append(clusterer.inertia_)

    # the k with the highest average silhouette score is marked as the best one
    best = max_index(scores) + k_min

    for i in range(len(all_labels)):
        prefix = ''
        if (i + k_min == best):
            prefix = '_BEST'

        write_to_csv(dataset, all_labels[i], f'{args.output_path}/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')

        if (prefix == '_BEST'):
            labels = all_labels[i]
            predict = [x+1 for x in labels]
            classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
            classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])

        if silhouette:
            silhouette_draw(dataset, all_labels[i], i + k_min, f'{args.output_path}/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')

    if elbow:
        elbow_plot(distortions, k_min, k_max)

############################## elbow_plot ####################################
def elbow_plot (distortions: List[float], k_min: int, k_max: int) -> None:
    """
    Generate an elbow plot to visualize the distortion for different numbers of clusters.
    The elbow plot is a graphical tool used in clustering analysis to help identify the appropriate number of clusters: one looks for the point where the rate of decrease
    in distortion slows down sharply, indicating the best trade-off between model complexity and clustering quality.

    Args:
        distortions (list): List of distortion values for different numbers of clusters (including the k = 1 baseline).
        k_min (int): The minimum number of clusters considered.
        k_max (int): The maximum number of clusters considered.

    Returns:
        None
    """
    plt.figure(0)
    # the x axis is k = 1 (baseline) followed by k_min..k_max
    x = list(range(k_min, k_max + 1))
    x.insert(0, 1)
    plt.plot(x, distortions, marker = 'o')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Distortion')
    s = f'{args.output_path}/elbow_plot.png'
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5, forward = True)
    fig.savefig(s, dpi=100)

############################## silhouette plot ###############################
def silhouette_draw(dataset: pd.DataFrame, labels: np.ndarray, n_clusters: int, path: str) -> None:
    """
    Generate a silhouette plot for the clustering results.
    The silhouette coefficient is a measure used to evaluate the quality of clusters obtained from a clustering algorithm and it quantifies how similar an object is to its own cluster compared to other clusters.
    The silhouette coefficient ranges from -1 to 1, where:
    - A value close to +1 indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. This implies that the object is in a dense, well-separated cluster.
    - A value close to 0 indicates that the object is close to the decision boundary between two neighboring clusters.
    - A value close to -1 indicates that the object may have been assigned to the wrong cluster.

    Args:
        dataset (pandas.DataFrame): The dataset used for clustering.
        labels (numpy.ndarray): The cluster labels assigned to each data point.
        n_clusters (int): The number of clusters.
        path (str): The path to save the silhouette plot image.

    Returns:
        None
    """
    if n_clusters == 1:
        return None

    silhouette_avg = silhouette_score(dataset, labels)
    warning("For n_clusters = " + str(n_clusters) +
            " The average silhouette_score is: " + str(silhouette_avg))

    plt.close('all')
    # Create a figure with a single subplot for the silhouette plot
    fig, (ax1) = plt.subplots(1, 1)

    fig.set_size_inches(18, 7)

    # The silhouette coefficient can range from -1 to 1
    ax1.set_xlim([-1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(dataset, labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for the next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    # Decorations and output are done once, outside the per-cluster loop
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for the average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(("Silhouette analysis for clustering on sample data "
                  "with n_clusters = " + str(n_clusters) + "\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight='bold')

    plt.savefig(path, bbox_inches='tight')

######################## dbscan ##############################################
def dbscan(dataset: pd.DataFrame, eps: float, min_samples: int, best_cluster: str) -> None:
    """
    Perform DBSCAN clustering on the given dataset, a clustering algorithm that groups together closely packed points based on the notion of density.

    Args:
        dataset (pandas.DataFrame): The dataset to be clustered.
        eps (float): The maximum distance between two samples for one to be considered as in the neighborhood of the other.
        min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
        best_cluster (str): The file path to save the output of the best cluster.

    Returns:
        None
    """
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    # use the user-provided parameters when given, otherwise fall back to sklearn defaults
    dbscan_params = {}
    if eps is not None:
        dbscan_params['eps'] = eps
    if min_samples is not None:
        dbscan_params['min_samples'] = min_samples
    clusterer = DBSCAN(**dbscan_params)

    clustering = clusterer.fit(dataset)

    core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)
    core_samples_mask[clustering.core_sample_indices_] = True
    labels = clustering.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # DBSCAN marks noise points with label -1, which becomes class 0 in the output file
    predict = [x+1 for x in labels]
    classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
    classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])

########################## hierarchical ######################################
def hierachical_agglomerative(dataset: pd.DataFrame, k_min: int, k_max: int, best_cluster: str, silhouette: str) -> None:
    """
    Perform hierarchical agglomerative clustering on the given dataset.

    Args:
        dataset (pandas.DataFrame): The dataset to be clustered.
        k_min (int): The minimum number of clusters to consider.
        k_max (int): The maximum number of clusters to consider.
        best_cluster (str): The file path to save the output of the best cluster.
        silhouette (str): Whether to generate silhouette plots ('true' or 'false').

    Returns:
        None
    """
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    plt.figure(figsize=(10, 7))
    plt.title("Dendrogram")
    shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist())
    fig = plt.gcf()
    fig.savefig(f'{args.output_path}/dendogram.png', dpi=200)

    range_n_clusters = [i for i in range(k_min, k_max+1)]

    scores = []
    labels = []

    n_classi = dataset.shape[0]

    for n_clusters in range_n_clusters:
        # ward linkage implies euclidean distances, so no affinity/metric argument is needed
        cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        cluster.fit_predict(dataset)
        cluster_labels = cluster.labels_
        labels.append(cluster_labels)
        # keep the silhouette score of each k so that the best one can be selected below
        if n_clusters == 1:
            scores.append(0)
        else:
            scores.append(silhouette_score(dataset, cluster_labels))
        write_to_csv(dataset, cluster_labels, f'{args.output_path}/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')

    best = max_index(scores) + k_min

    for i in range(len(labels)):
        prefix = ''
        if (i + k_min == best):
            prefix = '_BEST'
        if silhouette == 'true':
            silhouette_draw(dataset, labels[i], i + k_min, f'{args.output_path}/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')

    for i in range(len(labels)):
        if (i + k_min == best):
            best_labels = labels[i]
            predict = [x+1 for x in best_labels]
            classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
            classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])

############################# main ###########################################
def main(args_in:List[str] = None) -> None:
    """
    Initializes everything and sets the program in motion based on the front-end input arguments.

    Returns:
        None
    """
    global args
    args = process_args(args_in)

    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    # Read data
    X = read_dataset(args.input)
    X = X.iloc[:, 1:]
    X = pd.DataFrame.to_dict(X, orient='list')
    X = rewrite_input(X)
    # rebuild the DataFrame with the original columns as rows: those are the items being clustered
    X = pd.DataFrame.from_dict(X, orient = 'index')

    # drop any column that still contains missing values
    for i in X.columns:
        if any(val is None or np.isnan(val) for val in X[i]):
            X = X.drop(columns=[i])

    if args.scaling == "true":
        list_to_remove = []
        toll_std = 1e-8
        for i in X.columns:
            mean_i = X[i].mean()
            std_i = X[i].std()
            if std_i > toll_std:
                # scale to mean 0 and std 1
                X[i] = (X[i] - mean_i) / std_i
            else:
                # remove the feature because its std is (numerically) 0
                list_to_remove.append(i)
        if len(list_to_remove) > 0:
            X = X.drop(columns=list_to_remove)

    if args.k_max is not None:
        numero_classi = X.shape[0]
        while args.k_max >= numero_classi:
            err = 'Skipping k = ' + str(args.k_max) + ' since it is >= number of rows of the dataset'
            warning(err)
            args.k_max = args.k_max - 1

    if args.cluster_type == 'kmeans':
        kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster)

    if args.cluster_type == 'dbscan':
        dbscan(X, args.eps, args.min_samples, args.best_cluster)

    if args.cluster_type == 'hierarchy':
        hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette)

##############################################################################
if __name__ == "__main__":
    main()
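
# Illustrative usage (the script name and file names are placeholders; the flags match
# the parser defined in process_args above):
#
#   python clustering.py -in dataset.tsv -cy kmeans -k1 2 -k2 6 \
#       -ol run.log -bc best_cluster.tsv -idop clustering/ -el true -si true
#
# The same run can be launched programmatically with
#   main(['-in', 'dataset.tsv', '-cy', 'kmeans', '-k1', '2', '-k2', '6',
#         '-ol', 'run.log', '-bc', 'best_cluster.tsv'])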