diff R_functions/clustering_functions.R @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/R_functions/clustering_functions.R	Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,134 @@
+# Automated clustering suite, that makes use of dbscan and hclust 
+# clustering algorithms
+library(dbscan)
+library(cluster)
+
+# Do the OPTICS clustering algorithm on the input data.
+# Computationally determines the 'optimal' value for the epsilion neighbourhood argument.
+# Outputs an integer array indicating which cluster each sample from the input data belongs to.
+# 0 indicates the sample is an outlier and does not belong to any sample
+automated_dbscan = function(pca_data, emax, mp) {
+  d = dist(pca_data$values[, c(1, 2)])
+  oc = optics(d, emax, mp)
+  eps = compute_eps(oc, 3)
+  oc_clusters = optics_cut(oc, eps)
+  return(oc_clusters$cluster)
+}
+
+# Do the hclust clustering algorithm on the input data, automatically 
+# computing the optimal value for 'k', i.e. the number of clusters.
+# Outputs an integer array indicating which cluster each sample from the input data belongs to.
+automated_hclust = function(pca_data) {
+  d = dist(pca_data$values[, c(1, 2)])
+  hc = hclust(d)
+  k = compute_k(hc, d, 10)
+  clusters = cutree(hc, k)
+  return(clusters)
+}
+
+# determine the optimal epsilon value for dbscan
+# From the input OPTICS data, 
+compute_eps = function(oc, weight) {
+  rdist = oc$reachdist
+  iqr = IQR(rdist)
+  uc = median(rdist) + weight*iqr
+  outliers = rdist[which(rdist > uc)]
+  eps = median(outliers)
+  return(eps)
+}
+
+# determining k using the silhouette method and hclust
+# Values near 1 indicate clustering is appropriate.
+compute_k = function(hc, d, n) {
+  s = 1:n
+  # try clusters from size 2 to n, evaluating the 'silhouette coefficient'
+  # for each value of k.
+  for (i in 1:n) {
+    # silhouette method not valid for cluster size = 1
+    if (i == 1) {
+      # if the silhouette coeff is < 0.5 for all the other values,
+      # we assume there is just one big cluster
+      s[i] = 0.5
+    } else {
+      # get the silhoutte coeff for every point, and take the mean
+      clusters = cutree(hc, i)
+      sil = silhouette(clusters, d)
+      sil_coeff = mean(sil[, 'sil_width'])
+      s[i] = sil_coeff
+    }
+  }
+  return(which.max(s))
+}
+
+# Compute the centroids of each cluster
+# returns an n x m matrix, where
+# n = number of clusters
+# m = dimension of data
+find_cluster_centers = function(clusters, x) {
+  cluster_ids = unique(clusters)
+  k = length(cluster_ids)
+  centers = matrix(nrow=k, ncol=ncol(x))
+  rownames(centers) = 1:k
+  for (i in 1:k) {
+    this_id = cluster_ids[i]
+    # dont work out centre of outliers
+    cluster_points = x[which(clusters==this_id), ]
+    if (is.null(dim(cluster_points))) {
+      l = length(cluster_points)
+      cluster_points = matrix(cluster_points, ncol=l)
+    }
+    centers[i, ] = apply(cluster_points, 2, FUN=mean)
+    rownames(centers)[i] = this_id
+  }
+  centers = centers[rownames(centers) != 0, , drop=FALSE]
+  return(centers)
+}
+
+# Determine which clusters are outliers based on how far away their cluster
+# centers are from the center of all the data. Returns a list containing the 
+# 'ids' of all rejected CLUSTERS (not samples)
+find_cluster_outliers_sd = function(clusters, centers, pca_data, numsds) {
+  if(nrow(centers) <= 1) {
+    print("no centers, or only one center")
+    return(NULL)  
+  }
+  # work out the standard deviation of the data in pc1 and pc2 directions
+  pc1_sd = sd(pca_data$values[, 1])
+  pc2_sd = sd(pca_data$values[, 2])
+  # work out centroid of the pca data
+  centroid = apply(pca_data$values, 2, FUN=mean)
+  pc1_bar = centroid[1]
+  pc2_bar = centroid[2]
+  # get pc1 and pc2 hi-lo cutoffs
+  pc1_hi = pc1_bar + pc1_sd*numsds
+  pc1_lo = pc1_bar - pc1_sd*numsds
+  pc2_hi = pc2_bar + pc2_sd*numsds
+  pc2_lo = pc2_bar - pc2_sd*numsds
+  # check which clusters fall outside cutoffs
+  pc1_centers = centers[, 1]
+  pc2_centers = centers[, 2]
+  pc1_ol = which(pc1_centers > pc1_hi | pc1_centers < pc1_lo)
+  pc2_ol = which(pc2_centers > pc2_hi | pc2_centers < pc2_lo)
+  all_ol = union(pc1_ol, pc2_ol)
+  rc = rownames(centers)[all_ol] # id of rejected clusters
+  return(rc)
+}
+
+# Determine which clusters are outliers based on how far away they are from other clusters
+# Returns a list containing the 'ids' of all rejected CLUSTERS (not samples)
+find_cluster_outliers_mcd = function(clusters, centers, pca_data, multiplier, ndim) {
+  if(nrow(centers) <= 1) {
+    print("no centers, or only one center")
+    return(NULL)  
+  }
+  d = dist(centers[, 1:ndim])
+  dm = as.matrix(d)
+  avg_distance = mean(as.numeric(d))
+  # ignore the diagonal
+  cluster_distances = rowSums(dm)/(ncol(dm)-1)
+  # trim clusters based on how far apart they are
+  cutoff = multiplier*avg_distance
+  indices = which(cluster_distances > cutoff)
+  rc = names(cluster_distances)[indices]
+  return(rc)
+}
\ No newline at end of file