diff R_functions/clustering_functions.R @ 0:64e75e21466e draft default tip
Uploaded
author    pmac
date      Wed, 01 Jun 2016 03:38:39 -0400
parents   (none)
children  (none)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/R_functions/clustering_functions.R	Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,134 @@
+# Automated clustering suite that makes use of the dbscan and hclust
+# clustering algorithms.
+library(dbscan)
+library(cluster)
+
+# Run the OPTICS clustering algorithm on the input data.
+# Computationally determines the 'optimal' value for the epsilon neighbourhood argument.
+# Outputs an integer array indicating which cluster each sample from the input data belongs to.
+# 0 indicates the sample is an outlier and does not belong to any cluster.
+automated_dbscan = function(pca_data, emax, mp) {
+    d = dist(pca_data$values[, c(1, 2)])
+    oc = optics(d, emax, mp)
+    eps = compute_eps(oc, 3)
+    oc_clusters = optics_cut(oc, eps)
+    return(oc_clusters$cluster)
+}
+
+# Run the hclust clustering algorithm on the input data, automatically
+# computing the optimal value for 'k', i.e. the number of clusters.
+# Outputs an integer array indicating which cluster each sample from the input data belongs to.
+automated_hclust = function(pca_data) {
+    d = dist(pca_data$values[, c(1, 2)])
+    hc = hclust(d)
+    k = compute_k(hc, d, 10)
+    clusters = cutree(hc, k)
+    return(clusters)
+}
+
+# Determine the optimal epsilon value for dbscan
+# from the reachability distances of the input OPTICS result.
compute_eps = function(oc, weight) {
+    rdist = oc$reachdist
+    iqr = IQR(rdist)
+    uc = median(rdist) + weight * iqr
+    outliers = rdist[which(rdist > uc)]
+    eps = median(outliers)
+    return(eps)
+}
+
+# Determine k using the silhouette method and hclust.
+# Silhouette coefficients near 1 indicate the clustering is appropriate.
+compute_k = function(hc, d, n) {
+    s = 1:n
+    # try cluster counts from 2 to n, evaluating the 'silhouette coefficient'
+    # for each value of k
+    for (i in 1:n) {
+        # the silhouette method is not valid for k = 1
+        if (i == 1) {
+            # if the silhouette coeff is < 0.5 for all the other values,
+            # we assume there is just one big cluster
+            s[i] = 0.5
+        } else {
+            # get the silhouette coeff for every point, and take the mean
+            clusters = cutree(hc, i)
+            sil = silhouette(clusters, d)
+            sil_coeff = mean(sil[, 'sil_width'])
+            s[i] = sil_coeff
+        }
+    }
+    return(which.max(s))
+}
+
+# Compute the centroids of each cluster.
+# Returns an n x m matrix, where
+# n = number of clusters
+# m = dimension of the data
+find_cluster_centers = function(clusters, x) {
+    cluster_ids = unique(clusters)
+    k = length(cluster_ids)
+    centers = matrix(nrow=k, ncol=ncol(x))
+    rownames(centers) = 1:k
+    for (i in 1:k) {
+        this_id = cluster_ids[i]
+        # the centre of the outliers (cluster id 0) is dropped after the loop
+        cluster_points = x[which(clusters == this_id), ]
+        if (is.null(dim(cluster_points))) {
+            l = length(cluster_points)
+            cluster_points = matrix(cluster_points, ncol=l)
+        }
+        centers[i, ] = apply(cluster_points, 2, FUN=mean)
+        rownames(centers)[i] = this_id
+    }
+    centers = centers[rownames(centers) != 0, , drop=FALSE]
+    return(centers)
+}
+
+# Determine which clusters are outliers based on how far away their cluster
+# centers are from the center of all the data.
+# Returns the ids of all rejected CLUSTERS (not samples).
+find_cluster_outliers_sd = function(clusters, centers, pca_data, numsds) {
+    if (nrow(centers) <= 1) {
+        print("no centers, or only one center")
+        return(NULL)
+    }
+    # work out the standard deviation of the data in the pc1 and pc2 directions
+    pc1_sd = sd(pca_data$values[, 1])
+    pc2_sd = sd(pca_data$values[, 2])
+    # work out the centroid of the pca data
+    centroid = apply(pca_data$values, 2, FUN=mean)
+    pc1_bar = centroid[1]
+    pc2_bar = centroid[2]
+    # get pc1 and pc2 hi-lo cutoffs
+    pc1_hi = pc1_bar + pc1_sd * numsds
+    pc1_lo = pc1_bar - pc1_sd * numsds
+    pc2_hi = pc2_bar + pc2_sd * numsds
+    pc2_lo = pc2_bar - pc2_sd * numsds
+    # check which cluster centers fall outside the cutoffs
+    pc1_centers = centers[, 1]
+    pc2_centers = centers[, 2]
+    pc1_ol = which(pc1_centers > pc1_hi | pc1_centers < pc1_lo)
+    pc2_ol = which(pc2_centers > pc2_hi | pc2_centers < pc2_lo)
+    all_ol = union(pc1_ol, pc2_ol)
+    rc = rownames(centers)[all_ol]  # ids of rejected clusters
+    return(rc)
+}
+
+# Determine which clusters are outliers based on how far away they are from the other clusters.
+# Returns the ids of all rejected CLUSTERS (not samples).
+find_cluster_outliers_mcd = function(clusters, centers, pca_data, multiplier, ndim) {
+    if (nrow(centers) <= 1) {
+        print("no centers, or only one center")
+        return(NULL)
+    }
+    d = dist(centers[, 1:ndim])
+    dm = as.matrix(d)
+    avg_distance = mean(as.numeric(d))
+    # mean distance from each cluster center to the others, ignoring the diagonal
+    cluster_distances = rowSums(dm)/(ncol(dm) - 1)
+    # trim clusters based on how far apart they are from the rest
+    cutoff = multiplier * avg_distance
+    indices = which(cluster_distances > cutoff)
+    rc = names(cluster_distances)[indices]
+    return(rc)
+}
\ No newline at end of file
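None of these functions are exercised within this changeset. As a rough usage sketch only (not part of the commit), assuming `pca_data` is a list whose `values` element is a samples-by-components matrix of principal-component scores, as the indexing above implies, the suite might be driven as follows; all variable names and parameter values are illustrative.

# hypothetical driver, assuming pca_data$values holds PC scores (samples x PCs)
clusters = automated_hclust(pca_data)
centers = find_cluster_centers(clusters, pca_data$values[, c(1, 2)])
# reject clusters whose centres lie more than 3 SDs from the overall centroid
bad_sd = find_cluster_outliers_sd(clusters, centers, pca_data, 3)
# reject clusters more than 2x the average inter-cluster distance from the rest (using 2 dimensions)
bad_mcd = find_cluster_outliers_mcd(clusters, centers, pca_data, 2, 2)
# samples belonging to any rejected cluster
outlier_samples = which(clusters %in% as.numeric(union(bad_sd, bad_mcd)))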