Mercurial > repos > pmac > iterativepca
view R_functions/clustering_functions.R @ 0:64e75e21466e draft default tip
Uploaded
author | pmac |
---|---|
date | Wed, 01 Jun 2016 03:38:39 -0400 |
parents | |
children |
line wrap: on
line source
# Automated clustering suite, that makes use of dbscan and hclust # clustering algorithms library(dbscan) library(cluster) # Do the OPTICS clustering algorithm on the input data. # Computationally determines the 'optimal' value for the epsilion neighbourhood argument. # Outputs an integer array indicating which cluster each sample from the input data belongs to. # 0 indicates the sample is an outlier and does not belong to any sample automated_dbscan = function(pca_data, emax, mp) { d = dist(pca_data$values[, c(1, 2)]) oc = optics(d, emax, mp) eps = compute_eps(oc, 3) oc_clusters = optics_cut(oc, eps) return(oc_clusters$cluster) } # Do the hclust clustering algorithm on the input data, automatically # computing the optimal value for 'k', i.e. the number of clusters. # Outputs an integer array indicating which cluster each sample from the input data belongs to. automated_hclust = function(pca_data) { d = dist(pca_data$values[, c(1, 2)]) hc = hclust(d) k = compute_k(hc, d, 10) clusters = cutree(hc, k) return(clusters) } # determine the optimal epsilon value for dbscan # From the input OPTICS data, compute_eps = function(oc, weight) { rdist = oc$reachdist iqr = IQR(rdist) uc = median(rdist) + weight*iqr outliers = rdist[which(rdist > uc)] eps = median(outliers) return(eps) } # determining k using the silhouette method and hclust # Values near 1 indicate clustering is appropriate. compute_k = function(hc, d, n) { s = 1:n # try clusters from size 2 to n, evaluating the 'silhouette coefficient' # for each value of k. for (i in 1:n) { # silhouette method not valid for cluster size = 1 if (i == 1) { # if the silhouette coeff is < 0.5 for all the other values, # we assume there is just one big cluster s[i] = 0.5 } else { # get the silhoutte coeff for every point, and take the mean clusters = cutree(hc, i) sil = silhouette(clusters, d) sil_coeff = mean(sil[, 'sil_width']) s[i] = sil_coeff } } return(which.max(s)) } # Compute the centroids of each cluster # returns an n x m matrix, where # n = number of clusters # m = dimension of data find_cluster_centers = function(clusters, x) { cluster_ids = unique(clusters) k = length(cluster_ids) centers = matrix(nrow=k, ncol=ncol(x)) rownames(centers) = 1:k for (i in 1:k) { this_id = cluster_ids[i] # dont work out centre of outliers cluster_points = x[which(clusters==this_id), ] if (is.null(dim(cluster_points))) { l = length(cluster_points) cluster_points = matrix(cluster_points, ncol=l) } centers[i, ] = apply(cluster_points, 2, FUN=mean) rownames(centers)[i] = this_id } centers = centers[rownames(centers) != 0, , drop=FALSE] return(centers) } # Determine which clusters are outliers based on how far away their cluster # centers are from the center of all the data. Returns a list containing the # 'ids' of all rejected CLUSTERS (not samples) find_cluster_outliers_sd = function(clusters, centers, pca_data, numsds) { if(nrow(centers) <= 1) { print("no centers, or only one center") return(NULL) } # work out the standard deviation of the data in pc1 and pc2 directions pc1_sd = sd(pca_data$values[, 1]) pc2_sd = sd(pca_data$values[, 2]) # work out centroid of the pca data centroid = apply(pca_data$values, 2, FUN=mean) pc1_bar = centroid[1] pc2_bar = centroid[2] # get pc1 and pc2 hi-lo cutoffs pc1_hi = pc1_bar + pc1_sd*numsds pc1_lo = pc1_bar - pc1_sd*numsds pc2_hi = pc2_bar + pc2_sd*numsds pc2_lo = pc2_bar - pc2_sd*numsds # check which clusters fall outside cutoffs pc1_centers = centers[, 1] pc2_centers = centers[, 2] pc1_ol = which(pc1_centers > pc1_hi | pc1_centers < pc1_lo) pc2_ol = which(pc2_centers > pc2_hi | pc2_centers < pc2_lo) all_ol = union(pc1_ol, pc2_ol) rc = rownames(centers)[all_ol] # id of rejected clusters return(rc) } # Determine which clusters are outliers based on how far away they are from other clusters # Returns a list containing the 'ids' of all rejected CLUSTERS (not samples) find_cluster_outliers_mcd = function(clusters, centers, pca_data, multiplier, ndim) { if(nrow(centers) <= 1) { print("no centers, or only one center") return(NULL) } d = dist(centers[, 1:ndim]) dm = as.matrix(d) avg_distance = mean(as.numeric(d)) # ignore the diagonal cluster_distances = rowSums(dm)/(ncol(dm)-1) # trim clusters based on how far apart they are cutoff = multiplier*avg_distance indices = which(cluster_distances > cutoff) rc = names(cluster_distances)[indices] return(rc) }