Mercurial > repos > pmac > iterativepca
comparison R_functions/clustering_functions.R @ 0:64e75e21466e draft default tip
Uploaded
| author | pmac |
|---|---|
| date | Wed, 01 Jun 2016 03:38:39 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:64e75e21466e |
|---|---|
| 1 # Automated clustering suite, that makes use of dbscan and hclust | |
| 2 # clustering algorithms | |
| 3 library(dbscan) | |
| 4 library(cluster) | |
| 5 | |
| 6 # Do the OPTICS clustering algorithm on the input data. | |
| 7 # Computationally determines the 'optimal' value for the epsilion neighbourhood argument. | |
| 8 # Outputs an integer array indicating which cluster each sample from the input data belongs to. | |
| 9 # 0 indicates the sample is an outlier and does not belong to any sample | |
| 10 automated_dbscan = function(pca_data, emax, mp) { | |
| 11 d = dist(pca_data$values[, c(1, 2)]) | |
| 12 oc = optics(d, emax, mp) | |
| 13 eps = compute_eps(oc, 3) | |
| 14 oc_clusters = optics_cut(oc, eps) | |
| 15 return(oc_clusters$cluster) | |
| 16 } | |
| 17 | |
| 18 # Do the hclust clustering algorithm on the input data, automatically | |
| 19 # computing the optimal value for 'k', i.e. the number of clusters. | |
| 20 # Outputs an integer array indicating which cluster each sample from the input data belongs to. | |
| 21 automated_hclust = function(pca_data) { | |
| 22 d = dist(pca_data$values[, c(1, 2)]) | |
| 23 hc = hclust(d) | |
| 24 k = compute_k(hc, d, 10) | |
| 25 clusters = cutree(hc, k) | |
| 26 return(clusters) | |
| 27 } | |
| 28 | |
| 29 # determine the optimal epsilon value for dbscan | |
| 30 # From the input OPTICS data, | |
| 31 compute_eps = function(oc, weight) { | |
| 32 rdist = oc$reachdist | |
| 33 iqr = IQR(rdist) | |
| 34 uc = median(rdist) + weight*iqr | |
| 35 outliers = rdist[which(rdist > uc)] | |
| 36 eps = median(outliers) | |
| 37 return(eps) | |
| 38 } | |
| 39 | |
| 40 # determining k using the silhouette method and hclust | |
| 41 # Values near 1 indicate clustering is appropriate. | |
| 42 compute_k = function(hc, d, n) { | |
| 43 s = 1:n | |
| 44 # try clusters from size 2 to n, evaluating the 'silhouette coefficient' | |
| 45 # for each value of k. | |
| 46 for (i in 1:n) { | |
| 47 # silhouette method not valid for cluster size = 1 | |
| 48 if (i == 1) { | |
| 49 # if the silhouette coeff is < 0.5 for all the other values, | |
| 50 # we assume there is just one big cluster | |
| 51 s[i] = 0.5 | |
| 52 } else { | |
| 53 # get the silhoutte coeff for every point, and take the mean | |
| 54 clusters = cutree(hc, i) | |
| 55 sil = silhouette(clusters, d) | |
| 56 sil_coeff = mean(sil[, 'sil_width']) | |
| 57 s[i] = sil_coeff | |
| 58 } | |
| 59 } | |
| 60 return(which.max(s)) | |
| 61 } | |
| 62 | |
| 63 # Compute the centroids of each cluster | |
| 64 # returns an n x m matrix, where | |
| 65 # n = number of clusters | |
| 66 # m = dimension of data | |
| 67 find_cluster_centers = function(clusters, x) { | |
| 68 cluster_ids = unique(clusters) | |
| 69 k = length(cluster_ids) | |
| 70 centers = matrix(nrow=k, ncol=ncol(x)) | |
| 71 rownames(centers) = 1:k | |
| 72 for (i in 1:k) { | |
| 73 this_id = cluster_ids[i] | |
| 74 # dont work out centre of outliers | |
| 75 cluster_points = x[which(clusters==this_id), ] | |
| 76 if (is.null(dim(cluster_points))) { | |
| 77 l = length(cluster_points) | |
| 78 cluster_points = matrix(cluster_points, ncol=l) | |
| 79 } | |
| 80 centers[i, ] = apply(cluster_points, 2, FUN=mean) | |
| 81 rownames(centers)[i] = this_id | |
| 82 } | |
| 83 centers = centers[rownames(centers) != 0, , drop=FALSE] | |
| 84 return(centers) | |
| 85 } | |
| 86 | |
| 87 # Determine which clusters are outliers based on how far away their cluster | |
| 88 # centers are from the center of all the data. Returns a list containing the | |
| 89 # 'ids' of all rejected CLUSTERS (not samples) | |
| 90 find_cluster_outliers_sd = function(clusters, centers, pca_data, numsds) { | |
| 91 if(nrow(centers) <= 1) { | |
| 92 print("no centers, or only one center") | |
| 93 return(NULL) | |
| 94 } | |
| 95 # work out the standard deviation of the data in pc1 and pc2 directions | |
| 96 pc1_sd = sd(pca_data$values[, 1]) | |
| 97 pc2_sd = sd(pca_data$values[, 2]) | |
| 98 # work out centroid of the pca data | |
| 99 centroid = apply(pca_data$values, 2, FUN=mean) | |
| 100 pc1_bar = centroid[1] | |
| 101 pc2_bar = centroid[2] | |
| 102 # get pc1 and pc2 hi-lo cutoffs | |
| 103 pc1_hi = pc1_bar + pc1_sd*numsds | |
| 104 pc1_lo = pc1_bar - pc1_sd*numsds | |
| 105 pc2_hi = pc2_bar + pc2_sd*numsds | |
| 106 pc2_lo = pc2_bar - pc2_sd*numsds | |
| 107 # check which clusters fall outside cutoffs | |
| 108 pc1_centers = centers[, 1] | |
| 109 pc2_centers = centers[, 2] | |
| 110 pc1_ol = which(pc1_centers > pc1_hi | pc1_centers < pc1_lo) | |
| 111 pc2_ol = which(pc2_centers > pc2_hi | pc2_centers < pc2_lo) | |
| 112 all_ol = union(pc1_ol, pc2_ol) | |
| 113 rc = rownames(centers)[all_ol] # id of rejected clusters | |
| 114 return(rc) | |
| 115 } | |
| 116 | |
| 117 # Determine which clusters are outliers based on how far away they are from other clusters | |
| 118 # Returns a list containing the 'ids' of all rejected CLUSTERS (not samples) | |
| 119 find_cluster_outliers_mcd = function(clusters, centers, pca_data, multiplier, ndim) { | |
| 120 if(nrow(centers) <= 1) { | |
| 121 print("no centers, or only one center") | |
| 122 return(NULL) | |
| 123 } | |
| 124 d = dist(centers[, 1:ndim]) | |
| 125 dm = as.matrix(d) | |
| 126 avg_distance = mean(as.numeric(d)) | |
| 127 # ignore the diagonal | |
| 128 cluster_distances = rowSums(dm)/(ncol(dm)-1) | |
| 129 # trim clusters based on how far apart they are | |
| 130 cutoff = multiplier*avg_distance | |
| 131 indices = which(cluster_distances > cutoff) | |
| 132 rc = names(cluster_distances)[indices] | |
| 133 return(rc) | |
| 134 } |
