# Automated clustering suite that makes use of the dbscan and hclust
# clustering algorithms
library(dbscan)
library(cluster)

# Run the OPTICS clustering algorithm on the input data.
# Computationally determines the 'optimal' value for the epsilon neighbourhood argument.
# Outputs an integer vector indicating which cluster each sample from the input data belongs to.
# 0 indicates the sample is an outlier and does not belong to any cluster.
automated_dbscan = function(pca_data, emax, mp) {
  d = dist(pca_data$values[, c(1, 2)])
  oc = optics(d, eps = emax, minPts = mp)
  eps = compute_eps(oc, 3)
  # extractDBSCAN() cuts the OPTICS reachability plot at a fixed eps;
  # it replaces the deprecated optics_cut()
  oc_clusters = extractDBSCAN(oc, eps_cl = eps)
  return(oc_clusters$cluster)
}
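
# Illustrative usage sketch (not part of the original pipeline). It assumes
# 'pca_data' is a list whose $values element is a matrix of PC scores, e.g.
# list(values = prcomp(x)$x); the simulated blobs and the emax/mp settings
# below are arbitrary choices for the example. Wrapped in a function so
# sourcing this file does not run the demo.
example_automated_dbscan = function() {
  set.seed(1)
  scores = rbind(matrix(rnorm(100, mean = 0), ncol = 2),
                 matrix(rnorm(100, mean = 6), ncol = 2),
                 matrix(rnorm(100, mean = 12), ncol = 2))
  toy_pca = list(values = scores)
  automated_dbscan(toy_pca, emax = 10, mp = 5)
}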

# Run the hclust clustering algorithm on the input data, automatically
# computing the optimal value for 'k', i.e. the number of clusters.
# Outputs an integer vector indicating which cluster each sample from the input data belongs to.
automated_hclust = function(pca_data) {
  d = dist(pca_data$values[, c(1, 2)])
  hc = hclust(d)
  # evaluate k = 1..10 with the silhouette method
  k = compute_k(hc, d, 10)
  clusters = cutree(hc, k)
  return(clusters)
}
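
# Illustrative usage sketch for automated_hclust(), under the same assumption
# about the shape of 'pca_data' as above (a list with a $values score matrix).
# Wrapped in a function so sourcing this file does not run the demo.
example_automated_hclust = function() {
  set.seed(2)
  scores = rbind(matrix(rnorm(100, mean = 0), ncol = 2),
                 matrix(rnorm(100, mean = 6), ncol = 2))
  toy_pca = list(values = scores)
  table(automated_hclust(toy_pca))
}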

# Determine the 'optimal' epsilon value for dbscan from the reachability
# distances of the input OPTICS result: reachability distances more than
# 'weight' IQRs above the median are treated as the large between-cluster
# jumps, and eps is taken as the median of those jumps.
compute_eps = function(oc, weight) {
  rdist = oc$reachdist
  iqr = IQR(rdist)
  # upper cutoff: median + weight * IQR
  uc = median(rdist) + weight * iqr
  outliers = rdist[which(rdist > uc)]
  eps = median(outliers)
  return(eps)
}
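
# Tiny worked example of the eps heuristic on a hand-made reachability
# profile (the numbers are made up for illustration, not real OPTICS output):
# most reachability distances sit around 0.5, and the two jumps of ~4 between
# clusters are what end up defining eps.
example_compute_eps = function() {
  # 18 small within-cluster distances, two large jumps, plus the leading Inf
  # that OPTICS assigns to the first point in the ordering
  fake_oc = list(reachdist = c(Inf, rep(c(0.4, 0.5, 0.6), times = 6), 4.0, 4.2))
  # cutoff = 0.5 + 3 * 0.2 = 1.1, so eps = median(c(4.0, 4.2, Inf)) = 4.2
  compute_eps(fake_oc, weight = 3)
}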

# Determine k using the silhouette method and hclust.
# Silhouette values near 1 indicate the clustering is appropriate.
compute_k = function(hc, d, n) {
  s = numeric(n)
  # try values of k from 2 to n, evaluating the mean 'silhouette coefficient'
  # for each value of k
  for (i in 1:n) {
    # the silhouette method is not defined for k = 1
    if (i == 1) {
      # if the silhouette coefficient is < 0.5 for all other values of k,
      # we assume there is just one big cluster
      s[i] = 0.5
    } else {
      # get the silhouette coefficient for every point, and take the mean
      clusters = cutree(hc, i)
      sil = silhouette(clusters, d)
      sil_coeff = mean(sil[, 'sil_width'])
      s[i] = sil_coeff
    }
  }
  return(which.max(s))
}
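
# Illustrative sketch of compute_k() on simulated data with three obvious
# groups; the toy data and the n = 10 cap are arbitrary choices for the
# example.
example_compute_k = function() {
  set.seed(3)
  x = rbind(matrix(rnorm(60, mean = 0), ncol = 2),
            matrix(rnorm(60, mean = 6), ncol = 2),
            matrix(rnorm(60, mean = 12), ncol = 2))
  d = dist(x)
  hc = hclust(d)
  compute_k(hc, d, 10)  # should pick k = 3 for well-separated blobs
}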

# Compute the centroid of each cluster.
# Returns an n x m matrix, where
#   n = number of clusters
#   m = dimension of the data
# Rows are named by cluster id; the dbscan outlier 'cluster' (id 0) is dropped
# so outliers do not get a centre.
find_cluster_centers = function(clusters, x) {
  cluster_ids = unique(clusters)
  k = length(cluster_ids)
  centers = matrix(nrow = k, ncol = ncol(x))
  rownames(centers) = 1:k
  for (i in 1:k) {
    this_id = cluster_ids[i]
    # drop = FALSE keeps single-point clusters as a one-row matrix
    cluster_points = x[which(clusters == this_id), , drop = FALSE]
    centers[i, ] = apply(cluster_points, 2, FUN = mean)
    rownames(centers)[i] = this_id
  }
  # don't report a centre for the outlier points (cluster id 0)
  centers = centers[rownames(centers) != 0, , drop = FALSE]
  return(centers)
}
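
# Illustrative sketch of find_cluster_centers() with cluster labels in the
# style of dbscan output (0 = outlier) and toy 2-d points; the outlier row is
# dropped, so only centres for clusters 1 and 2 come back.
example_find_cluster_centers = function() {
  x = rbind(c(0, 0), c(1, 1), c(10, 10), c(11, 11), c(50, 50))
  labels = c(1, 1, 2, 2, 0)
  find_cluster_centers(labels, x)  # 2 x 2 matrix with rows named "1" and "2"
}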

# Determine which clusters are outliers based on how far away their cluster
# centers are from the center of all the data. Returns a character vector
# containing the ids of all rejected CLUSTERS (not samples).
find_cluster_outliers_sd = function(clusters, centers, pca_data, numsds) {
  if (nrow(centers) <= 1) {
    print("no centers, or only one center")
    return(NULL)
  }
  # work out the standard deviation of the data in the pc1 and pc2 directions
  pc1_sd = sd(pca_data$values[, 1])
  pc2_sd = sd(pca_data$values[, 2])
  # work out the centroid of the pca data
  centroid = apply(pca_data$values, 2, FUN = mean)
  pc1_bar = centroid[1]
  pc2_bar = centroid[2]
  # get pc1 and pc2 hi-lo cutoffs
  pc1_hi = pc1_bar + pc1_sd * numsds
  pc1_lo = pc1_bar - pc1_sd * numsds
  pc2_hi = pc2_bar + pc2_sd * numsds
  pc2_lo = pc2_bar - pc2_sd * numsds
  # check which cluster centers fall outside the cutoffs
  pc1_centers = centers[, 1]
  pc2_centers = centers[, 2]
  pc1_ol = which(pc1_centers > pc1_hi | pc1_centers < pc1_lo)
  pc2_ol = which(pc2_centers > pc2_hi | pc2_centers < pc2_lo)
  all_ol = union(pc1_ol, pc2_ol)
  rc = rownames(centers)[all_ol]  # ids of rejected clusters
  return(rc)
}
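
# Illustrative sketch of find_cluster_outliers_sd(): two clusters near the
# origin plus a small far-away cluster, which is flagged because its centre
# lies more than 'numsds' standard deviations from the data centroid. The toy
# data and numsds = 2 are arbitrary choices for the example.
example_find_cluster_outliers_sd = function() {
  set.seed(4)
  scores = rbind(matrix(rnorm(100, mean = 0), ncol = 2),
                 matrix(rnorm(100, mean = 3), ncol = 2),
                 matrix(rnorm(10, mean = 30), ncol = 2))
  toy_pca = list(values = scores)
  labels = c(rep(1, 50), rep(2, 50), rep(3, 5))
  centers = find_cluster_centers(labels, scores)
  find_cluster_outliers_sd(labels, centers, toy_pca, numsds = 2)  # expect "3"
}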

# Determine which clusters are outliers based on the mean distance from each
# cluster centre to every other cluster centre (using the first ndim columns
# of 'centers'): a cluster is rejected if this mean distance exceeds
# 'multiplier' times the average pairwise distance between centres.
# Returns a character vector containing the ids of all rejected CLUSTERS (not samples).
find_cluster_outliers_mcd = function(clusters, centers, pca_data, multiplier, ndim) {
  if (nrow(centers) <= 1) {
    print("no centers, or only one center")
    return(NULL)
  }
  d = dist(centers[, 1:ndim])
  dm = as.matrix(d)
  # average pairwise distance between cluster centres
  avg_distance = mean(as.numeric(d))
  # mean distance from each centre to the others
  # (dividing by ncol(dm) - 1 ignores the zero diagonal)
  cluster_distances = rowSums(dm) / (ncol(dm) - 1)
  # trim clusters based on how far apart they are
  cutoff = multiplier * avg_distance
  indices = which(cluster_distances > cutoff)
  rc = names(cluster_distances)[indices]
  return(rc)
}
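
# End-to-end illustrative sketch tying the pieces together: cluster the PC
# scores with automated_hclust(), compute the cluster centres, then check for
# distant clusters with find_cluster_outliers_mcd(). The simulated data,
# multiplier = 1.2 and ndim = 2 are arbitrary example settings, not values
# from the original analysis; any cluster whose centre sits unusually far
# from the others ends up in $rejected (possibly none).
example_pipeline = function() {
  set.seed(5)
  scores = rbind(matrix(rnorm(100, mean = 0), ncol = 2),
                 matrix(rnorm(100, mean = 8), ncol = 2),
                 matrix(rnorm(100, mean = 30), ncol = 2))
  toy_pca = list(values = scores)
  clusters = automated_hclust(toy_pca)
  centers = find_cluster_centers(clusters, toy_pca$values[, c(1, 2)])
  rejected = find_cluster_outliers_mcd(clusters, centers, toy_pca,
                                       multiplier = 1.2, ndim = 2)
  list(clusters = table(clusters), centers = centers, rejected = rejected)
}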