# Automated clustering suite that makes use of the dbscan and hclust
# clustering algorithms
library(dbscan)
library(cluster)

# Run the OPTICS clustering algorithm on the input data.
# Computationally determines the 'optimal' value for the epsilon neighbourhood argument.
# Outputs an integer vector indicating which cluster each sample from the input data belongs to.
# 0 indicates the sample is an outlier and does not belong to any cluster.
automated_dbscan = function(pca_data, emax, mp) {
  d = dist(pca_data$values[, c(1, 2)])
  oc = optics(d, eps = emax, minPts = mp)
  eps = compute_eps(oc, 3)
  # extractDBSCAN() cuts the OPTICS reachability plot at a fixed eps;
  # it replaces the deprecated optics_cut()
  oc_clusters = extractDBSCAN(oc, eps_cl = eps)
  return(oc_clusters$cluster)
}
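
# Illustrative usage sketch (not part of the original pipeline). It assumes
# 'pca_data' is a list whose $values element is a matrix of PC scores, e.g.
# list(values = prcomp(x)$x); the simulated blobs and the emax/mp settings
# below are arbitrary choices for the example. Wrapped in a function so
# sourcing this file does not run the demo.
example_automated_dbscan = function() {
  set.seed(1)
  scores = rbind(matrix(rnorm(100, mean = 0), ncol = 2),
                 matrix(rnorm(100, mean = 6), ncol = 2),
                 matrix(rnorm(100, mean = 12), ncol = 2))
  toy_pca = list(values = scores)
  automated_dbscan(toy_pca, emax = 10, mp = 5)
}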

# Run the hclust clustering algorithm on the input data, automatically
# computing the optimal value for 'k', i.e. the number of clusters.
# Outputs an integer vector indicating which cluster each sample from the input data belongs to.
automated_hclust = function(pca_data) {
  d = dist(pca_data$values[, c(1, 2)])
  hc = hclust(d)
  # evaluate k = 1..10 with the silhouette method
  k = compute_k(hc, d, 10)
  clusters = cutree(hc, k)
  return(clusters)
}
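
# Illustrative usage sketch for automated_hclust(), under the same assumption
# about the shape of 'pca_data' as above (a list with a $values score matrix).
# Wrapped in a function so sourcing this file does not run the demo.
example_automated_hclust = function() {
  set.seed(2)
  scores = rbind(matrix(rnorm(100, mean = 0), ncol = 2),
                 matrix(rnorm(100, mean = 6), ncol = 2))
  toy_pca = list(values = scores)
  table(automated_hclust(toy_pca))
}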

# Determine the 'optimal' epsilon value for dbscan from the reachability
# distances of the input OPTICS result: reachability distances more than
# 'weight' IQRs above the median are treated as the large between-cluster
# jumps, and eps is taken as the median of those jumps.
compute_eps = function(oc, weight) {
  rdist = oc$reachdist
  iqr = IQR(rdist)
  # upper cutoff: median + weight * IQR
  uc = median(rdist) + weight * iqr
  outliers = rdist[which(rdist > uc)]
  eps = median(outliers)
  return(eps)
}
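
# Tiny worked example of the eps heuristic on a hand-made reachability
# profile (the numbers are made up for illustration, not real OPTICS output):
# most reachability distances sit around 0.5, and the two jumps of ~4 between
# clusters are what end up defining eps.
example_compute_eps = function() {
  # 18 small within-cluster distances, two large jumps, plus the leading Inf
  # that OPTICS assigns to the first point in the ordering
  fake_oc = list(reachdist = c(Inf, rep(c(0.4, 0.5, 0.6), times = 6), 4.0, 4.2))
  # cutoff = 0.5 + 3 * 0.2 = 1.1, so eps = median(c(4.0, 4.2, Inf)) = 4.2
  compute_eps(fake_oc, weight = 3)
}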

# Determine k using the silhouette method and hclust.
# Silhouette values near 1 indicate the clustering is appropriate.
compute_k = function(hc, d, n) {
  s = numeric(n)
  # try values of k from 2 to n, evaluating the mean 'silhouette coefficient'
  # for each value of k
  for (i in 1:n) {
    # the silhouette method is not defined for k = 1
    if (i == 1) {
      # if the silhouette coefficient is < 0.5 for all other values of k,
      # we assume there is just one big cluster
      s[i] = 0.5
    } else {
      # get the silhouette coefficient for every point, and take the mean
      clusters = cutree(hc, i)
      sil = silhouette(clusters, d)
      sil_coeff = mean(sil[, 'sil_width'])
      s[i] = sil_coeff
    }
  }
  return(which.max(s))
}
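
# Illustrative sketch of compute_k() on simulated data with three obvious
# groups; the toy data and the n = 10 cap are arbitrary choices for the
# example.
example_compute_k = function() {
  set.seed(3)
  x = rbind(matrix(rnorm(60, mean = 0), ncol = 2),
            matrix(rnorm(60, mean = 6), ncol = 2),
            matrix(rnorm(60, mean = 12), ncol = 2))
  d = dist(x)
  hc = hclust(d)
  compute_k(hc, d, 10)  # should pick k = 3 for well-separated blobs
}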

# Compute the centroid of each cluster.
# Returns an n x m matrix, where
#   n = number of clusters
#   m = dimension of the data
# Rows are named by cluster id; the dbscan outlier 'cluster' (id 0) is dropped
# so outliers do not get a centre.
find_cluster_centers = function(clusters, x) {
  cluster_ids = unique(clusters)
  k = length(cluster_ids)
  centers = matrix(nrow = k, ncol = ncol(x))
  rownames(centers) = 1:k
  for (i in 1:k) {
    this_id = cluster_ids[i]
    # drop = FALSE keeps single-point clusters as a one-row matrix
    cluster_points = x[which(clusters == this_id), , drop = FALSE]
    centers[i, ] = apply(cluster_points, 2, FUN = mean)
    rownames(centers)[i] = this_id
  }
  # don't report a centre for the outlier points (cluster id 0)
  centers = centers[rownames(centers) != 0, , drop = FALSE]
  return(centers)
}
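
# Illustrative sketch of find_cluster_centers() with cluster labels in the
# style of dbscan output (0 = outlier) and toy 2-d points; the outlier row is
# dropped, so only centres for clusters 1 and 2 come back.
example_find_cluster_centers = function() {
  x = rbind(c(0, 0), c(1, 1), c(10, 10), c(11, 11), c(50, 50))
  labels = c(1, 1, 2, 2, 0)
  find_cluster_centers(labels, x)  # 2 x 2 matrix with rows named "1" and "2"
}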

# Determine which clusters are outliers based on how far away their cluster
# centers are from the center of all the data. Returns a character vector
# containing the ids of all rejected CLUSTERS (not samples).
find_cluster_outliers_sd = function(clusters, centers, pca_data, numsds) {
  if (nrow(centers) <= 1) {
    print("no centers, or only one center")
    return(NULL)
  }
  # work out the standard deviation of the data in the pc1 and pc2 directions
  pc1_sd = sd(pca_data$values[, 1])
  pc2_sd = sd(pca_data$values[, 2])
  # work out the centroid of the pca data
  centroid = apply(pca_data$values, 2, FUN = mean)
  pc1_bar = centroid[1]
  pc2_bar = centroid[2]
  # get pc1 and pc2 hi-lo cutoffs
  pc1_hi = pc1_bar + pc1_sd * numsds
  pc1_lo = pc1_bar - pc1_sd * numsds
  pc2_hi = pc2_bar + pc2_sd * numsds
  pc2_lo = pc2_bar - pc2_sd * numsds
  # check which cluster centers fall outside the cutoffs
  pc1_centers = centers[, 1]
  pc2_centers = centers[, 2]
  pc1_ol = which(pc1_centers > pc1_hi | pc1_centers < pc1_lo)
  pc2_ol = which(pc2_centers > pc2_hi | pc2_centers < pc2_lo)
  all_ol = union(pc1_ol, pc2_ol)
  rc = rownames(centers)[all_ol]  # ids of rejected clusters
  return(rc)
}
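
# Illustrative sketch of find_cluster_outliers_sd(): two clusters near the
# origin plus a small far-away cluster, which is flagged because its centre
# lies more than 'numsds' standard deviations from the data centroid. The toy
# data and numsds = 2 are arbitrary choices for the example.
example_find_cluster_outliers_sd = function() {
  set.seed(4)
  scores = rbind(matrix(rnorm(100, mean = 0), ncol = 2),
                 matrix(rnorm(100, mean = 3), ncol = 2),
                 matrix(rnorm(10, mean = 30), ncol = 2))
  toy_pca = list(values = scores)
  labels = c(rep(1, 50), rep(2, 50), rep(3, 5))
  centers = find_cluster_centers(labels, scores)
  find_cluster_outliers_sd(labels, centers, toy_pca, numsds = 2)  # expect "3"
}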

# Determine which clusters are outliers based on the mean distance from each
# cluster centre to every other cluster centre (using the first ndim columns
# of 'centers'): a cluster is rejected if this mean distance exceeds
# 'multiplier' times the average pairwise distance between centres.
# Returns a character vector containing the ids of all rejected CLUSTERS (not samples).
find_cluster_outliers_mcd = function(clusters, centers, pca_data, multiplier, ndim) {
  if (nrow(centers) <= 1) {
    print("no centers, or only one center")
    return(NULL)
  }
  d = dist(centers[, 1:ndim])
  dm = as.matrix(d)
  # average pairwise distance between cluster centres
  avg_distance = mean(as.numeric(d))
  # mean distance from each centre to the others
  # (dividing by ncol(dm) - 1 ignores the zero diagonal)
  cluster_distances = rowSums(dm) / (ncol(dm) - 1)
  # trim clusters based on how far apart they are
  cutoff = multiplier * avg_distance
  indices = which(cluster_distances > cutoff)
  rc = names(cluster_distances)[indices]
  return(rc)
}
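
# End-to-end illustrative sketch tying the pieces together: cluster the PC
# scores with automated_hclust(), compute the cluster centres, then check for
# distant clusters with find_cluster_outliers_mcd(). The simulated data,
# multiplier = 1.2 and ndim = 2 are arbitrary example settings, not values
# from the original analysis; any cluster whose centre sits unusually far
# from the others ends up in $rejected (possibly none).
example_pipeline = function() {
  set.seed(5)
  scores = rbind(matrix(rnorm(100, mean = 0), ncol = 2),
                 matrix(rnorm(100, mean = 8), ncol = 2),
                 matrix(rnorm(100, mean = 30), ncol = 2))
  toy_pca = list(values = scores)
  clusters = automated_hclust(toy_pca)
  centers = find_cluster_centers(clusters, toy_pca$values[, c(1, 2)])
  rejected = find_cluster_outliers_mcd(clusters, centers, toy_pca,
                                       multiplier = 1.2, ndim = 2)
  list(clusters = table(clusters), centers = centers, rejected = rejected)
}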