annotate src/breadcrumbs/src/KMedoids.py @ 0:2f4f6f08c8c4 draft

Uploaded
author george-weingart
date Tue, 13 May 2014 21:58:57 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
1 ## Included from MLPY build 2.2.0
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
2 ## Attempts were made to contact Davide Albanese on 08-10-2012 and 09-19-2012 at albanese@fbk.eu
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
3
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
4 ## This code is written by Davide Albanese, <albanese@fbk.eu>
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
5 ## (C) 2009 Fondazione Bruno Kessler - Via Santa Croce 77, 38100 Trento, ITALY.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
6
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
7 ## This program is free software: you can redistribute it and/or modify
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
8 ## it under the terms of the GNU General Public License as published by
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
9 ## the Free Software Foundation, either version 3 of the License, or
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
10 ## (at your option) any later version.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
11
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
12 ## This program is distributed in the hope that it will be useful,
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
13 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
14 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
15 ## GNU General Public License for more details.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
16
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
17 ## You should have received a copy of the GNU General Public License
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
18 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
19
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
20
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
21 __all__= ['Kmedoids', 'Minkowski']
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
22
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
23
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
24 import numpy as np
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
25 import matplotlib
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
26 matplotlib.use( "Agg" )
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
27 import mlpy
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
28
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
29
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
30 def kmedoids_core(x, med, oth, clust, cost, dist):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
31 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
32 * for each mediod m
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
33 * for each non-mediod data point n
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
34 Swap m and n and compute the total cost of the configuration
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
35 Select the configuration with the lowest cost
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
36 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
37
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
38 d = np.empty((oth.shape[0], med.shape[0]), dtype=float)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
39 med_n = np.empty_like(med)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
40 oth_n = np.empty_like(oth)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
41 idx = np.arange(oth.shape[0])
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
42
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
43 med_cur = med.copy()
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
44 oth_cur = oth.copy()
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
45 clust_cur = clust.copy()
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
46 cost_cur = cost
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
47
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
48 for i, m in enumerate(med):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
49 for j, n in enumerate(oth[clust == i]):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
50
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
51 med_n, oth_n = med.copy(), oth.copy()
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
52
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
53 med_n[i] = n
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
54 tmp = oth_n[clust == i]
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
55 tmp[j] = m
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
56 oth_n[clust == i] = tmp
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
57
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
58 for ii, nn in enumerate(oth_n):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
59 for jj, mm in enumerate(med_n):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
60 d[ii, jj] = dist.compute(x[mm], x[nn])
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
61
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
62 clust_n = np.argmin(d, axis=1) # clusters
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
63 cost_n = np.sum(d[idx, clust_n]) # total cost of configuration
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
64
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
65 if cost_n <= cost_cur:
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
66 med_cur = med_n.copy()
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
67 oth_cur = oth_n.copy()
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
68 clust_cur = clust_n.copy()
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
69 cost_cur = cost_n
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
70
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
71 return med_cur, oth_cur, clust_cur, cost_cur
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
72
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
73
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
74 class Kmedoids:
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
75 """k-medoids algorithm.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
76 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
77
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
78 def __init__(self, k, dist, maxloops=100, rs=0):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
79 """Initialize Kmedoids.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
80
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
81 :Parameters:
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
82
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
83 k : int
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
84 Number of clusters/medoids
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
85 dist : class
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
86 class with a .compute(x, y) method which
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
87 returns a distance
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
88 maxloops : int
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
89 maximum number of loops
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
90 rs : int
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
91 random seed
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
92
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
93 Example:
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
94
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
95 >>> import numpy as np
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
96 >>> import mlpy
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
97 >>> x = np.array([[ 1. , 1.5],
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
98 ... [ 1.1, 1.8],
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
99 ... [ 2. , 2.8],
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
100 ... [ 3.2, 3.1],
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
101 ... [ 3.4, 3.2]])
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
102 >>> dtw = mlpy.Dtw(onlydist=True)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
103 >>> km = mlpy.Kmedoids(k=3, dist=dtw)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
104 >>> km.compute(x)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
105 (array([4, 0, 2]), array([3, 1]), array([0, 1]), 0.072499999999999981)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
106
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
107 Samples 4, 0, 2 are medoids and represent cluster 0, 1, 2 respectively.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
108
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
109 * cluster 0: samples 4 (medoid) and 3
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
110 * cluster 1: samples 0 (medoid) and 1
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
111 * cluster 2: sample 2 (medoid)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
112 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
113
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
114 self.__k = k
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
115 self.__maxloops = maxloops
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
116 self.__rs = rs
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
117 self.__dist = dist
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
118
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
119 np.random.seed(self.__rs)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
120
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
121
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
122 def compute(self, x):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
123 """Compute Kmedoids.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
124
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
125 :Parameters:
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
126 x : ndarray
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
127 An 2-dimensional vector (sample x features).
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
128
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
129 :Returns:
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
130 m : ndarray (1-dimensional vector)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
131 medoids indexes
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
132 n : ndarray (1-dimensional vector)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
133 non-medoids indexes
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
134 cl : ndarray 1-dimensional vector)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
135 cluster membership for non-medoids.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
136 Groups are in 0, ..., k-1
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
137 co : double
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
138 total cost of configuration
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
139 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
140
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
141 # randomly select k of the n data points as the mediods
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
142 idx = np.arange(x.shape[0])
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
143 np.random.shuffle(idx)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
144 med = idx[0:self.__k]
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
145 oth = idx[self.__k::]
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
146
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
147 # compute distances
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
148 d = np.empty((oth.shape[0], med.shape[0]), dtype=float)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
149 for i, n in enumerate(oth):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
150 for j, m in enumerate(med):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
151 d[i, j] = self.__dist.compute(x[m], x[n])
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
152
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
153 # associate each data point to the closest medoid
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
154 clust = np.argmin(d, axis=1)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
155
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
156 # total cost of configuration
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
157 cost = np.sum(d[np.arange(d.shape[0]), clust])
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
158
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
159 # repeat kmedoids_core until there is no change in the medoid
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
160 for l in range(self.__maxloops):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
161
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
162 med_n, oth_n, clust_n, cost_n = kmedoids_core(x, med, oth, clust, cost, self.__dist)
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
163
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
164 if (cost_n < cost):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
165 med, oth, clust, cost = med_n, oth_n, clust_n, cost_n
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
166 else:
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
167 break
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
168
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
169 return med, oth, clust, cost
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
170
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
171
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
172 class Minkowski:
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
173 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
174 Computes the Minkowski distance between two vectors ``x`` and ``y``.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
175
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
176 .. math::
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
177
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
178 {||x-y||}_p = (\sum{|x_i - y_i|^p})^{1/p}.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
179 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
180
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
181 def __init__(self, p):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
182 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
183 Initialize Minkowski class.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
184
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
185 :Parameters:
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
186 p : float
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
187 The norm of the difference :math:`{||x-y||}_p`
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
188 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
189
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
190 self.__p = p
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
191
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
192
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
193 def compute(self, x, y):
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
194 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
195 Compute Minkowski distance
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
196
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
197 :Parameters:
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
198 x : ndarray
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
199 An 1-dimensional vector.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
200 y : ndarray
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
201 An 1-dimensional vector.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
202
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
203 :Returns:
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
204 d : float
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
205 The Minkowski distance between vectors ``x`` and ``y``
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
206 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
207
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
208 return (abs(x - y)**self.__p).sum() ** (1.0 / self.__p)