Muster
 All Classes Namespaces Files Functions Variables Typedefs Macros
kmedoids.cpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////////////////
2 // Copyright (c) 2010, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory
4 // LLNL-CODE-433662
5 // All rights reserved.
6 //
7 // This file is part of Muster. For details, see http://github.com/tgamblin/muster.
8 // Please also read the LICENSE file for further information.
9 //
10 // Redistribution and use in source and binary forms, with or without modification, are
11 // permitted provided that the following conditions are met:
12 //
13 // * Redistributions of source code must retain the above copyright notice, this list of
14 // conditions and the disclaimer below.
15 // * Redistributions in binary form must reproduce the above copyright notice, this list of
16 // conditions and the disclaimer (as noted below) in the documentation and/or other materials
17 // provided with the distribution.
18 // * Neither the name of the LLNS/LLNL nor the names of its contributors may be used to endorse
19 // or promote products derived from this software without specific prior written permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
22 // OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
23 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
24 // LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE
25 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 //////////////////////////////////////////////////////////////////////////////////////////////////
31 
32 ///
33 /// @file kmedoids.cpp
34 /// @author Todd Gamblin tgamblin@llnl.gov
35 ///
36 #include "kmedoids.h"
37 using namespace cluster;
38 
39 #include <vector>
40 #include <sstream>
41 
42 #include <algorithm>
43 #include <numeric>
44 #include <cassert>
45 #include <cstdlib>
46 #include <sys/time.h>
47 using namespace std;
48 
49 #include "random.h"
50 #include "bic.h"
51 #include "matrix_utils.h"
52 
53 namespace cluster {
54 
55  kmedoids::kmedoids(size_t num_objects)
56  : partition(num_objects),
57  random(get_time_seed()),
58  rng(random),
59  total_dissimilarity(std::numeric_limits<double>::infinity()),
60  sort_medoids(true),
61  epsilon(1e-15),
62  init_size(40),
63  max_reps(5),
64  xcallback(NULL)
65  { }
66 
67 
69 
71  return total_dissimilarity / cluster_ids.size();
72  }
73 
74  void kmedoids::set_seed(unsigned long s) {
75  random.seed(s);
76  }
77 
78  void kmedoids::set_sort_medoids(bool sort) {
80  }
81 
82  void kmedoids::set_epsilon(double e) {
83  epsilon = e;
84  }
85 
86  void kmedoids::set_xcallback(void (*xpc)(const partition& part, double bic)) {
87  xcallback = xpc;
88  }
89 
90  void kmedoids::init_medoids(size_t k, const dissimilarity_matrix& distance) {
91  medoid_ids.clear();
92  // find first oject: object minimum dissimilarity to others
93  object_id first_medoid = 0;
94  double min_dissim = DBL_MAX;
95  for (size_t i=0; i < distance.size1(); i++) {
96  double total = 0.0;
97  for (size_t j=0; j < distance.size2(); j++) {
98  total += distance(i,j);
99  }
100  if (total < min_dissim) {
101  min_dissim = total;
102  first_medoid = i;
103  }
104  }
105 
106  // add first object to medoids and compute medoid ids.
107  medoid_ids.push_back(first_medoid);
109 
110  // now select next k-1 objects according to KR's BUILD algorithm
111  for (size_t cur_k = 1; cur_k < k; cur_k++) {
112  object_id best_obj = 0;
113  double max_gain = 0.0;
114  for (size_t i=0; i < distance.size1(); i++) {
115  if (is_medoid(i)) continue;
116 
117  double gain = 0.0;
118  for (size_t j=0; j < distance.size1(); j++) {
119  double Dj = distance(j, medoid_ids[cluster_ids[j]]); // distance from j to its medoid
120  gain += max(Dj - distance(i,j), 0.0); // gain from selecting i
121  }
122 
123  if (gain >= max_gain) { // set the next medoid to the object that
124  max_gain = gain; // maximizes the gain function.
125  best_obj = i;
126  }
127  }
128 
129  medoid_ids.push_back(best_obj);
131  }
132  }
133 
134 
135  double kmedoids::cost(medoid_id i, object_id h, const dissimilarity_matrix& distance) const {
136  double total = 0;
137  for (object_id j = 0; j < cluster_ids.size(); j++) {
138  object_id mi = medoid_ids[i]; // object id of medoid i
139  double dhj = distance(h, j); // distance between object h and object j
140 
141  object_id mj1 = medoid_ids[cluster_ids[j]]; // object id of j's nearest medoid
142  double dj1 = distance(mj1, j); // distance to j's nearest medoid
143 
144  // check if distance bt/w medoid i and j is same as j's current nearest medoid.
145  if (distance(mi, j) == dj1) {
146  double dj2 = DBL_MAX;
147  if (medoid_ids.size() > 1) { // look at 2nd nearest if there's more than one medoid.
148  object_id mj2 = medoid_ids[sec_nearest[j]]; // object id of j's 2nd-nearest medoid
149  dj2 = distance(mj2, j); // distance to j's 2nd-nearest medoid
150  }
151  total += min(dj2, dhj) - dj1;
152 
153  } else if (dhj < dj1) {
154  total += dhj - dj1;
155  }
156  }
157  return total;
158  }
159 
160 
161  void kmedoids::pam(const dissimilarity_matrix& distance, size_t k, const object_id *initial_medoids) {
162  if (k > distance.size1()) {
163  throw std::logic_error("Attempt to run PAM with more clusters than data.");
164  }
165 
166  if (distance.size1() != distance.size2()) {
167  throw std::logic_error("Error: distance matrix is not square!");
168  }
169 
170  // first get this the right size.
171  cluster_ids.resize(distance.size1());
172 
173  // size cluster_ids appropriately and randomly pick initial medoids
174  if (initial_medoids) {
175  medoid_ids.clear();
176  copy(initial_medoids, initial_medoids + k, back_inserter(medoid_ids));
177  } else {
178  init_medoids(k, distance);
179  }
180 
181  // set tolerance equal to epsilon times mean magnitude of distances.
182  // Note that distances *should* all be non-negative.
183  double tolerance = epsilon * sum(distance) / (distance.size1() * distance.size2());
184 
185  while (true) {
186  // initial cluster setup
188 
189  //vars to keep track of minimum
190  double minTotalCost = DBL_MAX;
191  medoid_id minMedoid = 0;
192  object_id minObject = 0;
193 
194  //iterate over each medoid
195  for (medoid_id i=0; i < k; i++) {
196  //iterate over all non-medoid objects
197  for (object_id h = 0; h < cluster_ids.size(); h++) {
198  if (is_medoid(h)) continue;
199 
200  //see if the total cost of swapping i & h was less than min
201  double curCost = cost(i, h, distance);
202  if (curCost < minTotalCost) {
203  minTotalCost = curCost;
204  minMedoid = i;
205  minObject = h;
206  }
207  }
208  }
209 
210  // bail if we can't gain anything more (we've converged)
211  if (minTotalCost >= -tolerance) break;
212 
213  // install the new medoid if we found a beneficial swap
214  medoid_ids[minMedoid] = minObject;
215  cluster_ids[minObject] = minMedoid;
216  }
217 
218  if (sort_medoids) sort();
219  }
220 
221 
222  double kmedoids::xpam(const dissimilarity_matrix& distance, size_t max_k, size_t dimensionality) {
223  double best_bic = -DBL_MAX; // note that DBL_MIN isn't what you think it is.
224 
225  for (size_t k = 1; k <= max_k; k++) {
226  kmedoids subcall;
227  subcall.pam(distance, k);
228  double cur_bic = bic(subcall, matrix_distance(distance), dimensionality);
229 
230  if (xcallback) xcallback(subcall, cur_bic);
231 
232  if (cur_bic > best_bic) {
233  best_bic = cur_bic;
234  swap(subcall);
235  }
236  }
237  return best_bic;
238  }
239 
240 
241 } // namespace cluster
double average_dissimilarity() const
Get the average dissimilarity of objects w/their medoids for the last run.
Definition: kmedoids.cpp:70
void set_epsilon(double epsilon)
Set tolerance for convergence.
Definition: kmedoids.cpp:82
void set_seed(unsigned long seed)
Set random seed.
Definition: kmedoids.cpp:74
long get_time_seed()
Returns a seed for random number generators based on the product of sec and usec from gettimeofday()...
Definition: random.h:119
void init_medoids(size_t k, const dissimilarity_matrix &distance)
KR BUILD algorithm for assigning initial medoids to a partition.
Definition: kmedoids.cpp:90
std::vector< object_id > medoid_ids
Gives the index of the object that is the ith medoid.
Definition: partition.h:79
double epsilon
Whether medoids should be canonically sorted by object id.
Definition: kmedoids.h:275
Adaptor for passing a matrix by reference to template functions that take a callable distance functio...
Definition: dissimilarity.h:99
bool sort_medoids
Total dissimilarity bt/w objects and their medoid.
Definition: kmedoids.h:274
void swap(partition &other)
Fast swap with other patrition objects.
Definition: partition.cpp:68
void(* xcallback)(const partition &part, double bic)
initial sample size (before 2*k)
Definition: kmedoids.h:281
double xpam(const dissimilarity_matrix &distance, size_t max_k, size_t dimensionality)
Classic K-Medoids clustering, using the Partitioning-Around-Medoids (PAM) algorithm as described in K...
Definition: kmedoids.cpp:222
double cost(medoid_id i, object_id h, const dissimilarity_matrix &distance) const
Total cost of swapping object h with medoid i.
Definition: kmedoids.cpp:135
double total_dissimilarity(const partition &p, D dist)
Compute the total dissimilarity between all objects and their medoids.
Definition: partition.h:170
boost::numeric::ublas::symmetric_matrix< double > dissimilarity_matrix
Packed repersentation of symmetric dissimilarity matrix.
Definition: dissimilarity.h:48
void sort()
puts medoids in order by their object id, and adjusts cluster_ids accordingly.
Definition: partition.cpp:74
random_type random
Type for RNG used in this algorithm.
Definition: kmedoids.h:266
Matrix::value_type sum(const Matrix &mat, size_t row_start=0, size_t row_end=std::numeric_limits< size_t >::max(), size_t col_start=0, size_t col_end=std::numeric_limits< size_t >::max())
Definition: matrix_utils.h:195
Template function implementations of the Bayesian Information Criterion.
void set_xcallback(void(*)(const partition &part, double bic))
Set callback function for XPAM and XCLARA. default is none.
Definition: kmedoids.cpp:86
double assign_objects_to_clusters(DM distance)
Assign each object to the cluster with the closest medoid.
Definition: kmedoids.h:299
kmedoids(size_t num_objects=0)
Constructor.
Definition: kmedoids.cpp:55
void set_sort_medoids(bool sort_medoids)
Set whether medoids will be sorted by object id after clustering is complete.
Definition: kmedoids.cpp:78
Implementations of the classic clustering algorithms PAM and CLARA, from Finding Groups in Data...
Definition: kmedoids.h:60
size_t object_id
More descriptive type for object index.
Definition: partition.h:52
bool is_medoid(object_id oi) const
True if and only if object i is a medoid.
Definition: partition.h:94
size_t medoid_id
More descriptive type for medoid index.
Definition: partition.h:51
std::vector< medoid_id > sec_nearest
Definition: kmedoids.h:272
Implementations of the classic clustering algorithms PAM and CLARA, from Finding Groups in Data...
void pam(const dissimilarity_matrix &distance, size_t k, const object_id *initial_medoids=NULL)
Classic K-Medoids clustering, using the Partitioning-Around-Medoids (PAM) algorithm as described in K...
Definition: kmedoids.cpp:161
double total_dissimilarity
Index of second closest medoids. Used by PAM.
Definition: kmedoids.h:273
std::vector< medoid_id > cluster_ids
Gives cluster id (index in medoids) for the ith object.
Definition: partition.h:84
Helper functions for taking random samples and seeding RNGs from the system clock.
This represents a partitioning of a data set.
Definition: partition.h:76
double bic(const partition &p, D distance, size_t M)
Directly computes the BIC from a partition object based on the cluster centroids and the number of cl...
Definition: bic.h:77
Muster. Copyright © 2010, Lawrence Livermore National Laboratory, LLNL-CODE-433662.
Distribution of Muster and its documentation is subject to terms of the Muster LICENSE.
Generated on Thu Sep 1 2016 using Doxygen 1.8.5