47 #include <boost/random.hpp>
132 template <
class T,
class D>
133 void clara(
const std::vector<T>& objects, D dmetric,
size_t k) {
137 if (objects.size() <= sample_size) {
153 for (
size_t i = 0; i <
max_reps; i++) {
155 std::vector<size_t> sample_to_full;
156 algorithm_r(objects.size(), sample_size, back_inserter(sample_to_full),
rng);
165 subcall.
pam(distance, k);
168 for (
size_t i=0; i <
medoid_ids.size(); i++) {
178 swap(best_partition);
183 if (sort_medoids)
sort();
197 template <
class T,
class D>
201 std::vector<size_t> counts(
medoid_ids.size());
204 means[m] = counts[m] ? (means[m] + objects[i]) : objects[i];
209 std::vector<double> shortest(means.size());
210 for (
size_t m=0; m < means.size(); m++) {
211 means[m] = means[m] / counts[m];
212 shortest[m] = distance(means[m], objects[
medoid_ids[m]]);
218 double d = distance(objects[i], means[m]);
219 if (d < shortest[m]) {
237 template <
class T,
class D>
238 double xclara(
const std::vector<T>& objects, D dmetric,
size_t max_k,
size_t dimensionality) {
239 double best_bic = -DBL_MAX;
241 for (
size_t k = 1; k <= max_k; k++) {
243 subcall.
clara(objects, dmetric, k);
245 double cur_bic =
bic(subcall,
lazy_distance(objects, dmetric), dimensionality);
248 if (cur_bic > best_bic) {
269 typedef boost::random_number_generator<random_type, unsigned long>
rng_type;
324 total_dissimilarity += d1;
double average_dissimilarity() const
Get the average dissimilarity of objects w/their medoids for the last run.
void set_epsilon(double epsilon)
Set tolerance for convergence.
void set_seed(unsigned long seed)
Set random seed.
void init_medoids(size_t k, const dissimilarity_matrix &distance)
KR BUILD algorithm for assigning initial medoids to a partition.
void build_dissimilarity_matrix(const std::vector< T > &objects, D dissimilarity, dissimilarity_matrix &mat)
Computes a dissimilarity matrix from a vector of objects.
std::vector< object_id > medoid_ids
Gives the index of the object that is the ith medoid.
double epsilon
Whether medoids should be canonically sorted by object id.
boost::random_number_generator< random_type, unsigned long > rng_type
Randomness source for this algorithm.
Class to represent a partitioning of a data set.
bool sort_medoids
Total dissimilarity bt/w objects and their medoid.
lazy_distance_functor< T, D > lazy_distance(const std::vector< T > &objs, D dist)
Type-inferred syntactic sugar for constructing lazy_distance_functor.
void swap(partition &other)
Fast swap with other patrition objects.
void(* xcallback)(const partition &part, double bic)
initial sample size (before 2*k)
double xpam(const dissimilarity_matrix &distance, size_t max_k, size_t dimensionality)
Classic K-Medoids clustering, using the Partitioning-Around-Medoids (PAM) algorithm as described in K...
double cost(medoid_id i, object_id h, const dissimilarity_matrix &distance) const
Total cost of swapping object h with medoid i.
boost::numeric::ublas::symmetric_matrix< double > dissimilarity_matrix
Packed repersentation of symmetric dissimilarity matrix.
void sort()
puts medoids in order by their object id, and adjusts cluster_ids accordingly.
random_type random
Type for RNG used in this algorithm.
Template function implementations of the Bayesian Information Criterion.
void set_xcallback(void(*)(const partition &part, double bic))
Set callback function for XPAM and XCLARA. default is none.
void set_max_reps(size_t r)
double assign_objects_to_clusters(DM distance)
Assign each object to the cluster with the closest medoid.
size_t max_reps
initial sample size (before 2*k)
kmedoids(size_t num_objects=0)
Constructor.
void set_sort_medoids(bool sort_medoids)
Set whether medoids will be sorted by object id after clustering is complete.
Implementations of the classic clustering algorithms PAM and CLARA, from Finding Groups in Data...
size_t object_id
More descriptive type for object index.
void clara(const std::vector< T > &objects, D dmetric, size_t k)
CLARA clustering algorithm, as per Kaufman and Rousseuw and R.
size_t medoid_id
More descriptive type for medoid index.
void center_medoids(const std::vector< T > &objects, D distance)
Takes existing clustering and reassigns medoids by taking closest medoid to mean of each cluster...
std::vector< medoid_id > sec_nearest
size_t init_size
Normalized sensitivity for convergence.
void pam(const dissimilarity_matrix &distance, size_t k, const object_id *initial_medoids=NULL)
Classic K-Medoids clustering, using the Partitioning-Around-Medoids (PAM) algorithm as described in K...
double xclara(const std::vector< T > &objects, D dmetric, size_t max_k, size_t dimensionality)
K-Agnostic version of CLARA.
double total_dissimilarity
Index of second closest medoids. Used by PAM.
Data types and functions for dealing with dissimilarity matrices.
void algorithm_r(size_t numElements, size_t sample_size, OutputIterator out, Random &random)
This is Knuth's algorithm R for taking a sample of numElements numbers.
std::vector< medoid_id > cluster_ids
Gives cluster id (index in medoids) for the ith object.
Helper functions for taking random samples and seeding RNGs from the system clock.
boost::mt19937 random_type
This represents a partitioning of a data set.
void set_init_size(size_t sz)
double bic(const partition &p, D distance, size_t M)
Directly computes the BIC from a partition object based on the cluster centroids and the number of cl...