37 using namespace cluster;
94 double min_dissim = DBL_MAX;
95 for (
size_t i=0; i < distance.size1(); i++) {
97 for (
size_t j=0; j < distance.size2(); j++) {
98 total += distance(i,j);
100 if (total < min_dissim) {
111 for (
size_t cur_k = 1; cur_k < k; cur_k++) {
113 double max_gain = 0.0;
114 for (
size_t i=0; i < distance.size1(); i++) {
118 for (
size_t j=0; j < distance.size1(); j++) {
120 gain += max(Dj - distance(i,j), 0.0);
123 if (gain >= max_gain) {
139 double dhj = distance(h, j);
142 double dj1 = distance(mj1, j);
145 if (distance(mi, j) == dj1) {
146 double dj2 = DBL_MAX;
149 dj2 = distance(mj2, j);
151 total += min(dj2, dhj) - dj1;
153 }
else if (dhj < dj1) {
162 if (k > distance.size1()) {
163 throw std::logic_error(
"Attempt to run PAM with more clusters than data.");
166 if (distance.size1() != distance.size2()) {
167 throw std::logic_error(
"Error: distance matrix is not square!");
174 if (initial_medoids) {
176 copy(initial_medoids, initial_medoids + k, back_inserter(
medoid_ids));
183 double tolerance =
epsilon *
sum(distance) / (distance.size1() * distance.size2());
190 double minTotalCost = DBL_MAX;
201 double curCost =
cost(i, h, distance);
202 if (curCost < minTotalCost) {
203 minTotalCost = curCost;
211 if (minTotalCost >= -tolerance)
break;
223 double best_bic = -DBL_MAX;
225 for (
size_t k = 1; k <= max_k; k++) {
227 subcall.
pam(distance, k);
232 if (cur_bic > best_bic) {
double average_dissimilarity() const
Get the average dissimilarity of objects w/their medoids for the last run.
void set_epsilon(double epsilon)
Set tolerance for convergence.
void set_seed(unsigned long seed)
Set random seed.
long get_time_seed()
Returns a seed for random number generators based on the product of sec and usec from gettimeofday()...
void init_medoids(size_t k, const dissimilarity_matrix &distance)
KR BUILD algorithm for assigning initial medoids to a partition.
std::vector< object_id > medoid_ids
Gives the index of the object that is the ith medoid.
double epsilon
Whether medoids should be canonically sorted by object id.
Adaptor for passing a matrix by reference to template functions that take a callable distance functio...
bool sort_medoids
Total dissimilarity bt/w objects and their medoid.
void swap(partition &other)
Fast swap with other patrition objects.
void(* xcallback)(const partition &part, double bic)
initial sample size (before 2*k)
double xpam(const dissimilarity_matrix &distance, size_t max_k, size_t dimensionality)
Classic K-Medoids clustering, using the Partitioning-Around-Medoids (PAM) algorithm as described in K...
double cost(medoid_id i, object_id h, const dissimilarity_matrix &distance) const
Total cost of swapping object h with medoid i.
double total_dissimilarity(const partition &p, D dist)
Compute the total dissimilarity between all objects and their medoids.
boost::numeric::ublas::symmetric_matrix< double > dissimilarity_matrix
Packed repersentation of symmetric dissimilarity matrix.
void sort()
puts medoids in order by their object id, and adjusts cluster_ids accordingly.
random_type random
Type for RNG used in this algorithm.
Matrix::value_type sum(const Matrix &mat, size_t row_start=0, size_t row_end=std::numeric_limits< size_t >::max(), size_t col_start=0, size_t col_end=std::numeric_limits< size_t >::max())
Template function implementations of the Bayesian Information Criterion.
void set_xcallback(void(*)(const partition &part, double bic))
Set callback function for XPAM and XCLARA. default is none.
double assign_objects_to_clusters(DM distance)
Assign each object to the cluster with the closest medoid.
kmedoids(size_t num_objects=0)
Constructor.
void set_sort_medoids(bool sort_medoids)
Set whether medoids will be sorted by object id after clustering is complete.
Implementations of the classic clustering algorithms PAM and CLARA, from Finding Groups in Data...
size_t object_id
More descriptive type for object index.
bool is_medoid(object_id oi) const
True if and only if object i is a medoid.
size_t medoid_id
More descriptive type for medoid index.
std::vector< medoid_id > sec_nearest
Implementations of the classic clustering algorithms PAM and CLARA, from Finding Groups in Data...
void pam(const dissimilarity_matrix &distance, size_t k, const object_id *initial_medoids=NULL)
Classic K-Medoids clustering, using the Partitioning-Around-Medoids (PAM) algorithm as described in K...
double total_dissimilarity
Index of second closest medoids. Used by PAM.
std::vector< medoid_id > cluster_ids
Gives cluster id (index in medoids) for the ith object.
Helper functions for taking random samples and seeding RNGs from the system clock.
This represents a partitioning of a data set.
double bic(const partition &p, D distance, size_t M)
Directly computes the BIC from a partition object based on the cluster centroids and the number of cl...