37 #ifndef PAR_KMEDOIDS_H
38 #define PAR_KMEDOIDS_H
45 #include <boost/iterator/permutation_iterator.hpp>
170 template <
class T,
class D>
178 MPI_Group comm_group;
181 for (
size_t i=0; trials.
has_next(); i++) {
184 std::vector<size_t> my_ids;
185 std::vector<T> my_objects;
189 for (
int root=0; trials.
has_next() && root < size; root++) {
193 std::vector<size_t> sample_ids;
194 boost::random_number_generator<random_t> rng(
random);
198 std::vector<int> sources;
199 std::transform(sample_ids.begin(), sample_ids.end(), std::back_inserter(sources),
200 std::bind2nd(std::divides<size_t>(), objects.size()));
201 sources.erase(std::unique(sources.begin(), sources.end()), sources.end());
204 std::vector<size_t> sample_indices;
205 transform(std::lower_bound(sample_ids.begin(), sample_ids.end(), objects.size() * rank),
206 std::lower_bound(sample_ids.begin(), sample_ids.end(), objects.size() * (rank + 1)),
207 std::back_inserter(sample_indices),
208 std::bind2nd(std::minus<int>(), objects.size() * rank));
211 gather.
start(boost::make_permutation_iterator(objects.begin(), sample_indices.begin()),
212 boost::make_permutation_iterator(objects.begin(), sample_indices.end()),
213 sources.begin(), sources.end(), my_objects, root);
218 my_trial = trials.
count() - 1;
219 my_ids.swap(sample_ids);
229 int is_worker_process = (my_trial >= 0);
232 if (is_worker_process) {
238 cluster.
pam(mat, my_k);
243 for (
size_t m=0; m < cluster.
medoid_ids.size(); m++) {
244 all_medoids[my_trial].push_back(
252 std::vector<int> trial_ranks;
253 for (
int trial_id = i * size; trial_id < trials.
count(); trial_id++) {
254 trial_ranks.push_back(trial_id % size);
257 MPI_Group trials_group;
258 CMPI_Group_incl(comm_group, trial_ranks.size(), &trial_ranks[0], &trials_group);
260 MPI_Comm trials_comm;
265 std::vector<char> packed_medoids;
267 if (is_worker_process) {
269 binomial, trials_comm);
276 size_t packed_medoids_size = packed_medoids.size();
279 if (rank != 0) packed_medoids.resize(packed_medoids_size);
280 CMPI_Bcast(&packed_medoids[0], packed_medoids_size, MPI_PACKED, 0, comm);
284 std::vector< packable_vector< id_pair<T> > > unpacked_medoids;
286 for (
int trial_id = i * size; trial_id < trials.
count(); trial_id++) {
287 unpacked_medoids[trial_id % size]._packables->swap(all_medoids[trial_id]);
322 template <
class T,
class D>
323 void capek(
const std::vector<T>& objects, D dmetric,
size_t k, std::vector<T> *medoids = NULL)
334 size_t num_objects = size * objects.size();
335 k = std::min(num_objects, k);
340 std::vector<typename id_pair<T>::vector> all_medoids(
max_reps);
345 std::vector<double> all_dissimilarities(trials.
count(), 0.0);
346 std::vector< std::vector<medoid_id> > all_cluster_ids(trials.
count());
350 for (
size_t i=0; i < trials.
count(); i++) {
351 for (
size_t o=0; o < objects.size(); o++) {
353 std::pair<double, size_t> closest =
closest_medoid(objects[o], global_oid, all_medoids[i], dmetric);
355 all_dissimilarities[i] += closest.first;
356 all_cluster_ids[i].push_back(closest.second);
363 std::vector<double> sums(trials.
count());
370 std::vector<double>::iterator min_sum = std::min_element(sums.begin(), sums.end());
372 size_t best = (min_sum - sums.begin());
377 for (
size_t i = 0; i <
medoid_ids.size(); i++) {
382 std::vector<size_t> mapping(
medoid_ids.size());
383 std::generate(mapping.begin(), mapping.end(),
sequence());
388 for (
size_t i=0; i <
medoid_ids.size(); i++) {
389 medoid_ids[i] = all_medoids[best][mapping[i]].id;
398 for (
size_t i=0; i <
medoid_ids.size(); i++) {
399 (*medoids)[i] = all_medoids[best][mapping[i]].element;
436 template <
class T,
class D>
437 double xcapek(
const std::vector<T>& objects, D dmetric,
size_t max_k,
size_t dimensionality,
438 std::vector<T> *medoids = NULL)
449 size_t num_objects = size * objects.size();
450 max_k = std::min(num_objects, max_k);
453 std::vector<typename id_pair<T>::vector> all_medoids(max_k *
max_reps);
458 std::vector<double> all_dissimilarities(trials.
count(), 0.0);
459 std::vector< std::vector<medoid_id> > all_cluster_ids(trials.
count());
461 std::vector<double> all_dissim2;
462 std::vector<size_t> cluster_sizes;
466 for (
size_t i=0; i < trials.
count(); i++) {
467 const size_t num_medoids = all_medoids[i].size();
468 for (
size_t m=0; m < num_medoids; m++) {
469 all_dissim2.push_back(0.0);
470 cluster_sizes.push_back(0);
472 double *dissim2 = &all_dissim2[all_dissim2.size() - num_medoids];
473 size_t *sizes = &cluster_sizes[cluster_sizes.size() - num_medoids];
475 for (
size_t o=0; o < objects.size(); o++) {
477 std::pair<double, size_t> closest =
closest_medoid(objects[o], global_oid, all_medoids[i], dmetric);
479 all_dissimilarities[i] += closest.first;
480 dissim2[closest.second] += closest.first * closest.first;
481 sizes[closest.second] += 1;
482 all_cluster_ids[i].push_back(closest.second);
490 std::vector<double> sums(trials.
count());
491 std::vector<double> sums2(all_dissim2.size());
492 std::vector<size_t> sizes(cluster_sizes.size());
495 CMPI_Reduce(&all_dissim2[0], &sums2[0], sums2.size(), MPI_DOUBLE, MPI_SUM, 0,
comm);
502 std::vector<double>::iterator min_sum = std::min_element(sums.begin(), sums.end());
504 size_t best = (min_sum - sums.begin());
507 size_t best_trial = 0;
510 size_t trial_offset = 0;
511 for (
size_t i=0; i < trials.
count(); i++) {
512 size_t k = all_medoids[i].size();
513 double cur_bic =
bic(k, &sizes[trial_offset], &sums2[trial_offset], dimensionality);
525 for (
size_t i = 0; i <
medoid_ids.size(); i++) {
530 std::vector<size_t> mapping(
medoid_ids.size());
531 std::generate(mapping.begin(), mapping.end(),
sequence());
536 for (
size_t i=0; i <
medoid_ids.size(); i++) {
537 medoid_ids[i] = all_medoids[best][mapping[i]].id;
546 for (
size_t i=0; i <
medoid_ids.size(); i++) {
547 (*medoids)[i] = all_medoids[best][mapping[i]].element;
586 template <
typename T,
typename D>
590 double min_distance = DBL_MAX;
591 size_t min_id = medoids.size();
592 for (
size_t m=0; m < medoids.size(); m++) {
593 double d = dmetric(medoids[m].element,
object);
594 if (d < min_distance || medoids[m].
id == oid) {
599 return std::make_pair(min_distance, min_id);
606 #endif // PAR_KMEDOIDS_H
void start(ObjIterator begin_obj, ObjIterator end_obj, RankIterator begin_src, RankIterator end_src, std::vector< T > &dest, int root)
Starts initial send and receive requests for this gather.
void set_epsilon(double epsilon)
Set tolerance for convergence.
size_t init_size
baseline size for samples
Data structure representing a trial run of a partitioned clustering algorithm.
Generator object for a strided sequence of ints.
void record(const std::string &name)
Records time since start or last call to record.
std::vector< id_pair< T > > vector
Template typedef for declaring vectors of id_pair<T>
void unpack_binomial(const std::vector< char > &src, std::vector< T > &dest, const binomial_embedding binomial, MPI_Comm comm)
Unpacks a packed vector in binomial order into objects in rank order in the destination vector...
Timer timer
Performance timer.
void build_dissimilarity_matrix(const std::vector< T > &objects, D dissimilarity, dissimilarity_matrix &mat)
Computes a dissimilarity matrix from a vector of objects.
std::vector< object_id > medoid_ids
Gives the index of the object that is the ith medoid.
bool has_next() const
whether there are trials remaining.
size_t max_reps
Max repetitions of trials for a particular k.
double average_dissimilarity()
Get the average dissimilarity of objects w/their medoids for the last run.
Asynchronous, some-to-some gather operation used by parallel clustering algorithms to simultaneously ...
Asynchronous, some-to-some gather operation used by parallel clustering algorithms to simultaneously ...
void seed_random_uniform(MPI_Comm comm)
Seeds random number generators across all processes with the same number, taken from the time in micr...
MPI_Comm comm
Communicator, the processes of which this partition divides.
std::pair< double, size_t > closest_medoid(const T &object, object_id oid, const std::vector< id_pair< T > > &medoids, D dmetric)
Find the closest object in the medoids vector to the object passed in.
void gather_packed(const T &src, std::vector< char > &dest, const binomial_embedding binomial, MPI_Comm comm)
Packs and gathers a buffer full of packed representation of src's.
void set_seed(uint32_t seed)
Set the random seed.
id_pair< T > make_id_pair(const T &elt, int id)
Helper function for making arbitrary id_pairs with type inference.
par_kmedoids(MPI_Comm comm=MPI_COMM_WORLD)
Constructs a parallel kmedoids object and seeds its random number generator.
boost::numeric::ublas::symmetric_matrix< double > dissimilarity_matrix
Packed repersentation of symmetric dissimilarity matrix.
size_t count() const
return iterations so far.
trial next()
return parameters for next trial
Template function implementations of the Bayesian Information Criterion.
par_partition represents a partitioning of a distributed data set.
double best_bic_score
BIC score for the clustering found.
Implementations of the classic clustering algorithms PAM and CLARA, from Finding Groups in Data...
size_t get_max_reps()
Max number of times to run PAM with each sampled dataset.
size_t object_id
More descriptive type for object index.
boost::mt19937 random_t
Type for random number generator used here.
Distributed representation of a partitioning of a data set.
void set_max_reps(size_t reps)
Sets max_reps, Max number of times to run PAM with each sampled dataset.
This class implements the CAPEK and XCAPEK scalable parallel clustering algorithms.
Implementations of the classic clustering algorithms PAM and CLARA, from Finding Groups in Data...
double bic_score()
BIC score for selected clustering.
void pam(const dissimilarity_matrix &distance, size_t k, const object_id *initial_medoids=NULL)
Classic K-Medoids clustering, using the Partitioning-Around-Medoids (PAM) algorithm as described in K...
std::vector< object_id > cluster_ids
Global cluster ids for local objects.
random_t random
Random number distribution to be used for samples.
indexed_lt_functor< Indexable > indexed_lt(const Indexable &container)
void algorithm_r(size_t numElements, size_t sample_size, OutputIterator out, Random &random)
This is Knuth's algorithm R for taking a sample of numElements numbers.
packable_vector< T > make_packable_vector(std::vector< T > *vec, bool owned=true)
Tempate function adaptor so we can have type inference when making packable vectors.
void run_pam_trials(trial_generator &trials, const std::vector< T > &objects, D dmetric, std::vector< typename id_pair< T >::vector > &all_medoids, MPI_Comm comm)
Farms out trials of PAM to worker processes then collects medoids from all trials to all processors...
MPI-packable struct for an MPI-packable type plus its object id.
std::vector< object_id > medoid_ids
Gives the object id for the ith medoid. This object may not be local.
MPI-packable, templated struct for shipping around an MPI-packable object plus the id of the process ...
double epsilon
Tolerance for convergence tests in kmedoids PAM runs.
void set_epsilon(double epsilon)
Set tolerance for convergence.
size_t get_init_size()
Baseline size for samples, added to 2*k.
const Timer & get_timer()
Get the Timer with info on the last run of either capek() or xcapek().
void invert(std::vector< Index > &vec)
#defines for switching between MPI and PMPI bindings.
void capek(const std::vector< T > &objects, D dmetric, size_t k, std::vector< T > *medoids=NULL)
This is the Clustering Algorithm with Parallel Extensions to K-Medoids (CAPEK).
const size_t num_objects
number of elements in the data set; determines maximum sample size.
void gather(partition &local, int root=0)
Collective operation.
Class to generate a set of trials for clustering.
This struct represents parameters for a single trial run of kmedoids.
double total_dissimilarity
Track whether the random seed has been set.
double bic(const partition &p, D distance, size_t M)
Directly computes the BIC from a partition object based on the cluster centroids and the number of cl...
void set_init_size(size_t size)
Sets init_size, baseline size for samples, added to 2*k.
double xcapek(const std::vector< T > &objects, D dmetric, size_t max_k, size_t dimensionality, std::vector< T > *medoids=NULL)
K-agnostic version of capek().