kmeans_plus_plus_initialization.hpp
Go to the documentation of this file.
1 
7 #ifndef MLPACK_METHODS_KMEANS_KMEANS_PLUS_PLUS_INITIALIZATION_HPP
8 #define MLPACK_METHODS_KMEANS_KMEANS_PLUS_PLUS_INITIALIZATION_HPP
9 
10 #include <mlpack/prereqs.hpp>
11 
33 {
34  public:
37 
46  template<typename MatType>
47  inline static void Cluster(const MatType& data,
48  const size_t clusters,
49  arma::mat& centroids)
50  {
51  centroids.set_size(data.n_rows, clusters);
52 
53  // We'll sample our first point fully randomly.
54  size_t firstPoint = mlpack::math::RandInt(0, data.n_cols);
55  centroids.col(0) = data.col(firstPoint);
56 
57  // Utility variable.
58  arma::vec distribution(data.n_cols);
59 
60  // Now, sample other points...
61  for (size_t i = 1; i < clusters; ++i)
62  {
63  // We must compute the CDF for sampling... this depends on the computation
64  // of the minimum distance between each point and its closest
65  // already-chosen centroid.
66  //
67  // This computation is ripe for speedup with trees! I am not sure exactly
68  // how much we would need to approximate, but I think it could be done
69  // without breaking the O(log k)-competitive guarantee (I think).
70  for (size_t p = 0; p < data.n_cols; ++p)
71  {
72  double minDistance = std::numeric_limits<double>::max();
73  for (size_t j = 0; j < i; ++j)
74  {
75  const double distance =
77  centroids.col(j));
78  minDistance = std::min(distance, minDistance);
79  }
80 
81  distribution[p] = minDistance;
82  }
83 
84  // Next normalize the distribution (actually technically we could avoid
85  // this).
86  distribution /= arma::accu(distribution);
87 
88  // Turn it into a CDF for convenience...
89  for (size_t j = 1; j < distribution.n_elem; ++j)
90  distribution[j] += distribution[j - 1];
91 
92  // Sample a point...
93  const double sampleValue = mlpack::math::Random();
94  const double* elem = std::lower_bound(distribution.begin(),
95  distribution.end(), sampleValue);
96  const size_t position = (size_t)
97  (elem - distribution.begin()) / sizeof(double);
98  centroids.col(i) = data.col(position);
99  }
100  }
101 };
102 
103 #endif
This class implements the k-means++ initialization, as described in the following paper: ...
The core includes that mlpack expects; standard C++ includes and Armadillo.
KMeansPlusPlusInitialization()
Empty constructor, required by the InitialPartitionPolicy type definition.
static VecTypeA::elem_type Evaluate(const VecTypeA &a, const VecTypeB &b)
Computes the distance between two points.
double Random()
Generates a uniform random number between 0 and 1.
Definition: random.hpp:83
int RandInt(const int hiExclusive)
Generates a uniform random integer.
Definition: random.hpp:110
static void Cluster(const MatType &data, const size_t clusters, arma::mat &centroids)
Initialize the centroids matrix by randomly sampling points from the data matrix. ...