mock_categorical_data.hpp
Go to the documentation of this file.
1 
11 #ifndef MLPACK_TESTS_MOCK_CATEGORICAL_DATA_HPP
12 #define MLPACK_TESTS_MOCK_CATEGORICAL_DATA_HPP
13 
#include <mlpack/prereqs.hpp>
#include <mlpack/core/dists/discrete_distribution.hpp>
16 
20 inline void MockCategoricalData(arma::mat& d,
21  arma::Row<size_t>& l,
22  mlpack::data::DatasetInfo& datasetInfo)
23 {
24  // We'll build a spiral dataset plus two noisy categorical features. We need
25  // to build the distributions for the categorical features (they'll be
26  // discrete distributions).
28  // The distribution will be automatically normalized.
29  for (size_t i = 0; i < 5; ++i)
30  {
31  std::vector<arma::vec> probs;
32  probs.push_back(arma::vec(4, arma::fill::randu));
34  }
35 
37  for (size_t i = 0; i < 5; ++i)
38  {
39  std::vector<arma::vec> probs;
40  probs.push_back(arma::vec(2, arma::fill::randu));
42  }
43 
44  arma::mat spiralDataset(4, 4000);
45  arma::Row<size_t> labels(4000);
46  for (size_t i = 0; i < 4000; ++i)
47  {
48  // One circle every 2000 samples. Plus some noise.
49  const double magnitude = 2.0 + (double(i) / 200.0) +
50  0.5 * mlpack::math::Random();
51  const double angle = (i % 200) * (2 * M_PI) + mlpack::math::Random();
52 
53  const double x = magnitude * cos(angle);
54  const double y = magnitude * sin(angle);
55 
56  spiralDataset(0, i) = x;
57  spiralDataset(1, i) = y;
58 
59  // Set categorical features c1 and c2.
60  if (i < 800)
61  {
62  spiralDataset(2, i) = c1[1].Random()[0];
63  spiralDataset(3, i) = c2[1].Random()[0];
64  labels[i] = 1;
65  }
66  else if (i < 1600)
67  {
68  spiralDataset(2, i) = c1[3].Random()[0];
69  spiralDataset(3, i) = c2[3].Random()[0];
70  labels[i] = 3;
71  }
72  else if (i < 2400)
73  {
74  spiralDataset(2, i) = c1[2].Random()[0];
75  spiralDataset(3, i) = c2[2].Random()[0];
76  labels[i] = 2;
77  }
78  else if (i < 3200)
79  {
80  spiralDataset(2, i) = c1[0].Random()[0];
81  spiralDataset(3, i) = c2[0].Random()[0];
82  labels[i] = 0;
83  }
84  else
85  {
86  spiralDataset(2, i) = c1[4].Random()[0];
87  spiralDataset(3, i) = c2[4].Random()[0];
88  labels[i] = 4;
89  }
90  }
91 
92  // Now create the dataset info.
93  datasetInfo = mlpack::data::DatasetInfo(4);
96  // Set mappings.
97  datasetInfo.MapString<double>("0", 2);
98  datasetInfo.MapString<double>("1", 2);
99  datasetInfo.MapString<double>("2", 2);
100  datasetInfo.MapString<double>("3", 2);
101  datasetInfo.MapString<double>("0", 3);
102  datasetInfo.MapString<double>("1", 3);
103 
104  // Now shuffle the dataset.
105  arma::uvec indices = arma::shuffle(arma::linspace<arma::uvec>(0, 3999,
106  4000));
107  d = arma::mat(4, 4000);
108  l = arma::Row<size_t>(4000);
109  for (size_t i = 0; i < 4000; ++i)
110  {
111  d.col(i) = spiralDataset.col(indices[i]);
112  l[i] = labels[indices[i]];
113  }
114 }
115 
119 inline void MockCategoricalData(arma::mat& d,
120  arma::Row<double>& l,
121  mlpack::data::DatasetInfo& datasetInfo)
122 {
123  // Dataset of size 4000.
124  d.set_size(5, 4000);
125  l.set_size(4000);
126 
127  for (size_t i = 0; i < 4000; ++i)
128  {
129  // Random numeric features.
130  d(0, i) = mlpack::math::Random();
131  d(1, i) = mlpack::math::Random(-1, 1);
132  d(2, i) = mlpack::math::Random();
133 
134  // Binary feature.
135  d(3, i) = mlpack::math::RandInt(0, 2);
136  // 5-category categorical feature.
137  d(4, i) = mlpack::math::RandInt(0, 5);
138 
139  // Mappings from categorical features to regression value.
140  std::map<int, double> f;
141  f[0] = 5.0;
142  f[1] = -5.0;
143 
144  std::map<int, double> g;
145  g[0] = 2.0;
146  g[1] = 7.0;
147  g[2] = -3.0;
148  g[3] = 0.0;
149  g[4] = 4.0;
150 
151  // Random noise in range [-0.5, 0.5).
152  const double noise = mlpack::math::Random() - 0.5;
153 
154  // y = x1 + x2 + 3 * x3 + f(x4) + g(x5) + noise
155  l[i] = d(0, i) + d(1, i) + 3 * d(2, i) + f[(int) d(3, i)] +
156  g[(int) d(4, i)] + noise;
157  }
158 
159  // Now create the dataset info.
160  datasetInfo = mlpack::data::DatasetInfo(5);
161  datasetInfo.Type(3) = mlpack::data::Datatype::categorical;
162  datasetInfo.Type(4) = mlpack::data::Datatype::categorical;
163  // Set mappings.
164  datasetInfo.MapString<double>("0", 3);
165  datasetInfo.MapString<double>("1", 3);
166 
167  datasetInfo.MapString<double>("0", 4);
168  datasetInfo.MapString<double>("1", 4);
169  datasetInfo.MapString<double>("2", 4);
170  datasetInfo.MapString<double>("3", 4);
171  datasetInfo.MapString<double>("4", 4);
172 }
173 
174 #endif
T MapString(const InputType &input, const size_t dimension)
Given the input and the dimension to which it belongs, return its numeric mapping.
Auxiliary information for a dataset, including mappings to/from strings (or other types) and the datatype of each dimension.
A discrete distribution where the only observations are discrete observations.
The core includes that mlpack expects; standard C++ includes and Armadillo.
#define M_PI
Definition: prereqs.hpp:39
arma::vec Random() const
Return a randomly generated observation (one-dimensional vector; one observation) according to the probability distribution defined by this object.
Datatype Type(const size_t dimension) const
Return the type of a given dimension (numeric or categorical).
void MockCategoricalData(arma::mat &d, arma::Row< size_t > &l, mlpack::data::DatasetInfo &datasetInfo)
Create a mock categorical dataset for testing classification.
DatasetMapper< data::IncrementPolicy > DatasetInfo
double Random()
Generates a uniform random number between 0 and 1.
Definition: random.hpp:83
int RandInt(const int hiExclusive)
Generates a uniform random integer.
Definition: random.hpp:110