dataset_mapper.hpp
Go to the documentation of this file.
1 
15 #ifndef MLPACK_CORE_DATA_DATASET_INFO_HPP
16 #define MLPACK_CORE_DATA_DATASET_INFO_HPP
17 
18 #include <mlpack/prereqs.hpp>
19 #include <unordered_map>
20 
22 
23 namespace mlpack {
24 namespace data {
25 
// Class template that stores, per dimension, a Datatype (`types`) and a pair of
// forward/reverse mappings (`maps`) between inputs of type InputType
// (std::string by default) and PolicyType::MappedType values, together with a
// PolicyType object (`policy`).
// NOTE(review): the class-head line (listing line 41, presumably
// `class DatasetMapper`) is missing from this listing -- confirm against the
// original header.
40 template<typename PolicyType, typename InputType = std::string>
42 {
43  public:
  // Create the DatasetMapper object with the given dimensionality (0 if
  // omitted).
49  explicit DatasetMapper(const size_t dimensionality = 0);
50 
  // Overload that additionally takes a policy object by non-const reference.
  // NOTE(review): presumably used to initialise the `policy` member -- confirm
  // in dataset_mapper_impl.hpp.
56  explicit DatasetMapper(PolicyType& policy, const size_t dimensionality = 0);
57 
  // Set the dimensionality of an existing DatasetMapper object.
64  void SetDimensionality(const size_t dimensionality);
65 
  // Preprocessing hook for a first pass over the data: hands `input` (which
  // belongs to `dimension`) to the mapping policy when the policy needs it.
  // T cannot be deduced from the arguments, so callers must specify it.
73  template<typename T>
74  void MapFirstPass(const InputType& input, const size_t dimension);
75 
  // Given the input and the dimension to which it belongs, return its numeric
  // mapping as a T.  T cannot be deduced from the arguments.
86  template<typename T>
87  T MapString(const InputType& input,
88  const size_t dimension);
89 
  // Return the input that corresponds to `value` in `dimension`.
  // Several inputs may map to one value (see ReverseMapType below);
  // `unmappingIndex` presumably selects which of them is returned, with
  // NumUnmappings() giving the valid range -- TODO confirm.
108  template<typename T>
109  const InputType& UnmapString(const T value,
110  const size_t dimension,
111  const size_t unmappingIndex = 0) const;
112 
  // Get the number of possible unmappings for `value` in `dimension`.
116  template<typename T>
117  size_t NumUnmappings(const T value, const size_t dimension) const;
118 
  // Return the mapped value that corresponds to `input` in `dimension`.
  // Note that, unlike the other lookup members, this one is not const.
128  typename PolicyType::MappedType UnmapValue(const InputType& input,
129  const size_t dimension);
130 
  // Return the type of a given dimension (numeric or categorical).
132  Datatype Type(const size_t dimension) const;
  // Non-const overload returning a reference to the Datatype of `dimension`,
  // so the caller can modify it.
134  Datatype& Type(const size_t dimension);
135 
  // Get the number of mappings for a particular dimension.
140  size_t NumMappings(const size_t dimension) const;
141 
  // Get the dimensionality of the DatasetMapper object (that is, how many
  // dimensions it has information for).
148  size_t Dimensionality() const;
149 
  // Serialize the dataset information with a cereal-style archive.
  // Only `types` and `maps` are archived; the `policy` member is not, and the
  // version argument is unused.
153  template<typename Archive>
154  void serialize(Archive& ar, const uint32_t /* version */)
155  {
156  ar(CEREAL_NVP(types));
157  ar(CEREAL_NVP(maps));
158  }
159 
  // Return the policy of the mapper (read-only).
161  const PolicyType& Policy() const;
162 
  // Return a modifiable reference to the policy of the mapper.
164  PolicyType& Policy();
  // Overload taking a policy by rvalue reference.
  // NOTE(review): presumably replaces the stored policy by moving from the
  // argument -- confirm in dataset_mapper_impl.hpp.
166  void Policy(PolicyType&& policy);
167 
168  private:
  // Datatype of each dimension (presumably indexed by dimension, matching
  // Type(dimension) -- TODO confirm).
170  std::vector<Datatype> types;
171 
172  // Forward mapping type: input -> mapped value.
173  using ForwardMapType = typename std::unordered_map<InputType, typename
174  PolicyType::MappedType>;
175 
176  // Reverse mapping type. Multiple inputs may map to a single output, hence
177  // the need for std::vector.
178  using ReverseMapType = std::unordered_map<typename PolicyType::MappedType,
179  std::vector<InputType>>;
180 
181  // Mappings from inputs (strings by default) to mapped values and back.
182  // Map entries will only exist for dimensions that are categorical.
183  // MapType = map<dimension, pair<ForwardMapType, ReverseMapType>>
184  using MapType = std::unordered_map<size_t, std::pair<ForwardMapType,
185  ReverseMapType>>;
186 
  // Per-dimension forward and reverse mappings; see MapType above.
188  MapType maps;
189 
191  // Mapping policy object.
  // NOTE(review): the first line of the original comment is missing from this
  // listing; the surviving text said the policy is "mapped to the maps object"
  // and is used in MapString() and MapTokens().  MapTokens() is not declared in
  // this class -- confirm whether that reference is stale.
192  PolicyType policy;
193 };
194 
195 // Use typedef to provide backward compatibility
197 
198 } // namespace data
199 } // namespace mlpack
200 
201 #include "dataset_mapper_impl.hpp"
202 
203 #endif
T MapString(const InputType &input, const size_t dimension)
Given the input and the dimension to which it belongs, return its numeric mapping.
Auxiliary information for a dataset, including mappings to/from strings (or other types) and the datatype of each dimension.
DatasetMapper(const size_t dimensionality=0)
Create the DatasetMapper object with the given dimensionality.
Linear algebra utility functions, generally performed on matrices or vectors.
The core includes that mlpack expects; standard C++ includes and Armadillo.
const InputType & UnmapString(const T value, const size_t dimension, const size_t unmappingIndex=0) const
Return the input that corresponds to a given value in a given dimension.
PolicyType::MappedType UnmapValue(const InputType &input, const size_t dimension)
Return the value that corresponds to a given input in a given dimension.
Datatype Type(const size_t dimension) const
Return the type of a given dimension (numeric or categorical).
size_t NumMappings(const size_t dimension) const
Get the number of mappings for a particular dimension.
void MapFirstPass(const InputType &input, const size_t dimension)
Preprocessing: during a first pass of the data, pass the input on to the MapPolicy if it is needed.
size_t NumUnmappings(const T value, const size_t dimension) const
Get the number of possible unmappings for a string in a given dimension.
void SetDimensionality(const size_t dimensionality)
Set the dimensionality of an existing DatasetMapper object.
void serialize(Archive &ar, const uint32_t)
Serialize the dataset information.
Datatype
The Datatype enum specifies the types of data mlpack algorithms can use.
Definition: datatype.hpp:24
size_t Dimensionality() const
Get the dimensionality of the DatasetMapper object (that is, how many dimensions it has information for).
const PolicyType & Policy() const
Return the policy of the mapper.