string_encoding_dictionary.hpp
Go to the documentation of this file.
1 
13 #ifndef MLPACK_CORE_DATA_STRING_ENCODING_DICTIONARY_HPP
14 #define MLPACK_CORE_DATA_STRING_ENCODING_DICTIONARY_HPP
15 
16 #include <mlpack/prereqs.hpp>
17 #include <mlpack/core/boost_backport/boost_backport_string_view.hpp>
18 #include <unordered_map>
19 #include <deque>
20 #include <array>
21 
22 namespace mlpack {
23 namespace data {
24 
31 template<typename Token>
33 {
34  public:
36  using MapType = std::unordered_map<Token, size_t>;
37 
39  using TokenType = Token;
40 
46  bool HasToken(const Token& token) const
47  {
48  return mapping.find(token) != mapping.end();
49  }
50 
58  template<typename T>
59  size_t AddToken(T&& token)
60  {
61  size_t size = mapping.size();
62 
63  mapping[std::forward<T>(token)] = ++size;
64 
65  return size;
66  }
67 
74  size_t Value(const Token& token) const
75  {
76  return mapping.at(token);
77  }
78 
80  size_t Size() const { return mapping.size(); }
81 
83  void Clear()
84  {
85  mapping.clear();
86  }
87 
89  const MapType& Mapping() const { return mapping; }
91  MapType& Mapping() { return mapping; }
92 
96  template<typename Archive>
97  void serialize(Archive& ar, const uint32_t /* version */)
98  {
99  ar(CEREAL_NVP(mapping));
100  }
101 
102  private:
104  MapType mapping;
105 };
106 
107 /*
108  * Specialization of the StringEncodingDictionary class for boost::string_view.
109  */
110 template<>
111 class StringEncodingDictionary<boost::string_view>
112 {
113  public:
115  using MapType = std::unordered_map<
116  boost::string_view,
117  size_t,
118  boost::hash<boost::string_view>>;
119 
121  using TokenType = boost::string_view;
122 
124  StringEncodingDictionary() = default;
125 
128  tokens(other.tokens)
129  {
130  for (const std::string& token : tokens)
131  mapping[token] = other.mapping.at(token);
132  }
133 
136 
139  {
140  tokens = other.tokens;
141  mapping.clear();
142 
143  for (const std::string& token : tokens)
144  mapping[token] = other.mapping.at(token);
145 
146  return *this;
147  }
148 
150  StringEncodingDictionary& operator=(
151  StringEncodingDictionary&& other) = default;
152 
158  bool HasToken(const boost::string_view token) const
159  {
160  return mapping.find(token) != mapping.end();
161  }
162 
170  size_t AddToken(const boost::string_view token)
171  {
172  tokens.emplace_back(token);
173 
174  size_t size = mapping.size();
175 
176  mapping[tokens.back()] = ++size;
177 
178  return size;
179  }
180 
187  size_t Value(const boost::string_view token) const
188  {
189  return mapping.at(token);
190  }
191 
193  size_t Size() const { return mapping.size(); }
194 
196  void Clear()
197  {
198  mapping.clear();
199  tokens.clear();
200  }
201 
203  const std::deque<std::string>& Tokens() const { return tokens; }
205  std::deque<std::string>& Tokens() { return tokens; }
206 
208  const MapType& Mapping() const { return mapping; }
210  MapType& Mapping() { return mapping; }
211 
215  template<typename Archive>
216  void serialize(Archive& ar, const uint32_t /* version */)
217  {
218  size_t numTokens = tokens.size();
219 
220  ar(CEREAL_NVP(numTokens));
221 
222  if (cereal::is_loading<Archive>())
223  {
224  tokens.resize(numTokens);
225 
226  for (std::string& token : tokens)
227  {
228  ar(CEREAL_NVP(token));
229 
230  size_t tokenValue = 0;
231  ar(CEREAL_NVP(tokenValue));
232  mapping[token] = tokenValue;
233  }
234  }
235  if (cereal::is_saving<Archive>())
236  {
237  for (std::string& token : tokens)
238  {
239  ar(CEREAL_NVP(token));
240 
241  size_t tokenValue = mapping.at(token);
242  ar(CEREAL_NVP(tokenValue));
243  }
244  }
245  }
246 
247  private:
249  std::deque<std::string> tokens;
250 
252  MapType mapping;
253 };
254 
255 template<>
257 {
258  public:
260  using MapType = std::array<size_t, 1 << CHAR_BIT>;
261 
263  using TokenType = int;
264 
267  size(0)
268  {
269  mapping.fill(0);
270  }
271 
278  bool HasToken(const int token) const
279  {
280  return mapping[token] > 0;
281  }
282 
291  size_t AddToken(const int token)
292  {
293  mapping[token] = ++size;
294 
295  return size;
296  }
297 
305  size_t Value(const int token) const
306  {
307  return mapping[token];
308  }
309 
311  size_t Size() const
312  {
313  return size;
314  }
315 
317  void Clear()
318  {
319  mapping.fill(0);
320  }
321 
323  const MapType& Mapping() const { return mapping; }
325  MapType& Mapping() { return mapping; }
326 
330  template<typename Archive>
331  void serialize(Archive& ar, const uint32_t /* version */)
332  {
333  ar(CEREAL_NVP(mapping));
334  ar(CEREAL_NVP(size));
335  }
336 
337  private:
339  MapType mapping;
340 
342  size_t size;
343 };
344 
345 } // namespace data
346 } // namespace mlpack
347 
348 #endif
const MapType & Mapping() const
Get the mapping.
size_t AddToken(const int token)
The function adds the given token to the dictionary and assigns a label to the token.
size_t AddToken(T &&token)
The function adds the given token to the dictionary and assigns a label to the token.
Linear algebra utility functions, generally performed on matrices or vectors.
std::unordered_map< boost::string_view, size_t, boost::hash< boost::string_view > > MapType
A convenient alias for the internal type of the map.
const std::deque< std::string > & Tokens() const
Get the tokens.
The core includes that mlpack expects; standard C++ includes and Armadillo.
bool HasToken(const int token) const
The function returns true if the dictionary contains the given token.
size_t Value(const Token &token) const
The function returns the label assigned to the given token.
void serialize(Archive &ar, const uint32_t)
Serialize the class to the given archive.
std::array< size_t, 1<< CHAR_BIT > MapType
A convenient alias for the internal type of the map.
bool HasToken(const boost::string_view token) const
The function returns true if the dictionary contains the given token.
void serialize(Archive &ar, const uint32_t)
Serialize the class to the given archive.
This class provides a dictionary interface for the purpose of string encoding.
boost::string_view TokenType
The type of the token that the dictionary stores.
StringEncodingDictionary(const StringEncodingDictionary &other)
Copy the class from the given object.
Token TokenType
The type of the token that the dictionary stores.
int TokenType
The type of the token that the dictionary stores.
bool HasToken(const Token &token) const
The function returns true if the dictionary contains the given token.
const MapType & Mapping() const
Get the mapping.
size_t AddToken(const boost::string_view token)
The function adds the given token to the dictionary and assigns a label to the token.
void serialize(Archive &ar, const uint32_t)
Serialize the class to the given archive.
size_t Size() const
Get the size of the dictionary.
size_t Size() const
Get the size of the dictionary.
size_t Value(const boost::string_view token) const
The function returns the label assigned to the given token.
size_t Value(const int token) const
The function returns the label assigned to the given token.
std::unordered_map< Token, size_t > MapType
A convenient alias for the internal type of the map.
StringEncodingDictionary & operator=(const StringEncodingDictionary &other)
Copy the class from the given object.