string_encoding.hpp
Go to the documentation of this file.
1 
13 #ifndef MLPACK_CORE_DATA_STRING_ENCODING_HPP
14 #define MLPACK_CORE_DATA_STRING_ENCODING_HPP
15 
16 #include <mlpack/prereqs.hpp>
17 #include <mlpack/core/boost_backport/boost_backport_string_view.hpp>
20 #include <vector>
21 
22 namespace mlpack {
23 namespace data {
24 
33 template<typename EncodingPolicyType,
34  typename DictionaryType>
36 {
37  public:
42  template<typename ... ArgTypes>
43  StringEncoding(ArgTypes&& ... args);
44 
50  StringEncoding(EncodingPolicyType encodingPolicy);
51 
55  StringEncoding(StringEncoding&);
56 
58  StringEncoding(const StringEncoding&);
59 
61  StringEncoding& operator=(const StringEncoding&) = default;
62 
64  StringEncoding(StringEncoding&&);
65 
67  StringEncoding& operator=(StringEncoding&&) = default;
68 
84  template<typename TokenizerType>
85  void CreateMap(const std::string& input,
86  const TokenizerType& tokenizer);
87 
91  void Clear();
92 
118  template<typename OutputType, typename TokenizerType>
119  void Encode(const std::vector<std::string>& input,
120  OutputType& output,
121  const TokenizerType& tokenizer);
122 
124  const DictionaryType& Dictionary() const { return dictionary; }
126  DictionaryType& Dictionary() { return dictionary; }
127 
129  const EncodingPolicyType& EncodingPolicy() const { return encodingPolicy; }
131  EncodingPolicyType& EncodingPolicy() { return encodingPolicy; }
132 
136  template<typename Archive>
137  void serialize(Archive& ar, const uint32_t /* version */);
138 
139  private:
168  template<typename OutputType, typename TokenizerType, typename PolicyType>
169  void EncodeHelper(const std::vector<std::string>& input,
170  OutputType& output,
171  const TokenizerType& tokenizer,
172  PolicyType& policy);
173 
197  template<typename TokenizerType, typename PolicyType, typename ElemType>
198  void EncodeHelper(const std::vector<std::string>& input,
199  std::vector<std::vector<ElemType>>& output,
200  const TokenizerType& tokenizer,
201  PolicyType& policy,
202  typename std::enable_if<StringEncodingPolicyTraits<
203  PolicyType>::onePassEncoding>::type* = 0);
204 
205  private:
207  EncodingPolicyType encodingPolicy;
209  DictionaryType dictionary;
210 };
211 
212 } // namespace data
213 } // namespace mlpack
214 
215 // Include implementation.
216 #include "string_encoding_impl.hpp"
217 
218 #endif
StringEncoding(ArgTypes &&... args)
Pass the given arguments to the policy constructor and create the StringEncoding object using the policy.
This is a template struct that provides some information about various encoding policies.
Linear algebra utility functions, generally performed on matrices or vectors.
void CreateMap(const std::string &input, const TokenizerType &tokenizer)
Initialize the dictionary using the given corpus.
The core includes that mlpack expects; standard C++ includes and Armadillo.
void serialize(Archive &ar, const uint32_t)
Serialize the class to the given archive.
The class translates a set of strings into numbers using various encoding algorithms.
const EncodingPolicyType & EncodingPolicy() const
Return the encoding policy object.
EncodingPolicyType & EncodingPolicy()
Modify the encoding policy object.
StringEncoding & operator=(const StringEncoding &)=default
Default copy assignment operator.
void Clear()
Clear the dictionary.
const DictionaryType & Dictionary() const
Return the dictionary.
DictionaryType & Dictionary()
Modify the dictionary.
void Encode(const std::vector< std::string > &input, OutputType &output, const TokenizerType &tokenizer)
Encode the given text and write the result to the given output.