bag_of_words_encoding_policy.hpp
Go to the documentation of this file.
1 
13 #ifndef MLPACK_CORE_DATA_STR_ENCODING_POLICIES_BAG_OF_WORDS_ENCODING_POLICY_HPP
14 #define MLPACK_CORE_DATA_STR_ENCODING_POLICIES_BAG_OF_WORDS_ENCODING_POLICY_HPP
15 
16 #include <mlpack/prereqs.hpp>
19 
20 namespace mlpack {
21 namespace data {
22 
36 {
37  public:
/**
 * Clear the internal state of the policy.  There is nothing to clear here,
 * so the call is a no-op.
 */
static void Reset() { /* Intentionally empty. */ }
45 
/**
 * Initialize the output matrix before encoding.
 *
 * The work is delegated to output.zeros(rows, cols), called with
 * dictionarySize rows and datasetSize columns.  For Armadillo-style matrices
 * this is expected to give a zero-filled matrix of that shape.
 *
 * @tparam MatType Matrix type providing a zeros(rows, cols) member.
 * @param output Matrix to initialize.
 * @param datasetSize Number of columns requested.
 * @param maxNumTokens Unused by this policy.
 * @param dictionarySize Number of rows requested.
 */
template<typename MatType>
static void InitMatrix(MatType& output,
                       const size_t datasetSize,
                       const size_t /* maxNumTokens */,
                       const size_t dictionarySize)
{
  const size_t numRows = dictionarySize;
  const size_t numCols = datasetSize;

  output.zeros(numRows, numCols);
}
66 
/**
 * Initialize the output container before encoding.
 *
 * After the call, output holds exactly datasetSize rows, each containing
 * dictionarySize value-initialized (zero) elements.  assign() is used rather
 * than resize() so that rows already present in output are reset as well:
 * resize() would leave their old lengths and values in place.
 *
 * @tparam ElemType Element type of the rows.
 * @param output Container to initialize; any previous content is discarded.
 * @param datasetSize Number of rows to create.
 * @param maxNumTokens Unused by this policy.
 * @param dictionarySize Number of elements in every row.
 */
template<typename ElemType>
static void InitMatrix(std::vector<std::vector<ElemType>>& output,
                       const size_t datasetSize,
                       const size_t /* maxNumTokens */,
                       const size_t dictionarySize)
{
  output.assign(datasetSize, std::vector<ElemType>(dictionarySize));
}
89 
/**
 * Record one occurrence of a token in the output matrix.
 *
 * The element at row (value - 1), column line is incremented by one.
 *
 * @tparam MatType Matrix type whose operator()(row, col) yields a modifiable
 *     element.
 * @param output Matrix that accumulates the counts.
 * @param value Label of the token; labels start from one.
 * @param line Column of the output to update.
 * @param index Unused by this policy.
 */
template<typename MatType>
static void Encode(MatType& output,
                   const size_t value,
                   const size_t line,
                   const size_t /* index */)
{
  // Labels are one-based, rows are zero-based.
  const size_t tokenRow = value - 1;

  output(tokenRow, line) += 1;
}
111 
/**
 * Record one occurrence of a token in the output container.
 *
 * The element at position (value - 1) of row line is incremented by one.
 *
 * @tparam ElemType Element type of the rows.
 * @param output Container that accumulates the counts.
 * @param value Label of the token; labels start from one.
 * @param line Row of the output to update.
 * @param index Unused by this policy.
 */
template<typename ElemType>
static void Encode(std::vector<std::vector<ElemType>>& output,
                   const size_t value,
                   const size_t line,
                   const size_t /* index */)
{
  std::vector<ElemType>& counts = output[line];

  // Labels are one-based, positions within the row are zero-based.
  counts[value - 1] += 1;
}
136 
/**
 * Token preprocessing step.  The bag of words policy does not use it: all
 * three arguments are ignored and the body is empty.
 *
 * @param line Unused.
 * @param index Unused.
 * @param value Unused.
 */
static void PreprocessToken(size_t /* line */,
                            size_t /* index */,
                            size_t /* value */)
{
  // Intentionally empty.
}
148 
/**
 * Serialize the policy to the given archive.  There is nothing to
 * serialize, so the archive is left untouched.
 *
 * @tparam Archive Archive type.
 * @param ar Unused.
 * @param version Unused.
 */
template<typename Archive>
void serialize(Archive& /* ar */, const uint32_t /* version */)
{ /* Nothing to serialize. */ }
157 };
158 
165 template<typename TokenType>
168 } // namespace data
169 } // namespace mlpack
170 
171 #endif
Definition of the BagOfWordsEncodingPolicy class.
Linear algebra utility functions, generally performed on matrices or vectors.
The core includes that mlpack expects; standard C++ includes and Armadillo.
static void InitMatrix(std::vector< std::vector< ElemType >> &output, const size_t datasetSize, const size_t, const size_t dictionarySize)
The function initializes the output matrix.
The class translates a set of strings into numbers using various encoding algorithms.
static void Encode(MatType &output, const size_t value, const size_t line, const size_t)
The function performs the bag of words encoding algorithm, i.e. it increments by one the element of the output matrix at row (value - 1) and column line.
static void PreprocessToken(size_t, size_t, size_t)
The function is not used by the bag of words encoding policy.
This class provides a dictionary interface for the purpose of string encoding.
static void Reset()
Clear the necessary internal variables.
static void InitMatrix(MatType &output, const size_t datasetSize, const size_t, const size_t dictionarySize)
The function initializes the output matrix.
void serialize(Archive &, const uint32_t)
Serialize the class to the given archive.
static void Encode(std::vector< std::vector< ElemType >> &output, const size_t value, const size_t line, const size_t)
The function performs the bag of words encoding algorithm, i.e. it increments by one the element of the output at position (value - 1) of row line.