13 #ifndef MLPACK_CORE_DATA_STRING_ENCODING_POLICIES_TF_IDF_ENCODING_POLICY_HPP 14 #define MLPACK_CORE_DATA_STRING_ENCODING_POLICIES_TF_IDF_ENCODING_POLICY_HPP 76 const bool smoothIdf =
true) :
86 tokensFrequences.clear();
87 numContainingStrings.clear();
103 template<
typename MatType>
105 const size_t datasetSize,
107 const size_t dictionarySize)
109 output.zeros(dictionarySize, datasetSize);
126 template<
typename ElemType>
127 static void InitMatrix(std::vector<std::vector<ElemType>>& output,
128 const size_t datasetSize,
130 const size_t dictionarySize)
132 output.resize(datasetSize, std::vector<ElemType>(dictionarySize));
147 template<
typename MatType>
153 const typename MatType::elem_type tf =
154 TermFrequency<typename MatType::elem_type>(
155 tokensFrequences[line][value], linesSizes[line]);
157 const typename MatType::elem_type idf =
158 InverseDocumentFrequency<typename MatType::elem_type>(
159 output.n_cols, numContainingStrings[value]);
161 output(value - 1, line) = tf * idf;
179 template<
typename ElemType>
180 void Encode(std::vector<std::vector<ElemType>>& output,
185 const ElemType tf = TermFrequency<ElemType>(
186 tokensFrequences[line][value], linesSizes[line]);
188 const ElemType idf = InverseDocumentFrequency<ElemType>(
189 output.size(), numContainingStrings[value]);
191 output[line][value - 1] = tf * idf;
206 if (line >= tokensFrequences.size())
208 linesSizes.resize(line + 1);
209 tokensFrequences.resize(line + 1);
212 tokensFrequences[line][value]++;
214 if (tokensFrequences[line][value] == 1)
215 numContainingStrings[value]++;
221 const std::vector<std::unordered_map<size_t, size_t>>&
226 return tokensFrequences;
232 return numContainingStrings;
238 return numContainingStrings;
//! Return the lines sizes (read-only reference to the linesSizes member).
242 const std::vector<size_t>&
LinesSizes()
const {
return linesSizes; }
259 template<
typename Archive>
262 ar(CEREAL_NVP(tfType));
263 ar(CEREAL_NVP(smoothIdf));
276 template<
typename ValueType>
277 ValueType TermFrequency(
const size_t numOccurrences,
278 const size_t numTokens)
283 return numOccurrences > 0;
285 return numOccurrences;
287 return static_cast<ValueType
>(numOccurrences) / numTokens;
289 return std::log(static_cast<ValueType>(numOccurrences)) + 1;
291 Log::Fatal <<
"Incorrect term frequency type!";
305 template<
typename ValueType>
306 ValueType InverseDocumentFrequency(
const size_t totalNumLines,
307 const size_t numOccurrences)
311 return std::log(static_cast<ValueType>(totalNumLines + 1) /
312 (1 + numOccurrences)) + 1.0;
316 return std::log(static_cast<ValueType>(totalNumLines) /
317 numOccurrences) + 1.0;
323 std::vector<std::unordered_map<size_t, size_t>> tokensFrequences;
328 std::unordered_map<size_t, size_t> numContainingStrings;
330 std::vector<size_t> linesSizes;
343 template<
typename TokenType>
std::unordered_map< size_t, size_t > & NumContainingStrings()
Modify the map that stores, for each token, the number of strings containing that token.
TfTypes TfType() const
Return the term frequency type.
Linear algebra utility functions, generally performed on matrices or vectors.
void Reset()
Clear the necessary internal variables.
static void InitMatrix(MatType &output, const size_t datasetSize, const size_t, const size_t dictionarySize)
The function initializes the output matrix.
static void InitMatrix(std::vector< std::vector< ElemType >> &output, const size_t datasetSize, const size_t, const size_t dictionarySize)
The function initializes the output matrix.
std::vector< size_t > & LinesSizes()
Modify the lines sizes.
The core includes that mlpack expects; standard C++ includes and Armadillo.
void Encode(MatType &output, const size_t value, const size_t line, const size_t)
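The function performs the TfIdf encoding algorithm, i.e. it writes the product of the term frequency and the inverse document frequency of the given token into the output matrix.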
The class translates a set of strings into numbers using various encoding algorithms.
TfTypes
Enum class used to identify the type of the term frequency statistics.
This class provides a dictionary interface for the purpose of string encoding.
static MLPACK_EXPORT util::PrefixedOutStream Fatal
Prints fatal messages prefixed with [FATAL], then terminates the program.
const std::unordered_map< size_t, size_t > & NumContainingStrings() const
Get the map that stores, for each token, the number of strings containing that token.
TfIdfEncodingPolicy(const TfTypes tfType=TfTypes::RAW_COUNT, const bool smoothIdf=true)
Construct this using the term frequency type and the inverse document frequency type.
TfTypes & TfType()
Modify the term frequency type.
void PreprocessToken(const size_t line, const size_t, const size_t value)
const std::vector< size_t > & LinesSizes() const
Return the lines sizes.
void Encode(std::vector< std::vector< ElemType >> &output, const size_t value, const size_t line, const size_t)
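The function performs the TfIdf encoding algorithm, i.e. it writes the product of the term frequency and the inverse document frequency of the given token into the output vector of vectors.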
Definition of the TfIdfEncodingPolicy class.
std::vector< std::unordered_map< size_t, size_t > > & TokensFrequences()
Modify token frequencies.
bool & SmoothIdf()
Modify the idf algorithm type (whether it's smooth or not).
void serialize(Archive &ar, const uint32_t)
Serialize the class to the given archive.
const std::vector< std::unordered_map< size_t, size_t > > & TokensFrequences() const
Return token frequencies.
bool SmoothIdf() const
Determine the idf algorithm type (whether it's smooth or not).