#include <unordered_map>
/// Character-level predicate (declaration only; the definition is not part
/// of this listing).
/// NOTE(review): judging by the name, this presumably returns true when `c`
/// is neither a letter nor a digit -- confirm against the implementation.
/// @param c Character to classify.
static bool isNotAlphaNumeric(char c);
/// String-level predicate (declaration only; the definition is not part of
/// this listing).
/// NOTE(review): presumably reports whether `s` consists of alphanumeric
/// characters; the exact rule (e.g. for the empty string) must be confirmed
/// against the implementation.
/// @param s String to inspect; taken by const reference and not modified.
static bool isAlphaNumeric(const std::string &s);
/// Tokenizes a string using a set of delimiter characters (declaration only).
/// @param s      Input string; not modified.
/// @param res    Output container that receives the tokens.
///               NOTE(review): whether `res` is cleared first or appended to
///               is not visible from the declaration -- confirm.
/// @param delims Delimiter specification.
///               NOTE(review): presumably each character of `delims` is an
///               individual delimiter -- confirm against the implementation.
static void string2TokensDlm(const std::string &s,
                             std::vector<std::string> &res,
                             const std::string &delims);
61 static void updateBagDlm(
const Table& table, std::vector<std::vector<std::string>>& bow,
62 ui column,
const std::string& dlim,
ui strategy);
67 static void sortIdMap(std::vector<ui>& id_map,
const std::vector<std::vector<ui>>& datasets);
74 std::vector<std::vector<ui>>&
recordsA,
75 std::vector<std::vector<ui>>&
recordsB,
78 std::vector<double>&
wordwt,
81 ui columnA,
ui columnB,
84 std::vector<std::vector<ui>>&
recordsA,
86 std::vector<double>&
wordwt,
std::vector< ui > q
Definition block.cc:9
std::vector< TokenizerType > tok_type
Definition block.cc:8
ui num_word
Definition block.cc:7
std::vector< std::vector< double > > weightsA
Definition blocker_config.cc:21
std::vector< std::vector< ui > > id_mapA
Definition blocker_config.cc:15
std::vector< std::vector< std::vector< ui > > > recordsA
Definition blocker_config.cc:19
std::vector< std::vector< double > > weightsB
Definition blocker_config.cc:22
std::vector< std::vector< ui > > id_mapB
Definition blocker_config.cc:16
std::vector< std::vector< double > > wordwt
Definition blocker_config.cc:23
std::vector< std::vector< std::vector< ui > > > recordsB
Definition blocker_config.cc:20
Definition dataframe.h:19
Definition tokenizer.h:27
Tokenizer(const Tokenizer &other)=delete
static void string2TokensAlphaNumeric(const std::string &s, std::vector< std::string > &res)
Definition tokenizer.cc:92
static void string2TokensQGram(const std::string &s, std::vector< std::string > &res, ui q)
Definition tokenizer.cc:48
static void updateBagQGram(const Table &table, std::vector< std::vector< std::string > > &bow, ui column, ui q)
Definition tokenizer.cc:161
static void string2TokensDlm(const std::string &s, std::vector< std::string > &res, const std::string &delims)
Definition tokenizer.cc:23
static void stringNormalize(std::string &s, ui startegy)
Definition tokenizer.cc:106
static void updateBagDlm(const Table &table, std::vector< std::vector< std::string > > &bow, ui column, const std::string &dlim, ui strategy)
Definition tokenizer.cc:136
static void resTableAttr2IntVector(const Table &resTable, std::vector< std::vector< ui > > &recordsA, std::vector< std::vector< ui > > &recordsB, std::vector< double > &weightsA, std::vector< double > &weightsB, std::vector< double > &wordwt, ui columnA, ui columnB, TokenizerType tok_type, ui &num_word, ui q)
Definition tokenizer.cc:513
static void updateBagAlphaNumeric(const Table &table, std::vector< std::vector< std::string > > &bow, ui column)
Definition tokenizer.cc:184
static void string2TokensWSpace(const std::string &s, std::vector< std::string > &res)
Definition tokenizer.cc:85
static void SelftableAttr2IntVector(const Table &tableA, std::vector< std::vector< ui > > &recordsA, std::vector< double > &weightsA, std::vector< double > &wordwt, std::vector< ui > &id_mapA, ui columnA, TokenizerType tok_type, ui &num_word, ui q)
Definition tokenizer.cc:394
Tokenizer(Tokenizer &&other)=delete
static void sortIdMap(std::vector< ui > &id_map, const std::vector< std::vector< ui > > &datasets)
Definition tokenizer.cc:207
static void RStableAttr2IntVector(const Table &tableA, const Table &tableB, std::vector< std::vector< ui > > &recordsA, std::vector< std::vector< ui > > &recordsB, std::vector< double > &weightsA, std::vector< double > &weightsB, std::vector< double > &wordwt, std::vector< ui > &id_mapA, std::vector< ui > &id_mapB, ui columnA, ui columnB, TokenizerType tok_type, ui &num_word, ui q)
Definition tokenizer.cc:233
TokenizerType
Definition type.h:39
unsigned int ui
Definition type.h:8