Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
tokenizer.h
Go to the documentation of this file.
1/*
2 * author: Chaoji Zuo and Zhizhi Wang in rutgers-db/SIGMOD2022-Programming-Contest-Public
3 * modified: Yunqi Li
4 * contact: liyunqixa@gmail.com
5 */
6#ifndef _TOKENIZER_H_
7#define _TOKENIZER_H_
8
9#include "config.h"
10#include "type.h"
11#include "dataframe.h"
12#include <regex>
13#include <cmath>
14#include <string>
15#include <vector>
16#include <algorithm>
17#include <unordered_map>
18#include <ctype.h>
19
20// If we need to do q-gram tokenization, we can apply text normalization,
21// i.e., keep only the characters a-z/A-Z and 0-9 in the string.
22// If we need to use dlm tokenization, we should not apply text normalization,
23// since some tokens like "18.9" may be eliminated that way.
24
25
26class Tokenizer
27{
28public:
29 Tokenizer() = default;
30 ~Tokenizer() = default;
31 Tokenizer(const Tokenizer& other) = delete;
32 Tokenizer(Tokenizer&& other) = delete;
33
34private:
35 // check whether a string contains other (non-alphanumeric) chars
36 static bool isNotAlphaNumeric(char c);
37 static bool isAlphaNumeric(const std::string &s);
38
39public:
40 /*
41 * As suggested in sparkly, it's better to tokenize strings keeping only alphanumeric chars.
42 * The dlm_dc0 & q(3)-gram tokenizers will introduce non-alphanumeric characters.
43 * We could set the macro "SKIP_NO_ALPHANUMERIC" to avoid this.
44 * Alternatively, we could also use the alphanumeric tokenizer.
45 * We have already synthesized the differences between py_entitymatching & blocker.
46 */
47 // dlm_dc0: tokenize by white space, "\n" or "\t", etc.
48 static void string2TokensDlm(const std::string &s, std::vector<std::string> &res,
49 const std::string &delims);
50 // q-gram: default 3-qgram with padding
51 static void string2TokensQGram(const std::string &s, std::vector<std::string> &res,
52 ui q);
53 // w-space: it's the same function as dlm_dc0
54 static void string2TokensWSpace(const std::string &s, std::vector<std::string> &res);
55 // alphanumeric: returns a list of tokens that are maximal sequences of consecutive alphanumeric characters.
56 static void string2TokensAlphaNumeric(const std::string &s, std::vector<std::string> &res);
57
58public:
59 // Convert a table into a set of int vectors
60 static void stringNormalize(std::string& s, ui startegy);
61 static void updateBagDlm(const Table& table, std::vector<std::vector<std::string>>& bow,
62 ui column, const std::string& dlim, ui strategy);
63 static void updateBagQGram(const Table& table, std::vector<std::vector<std::string>>& bow,
64 ui column, ui q);
65 static void updateBagAlphaNumeric(const Table &table, std::vector<std::vector<std::string>>& bow,
66 ui column);
67 static void sortIdMap(std::vector<ui>& id_map, const std::vector<std::vector<ui>>& datasets);
68
69public:
70 // weightsA & weightsB are the same:
71 // they record the word weights,
72 // each of which is log_10(records_num / word_frequency)
73 static void RStableAttr2IntVector(const Table& tableA, const Table& tableB,
74 std::vector<std::vector<ui>>& recordsA,
75 std::vector<std::vector<ui>>& recordsB,
76 std::vector<double>& weightsA,
77 std::vector<double>& weightsB,
78 std::vector<double>& wordwt,
79 std::vector<ui>& id_mapA,
80 std::vector<ui>& id_mapB,
81 ui columnA, ui columnB,
82 TokenizerType tok_type, ui& num_word, ui q);
83 static void SelftableAttr2IntVector(const Table& tableA,
84 std::vector<std::vector<ui>>& recordsA,
85 std::vector<double>& weightsA,
86 std::vector<double>& wordwt,
87 std::vector<ui>& id_mapA,
88 ui columnA, TokenizerType tok_type,
89 ui& num_word, ui q);
90 // tokenize the sample / match res table
91 static void resTableAttr2IntVector(const Table &resTable, std::vector<std::vector<ui>> &recordsA,
92 std::vector<std::vector<ui>> &recordsB, std::vector<double> &weightsA,
93 std::vector<double> &weightsB, std::vector<double> &wordwt,
94 ui columnA, ui columnB, TokenizerType tok_type,
95 ui &num_word, ui q);
96};
97
98
99#endif // _TOKENIZER_H_
std::vector< ui > q
Definition block.cc:9
std::vector< TokenizerType > tok_type
Definition block.cc:8
ui num_word
Definition block.cc:7
std::vector< std::vector< double > > weightsA
Definition blocker_config.cc:21
std::vector< std::vector< ui > > id_mapA
Definition blocker_config.cc:15
std::vector< std::vector< std::vector< ui > > > recordsA
Definition blocker_config.cc:19
std::vector< std::vector< double > > weightsB
Definition blocker_config.cc:22
std::vector< std::vector< ui > > id_mapB
Definition blocker_config.cc:16
std::vector< std::vector< double > > wordwt
Definition blocker_config.cc:23
std::vector< std::vector< std::vector< ui > > > recordsB
Definition blocker_config.cc:20
Definition dataframe.h:19
Definition tokenizer.h:27
Tokenizer(const Tokenizer &other)=delete
static void string2TokensAlphaNumeric(const std::string &s, std::vector< std::string > &res)
Definition tokenizer.cc:92
static void string2TokensQGram(const std::string &s, std::vector< std::string > &res, ui q)
Definition tokenizer.cc:48
static void updateBagQGram(const Table &table, std::vector< std::vector< std::string > > &bow, ui column, ui q)
Definition tokenizer.cc:161
static void string2TokensDlm(const std::string &s, std::vector< std::string > &res, const std::string &delims)
Definition tokenizer.cc:23
static void stringNormalize(std::string &s, ui startegy)
Definition tokenizer.cc:106
Tokenizer()=default
static void updateBagDlm(const Table &table, std::vector< std::vector< std::string > > &bow, ui column, const std::string &dlim, ui strategy)
Definition tokenizer.cc:136
~Tokenizer()=default
static void resTableAttr2IntVector(const Table &resTable, std::vector< std::vector< ui > > &recordsA, std::vector< std::vector< ui > > &recordsB, std::vector< double > &weightsA, std::vector< double > &weightsB, std::vector< double > &wordwt, ui columnA, ui columnB, TokenizerType tok_type, ui &num_word, ui q)
Definition tokenizer.cc:513
static void updateBagAlphaNumeric(const Table &table, std::vector< std::vector< std::string > > &bow, ui column)
Definition tokenizer.cc:184
static void string2TokensWSpace(const std::string &s, std::vector< std::string > &res)
Definition tokenizer.cc:85
static void SelftableAttr2IntVector(const Table &tableA, std::vector< std::vector< ui > > &recordsA, std::vector< double > &weightsA, std::vector< double > &wordwt, std::vector< ui > &id_mapA, ui columnA, TokenizerType tok_type, ui &num_word, ui q)
Definition tokenizer.cc:394
Tokenizer(Tokenizer &&other)=delete
static void sortIdMap(std::vector< ui > &id_map, const std::vector< std::vector< ui > > &datasets)
Definition tokenizer.cc:207
static void RStableAttr2IntVector(const Table &tableA, const Table &tableB, std::vector< std::vector< ui > > &recordsA, std::vector< std::vector< ui > > &recordsB, std::vector< double > &weightsA, std::vector< double > &weightsB, std::vector< double > &wordwt, std::vector< ui > &id_mapA, std::vector< ui > &id_mapB, ui columnA, ui columnB, TokenizerType tok_type, ui &num_word, ui q)
Definition tokenizer.cc:233
TokenizerType
Definition type.h:39
unsigned int ui
Definition type.h:8