Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
feature_index.h
Go to the documentation of this file.
1/*
2 * author: Yunqi Li
3 * contact: liyunqixa@gmail.com
4 */
5/*
6 * this file should only be included in feature.cpp & related .hpp files
7 * this file contains the methods for accelerating feature extraction
8 * from interchangeable values (e.g. similarity joins/searches)
9 * this is the version for vector<string> rather than vector<ui>
10 */
11#ifndef _FEATURE_INDEX_H_
12#define _FEATURE_INDEX_H_
13
15#include <string>
16#include <vector>
17
18
20{
21public:
22 // tokens are stored as string or (unsigned) integer
23 // the integer tokens are used by the similarity join to speed up index initialization
24 // e.g., set a threshold of 0.8 to quickly filter pairs in two interchangeable groups
25 using Group = std::unordered_map<int, std::vector<std::string>>;
26 using Groups = std::vector<std::unordered_map<int, std::vector<std::string>>>;
27 using GroupToken = std::unordered_map<int, std::vector<std::vector<std::string>>>;
28 using GroupTokens = std::vector<std::unordered_map<int, std::vector<std::vector<std::string>>>>;
29 using GroupTokenInt = std::unordered_map<int, std::vector<std::vector<ui>>>;
30 using GroupTokensInt = std::vector<std::unordered_map<int, std::vector<std::vector<ui>>>>;
31 using Cluster = std::vector<std::unordered_map<std::string, int>>;
32
33public:
34 std::vector<std::string> str_gt_10w = {"name", "title", "description"};
35 std::vector<std::string> str_bt_1w_5w = {};
36 std::vector<std::string> str_bt_5w_10w = {};
37 std::vector<std::string> str_eq_1w = {"brand", "category", "manufacturer"};
38
39 // for each attr, cache all the values for each pair of groups with size larger than MIN_CACHED_LENGTH
40 double ****featureValCache = nullptr;
41 int **discreteCacheIdx = nullptr;
42 // for releasing buffers only
43 std::vector<int> attrCahceLength;
44
45 // min length of the group to be cached
46 const int MIN_CACHED_LENGTH = 10000;
47 // det used for length filter
48 const double LENGTH_FILTER_DET = 0.1;
49 // length filter
53
54public:
55 FeatureIndex() = default;
57 int attrSize = (int)attrCahceLength.size();
58 for(int i = 0; i < attrSize; i++) {
59 for(int j = 0; j < attrCahceLength[i]; j++) {
60 for(int k = 0; k < attrCahceLength[i]; k++)
61 delete[] featureValCache[i][j][k];
62 delete[] featureValCache[i][j];
63 }
64 delete[] featureValCache[i];
65 delete[] discreteCacheIdx[i];
66 }
67 delete[] featureValCache;
68 delete[] discreteCacheIdx;
69 }
70 FeatureIndex(const FeatureIndex &other) = delete;
71 FeatureIndex(FeatureIndex &&other) = delete;
72
73public:
74 // get the column index for a specific feature
75 int calCahceIndex(const std::string &func, const std::string &tok, int numFeature);
76
77private:
78 // sort according to idf
79 void normalizeTokens(const GroupTokens &grpToks, GroupTokensInt &grpToksInt);
80
81 void calIndexLength4(int curAttrIdx, const std::vector<int> &grpid, const GroupToken &curGrpDlm, const GroupTokenInt &curGrpDlmInt,
82 bool isCoeff);
83 void calIndexLength6(int curAttrIdx, const std::vector<int> &grpid, const Group &curGrp, const GroupToken &curGrpQgm,
84 const GroupTokenInt &curGrpQgmInt, bool isCoeff);
85 void calIndexLength8(int curAttrIdx, const std::vector<int> &grpid, const GroupToken &curGrpDlm, const GroupTokenInt &curGrpDlmInt,
86 const GroupToken &curGrpQgm, const GroupTokenInt &curGrpQgmInt, bool isCoeff);
87
88public:
89 // get number of features according to attr
90 int calNumFeature(const std::string attr);
91
92 void globalInit(const std::vector<int> &keyNum, const std::vector<std::string> &attrs, Groups &groups, const GroupTokens &grpdlm,
93 const GroupTokens &grpqgm, bool isCoeff = false);
94};
95
96
97#endif // _FEATURE_INDEX_H_
Definition feature_index.h:20
std::vector< std::string > str_bt_5w_10w
Definition feature_index.h:36
std::vector< std::string > str_bt_1w_5w
Definition feature_index.h:35
const double diceLengthFilter
Definition feature_index.h:52
const double cosLengthFilter
Definition feature_index.h:51
double **** featureValCache
Definition feature_index.h:40
std::vector< std::unordered_map< int, std::vector< std::string > > > Groups
Definition feature_index.h:26
int ** discreteCacheIdx
Definition feature_index.h:41
int calNumFeature(const std::string attr)
Definition feature_index.cc:311
std::unordered_map< int, std::vector< std::vector< std::string > > > GroupToken
Definition feature_index.h:27
std::unordered_map< int, std::vector< std::string > > Group
Definition feature_index.h:25
std::vector< std::unordered_map< std::string, int > > Cluster
Definition feature_index.h:31
void globalInit(const std::vector< int > &keyNum, const std::vector< std::string > &attrs, Groups &groups, const GroupTokens &grpdlm, const GroupTokens &grpqgm, bool isCoeff=false)
Definition feature_index.cc:326
std::vector< std::string > str_gt_10w
Definition feature_index.h:34
FeatureIndex(FeatureIndex &&other)=delete
const double universalDet
Definition feature_index.h:50
std::vector< std::unordered_map< int, std::vector< std::vector< std::string > > > > GroupTokens
Definition feature_index.h:28
FeatureIndex(const FeatureIndex &other)=delete
std::vector< std::string > str_eq_1w
Definition feature_index.h:37
FeatureIndex()=default
const int MIN_CACHED_LENGTH
Definition feature_index.h:46
std::unordered_map< int, std::vector< std::vector< ui > > > GroupTokenInt
Definition feature_index.h:29
std::vector< std::unordered_map< int, std::vector< std::vector< ui > > > > GroupTokensInt
Definition feature_index.h:30
int calCahceIndex(const std::string &func, const std::string &tok, int numFeature)
Definition feature_index.cc:9
const double LENGTH_FILTER_DET
Definition feature_index.h:48
std::vector< int > attrCahceLength
Definition feature_index.h:43
~FeatureIndex()
Definition feature_index.h:56