Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
group.h
Go to the documentation of this file.
1// TODO: using FeatureIndex class
2// the interchangeable value in blocking may be depracted
3#ifndef _GROUP_H_
4#define _GROUP_H_
5
6#include "common/tokenizer.h"
7#include <sstream>
8#include <string>
9#include <vector>
10#include <unordered_map>
11
12using Groups = std::vector<std::unordered_map<int, std::vector<std::string>>>;
13using GroupTokens = std::vector<std::unordered_map<int, std::vector<std::vector<std::string>>>>;
14using Clusters = std::vector<std::unordered_map<std::string, int>>;
15
17{
18
19// sims
20inline int overlap(const std::vector<std::string> &v1, const std::vector<std::string> &v2)
21{
22 int ovlp = 0;
23
24 std::vector<std::string> res;
25 std::set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), std::back_inserter(res));
26 // __gnu_parallel::set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), std::back_inserter(res));
27 ovlp = res.size();
28
29 return ovlp;
30}
31
32inline int tripletMin(int a, int b, int c)
33{
34 return (a <= b && a <= c) ? a : (b <= c ? b : c);
35}
36
37inline double levDist(const std::string &v1, const std::string &v2)
38{
39 ui v1_size = v1.size();
40 ui v2_size = v2.size();
41
42 if (!v1_size)
43 return v2_size;
44 else if (!v2_size)
45 return v1_size;
46
47 int dist[v1_size + 1][v2_size + 1];
48 for (ui i = 0; i <= v1_size; i++)
49 std::fill(dist[i], dist[i] + v2_size + 1, 0);
50
51 for (ui i = 0; i <= v1_size; i++)
52 dist[i][0] = int(i);
53 for (ui i = 0; i <= v2_size; i++)
54 dist[0][i] = int(i);
55
56 // std::cout << v1 << v2 << std::endl;
57 for (ui i = 1; i <= v1_size; i++)
58 {
59 for (ui j = 1; j <= v2_size; j++)
60 {
61 int cost = (v1[i - 1] == v2[j - 1]) ? 0 : 1;
62 dist[i][j] = tripletMin(dist[i - 1][j] + 1,
63 dist[i][j - 1] + 1,
64 dist[i - 1][j - 1] + cost);
65 }
66 }
67
68 return dist[v1_size][v2_size] * 1.0;
69}
70
71inline double jaccard(const std::vector<std::string> &v1, const std::vector<std::string> &v2)
72{
73 if (v1.empty() && v2.empty())
74 return 1.0;
75
76 int ovlp = overlap(v1, v2);
77
78 return ovlp * 1.0 / (v1.size() + v2.size() - ovlp) * 1.0;
79}
80
81inline double cosine(const std::vector<std::string> &v1, const std::vector<std::string> &v2)
82{
83 if (v1.empty() && v2.empty())
84 return 1.0;
85
86 int ovlp = overlap(v1, v2);
87 double dot_product = v1.size() * v2.size() * 1.0;
88 // double inv_denominator = SimFuncs::inverseSqrt(dot_product);
89
90 // return ovlp * 1.0 * inv_denominator;
91 return ovlp * 1.0 / sqrt(dot_product);
92}
93
94inline double dice(const std::vector<std::string> &v1, const std::vector<std::string> &v2)
95{
96 if (v1.empty() && v2.empty())
97 return 1.0;
98
99 int ovlp = overlap(v1, v2);
100
101 return ovlp * 2.0 / (int)(v1.size() + v2.size()) * 1.0;
102}
103
104inline double exactMatch(const std::string &s1, const std::string &s2)
105{
106 return s1 == s2 ? 1.0 : 0.0;
107}
108
109inline double absoluteNorm(const std::string &s1, const std::string &s2)
110{
111 if (s1 == " " || s2 == " " || s1.empty() || s2.empty())
112 return -1.0;
113
114 double d1 = stod(s1);
115 double d2 = stod(s2);
116
117 if (std::abs(d1) < 1e-5 || std::abs(d2) < 1e-5)
118 return 0.0;
119
120 double diff = std::abs(d1 - d2);
121 double maxVal = std::max(std::abs(d1), std::abs(d2));
122
123 if (diff / maxVal <= 1e-5)
124 return 1.0;
125
126 return 1.0 - diff / maxVal;
127}
128
129inline void stringSplit(std::string str, char delim, std::vector<std::string> &res)
130{
131 std::istringstream iss(str);
132 std::string token;
133 while(getline(iss, token, delim))
134 res.emplace_back(token);
135};
136
137inline void tokenize(const std::string &str, const std::string &type, std::vector<std::string> &tokens,
138 const std::string &delims)
139{
140 tokens.clear();
141 if(type == "dlm") Tokenizer::string2TokensDlm(str, tokens, delims);
142 else if(type == "qgm") Tokenizer::string2TokensQGram(str, tokens, 3);
143 std::sort(tokens.begin(), tokens.end());
144 auto iter = std::unique(tokens.begin(), tokens.end());
145 tokens.resize(std::distance(tokens.begin(), iter));
146}
147
148};
149
150
152{
153public:
155 int totalAttr{0};
156 std::vector<std::string> attrVec;
157 std::vector<int> featureLength;
158
162 std::vector<std::vector<std::vector<int>>> interCGroupsA, interCGroupsB;
163 std::vector<std::vector<int>> interCGrpIdA, interCGrpIdB;
164 std::vector<int> keyLength;
165
166 // for each attr, cache all the values for each two groups with size >= 2
167 double ****featureValCache{nullptr};
168 int **discreteCacheIdx{nullptr};
169 // for release only
170 std::vector<int> attrCahceLength;
171
172 std::vector<std::string> str_gt_10w = {"name", "title", "description"};
173 std::vector<std::string> str_bt_1w_5w = {};
174 std::vector<std::string> str_bt_5w_10w = {};
175 std::vector<std::string> str_eq_1w = {"brand", "category"};
176
177public:
179 GroupInterchangeable(int _totalTable, int _totalAttr, const std::vector<std::string> &_attrVec)
180 : totalTable(_totalTable), totalAttr(_totalAttr), attrVec(_attrVec) {
181 printf("number of tables: %d\tnumber of attrs: %d\n", totalTable, totalAttr);
182 for(const auto &attr : attrVec)
183 printf("%s\t", attr.c_str());
184 printf("\n");
185 }
189
190public:
191 // get number of features according to attr
192 int calNumFeature(const std::string attr) {
193 if(std::count(str_gt_10w.begin(), str_gt_10w.end(), attr) != 0
194 || std::count(str_bt_5w_10w.begin(), str_bt_5w_10w.end(), attr) != 0)
195 return 4;
196 else if(std::count(str_eq_1w.begin(), str_eq_1w.end(), attr) != 0)
197 return 6;
198 else if(std::count(str_bt_1w_5w.begin(), str_bt_1w_5w.end(), attr) != 0)
199 return 8;
200
201 return 0;
202 }
203
204 // get the column index for a specific feature
205 int calCahceIndex(const std::string &func, const std::string &tok, int numFeature) {
206 int idx = 0;
207
208 if(func == "exm" || func == "lev") {
209 idx = func == "lev" ? 4 : 5;
210 return idx;
211 }
212
213 if(tok == "dlm") idx = 0;
214 else if(tok == "qgm") idx = numFeature == 6 ? 0 : 1;
215
216 if(func == "jac") idx = idx * 4 + 0;
217 else if(func == "cos") idx = idx * 4 + 1;
218 else if(func == "dice") idx = idx * 4 + 2;
219 else if(func == "overlap") idx = idx * 4 + 3;
220
221 return idx;
222 }
223
224 int calAttrIndex(const std::string &attr) {
225 auto iter = std::find(attrVec.begin(), attrVec.end(), attr);
226 return std::distance(attrVec.begin(), iter);
227 }
228
229 void globalInit(const std::vector<int> &keyNum, const std::vector<std::string> &attrs, Groups &groups, const GroupTokens &grpdlm, const GroupTokens &grpqgm) {
230 featureValCache = new double***[totalAttr];
231 discreteCacheIdx = new int*[totalAttr];
232
233 for(int i = 0; i < totalAttr; i++) {
234 const auto &curGrp = groups[i];
235 const auto &curGrpDlm = grpdlm[i];
236 const auto &curGrpQgm = grpqgm[i];
237 int numFeature = calNumFeature(attrs[i]);
238 if(numFeature == 0) {
239 std::cerr << "no such attr: " << attrs[i] << std::endl;
240 exit(1);
241 }
242
243 // length > 1
244 std::vector<int> grpid;
245 for(const auto &grpit : curGrp) {
246 int keyId = grpit.first;
247 if(grpit.second.size() > 1)
248 grpid.emplace_back(keyId);
249 }
250
251 int bucketSize = (int)grpid.size();
252 // cache
253 featureValCache[i] = new double**[numFeature];
254 for(int j = 0; j < numFeature; j++) {
255 featureValCache[i][j] = new double*[bucketSize];
256 for(int l = 0; l < bucketSize; l++) {
257 featureValCache[i][j][l] = new double[numFeature];
258 std::fill(featureValCache[i][j][l], featureValCache[i][j][l] + numFeature, 0.0);
259 }
260 }
261 // cache index
262 discreteCacheIdx[i] = new int[keyNum[i]];
263 std::fill(discreteCacheIdx[i], discreteCacheIdx[i] + keyNum[i], 0);
264 for(int didx = 0; didx < bucketSize; didx++)
265 discreteCacheIdx[i][grpid[didx]] = didx;
266
267 attrCahceLength.emplace_back(bucketSize);
268
269 // calculate
270 for(int id1 = 0; id1 < bucketSize; id1++) {
271 int lid = grpid[id1];
272 const auto &ldocs = curGrp.at(lid);
273 const auto &ldlms = curGrpDlm.at(lid);
274 const auto &lqgms = curGrpQgm.at(lid);
275 for(int id2 = id1 + 1; id2 < bucketSize; id2++) {
276 int rid = grpid[id2];
277 const auto &rdocs = curGrp.at(rid);
278 const auto &rdlms = curGrpDlm.at(rid);
279 const auto &rqgms = curGrpQgm.at(rid);
280
281 switch(numFeature) {
282 case 4 : {
283 double maxJacVal = 0.0;
284 double maxCosVal = 0.0;
285 double maxDiceVal = 0.0;
286 int maxOvlpVal = 0;
287 for(const auto &ldoc : ldlms) {
288 for(const auto &rdoc : rdlms) {
289 maxJacVal = std::max(maxJacVal, feature_utils::jaccard(ldoc, rdoc));
290 maxCosVal = std::max(maxCosVal, feature_utils::cosine(ldoc, rdoc));
291 maxDiceVal = std::max(maxDiceVal, feature_utils::dice(ldoc, rdoc));
292 maxOvlpVal = std::max(maxOvlpVal, feature_utils::overlap(ldoc, rdoc));
293 }
294 }
295 featureValCache[i][0][id1][id2] = maxJacVal;
296 featureValCache[i][0][id2][id1] = maxJacVal;
297 featureValCache[i][1][id1][id2] = maxCosVal;
298 featureValCache[i][1][id2][id1] = maxCosVal;
299 featureValCache[i][2][id1][id2] = maxDiceVal;
300 featureValCache[i][2][id2][id1] = maxDiceVal;
301 featureValCache[i][3][id1][id2] = maxOvlpVal * 1.0;
302 featureValCache[i][3][id2][id1] = maxOvlpVal * 1.0;
303 break;
304 }
305 case 6 : {
306 double maxJacVal = 0.0;
307 double maxCosVal = 0.0;
308 double maxDiceVal = 0.0;
309 int maxOvlpVal = 0;
310 double minLevVal = 0.0;
311 double maxExmVal = 0.0;
312 for(const auto &ldoc : lqgms) {
313 for(const auto &rdoc : rqgms) {
314 maxJacVal = std::max(maxJacVal, feature_utils::jaccard(ldoc, rdoc));
315 maxCosVal = std::max(maxCosVal, feature_utils::cosine(ldoc, rdoc));
316 maxDiceVal = std::max(maxDiceVal, feature_utils::dice(ldoc, rdoc));
317 maxOvlpVal = std::max(maxOvlpVal, feature_utils::overlap(ldoc, rdoc));
318 }
319 }
320 for(const auto &ldoc : ldocs) {
321 for(const auto &rdoc : rdocs) {
322 minLevVal = std::min(minLevVal, feature_utils::levDist(ldoc, rdoc));
323 maxExmVal = std::max(maxExmVal, feature_utils::exactMatch(ldoc, rdoc));
324 }
325 }
326 featureValCache[i][0][id1][id2] = maxJacVal;
327 featureValCache[i][0][id2][id1] = maxJacVal;
328 featureValCache[i][1][id1][id2] = maxCosVal;
329 featureValCache[i][1][id2][id1] = maxCosVal;
330 featureValCache[i][2][id1][id2] = maxDiceVal;
331 featureValCache[i][2][id2][id1] = maxDiceVal;
332 featureValCache[i][3][id1][id2] = maxOvlpVal * 1.0;
333 featureValCache[i][3][id2][id1] = maxOvlpVal * 1.0;
334 featureValCache[i][4][id1][id2] = minLevVal;
335 featureValCache[i][4][id2][id1] = minLevVal;
336 featureValCache[i][5][id1][id2] = maxExmVal;
337 featureValCache[i][5][id2][id1] = maxExmVal;
338 break;
339 }
340 case 8 : {
341 double maxJacDlmVal = 0.0, maxJacQgmVal = 0.0;
342 double maxCosDlmVal = 0.0, maxCosQgmVal = 0.0;
343 double maxDiceDlmVal = 0.0, maxDiceQgmVal = 0.0;
344 int maxOvlpDlmVal = 0, maxOvlpQgmVal = 0;
345 for(const auto &ldoc : ldlms) {
346 for(const auto &rdoc : rdlms) {
347 maxJacDlmVal = std::max(maxJacDlmVal, feature_utils::jaccard(ldoc, rdoc));
348 maxCosDlmVal = std::max(maxCosDlmVal, feature_utils::cosine(ldoc, rdoc));
349 maxDiceDlmVal = std::max(maxDiceDlmVal, feature_utils::dice(ldoc, rdoc));
350 maxOvlpDlmVal = std::max(maxOvlpDlmVal, feature_utils::overlap(ldoc, rdoc));
351 }
352 }
353 for(const auto &ldoc : lqgms) {
354 for(const auto &rdoc : rqgms) {
355 maxJacQgmVal = std::max(maxJacQgmVal, feature_utils::jaccard(ldoc, rdoc));
356 maxCosQgmVal = std::max(maxCosQgmVal, feature_utils::cosine(ldoc, rdoc));
357 maxDiceQgmVal = std::max(maxDiceQgmVal, feature_utils::dice(ldoc, rdoc));
358 maxOvlpQgmVal = std::max(maxOvlpQgmVal, feature_utils::overlap(ldoc, rdoc));
359 }
360 }
361 featureValCache[i][0][id1][id2] = maxJacDlmVal;
362 featureValCache[i][0][id2][id1] = maxJacDlmVal;
363 featureValCache[i][1][id1][id2] = maxCosDlmVal;
364 featureValCache[i][1][id2][id1] = maxCosDlmVal;
365 featureValCache[i][2][id1][id2] = maxDiceDlmVal;
366 featureValCache[i][2][id2][id1] = maxDiceDlmVal;
367 featureValCache[i][3][id1][id2] = maxOvlpDlmVal * 1.0;
368 featureValCache[i][3][id2][id1] = maxOvlpDlmVal * 1.0;
369 featureValCache[i][4][id1][id2] = maxJacQgmVal;
370 featureValCache[i][4][id2][id1] = maxJacQgmVal;
371 featureValCache[i][5][id1][id2] = maxCosQgmVal;
372 featureValCache[i][5][id2][id1] = maxCosQgmVal;
373 featureValCache[i][6][id1][id2] = maxDiceQgmVal;
374 featureValCache[i][6][id2][id1] = maxDiceQgmVal;
375 featureValCache[i][7][id1][id2] = maxOvlpQgmVal * 1.0;
376 featureValCache[i][7][id2][id1] = maxOvlpQgmVal * 1.0;
377 break;
378 }
379 }
380 }
381 }
382 }
383 }
384
385 void IO() {
386 std::string delims = " \"\',\\\t\r\n";
387
388 for(int i = 0; i < totalAttr; i ++) {
389 std::string grpPath = "buffer/interchangeable_grp_" + attrVec[i] + ".txt";
390 auto &curGrp = group[i];
391 auto &curGrpDlm = groupTokensDlm[i];
392 auto &curGrpQgm = groupTokensQgm[i];
393 auto &curClt = cluster[i];
394
395 // grp
396 std::string entity;
397 std::vector<std::string> entityVec;
398
399 std::ifstream grpFile(grpPath.c_str(), std::ios::in);
400
401 getline(grpFile, entity);
402 int totalKey = std::stoi(entity);
403 keyLength.emplace_back(totalKey);
404
405 for(int j = 0; j < totalKey; j++) {
406 entityVec.clear();
407 getline(grpFile, entity);
408 feature_utils::stringSplit(entity, ' ', entityVec);
409 int keyId = std::stoi(entityVec[0]);
410 int length = std::stoi(entityVec[1]);
411
412 if(length <= 1)
413 continue;
414
415 for(int l = 0; l < length; l++) {
416 std::string doc;
417 getline(grpFile, doc);
418 // doc
419 curGrp[keyId].emplace_back(doc);
420 // tokenized doc
421 std::vector<std::string> tokensDlm;
422 std::vector<std::string> tokensQgm;
423 std::string tok1 = "dlm", tok2 = "qgm";
424 feature_utils::tokenize(doc, tok1, tokensDlm, delims);
425 feature_utils::tokenize(doc, tok2, tokensQgm, delims);
426 curGrpDlm[keyId].emplace_back(tokensDlm);
427 curGrpQgm[keyId].emplace_back(tokensQgm);
428 curClt[doc] = keyId;
429 }
430 }
431 grpFile.close();
432 }
433 }
434
435 void buildIndex(const Table &tableA, const Table &tableB) {
436 interCGroupsA.resize(totalAttr);
437 interCGroupsB.resize(totalAttr);
438 interCGrpIdA.resize(totalAttr);
439 interCGrpIdB.resize(totalAttr);
440
441 for(int i = 0; i < totalAttr; i++) {
442 std::string curAttr = attrVec[i];
443 int attrPos = (int)tableA.inverted_schema.at(curAttr);
444 const auto &curColA = tableA.cols[attrPos];
445 const auto &curColB = tableB.cols[attrPos];
446 int colASize = (int)curColA.size();
447 int colBSize = (int)curColB.size();
448
449 const auto &curClt = cluster[i];
450 // const auto &curGrp = group[i];
451 auto &curGrpA = interCGroupsA[i];
452 auto &curGrpB = interCGroupsB[i];
453 auto &curGrpIdA = interCGrpIdA[i];
454 auto &curGrpIdB = interCGrpIdB[i];
455
456 for(int row = 0; row < colASize; row++) {
457 const auto &doc = curColA[row];
458 bool haskey = curClt.find(doc) != curClt.end();
459
460 if(haskey) {
461 int keyId = curClt.at(doc);
462 curGrpIdA.emplace_back(keyId);
463 curGrpA[keyId].emplace_back(row);
464 }
465 else
466 curGrpIdA.emplace_back(-1);
467 }
468
469 for(int row = 0; row < colBSize; row++) {
470 const auto &doc = curColB[row];
471 bool haskey = curClt.find(doc) != curClt.end();
472
473 if(haskey) {
474 int keyId = curClt.at(doc);
475 curGrpIdB.emplace_back(keyId);
476 curGrpB[keyId].emplace_back(row);
477 }
478 else
479 curGrpB.emplace_back(-1);
480 }
481 }
482 }
483
485 int attrSize = (int)attrCahceLength.size();
486 for(int i = 0; i < attrSize; i++) {
487 int numFeature = calNumFeature(attrVec[i]);
488 for(int j = 0; j < numFeature; j++) {
489 for(int k = 0; k < attrCahceLength[i]; k++)
490 delete[] featureValCache[i][j][k];
491 delete[] featureValCache[i][j];
492 }
493 delete[] featureValCache[i];
494 delete[] discreteCacheIdx[i];
495 }
496 delete[] featureValCache;
497 delete[] discreteCacheIdx;
498 }
499};
500
501
502
503
504
505#endif // _GROUP_H_
Definition group.h:152
GroupTokens groupTokensDlm
Definition group.h:161
std::vector< int > attrCahceLength
Definition group.h:170
int calNumFeature(const std::string attr)
Definition group.h:192
double **** featureValCache
Definition group.h:167
int totalAttr
Definition group.h:155
int ** discreteCacheIdx
Definition group.h:168
int calCahceIndex(const std::string &func, const std::string &tok, int numFeature)
Definition group.h:205
void IO()
Definition group.h:385
std::vector< std::string > str_bt_5w_10w
Definition group.h:174
GroupTokens groupTokensQgm
Definition group.h:161
std::vector< int > keyLength
Definition group.h:164
void releaseBuffer()
Definition group.h:484
std::vector< std::vector< int > > interCGrpIdA
Definition group.h:163
Groups group
Definition group.h:159
std::vector< std::string > str_eq_1w
Definition group.h:175
std::vector< std::string > str_gt_10w
Definition group.h:172
std::vector< std::string > attrVec
Definition group.h:156
void buildIndex(const Table &tableA, const Table &tableB)
Definition group.h:435
GroupInterchangeable(const GroupInterchangeable &other)=delete
std::vector< std::string > str_bt_1w_5w
Definition group.h:173
std::vector< std::vector< std::vector< int > > > interCGroupsA
Definition group.h:162
std::vector< int > featureLength
Definition group.h:157
GroupInterchangeable(GroupInterchangeable &&other)=delete
int calAttrIndex(const std::string &attr)
Definition group.h:224
std::vector< std::vector< std::vector< int > > > interCGroupsB
Definition group.h:162
void globalInit(const std::vector< int > &keyNum, const std::vector< std::string > &attrs, Groups &groups, const GroupTokens &grpdlm, const GroupTokens &grpqgm)
Definition group.h:229
std::vector< std::vector< int > > interCGrpIdB
Definition group.h:163
Clusters cluster
Definition group.h:160
int totalTable
Definition group.h:154
GroupInterchangeable()=default
~GroupInterchangeable()=default
GroupInterchangeable(int _totalTable, int _totalAttr, const std::vector< std::string > &_attrVec)
Definition group.h:179
Definition dataframe.h:19
static void string2TokensQGram(const std::string &s, std::vector< std::string > &res, ui q)
Definition tokenizer.cc:48
static void string2TokensDlm(const std::string &s, std::vector< std::string > &res, const std::string &delims)
Definition tokenizer.cc:23
std::vector< std::unordered_map< int, std::vector< std::string > > > Groups
Definition group.h:12
std::vector< std::unordered_map< int, std::vector< std::vector< std::string > > > > GroupTokens
Definition group.h:13
std::vector< std::unordered_map< std::string, int > > Clusters
Definition group.h:14
Definition group.h:17
double absoluteNorm(const std::string &s1, const std::string &s2)
Definition group.h:109
double jaccard(const std::vector< std::string > &v1, const std::vector< std::string > &v2)
Definition group.h:71
double cosine(const std::vector< std::string > &v1, const std::vector< std::string > &v2)
Definition group.h:81
double dice(const std::vector< std::string > &v1, const std::vector< std::string > &v2)
Definition group.h:94
int tripletMin(int a, int b, int c)
Definition group.h:32
double levDist(const std::string &v1, const std::string &v2)
Definition group.h:37
void stringSplit(std::string str, char delim, std::vector< std::string > &res)
Definition group.h:129
double exactMatch(const std::string &s1, const std::string &s2)
Definition group.h:104
int overlap(const std::vector< std::string > &v1, const std::vector< std::string > &v2)
Definition group.h:20
void tokenize(const std::string &str, const std::string &type, std::vector< std::string > &tokens, const std::string &delims)
Definition group.h:137
unsigned int ui
Definition type.h:8