Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
ovlpjoin.h
Go to the documentation of this file.
1/*
2 * author: Dong Deng
3 * modified: Zhencan Peng in rutgers-db/RedPajama_Analysis
4 * modified: Yunqi Li
5 * contact: liyunqixa@gmail.com
6 */
7#ifndef _OVLPRSJOIN_H_
8#define _OVLPRSJOIN_H_
9
10#include "config.h"
11#include "type.h"
12#include "index.h"
13#include <iostream>
14#include <fstream>
15#include <functional>
16#include <vector>
17#include <unordered_map>
18#include <unordered_set>
19#include <string>
20#include <algorithm>
21#include <string.h>
22#include <queue>
23#include <inttypes.h>
24#include <sys/time.h>
25#include <cmath>
26#include <cstdio>
27#include <sys/sysinfo.h>
28#include <chrono>
29#include <assert.h>
30
31#define BRUTEFORCE 0
32#define REPORT_INDEX 0
33#define REPORT_BINARY 0
34
35/*
36 * Overlap join according to SIGMOD2018: Overlap join with theoritical guaranteen
37 * for sequential overlap joiner, the input dataset will not have a large size
38 * thus, it is okay for taking all of them as small sets
39 * so we only implement the heap-based method
40 */
41
42class OvlpRSJoin;
43class OvlpSelfJoin;
44
45struct combination2;
47{
48public:
49 int N{0};
50 int id{0};
51 bool completed{false};
52 std::vector<int> curr;
53
54 combination1(int d, int beg, const OvlpRSJoin &joiner);
55 combination1(int d, int beg, const OvlpSelfJoin &joiner);
56
57 int getlastcurr(const OvlpRSJoin &joiner);
58 int getlastcurr(const OvlpSelfJoin &joiner);
59
60 // compute next combination_test1
61 void next(const OvlpRSJoin &joiner);
62 void next(const OvlpSelfJoin &joiner);
63
64 void print(const OvlpRSJoin &joiner) const;
65 void print(const OvlpSelfJoin &joiner) const;
66
67 bool stepback(const int i, const OvlpRSJoin &joiner);
68 bool stepback(const int i, const OvlpSelfJoin &joiner);
69
70 void binary(const combination1 &value, const OvlpSelfJoin &joiner);
71 void binary(const combination2 &value, const OvlpRSJoin &joiner);
72
73 // test unit
74 bool ifsame(const std::vector<ui> &data, const OvlpRSJoin &joiner);
75};
76
78{
79public:
80 int N{0};
81 int id{0};
82 bool completed{false};
83 std::vector<int> curr;
84
85 combination2(int d, int beg, const OvlpRSJoin &joiner);
86
87 int getlastcurr(const OvlpRSJoin &joiner);
88
89 // compute next combination2
90 void next(const OvlpRSJoin &joiner);
91
92 void print(const OvlpRSJoin &joiner) const;
93
94 bool stepback(const int i, const OvlpRSJoin &joiner);
95
96 void binary(const combination2 &value, const OvlpRSJoin &joiner);
97 void binary(const combination1 &value, const OvlpRSJoin &joiner);
98
99 // test unit
100 bool ifsame(const std::vector<ui> &data, const OvlpRSJoin &joiner);
101};
102
103
105{
106public:
107 int n{0};
108 int c{0};
110
111 std::vector<std::vector<ui>> records; // two sets
112 std::vector<std::vector<ui>> datasets; // two working datasets
113 std::vector<double> recWeights;
114 std::vector<double> wordwt;
115 std::vector<std::pair<int, int>> idmap_records;
116 std::vector<std::vector<std::pair<int, int>>> ele_lists;
117 std::vector<std::pair<int, int>> result_pairs;
118 std::vector<int> heap;
119 std::vector<combination1> combs;
121#if MAINTAIN_VALUE_OVLP == 1
122 bool isWeightedComp{false};
123 // only build heap when the res size is greater than MAX
124 std::vector<WeightPair> result_pairs_;
125 int isHeap{0};
126#endif
127
128 int64_t candidate_num{0};
129 int64_t result_num{0};
130
131 void overlapjoin(int overlap_threshold, std::vector<std::pair<int, int>> &finalPairs);
132 void small_case(int L, int R, std::vector<std::pair<int, int>> &finalPairs);
133
134 OvlpSelfJoin(const std::vector<std::vector<ui>> &sorted_records, const std::vector<double> &_recWeights,
135 const std::vector<double> _wordwt, ui _maxHeapSize = 0, bool _isWeightedComp = false)
136 : records(sorted_records), recWeights(_recWeights), wordwt(_wordwt) {
137 // reset everything
138 c = 0;
139 result_num = 0;
140 candidate_num = 0;
141
142 maxHeapSize = _maxHeapSize == 0 ? MAX_PAIR_SIZE_SERIAL : _maxHeapSize;
143#if MAINTAIN_VALUE_OVLP == 1
144 isWeightedComp = _isWeightedComp;
145 result_pairs_.reserve(maxHeapSize);
146#endif
147 }
148
149public:
150 bool comp_comb1(const int a, const int b) {
151 // cout << c << " ";
152 auto & c1 = combs[a];
153 auto & c2 = combs[b];
154 for (ui i = 0; i < c; i++) {
155 if (datasets[c1.id][c1.curr[i]] > datasets[c2.id][c2.curr[i]])
156 return false;
157 else if (datasets[c1.id][c1.curr[i]] < datasets[c2.id][c2.curr[i]])
158 return true;
159 }
160 return c1.id > c2.id;
161 }
162
163 double weightedOverlapCoeff(int id1, int id2) {
164 std::vector<ui> res;
165 std::set_intersection(records[id1].begin(), records[id1].end(),
166 records[id2].begin(), records[id2].end(),
167 std::back_inserter(res));
168 double ovlp = 0.0;
169 for(const auto &e : res)
170 ovlp += wordwt[e];
171 return ovlp / std::min(recWeights[id1], recWeights[id2]);
172 // return ovlp;
173 }
174
175 double overlapCoeff(int id1, int id2) {
176 std::vector<ui> res;
177 std::set_intersection(records[id1].begin(), records[id1].end(),
178 records[id2].begin(), records[id2].end(),
179 back_inserter(res));
180
181 double ovlp = res.size() * 1.0;
182 return ovlp / std::min(records[id1].size(), records[id2].size()) * 1.0;
183 // return ovlp;
184 }
185
186public:
187 // build heap for combination
188 bool build_heap(const std::vector<std::pair<int,int>> &vec, const std::vector<std::vector<ui>> &dataset,
189 int L, std::vector<int> &heap, std::vector<combination1> &combs, int &heap_size);
190};
191
192
194{
195public:
196 int n1{0}, n2{0}; // R S sizes
197 int c{0}; // threshold
199
200 std::vector<std::vector<ui>> records1, records2; // two sets
201 std::vector<std::vector<ui>> datasets1, datasets2; // two working datasets
202 std::vector<double> recWeights1, recWeights2;
203 std::vector<double> wordwt;
204 std::vector<std::pair<int, int>> idmap_records1, idmap_records2;
205 std::vector<std::vector<std::pair<int, int>>> ele_lists1, ele_lists2;
206 std::vector<std::pair<int, int>> result_pairs;
207 std::vector<int> heap1, heap2;
208 std::vector<combination1> combs1;
209 std::vector<combination2> combs2; // comb
211#if MAINTAIN_VALUE_OVLP == 1
212 bool isWeightedComp{false};
213 // only build heap when the res size is greater than MAX
214 std::vector<WeightPair> result_pairs_;
215 int isHeap{0};
216#endif
217
219 int64_t result_num;
220
221 void overlapjoin(int overlap_threshold, std::vector<std::pair<int, int>> &finalPairs);
222 void small_case(int L1, int R1, int L2, int R2, std::vector<std::pair<int, int>> &finalPairs);
223
224 bool if_external_IO = false;
226
227 OvlpRSJoin(const std::vector<std::vector<ui>> &sorted_records_1, const std::vector<std::vector<ui>> &sorted_records_2,
228 const std::vector<double> &_recWeights1, const std::vector<double> &_recWeights2,
229 const std::vector<double> &_wordwt, ui _maxHeapSize = 0, bool _isWeightedComp = false)
230 : records1(sorted_records_1), records2(sorted_records_2), recWeights1(_recWeights1), recWeights2(_recWeights2),
231 wordwt(_wordwt) {
232 // reset everything
233 c = 0;
234 result_num = 0;
235 candidate_num = 0;
236
237 maxHeapSize = _maxHeapSize == 0 ? MAX_PAIR_SIZE_SERIAL : _maxHeapSize;
238#if MAINTAIN_VALUE_OVLP == 1
239 isWeightedComp = _isWeightedComp;
240 result_pairs_.reserve(maxHeapSize);
241#endif
242 }
243
244 void set_external_store(const std::string &_resPair_path){
245 if_external_IO = true;
246 resultPair_storePath = _resPair_path;
247 }
248
249 // void save_idmap(string _resPair_path);
250
251public:
252 bool comp_comb1(const int a, const int b) {
253 // cout << c << " ";
254 auto & c1 = combs1[a];
255 auto & c2 = combs1[b];
256 for (ui i = 0; i < c; i++) {
257 if (datasets1[c1.id][c1.curr[i]] > datasets1[c2.id][c2.curr[i]])
258 return false;
259 else if (datasets1[c1.id][c1.curr[i]] < datasets1[c2.id][c2.curr[i]])
260 return true;
261 }
262 return c1.id > c2.id;
263 }
264 bool comp_comb2(const int a, const int b) {
265 auto & c1 = combs2[a];
266 auto & c2 = combs2[b];
267 for (ui i = 0; i < c; i++) {
268 if (datasets2[c1.id][c1.curr[i]] > datasets2[c2.id][c2.curr[i]])
269 return false;
270 else if (datasets2[c1.id][c1.curr[i]] < datasets2[c2.id][c2.curr[i]])
271 return true;
272 }
273 return c1.id > c2.id;
274 }
275
276 double weightedOverlapCoeff(int id1, int id2) {
277 std::vector<ui> res;
278 std::set_intersection(records1[id1].begin(), records1[id1].end(),
279 records2[id2].begin(), records2[id2].end(),
280 std::back_inserter(res));
281 double ovlp = 0.0;
282 for(const auto &e : res)
283 ovlp += wordwt[e];
284 return ovlp / std::min(recWeights1[id1], recWeights2[id2]);
285 // return ovlp;
286 }
287
288 double overlapCoeff(int id1, int id2) {
289 std::vector<ui> res;
290 std::set_intersection(records1[id1].begin(), records1[id1].end(),
291 records2[id2].begin(), records2[id2].end(),
292 back_inserter(res));
293
294 double ovlp = res.size() * 1.0;
295 return ovlp / std::min(records1[id1].size(), records2[id2].size()) * 1.0;
296 // return ovlp;
297 }
298
299public:
300 // build heap for combination1
301 bool build_heap(const std::vector<std::pair<int,int>> &vec, const std::vector<std::vector<ui>> &dataset,
302 int L, std::vector<int> &heap, std::vector<combination1> &combs, int &heap_size);
303 // build heap for combination2
304 bool build_heap(const std::vector<std::pair<int,int>> &vec, const std::vector<std::vector<ui>> &dataset,
305 int L, std::vector<int> &heap, std::vector<combination2> &combs, int &heap_size);
306};
307
308
309
311{
312public:
313 OvlpUtil() = default;
314 ~OvlpUtil() = default;
315 OvlpUtil(const OvlpUtil& other) = delete;
316 OvlpUtil(OvlpUtil&& other) = delete;
317
318public:
319 static bool comp_int(const int a, const int b) {
320 return a > b;
321 }
322 static bool comp_pair(const std::pair<int, int> &p1, const int val) {
323 return p1.first < val;
324 }
325 static bool is_equal(const combination1 & c1, const combination1 & c2,
326 const OvlpRSJoin &joiner) {
327 for (int i = 0; i < joiner.c; i++) {
328 if (joiner.datasets1[c1.id][c1.curr[i]] != joiner.datasets1[c2.id][c2.curr[i]])
329 return false;
330 }
331 return true;
332 }
333 static bool is_equal(const combination1 & c1, const combination1 & c2,
334 const OvlpSelfJoin &joiner) {
335 for (int i = 0; i < joiner.c; i++) {
336 if (joiner.datasets[c1.id][c1.curr[i]] != joiner.datasets[c2.id][c2.curr[i]])
337 return false;
338 }
339 return true;
340 }
341 static bool is_equal(const combination2 & c1, const combination2 & c2,
342 const OvlpRSJoin &joiner) {
343 for (int i = 0; i < joiner.c; i++) {
344 if (joiner.datasets2[c1.id][c1.curr[i]] != joiner.datasets2[c2.id][c2.curr[i]])
345 return false;
346 }
347 return true;
348 }
349 static int compare(const combination1 & c1, const combination2 & c2,
350 const OvlpRSJoin &joiner) {
351 // cout << joiner.c << ' ';
352 for (int i = 0; i < joiner.c; i++) {
353 if (joiner.datasets1[c1.id][c1.curr[i]] > joiner.datasets2[c2.id][c2.curr[i]])
354 return 1;
355 else if (joiner.datasets1[c1.id][c1.curr[i]] < joiner.datasets2[c2.id][c2.curr[i]])
356 return -1;
357 }
358 return 0;
359 }
360
361public:
362 static void removeShort(const std::vector<std::vector<ui>> &records, std::unordered_map<ui, std::vector<int>> &ele,
363 const OvlpRSJoin &joiner);
364 // Remove "widows" from a hash map based on another hash map.
365 // This function removes key-value pairs from the unordered_map 'ele'
366 // if the key doesn't exist in another unordered_map 'ele_other'.
367 static void removeWidow(std::unordered_map<ui, std::vector<int>> &ele, const std::unordered_map<ui, std::vector<int>> &ele_other);
368 static void transform(std::unordered_map<ui, std::vector<int>> &ele, const std::vector<std::pair<int, int>> &eles,
369 std::vector<std::pair<int, int>> &idmap, std::vector<std::vector<std::pair<int, int>>> &ele_lists,
370 std::vector<std::vector<ui>> &dataset, const ui total_eles, const int n, const OvlpRSJoin &joiner);
371};
372
373
374
375#endif // OVLPRSJOIN
Definition ovlpjoin.h:194
int n2
Definition ovlpjoin.h:196
std::vector< combination2 > combs2
Definition ovlpjoin.h:209
std::vector< int > heap2
Definition ovlpjoin.h:207
int c
Definition ovlpjoin.h:197
std::vector< std::pair< int, int > > idmap_records1
Definition ovlpjoin.h:204
std::string resultPair_storePath
Definition ovlpjoin.h:225
std::vector< int > heap1
Definition ovlpjoin.h:207
std::vector< double > wordwt
Definition ovlpjoin.h:203
bool comp_comb2(const int a, const int b)
Definition ovlpjoin.h:264
ui maxHeapSize
Definition ovlpjoin.h:210
ui total_eles
Definition ovlpjoin.h:198
std::vector< double > recWeights1
Definition ovlpjoin.h:202
std::vector< std::vector< ui > > records1
Definition ovlpjoin.h:200
std::vector< std::vector< std::pair< int, int > > > ele_lists1
Definition ovlpjoin.h:205
int isHeap
Definition ovlpjoin.h:215
int n1
Definition ovlpjoin.h:196
void small_case(int L1, int R1, int L2, int R2, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin.cc:103
int64_t candidate_num
Definition ovlpjoin.h:218
std::vector< std::pair< int, int > > idmap_records2
Definition ovlpjoin.h:204
std::vector< std::vector< ui > > datasets2
Definition ovlpjoin.h:201
std::vector< WeightPair > result_pairs_
Definition ovlpjoin.h:214
std::vector< combination1 > combs1
Definition ovlpjoin.h:208
OvlpRSJoin(const std::vector< std::vector< ui > > &sorted_records_1, const std::vector< std::vector< ui > > &sorted_records_2, const std::vector< double > &_recWeights1, const std::vector< double > &_recWeights2, const std::vector< double > &_wordwt, ui _maxHeapSize=0, bool _isWeightedComp=false)
Definition ovlpjoin.h:227
std::vector< std::pair< int, int > > result_pairs
Definition ovlpjoin.h:206
double weightedOverlapCoeff(int id1, int id2)
Definition ovlpjoin.h:276
std::vector< double > recWeights2
Definition ovlpjoin.h:202
void set_external_store(const std::string &_resPair_path)
Definition ovlpjoin.h:244
std::vector< std::vector< ui > > records2
Definition ovlpjoin.h:200
std::vector< std::vector< ui > > datasets1
Definition ovlpjoin.h:201
void overlapjoin(int overlap_threshold, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin.cc:376
bool comp_comb1(const int a, const int b)
Definition ovlpjoin.h:252
bool isWeightedComp
Definition ovlpjoin.h:212
bool if_external_IO
Definition ovlpjoin.h:224
double overlapCoeff(int id1, int id2)
Definition ovlpjoin.h:288
int64_t result_num
Definition ovlpjoin.h:219
bool build_heap(const std::vector< std::pair< int, int > > &vec, const std::vector< std::vector< ui > > &dataset, int L, std::vector< int > &heap, std::vector< combination1 > &combs, int &heap_size)
Definition ovlpjoin.cc:11
std::vector< std::vector< std::pair< int, int > > > ele_lists2
Definition ovlpjoin.h:205
Definition ovlpjoin.h:105
int n
Definition ovlpjoin.h:107
std::vector< std::vector< ui > > datasets
Definition ovlpjoin.h:112
int64_t result_num
Definition ovlpjoin.h:129
int64_t candidate_num
Definition ovlpjoin.h:128
ui maxHeapSize
Definition ovlpjoin.h:120
void small_case(int L, int R, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin.cc:448
OvlpSelfJoin(const std::vector< std::vector< ui > > &sorted_records, const std::vector< double > &_recWeights, const std::vector< double > _wordwt, ui _maxHeapSize=0, bool _isWeightedComp=false)
Definition ovlpjoin.h:134
ui total_eles
Definition ovlpjoin.h:109
std::vector< WeightPair > result_pairs_
Definition ovlpjoin.h:124
bool build_heap(const std::vector< std::pair< int, int > > &vec, const std::vector< std::vector< ui > > &dataset, int L, std::vector< int > &heap, std::vector< combination1 > &combs, int &heap_size)
Definition ovlpjoin.cc:70
std::vector< std::vector< std::pair< int, int > > > ele_lists
Definition ovlpjoin.h:116
void overlapjoin(int overlap_threshold, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin.cc:592
std::vector< double > recWeights
Definition ovlpjoin.h:113
std::vector< std::vector< ui > > records
Definition ovlpjoin.h:111
std::vector< combination1 > combs
Definition ovlpjoin.h:119
std::vector< double > wordwt
Definition ovlpjoin.h:114
double overlapCoeff(int id1, int id2)
Definition ovlpjoin.h:175
int isHeap
Definition ovlpjoin.h:125
bool comp_comb1(const int a, const int b)
Definition ovlpjoin.h:150
std::vector< std::pair< int, int > > result_pairs
Definition ovlpjoin.h:117
std::vector< int > heap
Definition ovlpjoin.h:118
double weightedOverlapCoeff(int id1, int id2)
Definition ovlpjoin.h:163
std::vector< std::pair< int, int > > idmap_records
Definition ovlpjoin.h:115
bool isWeightedComp
Definition ovlpjoin.h:122
int c
Definition ovlpjoin.h:108
Definition ovlpjoin.h:311
static int compare(const combination1 &c1, const combination2 &c2, const OvlpRSJoin &joiner)
Definition ovlpjoin.h:349
static bool comp_int(const int a, const int b)
Definition ovlpjoin.h:319
static void removeWidow(std::unordered_map< ui, std::vector< int > > &ele, const std::unordered_map< ui, std::vector< int > > &ele_other)
Definition ovlpjoin.cc:701
static bool is_equal(const combination1 &c1, const combination1 &c2, const OvlpRSJoin &joiner)
Definition ovlpjoin.h:325
~OvlpUtil()=default
OvlpUtil(OvlpUtil &&other)=delete
static bool comp_pair(const std::pair< int, int > &p1, const int val)
Definition ovlpjoin.h:322
static void removeShort(const std::vector< std::vector< ui > > &records, std::unordered_map< ui, std::vector< int > > &ele, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:686
static bool is_equal(const combination2 &c1, const combination2 &c2, const OvlpRSJoin &joiner)
Definition ovlpjoin.h:341
static void transform(std::unordered_map< ui, std::vector< int > > &ele, const std::vector< std::pair< int, int > > &eles, std::vector< std::pair< int, int > > &idmap, std::vector< std::vector< std::pair< int, int > > > &ele_lists, std::vector< std::vector< ui > > &dataset, const ui total_eles, const int n, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:721
OvlpUtil(const OvlpUtil &other)=delete
static bool is_equal(const combination1 &c1, const combination1 &c2, const OvlpSelfJoin &joiner)
Definition ovlpjoin.h:333
OvlpUtil()=default
#define MAX_PAIR_SIZE_SERIAL
Definition config.h:51
Definition ovlpjoin.h:47
bool ifsame(const std::vector< ui > &data, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:936
void next(const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:790
std::vector< int > curr
Definition ovlpjoin.h:52
int getlastcurr(const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:777
bool stepback(const int i, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:842
void binary(const combination1 &value, const OvlpSelfJoin &joiner)
Definition ovlpjoin.cc:874
bool completed
Definition ovlpjoin.h:51
combination1(int d, int beg, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:759
int N
Definition ovlpjoin.h:49
void print(const OvlpRSJoin &joiner) const
Definition ovlpjoin.cc:818
int id
Definition ovlpjoin.h:50
Definition ovlpjoin.h:78
int id
Definition ovlpjoin.h:81
void print(const OvlpRSJoin &joiner) const
Definition ovlpjoin.cc:974
combination2(int d, int beg, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:944
std::vector< int > curr
Definition ovlpjoin.h:83
bool stepback(const int i, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:986
bool completed
Definition ovlpjoin.h:82
void binary(const combination2 &value, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:1002
bool ifsame(const std::vector< ui > &data, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:1067
int N
Definition ovlpjoin.h:80
void next(const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:960
int getlastcurr(const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:953
unsigned int ui
Definition type.h:8