Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
stringjoin.h
Go to the documentation of this file.
1/*
2 * author: Dong Deng
3 * modified: Yunqi Li
4 * contact: liyunqixa@gmail.com
5 */
6#ifndef _STRING_JOIN_H_
7#define _STRING_JOIN_H_
8
9#include "joinutil.h"
10#include "index.h"
11#include "simfunc.h"
12#include <vector>
13#include <string>
14#include <unordered_map>
15#include <algorithm>
16#include <cmath>
17
18
20{
21public:
22 using InvLists = std::unordered_map<uint64_t, std::vector<int>>;
23
24public:
25 int workMinDictLen{19260817};
27 int queryMinDictLen{19260817};
29 int maxDictLen{0};
30 int minDictLen{0};
31 int workN{0};
32 int queryN{0};
33 int N{0};
34 int D{0}; // threshold
35 int PN{0}; // part num
36 int hashNumber{31}; // prime number for hashing
37 int modNumber{1000000007}; // prime mod number for hashing
38 std::vector<std::string> work_dataset; // work dataset
39 std::vector<std::string> query_dataset; // query dataset, null if self-join
40 std::vector<std::pair<int, int>> pairs; // reuslt pairs
41
42 uint64_t candNum{0}; // number of candidate pairs
43 uint64_t veriNum{0}; // number of pairs passed left verification
44 uint64_t listNum{0}; // number of inverted list matched
45 uint64_t realNum{0}; // number of results
46 std::vector<int> results; // results
47 bool valid; // used for early termination
48 int left, right; // verify range of left part: [D - left, D + right]
49 int _left, _right; // verify range of right part: [D - _left, D + _right]
50
51 int **matrix{nullptr}; // matrix used to verify left parts
52 int **_matrix{nullptr}; // matrix used to verify right parts
53 bool *quickRef{nullptr}; // record results to avoid duplicate verification
54 int **partLen{nullptr}; // record length of all partitions
55 int **partPos{nullptr}; // record start position of all partitions
56 int *dist{nullptr}; // id of string with different length to its former one
57 uint64_t *power{nullptr}; // the power of 131, used to hash segment and substring
58 InvLists **invLists{nullptr}; // inverted index of different string length and different segments
59 std::vector<PIndex> **partIndex{nullptr}; // the index record with pair of substring and segment should be checked
60
62#if MAINTAIN_VALUE_EDIT == 1
63 std::vector<WeightPairEdit> result_pairs_;
64 int isHeap = 0;
65#endif
66
67public:
68 StringJoin() = default;
69
70 StringJoin(const std::vector<std::string>& data, int threshold, ui _maxHeapSize = 0)
71 : workN(data.size()), D(threshold), PN(threshold + 1), work_dataset(data) {
72#if DROP_EMPTY == 1
73 printf("Drop empty strings for string(lev) join\n");
74 auto wit = work_dataset.begin();
75 for( ; wit != work_dataset.end(); ) {
76 if((*wit).empty())
77 wit = work_dataset.erase(wit);
78 else
79 ++ wit;
80 }
81 workN = work_dataset.size();
82#endif
83 for(int i = 0; i < workN; i++) {
84 workMaxDictLen = (int)work_dataset[i].size() > workMaxDictLen ? (int)work_dataset[i].size()
86 workMinDictLen = (int)work_dataset[i].size() < workMinDictLen ? (int)work_dataset[i].size()
88 }
89 std::sort(work_dataset.begin(), work_dataset.end(), StringJoinUtil::strLessT);
90
93 N = workN;
94
95 printf("Work size: %zu\tQuery size: %zu\n", work_dataset.size(), query_dataset.size());
96 printf("Min work record's size: %d\tMax work record's size: %d\n", workMinDictLen, workMaxDictLen);
97
98 maxHeapSize = _maxHeapSize == 0 ? MAX_PAIR_SIZE_SERIAL : _maxHeapSize;
99#if MAINTAIN_VALUE_EDIT == 1
100 result_pairs_.reserve(maxHeapSize);
101#endif
102 }
103
104 StringJoin(const std::vector<std::string>& work, const std::vector<std::string>& query, int threshold,
105 ui _maxHeapSize = 0)
106 : workN(work.size()), queryN(query.size()), D(threshold), PN(threshold + 1),
107 work_dataset(work), query_dataset(query) {
108#if DROP_EMPTY == 1
109 printf("Drop empty strings for string(lev) join\n");
110 // work
111 auto wit = work_dataset.begin();
112 for( ; wit != work_dataset.end(); ) {
113 if((*wit).empty())
114 wit = work_dataset.erase(wit);
115 else
116 ++ wit;
117 }
118 workN = work_dataset.size();
119 // query
120 auto qit = query_dataset.begin();
121 for( ; qit != query_dataset.end(); ) {
122 if((*qit).empty())
123 qit = query_dataset.erase(qit);
124 else
125 ++ qit;
126 }
127 queryN = query_dataset.size();
128#endif
129 for(int i = 0; i < workN; i++) {
130 workMaxDictLen = (int)work_dataset[i].size() > workMaxDictLen ? (int)work_dataset[i].size()
132 workMinDictLen = (int)work_dataset[i].size() < workMinDictLen ? (int)work_dataset[i].size()
134 }
135 for(int i = 0; i < queryN; i++) {
136 queryMaxDictLen = (int)query_dataset[i].size() > queryMaxDictLen ? (int)query_dataset[i].size()
138 queryMinDictLen = (int)query_dataset[i].size() < queryMinDictLen ? (int)query_dataset[i].size()
140 }
141 std::sort(work_dataset.begin(), work_dataset.end(), StringJoinUtil::strLessT);
142 std::sort(query_dataset.begin(), query_dataset.end(), StringJoinUtil::strLessT);
143
146 N = std::max(workN, queryN);
147
148 printf("Work size: %zu\tQuery size: %zu\n", work_dataset.size(), query_dataset.size());
149 printf("Min work record's size: %d\tMax work record's size: %d\n", workMinDictLen, workMaxDictLen);
150 printf("Min query record's size: %d\tMax query record's size: %d\n", queryMinDictLen, queryMaxDictLen);
151
152 maxHeapSize = _maxHeapSize == 0 ? MAX_PAIR_SIZE_SERIAL : _maxHeapSize;
153#if MAINTAIN_VALUE_EDIT == 1
154 result_pairs_.reserve(maxHeapSize);
155#endif
156 }
157
159 for (int lp = 0; lp < PN; lp++) {
160 delete[] partPos[lp];
161 delete[] partLen[lp];
162 delete[] invLists[lp];
163 delete[] partIndex[lp];
164 }
165 delete[] partPos[PN];
166
167 for (int lp = maxDictLen; lp >= 0; lp--) {
168 delete[] matrix[lp];
169 delete[] _matrix[lp];
170 }
171
172 delete[] matrix;
173 delete[] _matrix;
174 delete[] dist;
175 delete[] power;
176 delete[] partPos;
177 delete[] partLen;
178 delete[] invLists;
179 delete[] partIndex;
180 delete[] quickRef;
181
182 std::cout << "destructor" << std::endl << std::flush;
183 }
184
185 StringJoin(const StringJoin& other) = delete;
186 StringJoin(StringJoin&& other) = delete;
187
188public:
189 // Pre-process
190 void init();
191 void prepareSelf();
192 void prepareRS();
193
194 // verify
195 bool verifyLeftPartSelf(int xid, int yid, int xlen, int ylen, int Tau);
196 bool verifyRightPartSelf(int xid, int yid, int xlen, int ylen, int xpos, int ypos, int Tau);
197 bool verifyLeftPartRS(int xid, int yid, int xlen, int ylen, int Tau);
198 bool verifyRightPartRS(int xid, int yid, int xlen, int ylen, int xpos, int ypos, int Tau);
199
200 // join
201 void selfJoin(std::vector<std::pair<int, int>> &finalPairs);
202 void RSJoin(std::vector<std::pair<int, int>> &finalPairs);
203
204 // check
205 void checkSelfResults() const;
206 void printDebugInfo(int currLen) const;
207};
208
209
211{
212public:
213 ExactJoin() = default;
214 ~ExactJoin() = default;
215 ExactJoin(const ExactJoin &other) = delete;
216 ExactJoin(ExactJoin &&other) = delete;
217
218public:
219 static void exactJoinRS(const std::vector<std::string> &colA, const std::vector<std::string> &colB,
220 std::vector<std::pair<int, int>> &pairs, ui _maxHeapSize = 0) {
221 ui sizeA = colA.size();
222 ui sizeB = colB.size();
223
224 std::unordered_map<ui, std::vector<ui>> indexA;
225 std::unordered_map<ui, std::vector<ui>> indexB;
226 ui maxHeapSize = _maxHeapSize == 0 ? MAX_PAIR_SIZE_SERIAL : _maxHeapSize;
227
228 for (ui j = 0; j < sizeA; j++) {
229 ui length = colA[j].length();
230 // skip empty
231#if DROP_EMPTY == 1
232 if (length == 0)
233 continue;
234#endif
235 indexA[length].emplace_back(j);
236 }
237 for (ui j = 0; j < sizeB; j++) {
238 ui length = colB[j].length();
239 // skip empty
240#if DROP_EMPTY == 1
241 if (length == 0)
242 continue;
243#endif
244 indexB[length].emplace_back(j);
245 }
246
247 for (auto &itA : indexA) {
248 ui bucketSizeA = itA.second.size();
249 ui bucketSizeB = indexB[itA.first].size();
250 const auto &bucketB = indexB[itA.first];
251 for (ui ii = 0; ii < bucketSizeA; ii++) {
252 for (ui jj = 0; jj < bucketSizeB; jj++)
253 if (colA[itA.second[ii]] == colB[bucketB[jj]])
254 pairs.emplace_back(ii, jj);
255 if(pairs.size() > maxHeapSize)
256 return;
257 }
258 }
259 }
260
261 static void exactJoinSelf(const std::vector<std::string> &col, std::vector<std::pair<int, int>> &pairs,
262 ui _maxHeapSize = 0) {
263 ui size = col.size();
264 std::unordered_map<ui, std::vector<ui>> index;
265 ui maxHeapSize = _maxHeapSize == 0 ? MAX_PAIR_SIZE_SERIAL : _maxHeapSize;
266
267 for (ui j = 0; j < size; j++) {
268 ui length = col[j].length();
269 // skip empty
270#if DROP_EMPTY == 1
271 if (length == 0)
272 continue;
273#endif
274 index[length].emplace_back(j);
275 }
276
277 for (auto &it : index) {
278 ui bucketSize = it.second.size();
279 for (ui ii = 0; ii < bucketSize; ii++) {
280 for (ui jj = ii + 1; jj < bucketSize; jj++)
281 if (col[it.second[ii]] == col[it.second[jj]])
282 pairs.emplace_back(ii, jj);
283 if(pairs.size() > maxHeapSize)
284 return;
285 }
286 }
287 }
288};
289
290
291#endif // _STRING_JOIN_H_
Definition stringjoin.h:211
static void exactJoinRS(const std::vector< std::string > &colA, const std::vector< std::string > &colB, std::vector< std::pair< int, int > > &pairs, ui _maxHeapSize=0)
Definition stringjoin.h:219
~ExactJoin()=default
static void exactJoinSelf(const std::vector< std::string > &col, std::vector< std::pair< int, int > > &pairs, ui _maxHeapSize=0)
Definition stringjoin.h:261
ExactJoin(ExactJoin &&other)=delete
ExactJoin(const ExactJoin &other)=delete
ExactJoin()=default
static bool strLessT(const std::string &s1, const std::string &s2)
Definition joinutil.cc:243
Definition stringjoin.h:20
int workN
Definition stringjoin.h:31
int PN
Definition stringjoin.h:35
StringJoin()=default
StringJoin(StringJoin &&other)=delete
uint64_t veriNum
Definition stringjoin.h:43
void checkSelfResults() const
Definition stringjoin.cc:504
int hashNumber
Definition stringjoin.h:36
int N
Definition stringjoin.h:33
void selfJoin(std::vector< std::pair< int, int > > &finalPairs)
Definition stringjoin.cc:305
void prepareRS()
Definition stringjoin.cc:112
int workMinDictLen
Definition stringjoin.h:25
int D
Definition stringjoin.h:34
StringJoin(const std::vector< std::string > &work, const std::vector< std::string > &query, int threshold, ui _maxHeapSize=0)
Definition stringjoin.h:104
bool * quickRef
Definition stringjoin.h:53
int modNumber
Definition stringjoin.h:37
std::vector< std::string > work_dataset
Definition stringjoin.h:38
int left
Definition stringjoin.h:48
bool verifyLeftPartSelf(int xid, int yid, int xlen, int ylen, int Tau)
Definition stringjoin.cc:157
uint64_t * power
Definition stringjoin.h:57
int minDictLen
Definition stringjoin.h:30
uint64_t realNum
Definition stringjoin.h:45
int queryN
Definition stringjoin.h:32
uint64_t candNum
Definition stringjoin.h:42
int * dist
Definition stringjoin.h:56
int maxDictLen
Definition stringjoin.h:29
void RSJoin(std::vector< std::pair< int, int > > &finalPairs)
Definition stringjoin.cc:407
bool verifyRightPartSelf(int xid, int yid, int xlen, int ylen, int xpos, int ypos, int Tau)
Definition stringjoin.cc:194
int ** partPos
Definition stringjoin.h:55
int ** _matrix
Definition stringjoin.h:52
StringJoin(const std::vector< std::string > &data, int threshold, ui _maxHeapSize=0)
Definition stringjoin.h:70
int queryMaxDictLen
Definition stringjoin.h:28
StringJoin(const StringJoin &other)=delete
std::unordered_map< uint64_t, std::vector< int > > InvLists
Definition stringjoin.h:22
void init()
Definition stringjoin.cc:9
bool valid
Definition stringjoin.h:47
void prepareSelf()
Definition stringjoin.cc:68
int _left
Definition stringjoin.h:49
std::vector< PIndex > ** partIndex
Definition stringjoin.h:59
ui maxHeapSize
Definition stringjoin.h:61
std::vector< std::pair< int, int > > pairs
Definition stringjoin.h:40
~StringJoin()
Definition stringjoin.h:158
std::vector< std::string > query_dataset
Definition stringjoin.h:39
bool verifyLeftPartRS(int xid, int yid, int xlen, int ylen, int Tau)
Definition stringjoin.cc:231
int _right
Definition stringjoin.h:49
int right
Definition stringjoin.h:48
std::vector< int > results
Definition stringjoin.h:46
InvLists ** invLists
Definition stringjoin.h:58
int workMaxDictLen
Definition stringjoin.h:26
void printDebugInfo(int currLen) const
Definition stringjoin.cc:531
int queryMinDictLen
Definition stringjoin.h:27
bool verifyRightPartRS(int xid, int yid, int xlen, int ylen, int xpos, int ypos, int Tau)
Definition stringjoin.cc:268
int ** matrix
Definition stringjoin.h:51
int ** partLen
Definition stringjoin.h:54
uint64_t listNum
Definition stringjoin.h:44
#define MAX_PAIR_SIZE_SERIAL
Definition config.h:51
unsigned int ui
Definition type.h:8