14#include <unordered_map>
22 using InvLists = std::unordered_map<uint64_t, std::vector<int>>;
40 std::vector<std::pair<int, int>>
pairs;
62#if MAINTAIN_VALUE_EDIT == 1
63 std::vector<WeightPairEdit> result_pairs_;
70 StringJoin(
const std::vector<std::string>& data,
int threshold,
ui _maxHeapSize = 0)
73 printf(
"Drop empty strings for string(lev) join\n");
83 for(
int i = 0; i <
workN; i++) {
99#if MAINTAIN_VALUE_EDIT == 1
104 StringJoin(
const std::vector<std::string>& work,
const std::vector<std::string>& query,
int threshold,
106 :
workN(work.size()),
queryN(query.size()),
D(threshold),
PN(threshold + 1),
109 printf(
"Drop empty strings for string(lev) join\n");
129 for(
int i = 0; i <
workN; i++) {
135 for(
int i = 0; i <
queryN; i++) {
153#if MAINTAIN_VALUE_EDIT == 1
159 for (
int lp = 0; lp <
PN; lp++) {
182 std::cout <<
"destructor" << std::endl << std::flush;
196 bool verifyRightPartSelf(
int xid,
int yid,
int xlen,
int ylen,
int xpos,
int ypos,
int Tau);
198 bool verifyRightPartRS(
int xid,
int yid,
int xlen,
int ylen,
int xpos,
int ypos,
int Tau);
201 void selfJoin(std::vector<std::pair<int, int>> &finalPairs);
202 void RSJoin(std::vector<std::pair<int, int>> &finalPairs);
219 static void exactJoinRS(
const std::vector<std::string> &colA,
const std::vector<std::string> &colB,
220 std::vector<std::pair<int, int>> &pairs,
ui _maxHeapSize = 0) {
221 ui sizeA = colA.size();
222 ui sizeB = colB.size();
224 std::unordered_map<ui, std::vector<ui>> indexA;
225 std::unordered_map<ui, std::vector<ui>> indexB;
228 for (
ui j = 0; j < sizeA; j++) {
229 ui length = colA[j].length();
235 indexA[length].emplace_back(j);
237 for (
ui j = 0; j < sizeB; j++) {
238 ui length = colB[j].length();
244 indexB[length].emplace_back(j);
247 for (
auto &itA : indexA) {
248 ui bucketSizeA = itA.second.size();
249 ui bucketSizeB = indexB[itA.first].size();
250 const auto &bucketB = indexB[itA.first];
251 for (
ui ii = 0; ii < bucketSizeA; ii++) {
252 for (
ui jj = 0; jj < bucketSizeB; jj++)
253 if (colA[itA.second[ii]] == colB[bucketB[jj]])
254 pairs.emplace_back(ii, jj);
255 if(pairs.size() > maxHeapSize)
261 static void exactJoinSelf(
const std::vector<std::string> &col, std::vector<std::pair<int, int>> &pairs,
262 ui _maxHeapSize = 0) {
263 ui size = col.size();
264 std::unordered_map<ui, std::vector<ui>> index;
267 for (
ui j = 0; j < size; j++) {
268 ui length = col[j].length();
274 index[length].emplace_back(j);
277 for (
auto &it : index) {
278 ui bucketSize = it.second.size();
279 for (
ui ii = 0; ii < bucketSize; ii++) {
280 for (
ui jj = ii + 1; jj < bucketSize; jj++)
281 if (col[it.second[ii]] == col[it.second[jj]])
282 pairs.emplace_back(ii, jj);
283 if(pairs.size() > maxHeapSize)
Definition stringjoin.h:211
static void exactJoinRS(const std::vector< std::string > &colA, const std::vector< std::string > &colB, std::vector< std::pair< int, int > > &pairs, ui _maxHeapSize=0)
Definition stringjoin.h:219
static void exactJoinSelf(const std::vector< std::string > &col, std::vector< std::pair< int, int > > &pairs, ui _maxHeapSize=0)
Definition stringjoin.h:261
ExactJoin(ExactJoin &&other)=delete
ExactJoin(const ExactJoin &other)=delete
static bool strLessT(const std::string &s1, const std::string &s2)
Definition joinutil.cc:243
Definition stringjoin.h:20
int workN
Definition stringjoin.h:31
int PN
Definition stringjoin.h:35
StringJoin(StringJoin &&other)=delete
uint64_t veriNum
Definition stringjoin.h:43
void checkSelfResults() const
Definition stringjoin.cc:504
int hashNumber
Definition stringjoin.h:36
int N
Definition stringjoin.h:33
void selfJoin(std::vector< std::pair< int, int > > &finalPairs)
Definition stringjoin.cc:305
void prepareRS()
Definition stringjoin.cc:112
int workMinDictLen
Definition stringjoin.h:25
int D
Definition stringjoin.h:34
StringJoin(const std::vector< std::string > &work, const std::vector< std::string > &query, int threshold, ui _maxHeapSize=0)
Definition stringjoin.h:104
bool * quickRef
Definition stringjoin.h:53
int modNumber
Definition stringjoin.h:37
std::vector< std::string > work_dataset
Definition stringjoin.h:38
int left
Definition stringjoin.h:48
bool verifyLeftPartSelf(int xid, int yid, int xlen, int ylen, int Tau)
Definition stringjoin.cc:157
uint64_t * power
Definition stringjoin.h:57
int minDictLen
Definition stringjoin.h:30
uint64_t realNum
Definition stringjoin.h:45
int queryN
Definition stringjoin.h:32
uint64_t candNum
Definition stringjoin.h:42
int * dist
Definition stringjoin.h:56
int maxDictLen
Definition stringjoin.h:29
void RSJoin(std::vector< std::pair< int, int > > &finalPairs)
Definition stringjoin.cc:407
bool verifyRightPartSelf(int xid, int yid, int xlen, int ylen, int xpos, int ypos, int Tau)
Definition stringjoin.cc:194
int ** partPos
Definition stringjoin.h:55
int ** _matrix
Definition stringjoin.h:52
StringJoin(const std::vector< std::string > &data, int threshold, ui _maxHeapSize=0)
Definition stringjoin.h:70
int queryMaxDictLen
Definition stringjoin.h:28
StringJoin(const StringJoin &other)=delete
std::unordered_map< uint64_t, std::vector< int > > InvLists
Definition stringjoin.h:22
void init()
Definition stringjoin.cc:9
bool valid
Definition stringjoin.h:47
void prepareSelf()
Definition stringjoin.cc:68
int _left
Definition stringjoin.h:49
std::vector< PIndex > ** partIndex
Definition stringjoin.h:59
ui maxHeapSize
Definition stringjoin.h:61
std::vector< std::pair< int, int > > pairs
Definition stringjoin.h:40
~StringJoin()
Definition stringjoin.h:158
std::vector< std::string > query_dataset
Definition stringjoin.h:39
bool verifyLeftPartRS(int xid, int yid, int xlen, int ylen, int Tau)
Definition stringjoin.cc:231
int _right
Definition stringjoin.h:49
int right
Definition stringjoin.h:48
std::vector< int > results
Definition stringjoin.h:46
InvLists ** invLists
Definition stringjoin.h:58
int workMaxDictLen
Definition stringjoin.h:26
void printDebugInfo(int currLen) const
Definition stringjoin.cc:531
int queryMinDictLen
Definition stringjoin.h:27
bool verifyRightPartRS(int xid, int yid, int xlen, int ylen, int xpos, int ypos, int Tau)
Definition stringjoin.cc:268
int ** matrix
Definition stringjoin.h:51
int ** partLen
Definition stringjoin.h:54
uint64_t listNum
Definition stringjoin.h:44
#define MAX_PAIR_SIZE_SERIAL
Definition config.h:51
unsigned int ui
Definition type.h:8