7#ifndef _OVLP_JOIN_PARALLEL_H_
8#define _OVLP_JOIN_PARALLEL_H_
19#include <unordered_map>
20#include <unordered_set>
29#include <sys/sysinfo.h>
109#if MAINTAIN_VALUE_OVLP == 1
119 void overlapjoin(
int overlap_threshold, std::vector<std::pair<int, int>> &finalPairs);
120 void small_case(
int L1,
int R1,
int L2,
int R2, std::vector<std::pair<int, int>> &finalPairs);
125 OvlpRSJoinParallel(
const std::vector<std::vector<ui>> &sorted_records_1,
const std::vector<std::vector<ui>> &sorted_records_2,
126 const std::vector<double> &rec1wt,
const std::vector<double> &rec2wt,
const std::vector<double> &_wordwt,
127 ui _maxHeapSize = 0,
bool _isWeightedComp =
false)
135#if MAINTAIN_VALUE_OVLP == 1
146 std::back_inserter(res));
148 for(
const auto &e : res)
160 double ovlp = res.size() * 1.0;
174 auto & c1 =
combs1[tid][a];
175 auto & c2 =
combs1[tid][b];
176 for (
int i = 0; i <
c; i++) {
182 return c1.id > c2.id;
185 auto & c1 =
combs2[tid][a];
186 auto & c2 =
combs2[tid][b];
187 for (
int i = 0; i <
c; i++) {
193 return c1.id > c2.id;
198 bool build_heap(
const std::vector<std::pair<int,int>> &vec,
const std::vector<std::vector<ui>> &dataset,
199 int L, std::vector<int> &heap, std::vector<combination_p1> &combs,
int &heap_size,
202 bool build_heap(
const std::vector<std::pair<int,int>> &vec,
const std::vector<std::vector<ui>> &dataset,
203 int L, std::vector<int> &heap, std::vector<combination_p2> &combs,
int &heap_size,
221 std::vector<std::vector<std::pair<int, int>>>
ele_lists;
226 std::vector<std::pair<int, int>>
buck;
228#if MAINTAIN_VALUE_OVLP == 1
244 void overlapjoin(
int overlap_threshold, std::vector<std::pair<int, int>> &finalPairs);
245 void small_case(
int L,
int R, std::vector<std::pair<int, int>> &finalPairs);
251 void large_case(
int L,
int R, std::vector<std::pair<int, int>> &finalPairs);
257 const std::vector<double> &_wordwt,
ui _maxHeapSize = 0,
bool _isWeightedComp =
false)
265#if MAINTAIN_VALUE_OVLP == 1
276 std::back_inserter(res));
278 for(
const auto &e : res)
290 double ovlp = res.size() * 1.0;
291 return ovlp / std::min(
records[id1].size(),
records[id2].size()) * 1.0;
304 auto & c1 =
combs[tid][a];
305 auto & c2 =
combs[tid][b];
306 for (
int i = 0; i <
c; i++) {
312 return c1.id > c2.id;
317 bool build_heap(
const std::vector<std::pair<int,int>> &vec,
const std::vector<std::vector<ui>> &dataset,
318 int L, std::vector<int> &
heap, std::vector<combination_p1> &
combs,
int &heap_size,
336 static bool comp_pair(
const std::pair<int, int> &p1,
const int val) {
337 return p1.first < val;
341 for (
int i = 0; i < joiner.
c; i++) {
349 for (
int i = 0; i < joiner.
c; i++) {
357 for (
int i = 0; i < joiner.
c; i++) {
366 for (
int i = 0; i < joiner.
c; i++) {
375 if (k == 0)
return 1;
376 return (n *
nchoosek(n - 1, k - 1)) / k;
380 static void removeShort(
const std::vector<std::vector<ui>> &records, std::unordered_map<
ui, std::vector<int>> &ele,
385 static void removeWidow(std::unordered_map<
ui, std::vector<int>> &ele,
const std::unordered_map<
ui, std::vector<int>> &ele_other);
386 static void transform(std::unordered_map<
ui, std::vector<int>> &ele,
const std::vector<std::pair<int, int>> &eles,
387 std::vector<std::pair<int, int>> &idmap, std::vector<std::vector<std::pair<int, int>>> &ele_lists,
388 std::vector<std::vector<ui>> &dataset,
const ui total_eles,
const int n,
const OvlpRSJoinParallel &joiner);
Definition ovlpjoin_parallel.h:92
ui maxHeapSize
Definition ovlpjoin_parallel.h:108
std::vector< int > heap2[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:105
int n2
Definition ovlpjoin_parallel.h:94
int64_t result_num
Definition ovlpjoin_parallel.h:117
int n1
Definition ovlpjoin_parallel.h:94
std::vector< std::vector< std::pair< int, int > > > ele_lists2
Definition ovlpjoin_parallel.h:103
std::vector< std::vector< ui > > records1
Definition ovlpjoin_parallel.h:98
bool comp_comb2(const int a, const int b, int tid)
Definition ovlpjoin_parallel.h:184
std::vector< WeightPair > result_pairs_[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:112
void set_external_store(const std::string &_resPair_path)
Definition ovlpjoin_parallel.h:166
bool isWeightedComp
Definition ovlpjoin_parallel.h:110
std::vector< std::pair< int, int > > idmap_records2
Definition ovlpjoin_parallel.h:102
int64_t candidate_num
Definition ovlpjoin_parallel.h:116
std::vector< std::pair< int, int > > idmap_records1
Definition ovlpjoin_parallel.h:102
std::vector< double > recWeights2
Definition ovlpjoin_parallel.h:100
OvlpRSJoinParallel(const std::vector< std::vector< ui > > &sorted_records_1, const std::vector< std::vector< ui > > &sorted_records_2, const std::vector< double > &rec1wt, const std::vector< double > &rec2wt, const std::vector< double > &_wordwt, ui _maxHeapSize=0, bool _isWeightedComp=false)
Definition ovlpjoin_parallel.h:125
std::vector< std::vector< ui > > datasets1
Definition ovlpjoin_parallel.h:99
std::vector< int > heap1[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:105
int c
Definition ovlpjoin_parallel.h:95
ui total_eles
Definition ovlpjoin_parallel.h:96
int isHeap[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:113
std::vector< std::pair< int, int > > result_pairs[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:104
bool build_heap(const std::vector< std::pair< int, int > > &vec, const std::vector< std::vector< ui > > &dataset, int L, std::vector< int > &heap, std::vector< combination_p1 > &combs, int &heap_size, int tid)
Definition ovlpjoin_parallel.cc:15
std::string resultPair_storePath
Definition ovlpjoin_parallel.h:123
void overlapjoin(int overlap_threshold, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin_parallel.cc:450
std::vector< combination_p1 > combs1[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:106
std::vector< std::vector< ui > > records2
Definition ovlpjoin_parallel.h:98
double overlapCoeff(int id1, int id2)
Definition ovlpjoin_parallel.h:154
std::vector< std::vector< ui > > datasets2
Definition ovlpjoin_parallel.h:99
std::vector< double > wordwt
Definition ovlpjoin_parallel.h:101
double weightedOverlapCoeff(int id1, int id2)
Definition ovlpjoin_parallel.h:142
bool if_external_IO
Definition ovlpjoin_parallel.h:122
std::vector< double > recWeights1
Definition ovlpjoin_parallel.h:100
std::vector< std::vector< std::pair< int, int > > > ele_lists1
Definition ovlpjoin_parallel.h:103
void small_case(int L1, int R1, int L2, int R2, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin_parallel.cc:154
bool comp_comb1(const int a, const int b, int tid)
Definition ovlpjoin_parallel.h:172
std::vector< combination_p2 > combs2[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:107
Definition ovlpjoin_parallel.h:209
double binary_cost
Definition ovlpjoin_parallel.h:238
int64_t candidate_num
Definition ovlpjoin_parallel.h:234
ui total_eles
Definition ovlpjoin_parallel.h:213
std::vector< std::vector< ui > > records
Definition ovlpjoin_parallel.h:216
bool comp_comb1(const int a, const int b, int tid)
Definition ovlpjoin_parallel.h:302
void set_external_store(const std::string &_resPair_path)
Definition ovlpjoin_parallel.h:295
int isHeap[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:232
bool build_heap(const std::vector< std::pair< int, int > > &vec, const std::vector< std::vector< ui > > &dataset, int L, std::vector< int > &heap, std::vector< combination_p1 > &combs, int &heap_size, int tid)
Definition ovlpjoin_parallel.cc:553
int64_t list_cost
Definition ovlpjoin_parallel.h:236
int earlyTerminated[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:214
void large_case(int L, int R, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin_parallel.cc:1020
int alive_id
Definition ovlpjoin_parallel.h:242
void small_case(int L, int R, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin_parallel.cc:583
int64_t large_cost
Definition ovlpjoin_parallel.h:240
int estimate()
Definition ovlpjoin_parallel.cc:974
int divide(int nL)
Definition ovlpjoin_parallel.cc:962
std::vector< int > heap[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:223
double overlapCoeff(int id1, int id2)
Definition ovlpjoin_parallel.h:284
OvlpSelfJoinParallel(const std::vector< std::vector< ui > > &sorted_records, const std::vector< double > &recwt, const std::vector< double > &_wordwt, ui _maxHeapSize=0, bool _isWeightedComp=false)
Definition ovlpjoin_parallel.h:256
std::vector< WeightPair > result_pairs_[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:231
bool if_external_IO
Definition ovlpjoin_parallel.h:253
ui maxHeapSize
Definition ovlpjoin_parallel.h:227
std::vector< std::vector< std::pair< int, int > > > ele_lists
Definition ovlpjoin_parallel.h:221
std::vector< std::pair< int, int > > result_pairs[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:222
void overlapjoin(int overlap_threshold, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin_parallel.cc:1090
std::string resultPair_storePath
Definition ovlpjoin_parallel.h:254
std::vector< double > weights
Definition ovlpjoin_parallel.h:218
std::vector< double > wordwt
Definition ovlpjoin_parallel.h:219
double weightedOverlapCoeff(int id1, int id2)
Definition ovlpjoin_parallel.h:272
int64_t small_estimate(int L, int R)
Definition ovlpjoin_parallel.cc:823
int c
Definition ovlpjoin_parallel.h:212
std::vector< std::pair< int, int > > buck
Definition ovlpjoin_parallel.h:226
int n1
Definition ovlpjoin_parallel.h:211
std::vector< std::vector< ui > > datasets
Definition ovlpjoin_parallel.h:217
int64_t large_estimate(int L, int R)
Definition ovlpjoin_parallel.cc:939
uint64_t heap_op
Definition ovlpjoin_parallel.h:239
std::vector< std::pair< int, int > > idmap_records
Definition ovlpjoin_parallel.h:220
std::unordered_set< int > random_ids
Definition ovlpjoin_parallel.h:225
std::vector< combination_p1 > combs[MAXTHREADNUM]
Definition ovlpjoin_parallel.h:224
int64_t large_est_cost
Definition ovlpjoin_parallel.h:241
bool isWeightedComp
Definition ovlpjoin_parallel.h:229
double heap_cost
Definition ovlpjoin_parallel.h:237
int64_t result_num
Definition ovlpjoin_parallel.h:235
Definition ovlpjoin_parallel.h:325
OvlpUtilParallel(OvlpUtilParallel &&other)=delete
static void removeShort(const std::vector< std::vector< ui > > &records, std::unordered_map< ui, std::vector< int > > &ele, const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:75
static bool is_equal(const combination_p1 &c1, const combination_p1 &c2, const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.h:339
static bool comp_int(const int a, const int b)
Definition ovlpjoin_parallel.h:333
static void removeWidow(std::unordered_map< ui, std::vector< int > > &ele, const std::unordered_map< ui, std::vector< int > > &ele_other)
Definition ovlpjoin_parallel.cc:90
~OvlpUtilParallel()=default
static bool is_equal(const combination_p2 &c1, const combination_p2 &c2, const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.h:355
static bool is_equal(const combination_p1 &c1, const combination_p1 &c2, const OvlpSelfJoinParallel &joiner)
Definition ovlpjoin_parallel.h:347
OvlpUtilParallel(const OvlpUtilParallel &other)=delete
static int compare(const combination_p1 &c1, const combination_p2 &c2, const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.h:363
static void transform(std::unordered_map< ui, std::vector< int > > &ele, const std::vector< std::pair< int, int > > &eles, std::vector< std::pair< int, int > > &idmap, std::vector< std::vector< std::pair< int, int > > > &ele_lists, std::vector< std::vector< ui > > &dataset, const ui total_eles, const int n, const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:110
OvlpUtilParallel()=default
static int64_t nchoosek(int64_t n, int64_t k)
Definition ovlpjoin_parallel.h:374
static bool comp_pair(const std::pair< int, int > &p1, const int val)
Definition ovlpjoin_parallel.h:336
#define MAX_PAIR_SIZE
Definition config.h:44
#define MAXTHREADNUM
Definition config.h:38
Definition ovlpjoin_parallel.h:38
std::vector< int > curr
Definition ovlpjoin_parallel.h:43
bool ifsame(const std::vector< ui > &data, const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:1345
int N
Definition ovlpjoin_parallel.h:40
bool stepback(const int i, const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:1252
combination_p1(int d, int beg, const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:1196
bool completed
Definition ovlpjoin_parallel.h:42
void binary(const combination_p1 &value, const OvlpSelfJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:1284
void print(const OvlpRSJoinParallel &joiner) const
Definition ovlpjoin_parallel.cc:1240
int getlastcurr(const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:1214
void next(const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:1226
int id
Definition ovlpjoin_parallel.h:41
Definition ovlpjoin_parallel.h:64
int N
Definition ovlpjoin_parallel.h:66
int id
Definition ovlpjoin_parallel.h:67
bool ifsame(const std::vector< ui > &data, const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:1474
combination_p2(int d, int beg, const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:1353
void next(const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:1368
bool completed
Definition ovlpjoin_parallel.h:68
int getlastcurr(const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:1362
bool stepback(const int i, const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:1394
void print(const OvlpRSJoinParallel &joiner) const
Definition ovlpjoin_parallel.cc:1382
std::vector< int > curr
Definition ovlpjoin_parallel.h:69
void binary(const combination_p2 &value, const OvlpRSJoinParallel &joiner)
Definition ovlpjoin_parallel.cc:1410
unsigned int ui
Definition type.h:8