17#include <unordered_map>
18#include <unordered_set>
27#include <sys/sysinfo.h>
33#define REPORT_BINARY 0
116 std::vector<std::vector<std::pair<int, int>>>
ele_lists;
121#if MAINTAIN_VALUE_OVLP == 1
131 void overlapjoin(
int overlap_threshold, std::vector<std::pair<int, int>> &finalPairs);
132 void small_case(
int L,
int R, std::vector<std::pair<int, int>> &finalPairs);
134 OvlpSelfJoin(
const std::vector<std::vector<ui>> &sorted_records,
const std::vector<double> &_recWeights,
135 const std::vector<double> _wordwt,
ui _maxHeapSize = 0,
bool _isWeightedComp =
false)
143#if MAINTAIN_VALUE_OVLP == 1
152 auto & c1 =
combs[a];
153 auto & c2 =
combs[b];
154 for (
ui i = 0; i <
c; i++) {
160 return c1.id > c2.id;
167 std::back_inserter(res));
169 for(
const auto &e : res)
181 double ovlp = res.size() * 1.0;
182 return ovlp / std::min(
records[id1].size(),
records[id2].size()) * 1.0;
188 bool build_heap(
const std::vector<std::pair<int,int>> &vec,
const std::vector<std::vector<ui>> &dataset,
189 int L, std::vector<int> &
heap, std::vector<combination1> &
combs,
int &heap_size);
211#if MAINTAIN_VALUE_OVLP == 1
221 void overlapjoin(
int overlap_threshold, std::vector<std::pair<int, int>> &finalPairs);
222 void small_case(
int L1,
int R1,
int L2,
int R2, std::vector<std::pair<int, int>> &finalPairs);
227 OvlpRSJoin(
const std::vector<std::vector<ui>> &sorted_records_1,
const std::vector<std::vector<ui>> &sorted_records_2,
228 const std::vector<double> &_recWeights1,
const std::vector<double> &_recWeights2,
229 const std::vector<double> &_wordwt,
ui _maxHeapSize = 0,
bool _isWeightedComp =
false)
238#if MAINTAIN_VALUE_OVLP == 1
256 for (
ui i = 0; i <
c; i++) {
262 return c1.id > c2.id;
267 for (
ui i = 0; i <
c; i++) {
273 return c1.id > c2.id;
280 std::back_inserter(res));
282 for(
const auto &e : res)
294 double ovlp = res.size() * 1.0;
301 bool build_heap(
const std::vector<std::pair<int,int>> &vec,
const std::vector<std::vector<ui>> &dataset,
302 int L, std::vector<int> &heap, std::vector<combination1> &combs,
int &heap_size);
304 bool build_heap(
const std::vector<std::pair<int,int>> &vec,
const std::vector<std::vector<ui>> &dataset,
305 int L, std::vector<int> &heap, std::vector<combination2> &combs,
int &heap_size);
322 static bool comp_pair(
const std::pair<int, int> &p1,
const int val) {
323 return p1.first < val;
327 for (
int i = 0; i < joiner.
c; i++) {
335 for (
int i = 0; i < joiner.
c; i++) {
343 for (
int i = 0; i < joiner.
c; i++) {
352 for (
int i = 0; i < joiner.
c; i++) {
362 static void removeShort(
const std::vector<std::vector<ui>> &records, std::unordered_map<
ui, std::vector<int>> &ele,
367 static void removeWidow(std::unordered_map<
ui, std::vector<int>> &ele,
const std::unordered_map<
ui, std::vector<int>> &ele_other);
368 static void transform(std::unordered_map<
ui, std::vector<int>> &ele,
const std::vector<std::pair<int, int>> &eles,
369 std::vector<std::pair<int, int>> &idmap, std::vector<std::vector<std::pair<int, int>>> &ele_lists,
370 std::vector<std::vector<ui>> &dataset,
const ui total_eles,
const int n,
const OvlpRSJoin &joiner);
Definition ovlpjoin.h:194
int n2
Definition ovlpjoin.h:196
std::vector< combination2 > combs2
Definition ovlpjoin.h:209
std::vector< int > heap2
Definition ovlpjoin.h:207
int c
Definition ovlpjoin.h:197
std::vector< std::pair< int, int > > idmap_records1
Definition ovlpjoin.h:204
std::string resultPair_storePath
Definition ovlpjoin.h:225
std::vector< int > heap1
Definition ovlpjoin.h:207
std::vector< double > wordwt
Definition ovlpjoin.h:203
bool comp_comb2(const int a, const int b)
Definition ovlpjoin.h:264
ui maxHeapSize
Definition ovlpjoin.h:210
ui total_eles
Definition ovlpjoin.h:198
std::vector< double > recWeights1
Definition ovlpjoin.h:202
std::vector< std::vector< ui > > records1
Definition ovlpjoin.h:200
std::vector< std::vector< std::pair< int, int > > > ele_lists1
Definition ovlpjoin.h:205
int isHeap
Definition ovlpjoin.h:215
int n1
Definition ovlpjoin.h:196
void small_case(int L1, int R1, int L2, int R2, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin.cc:103
int64_t candidate_num
Definition ovlpjoin.h:218
std::vector< std::pair< int, int > > idmap_records2
Definition ovlpjoin.h:204
std::vector< std::vector< ui > > datasets2
Definition ovlpjoin.h:201
std::vector< WeightPair > result_pairs_
Definition ovlpjoin.h:214
std::vector< combination1 > combs1
Definition ovlpjoin.h:208
OvlpRSJoin(const std::vector< std::vector< ui > > &sorted_records_1, const std::vector< std::vector< ui > > &sorted_records_2, const std::vector< double > &_recWeights1, const std::vector< double > &_recWeights2, const std::vector< double > &_wordwt, ui _maxHeapSize=0, bool _isWeightedComp=false)
Definition ovlpjoin.h:227
std::vector< std::pair< int, int > > result_pairs
Definition ovlpjoin.h:206
double weightedOverlapCoeff(int id1, int id2)
Definition ovlpjoin.h:276
std::vector< double > recWeights2
Definition ovlpjoin.h:202
void set_external_store(const std::string &_resPair_path)
Definition ovlpjoin.h:244
std::vector< std::vector< ui > > records2
Definition ovlpjoin.h:200
std::vector< std::vector< ui > > datasets1
Definition ovlpjoin.h:201
void overlapjoin(int overlap_threshold, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin.cc:376
bool comp_comb1(const int a, const int b)
Definition ovlpjoin.h:252
bool isWeightedComp
Definition ovlpjoin.h:212
bool if_external_IO
Definition ovlpjoin.h:224
double overlapCoeff(int id1, int id2)
Definition ovlpjoin.h:288
int64_t result_num
Definition ovlpjoin.h:219
bool build_heap(const std::vector< std::pair< int, int > > &vec, const std::vector< std::vector< ui > > &dataset, int L, std::vector< int > &heap, std::vector< combination1 > &combs, int &heap_size)
Definition ovlpjoin.cc:11
std::vector< std::vector< std::pair< int, int > > > ele_lists2
Definition ovlpjoin.h:205
Definition ovlpjoin.h:105
int n
Definition ovlpjoin.h:107
std::vector< std::vector< ui > > datasets
Definition ovlpjoin.h:112
int64_t result_num
Definition ovlpjoin.h:129
int64_t candidate_num
Definition ovlpjoin.h:128
ui maxHeapSize
Definition ovlpjoin.h:120
void small_case(int L, int R, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin.cc:448
OvlpSelfJoin(const std::vector< std::vector< ui > > &sorted_records, const std::vector< double > &_recWeights, const std::vector< double > _wordwt, ui _maxHeapSize=0, bool _isWeightedComp=false)
Definition ovlpjoin.h:134
ui total_eles
Definition ovlpjoin.h:109
std::vector< WeightPair > result_pairs_
Definition ovlpjoin.h:124
bool build_heap(const std::vector< std::pair< int, int > > &vec, const std::vector< std::vector< ui > > &dataset, int L, std::vector< int > &heap, std::vector< combination1 > &combs, int &heap_size)
Definition ovlpjoin.cc:70
std::vector< std::vector< std::pair< int, int > > > ele_lists
Definition ovlpjoin.h:116
void overlapjoin(int overlap_threshold, std::vector< std::pair< int, int > > &finalPairs)
Definition ovlpjoin.cc:592
std::vector< double > recWeights
Definition ovlpjoin.h:113
std::vector< std::vector< ui > > records
Definition ovlpjoin.h:111
std::vector< combination1 > combs
Definition ovlpjoin.h:119
std::vector< double > wordwt
Definition ovlpjoin.h:114
double overlapCoeff(int id1, int id2)
Definition ovlpjoin.h:175
int isHeap
Definition ovlpjoin.h:125
bool comp_comb1(const int a, const int b)
Definition ovlpjoin.h:150
std::vector< std::pair< int, int > > result_pairs
Definition ovlpjoin.h:117
std::vector< int > heap
Definition ovlpjoin.h:118
double weightedOverlapCoeff(int id1, int id2)
Definition ovlpjoin.h:163
std::vector< std::pair< int, int > > idmap_records
Definition ovlpjoin.h:115
bool isWeightedComp
Definition ovlpjoin.h:122
int c
Definition ovlpjoin.h:108
Definition ovlpjoin.h:311
static int compare(const combination1 &c1, const combination2 &c2, const OvlpRSJoin &joiner)
Definition ovlpjoin.h:349
static bool comp_int(const int a, const int b)
Definition ovlpjoin.h:319
static void removeWidow(std::unordered_map< ui, std::vector< int > > &ele, const std::unordered_map< ui, std::vector< int > > &ele_other)
Definition ovlpjoin.cc:701
static bool is_equal(const combination1 &c1, const combination1 &c2, const OvlpRSJoin &joiner)
Definition ovlpjoin.h:325
OvlpUtil(OvlpUtil &&other)=delete
static bool comp_pair(const std::pair< int, int > &p1, const int val)
Definition ovlpjoin.h:322
static void removeShort(const std::vector< std::vector< ui > > &records, std::unordered_map< ui, std::vector< int > > &ele, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:686
static bool is_equal(const combination2 &c1, const combination2 &c2, const OvlpRSJoin &joiner)
Definition ovlpjoin.h:341
static void transform(std::unordered_map< ui, std::vector< int > > &ele, const std::vector< std::pair< int, int > > &eles, std::vector< std::pair< int, int > > &idmap, std::vector< std::vector< std::pair< int, int > > > &ele_lists, std::vector< std::vector< ui > > &dataset, const ui total_eles, const int n, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:721
OvlpUtil(const OvlpUtil &other)=delete
static bool is_equal(const combination1 &c1, const combination1 &c2, const OvlpSelfJoin &joiner)
Definition ovlpjoin.h:333
#define MAX_PAIR_SIZE_SERIAL
Definition config.h:51
bool ifsame(const std::vector< ui > &data, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:936
void next(const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:790
std::vector< int > curr
Definition ovlpjoin.h:52
int getlastcurr(const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:777
bool stepback(const int i, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:842
void binary(const combination1 &value, const OvlpSelfJoin &joiner)
Definition ovlpjoin.cc:874
bool completed
Definition ovlpjoin.h:51
combination1(int d, int beg, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:759
int N
Definition ovlpjoin.h:49
void print(const OvlpRSJoin &joiner) const
Definition ovlpjoin.cc:818
int id
Definition ovlpjoin.h:50
int id
Definition ovlpjoin.h:81
void print(const OvlpRSJoin &joiner) const
Definition ovlpjoin.cc:974
combination2(int d, int beg, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:944
std::vector< int > curr
Definition ovlpjoin.h:83
bool stepback(const int i, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:986
bool completed
Definition ovlpjoin.h:82
void binary(const combination2 &value, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:1002
bool ifsame(const std::vector< ui > &data, const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:1067
int N
Definition ovlpjoin.h:80
void next(const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:960
int getlastcurr(const OvlpRSJoin &joiner)
Definition ovlpjoin.cc:953
unsigned int ui
Definition type.h:8