16#include <unordered_map>
17#include <unordered_set>
51 std::vector<std::vector<std::pair<int, int>>>
indexVecs;
74#if MAINTAIN_VALUE == 1
97 joiner->
indexVecs.push_back(std::vector<std::pair<int, int>>());
118 SetJoin(
const std::vector<std::vector<ui>> &sorted_records,
const std::vector<double> &recwt,
119 const std::vector<double> &_wordwt, std::string _sim_pairs_filepath,
double _det,
120 ui _maxHeapSize = 0,
bool _isWeightedComp =
false)
126 ui min_records_size = sorted_records.front().size();
127 ui max_records_size = sorted_records.back().size();
130 if(_det <= 0.4 && max_records_size >= 55) {
137 printf(
"Min record's size: %u\tMax record's size: %u\n", min_records_size, max_records_size);
140#if MAINTAIN_VALUE == 1
146 SetJoin(
const std::vector<std::vector<ui>> &work_records,
const std::vector<std::vector<ui>> &query_records,
147 const std::vector<double> &workwt,
const std::vector<double> &querywt,
const std::vector<double> &_wordwt,
148 const std::string &_sim_pairs_filepath,
double _det,
ui _maxHeapSize = 0,
bool _isWeightedComp =
false)
161 if(_det <= 0.4 && max_work_size >= 55) {
165 if(_det <= 0.4 && max_query_size >= 55) {
172 printf(
"Min work record's size: %u\tMax work record's size: %u\n", min_work_size, max_work_size);
173 printf(
"Min query record's size: %u\tMax query record's size: %u\n", min_query_size, max_query_size);
176#if MAINTAIN_VALUE == 1
188 void loadDataset(
const std::vector<std::vector<ui>> &records, std::string file) {
196 void prepare(
const std::vector<std::vector<ui>> &offsets,
ui column) {
198 for(
ui i = 0; i < size; i++)
200 for(
ui i = 0; i < size; i++) {
201 ui prefix = offsets[i][column];
202 ui suffix = offsets[i][column + 1];
213 double avergaeSize = 0.0;
214 for(
const auto &rec : dataset) {
215 if(rec.empty())
continue;
216 avergaeSize += (double)rec.size() * 1.0;
219 avergaeSize = avergaeSize / (unempty * 1.0);
220 long double sd = 0.0;
221 for(
const auto &rec : dataset) {
222 if(rec.empty())
continue;
223 sd += (rec.size() * 1.0 - avergaeSize) * (rec.size() * 1.0 - avergaeSize);
225 sd = sd / (unempty * 1.0);
226 printf(
"Resize dataset on 1sd, mean: %.1lf\tsd: %.1Lf\n", avergaeSize, sd);
227 int bound = ceil(avergaeSize + 1.0 * sd - 1e-5);
228 for(
auto &rec : dataset)
229 if((
int)rec.size() > bound)
239 set_intersection(records1.begin(), records1.end(),
240 records2.begin(), records2.end(),
241 std::back_inserter(res));
244 for(
const auto &e : res)
255 assert(std::abs(rw1) > 1e-7 && std::abs(rw2) > 1e-7);
257 return ovlp / (rw1 + rw2 - ovlp);
264 set_intersection(records1.begin(), records1.end(),
265 records2.begin(), records2.end(),
266 std::back_inserter(res));
267 int ovlp = (int)res.size();
269 return ovlp * 1.0 / (records1.size() + records2.size() - ovlp) * 1.0;
277 assert(std::abs(rw1) > 1e-7 && std::abs(rw2) > 1e-7);
279 return ovlp / sqrt(rw1 * rw2);
286 set_intersection(records1.begin(), records1.end(),
287 records2.begin(), records2.end(),
288 std::back_inserter(res));
289 int ovlp = (int)res.size();
291 return ovlp * 1.0 / sqrt(records1.size() * records2.size()) * 1.0;
299 assert(std::abs(rw1) > 1e-7 && std::abs(rw2) > 1e-7);
301 return 2.0 * ovlp / (rw1 + rw2);
308 set_intersection(records1.begin(), records1.end(),
309 records2.begin(), records2.end(),
310 std::back_inserter(res));
311 int ovlp = (int)res.size();
313 return ovlp * 2.0 / (records1.size() + records2.size()) * 1.0;
317 bool overlap(
int x,
int y,
int posx = 0,
int posy = 0,
int current_overlap = 0);
318 bool overlapRS(
int x,
int y,
int posx = 0,
int posy = 0,
int current_overlap = 0);
320 void setSelfJoin(
double threshold, std::vector<std::pair<int, int>>& sim_pairs);
322 void setRSJoin(
double threshold, std::vector<std::pair<int, int>>& sim_pairs);
void setRSJoin(double threshold, std::vector< std::pair< int, int > > &sim_pairs)
Definition setjoin.cc:619
void setSelfJoin(double threshold, std::vector< std::pair< int, int > > &sim_pairs)
Definition setjoin.cc:87
bool overlapRS(int x, int y, int posx=0, int posy=0, int current_overlap=0)
Definition setjoin.cc:48
SetJoin(const std::vector< std::vector< ui > > &work_records, const std::vector< std::vector< ui > > &query_records, const std::vector< double > &workwt, const std::vector< double > &querywt, const std::vector< double > &_wordwt, const std::string &_sim_pairs_filepath, double _det, ui _maxHeapSize=0, bool _isWeightedComp=false)
Definition setjoin.h:146
std::vector< ui > workEmpty
Definition setjoin.h:68
uint64_t candidateNum
Definition setjoin.h:56
std::vector< std::vector< ui > > query_dataset
Definition setjoin.h:63
std::vector< double > work_weights
Definition setjoin.h:64
std::vector< invertedList > indexLists
Definition setjoin.h:114
ui maxHeapSize
Definition setjoin.h:73
std::vector< std::vector< ui > > work_dataset
Definition setjoin.h:62
double weightedOverlap(ui x, ui y)
Definition setjoin.h:234
void loadDataset(const std::vector< std::vector< ui > > &records, std::string file)
Definition setjoin.h:188
int prime_exp[MAX_LINE_LENGTH]
Definition setjoin.h:60
double(SetJoin::* weightedFunc)(ui, ui)
Definition setjoin.h:46
bool(SetJoin::* overlapFunc)(int, int, int, int, int)
Definition setjoin.h:48
double weightedDice(ui x, ui y)
Definition setjoin.h:294
uint64_t listlens
Definition setjoin.h:58
bool ifRS
Definition setjoin.h:43
SetJoin(const std::vector< std::vector< ui > > &sorted_records, const std::vector< double > &recwt, const std::vector< double > &_wordwt, std::string _sim_pairs_filepath, double _det, ui _maxHeapSize=0, bool _isWeightedComp=false)
Definition setjoin.h:118
SimFuncType simFType
Definition setjoin.h:45
int isHeap
Definition setjoin.h:77
uint64_t lengthSum
Definition setjoin.h:57
uint64_t resultNum
Definition setjoin.h:55
std::vector< double > query_weights
Definition setjoin.h:65
double(SetJoin::* normalFunc)(ui, ui)
Definition setjoin.h:47
bool isWeightedComp
Definition setjoin.h:75
double weightedJaccard(ui x, ui y)
Definition setjoin.h:250
void resizeData(std::vector< std::vector< ui > > &dataset)
Definition setjoin.h:210
double jaccard(ui x, ui y)
Definition setjoin.h:259
std::vector< double > wordwt
Definition setjoin.h:66
std::vector< std::vector< std::pair< int, int > > > indexVecs
Definition setjoin.h:51
double det
Definition setjoin.h:54
double index_cost
Definition setjoin.h:40
double cosine(ui x, ui y)
Definition setjoin.h:281
std::string simP_file_path
Definition setjoin.h:71
~SetJoin()
Definition setjoin.h:182
std::vector< std::vector< ui > > dataset_all
Definition setjoin.h:61
std::vector< WeightPair > result_pairs_
Definition setjoin.h:76
double allocation_cost
Definition setjoin.h:39
std::vector< std::pair< int, int > > result_pairs
Definition setjoin.h:69
std::vector< std::pair< int, int > > cacheVec
Definition setjoin.h:50
bool overlap(int x, int y, int posx=0, int posy=0, int current_overlap=0)
Definition setjoin.cc:12
double dice(ui x, ui y)
Definition setjoin.h:303
std::vector< ui > queryEmpty
Definition setjoin.h:68
double overlap_cost
Definition setjoin.h:38
double weightedCosine(ui x, ui y)
Definition setjoin.h:272
void prepare(const std::vector< std::vector< ui > > &offsets, ui column)
Definition setjoin.h:196
#define MAX_PAIR_SIZE_SERIAL
Definition config.h:51
#define CACHE_SIZE
Definition config.h:82
#define MAX_LINE_LENGTH
Definition config.h:81
invIndexStruct()
Definition setjoin.h:111
unsigned long long list_no
Definition setjoin.h:108
int * oneList
Definition setjoin.h:109
int vec_no
Definition setjoin.h:81
std::vector< std::pair< int, int > > & getVector(SetJoin *joiner) const
Definition setjoin.h:84
std::pair< int, int > cache[CACHE_SIZE]
Definition setjoin.h:82
void add(std::pair< int, int > data, SetJoin *joiner)
Definition setjoin.h:92
int cnt
Definition setjoin.h:81
SimFuncType
Definition type.h:48
unsigned int ui
Definition type.h:8