Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
sample.h
Go to the documentation of this file.
1/*
2 * author: Yunqi Li
3 * contact: liyunqixa@gmail.com
4 */
5#ifndef _SAMPLE_H_
6#define _SAMPLE_H_
7
8#include "common/simfunc.h"
9#include "common/io.h"
14#include <cstdlib>
15#include <time.h>
16#include <parallel/algorithm>
17#include <omp.h>
18
19
20class Sample
21{
22private:
23 static int CLUSTER_SAMPLE_SIZE;
24
25 struct dsu
26 {
27 std::vector<int> fa;
28
29 dsu(ui size): fa(size) {
30 std::iota(fa.begin(), fa.end(), 0);
31 }
32
33 int find(int x) {
34 return fa[x] == x ? x : fa[x] = find(fa[x]);
35 }
36
37 void unite(int x, int y) {
38 fa[find(x)] = find(y);
39 }
40 };
41
42public:
43 Sample() = default;
44 ~Sample() = default;
45 Sample(const Sample &other) = delete;
46 Sample(Sample &&other) = delete;
47
48private:
49 // 2-step cluster
50 static std::pair<double, double> getStat(const std::vector<std::pair<int, int>> &pairs, const std::vector<ui> &idMapA,
51 const std::vector<ui> &idMapB);
52 static void step2Sample(const std::string &blkAttr, double step2Tau, std::vector<std::pair<int, int>> &pairs,
53 const Table &tableA, const Table &tableB, const std::vector<ui> &idMapA,
54 const std::vector<ui> &idMapB, bool isRS);
55
56public:
57 // cluster
58 static void clusterSampleSelf(const std::string &blkAttr, double clusterTau, double blkTau,
59 const std::string &pathTableA, const std::string &pathTableB,
60 const std::string &defaultOutputDir = "");
61 /*
62 * it is not appropriate to apply clustering on RS-join
63 * thus we only adopt the normal 2-step jaccard sampling
64 */
65 static void clusterSampleRS(const std::string &blkAttr, double clusterTau, double blkTau, double step2Tau,
66 const std::string &pathTableA, const std::string &pathTableB,
67 const std::string &defaultOutputDir = "");
68
69 // down
70 static void downSample(ui n, ui y, const std::string &blkAttr, bool isRS,
71 const std::string &pathTableA, const std::string &pathTableB,
72 const std::string &defaultOutputDir = "");
73
74 // pre, may be depracted
75 /*
76 * Directly read the provided sampled subset
77 * this can only be used on synthetic datasets
78 * the real-wolrd datasets has no pre-sampled subset
79 */
80 static void preSample(ui n, int datanum, const std::string &blkAttr, const std::string &pathZ,
81 const std::string &pathY, const std::string &defaultOutputDir = "");
82};
83
84#endif // _SAMPLE_H_
Definition sample.h:21
static void clusterSampleSelf(const std::string &blkAttr, double clusterTau, double blkTau, const std::string &pathTableA, const std::string &pathTableB, const std::string &defaultOutputDir="")
Definition sample.cc:10
Sample()=default
static void clusterSampleRS(const std::string &blkAttr, double clusterTau, double blkTau, double step2Tau, const std::string &pathTableA, const std::string &pathTableB, const std::string &defaultOutputDir="")
Definition sample.cc:231
Sample(Sample &&other)=delete
~Sample()=default
static void preSample(ui n, int datanum, const std::string &blkAttr, const std::string &pathZ, const std::string &pathY, const std::string &defaultOutputDir="")
Definition sample.cc:322
static void downSample(ui n, ui y, const std::string &blkAttr, bool isRS, const std::string &pathTableA, const std::string &pathTableB, const std::string &defaultOutputDir="")
Definition sample.cc:292
Sample(const Sample &other)=delete
Definition dataframe.h:19
unsigned int ui
Definition type.h:8