Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
topk.h
Go to the documentation of this file.
1/*
2 * author: Yunqi Li
3 * contact: liyunqixa@gmail.com
4 */
5#ifndef _TOP_K_H_
6#define _TOP_K_H_
7
8#include "common/config.h"
9#include "common/dataframe.h"
10#include "common/tokenizer.h"
11#include "common/simfunc.h"
12#include <fstream>
13#include <iostream>
14#include <bitset>
15#include <parallel/algorithm>
16#include <unordered_set>
17#include <queue>
18#include <iomanip>
19#include <numeric>
20
21// topk table entry
23{
24 uint64_t id{0};
25 double val{0.0};
26
27 TableEntry() = default;
28 TableEntry(uint64_t _id, double _val)
29 : id(_id), val(_val) { }
30};
31
32// top k bucket entry
34{
35 bool operator() (const TableEntry &lhs, const TableEntry &rhs) const{
36 return lhs.val > rhs.val;
37 }
38};
39
40
41class TopK
42{
43public:
44 TopK() = default;
45 ~TopK() = default;
46 TopK(const TopK &other) = delete;
47 TopK(TopK &&other) = delete;
48
49private:
50 // buffers
51 static void allocateBuffers(uint64_t numEntity, ui numDimension, TableEntry **&valueTable, double **&backup);
52 static void releaseBuffers(ui numDimension, TableEntry **&valueTable, double **&backup);
53
54 // fill in table
55 // using weighted similarity metrics or original one
56 // it seems that the original is better, but the reason remains unclear
57 static void prepareSelf(const std::vector<std::vector<ui>> &records, const std::vector<ui> &id_map,
58 const std::vector<std::vector<int>> &final_pairs,
59 std::vector<std::pair<int, int>> &id2Pair,
60 TableEntry **&valueTable, double **&backup,
61 ui numRow, uint64_t &numEntity);
62
63 static void prepareSelfWeighted(const std::vector<std::vector<ui>> &records, const std::vector<ui> &id_map,
64 const std::vector<double> &recWeights, const std::vector<double> &wordwt,
65 const std::vector<std::vector<int>> &final_pairs,
66 std::vector<std::pair<int, int>> &id2Pair,
67 TableEntry **&valueTable, double **&backup,
68 ui numRow, uint64_t &numEntity);
69
70 // for RS join, the pair must be in form: (idA, idB) as in ground truth
71 // that is, we need to synethsis before doing top k
72 static void prepareRS(const std::vector<std::vector<ui>> &recordsA, const std::vector<std::vector<ui>> &recordsB,
73 const std::vector<ui> &id_mapA, const std::vector<ui> &id_mapB,
74 const std::vector<std::vector<int>> &final_pairs,
75 std::vector<std::pair<int, int>> &id2Pair,
76 TableEntry **&valueTable, double **&backup,
77 ui numRowA, ui numRowB, uint64_t &numEntity);
78
79 static void prepareRSWeighted(const std::vector<std::vector<ui>> &recordsA, const std::vector<std::vector<ui>> &recordsB,
80 const std::vector<ui> &id_mapA, const std::vector<ui> &id_mapB,
81 const std::vector<double> &recWeightsA, const std::vector<double> &recWeightsB,
82 const std::vector<double> &wordwt,
83 const std::vector<std::vector<int>> &final_pairs,
84 std::vector<std::pair<int, int>> &id2Pair,
85 TableEntry **&valueTable, double **&backup,
86 ui numRowA, ui numRowB, uint64_t &numEntity);
87
88 // intuitively, the topK attr will be the attr that different entities
89 // will have different values on it
90 // thus, the attr tends to be long, with as much as info as possible.
91 // so we use dlm_dc0 to tokenize it, and it must be already tokenzied before
92
93 // calculate recall within the functions
94 // mode: "report" only reports without change final_pairs; "replace" only changes without reports,
95 // "hybrid" for both, "exp" is only used for experiments
96private:
97 static void updateFinalPairs(const std::vector<ui> &topKidMapA, const std::vector<ui> &topKidMapB, const std::vector<std::pair<int, int>> &id2Pair,
98 std::priority_queue<TableEntry, std::vector<TableEntry>, PQComparison> &q,
99 std::vector<std::vector<int>> &final_pairs, uint64_t K);
100
101 static void getCurrentRecall(const std::vector<ui> &topKidMapA, const std::vector<ui> &topKidMapB, const std::vector<std::pair<int, int>> &id2Pair,
102 std::priority_queue<TableEntry, std::vector<TableEntry>, PQComparison> &q,
103 const std::vector<std::pair<int, int>> &golds, uint64_t K, uint64_t cartesian);
104 // only used for "exp" mode
105 static void getCurrentRecallExp(const std::vector<ui> &topKidMapA, const std::vector<ui> &topKidMapB, const std::vector<std::pair<int, int>> &id2Pair,
106 std::priority_queue<TableEntry, std::vector<TableEntry>, PQComparison> &q,
107 const std::vector<std::pair<int, int>> &golds, uint64_t K, uint64_t cartesian,
108 std::ofstream &statStream);
109
110public:
111 // TA
112 static void topKviaTASelf(const Table &table_A, const std::string &topKattr, const std::string &attrType,
113 const std::vector<std::vector<std::vector<ui>>> &recordsA,
114 const std::vector<std::vector<double>> &recWeights,
115 const std::vector<std::vector<double>> &wordwt,
116 const std::unordered_map<std::string, ui> &datasets_map,
117 const std::vector<std::vector<ui>> &id_mapA,
118 std::vector<std::vector<int>> &final_pairs,
119 const Table &groundTruth, uint64_t K, std::ofstream &statStream,
120 bool ifWeighted = false, const std::string &mode = "exp");
121
122 static void topKviaTARS(const Table &table_A, const Table &table_B, const std::string &topKattr, const std::string &attrType,
123 const std::vector<std::vector<std::vector<ui>>> &recordsA,
124 const std::vector<std::vector<std::vector<ui>>> &recordsB,
125 const std::vector<std::vector<double>> &recWeightsA,
126 const std::vector<std::vector<double>> &recWeightsB,
127 const std::vector<std::vector<double>> &wordwt,
128 const std::unordered_map<std::string, ui> &datasets_map,
129 const std::vector<std::vector<ui>> &id_mapA,
130 const std::vector<std::vector<ui>> &id_mapB,
131 std::vector<std::vector<int>> &final_pairs,
132 const Table &groundTruth, uint64_t K, std::ofstream &statStream,
133 bool ifWeighted = false, const std::string &mode = "exp");
134
135 // optimized version
136 // multithread & SIMD
137 // to be done
138 static void topKviaTASelfOpt(const Table &table_A, const std::string &topKattr, const std::string &attrType,
139 const std::vector<std::vector<std::vector<ui>>> &recordsA,
140 const std::unordered_map<std::string, ui> &datasets_map,
141 const std::vector<std::vector<ui>> &id_mapA,
142 std::vector<std::vector<int>> &final_pairs,
143 uint64_t K);
144
145 // NRA: Non-random Access Algorithm
146 // this algorithm is the primary choice of the blocker
147 // but it's not efficient, so decrapated now
148
149 // weighted all similarity scores sort
150private:
151 static void updateFinalPairs(const std::vector<ui> &sortedIdMap, const std::vector<std::pair<int, int>> &allPairs,
152 std::vector<std::vector<int>> &final_pairs, uint64_t K);
153
154 static void getCurrentRecall(const std::vector<ui> &sortedIdMap, const std::vector<std::pair<int, int>> &allPairs,
155 const std::vector<std::pair<int, int>> &golds, uint64_t K, uint64_t cartesian);
156 // only used for "exp" mode
157 static void getCurrentRecallExp(const std::vector<ui> &sortedIdMap, const std::vector<std::pair<int, int>> &allPairs,
158 const std::vector<std::pair<int, int>> &golds, uint64_t K, uint64_t cartesian,
159 std::ofstream &statStream);
160
161public:
162 static void topKviaAllSimilarityScoresRS(const Table &table_A, const Table &table_B, const Rule *rules,
163 const std::vector<double> &simWeights,
164 const std::unordered_map<std::string, double> &attrAverage,
165 const std::vector<std::vector<std::vector<ui>>> &recordsA,
166 const std::vector<std::vector<std::vector<ui>>> &recordsB,
167 const std::vector<std::vector<double>> &recWeightsA,
168 const std::vector<std::vector<double>> &recWeightsB,
169 const std::vector<std::vector<double>> &wordwt,
170 const std::unordered_map<std::string, ui> &datasets_map,
171 const std::vector<std::vector<ui>> &id_mapA,
172 const std::vector<std::vector<ui>> &id_mapB,
173 const std::vector<std::vector<ui>> &id_mapAString,
174 const std::vector<std::vector<ui>> &id_mapBString,
175 std::vector<std::vector<int>> &final_pairs,
176 const Table &groundTruth, uint64_t K, ui numRule,
177 std::ofstream &statStream, bool isWeighted = false,
178 const std::string &mode = "exp");
179
180 static void topKviaAllSimilarityScoreSelf(const Table &table_A, const Rule *rules, const std::vector<double> &simWeights,
181 const std::unordered_map<std::string, double> &attrAverage,
182 const std::vector<std::vector<std::vector<ui>>> &records,
183 const std::vector<std::vector<double>> &recWeights,
184 const std::vector<std::vector<double>> &wordwt,
185 const std::unordered_map<std::string, ui> &datasets_map,
186 const std::vector<std::vector<ui>> &id_map,
187 const std::vector<std::vector<ui>> &id_mapString,
188 std::vector<std::vector<int>> &final_pairs,
189 const Table &groundTruth, u_int64_t K, ui numRule,
190 std::ofstream &statStream, bool isWeighted = false,
191 const std::string &mode = "exp");
192
193public:
194 // on match result
195 void topKviaTATable(const Table &matchRes, uint64_t K, std::vector<std::vector<std::string>> &newTable);
196};
197
198
199#endif // _TOP_K_H_
std::vector< ui > q
Definition block.cc:9
std::unordered_map< std::string, ui > datasets_map
Definition blocker_config.cc:24
std::vector< std::vector< int > > final_pairs
Definition blocker_config.cc:25
std::vector< std::vector< ui > > id_mapA
Definition blocker_config.cc:15
std::vector< std::vector< std::vector< ui > > > recordsA
Definition blocker_config.cc:19
std::vector< std::vector< ui > > id_mapB
Definition blocker_config.cc:16
Table table_A
Definition blocker_config.cc:11
Table table_B
Definition blocker_config.cc:12
std::vector< std::vector< double > > wordwt
Definition blocker_config.cc:23
Rule * rules
Definition blocker_config.cc:14
std::vector< std::vector< std::vector< ui > > > recordsB
Definition blocker_config.cc:20
Definition dataframe.h:19
Definition topk.h:42
TopK(TopK &&other)=delete
~TopK()=default
TopK()=default
static void topKviaAllSimilarityScoresRS(const Table &table_A, const Table &table_B, const Rule *rules, const std::vector< double > &simWeights, const std::unordered_map< std::string, double > &attrAverage, const std::vector< std::vector< std::vector< ui > > > &recordsA, const std::vector< std::vector< std::vector< ui > > > &recordsB, const std::vector< std::vector< double > > &recWeightsA, const std::vector< std::vector< double > > &recWeightsB, const std::vector< std::vector< double > > &wordwt, const std::unordered_map< std::string, ui > &datasets_map, const std::vector< std::vector< ui > > &id_mapA, const std::vector< std::vector< ui > > &id_mapB, const std::vector< std::vector< ui > > &id_mapAString, const std::vector< std::vector< ui > > &id_mapBString, std::vector< std::vector< int > > &final_pairs, const Table &groundTruth, uint64_t K, ui numRule, std::ofstream &statStream, bool isWeighted=false, const std::string &mode="exp")
Definition topk.cc:887
static void topKviaAllSimilarityScoreSelf(const Table &table_A, const Rule *rules, const std::vector< double > &simWeights, const std::unordered_map< std::string, double > &attrAverage, const std::vector< std::vector< std::vector< ui > > > &records, const std::vector< std::vector< double > > &recWeights, const std::vector< std::vector< double > > &wordwt, const std::unordered_map< std::string, ui > &datasets_map, const std::vector< std::vector< ui > > &id_map, const std::vector< std::vector< ui > > &id_mapString, std::vector< std::vector< int > > &final_pairs, const Table &groundTruth, u_int64_t K, ui numRule, std::ofstream &statStream, bool isWeighted=false, const std::string &mode="exp")
Definition topk.cc:1130
static void topKviaTASelfOpt(const Table &table_A, const std::string &topKattr, const std::string &attrType, const std::vector< std::vector< std::vector< ui > > > &recordsA, const std::unordered_map< std::string, ui > &datasets_map, const std::vector< std::vector< ui > > &id_mapA, std::vector< std::vector< int > > &final_pairs, uint64_t K)
Definition topk.cc:714
TopK(const TopK &other)=delete
void topKviaTATable(const Table &matchRes, uint64_t K, std::vector< std::vector< std::string > > &newTable)
Definition topk.cc:1356
static void topKviaTASelf(const Table &table_A, const std::string &topKattr, const std::string &attrType, const std::vector< std::vector< std::vector< ui > > > &recordsA, const std::vector< std::vector< double > > &recWeights, const std::vector< std::vector< double > > &wordwt, const std::unordered_map< std::string, ui > &datasets_map, const std::vector< std::vector< ui > > &id_mapA, std::vector< std::vector< int > > &final_pairs, const Table &groundTruth, uint64_t K, std::ofstream &statStream, bool ifWeighted=false, const std::string &mode="exp")
Definition topk.cc:447
static void topKviaTARS(const Table &table_A, const Table &table_B, const std::string &topKattr, const std::string &attrType, const std::vector< std::vector< std::vector< ui > > > &recordsA, const std::vector< std::vector< std::vector< ui > > > &recordsB, const std::vector< std::vector< double > > &recWeightsA, const std::vector< std::vector< double > > &recWeightsB, const std::vector< std::vector< double > > &wordwt, const std::unordered_map< std::string, ui > &datasets_map, const std::vector< std::vector< ui > > &id_mapA, const std::vector< std::vector< ui > > &id_mapB, std::vector< std::vector< int > > &final_pairs, const Table &groundTruth, uint64_t K, std::ofstream &statStream, bool ifWeighted=false, const std::string &mode="exp")
Definition topk.cc:573
Definition topk.h:34
bool operator()(const TableEntry &lhs, const TableEntry &rhs) const
Definition topk.h:35
Definition dataframe.h:54
Definition topk.h:23
TableEntry(uint64_t _id, double _val)
Definition topk.h:28
TableEntry()=default
double val
Definition topk.h:25
uint64_t id
Definition topk.h:24
unsigned int ui
Definition type.h:8