Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
blocker_util.h
Go to the documentation of this file.
1/*
2 * author: Yunqi Li
3 * contact: liyunqixa@gmail.com
4 */
5/*
6 * utils for blocker
7 */
8#ifndef _BLOCKER_UTIL_H_
9#define _BLOCKER_UTIL_H_
10
11#include "common/config.h"
12#include "common/dataframe.h"
13#include "topk/topk.h"
15#include <vector>
16#include <assert.h>
17
18extern Table table_A;
19extern Table table_B;
20extern Table gold;
21extern Rule* rules;
22extern std::vector<std::vector<ui>> id_mapA;
23extern std::vector<std::vector<ui>> id_mapB;
24extern std::vector<std::vector<ui>> idStringMapA;
25extern std::vector<std::vector<ui>> idStringMapB;
26extern std::vector<std::vector<std::vector<ui>>> recordsA;
27extern std::vector<std::vector<std::vector<ui>>> recordsB;
28extern std::vector<std::vector<double>> weightsA;
29extern std::vector<std::vector<double>> weightsB;
30extern std::vector<std::vector<double>> wordwt;
31extern std::unordered_map<std::string, ui> datasets_map; // "tok" + "tok_setting" + "column"
32extern std::vector<std::vector<int>> final_pairs;
33extern std::vector<std::vector<std::pair<int, int>>> passedRules;
34
35
37{
38public:
39 BlockerUtil() = default;
40 ~BlockerUtil() = default;
41 BlockerUtil(const BlockerUtil &other) = delete;
42 BlockerUtil(BlockerUtil &&other) = delete;
43
44 // merge the result pairs
45 // the result pairs will have a format as adjacency list
46 // this design is for post-processing
47private:
48 // merge pairs from a adjacency list
49 static void mergePairs(const std::vector<std::vector<int>> &bucket);
50
51public:
52 // These two functions will merge result into an adjacency list
53 // synthesize pairs according id_map in Self join
54 static void synthesizePairsSelf(ui pos, std::vector<std::pair<int, int>> &pairs, int mapid);
55 // synthesize pairs according id_map in RS join
56 static void synthesizePairsRS(ui pos, std::vector<std::pair<int, int>> &pairs, int mapid);
57
58 // pre-filter when res pair already exceed maximum value, default 1e9
59 // we only employ TA since the all scores method needs sim weights
60 // TA
61 static void pretopKviaTASelf(uint64_t K, const std::string &topKattr, const std::string &attrType, bool isWeighted);
62 static void pretopKviaTARS(uint64_t K, const std::string &topKattr, const std::string &attrType, bool isWeighted);
63};
64
65
66#endif // _BLOCKER_UTIL_H_
std::unordered_map< std::string, ui > datasets_map
Definition blocker_config.cc:24
std::vector< std::vector< int > > final_pairs
Definition blocker_config.cc:25
std::vector< std::vector< ui > > idStringMapB
Definition blocker_config.cc:18
std::vector< std::vector< double > > weightsA
Definition blocker_config.cc:21
std::vector< std::vector< ui > > id_mapA
Definition blocker_config.cc:15
std::vector< std::vector< std::vector< ui > > > recordsA
Definition blocker_config.cc:19
std::vector< std::vector< ui > > idStringMapA
Definition blocker_config.cc:17
std::vector< std::vector< double > > weightsB
Definition blocker_config.cc:22
std::vector< std::vector< std::pair< int, int > > > passedRules
Definition blocker_config.cc:26
std::vector< std::vector< ui > > id_mapB
Definition blocker_config.cc:16
Table table_A
Definition blocker_config.cc:11
Table table_B
Definition blocker_config.cc:12
std::vector< std::vector< double > > wordwt
Definition blocker_config.cc:23
Rule * rules
Definition blocker_config.cc:14
std::vector< std::vector< std::vector< ui > > > recordsB
Definition blocker_config.cc:20
Table gold
Definition blocker_config.cc:13
BlockerUtil
Definition blocker_util.h:37
static void synthesizePairsRS(ui pos, std::vector< std::pair< int, int > > &pairs, int mapid)
Definition blocker_util.cc:37
static void synthesizePairsSelf(ui pos, std::vector< std::pair< int, int > > &pairs, int mapid)
Definition blocker_util.cc:68
BlockerUtil(BlockerUtil &&other)=delete
BlockerUtil(const BlockerUtil &other)=delete
~BlockerUtil()=default
BlockerUtil()=default
static void pretopKviaTASelf(uint64_t K, const std::string &topKattr, const std::string &attrType, bool isWeighted)
Definition blocker_util.cc:111
static void pretopKviaTARS(uint64_t K, const std::string &topKattr, const std::string &attrType, bool isWeighted)
Definition blocker_util.cc:133
Definition dataframe.h:19
Definition dataframe.h:54
unsigned int ui
Definition type.h:8