Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
index.h
Go to the documentation of this file.
1/*
2 * author: Dong Deng
3 * modified: Zhencan Peng in rutgers-db/RedPajama_Analysis
4 * modified: Yunqi Li
5 * contact: liyunqixa@gmail.com
6 */
7#ifndef _INDEX_H_
8#define _INDEX_H_
9
10#include "common/config.h"
11#include <cstdlib>
12#include <cstdint>
13#include <cstring>
14#include <string>
15#include <vector>
16#include <numeric>
17
18/*
19 * weighted pair in heap
20 */
22{
23 int id1{0};
24 int id2{0};
25 double val{0.0};
26
27 WeightPair() = default;
28 WeightPair(int _id1, int _id2, double _val)
29 : id1(_id1), id2(_id2), val(_val) { }
30
31 // min-heap for top k largest
32 // make_heap uses < by default,
33 // the entity on top of the heap receives false in every < comparsion
34 bool operator<(const WeightPair& rhs) const {
35 return this->val > rhs.val;
36 }
37};
38
40{
41 int id1{0};
42 int id2{0};
43 int val{0};
44
45 WeightPairEdit() = default;
46 WeightPairEdit(int _id1, int _id2, int _val)
47 : id1(_id1), id2(_id2), val(_val) { }
48
49 // min-heap for top k largest
50 // make_heap uses < by default,
51 // the entity on top of the heap receives false in every < comparsion
52 bool operator<(const WeightPairEdit& rhs) const {
53 return this->val < rhs.val;
54 }
55};
56
57/*
58 * Set join
59 */
61{
62 // index for partitions and one deletions
63 // There are two dimensions:
64 // 1st: (Pointer Level) Indicates the range
65 // 2nd: (External Vector Level) Indicates the pid (partition ID)
66 // Inner vector stores the vector of rid (record ID)
67 std::vector<std::vector<unsigned int>> *parts_rids{nullptr}; // <rid>
68 std::vector<std::vector<unsigned int>> *ods_rids{nullptr}; // <rid>
69
70 // The index pointers
71 std::vector<std::vector<unsigned int>> *parts_index_hv{nullptr};
72 std::vector<std::vector<unsigned int>> *od_index_hv{nullptr};
73 std::vector<std::vector<unsigned int>> *parts_index_offset{nullptr};
74 std::vector<std::vector<unsigned int>> *od_index_offset{nullptr};
75
76 // Mention: if the cnt is greater than the limit of TokenLen, we will just treat it as the maximum value of TokenLen
77 std::vector<std::vector<TokenLen>> *parts_index_cnt{nullptr};
78 std::vector<std::vector<TokenLen>> *od_index_cnt{nullptr};
79
80 // memory released in functions
85};
86
87
88/*
89 * String join
90 */
91struct PIndex
92{
93 int stPos{0}; // start position of substring
94 int Lo{0}; // start position of segment
95 int partLen{0}; // substring/segment length
96 int len{0}; // length of indexed string
97
98 PIndex() = default;
99 PIndex(int _s, int _o, int _p, int _l)
100 : stPos(_s), Lo(_o), partLen(_p), len(_l) { }
101 ~PIndex() = default;
102};
103
105{
106 int id{0}; // id of string in dataset
107 int stPos{0}; // start position of string to hash
108 int enPos{0}; // end position of string to hash
109 uint64_t value{0}; // hash value
110
111 hashValue(): id(-1), stPos(-1), enPos(-1), value(0) { }
112 hashValue(int in_id, int in_stPos, int in_enPos, uint64_t in_value)
113 : id(in_id), stPos(in_stPos), enPos(in_enPos), value(in_value) { }
114 hashValue(int in_id, int in_stPos, int in_enPos, const std::string &str)
115 : id(in_id), stPos(in_stPos), enPos(in_enPos) {
116 value = strHash(str, stPos, enPos - stPos + 1);
117 }
118
119 // calcuate hash value of string incrementally
120 void update(int newid, int newStPos, int newEnPos, const std::string &str,
121 uint64_t *power) {
122 // update information if string id is different
123 if (newid != id) {
124 id = newid;
125 enPos = newEnPos;
126 stPos = newStPos;
127 value = strHash(str, stPos, enPos - stPos + 1);
128 }
129 else {
130 // check if it can update hash value incrementally
131 if (newStPos == stPos + 1) {
132 value = value - str[stPos] * power[enPos - stPos];
133 stPos = newStPos;
134 }
135 if (newEnPos == enPos + 1) {
136 value = (value * stringHashNumber + str[newEnPos]);
137 enPos = newEnPos;
138 }
139 }
140 }
141
142 // Hash
143 uint64_t strHash(const std::string &str, int stPos, int len) {
144 uint64_t __h = 0;
145 int i = 0;
146 while (i < len)
147 __h = (__h * stringHashNumber + str[stPos + i++]);
148 return __h;
149 }
150};
151
152
153/*
154 * dsu for clustering pairs
155 */
156struct DSU
157{
158 std::vector<int> fa;
159
160 DSU(int _size): fa(_size) { std::iota(fa.begin(), fa.end(), 0); }
161
162 int find(int x) {
163 return fa[x] == x ? x : fa[x] = find(fa[x]);
164 }
165
166 void unite(int x, int y) {
167 fa[find(x)] = find(y);
168 }
169};
170
171#endif // _INDEX_H_
constexpr size_t stringHashNumber
Definition config.h:57
Definition index.h:157
DSU(int _size)
Definition index.h:160
int find(int x)
Definition index.h:162
void unite(int x, int y)
Definition index.h:166
std::vector< int > fa
Definition index.h:158
Definition index.h:92
~PIndex()=default
PIndex(int _s, int _o, int _p, int _l)
Definition index.h:99
int partLen
Definition index.h:95
int stPos
Definition index.h:93
PIndex()=default
int len
Definition index.h:96
int Lo
Definition index.h:94
Definition index.h:61
std::vector< std::vector< unsigned int > > * parts_rids
Definition index.h:67
std::vector< std::vector< unsigned int > > * od_index_offset
Definition index.h:74
std::vector< std::vector< TokenLen > > * parts_index_cnt
Definition index.h:77
std::vector< std::vector< TokenLen > > * od_index_cnt
Definition index.h:78
std::vector< std::vector< unsigned int > > * ods_rids
Definition index.h:68
~SetJoinParelledIndex()=default
SetJoinParelledIndex(SetJoinParelledIndex &&other)=delete
std::vector< std::vector< unsigned int > > * od_index_hv
Definition index.h:72
std::vector< std::vector< unsigned int > > * parts_index_offset
Definition index.h:73
std::vector< std::vector< unsigned int > > * parts_index_hv
Definition index.h:71
SetJoinParelledIndex(const SetJoinParelledIndex &other)=delete
SetJoinParelledIndex()=default
Definition index.h:40
WeightPairEdit()=default
bool operator<(const WeightPairEdit &rhs) const
Definition index.h:52
int val
Definition index.h:43
WeightPairEdit(int _id1, int _id2, int _val)
Definition index.h:46
int id1
Definition index.h:41
int id2
Definition index.h:42
Definition index.h:22
bool operator<(const WeightPair &rhs) const
Definition index.h:34
int id1
Definition index.h:23
WeightPair(int _id1, int _id2, double _val)
Definition index.h:28
WeightPair()=default
int id2
Definition index.h:24
double val
Definition index.h:25
Definition index.h:105
hashValue()
Definition index.h:111
int stPos
Definition index.h:107
hashValue(int in_id, int in_stPos, int in_enPos, const std::string &str)
Definition index.h:114
void update(int newid, int newStPos, int newEnPos, const std::string &str, uint64_t *power)
Definition index.h:120
int id
Definition index.h:106
uint64_t strHash(const std::string &str, int stPos, int len)
Definition index.h:143
uint64_t value
Definition index.h:109
hashValue(int in_id, int in_stPos, int in_enPos, uint64_t in_value)
Definition index.h:112
int enPos
Definition index.h:108