Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
simfunc.h
Go to the documentation of this file.
1/*
2 * author: Yunqi Li
3 * contact: liyunqixa@gmail.com
4 */
5#ifndef _SIM_FUNC_H_
6#define _SIM_FUNC_H_
7
8#include "type.h"
9#include <vector>
10#include <cstdlib>
11#include <cstdint>
12#include <cmath>
13#include <cstdio>
14#include <string>
15#include <algorithm>
16#include <sstream>
17#include <sys/time.h>
18#include <parallel/algorithm>
19
20#define ISZERO(X) (std::abs(X) <= 1e-5) // true when |X| <= 1e-5; parenthesized so that e.g. !ISZERO(x) negates the whole comparison
21
22
23/*
24 * A set of similarity functions
25 */
26class SimFuncs
27{
28public:
29 SimFuncs() = default;
30 ~SimFuncs() = default;
31 SimFuncs(const SimFuncs &other) = delete;
32 SimFuncs(SimFuncs &&other) = delete;
33
34public:
35 // helpers
36 static int overlap(const std::vector<ui> &v1, const std::vector<ui> &v2);
37 static double weightedOverlap(const std::vector<ui> &v1, const std::vector<ui> &v2,
38 const std::vector<double> &wordwt);
39
40 static int levDist(const std::string &v1, const std::string &v2);
41 static double inverseSqrt(double number);
42 static int tripletMin(int a, int b, int c);
43
44 // four main similarity functions
45 // the weighted variants differ from the classical weighted functions
46 // wordwt: word weights
47 // v1rw, v2rw: the weight of each record, i.e. the sum of the weights of its words
48
49 // jaccard: |A \cap B| / (|A| + |B| - |A \cap B|)
50 static double jaccard(const std::vector<ui> &v1, const std::vector<ui> &v2);
51 static double jaccard(const std::vector<ui> &v1, const std::vector<ui> &v2, int ovlp);
52 static double weightedJaccard(const std::vector<ui> &v1, const std::vector<ui> &v2,
53 const std::vector<double> &wordwt,
54 double v1rw, double v2rw);
55 static double weightedJaccard(double v1rw, double v2rw, double ovlp);
56
57 // cosine: |A \cap B| / sqrt(|A| * |B|)
58 static double cosine(const std::vector<ui> &v1, const std::vector<ui> &v2);
59 static double cosine(const std::vector<ui> &v1, const std::vector<ui> &v2, int ovlp);
60 static double weightedCosine(const std::vector<ui> &v1, const std::vector<ui> &v2,
61 const std::vector<double> &wordwt,
62 double v1rw, double v2rw);
63 static double weightedCosine(double v1rw, double v2rw, double ovlp);
64
65 // dice: 2 * |A \cap B| / (|A| + |B|)
66 static double dice(const std::vector<ui> &v1, const std::vector<ui> &v2);
67 static double dice(const std::vector<ui> &v1, const std::vector<ui> &v2, int ovlp);
68 static double weightedDice(const std::vector<ui> &v1, const std::vector<ui> &v2,
69 const std::vector<double> &wordwt,
70 double v1rw, double v2rw);
71 static double weightedDice(double v1rw, double v2rw, double ovlp);
72
73 // overlap coefficient: |A \cap B| / min(|A|, |B|)
74 static double overlapCoeff(const std::vector<ui> &v1, const std::vector<ui> &v2);
75 static double overlapCoeff(const std::vector<ui> &v1, const std::vector<ui> &v2, int ovlp);
76 static double weightedOverlapCoeff(const std::vector<ui> &v1, const std::vector<ui> &v2,
77 const std::vector<double> &wordwt,
78 double v1rw, double v2rw);
79 static double weightedOverlapCoeff(double v1rw, double v2rw, double ovlp);
80
81 // other functions that operate on raw strings without tokenization
82 // these cannot be applied to long strings (str_bt_1w_5w to str_gt_10w),
83 // since the running time of most of these functions (joins) increases sharply with string length
84 static double levSim(const std::string &v1, const std::string &v2);
85 static bool exactMatch(const std::string &s1, const std::string &s2);
86 static double absoluteNorm(const std::string &s1, const std::string &s2);
87 // the following are not used in blocking
88 // code: https://github.com/sanket143/Jaro-Winkler/blob/master/cpp/jwdistance.h
89 static double jaroWinkler(const std::string &s1, const std::string &s2);
90 static double mongeElkan(const std::string &s1, const std::string &s2);
91};
92
93
94/*
95 * Most of the time the records are vectors of unsigned int,
96 * but in some parts, such as feature extraction, the four set similarity functions
97 * may accept records that are vectors of strings.
98 * Thus, only these four similarity functions are provided as templates, which does not affect
99 * the other parts.
100 */
101class SimFuncsTemplate
102{
103public:
104 SimFuncsTemplate() = default;
105 ~SimFuncsTemplate() = default;
106 SimFuncsTemplate(const SimFuncsTemplate &other) = delete;
107 SimFuncsTemplate(SimFuncsTemplate &&other) = delete;
108
109public:
110 template<typename T>
111 static int overlap(const std::vector<T> &v1, const std::vector<T> &v2);
112
113 template<typename T>
114 static double jaccard(const std::vector<T> &v1, const std::vector<T> &v2);
115 template<typename T>
116 static double jaccard(const std::vector<T> &v1, const std::vector<T> &v2, int ovlp);
117
118 template<typename T>
119 static double cosine(const std::vector<T> &v1, const std::vector<T> &v2);
120 template<typename T>
121 static double cosine(const std::vector<T> &v1, const std::vector<T> &v2, int ovlp);
122
123 template<typename T>
124 static double dice(const std::vector<T> &v1, const std::vector<T> &v2);
125 template<typename T>
126 static double dice(const std::vector<T> &v1, const std::vector<T> &v2, int ovlp);
127
128 template<typename T>
129 static double overlapCoeff(const std::vector<T> &v1, const std::vector<T> &v2);
130 template<typename T>
131 static double overlapCoeff(const std::vector<T> &v1, const std::vector<T> &v2, int ovlp);
132};
133
134
135// TODO: not implemented yet; TFIDF is currently an empty placeholder class with no members.
136class TFIDF
137{
138
139};
140
141
142#endif // _SIM_FUNC_H_
std::vector< std::vector< double > > wordwt
Definition blocker_config.cc:23
SimFuncsTemplate
Definition simfunc.h:102
static double jaccard(const std::vector< T > &v1, const std::vector< T > &v2)
Definition simfunc.cc:431
static double cosine(const std::vector< T > &v1, const std::vector< T > &v2)
Definition simfunc.cc:452
~SimFuncsTemplate()=default
SimFuncsTemplate(SimFuncsTemplate &&other)=delete
static double overlapCoeff(const std::vector< T > &v1, const std::vector< T > &v2)
Definition simfunc.cc:494
SimFuncsTemplate()=default
static int overlap(const std::vector< T > &v1, const std::vector< T > &v2)
Definition simfunc.cc:421
static double dice(const std::vector< T > &v1, const std::vector< T > &v2)
Definition simfunc.cc:473
SimFuncsTemplate(const SimFuncsTemplate &other)=delete
SimFuncs
Definition simfunc.h:27
static double weightedCosine(const std::vector< ui > &v1, const std::vector< ui > &v2, const std::vector< double > &wordwt, double v1rw, double v2rw)
Definition simfunc.cc:186
static double absoluteNorm(const std::string &s1, const std::string &s2)
Definition simfunc.cc:297
SimFuncs(SimFuncs &&other)=delete
static double jaccard(const std::vector< ui > &v1, const std::vector< ui > &v2)
Definition simfunc.cc:127
static double weightedOverlap(const std::vector< ui > &v1, const std::vector< ui > &v2, const std::vector< double > &wordwt)
Definition simfunc.cc:43
static double inverseSqrt(double number)
Definition simfunc.cc:75
static int levDist(const std::string &v1, const std::string &v2)
Definition simfunc.cc:88
static double mongeElkan(const std::string &s1, const std::string &s2)
Definition simfunc.cc:387
static bool exactMatch(const std::string &s1, const std::string &s2)
Definition simfunc.cc:292
static double weightedDice(const std::vector< ui > &v1, const std::vector< ui > &v2, const std::vector< double > &wordwt, double v1rw, double v2rw)
Definition simfunc.cc:227
static int tripletMin(int a, int b, int c)
Definition simfunc.cc:122
static double weightedJaccard(const std::vector< ui > &v1, const std::vector< ui > &v2, const std::vector< double > &wordwt, double v1rw, double v2rw)
Definition simfunc.cc:145
~SimFuncs()=default
static double overlapCoeff(const std::vector< ui > &v1, const std::vector< ui > &v2)
Definition simfunc.cc:247
SimFuncs(const SimFuncs &other)=delete
static double levSim(const std::string &v1, const std::string &v2)
Definition simfunc.cc:285
SimFuncs()=default
static double jaroWinkler(const std::string &s1, const std::string &s2)
Definition simfunc.cc:318
static double cosine(const std::vector< ui > &v1, const std::vector< ui > &v2)
Definition simfunc.cc:165
static double dice(const std::vector< ui > &v1, const std::vector< ui > &v2)
Definition simfunc.cc:209
static double weightedOverlapCoeff(const std::vector< ui > &v1, const std::vector< ui > &v2, const std::vector< double > &wordwt, double v1rw, double v2rw)
Definition simfunc.cc:265
static int overlap(const std::vector< ui > &v1, const std::vector< ui > &v2)
Definition simfunc.cc:10
TFIDF
Definition simfunc.h:137