Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
feature_utils.h
Go to the documentation of this file.
1/*
2 * author: Yunqi Li
3 * contact: liyunqixa@gmail.com
4 */
5/*
6 * This file is meant to be included only by feature.cpp and related .hpp files.
7 * The functions declared here are not inlined.
8 * This file contains overloads of the similarity functions in SimFunc.cpp:
9 * the versions that take vector<string> rather than vector<ui>.
10 */
11#ifndef _FEATURE_UTILS_H_
12#define _FEATURE_UTILS_H_
13
14#include "common/type.h"
15#include "common/tokenizer.h"
16#include <algorithm>
17#include <string>
18#include <sstream>
19#include <vector>
20#include <math.h>
21
22
23class FeatureUtils
24{
25public:
26 static std::string delims;
27 static double NaN;
28
29public:
30 FeatureUtils() = default;
31 ~FeatureUtils() = default;
32 FeatureUtils(const FeatureUtils &other) = delete;
33 FeatureUtils(FeatureUtils &&other) = delete;
34
35public:
36 /*
37 * The implementation follows the py_entitymatching package.
38 * For the similarity functions, if one of the input records is empty,
39 * the function returns NaN, which is a large negative value.
40 * This differs from simfunc.cc.
41 */
42 static int overlap(const std::vector<std::string> &v1, const std::vector<std::string> &v2);
43 // This should only be used for feature value calculation in feature.cpp.
44 // It is captured by SetJoinFunc.
45 static double overlapD(const std::vector<std::string> &v1, const std::vector<std::string> &v2);
46
47 static int tripletMin(int a, int b, int c);
48 static double levDist(const std::string &v1, const std::string &v2);
49
50 static double jaccard(const std::vector<std::string> &v1, const std::vector<std::string> &v2);
51 static double jaccard(const std::vector<std::string> &v1, const std::vector<std::string> &v2, int ovlp);
52
53 static double cosine(const std::vector<std::string> &v1, const std::vector<std::string> &v2);
54 static double cosine(const std::vector<std::string> &v1, const std::vector<std::string> &v2, int ovlp);
55
56 static double dice(const std::vector<std::string> &v1, const std::vector<std::string> &v2);
57 static double dice(const std::vector<std::string> &v1, const std::vector<std::string> &v2, int ovlp);
58
59 static double overlapCoeff(const std::vector<std::string> &v1, const std::vector<std::string> &v2);
60 static double overlapCoeff(const std::vector<std::string> &v1, const std::vector<std::string> &v2, int ovlp);
61
62 static double exactMatch(const std::string &s1, const std::string &s2);
63
64 static double absoluteNorm(const std::string &s1, const std::string &s2);
65
66 static void stringSplit(std::string str, char delim, std::vector<std::string> &res);
67
68 static void tokenize(const std::string &str, TokenizerType type, std::vector<std::string> &tokens);
69};
70
71
72#endif // _FEATURE_UTILS_H_
Definition feature_utils.h:24
~FeatureUtils()=default
static int tripletMin(int a, int b, int c)
Definition feature_utils.cc:39
static void stringSplit(std::string str, char delim, std::vector< std::string > &res)
Definition feature_utils.cc:189
static double jaccard(const std::vector< std::string > &v1, const std::vector< std::string > &v2)
Definition feature_utils.cc:77
static void tokenize(const std::string &str, TokenizerType type, std::vector< std::string > &tokens)
Definition feature_utils.cc:198
static double dice(const std::vector< std::string > &v1, const std::vector< std::string > &v2)
Definition feature_utils.cc:119
static std::string delims
Definition feature_utils.h:26
FeatureUtils(FeatureUtils &&other)=delete
static double exactMatch(const std::string &s1, const std::string &s2)
Definition feature_utils.cc:159
FeatureUtils()=default
static int overlap(const std::vector< std::string > &v1, const std::vector< std::string > &v2)
Definition feature_utils.cc:11
static double levDist(const std::string &v1, const std::string &v2)
Definition feature_utils.cc:45
static double NaN
Definition feature_utils.h:27
static double overlapD(const std::vector< std::string > &v1, const std::vector< std::string > &v2)
Definition feature_utils.cc:25
static double cosine(const std::vector< std::string > &v1, const std::vector< std::string > &v2)
Definition feature_utils.cc:97
static double overlapCoeff(const std::vector< std::string > &v1, const std::vector< std::string > &v2)
Definition feature_utils.cc:139
static double absoluteNorm(const std::string &s1, const std::string &s2)
Definition feature_utils.cc:168
FeatureUtils(const FeatureUtils &other)=delete
TokenizerType
Definition type.h:39