Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
cal_feature.h
Go to the documentation of this file.
1/*
2 * author: Yunqi Li
3 * contact: liyunqixa@gmail.com
4 */
5/*
6 * this file can only be included in the feature.cpp & related .hpp files
7 * this file contains the calculation of feature values
8 */
9#ifndef _CAL_FEATURE_H_
10#define _CAL_FEATURE_H_
11
14
15
17{
18public:
19 // there are two versions of set join funcs in feature_utils.hpp
 20 // the version with 2 arguments is used in feature.cpp
 21 // thus the pointer only captures this version
22 typedef double (*SetJoinFunc)(const std::vector<std::string> &, const std::vector<std::string> &);
23 typedef double (*StringJoinFunc)(const std::string &, const std::string &);
24
25public:
27
28public:
29 CalculateFeature() = default;
30 ~CalculateFeature() = default;
31 CalculateFeature(const CalculateFeature &other) = delete;
33
34private:
35 // calculate original features (i.e., without interchangeable values)
36 static void calOriginalFeatures(std::vector<std::vector<double>> &featureValues, const std::string &func,
37 const std::string &lstr, const std::string &rstr,
38 const std::vector<std::string> &ltokens,
39 const std::vector<std::string> &rtokens,
40 bool isCoeff = false);
 41 // calculate features when one side has interchangeable values
42 // tokens, cltid: the entity does not have interchangeable values
43 // ictokens, iccltid: the entity has interchangeable values
44 static void calOneSideFeatures(std::vector<std::vector<double>> &featureValues, SetJoinFunc setJoinP, const std::string &tok,
45 const std::vector<std::string> &tokens, const std::vector<std::string> &ictokens,
46 const FeatureIndex::GroupToken &curGrpDlm, const FeatureIndex::GroupToken &curGrpQgm,
47 int cltid, int iccltid);
48
49 static void calOneSideFeatures(std::vector<std::vector<double>> &featureValues, StringJoinFunc stringJoinP, const std::string &tok,
50 const std::string &str, const std::string &icstr, const FeatureIndex::Group &curGrp,
51 int cltid, int iccltid, const std::string &func);
52
53 // double side
54 static void calDoubleSideFeatures(std::vector<std::vector<double>> &featureValues, SetJoinFunc setjoinP, const std::string &tok,
55 const std::vector<std::string> &ltokens, const std::vector<std::string> &rtokens,
56 const FeatureIndex::GroupToken &curGrpDlm, const FeatureIndex::GroupToken &curGrpQgm,
57 int lcltid, int rcltid, int *const &curDCIdx, double ***const &curCache,
58 const std::vector<int> &featureLength, const std::string &func, ui attrpos);
59
60 static void calDoubleSideFeatures(std::vector<std::vector<double>> &featureValues, StringJoinFunc stringJoinP, const std::string &tok,
61 const std::string &lstr, const std::string &rstr, const FeatureIndex::Group &curGrp,
62 int lcltid, int rcltid, int *const &curDCIdx, double ***const &curCache,
63 const std::vector<int> &featureLength, const std::string &func, ui attrpos);
64
65public:
66 // isTopK indicates whether calculating features is used for top K on matchRes
 67 // if true, then the value needs to be normalized, that is, use the overlap coeff
68 static void calAll(int numFeatures, Rule *featureNames, const std::vector<std::string> &attrVec, const Table &resTable,
69 std::vector<std::vector<double>> &featureValues, const FeatureIndex::Groups &group,
70 const FeatureIndex::GroupTokens &groupTokensDlm, const FeatureIndex::GroupTokens &groupTokensQgm,
71 const FeatureIndex::Cluster &cluster, const std::vector<int> &featureLength, bool flagConsistent,
72 bool isTopK);
73
74 static void calAllWithoutInterchangeable(int numFeatures, Rule *featureNames, const std::vector<std::string> &attrVec, const Table &resTable,
75 std::vector<std::vector<double>> &featureValues, const std::vector<int> &featureLength, bool isTopK);
76};
77
78
79#endif // _CAL_FEATURE_H_
Definition cal_feature.h:17
static FeatureIndex index
Definition cal_feature.h:26
double(*) SetJoinFunc(const std::vector< std::string > &, const std::vector< std::string > &)
Definition cal_feature.h:22
static void calAllWithoutInterchangeable(int numFeatures, Rule *featureNames, const std::vector< std::string > &attrVec, const Table &resTable, std::vector< std::vector< double > > &featureValues, const std::vector< int > &featureLength, bool isTopK)
Definition cal_feature.cc:264
CalculateFeature()=default
double(*) StringJoinFunc(const std::string &, const std::string &)
Definition cal_feature.h:23
static void calAll(int numFeatures, Rule *featureNames, const std::vector< std::string > &attrVec, const Table &resTable, std::vector< std::vector< double > > &featureValues, const FeatureIndex::Groups &group, const FeatureIndex::GroupTokens &groupTokensDlm, const FeatureIndex::GroupTokens &groupTokensQgm, const FeatureIndex::Cluster &cluster, const std::vector< int > &featureLength, bool flagConsistent, bool isTopK)
Definition cal_feature.cc:143
CalculateFeature(const CalculateFeature &other)=delete
CalculateFeature(CalculateFeature &&other)=delete
~CalculateFeature()=default
Definition feature_index.h:20
std::vector< std::unordered_map< int, std::vector< std::string > > > Groups
Definition feature_index.h:26
std::unordered_map< int, std::vector< std::vector< std::string > > > GroupToken
Definition feature_index.h:27
std::unordered_map< int, std::vector< std::string > > Group
Definition feature_index.h:25
std::vector< std::unordered_map< std::string, int > > Cluster
Definition feature_index.h:31
std::vector< std::unordered_map< int, std::vector< std::vector< std::string > > > > GroupTokens
Definition feature_index.h:28
Definition dataframe.h:19
Definition dataframe.h:54
unsigned int ui
Definition type.h:8