Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
SetJoin Class Reference

#include <setjoin.h>

Classes

struct  invertedList
 
struct  invIndexStruct
 

Public Member Functions

 SetJoin ()=default
 
 SetJoin (const std::vector< std::vector< ui > > &sorted_records, const std::vector< double > &recwt, const std::vector< double > &_wordwt, std::string _sim_pairs_filepath, double _det, ui _maxHeapSize=0, bool _isWeightedComp=false)
 
 SetJoin (const std::vector< std::vector< ui > > &work_records, const std::vector< std::vector< ui > > &query_records, const std::vector< double > &workwt, const std::vector< double > &querywt, const std::vector< double > &_wordwt, const std::string &_sim_pairs_filepath, double _det, ui _maxHeapSize=0, bool _isWeightedComp=false)
 
 ~SetJoin ()
 
void loadDataset (const std::vector< std::vector< ui > > &records, std::string file)
 
void prepare (const std::vector< std::vector< ui > > &offsets, ui column)
 
void resizeData (std::vector< std::vector< ui > > &dataset)
 
double weightedOverlap (ui x, ui y)
 
double weightedJaccard (ui x, ui y)
 
double jaccard (ui x, ui y)
 
double weightedCosine (ui x, ui y)
 
double cosine (ui x, ui y)
 
double weightedDice (ui x, ui y)
 
double dice (ui x, ui y)
 
bool overlap (int x, int y, int posx=0, int posy=0, int current_overlap=0)
 
bool overlapRS (int x, int y, int posx=0, int posy=0, int current_overlap=0)
 
void setSelfJoin (double threshold, std::vector< std::pair< int, int > > &sim_pairs)
 
void setRSJoin (double threshold, std::vector< std::pair< int, int > > &sim_pairs)
 

Public Attributes

double overlap_cost = 0
 
double allocation_cost = 0
 
double index_cost = 0
 
bool ifRS = false
 
SimFuncType simFType {SimFuncType::JACCARD}
 
double(SetJoin::* weightedFunc )(ui, ui) = nullptr
 
double(SetJoin::* normalFunc )(ui, ui) = nullptr
 
bool(SetJoin::* overlapFunc )(int, int, int, int, int) = nullptr
 
std::vector< std::pair< int, int > > cacheVec
 
std::vector< std::vector< std::pair< int, int > > > indexVecs
 
double det
 
uint64_t resultNum = 0
 
uint64_t candidateNum = 0
 
uint64_t lengthSum = 0
 
uint64_t listlens = 0
 
int prime_exp [MAX_LINE_LENGTH]
 
std::vector< std::vector< ui > > dataset_all
 
std::vector< std::vector< ui > > work_dataset
 
std::vector< std::vector< ui > > query_dataset
 
std::vector< double > work_weights
 
std::vector< double > query_weights
 
std::vector< double > wordwt
 
std::vector< ui > workEmpty
 
std::vector< ui > queryEmpty
 
std::vector< std::pair< int, int > > result_pairs
 
std::string simP_file_path
 
ui maxHeapSize {0}
 
bool isWeightedComp {false}
 
std::vector< WeightPair > result_pairs_
 
int isHeap = 0
 
std::vector< invertedList > indexLists
 

Constructor & Destructor Documentation

◆ SetJoin() [1/3]

SetJoin::SetJoin ( )
default

◆ SetJoin() [2/3]

SetJoin::SetJoin ( const std::vector< std::vector< ui > > & sorted_records,
const std::vector< double > & recwt,
const std::vector< double > & _wordwt,
std::string _sim_pairs_filepath,
double _det,
ui _maxHeapSize = 0,
bool _isWeightedComp = false )
inline

◆ SetJoin() [3/3]

SetJoin::SetJoin ( const std::vector< std::vector< ui > > & work_records,
const std::vector< std::vector< ui > > & query_records,
const std::vector< double > & workwt,
const std::vector< double > & querywt,
const std::vector< double > & _wordwt,
const std::string & _sim_pairs_filepath,
double _det,
ui _maxHeapSize = 0,
bool _isWeightedComp = false )
inline

◆ ~SetJoin()

SetJoin::~SetJoin ( )
inline

Member Function Documentation

◆ cosine()

double SetJoin::cosine ( ui x,
ui y )
inline

◆ dice()

double SetJoin::dice ( ui x,
ui y )
inline

◆ jaccard()

double SetJoin::jaccard ( ui x,
ui y )
inline

◆ loadDataset()

void SetJoin::loadDataset ( const std::vector< std::vector< ui > > & records,
std::string file )
inline

◆ overlap()

bool SetJoin::overlap ( int x,
int y,
int posx = 0,
int posy = 0,
int current_overlap = 0 )

◆ overlapRS()

bool SetJoin::overlapRS ( int x,
int y,
int posx = 0,
int posy = 0,
int current_overlap = 0 )

◆ prepare()

void SetJoin::prepare ( const std::vector< std::vector< ui > > & offsets,
ui column )
inline

◆ resizeData()

void SetJoin::resizeData ( std::vector< std::vector< ui > > & dataset)
inline

◆ setRSJoin()

void SetJoin::setRSJoin ( double threshold,
std::vector< std::pair< int, int > > & sim_pairs )

◆ setSelfJoin()

void SetJoin::setSelfJoin ( double threshold,
std::vector< std::pair< int, int > > & sim_pairs )

◆ weightedCosine()

double SetJoin::weightedCosine ( ui x,
ui y )
inline

◆ weightedDice()

double SetJoin::weightedDice ( ui x,
ui y )
inline

◆ weightedJaccard()

double SetJoin::weightedJaccard ( ui x,
ui y )
inline

◆ weightedOverlap()

double SetJoin::weightedOverlap ( ui x,
ui y )
inline

Member Data Documentation

◆ allocation_cost

double SetJoin::allocation_cost = 0

◆ cacheVec

std::vector<std::pair<int, int> > SetJoin::cacheVec

◆ candidateNum

uint64_t SetJoin::candidateNum = 0

◆ dataset_all

std::vector<std::vector<ui> > SetJoin::dataset_all

◆ det

double SetJoin::det

◆ ifRS

bool SetJoin::ifRS = false

◆ index_cost

double SetJoin::index_cost = 0

◆ indexLists

std::vector<invertedList> SetJoin::indexLists

◆ indexVecs

std::vector<std::vector<std::pair<int, int> > > SetJoin::indexVecs

◆ isHeap

int SetJoin::isHeap = 0

◆ isWeightedComp

bool SetJoin::isWeightedComp {false}

◆ lengthSum

uint64_t SetJoin::lengthSum = 0

◆ listlens

uint64_t SetJoin::listlens = 0

◆ maxHeapSize

ui SetJoin::maxHeapSize {0}

◆ normalFunc

double(SetJoin::* SetJoin::normalFunc) (ui, ui) = nullptr

◆ overlap_cost

double SetJoin::overlap_cost = 0

◆ overlapFunc

bool(SetJoin::* SetJoin::overlapFunc) (int, int, int, int, int) = nullptr

◆ prime_exp

int SetJoin::prime_exp[MAX_LINE_LENGTH]

◆ query_dataset

std::vector<std::vector<ui> > SetJoin::query_dataset

◆ query_weights

std::vector<double> SetJoin::query_weights

◆ queryEmpty

std::vector<ui> SetJoin::queryEmpty

◆ result_pairs

std::vector<std::pair<int, int> > SetJoin::result_pairs

◆ result_pairs_

std::vector<WeightPair> SetJoin::result_pairs_

◆ resultNum

uint64_t SetJoin::resultNum = 0

◆ simFType

SimFuncType SetJoin::simFType {SimFuncType::JACCARD}

◆ simP_file_path

std::string SetJoin::simP_file_path

◆ weightedFunc

double(SetJoin::* SetJoin::weightedFunc) (ui, ui) = nullptr

◆ wordwt

std::vector<double> SetJoin::wordwt

◆ work_dataset

std::vector<std::vector<ui> > SetJoin::work_dataset

◆ work_weights

std::vector<double> SetJoin::work_weights

◆ workEmpty

std::vector<ui> SetJoin::workEmpty

The documentation for this class was generated from the following files: