Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
SetJoinParallel Class Reference

#include <setjoin_parallel.h>

Public Member Functions

 SetJoinParallel (const std::vector< std::vector< ui > > &sorted_records, const std::vector< double > &recwt, const std::vector< double > &_wordwt, double _det, ui _maxHeapSize=0, bool _isWeightedComp=false)
 
 SetJoinParallel (const std::vector< std::vector< ui > > &work_records, const std::vector< std::vector< ui > > &query_records, const std::vector< double > &workwt, const std::vector< double > &querywt, const std::vector< double > &_wordwt, double _det, ui _maxHeapSize=0, bool _isWeightedComp=false)
 
 ~SetJoinParallel ()=default
 
void showPara () const
 
void resizeData (std::vector< std::vector< ui > > &dataset)
 
void reportTimeCost ()
 
void reportLargestGroup ()
 
unsigned long long getResultPairsAmount ()
 
void mergeResults (std::vector< std::pair< int, int > > &finalPairs)
 
bool overlapSelf (ui x, ui y, int posx=0, int posy=0, int current_overlap=0)
 
bool overlapSelfIC (ui x, ui y, int posx=0, int posy=0, int current_overlap=0)
 
bool overlapRS (ui x, ui y, int posx=0, int posy=0, int current_overlap=0)
 
bool overlapRSIC (ui x, ui y, int posx=0, int posy=0, int current_overlap=0)
 
double weightedOverlap (ui x, ui y)
 
double weightedJaccard (ui x, ui y)
 
double jaccard (ui x, ui y)
 
double weightedCosine (ui x, ui y)
 
double cosine (ui x, ui y)
 
double weightedDice (ui x, ui y)
 
double dice (ui x, ui y)
 
void index (double threshold)
 
void GreedyFindCandidateAndSimPairs (const int &tid, const int indexLenGrp, const ui rid, ui record_length, const std::vector< ui > &p_keys, const std::vector< ui > &od_keys, const std::vector< ui > &odk_st)
 
void findSimPairsSelf ()
 
void findSimPairsRS ()
 

Public Attributes

bool ifRS = false
 
int earlyTerminated [MAXTHREADNUM] = { 0 }
 
int earlyTerminatedEmpty [MAXTHREADNUM] = { 0 }
 
SimFuncType simFType {SimFuncType::JACCARD}
 
std::string typeMap [3] = {"Jaccard", "Cosine", "Dice"}
 
double(SetJoinParallel::* weightedFunc )(ui, ui) = nullptr
 
double(SetJoinParallel::* normalFunc )(ui, ui) = nullptr
 
bool(SetJoinParallel::* overlapFunc )(ui, ui, int, int, int) = nullptr
 
double det
 
uint64_t resultNum = 0
 
uint64_t candidateNum = 0
 
uint64_t listlens = 0
 
ui maxIndexPartNum {0}
 
std::vector< std::vector< ui > > work_dataset
 
std::vector< std::vector< ui > > query_dataset
 
std::vector< double > work_weights
 
std::vector< double > query_weights
 
std::vector< double > wordwt
 
std::vector< ui > workEmpty
 
std::vector< ui > queryEmpty
 
std::vector< ui > workLength
 
double coe {0.0}
 
double coePart {0.0}
 
double ALPHA {0.0}
 
ui work_n {0}
 
ui query_n {0}
 
ui work_maxSize {0}
 
ui work_minSize {0}
 
ui query_maxSize {0}
 
ui query_minSize {0}
 
ui maxHeapSize {0}
 
std::vector< std::pair< int, int > > result_pairs [MAXTHREADNUM]
 
std::vector< std::pair< int, int > > emptyPairs [MAXTHREADNUM]
 
bool isWeightedComp {false}
 
std::vector< WeightPair > result_pairs_ [MAXTHREADNUM]
 
int isHeap [MAXTHREADNUM] = { 0 }
 
double index_cost
 
double search_cost
 
double hashInFind_cost [MAXTHREADNUM]
 
double mem_cost [MAXTHREADNUM]
 
double find_cost [MAXTHREADNUM]
 
double alloc_cost [MAXTHREADNUM]
 
double verif_cost [MAXTHREADNUM]
 
bool flagIC {false}
 
std::vector< int > grpIdA
 
std::vector< int > grpIdB
 
std::vector< std::vector< int > > groupA
 
std::vector< std::vector< int > > groupB
 
std::vector< ui > revIdMapA
 
std::vector< ui > revIdMapB
 
std::vector< ui > idMapA
 
std::vector< ui > idMapB
 
double ** featureValueCache {nullptr}
 
int * discreteCacheIdx {nullptr}
 

Constructor & Destructor Documentation

◆ SetJoinParallel() [1/2]

SetJoinParallel::SetJoinParallel ( const std::vector< std::vector< ui > > & sorted_records,
const std::vector< double > & recwt,
const std::vector< double > & _wordwt,
double _det,
ui _maxHeapSize = 0,
bool _isWeightedComp = false )
inline

◆ SetJoinParallel() [2/2]

SetJoinParallel::SetJoinParallel ( const std::vector< std::vector< ui > > & work_records,
const std::vector< std::vector< ui > > & query_records,
const std::vector< double > & workwt,
const std::vector< double > & querywt,
const std::vector< double > & _wordwt,
double _det,
ui _maxHeapSize = 0,
bool _isWeightedComp = false )
inline

◆ ~SetJoinParallel()

SetJoinParallel::~SetJoinParallel ( )
default

Member Function Documentation

◆ cosine()

double SetJoinParallel::cosine ( ui x,
ui y )
inline

◆ dice()

double SetJoinParallel::dice ( ui x,
ui y )
inline

◆ findSimPairsRS()

void SetJoinParallel::findSimPairsRS ( )

◆ findSimPairsSelf()

void SetJoinParallel::findSimPairsSelf ( )

◆ getResultPairsAmount()

unsigned long long SetJoinParallel::getResultPairsAmount ( )
inline

◆ GreedyFindCandidateAndSimPairs()

void SetJoinParallel::GreedyFindCandidateAndSimPairs ( const int & tid,
const int indexLenGrp,
const ui rid,
ui record_length,
const std::vector< ui > & p_keys,
const std::vector< ui > & od_keys,
const std::vector< ui > & odk_st )

◆ index()

void SetJoinParallel::index ( double threshold)

◆ jaccard()

double SetJoinParallel::jaccard ( ui x,
ui y )
inline

◆ mergeResults()

void SetJoinParallel::mergeResults ( std::vector< std::pair< int, int > > & finalPairs)
inline

◆ overlapRS()

bool SetJoinParallel::overlapRS ( ui x,
ui y,
int posx = 0,
int posy = 0,
int current_overlap = 0 )
inline

◆ overlapRSIC()

bool SetJoinParallel::overlapRSIC ( ui x,
ui y,
int posx = 0,
int posy = 0,
int current_overlap = 0 )
inline

◆ overlapSelf()

bool SetJoinParallel::overlapSelf ( ui x,
ui y,
int posx = 0,
int posy = 0,
int current_overlap = 0 )
inline

◆ overlapSelfIC()

bool SetJoinParallel::overlapSelfIC ( ui x,
ui y,
int posx = 0,
int posy = 0,
int current_overlap = 0 )
inline

◆ reportLargestGroup()

void SetJoinParallel::reportLargestGroup ( )
inline

◆ reportTimeCost()

void SetJoinParallel::reportTimeCost ( )
inline

◆ resizeData()

void SetJoinParallel::resizeData ( std::vector< std::vector< ui > > & dataset)
inline

◆ showPara()

void SetJoinParallel::showPara ( ) const
inline

◆ weightedCosine()

double SetJoinParallel::weightedCosine ( ui x,
ui y )
inline

◆ weightedDice()

double SetJoinParallel::weightedDice ( ui x,
ui y )
inline

◆ weightedJaccard()

double SetJoinParallel::weightedJaccard ( ui x,
ui y )
inline

◆ weightedOverlap()

double SetJoinParallel::weightedOverlap ( ui x,
ui y )
inline

Member Data Documentation

◆ alloc_cost

double SetJoinParallel::alloc_cost[MAXTHREADNUM]

◆ ALPHA

double SetJoinParallel::ALPHA {0.0}

◆ candidateNum

uint64_t SetJoinParallel::candidateNum = 0

◆ coe

double SetJoinParallel::coe {0.0}

◆ coePart

double SetJoinParallel::coePart {0.0}

◆ det

double SetJoinParallel::det

◆ discreteCacheIdx

int* SetJoinParallel::discreteCacheIdx {nullptr}

◆ earlyTerminated

int SetJoinParallel::earlyTerminated[MAXTHREADNUM] = { 0 }

◆ earlyTerminatedEmpty

int SetJoinParallel::earlyTerminatedEmpty[MAXTHREADNUM] = { 0 }

◆ emptyPairs

std::vector<std::pair<int, int> > SetJoinParallel::emptyPairs[MAXTHREADNUM]

◆ featureValueCache

double** SetJoinParallel::featureValueCache {nullptr}

◆ find_cost

double SetJoinParallel::find_cost[MAXTHREADNUM]

◆ flagIC

bool SetJoinParallel::flagIC {false}

◆ groupA

std::vector<std::vector<int> > SetJoinParallel::groupA

◆ groupB

std::vector<std::vector<int> > SetJoinParallel::groupB

◆ grpIdA

std::vector<int> SetJoinParallel::grpIdA

◆ grpIdB

std::vector<int> SetJoinParallel::grpIdB

◆ hashInFind_cost

double SetJoinParallel::hashInFind_cost[MAXTHREADNUM]

◆ idMapA

std::vector<ui> SetJoinParallel::idMapA

◆ idMapB

std::vector<ui> SetJoinParallel::idMapB

◆ ifRS

bool SetJoinParallel::ifRS = false

◆ index_cost

double SetJoinParallel::index_cost

◆ isHeap

int SetJoinParallel::isHeap[MAXTHREADNUM] = { 0 }

◆ isWeightedComp

bool SetJoinParallel::isWeightedComp {false}

◆ listlens

uint64_t SetJoinParallel::listlens = 0

◆ maxHeapSize

ui SetJoinParallel::maxHeapSize {0}

◆ maxIndexPartNum

ui SetJoinParallel::maxIndexPartNum {0}

◆ mem_cost

double SetJoinParallel::mem_cost[MAXTHREADNUM]

◆ normalFunc

double(SetJoinParallel::* SetJoinParallel::normalFunc) (ui, ui) = nullptr

◆ overlapFunc

bool(SetJoinParallel::* SetJoinParallel::overlapFunc) (ui, ui, int, int, int) = nullptr

◆ query_dataset

std::vector<std::vector<ui> > SetJoinParallel::query_dataset

◆ query_maxSize

ui SetJoinParallel::query_maxSize {0}

◆ query_minSize

ui SetJoinParallel::query_minSize {0}

◆ query_n

ui SetJoinParallel::query_n {0}

◆ query_weights

std::vector<double> SetJoinParallel::query_weights

◆ queryEmpty

std::vector<ui> SetJoinParallel::queryEmpty

◆ result_pairs

std::vector<std::pair<int, int> > SetJoinParallel::result_pairs[MAXTHREADNUM]

◆ result_pairs_

std::vector<WeightPair> SetJoinParallel::result_pairs_[MAXTHREADNUM]

◆ resultNum

uint64_t SetJoinParallel::resultNum = 0

◆ revIdMapA

std::vector<ui> SetJoinParallel::revIdMapA

◆ revIdMapB

std::vector<ui> SetJoinParallel::revIdMapB

◆ search_cost

double SetJoinParallel::search_cost

◆ simFType

SimFuncType SetJoinParallel::simFType {SimFuncType::JACCARD}

◆ typeMap

std::string SetJoinParallel::typeMap[3] = {"Jaccard", "Cosine", "Dice"}

◆ verif_cost

double SetJoinParallel::verif_cost[MAXTHREADNUM]

◆ weightedFunc

double(SetJoinParallel::* SetJoinParallel::weightedFunc) (ui, ui) = nullptr

◆ wordwt

std::vector<double> SetJoinParallel::wordwt

◆ work_dataset

std::vector<std::vector<ui> > SetJoinParallel::work_dataset

◆ work_maxSize

ui SetJoinParallel::work_maxSize {0}

◆ work_minSize

ui SetJoinParallel::work_minSize {0}

◆ work_n

ui SetJoinParallel::work_n {0}

◆ work_weights

std::vector<double> SetJoinParallel::work_weights

◆ workEmpty

std::vector<ui> SetJoinParallel::workEmpty

◆ workLength

std::vector<ui> SetJoinParallel::workLength

The documentation for this class was generated from the following files: