Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
StringJoinParallel Class Reference

#include <stringjoin_parallel.h>

Public Types

using InvListsParallel = std::unordered_map<uint64_t, std::vector<int>>
 
using InvListPrefix = std::unordered_map<uint64_t, std::vector<std::pair<int, int>>>
 

Public Member Functions

 StringJoinParallel ()=default
 
 StringJoinParallel (const std::vector< std::string > &data, int threshold, ui _maxHeapSize=0)
 
 StringJoinParallel (const std::vector< std::string > &work, const std::vector< std::string > &query, int threshold, ui _maxHeapSize=0)
 
 ~StringJoinParallel ()
 
 StringJoinParallel (const StringJoinParallel &other)=delete
 
 StringJoinParallel (StringJoinParallel &&other)=delete
 
void init ()
 
void prepareSelf ()
 
void prepareRS ()
 
bool verifyLeftPartSelf (int xid, int yid, int xlen, int ylen, int Tau, int tid, int sharing=0)
 
bool verifyRightPartSelf (int xid, int yid, int xlen, int ylen, int xpos, int ypos, int Tau, int tid, int sharing=0)
 
bool verifyLeftPartRS (int xid, int yid, int xlen, int ylen, int Tau, int tid, int sharing=0)
 
bool verifyRightPartRS (int xid, int yid, int xlen, int ylen, int xpos, int ypos, int Tau, int tid, int sharing=0)
 
bool iterativeVerifyLeftPartRS (int xid, int yid, int stPos, int xlen, int ylen, int wlen, int taul, int partId)
 
bool iterativeVerifyRightPartRS (int xid, int yid, int xpos, int ypos, int xlen, int ylen, int partId, int tid)
 
void selfJoin (std::vector< std::pair< int, int > > &finalPairs)
 
void RSJoin (std::vector< std::pair< int, int > > &finalPairs)
 
void checkSelfResults () const
 
void printDebugInfo (int currLen) const
 

Public Attributes

int sharePrefix {0}
 
std::vector< uint64_t > workPrefixHash
 
std::vector< uint64_t > queryPrefixHash
 
int earlyTerminated [MAXTHREADNUM] = { 0 }
 
int workMinDictLen {19260817}
 
int workMaxDictLen {0}
 
int queryMinDictLen {19260817}
 
int queryMaxDictLen {0}
 
int maxDictLen {0}
 
int minDictLen {0}
 
uint64_t avgDictLen {0}
 
int workN {0}
 
int queryN {0}
 
int N {0}
 
int D {0}
 
int PN {0}
 
int hashNumber {31}
 
int modNumber {1000000007}
 
std::vector< std::string > work_dataset
 
std::vector< std::string > query_dataset
 
std::vector< std::pair< int, int > > pairs [MAXTHREADNUM]
 
uint64_t candNum {0}
 
uint64_t veriNum {0}
 
uint64_t listNum {0}
 
uint64_t realNum {0}
 
bool valid [MAXTHREADNUM]
 
int left [MAXTHREADNUM]
 
int right [MAXTHREADNUM]
 
int _left [MAXTHREADNUM]
 
int _right [MAXTHREADNUM]
 
int ** matrix [MAXTHREADNUM]
 
int ** _matrix [MAXTHREADNUM]
 
bool * quickRef [MAXTHREADNUM]
 
int ** partLen {nullptr}
 
int ** partPos {nullptr}
 
int * dist {nullptr}
 
std::vector< int > workLengthArray
 
std::vector< int > queryLengthArray
 
std::vector< std::vector< int > > worklengthMap
 
std::vector< std::vector< int > > querylengthMap
 
uint64_t * power {nullptr}
 
InvListsParallel ** invLists {nullptr}
 
InvListPrefix ** invListsPre {nullptr}
 
std::vector< PIndex > ** partIndex {nullptr}
 
hashValue * hv [MAXTHREADNUM]
 
std::unordered_set< std::string > strCount
 
int * workInvSC {nullptr}
 
int * queryInvSC {nullptr}
 
ui maxHeapSize {0}
 

Member Typedef Documentation

◆ InvListPrefix

using StringJoinParallel::InvListPrefix = std::unordered_map<uint64_t, std::vector<std::pair<int, int>>>

◆ InvListsParallel

using StringJoinParallel::InvListsParallel = std::unordered_map<uint64_t, std::vector<int>>

Constructor & Destructor Documentation

◆ StringJoinParallel() [1/5]

StringJoinParallel::StringJoinParallel ( )
default

◆ StringJoinParallel() [2/5]

StringJoinParallel::StringJoinParallel ( const std::vector< std::string > & data,
int threshold,
ui _maxHeapSize = 0 )
inline

◆ StringJoinParallel() [3/5]

StringJoinParallel::StringJoinParallel ( const std::vector< std::string > & work,
const std::vector< std::string > & query,
int threshold,
ui _maxHeapSize = 0 )
inline

◆ ~StringJoinParallel()

StringJoinParallel::~StringJoinParallel ( )
inline

◆ StringJoinParallel() [4/5]

StringJoinParallel::StringJoinParallel ( const StringJoinParallel & other)
delete

◆ StringJoinParallel() [5/5]

StringJoinParallel::StringJoinParallel ( StringJoinParallel && other)
delete

Member Function Documentation

◆ checkSelfResults()

void StringJoinParallel::checkSelfResults ( ) const

◆ init()

void StringJoinParallel::init ( )

◆ iterativeVerifyLeftPartRS()

bool StringJoinParallel::iterativeVerifyLeftPartRS ( int xid,
int yid,
int stPos,
int xlen,
int ylen,
int wlen,
int taul,
int partId )
inline

◆ iterativeVerifyRightPartRS()

bool StringJoinParallel::iterativeVerifyRightPartRS ( int xid,
int yid,
int xpos,
int ypos,
int xlen,
int ylen,
int partId,
int tid )
inline

◆ prepareRS()

void StringJoinParallel::prepareRS ( )

◆ prepareSelf()

void StringJoinParallel::prepareSelf ( )

◆ printDebugInfo()

void StringJoinParallel::printDebugInfo ( int currLen) const

◆ RSJoin()

void StringJoinParallel::RSJoin ( std::vector< std::pair< int, int > > & finalPairs)

◆ selfJoin()

void StringJoinParallel::selfJoin ( std::vector< std::pair< int, int > > & finalPairs)

◆ verifyLeftPartRS()

bool StringJoinParallel::verifyLeftPartRS ( int xid,
int yid,
int xlen,
int ylen,
int Tau,
int tid,
int sharing = 0 )
inline

◆ verifyLeftPartSelf()

bool StringJoinParallel::verifyLeftPartSelf ( int xid,
int yid,
int xlen,
int ylen,
int Tau,
int tid,
int sharing = 0 )
inline

◆ verifyRightPartRS()

bool StringJoinParallel::verifyRightPartRS ( int xid,
int yid,
int xlen,
int ylen,
int xpos,
int ypos,
int Tau,
int tid,
int sharing = 0 )
inline

◆ verifyRightPartSelf()

bool StringJoinParallel::verifyRightPartSelf ( int xid,
int yid,
int xlen,
int ylen,
int xpos,
int ypos,
int Tau,
int tid,
int sharing = 0 )
inline

Member Data Documentation

◆ _left

int StringJoinParallel::_left[MAXTHREADNUM]

◆ _matrix

int** StringJoinParallel::_matrix[MAXTHREADNUM]

◆ _right

int StringJoinParallel::_right[MAXTHREADNUM]

◆ avgDictLen

uint64_t StringJoinParallel::avgDictLen {0}

◆ candNum

uint64_t StringJoinParallel::candNum {0}

◆ D

int StringJoinParallel::D {0}

◆ dist

int* StringJoinParallel::dist {nullptr}

◆ earlyTerminated

int StringJoinParallel::earlyTerminated[MAXTHREADNUM] = { 0 }

◆ hashNumber

int StringJoinParallel::hashNumber {31}

◆ hv

hashValue* StringJoinParallel::hv[MAXTHREADNUM]

◆ invLists

InvListsParallel** StringJoinParallel::invLists {nullptr}

◆ invListsPre

InvListPrefix** StringJoinParallel::invListsPre {nullptr}

◆ left

int StringJoinParallel::left[MAXTHREADNUM]

◆ listNum

uint64_t StringJoinParallel::listNum {0}

◆ matrix

int** StringJoinParallel::matrix[MAXTHREADNUM]

◆ maxDictLen

int StringJoinParallel::maxDictLen {0}

◆ maxHeapSize

ui StringJoinParallel::maxHeapSize {0}

◆ minDictLen

int StringJoinParallel::minDictLen {0}

◆ modNumber

int StringJoinParallel::modNumber {1000000007}

◆ N

int StringJoinParallel::N {0}

◆ pairs

std::vector<std::pair<int, int> > StringJoinParallel::pairs[MAXTHREADNUM]

◆ partIndex

std::vector<PIndex>** StringJoinParallel::partIndex {nullptr}

◆ partLen

int** StringJoinParallel::partLen {nullptr}

◆ partPos

int** StringJoinParallel::partPos {nullptr}

◆ PN

int StringJoinParallel::PN {0}

◆ power

uint64_t* StringJoinParallel::power {nullptr}

◆ query_dataset

std::vector<std::string> StringJoinParallel::query_dataset

◆ queryInvSC

int* StringJoinParallel::queryInvSC {nullptr}

◆ queryLengthArray

std::vector<int> StringJoinParallel::queryLengthArray

◆ querylengthMap

std::vector<std::vector<int> > StringJoinParallel::querylengthMap

◆ queryMaxDictLen

int StringJoinParallel::queryMaxDictLen {0}

◆ queryMinDictLen

int StringJoinParallel::queryMinDictLen {19260817}

◆ queryN

int StringJoinParallel::queryN {0}

◆ queryPrefixHash

std::vector<uint64_t> StringJoinParallel::queryPrefixHash

◆ quickRef

bool* StringJoinParallel::quickRef[MAXTHREADNUM]

◆ realNum

uint64_t StringJoinParallel::realNum {0}

◆ right

int StringJoinParallel::right[MAXTHREADNUM]

◆ sharePrefix

int StringJoinParallel::sharePrefix {0}

◆ strCount

std::unordered_set<std::string> StringJoinParallel::strCount

◆ valid

bool StringJoinParallel::valid[MAXTHREADNUM]

◆ veriNum

uint64_t StringJoinParallel::veriNum {0}

◆ work_dataset

std::vector<std::string> StringJoinParallel::work_dataset

◆ workInvSC

int* StringJoinParallel::workInvSC {nullptr}

◆ workLengthArray

std::vector<int> StringJoinParallel::workLengthArray

◆ worklengthMap

std::vector<std::vector<int> > StringJoinParallel::worklengthMap

◆ workMaxDictLen

int StringJoinParallel::workMaxDictLen {0}

◆ workMinDictLen

int StringJoinParallel::workMinDictLen {19260817}

◆ workN

int StringJoinParallel::workN {0}

◆ workPrefixHash

std::vector<uint64_t> StringJoinParallel::workPrefixHash

The documentation for this class was generated from the following files: