Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
joinutil.h
Go to the documentation of this file.
1/*
2 * author: Dong Deng
3 * modified: Zhencan Peng in rutgers-db/RedPajama_Analysis
4 * modified: Yunqi Li
5 * contact: liyunqixa@gmail.com
6 */
7#ifndef _JOINUTIL_H_
8#define _JOINUTIL_H_
9
10#include "common/type.h"
11#include "common/index.h"
12#include <chrono>
13#include <ios>
14#include <iostream>
15#include <fstream>
16#include <cmath>
17#include <vector>
18#include <set>
19#include <algorithm>
20#include <sys/time.h>
21#include <sys/sysinfo.h>
22#include <omp.h>
23#include <unistd.h>
24
25
26/*
27 * Setjoin
28 */
30{
31public:
32 SetJoinUtil() = default;
33 ~SetJoinUtil() = default;
34 SetJoinUtil(const SetJoinUtil& other) = delete;
35 SetJoinUtil(SetJoinUtil&& other) = delete;
36
37public:
38 static void printMemory();
39 static void processMemUsage(double &vm_usage, double &resident_set);
/// @brief Turn on the timer.
/// @return A `std::chrono::system_clock` time point, intended to be handed
///         to repTime() later.
/// @note The type is spelled without libstdc++'s internal inline namespace
///       `_V2`; on libstdc++ it names the very same type, and it also
///       compiles with other standard libraries.
static std::chrono::system_clock::time_point logTime();
/// @brief Turn off the timer.
/// @param start A `std::chrono::system_clock` time point (the type logTime()
///              returns).
/// @return A double; presumably the time elapsed since @p start — the unit is
///         not visible in this header, confirm against joinutil.cc.
/// @note The parameter type is spelled without libstdc++'s internal inline
///       namespace `_V2`; on libstdc++ it names the very same type.
static double repTime(const std::chrono::system_clock::time_point &start);
44};
45
46
48{
49public:
54
55public:
56 static int getHowManyThreads();
57 static void printHowManyThreads();
58 static std::vector<int>
59 getUniqueInts(const std::vector<std::pair<int, int>>& pairs);
60 static void
61 mergeArrays(std::vector<std::vector<std::pair<int,int>>>* input, int arr_len,
62 std::vector<std::vector<std::pair<int,int>>> & result);
63 static ui hval(const std::pair<ui, ui> &hf, ui &word);
64 static void generateHashFunc(ui seed, std::pair<ui, ui> &hf);
65 // bottom K
66 static double shrinkBottomk(std::vector<std::vector<ui>>& bottom_ks, double ratio);
67 template<typename T>
68 static bool bottomKJaccard(const std::vector<T>& A, const std::vector<T>& B, double& thres);
69};
70
71
72/*
73 * String join
74 */
76{
77public:
78 StringJoinUtil() = default;
79 ~StringJoinUtil() = default;
80 StringJoinUtil(const StringJoinUtil& other) = delete;
81 StringJoinUtil(StringJoinUtil&& other) = delete;
82
83public:
84 static bool strLessT(const std::string &s1, const std::string &s2);
85 static uint64_t strHash(const std::string &str, int stPos, int len);
/// @brief Smallest of three integers.
/// @param a First candidate.
/// @param b Second candidate.
/// @param c Third candidate.
/// @return The minimum value among @p a, @p b and @p c.
static int min(int a, int b, int c) {
    int smallest = a;
    if (b < smallest) {
        smallest = b;
    }
    if (c < smallest) {
        smallest = c;
    }
    return smallest;
}
/// @brief Smallest value among the first three elements of an array.
/// @param arr Pointer to an array; exactly the elements arr[0], arr[1] and
///            arr[2] are read, so the caller must provide at least three.
/// @return The minimum of those three elements.
static int min(int *arr) {
    int *const past_last = arr + 3;
    return *std::min_element(arr, past_last);
}
92 static int max(int a, int b, int c);
93 static char min(char a, char b, char c);
94 static unsigned min(unsigned a, unsigned b, unsigned c);
95 static bool PIndexLess(const PIndex &p1, const PIndex &p2);
96};
97
98#endif // _JOINUTIL_H_
Definition joinutil.h:48
static void mergeArrays(std::vector< std::vector< std::pair< int, int > > > *input, int arr_len, std::vector< std::vector< std::pair< int, int > > > &result)
Definition joinutil.cc:163
static int getHowManyThreads()
Definition joinutil.cc:119
static double shrinkBottomk(std::vector< std::vector< ui > > &bottom_ks, double ratio)
Definition joinutil.cc:191
static bool bottomKJaccard(const std::vector< T > &A, const std::vector< T > &B, double &thres)
Definition joinutil.cc:205
SetJoinParallelUtil(SetJoinParallelUtil &&other)=delete
static std::vector< int > getUniqueInts(const std::vector< std::pair< int, int > > &pairs)
Definition joinutil.cc:148
static ui hval(const std::pair< ui, ui > &hf, ui &word)
Definition joinutil.cc:173
SetJoinParallelUtil()=default
~SetJoinParallelUtil()=default
static void printHowManyThreads()
Definition joinutil.cc:133
SetJoinParallelUtil(const SetJoinParallelUtil &other)=delete
static void generateHashFunc(ui seed, std::pair< ui, ui > &hf)
Definition joinutil.cc:179
Definition joinutil.h:30
~SetJoinUtil()=default
static void printMemory()
Definition joinutil.cc:13
static void processMemUsage(double &vm_usage, double &resident_set)
Definition joinutil.cc:66
SetJoinUtil(const SetJoinUtil &other)=delete
static double repTime(const std::chrono::_V2::system_clock::time_point &start)
Definition joinutil.cc:111
SetJoinUtil()=default
SetJoinUtil(SetJoinUtil &&other)=delete
static std::chrono::_V2::system_clock::time_point logTime()
Definition joinutil.cc:105
Definition joinutil.h:76
StringJoinUtil()=default
static int min(int a, int b, int c)
Definition joinutil.h:86
static uint64_t strHash(const std::string &str, int stPos, int len)
Definition joinutil.cc:252
static int max(int a, int b, int c)
Definition joinutil.cc:261
static bool PIndexLess(const PIndex &p1, const PIndex &p2)
Definition joinutil.cc:276
StringJoinUtil(const StringJoinUtil &other)=delete
~StringJoinUtil()=default
static bool strLessT(const std::string &s1, const std::string &s2)
Definition joinutil.cc:243
static int min(int *arr)
Definition joinutil.h:89
StringJoinUtil(StringJoinUtil &&other)=delete
Definition index.h:92
unsigned int ui
Definition type.h:8