Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
config.h
Go to the documentation of this file.
1/*
2 * author: Yunqi Li
3 * contact: liyunqixa@gmail.com
4 */
5#ifndef _CONFIG_H_
6#define _CONFIG_H_
7
8#include <cstddef>
9
10/*
11 * Package
12 * After a vcpkg update, packages can no longer be installed because of a "vcpkg_cmake" failure.
13 * This is probably caused by the CMake version; it is not fixed at this stage.
14 */
15// #define ARROW_INSTALLED
16// #define EDLIB_INSTALLED
17
18
19/*
20 * IO & tokenize
21 */
22#define PARQUET_PREFIX_MIN_LENGTH 4
23
24#define REPORT_TABLE_IN_BUFFER
25#define REPORT_TOKEN_IN_BUFFER
26
27// 0: collapse multiple spaces into a single space
28// 1: skip all characters other than a-z/A-Z/0-9
29#define NORMALIZE_STRATEGY 1
30// #define STRING_NORMALIZE
31#define SKIP_NO_ALPHANUMERIC 0
32// #define CHECK_TOKENIZE
33
34
35/*
36 * Parallel
37 */
38#define MAXTHREADNUM 160
39
40#define MAINTAIN_VALUE 1
41#define MAINTAIN_VALUE_OVLP 1
42#define MAINTAIN_VALUE_EDIT 0
43#define EARLY_TERMINATE 0
44#define MAX_PAIR_SIZE 10000000 // for each heap (thread)
45
46#define DEDUPLICATE 1
47
48/*
49 * Serial
50 */
51#define MAX_PAIR_SIZE_SERIAL 1000000000
52
53
54/*
55 * String join
56 */
57constexpr size_t stringHashNumber = 31;
58constexpr size_t modNumber = 1000000007;
59
60#define APPROXIMATE 0
61#define TIMER_ON 0
62#define VERIFY_PREFIX 0
63#define DROP_EMPTY 1
64#define LEAVE_EXACT_MATCH 0
65#define REPORT_STR_COUNT 0
66
67
68/*
69 * Set join
70 */
71#define BRUTE_FORCE 1 // NOTE: the meaning is inverted relative to the name: 1 selects the non-brute-force variant
72#define OUTPUT_DUP 0
73
74// #define WRITE_RESULT
75
76#define PACK(x, y) ((x << 32) + y) // NOTE(review): x and y are expanded without parentheses, so pass only simple operands; x presumably must be a 64-bit type for the 32-bit shift to be valid - confirm at call sites
77#define PRIME 2017
78#define EPS 1e-5
79#define NEG -1
80#define INF 100000000
81#define MAX_LINE_LENGTH 100000
82#define CACHE_SIZE 5
83#define PART_COE 1
84
85#define APPEND_EMPTY 0
86#define MAX_EMPTY_SIZE 1000000
87#define RESIZE_DATA 0
88
89// Macro to define the version of the algorithm.
90// If VERSION is set to 2, the bottomk variant is used.
91#define VERSION 1
92
93// Type alias for token length.
94// Use unsigned int for ngram, unsigned short otherwise.
95using TokenLen = unsigned int;
96
97
98/*
99 * Overlap join
100 */
101#define RATIO 0.005
102#define TIMES 200
103
104#define BRUTEFORCE_COMB 0
105#define PREPROCESS_TIMER_ON 1
106#define REPORT_INDEX 0
107#define REPORT_BINARY 0
108#define REPORT_LIST 0
109#define LIMIT_INV_SIZE 1
110#define MAX_INV_SIZE 100000
111#define APPROXIMATE_OVLP 0
112#define SHARING_PREFIX 1
113
114
115/*
116 * sim funcs
117 */
118#define OVLP_STRATEGY 1
119
120
121/*
122 * Simjoin hpp
123 */
124#define USING_CRITICAL 0
125#define USING_PARALLEL 0
126#define MAX_TOTAL_SIZE 1000000000 // exactly the same as MAX_PAIR_SIZE_SERIAL
127
128
129/*
130 * blocker main
131 */
132#define PRINT_RULES 0
133#define EXPORT_MISS 0
134
135
136/*
137 * Miscellaneous
138 */
139#define VERIFY_JOIN
140
141#define MAX_BITSET_LENGTH 10000000
142
143
144#endif // _CONFIG_H_
unsigned int TokenLen
Definition config.h:95
constexpr size_t stringHashNumber
Definition config.h:57
constexpr size_t modNumber
Definition config.h:58