Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
io.h
Go to the documentation of this file.
1/*
2 * author: Chaoji Zuo and Zhizhi Wang in rutgers-db/SIGMOD2022-Programming-Contest-Public
3 * modified: Yunqi Li
4 * contact: liyunqixa@gmail.com
5 */
6#ifndef _IO_H_
7#define _IO_H_
8
9#include "common/dataframe.h"
10#include "common/type.h"
11#include "common/config.h"
12#include <iostream>
13#include <istream>
14#include <sstream>
15#include <cstring>
16#include <dirent.h>
17#include <sys/stat.h>
18#include <string.h>
19#include <stdio.h>
20#include <vector>
21#ifdef ARROW_INSTALLED
22#include <arrow/compute/api.h>
23#include "arrow/pretty_print.h"
24#include <arrow/api.h>
25#include <arrow/csv/api.h>
26#include <arrow/json/api.h>
27#include <arrow/io/api.h>
28#include <arrow/table.h>
29#include <arrow/chunked_array.h>
30#include <arrow/pretty_print.h>
31#include <arrow/result.h>
32#include <arrow/status.h>
33#include <arrow/ipc/api.h>
34#include <parquet/arrow/reader.h>
35#include <parquet/arrow/writer.h>
36#include <parquet/exception.h>
37#include "arrow/io/file.h"
38#include "parquet/stream_writer.h"
39#endif
40
41
42// read
44{
// NOTE(review): the class header line (source line 43) is missing from this listing;
// the constructor/destructor below show that this is the body of CSVReader.
45private:
// Value returned by get_max_val_len(); starts at 0.
// Presumably the longest field value seen while reading -- confirm in io.cc.
46 int max_val_len = 0;
// Field delimiter; defaults to a comma (the trailing note records tab as an alternative).
47 char field_delimiter = ','; // '\t'
48
// Private helpers; only their declarations are visible in this header.
49 int filesize(const char* filename);
50 bool ends_with(std::string const & value, std::string const & ending);
// isNorm defaults to true.
51 void csv_read_row(std::istream &in, std::vector<std::string> &row, bool isNorm = true);
// Takes a file path plus three non-const output references (headers, columns, rows)
// and a normalize flag; the meaning of the bool result is not visible here.
52 bool get_table(const std::string &filepath, std::vector<std::string> &headers,
53 std::vector<std::vector<std::string>> &columns,
54 std::vector<std::vector<std::string>> &rows,
55 bool normalize);
56
57public:
58 CSVReader() = default;
59 ~CSVReader() = default;
// Publicly accessible collection of Table objects; presumably filled by the reading functions -- confirm.
60 std::vector<Table> tables;
61
// Normalizes s in place (non-const reference).
62 void strNormalize(std::string &s); // also for the use of query normalization
// NOTE(review): the parameter names suggest `reading` takes a location holding several data files
// while `reading_one_table` takes a single file -- confirm against io.cc.
63 bool reading(const std::string &datafilespath, bool normalize);
64 void write_one_table(const Table &table, const std::string &outfilename);
65 bool reading_one_table(const std::string &datafilepath, bool normalize);
// Returns the current value of max_val_len.
66 int get_max_val_len() { return max_val_len; };
67};
68
69
71{
// NOTE(review): the class header line (source line 70) is missing from this listing;
// the constructors below show that this is the body of RuleReader.
// Every member function declared here is static; copy and move construction are deleted.
72public:
73 RuleReader() = default;
74 ~RuleReader() = default;
75 RuleReader(const RuleReader& other) = delete;
76 RuleReader(RuleReader&& other) = delete;
77
78public:
// Takes the rule count and the rule array as non-const references (Rule *& lets the callee
// reseat the caller's pointer); ownership of the array is not visible here -- confirm in io.cc.
79 static void readRules(ui &numRules, Rule *&rules, const std::string &rulePath);
// defaultFeatureNameDir defaults to the empty string.
80 static void readFeatureNames(ui &numFeatures, Rule *&rules,
81 const std::string &defaultFeatureNameDir = "");
82
// The original author marks the declarations below as unused.
83 // unused
84 static void extractRules(char* str, const char* pattern, std::vector<std::string>& res);
// Overload of readRules with the directory name as the first parameter.
85 static void readRules(const std::string& dirname, ui& num_rules, Rule*& rules);
86};
87
// Free function taking a file name and a non-const reference to a vector of int vectors.
// NOTE(review): presumably fills `records` from the file; the file format is not visible in this header -- confirm in io.cc.
88void readFile(const char* filename, std::vector<std::vector<int>>& records);
89
90
91#ifdef ARROW_INSTALLED
// Parquet reading helpers, compiled only when ARROW_INSTALLED is defined.
// Every member function is static; copy and move construction are deleted.
92class ParquetReader
93{
94public:
95 ParquetReader() = default;
96 ~ParquetReader() = default;
97 ParquetReader(const ParquetReader& other) = delete;
98 ParquetReader(ParquetReader&& other) = delete;
99
100private:
101 static bool endWith(const std::string& path, const std::string& ending);
// Returns a TableType derived from the path; the mapping is not visible here.
102 static TableType beginWith(const std::string& path);
// NOTE(review): the output parameter is named int64_values although it is a vector of std::string.
103 static void chunkedArray2StringVector(std::shared_ptr<arrow::ChunkedArray> const& array_a,
104 std::vector<std::string>& int64_values);
// Templated on the element type T and on a second type arrayT that does not appear in the
// parameter list, so callers must name it explicitly.
105 template <typename T, typename arrayT>
106 static void chunkedArray2Vector(std::shared_ptr<arrow::ChunkedArray> const& array_a,
107 std::vector<T>& values);
108 template <typename T>
109 static void dataVector2TableVector(const std::vector<T>& data_vec, Table& table);
110 static std::string splitSchemaName(const std::string& column_name, const std::string& pattern);
111
112public:
113 // String normalization: Lowercase & Skip spaces.
114 static void stringNormalize(std::string& s);
115 // Read
// Both readers report their outcome as an arrow::Status.
116 static arrow::Status readTable(const std::string& filename, Table& table);
// NOTE(review): this declaration is cut off in the listing (source line 118 is absent),
// so the remaining parameters of readDirectory are not visible here.
117 static arrow::Status readDirectory(const std::string& dirname, ui& num_table,
119};
120#endif
121
122
123/*
124 * Writer
125 * Writes blocking / matching / sample results.
126 * Output file types: CSV and Parquet.
127 * Output layouts: Snowman format and "Megallen" format (spelled as in the writer function names).
128 * The blocking result is stored by chunking the "big table" into several small tables in "blktmp".
129 * The sample result is stored in "buffer".
130 */
132{
// NOTE(review): the class header line (source line 131) is missing from this listing;
// the constructors below show that this is the body of MultiWriter.
// Every member function is static; copy and move construction are deleted.
133private:
// Attribute-name lists grouped by value type; only declared here, so they are defined elsewhere.
134 static const std::vector<std::string> strAttr;
135 static const std::vector<std::string> intAttr;
136 static const std::vector<std::string> floatAttr;
137
138public:
139 MultiWriter() = default;
140 ~MultiWriter() = default;
141 MultiWriter(const MultiWriter &other) = delete;
142 MultiWriter(MultiWriter &&other) = delete;
143
144 /*
145 * csv
146 */
147public:
148 // If the separator (,) is found in str, surround str with "".
149 // If a double quote is found, escape it with another double quote.
150 // Other chars like '\n' or '<' are common in secret datasets,
151 // but we can leave them since the pandas parser will handle them.
// str is modified in place (non-const reference).
152 static void escapeOneRow(std::string &str);
153
154public:
155 static void writeOneTable(const Table &table, const std::string &outputFilePath);
156
157 // sample csv
// In all writers below, defaultOutputDir defaults to the empty string.
158 static void writeSampleResSnowmanCSV(const std::vector<std::pair<int, int>> &pairs, const std::vector<ui> &idMapA,
159 const std::vector<ui> &idMapB, const std::string &defaultOutputDir = "");
// The Megallen variant additionally takes both tables and a label vector.
160 static void writeSampleResMegallenCSV(const std::vector<std::pair<int, int>> &pairs, const std::vector<ui> &idMapA,
161 const std::vector<ui> &idMapB, const Table &tableA, const Table &tableB,
162 const std::vector<int> &label, const std::string &defaultOutputDir = "");
163
164 // block csv
165 static void writeBlockResSnowmanCSV(const Table &tableA, const std::vector<std::vector<int>> &final_pairs,
166 const std::string &defaultOutputDir = "");
// The Megallen variant additionally takes tableB and oneTableSize.
// NOTE(review): oneTableSize presumably is the chunk size mentioned in the writer overview -- confirm in io.cc.
167 static void writeBlockResMegallenCSV(const Table &tableA, const Table &tableB, ui oneTableSize,
168 const std::vector<std::vector<int>> &final_pairs,
169 const std::string &defaultOutputDir = "");
170
171 /*
172 * parquet
173 */
// The Parquet writers are declared only when ARROW_INSTALLED is defined.
174#ifdef ARROW_INSTALLED
175private:
176 // utils
177 static void setFirstThree(parquet::schema::NodeVector &fields);
// NOTE(review): `string` is unqualified here while the rest of the header spells std::string;
// this relies on a using-declaration/directive from an included header -- consider std::string.
178 static bool getField(parquet::schema::NodeVector &fields, string &curAttr, string &newAttr);
179
180public:
181 // sample parquet
182 // unimplemented
183 static void writeSampleResSnowmanParquet(const std::vector<std::pair<int, int>> &pairs, const std::vector<ui> &idMapA,
184 const std::vector<ui> &idMapB, const std::string &defaultOutputDir = "");
185 // available
186 static void writeSampleResMegallenParquet(const std::vector<std::pair<int, int>> &pairs, const std::vector<ui> &idMapA,
187 const std::vector<ui> &idMapB, const Table &tableA, const Table &tableB,
188 const std::vector<int> &label, const std::string &defaultOutputDir = "");
189
190 // block parquet
191 // unimplemented
192 static void writeBlockResSnowmanParquet(const Table &tableA, const std::vector<std::vector<int>> &final_pairs,
193 const std::string &defaultOutputDir = "");
194 static void writeBlockResMegallenParquet(const Table &tableA, const Table &tableB, ui oneTableSize,
195 const std::vector<std::vector<int>> &final_pairs,
196 const std::string &defaultOutputDir = "");
197#endif
198};
199
200#endif // _IO_H_
std::vector< std::vector< int > > final_pairs
Definition blocker_config.cc:25
ui num_rules
Definition blocker_config.cc:10
Table table_A
Definition blocker_config.cc:11
Table table_B
Definition blocker_config.cc:12
Rule * rules
Definition blocker_config.cc:14
Table gold
Definition blocker_config.cc:13
Definition io.h:44
std::vector< Table > tables
Definition io.h:60
void strNormalize(std::string &s)
Definition io.cc:12
bool reading_one_table(const std::string &datafilepath, bool normalize)
Definition io.cc:34
~CSVReader()=default
void write_one_table(const Table &table, const std::string &outfilename)
Definition io.cc:62
int get_max_val_len()
Definition io.h:66
bool reading(const std::string &datafilespath, bool normalize)
Definition io.cc:87
CSVReader()=default
Definition io.h:132
static void writeBlockResSnowmanCSV(const Table &tableA, const std::vector< std::vector< int > > &final_pairs, const std::string &defaultOutputDir="")
Definition io.cc:822
MultiWriter(const MultiWriter &other)=delete
static void writeSampleResSnowmanCSV(const std::vector< std::pair< int, int > > &pairs, const std::vector< ui > &idMapA, const std::vector< ui > &idMapB, const std::string &defaultOutputDir="")
Definition io.cc:723
MultiWriter()=default
static void writeOneTable(const Table &table, const std::string &outputFilePath)
Definition io.cc:693
static void writeSampleResMegallenCSV(const std::vector< std::pair< int, int > > &pairs, const std::vector< ui > &idMapA, const std::vector< ui > &idMapB, const Table &tableA, const Table &tableB, const std::vector< int > &label, const std::string &defaultOutputDir="")
Definition io.cc:750
~MultiWriter()=default
static void writeBlockResMegallenCSV(const Table &tableA, const Table &tableB, ui oneTableSize, const std::vector< std::vector< int > > &final_pairs, const std::string &defaultOutputDir="")
Definition io.cc:849
static void escapeOneRow(std::string &str)
Definition io.cc:672
MultiWriter(MultiWriter &&other)=delete
Definition io.h:71
RuleReader(RuleReader &&other)=delete
~RuleReader()=default
RuleReader()=default
static void extractRules(char *str, const char *pattern, std::vector< std::string > &res)
static void readRules(ui &numRules, Rule *&rules, const std::string &rulePath)
Definition io.cc:220
RuleReader(const RuleReader &other)=delete
static void readFeatureNames(ui &numFeatures, Rule *&rules, const std::string &defaultFeatureNameDir="")
Definition io.cc:265
static void readRules(const std::string &dirname, ui &num_rules, Rule *&rules)
Definition dataframe.h:19
void readFile(const char *filename, std::vector< std::vector< int > > &records)
Definition io.cc:312
Definition dataframe.h:54
unsigned int ui
Definition type.h:8
TableType
Definition type.h:23