22#include <arrow/compute/api.h>
23#include "arrow/pretty_print.h"
25#include <arrow/csv/api.h>
26#include <arrow/json/api.h>
27#include <arrow/io/api.h>
28#include <arrow/table.h>
29#include <arrow/chunked_array.h>
30#include <arrow/pretty_print.h>
31#include <arrow/result.h>
32#include <arrow/status.h>
33#include <arrow/ipc/api.h>
34#include <parquet/arrow/reader.h>
35#include <parquet/arrow/writer.h>
36#include <parquet/exception.h>
37#include "arrow/io/file.h"
38#include "parquet/stream_writer.h"
47 char field_delimiter =
',';
49 int filesize(
const char* filename);
50 bool ends_with(std::string
const & value, std::string
const & ending);
51 void csv_read_row(std::istream &in, std::vector<std::string> &row,
bool isNorm =
true);
52 bool get_table(
const std::string &filepath, std::vector<std::string> &headers,
53 std::vector<std::vector<std::string>> &columns,
54 std::vector<std::vector<std::string>> &rows,
63 bool reading(
const std::string &datafilespath,
bool normalize);
81 const std::string &defaultFeatureNameDir =
"");
84 static void extractRules(
char* str,
const char* pattern, std::vector<std::string>& res);
88void readFile(
const char* filename, std::vector<std::vector<int>>& records);
95 ParquetReader() =
default;
96 ~ParquetReader() =
default;
97 ParquetReader(
const ParquetReader& other) =
delete;
98 ParquetReader(ParquetReader&& other) =
delete;
101 static bool endWith(
const std::string& path,
const std::string& ending);
102 static TableType beginWith(
const std::string& path);
103 static void chunkedArray2StringVector(std::shared_ptr<arrow::ChunkedArray>
const& array_a,
104 std::vector<std::string>& int64_values);
105 template <
typename T,
typename arrayT>
106 static void chunkedArray2Vector(std::shared_ptr<arrow::ChunkedArray>
const& array_a,
107 std::vector<T>& values);
108 template <
typename T>
109 static void dataVector2TableVector(
const std::vector<T>& data_vec,
Table& table);
110 static std::string splitSchemaName(
const std::string& column_name,
const std::string& pattern);
114 static void stringNormalize(std::string& s);
116 static arrow::Status readTable(
const std::string& filename,
Table& table);
117 static arrow::Status readDirectory(
const std::string& dirname,
ui& num_table,
134 static const std::vector<std::string> strAttr;
135 static const std::vector<std::string> intAttr;
136 static const std::vector<std::string> floatAttr;
159 const std::vector<ui> &idMapB,
const std::string &defaultOutputDir =
"");
161 const std::vector<ui> &idMapB,
const Table &tableA,
const Table &tableB,
162 const std::vector<int> &label,
const std::string &defaultOutputDir =
"");
166 const std::string &defaultOutputDir =
"");
169 const std::string &defaultOutputDir =
"");
174#ifdef ARROW_INSTALLED
177 static void setFirstThree(parquet::schema::NodeVector &fields);
178 static bool getField(parquet::schema::NodeVector &fields,
string &curAttr,
string &newAttr);
183 static void writeSampleResSnowmanParquet(
const std::vector<std::pair<int, int>> &pairs,
const std::vector<ui> &idMapA,
184 const std::vector<ui> &idMapB,
const std::string &defaultOutputDir =
"");
186 static void writeSampleResMegallenParquet(
const std::vector<std::pair<int, int>> &pairs,
const std::vector<ui> &idMapA,
187 const std::vector<ui> &idMapB,
const Table &tableA,
const Table &tableB,
188 const std::vector<int> &label,
const std::string &defaultOutputDir =
"");
192 static void writeBlockResSnowmanParquet(
const Table &tableA,
const std::vector<std::vector<int>> &
final_pairs,
193 const std::string &defaultOutputDir =
"");
194 static void writeBlockResMegallenParquet(
const Table &tableA,
const Table &tableB,
ui oneTableSize,
196 const std::string &defaultOutputDir =
"");
std::vector< std::vector< int > > final_pairs
Definition blocker_config.cc:25
ui num_rules
Definition blocker_config.cc:10
Table table_A
Definition blocker_config.cc:11
Table table_B
Definition blocker_config.cc:12
Rule * rules
Definition blocker_config.cc:14
Table gold
Definition blocker_config.cc:13
std::vector< Table > tables
Definition io.h:60
void strNormalize(std::string &s)
Definition io.cc:12
bool reading_one_table(const std::string &datafilepath, bool normalize)
Definition io.cc:34
void write_one_table(const Table &table, const std::string &outfilename)
Definition io.cc:62
int get_max_val_len()
Definition io.h:66
bool reading(const std::string &datafilespath, bool normalize)
Definition io.cc:87
static void writeBlockResSnowmanCSV(const Table &tableA, const std::vector< std::vector< int > > &final_pairs, const std::string &defaultOutputDir="")
Definition io.cc:822
MultiWriter(const MultiWriter &other)=delete
static void writeSampleResSnowmanCSV(const std::vector< std::pair< int, int > > &pairs, const std::vector< ui > &idMapA, const std::vector< ui > &idMapB, const std::string &defaultOutputDir="")
Definition io.cc:723
static void writeOneTable(const Table &table, const std::string &outputFilePath)
Definition io.cc:693
static void writeSampleResMegallenCSV(const std::vector< std::pair< int, int > > &pairs, const std::vector< ui > &idMapA, const std::vector< ui > &idMapB, const Table &tableA, const Table &tableB, const std::vector< int > &label, const std::string &defaultOutputDir="")
Definition io.cc:750
static void writeBlockResMegallenCSV(const Table &tableA, const Table &tableB, ui oneTableSize, const std::vector< std::vector< int > > &final_pairs, const std::string &defaultOutputDir="")
Definition io.cc:849
static void escapeOneRow(std::string &str)
Definition io.cc:672
MultiWriter(MultiWriter &&other)=delete
RuleReader(RuleReader &&other)=delete
static void extractRules(char *str, const char *pattern, std::vector< std::string > &res)
static void readRules(ui &numRules, Rule *&rules, const std::string &rulePath)
Definition io.cc:220
RuleReader(const RuleReader &other)=delete
static void readFeatureNames(ui &numFeatures, Rule *&rules, const std::string &defaultFeatureNameDir="")
Definition io.cc:265
static void readRules(const std::string &dirname, ui &num_rules, Rule *&rules)
Definition dataframe.h:19
void readFile(const char *filename, std::vector< std::vector< int > > &records)
Definition io.cc:312
Definition dataframe.h:54
unsigned int ui
Definition type.h:8
TableType
Definition type.h:23