Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
dataframe.h
Go to the documentation of this file.
1/*
2 * author: Chaoji Zuo and Zhizhi Wang in rutgers-db/SIGMOD2022-Programming-Contest-Public
3 * modified: Yunqi Li
4 * contact: liyunqixa@gmail.com
5 */
6#ifndef _DATAFRAME_H_
7#define _DATAFRAME_H_
8
9#include "common/type.h"
10#include <iostream>
11#include <fstream>
12#include <vector>
13#include <string>
14#include <unordered_map>
15#include <algorithm>
16
17
18class Table
19{
20public:
21 int tid;
23 std::string table_name;
24
25 std::vector<std::string> schema; // headers
26 std::unordered_map<std::string, unsigned int> inverted_schema;
27 std::vector<std::vector<std::string>> rows;
28 std::vector<std::vector<std::string>> cols;
29 std::vector<ui> perfectid; // row id whose all attrs are non-empty
30
31public:
32 Table() = default;
33 Table(int id, const std::string &name) : tid(id), table_name(name) { }
34 Table(int id, const std::string &name, const std::vector<std::string> &data_headers,
35 const std::vector<std::vector<std::string>> &data_rows,
36 const std::vector<std::vector<std::string>> &data_columns)
37 : tid(id), table_name(name), schema(data_headers), rows(data_rows), cols(data_columns) {
38 row_no = rows.size();
39 col_no = cols.size();
40 }
41
42public:
43 void Profile();
44 void PrintInfo();
45 void printData() const;
46 void printMetaData(const std::string& filename) const;
47 void printGoldData(const std::string& filename, ui tableAsize) const;
48 void findPerfectEntity(); // find entities without nan in any attributes
49 void insertOneRow(const std::vector<std::string> &tmpRow);
50};
51
52
/**
 * Plain data holder describing one rule: an attribute name, a similarity
 * name with its optional tokenizer configuration, a comparison direction
 * and a threshold.
 *
 * Every member has a defined default, so a default-constructed Rule never
 * exposes indeterminate values. Copy and move construction are disabled.
 */
struct Rule
{
    std::string attr;
    std::string sim;
    std::string sim_measure {"none"}; // distance or similarity
    std::string tok {"none"}; // q-gram or dlm
    std::string tok_settings {"none"}; // # of q or type of dlm
    bool sign {false}; // 0: -/< and 1: +/>
    double threshold {0.0};

    Rule() = default;
    ~Rule() = default;
    Rule(const Rule& other) = delete;
    Rule(Rule&& other) = delete;
};
68
69
/**
 * Plain data holder describing one feature: an attribute name and a
 * similarity name with its optional tokenizer configuration.
 *
 * Instances can be default-constructed but neither copy- nor
 * move-constructed.
 */
struct Feature
{
    std::string attr;
    std::string sim;
    std::string sim_measure = "none";   // distance or similarity
    std::string tok = "none";           // q-gram or dlm
    std::string tok_settings = "none";  // # of q or type of dlm

    Feature() = default;
    Feature(const Feature&) = delete;
    Feature(Feature&&) = delete;
    ~Feature() = default;
};
83
84#endif // _DATAFRAME_H_
Definition dataframe.h:19
void printMetaData(const std::string &filename) const
Definition dataframe.cc:53
Table(int id, const std::string &name, const std::vector< std::string > &data_headers, const std::vector< std::vector< std::string > > &data_rows, const std::vector< std::vector< std::string > > &data_columns)
Definition dataframe.h:34
void Profile()
Definition dataframe.cc:9
void printGoldData(const std::string &filename, ui tableAsize) const
Definition dataframe.cc:77
std::vector< std::vector< std::string > > cols
Definition dataframe.h:28
std::vector< std::string > schema
Definition dataframe.h:25
std::string table_name
Definition dataframe.h:23
void findPerfectEntity()
Definition dataframe.cc:114
void PrintInfo()
Definition dataframe.cc:19
int tid
Definition dataframe.h:21
void printData() const
Definition dataframe.cc:29
Table(int id, const std::string &name)
Definition dataframe.h:33
void insertOneRow(const std::vector< std::string > &tmpRow)
Definition dataframe.cc:140
int col_no
Definition dataframe.h:22
std::vector< ui > perfectid
Definition dataframe.h:29
std::unordered_map< std::string, unsigned int > inverted_schema
Definition dataframe.h:26
int row_no
Definition dataframe.h:22
std::vector< std::vector< std::string > > rows
Definition dataframe.h:27
Table()=default
Definition dataframe.h:71
std::string sim
Definition dataframe.h:73
std::string attr
Definition dataframe.h:72
Feature(const Feature &other)=delete
std::string sim_measure
Definition dataframe.h:74
Feature()=default
std::string tok_settings
Definition dataframe.h:76
std::string tok
Definition dataframe.h:75
~Feature()=default
Feature(Feature &&other)=delete
Definition dataframe.h:54
std::string sim_measure
Definition dataframe.h:57
Rule(Rule &&other)=delete
std::string tok
Definition dataframe.h:58
std::string tok_settings
Definition dataframe.h:59
~Rule()=default
Rule(const Rule &other)=delete
std::string sim
Definition dataframe.h:56
std::string attr
Definition dataframe.h:55
bool sign
Definition dataframe.h:60
double threshold
Definition dataframe.h:61
Rule()=default
unsigned int ui
Definition type.h:8