|
| __init__ (self) |
|
| get_recall (self, candidates, num_golds, external_report=False, default_res_dir="") |
|
| get_recall_check (self, candidates, num_golds, check_file_path, check=0) |
|
| label_cand (self, C) |
|
| cand_stat (self, C) |
|
| over_sample (self, C) |
|
| fix_null (self, tableA, tableB) |
|
| sample_data (self, tableA, tableB, default_sample_res_dir="") |
|
| generate_features (self, tableA, tableB, at_ltable=None, at_rtable=None, dataname=None, default_output_dir="", wrtie_fea_names=True) |
|
| train_model_normal (self, tableA, tableB, num_tree, sample_size, if_balanced=True) |
|
| train_model_tuning (self, tableA, tableB, num_tree, sample_size, if_balanced=True) |
|
| train_model_active (self, tableA, tableB, num_tree, sample_size, if_balanced=True) |
|
| apply_model (self, tottable, tableA, tableB, external_fea_extract=False, default_blk_res_dir="", default_match_res_dir="") |
|
| report_tree_to_text (self, path) |
|
| store_model (self, path) |
|
| load_model (self, path) |
|
| add_attrs_blk_res (self, tableA, tableB, path) |
|
|
| _entropy (self, p1, p2) |
|
| _set_metadata (self, dataframe, key, fk_ltable, fk_rtable, ltable, rtable) |
|
| _apply_model_worker (self, tableid, tableA, tableB, external_fea_extract=False, if_report_pre=False, default_blk_res_dir="", default_match_res_dir="") |
|
◆ __init__()
simjoin_entitymatching.matcher.random_forest.RandomForest.__init__ |
( |
| self | ) |
|
◆ _apply_model_worker()
simjoin_entitymatching.matcher.random_forest.RandomForest._apply_model_worker |
( |
| self, |
|
|
| tableid, |
|
|
| tableA, |
|
|
| tableB, |
|
|
| external_fea_extract = False, |
|
|
| if_report_pre = False, |
|
|
| default_blk_res_dir = "", |
|
|
| default_match_res_dir = "" ) |
|
protected |
external_fea_extract: indicates whether to use cpp to extract features;
if not, the em package only supports non-interchangeable
◆ _entropy()
simjoin_entitymatching.matcher.random_forest.RandomForest._entropy |
( |
| self, |
|
|
| p1, |
|
|
| p2 ) |
|
protected |
◆ _set_metadata()
simjoin_entitymatching.matcher.random_forest.RandomForest._set_metadata |
( |
| self, |
|
|
| dataframe, |
|
|
| key, |
|
|
| fk_ltable, |
|
|
| fk_rtable, |
|
|
| ltable, |
|
|
| rtable ) |
|
protected |
py_entitymatching maintains a catalog as a dict with the id of each dataframe as the key.
Every operation, such as extracting feature vectors, requires this metadata,
but if your dataframe was not read using its API read_csv_metadata,
then you need to set the metadata yourself.
◆ add_attrs_blk_res()
simjoin_entitymatching.matcher.random_forest.RandomForest.add_attrs_blk_res |
( |
| self, |
|
|
| tableA, |
|
|
| tableB, |
|
|
| path ) |
◆ apply_model()
simjoin_entitymatching.matcher.random_forest.RandomForest.apply_model |
( |
| self, |
|
|
| tottable, |
|
|
| tableA, |
|
|
| tableB, |
|
|
| external_fea_extract = False, |
|
|
| default_blk_res_dir = "", |
|
|
| default_match_res_dir = "" ) |
Chunk the blocking result into pieces of size 1M,
then apply random forests to the pieces concurrently;
each piece of the table is processed in a new process.
◆ cand_stat()
simjoin_entitymatching.matcher.random_forest.RandomForest.cand_stat |
( |
| self, |
|
|
| C ) |
◆ fix_null()
simjoin_entitymatching.matcher.random_forest.RandomForest.fix_null |
( |
| self, |
|
|
| tableA, |
|
|
| tableB ) |
◆ generate_features()
simjoin_entitymatching.matcher.random_forest.RandomForest.generate_features |
( |
| self, |
|
|
| tableA, |
|
|
| tableB, |
|
|
| at_ltable = None, |
|
|
| at_rtable = None, |
|
|
| dataname = None, |
|
|
| default_output_dir = "", |
|
|
| wrtie_fea_names = True ) |
◆ get_recall()
simjoin_entitymatching.matcher.random_forest.RandomForest.get_recall |
( |
| self, |
|
|
| candidates, |
|
|
| num_golds, |
|
|
| external_report = False, |
|
|
| default_res_dir = "" ) |
◆ get_recall_check()
simjoin_entitymatching.matcher.random_forest.RandomForest.get_recall_check |
( |
| self, |
|
|
| candidates, |
|
|
| num_golds, |
|
|
| check_file_path, |
|
|
| check = 0 ) |
◆ label_cand()
simjoin_entitymatching.matcher.random_forest.RandomForest.label_cand |
( |
| self, |
|
|
| C ) |
◆ load_model()
simjoin_entitymatching.matcher.random_forest.RandomForest.load_model |
( |
| self, |
|
|
| path ) |
◆ over_sample()
simjoin_entitymatching.matcher.random_forest.RandomForest.over_sample |
( |
| self, |
|
|
| C ) |
◆ report_tree_to_text()
simjoin_entitymatching.matcher.random_forest.RandomForest.report_tree_to_text |
( |
| self, |
|
|
| path ) |
◆ sample_data()
simjoin_entitymatching.matcher.random_forest.RandomForest.sample_data |
( |
| self, |
|
|
| tableA, |
|
|
| tableB, |
|
|
| default_sample_res_dir = "" ) |
To get the training set for the random forest,
we should first block the tables and label the raw results.
◆ store_model()
simjoin_entitymatching.matcher.random_forest.RandomForest.store_model |
( |
| self, |
|
|
| path ) |
◆ train_model_active()
simjoin_entitymatching.matcher.random_forest.RandomForest.train_model_active |
( |
| self, |
|
|
| tableA, |
|
|
| tableB, |
|
|
| num_tree, |
|
|
| sample_size, |
|
|
| if_balanced = True ) |
◆ train_model_normal()
simjoin_entitymatching.matcher.random_forest.RandomForest.train_model_normal |
( |
| self, |
|
|
| tableA, |
|
|
| tableB, |
|
|
| num_tree, |
|
|
| sample_size, |
|
|
| if_balanced = True ) |
◆ train_model_tuning()
simjoin_entitymatching.matcher.random_forest.RandomForest.train_model_tuning |
( |
| self, |
|
|
| tableA, |
|
|
| tableB, |
|
|
| num_tree, |
|
|
| sample_size, |
|
|
| if_balanced = True ) |
Tune the hyperparameters to avoid overfitting,
using 5-fold cross-validation.
◆ cand
simjoin_entitymatching.matcher.random_forest.RandomForest.cand = '' |
◆ cand_backup
str simjoin_entitymatching.matcher.random_forest.RandomForest.cand_backup = ' ' |
◆ features
list simjoin_entitymatching.matcher.random_forest.RandomForest.features = [] |
◆ graph
simjoin_entitymatching.matcher.random_forest.RandomForest.graph = nx.Graph() |
◆ num_total [1/2]
str simjoin_entitymatching.matcher.random_forest.RandomForest.num_total = 0 |
◆ num_total [2/2]
str simjoin_entitymatching.matcher.random_forest.RandomForest.num_total = 'id', |
◆ num_training
int simjoin_entitymatching.matcher.random_forest.RandomForest.num_training = 0 |
◆ rf [1/3]
simjoin_entitymatching.matcher.random_forest.RandomForest.rf = '' |
◆ rf [2/3]
dict simjoin_entitymatching.matcher.random_forest.RandomForest.rf |
Initial value:= {
"max_depth": [None, 10, 20, 30],
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4]
}
◆ rf [3/3]
int simjoin_entitymatching.matcher.random_forest.RandomForest.rf = 30 |
◆ sparkrf
str simjoin_entitymatching.matcher.random_forest.RandomForest.sparkrf = '' |
◆ test_table
str simjoin_entitymatching.matcher.random_forest.RandomForest.test_table = ' ' |
The documentation for this class was generated from the following file: