Entity Matching by Similarity Join
 
simjoin_entitymatching.matcher.random_forest.RandomForest Class Reference

Public Member Functions

 __init__ (self)
 
 get_recall (self, candidates, num_golds, external_report=False, default_res_dir="")
 
 get_recall_check (self, candidates, num_golds, check_file_path, check=0)
 
 label_cand (self, C)
 
 cand_stat (self, C)
 
 over_sample (self, C)
 
 fix_null (self, tableA, tableB)
 
 sample_data (self, tableA, tableB, default_sample_res_dir="")
 
 generate_features (self, tableA, tableB, at_ltable=None, at_rtable=None, dataname=None, default_output_dir="", wrtie_fea_names=True)
 
 train_model_normal (self, tableA, tableB, num_tree, sample_size, if_balanced=True)
 
 train_model_tuning (self, tableA, tableB, num_tree, sample_size, if_balanced=True)
 
 train_model_active (self, tableA, tableB, num_tree, sample_size, if_balanced=True)
 
 apply_model (self, tottable, tableA, tableB, external_fea_extract=False, default_blk_res_dir="", default_match_res_dir="")
 
 report_tree_to_text (self, path)
 
 store_model (self, path)
 
 load_model (self, path)
 
 add_attrs_blk_res (self, tableA, tableB, path)
 

Public Attributes

 graph = nx.Graph()
 
str cand = ''
 
str cand_backup = ' '
 
str test_table = ' '
 
list features = []
 
str rf = ''
 
str sparkrf = ''
 
int num_total = 0
 
int num_training = 0
 
str num_total = 'id',
 
dict rf
 
int rf = 30
 

Protected Member Functions

 _entropy (self, p1, p2)
 
 _set_metadata (self, dataframe, key, fk_ltable, fk_rtable, ltable, rtable)
 
 _apply_model_worker (self, tableid, tableA, tableB, external_fea_extract=False, if_report_pre=False, default_blk_res_dir="", default_match_res_dir="")
 

Detailed Description

    Random forest matcher
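
A minimal end-to-end usage sketch based on the public methods listed above; the file paths, parameter values, and the blocking-result table are placeholders, not prescribed by the package.

    import pandas as pd
    from simjoin_entitymatching.matcher.random_forest import RandomForest

    # Illustrative inputs; paths are placeholders.
    tableA = pd.read_csv("tableA.csv")
    tableB = pd.read_csv("tableB.csv")
    blk_res = pd.read_csv("blk_res.csv")    # blocking result to be matched

    rf = RandomForest()
    rf.fix_null(tableA, tableB)                           # patch missing values first
    rf.generate_features(tableA, tableB)                  # build the feature set
    rf.train_model_normal(tableA, tableB,
                          num_tree=30, sample_size=1000)  # placeholder values
    rf.store_model("rf_model")                            # persist the trained forest

    rf.load_model("rf_model")                             # reload later if needed
    rf.apply_model(blk_res, tableA, tableB)               # match the blocking result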

Constructor & Destructor Documentation

◆ __init__()

simjoin_entitymatching.matcher.random_forest.RandomForest.__init__ ( self)

Member Function Documentation

◆ _apply_model_worker()

simjoin_entitymatching.matcher.random_forest.RandomForest._apply_model_worker ( self,
tableid,
tableA,
tableB,
external_fea_extract = False,
if_report_pre = False,
default_blk_res_dir = "",
default_match_res_dir = "" )
protected
        external_fea_extract: indicates whether to use the C++ extension to extract
                              features; if not, the em package only supports
                              non-interchangeable features

◆ _entropy()

simjoin_entitymatching.matcher.random_forest.RandomForest._entropy ( self,
p1,
p2 )
protected

◆ _set_metadata()

simjoin_entitymatching.matcher.random_forest.RandomForest._set_metadata ( self,
dataframe,
key,
fk_ltable,
fk_rtable,
ltable,
rtable )
protected
        py_entitymatching maintains a catalog as a dict keyed by the id of the dataframe,
        and each operation, such as extracting feature vectors, requires this metadata.
        If your dataframe was not read using its API read_csv_metadata,
        then you need to set the metadata yourself.
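
A sketch of that manual alternative using py_entitymatching's catalog functions; the key and foreign-key column names ('id', '_id', 'ltable_id', 'rtable_id') are assumptions for illustration.

    import pandas as pd
    import py_entitymatching as em

    # Tables loaded with plain pandas carry no catalog metadata,
    # so it has to be registered explicitly.
    A = pd.read_csv("tableA.csv")
    B = pd.read_csv("tableB.csv")
    C = pd.read_csv("candidates.csv")

    em.set_key(A, "id")               # key of the left table (assumed column name)
    em.set_key(B, "id")               # key of the right table
    em.set_key(C, "_id")              # key of the candidate set
    em.set_ltable(C, A)               # link the candidate set to its source tables
    em.set_rtable(C, B)
    em.set_fk_ltable(C, "ltable_id")  # foreign keys into A and B (assumed names)
    em.set_fk_rtable(C, "rtable_id")

    # em.read_csv_metadata would read a table and register its metadata in one
    # step; _set_metadata covers the case where the dataframe did not come from it.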

◆ add_attrs_blk_res()

simjoin_entitymatching.matcher.random_forest.RandomForest.add_attrs_blk_res ( self,
tableA,
tableB,
path )

◆ apply_model()

simjoin_entitymatching.matcher.random_forest.RandomForest.apply_model ( self,
tottable,
tableA,
tableB,
external_fea_extract = False,
default_blk_res_dir = "",
default_match_res_dir = "" )
        Chunk the blocking result into pieces of size 1M,
        then apply the random forest concurrently;
        each piece of the table is handled by a new process.
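
An illustrative sketch of that chunk-and-fork pattern, not the package's actual worker code; the chunk size follows the description above, while the file naming and the feature-column convention are assumptions.

    import multiprocessing as mp
    import joblib
    import pandas as pd

    CHUNK_SIZE = 1_000_000  # 1M rows per piece, as described above

    def _match_chunk(args):
        chunk_id, chunk_path, model_path = args
        # Each worker loads the model and its own slice of the blocking result.
        model = joblib.load(model_path)
        chunk = pd.read_csv(chunk_path)
        feature_cols = [c for c in chunk.columns if c.startswith("fea_")]  # assumed naming
        chunk["predicted"] = model.predict(chunk[feature_cols])
        chunk.to_csv(f"match_res_{chunk_id}.csv", index=False)

    def apply_in_parallel(blk_res: pd.DataFrame, model_path: str):
        # Split the blocking result into 1M-row pieces and hand each to a new process.
        tasks = []
        for i, start in enumerate(range(0, len(blk_res), CHUNK_SIZE)):
            piece = blk_res.iloc[start:start + CHUNK_SIZE]
            piece.to_csv(f"blk_res_{i}.csv", index=False)
            tasks.append((i, f"blk_res_{i}.csv", model_path))
        with mp.Pool() as pool:
            pool.map(_match_chunk, tasks)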

◆ cand_stat()

simjoin_entitymatching.matcher.random_forest.RandomForest.cand_stat ( self,
C )

◆ fix_null()

simjoin_entitymatching.matcher.random_forest.RandomForest.fix_null ( self,
tableA,
tableB )

◆ generate_features()

simjoin_entitymatching.matcher.random_forest.RandomForest.generate_features ( self,
tableA,
tableB,
at_ltable = None,
at_rtable = None,
dataname = None,
default_output_dir = "",
wrtie_fea_names = True )

◆ get_recall()

simjoin_entitymatching.matcher.random_forest.RandomForest.get_recall ( self,
candidates,
num_golds,
external_report = False,
default_res_dir = "" )

◆ get_recall_check()

simjoin_entitymatching.matcher.random_forest.RandomForest.get_recall_check ( self,
candidates,
num_golds,
check_file_path,
check = 0 )

◆ label_cand()

simjoin_entitymatching.matcher.random_forest.RandomForest.label_cand ( self,
C )

◆ load_model()

simjoin_entitymatching.matcher.random_forest.RandomForest.load_model ( self,
path )

◆ over_sample()

simjoin_entitymatching.matcher.random_forest.RandomForest.over_sample ( self,
C )

◆ report_tree_to_text()

simjoin_entitymatching.matcher.random_forest.RandomForest.report_tree_to_text ( self,
path )

◆ sample_data()

simjoin_entitymatching.matcher.random_forest.RandomForest.sample_data ( self,
tableA,
tableB,
default_sample_res_dir = "" )
        To obtain the training set for the random forest,
        we first block the tables and label the raw results.
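
A short sketch of that preparation step using the class's own methods; it assumes tableA and tableB are already loaded and that sample_data stores its result in the cand attribute.

    from simjoin_entitymatching.matcher.random_forest import RandomForest

    rf = RandomForest()

    # Sample candidate pairs from the two tables; the output directory is a placeholder.
    rf.sample_data(tableA, tableB, default_sample_res_dir="output/sample/")

    # Label the sampled candidates and inspect the label distribution
    # (using rf.cand as the candidate set is an assumption).
    rf.label_cand(rf.cand)
    rf.cand_stat(rf.cand)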

◆ store_model()

simjoin_entitymatching.matcher.random_forest.RandomForest.store_model ( self,
path )

◆ train_model_active()

simjoin_entitymatching.matcher.random_forest.RandomForest.train_model_active ( self,
tableA,
tableB,
num_tree,
sample_size,
if_balanced = True )

◆ train_model_normal()

simjoin_entitymatching.matcher.random_forest.RandomForest.train_model_normal ( self,
tableA,
tableB,
num_tree,
sample_size,
if_balanced = True )

◆ train_model_tuning()

simjoin_entitymatching.matcher.random_forest.RandomForest.train_model_tuning ( self,
tableA,
tableB,
num_tree,
sample_size,
if_balanced = True )
        Tune the hyperparameters with 5-fold cross-validation
        to avoid overfitting.
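
A sketch of how such tuning is commonly done with scikit-learn, using the parameter grid documented under rf below; the stand-in training data and the scoring metric are assumptions.

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    # Stand-in data; in practice the labeled feature vectors come from
    # generate_features and label_cand.
    X_train, y_train = make_classification(n_samples=500, n_features=10, random_state=0)

    # Parameter grid taken from the rf member documented below.
    param_grid = {
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    }

    search = GridSearchCV(
        RandomForestClassifier(n_estimators=30),  # 30 trees, matching the rf = 30 default below
        param_grid,
        cv=5,           # 5-fold cross-validation, as described above
        scoring="f1",   # assumed metric
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    best_rf = search.best_estimator_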

Member Data Documentation

◆ cand

simjoin_entitymatching.matcher.random_forest.RandomForest.cand = ''

◆ cand_backup

str simjoin_entitymatching.matcher.random_forest.RandomForest.cand_backup = ' '

◆ features

list simjoin_entitymatching.matcher.random_forest.RandomForest.features = []

◆ graph

simjoin_entitymatching.matcher.random_forest.RandomForest.graph = nx.Graph()

◆ num_total [1/2]

int simjoin_entitymatching.matcher.random_forest.RandomForest.num_total = 0

◆ num_total [2/2]

str simjoin_entitymatching.matcher.random_forest.RandomForest.num_total = 'id',

◆ num_training

int simjoin_entitymatching.matcher.random_forest.RandomForest.num_training = 0

◆ rf [1/3]

simjoin_entitymatching.matcher.random_forest.RandomForest.rf = ''

◆ rf [2/3]

dict simjoin_entitymatching.matcher.random_forest.RandomForest.rf
Initial value:
= {
"max_depth": [None, 10, 20, 30], # Maximum depth of the tree
"min_samples_split": [2, 5, 10], # Minimum number of samples required to split an internal node
"min_samples_leaf": [1, 2, 4] # Minimum number of samples required to be at a leaf node
}

◆ rf [3/3]

int simjoin_entitymatching.matcher.random_forest.RandomForest.rf = 30

◆ sparkrf

str simjoin_entitymatching.matcher.random_forest.RandomForest.sparkrf = ''

◆ test_table

str simjoin_entitymatching.matcher.random_forest.RandomForest.test_table = ' '

The documentation for this class was generated from the following file: