Entity Matching by Similarity Join
 
simjoin_entitymatching.matcher.random_forest.RandomForest Class Reference

Public Member Functions

 __init__ (self)
 
 get_recall (self, candidates, num_golds, external_report=False, default_res_dir="")
 
 get_recall_check (self, candidates, num_golds, check_file_path, check=0)
 
 label_cand (self, C)
 
 cand_stat (self, C)
 
 over_sample (self, C)
 
 fix_null (self, tableA, tableB)
 
 sample_data (self, tableA, tableB, default_sample_res_dir="")
 
 generate_features (self, tableA, tableB, at_ltable=None, at_rtable=None, dataname=None, default_output_dir="", wrtie_fea_names=True)
 
 train_model_normal (self, tableA, tableB, num_tree, sample_size, if_balanced=True)
 
 train_model_tuning (self, tableA, tableB, num_tree, sample_size, if_balanced=True)
 
 train_model_active (self, tableA, tableB, num_tree, sample_size, if_balanced=True)
 
 apply_model (self, tottable, tableA, tableB, external_fea_extract=False, default_blk_res_dir="", default_match_res_dir="")
 
 report_tree_to_text (self, path)
 
 store_model (self, path)
 
 load_model (self, path)
 
 add_attrs_blk_res (self, tableA, tableB, path)
 

Public Attributes

 graph = nx.Graph()
 
str cand = ''
 
str cand_backup = ' '
 
str test_table = ' '
 
list features = []
 
str rf = ''
 
str sparkrf = ''
 
int num_total = 0
 
int num_training = 0
 
str num_total = 'id',
 
dict rf
 
int rf = 30
 

Protected Member Functions

 _entropy (self, p1, p2)
 
 _set_metadata (self, dataframe, key, fk_ltable, fk_rtable, ltable, rtable)
 
 _apply_model_worker (self, tableid, tableA, tableB, external_fea_extract=False, if_report_pre=False, default_blk_res_dir="", default_match_res_dir="")
 

Detailed Description

    Random forest matcher
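
A minimal end-to-end usage sketch based on the public methods listed above; the file paths, parameter values, and the blocking-result table are placeholders, not prescribed by the package.

    import pandas as pd
    from simjoin_entitymatching.matcher.random_forest import RandomForest

    # Illustrative inputs; paths are placeholders.
    tableA = pd.read_csv("tableA.csv")
    tableB = pd.read_csv("tableB.csv")
    blk_res = pd.read_csv("blk_res.csv")    # blocking result to be matched

    rf = RandomForest()
    rf.fix_null(tableA, tableB)                           # patch missing values first
    rf.generate_features(tableA, tableB)                  # build the feature set
    rf.train_model_normal(tableA, tableB,
                          num_tree=30, sample_size=1000)  # placeholder values
    rf.store_model("rf_model")                            # persist the trained forest

    rf.load_model("rf_model")                             # reload later if needed
    rf.apply_model(blk_res, tableA, tableB)               # match the blocking result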

Constructor & Destructor Documentation

◆ __init__()

simjoin_entitymatching.matcher.random_forest.RandomForest.__init__ ( self)

Member Function Documentation

◆ _apply_model_worker()

simjoin_entitymatching.matcher.random_forest.RandomForest._apply_model_worker ( self,
tableid,
tableA,
tableB,
external_fea_extract = False,
if_report_pre = False,
default_blk_res_dir = "",
default_match_res_dir = "" )
protected
        external_fea_extract: indicates whether to use the C++ extension to extract
                              features; if not, the em package only supports
                              non-interchangeable features

◆ _entropy()

simjoin_entitymatching.matcher.random_forest.RandomForest._entropy ( self,
p1,
p2 )
protected

◆ _set_metadata()

simjoin_entitymatching.matcher.random_forest.RandomForest._set_metadata ( self,
dataframe,
key,
fk_ltable,
fk_rtable,
ltable,
rtable )
protected
        py_entitymatching maintains a catalog as a dict keyed by the id of the dataframe,
        and each operation, such as extracting feature vectors, requires this metadata.
        If your dataframe was not read using its API read_csv_metadata,
        then you need to set the metadata yourself.
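
A sketch of that manual alternative using py_entitymatching's catalog functions; the key and foreign-key column names ('id', '_id', 'ltable_id', 'rtable_id') are assumptions for illustration.

    import pandas as pd
    import py_entitymatching as em

    # Tables loaded with plain pandas carry no catalog metadata,
    # so it has to be registered explicitly.
    A = pd.read_csv("tableA.csv")
    B = pd.read_csv("tableB.csv")
    C = pd.read_csv("candidates.csv")

    em.set_key(A, "id")               # key of the left table (assumed column name)
    em.set_key(B, "id")               # key of the right table
    em.set_key(C, "_id")              # key of the candidate set
    em.set_ltable(C, A)               # link the candidate set to its source tables
    em.set_rtable(C, B)
    em.set_fk_ltable(C, "ltable_id")  # foreign keys into A and B (assumed names)
    em.set_fk_rtable(C, "rtable_id")

    # em.read_csv_metadata would read a table and register its metadata in one
    # step; _set_metadata covers the case where the dataframe did not come from it.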

◆ add_attrs_blk_res()

simjoin_entitymatching.matcher.random_forest.RandomForest.add_attrs_blk_res ( self,
tableA,
tableB,
path )

◆ apply_model()

simjoin_entitymatching.matcher.random_forest.RandomForest.apply_model ( self,
tottable,
tableA,
tableB,
external_fea_extract = False,
default_blk_res_dir = "",
default_match_res_dir = "" )
        Chunk the blocking result into pieces of size 1M,
        then apply the random forest concurrently;
        each piece of the table is handled by a new process.
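
An illustrative sketch of that chunk-and-fork pattern, not the package's actual worker code; the chunk size follows the description above, while the file naming and the feature-column convention are assumptions.

    import multiprocessing as mp
    import joblib
    import pandas as pd

    CHUNK_SIZE = 1_000_000  # 1M rows per piece, as described above

    def _match_chunk(args):
        chunk_id, chunk_path, model_path = args
        # Each worker loads the model and its own slice of the blocking result.
        model = joblib.load(model_path)
        chunk = pd.read_csv(chunk_path)
        feature_cols = [c for c in chunk.columns if c.startswith("fea_")]  # assumed naming
        chunk["predicted"] = model.predict(chunk[feature_cols])
        chunk.to_csv(f"match_res_{chunk_id}.csv", index=False)

    def apply_in_parallel(blk_res: pd.DataFrame, model_path: str):
        # Split the blocking result into 1M-row pieces and hand each to a new process.
        tasks = []
        for i, start in enumerate(range(0, len(blk_res), CHUNK_SIZE)):
            piece = blk_res.iloc[start:start + CHUNK_SIZE]
            piece.to_csv(f"blk_res_{i}.csv", index=False)
            tasks.append((i, f"blk_res_{i}.csv", model_path))
        with mp.Pool() as pool:
            pool.map(_match_chunk, tasks)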

◆ cand_stat()

simjoin_entitymatching.matcher.random_forest.RandomForest.cand_stat ( self,
C )

◆ fix_null()

simjoin_entitymatching.matcher.random_forest.RandomForest.fix_null ( self,
tableA,
tableB )

◆ generate_features()

simjoin_entitymatching.matcher.random_forest.RandomForest.generate_features ( self,
tableA,
tableB,
at_ltable = None,
at_rtable = None,
dataname = None,
default_output_dir = "",
wrtie_fea_names = True )

◆ get_recall()

simjoin_entitymatching.matcher.random_forest.RandomForest.get_recall ( self,
candidates,
num_golds,
external_report = False,
default_res_dir = "" )

◆ get_recall_check()

simjoin_entitymatching.matcher.random_forest.RandomForest.get_recall_check ( self,
candidates,
num_golds,
check_file_path,
check = 0 )

◆ label_cand()

simjoin_entitymatching.matcher.random_forest.RandomForest.label_cand ( self,
C )

◆ load_model()

simjoin_entitymatching.matcher.random_forest.RandomForest.load_model ( self,
path )

◆ over_sample()

simjoin_entitymatching.matcher.random_forest.RandomForest.over_sample ( self,
C )

◆ report_tree_to_text()

simjoin_entitymatching.matcher.random_forest.RandomForest.report_tree_to_text ( self,
path )

◆ sample_data()

simjoin_entitymatching.matcher.random_forest.RandomForest.sample_data ( self,
tableA,
tableB,
default_sample_res_dir = "" )
        To obtain the training set for the random forest,
        we first block the tables and label the raw results.
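
A short sketch of that preparation step using the class's own methods; it assumes tableA and tableB are already loaded and that sample_data stores its result in the cand attribute.

    from simjoin_entitymatching.matcher.random_forest import RandomForest

    rf = RandomForest()

    # Sample candidate pairs from the two tables; the output directory is a placeholder.
    rf.sample_data(tableA, tableB, default_sample_res_dir="output/sample/")

    # Label the sampled candidates and inspect the label distribution
    # (using rf.cand as the candidate set is an assumption).
    rf.label_cand(rf.cand)
    rf.cand_stat(rf.cand)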

◆ store_model()

simjoin_entitymatching.matcher.random_forest.RandomForest.store_model ( self,
path )

◆ train_model_active()

simjoin_entitymatching.matcher.random_forest.RandomForest.train_model_active ( self,
tableA,
tableB,
num_tree,
sample_size,
if_balanced = True )

◆ train_model_normal()

simjoin_entitymatching.matcher.random_forest.RandomForest.train_model_normal ( self,
tableA,
tableB,
num_tree,
sample_size,
if_balanced = True )

◆ train_model_tuning()

simjoin_entitymatching.matcher.random_forest.RandomForest.train_model_tuning ( self,
tableA,
tableB,
num_tree,
sample_size,
if_balanced = True )
        Tune the hyperparameters with 5-fold cross-validation
        to avoid overfitting.
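
A sketch of how such tuning is commonly done with scikit-learn, using the parameter grid documented under rf below; the stand-in training data and the scoring metric are assumptions.

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    # Stand-in data; in practice the labeled feature vectors come from
    # generate_features and label_cand.
    X_train, y_train = make_classification(n_samples=500, n_features=10, random_state=0)

    # Parameter grid taken from the rf member documented below.
    param_grid = {
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    }

    search = GridSearchCV(
        RandomForestClassifier(n_estimators=30),  # 30 trees, matching the rf = 30 default below
        param_grid,
        cv=5,           # 5-fold cross-validation, as described above
        scoring="f1",   # assumed metric
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    best_rf = search.best_estimator_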

Member Data Documentation

◆ cand

simjoin_entitymatching.matcher.random_forest.RandomForest.cand = ''

◆ cand_backup

str simjoin_entitymatching.matcher.random_forest.RandomForest.cand_backup = ' '

◆ features

list simjoin_entitymatching.matcher.random_forest.RandomForest.features = []

◆ graph

simjoin_entitymatching.matcher.random_forest.RandomForest.graph = nx.Graph()

◆ num_total [1/2]

int simjoin_entitymatching.matcher.random_forest.RandomForest.num_total = 0

◆ num_total [2/2]

str simjoin_entitymatching.matcher.random_forest.RandomForest.num_total = 'id',

◆ num_training

int simjoin_entitymatching.matcher.random_forest.RandomForest.num_training = 0

◆ rf [1/3]

simjoin_entitymatching.matcher.random_forest.RandomForest.rf = ''

◆ rf [2/3]

dict simjoin_entitymatching.matcher.random_forest.RandomForest.rf
Initial value:
= {
"max_depth": [None, 10, 20, 30], # Maximum depth of the tree
"min_samples_split": [2, 5, 10], # Minimum number of samples required to split an internal node
"min_samples_leaf": [1, 2, 4] # Minimum number of samples required to be at a leaf node
}

◆ rf [3/3]

int simjoin_entitymatching.matcher.random_forest.RandomForest.rf = 30

◆ sparkrf

str simjoin_entitymatching.matcher.random_forest.RandomForest.sparkrf = ''

◆ test_table

str simjoin_entitymatching.matcher.random_forest.RandomForest.test_table = ' '

The documentation for this class was generated from the following file: