Entity Matching by Similarity Join
 
Loading...
Searching...
No Matches
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec Class Reference

Public Member Functions

 __init__ (self, inmemory_)
 
 sample_sample (self, sample_size=5000)
 
 sample_match (self, sample_size=5000)
 
 train_and_save (self, blk_attr, rawtable, rawtable2=None, default_output_dir="")
 
 train_all_and_save (self, attrs, rawtable, rawtable2=None, default_output_dir="")
 
 apply_sample (self, blk_attr, tau, ground_truth_label=False, default_icv_dir="", default_gold_dir="", default_sample_res_dir="")
 
 group_interchangeable (self, blk_attr, tau, default_icv_dir="")
 
 group_interchangeable_parallel (self, blk_attr, tau, tottable=100, default_icv_dir="")
 
 load_sample_res (self, tableA, tableB, default_sample_res_dir="")
 
 load_match_res (self, tableA, tableB, default_match_res_dir="")
 
 load_model (self, usage, attr, default_model_dir="")
 

Public Attributes

 model = None
 
 sample_res = None
 
 match_res = None
 
 training_set = None
 
list setences = []
 
int inmemory = inmemory_
 
 cur_parent_dir = str(pathlib.Path(__file__).parent.resolve())
 
int model = 1:
 

Protected Member Functions

 _preprocess (self, blk_attr, rawtable, rawtable2=None)
 
 _label_and_group (self, tau, cluster, bag_of_words, pair_list, word2id, default_icv_dir="")
 
 _flush_group_and_cluster (self, blk_attr, ori_grp, ori_clt, default_icv_dir="")
 
 _group_interchangeable (self, blk_attr, tableid, return_dict, default_match_res_dir="")
 

Detailed Description

Doc2Vec for attributes: str_bt_1w_5w, str_bt_5w_10w and str_gt_10w
Generally the value matcher has two usages:
    1. usage 0, label the sample result. 
    2. usage 1, group interchangeable values in match result.
    
    For usage 0, the dataframe is stored at "sample_res"
    For usage 1, the dataframe is stored at "match_res"
    
    The "inmemory" flag indicates whether doc2vec can be trained directly on
    the whole table(s) in memory. It is usually set to True when the table(s)
    are not too large.
    
    Both usages share "training_set" as the training set if "inmemory" is False.

Constructor & Destructor Documentation

◆ __init__()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.__init__ ( self,
inmemory_ )

Member Function Documentation

◆ _flush_group_and_cluster()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec._flush_group_and_cluster ( self,
blk_attr,
ori_grp,
ori_clt,
default_icv_dir = "" )
protected

◆ _group_interchangeable()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec._group_interchangeable ( self,
blk_attr,
tableid,
return_dict,
default_match_res_dir = "" )
protected
Worker for grouping each chunk of the match result

◆ _label_and_group()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec._label_and_group ( self,
tau,
cluster,
bag_of_words,
pair_list,
word2id,
default_icv_dir = "" )
protected
helper function for "group_interchangeable"
1. call executable to label vecs
2. group similar docs
3. report in the desired file

Args:
    cluster: the disjoint-set union (DSU)
    bag_of_words: the doc set
    pair_list: matching result

◆ _preprocess()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec._preprocess ( self,
blk_attr,
rawtable,
rawtable2 = None )
protected

◆ apply_sample()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.apply_sample ( self,
blk_attr,
tau,
ground_truth_label = False,
default_icv_dir = "",
default_gold_dir = "",
default_sample_res_dir = "" )
Apply Doc2Vec for sampling and labeling candidates

◆ group_interchangeable()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.group_interchangeable ( self,
blk_attr,
tau,
default_icv_dir = "" )
Apply Doc2Vec for grouping interchangeable values in the matching result

◆ group_interchangeable_parallel()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.group_interchangeable_parallel ( self,
blk_attr,
tau,
tottable = 100,
default_icv_dir = "" )

◆ load_match_res()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.load_match_res ( self,
tableA,
tableB,
default_match_res_dir = "" )

◆ load_model()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.load_model ( self,
usage,
attr,
default_model_dir = "" )
usage: 0 for labeler and 1 for value matcher

◆ load_sample_res()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.load_sample_res ( self,
tableA,
tableB,
default_sample_res_dir = "" )

◆ sample_match()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.sample_match ( self,
sample_size = 5000 )

◆ sample_sample()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.sample_sample ( self,
sample_size = 5000 )

◆ train_all_and_save()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.train_all_and_save ( self,
attrs,
rawtable,
rawtable2 = None,
default_output_dir = "" )
Train model for all attributes except id
attrs: attributes that could use word2vec

◆ train_and_save()

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.train_and_save ( self,
blk_attr,
rawtable,
rawtable2 = None,
default_output_dir = "" )

Member Data Documentation

◆ cur_parent_dir

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.cur_parent_dir = str(pathlib.Path(__file__).parent.resolve())

◆ inmemory

int simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.inmemory = inmemory_

◆ match_res

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.match_res = None

◆ model [1/2]

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.model = None

◆ model [2/2]

int simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.model = 1:

◆ sample_res

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.sample_res = None

◆ setences

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.setences = []

◆ training_set

simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.training_set = None

The documentation for this class was generated from the following file: