Public Member Functions | |
__init__ (self, inmemory_) | |
sample_sample (self, sample_size=5000) | |
sample_match (self, sample_size=5000) | |
train_and_save (self, blk_attr, rawtable, rawtable2=None, default_output_dir="") | |
train_all_and_save (self, attrs, rawtable, rawtable2=None, default_output_dir="") | |
apply_sample (self, blk_attr, tau, ground_truth_label=False, default_icv_dir="", default_gold_dir="", default_sample_res_dir="") | |
group_interchangeable (self, blk_attr, tau, default_icv_dir="") | |
group_interchangeable_parallel (self, blk_attr, tau, tottable=100, default_icv_dir="") | |
load_sample_res (self, tableA, tableB, default_sample_res_dir="") | |
load_match_res (self, tableA, tableB, default_match_res_dir="") | |
load_model (self, usage, attr, default_model_dir="") | |
Public Attributes | |
model = None | |
sample_res = None | |
match_res = None | |
training_set = None | |
list | setences = [] |
int | inmemory = inmemory_ |
cur_parent_dir = str(pathlib.Path(__file__).parent.resolve()) | |
int | model = 1: |
Protected Member Functions | |
_preprocess (self, blk_attr, rawtable, rawtable2=None) | |
_label_and_group (self, tau, cluster, bag_of_words, pair_list, word2id, default_icv_dir="") | |
_flush_group_and_cluster (self, blk_attr, ori_grp, ori_clt, default_icv_dir="") | |
_group_interchangeable (self, blk_attr, tableid, return_dict, default_match_res_dir="") | |
Doc2Vec for attributes: str_bt_1w_5w, str_bt_5w_10w and str_gt_10w. Generally the value matcher has two usages: 1. usage 0, label the sample result; 2. usage 1, group interchangeable values in the match result. For usage 0, the dataframe is stored in "sample_res". For usage 1, the dataframe is stored in "match_res". The "inmemory" flag indicates whether doc2vec can be trained directly on the whole table(s) in memory. It is usually set to True when the table(s) are not too large. Both usages share "training_set" as the training set if "inmemory" is False.
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.__init__ | ( | self, | |
inmemory_ ) |
|
protected |
|
protected |
Worker for grouping each chunk of the match result.
|
protected |
Helper function for "group_interchangeable": 1. call the executable to label vecs; 2. group similar docs; 3. report in the desired file. Args: cluster: the dsu; bag_of_words: the doc set; pair_list: the matching result.
|
protected |
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.apply_sample | ( | self, | |
blk_attr, | |||
tau, | |||
ground_truth_label = False, | |||
default_icv_dir = "", | |||
default_gold_dir = "", | |||
default_sample_res_dir = "" ) |
Apply Doc2Vec for sampling and labeling cand
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.group_interchangeable | ( | self, | |
blk_attr, | |||
tau, | |||
default_icv_dir = "" ) |
Apply Doc2Vec for grouping interchangeable values in the matching result.
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.group_interchangeable_parallel | ( | self, | |
blk_attr, | |||
tau, | |||
tottable = 100, | |||
default_icv_dir = "" ) |
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.load_match_res | ( | self, | |
tableA, | |||
tableB, | |||
default_match_res_dir = "" ) |
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.load_model | ( | self, | |
usage, | |||
attr, | |||
default_model_dir = "" ) |
usage: 0 for labeler and 1 for value matcher
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.load_sample_res | ( | self, | |
tableA, | |||
tableB, | |||
default_sample_res_dir = "" ) |
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.sample_match | ( | self, | |
sample_size = 5000 ) |
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.sample_sample | ( | self, | |
sample_size = 5000 ) |
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.train_all_and_save | ( | self, | |
attrs, | |||
rawtable, | |||
rawtable2 = None, | |||
default_output_dir = "" ) |
Train a model for every attribute except id. Args: attrs: attributes that could use word2vec.
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.train_and_save | ( | self, | |
blk_attr, | |||
rawtable, | |||
rawtable2 = None, | |||
default_output_dir = "" ) |
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.cur_parent_dir = str(pathlib.Path(__file__).parent.resolve()) |
int simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.inmemory = inmemory_ |
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.match_res = None |
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.model = None |
int simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.model = 1: |
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.sample_res = None |
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.setences = [] |
simjoin_entitymatching.value_matcher.doc2vec.Doc2Vec.training_set = None |