Functions | |
normalize_values (ori_group, ori_clt, normalized_attrs, default_buffer_dir="") | |
group_interchangeable (tableA, tableB, group_tau, group_strategy=Literal["doc", "mix"], num_data=Literal[1, 2], default_match_res_dir="", default_vmatcher_dir="", default_icv_dir="", default_buffer_dir="") | |
simjoin_entitymatching.value_matcher.interchangeable.group_interchangeable | ( | tableA, | |
tableB, | |||
group_tau, | |||
group_strategy = Literal["doc", "mix"], | |||
num_data = Literal[1, 2], | |||
default_match_res_dir = "", | |||
default_vmatcher_dir = "", | |||
default_icv_dir = "", | |||
default_buffer_dir = "" ) |
Apply the value matcher and group interchangeable values based on the matching result. Two approaches are described: (1) use doc2vec for all attributes, since for str_eq_1w attributes the raw data may contain values that are longer than one word; (2) use doc2vec together with word2vec (the latter for str_eq_1w attributes), which omits the impact of the abnormal (longer) values mentioned in (1).
simjoin_entitymatching.value_matcher.interchangeable.normalize_values | ( | ori_group, | |
ori_clt, | |||
normalized_attrs, | |||
default_buffer_dir = "" ) |