// Fragment from inside a loop over index `i`; the enclosing function header
// and many intermediate lines are not visible in this extract.
// Binds the i-th entries of `groups`, `grpdlm` and `grpqgm` for read-only use.
234 const auto &curGrp = groups[i];
235 const auto &curGrpDlm = grpdlm[i];
236 const auto &curGrpQgm = grpqgm[i];
// When `numFeature` is zero, report the attribute name on stderr.
// (What happens after the message is not visible here.)
238 if(numFeature == 0) {
239 std::cerr <<
"no such attr: " << attrs[i] << std::endl;
// Collect the key ids of every group that holds more than one entry.
244 std::vector<int> grpid;
245 for(
const auto &grpit : curGrp) {
246 int keyId = grpit.first;
247 if(grpit.second.size() > 1)
248 grpid.emplace_back(keyId);
// Number of groups selected above.
251 int bucketSize = (int)grpid.size();
// Loops over features and over selected groups; their bodies are not visible.
254 for(
int j = 0; j < numFeature; j++) {
256 for(
int l = 0; l < bucketSize; l++) {
264 for(
int didx = 0; didx < bucketSize; didx++)
// Enumerate every unordered pair (id1 < id2) of selected groups and fetch,
// for each side, the group's entries from curGrp / curGrpDlm / curGrpQgm.
270 for(
int id1 = 0; id1 < bucketSize; id1++) {
271 int lid = grpid[id1];
272 const auto &ldocs = curGrp.at(lid);
273 const auto &ldlms = curGrpDlm.at(lid);
274 const auto &lqgms = curGrpQgm.at(lid);
275 for(
int id2 = id1 + 1; id2 < bucketSize; id2++) {
276 int rid = grpid[id2];
277 const auto &rdocs = curGrp.at(rid);
278 const auto &rdlms = curGrpDlm.at(rid);
279 const auto &rqgms = curGrpQgm.at(rid);
// Accumulators start at 0.0, then every (left, right) pair of "dlm" entries
// is visited. Presumably Jaccard / cosine / Dice maxima -- the update code is
// not visible; confirm.
283 double maxJacVal = 0.0;
284 double maxCosVal = 0.0;
285 double maxDiceVal = 0.0;
287 for(
const auto &ldoc : ldlms) {
288 for(
const auto &rdoc : rdlms) {
// Same accumulator names declared again, presumably in a different scope
// (the scope boundaries are not visible). Here the "qgm" entries are paired,
// followed by a pairing of the raw docs.
// NOTE(review): minLevVal starts at 0.0; if it is later updated with a
// min-style comparison against non-negative values it can never change --
// confirm against the loop body, which is not shown.
306 double maxJacVal = 0.0;
307 double maxCosVal = 0.0;
308 double maxDiceVal = 0.0;
310 double minLevVal = 0.0;
311 double maxExmVal = 0.0;
312 for(
const auto &ldoc : lqgms) {
313 for(
const auto &rdoc : rqgms) {
320 for(
const auto &ldoc : ldocs) {
321 for(
const auto &rdoc : rdocs) {
// Separate accumulators for the "dlm" and "qgm" variants, including integer
// overlap values; the dlm pairs are visited first, then the qgm pairs.
341 double maxJacDlmVal = 0.0, maxJacQgmVal = 0.0;
342 double maxCosDlmVal = 0.0, maxCosQgmVal = 0.0;
343 double maxDiceDlmVal = 0.0, maxDiceQgmVal = 0.0;
344 int maxOvlpDlmVal = 0, maxOvlpQgmVal = 0;
345 for(
const auto &ldoc : ldlms) {
346 for(
const auto &rdoc : rdlms) {
353 for(
const auto &ldoc : lqgms) {
354 for(
const auto &rdoc : rqgms) {
// Fragment of a routine that reads per-attribute group files; the enclosing
// function header and several intermediate lines are not visible.
// Character set: space, double quote, single quote, comma, backslash, tab,
// carriage return, line feed. Its use is not visible (presumably token
// delimiters -- confirm).
386 std::string delims =
" \"\',\\\t\r\n";
// Path of the group file for the i-th attribute.
389 std::string grpPath =
"buffer/interchangeable_grp_" +
attrVec[i] +
".txt";
// Mutable reference to the i-th group container, filled below.
390 auto &curGrp =
group[i];
397 std::vector<std::string> entityVec;
// NOTE(review): no check that the stream opened is visible here. If the first
// getline fails and `entity` is empty, std::stoi throws
// std::invalid_argument -- confirm this is handled in the lines not shown.
399 std::ifstream grpFile(grpPath.c_str(), std::ios::in);
// The first line of the file is parsed as the number of keys.
401 getline(grpFile, entity);
402 int totalKey = std::stoi(entity);
// For each key, one line is read. entityVec[0] and entityVec[1] are parsed as
// the key id and the number of entries that follow. How entityVec is filled
// from `entity` is not visible (presumably a split).
405 for(
int j = 0; j < totalKey; j++) {
407 getline(grpFile, entity);
409 int keyId = std::stoi(entityVec[0]);
410 int length = std::stoi(entityVec[1]);
// Read `length` lines and append each one to the key's entry list.
415 for(
int l = 0; l < length; l++) {
417 getline(grpFile, doc);
419 curGrp[keyId].emplace_back(doc);
// Token vectors for the same line are appended to curGrpDlm / curGrpQgm under
// the same key. The code that fills tokensDlm / tokensQgm (and uses tok1 /
// tok2) is not visible in this extract.
421 std::vector<std::string> tokensDlm;
422 std::vector<std::string> tokensQgm;
423 std::string tok1 =
"dlm", tok2 =
"qgm";
426 curGrpDlm[keyId].emplace_back(tokensDlm);
427 curGrpQgm[keyId].emplace_back(tokensQgm);
// Fragment that assigns rows of table A to groups for the i-th attribute; the
// enclosing function header and some intermediate lines are not visible.
442 std::string curAttr =
attrVec[i];
// The column position is looked up in tableA's schema and used for both tables.
// NOTE(review): this assumes tableB stores the attribute at the same column
// index as tableA -- confirm.
443 int attrPos = (int)tableA.inverted_schema.at(curAttr);
444 const auto &curColA = tableA.cols[attrPos];
445 const auto &curColB = tableB.cols[attrPos];
446 int colASize = (int)curColA.size();
447 int colBSize = (int)curColB.size();
// Lookup from a cell value to a key id for this attribute (used through
// find/at below).
449 const auto &curClt =
cluster[i];
// For every row of table A's column: when the cell value is a key of curClt,
// record its key id in curGrpIdA and the row index under curGrpA[keyId];
// line 466 records -1 instead. The branch lines between these statements are
// not visible (presumably `if(haskey) ... else ...`).
456 for(
int row = 0; row < colASize; row++) {
457 const auto &doc = curColA[row];
458 bool haskey = curClt.find(doc) != curClt.end();
461 int keyId = curClt.at(doc);
462 curGrpIdA.emplace_back(keyId);
463 curGrpA[keyId].emplace_back(row);
466 curGrpIdA.emplace_back(-1);
// For every row of table B's column: when the cell value is a key of curClt,
// record its key id in curGrpIdB and the row index under curGrpB[keyId];
// otherwise record -1 in curGrpIdB so that it keeps one entry per row.
// The branch lines between these statements are not visible in this extract
// (presumably `if(haskey) ... else ...`, mirroring the table-A loop).
469 for(
int row = 0; row < colBSize; row++) {
470 const auto &doc = curColB[row];
471 bool haskey = curClt.find(doc) != curClt.end();
474 int keyId = curClt.at(doc);
475 curGrpIdB.emplace_back(keyId);
476 curGrpB[keyId].emplace_back(row);
// Fix: the "no group" marker belongs in the per-row id list (curGrpIdB), as
// the table-A loop does with curGrpIdA. The previous code appended to
// curGrpB, the per-group row container, leaving curGrpIdB short.
479 curGrpIdB.emplace_back(-1);