gopher-clustering2.c - brcon2024-hackathons - Bitreichcon 2024 Hackathons HTML git clone git://bitreich.org/brcon2024-hackathons git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/brcon2024-hackathons DIR Log DIR Files DIR Refs DIR Tags DIR Submodules --- gopher-clustering2.c (24998B) --- 1 #define _POSIX_C_SOURCE 200112L 2 3 #include <assert.h> 4 #include <stdarg.h> 5 #include <stdint.h> 6 #include <stdio.h> 7 #include <stdlib.h> 8 #include <string.h> 9 10 #include <math.h> 11 12 #include <netdb.h> 13 #include <sys/socket.h> 14 #include <sys/types.h> 15 #include <unistd.h> 16 17 uint32_t 18 fnv1a(int n,...) 19 { 20 int i; 21 char *s; 22 va_list l; 23 uint32_t h; 24 25 h = 0x811c9dc5; 26 27 va_start(l, n); 28 for (i = 0; i < n; i++) { 29 for (s = va_arg(l, char*); *s; s++) { 30 h ^= *s; 31 h *= 0x01000193; 32 } 33 } 34 va_end(l); 35 36 return h; 37 } 38 39 uint32_t 40 xorshift(uint32_t *s) 41 { 42 *s ^= *s << 13; 43 *s ^= *s >> 17; 44 *s ^= *s << 5; 45 return *s; 46 } 47 48 /* 49 https://github.com/skeeto/hash-prospector/issues/19#issuecomment-1120105785 50 */ 51 uint32_t 52 mix(uint32_t x) 53 { 54 x ^= x >> 16; 55 x *= 0x21f0aaad; 56 x ^= x >> 15; 57 x *= 0x735a2d97; 58 x ^= x >> 16; 59 return x; 60 } 61 62 struct gopherentry { 63 struct gopherentry *next; 64 char type; 65 char *fields[4]; 66 size_t i, s; 67 }; 68 69 char * 70 gopher_fieldend(char *s) 71 { 72 for (; *s; s++) { 73 if (*s == '\t') 74 break; 75 if (*s == '\r' && *(s+1) == '\n') 76 break; 77 if (*s == '\n') 78 break; 79 } 80 return s; 81 } 82 83 char * 84 gopher_nextentry(char *s) 85 { 86 char *c; 87 88 if (c = strchr(s, '\n')) 89 return c + 1; 90 return NULL; 91 } 92 93 void * 94 xmalloc(size_t s) 95 { 96 void *p; 97 98 if (!(p = malloc(s))) { 99 perror(NULL); 100 exit(1); 101 } 102 return p; 103 } 104 105 void * 106 xrealloc(void *p, size_t s) 107 { 108 if (!(p = realloc(p, s))) { 109 perror(NULL); 110 exit(1); 111 } 112 return p; 113 } 114 115 /* Allow a lot of ugly things... */ 116 size_t 117 gopher_parsemenu(char *d, struct gopherentry **g) 118 { 119 ssize_t gl; 120 size_t fl, i, s; 121 char *c, *p, pt; 122 struct gopherentry *cg; 123 124 gl = 0; 125 cg = NULL; 126 127 s = 0; 128 pt = 0; 129 for (c = d; c && *c; c = gopher_nextentry(c)) { 130 if (*c == '.') 131 break; 132 if (*c == '\n') 133 continue; 134 135 gl++; 136 cg = xrealloc(cg, gl * sizeof(struct gopherentry)); 137 memset(&cg[gl-1], 0, sizeof(struct gopherentry)); 138 139 if (*c != 'i' && pt == 'i') 140 s++; 141 pt = *c; 142 143 cg[gl-1].type = *c; 144 cg[gl-1].i = gl-1; 145 cg[gl-1].s = s; 146 c++; 147 148 for (i = 0; i < 4; i++) { 149 p = gopher_fieldend(c); 150 fl = p - c; 151 cg[gl-1].fields[i] = xmalloc(fl + 1); 152 memcpy(cg[gl-1].fields[i], c, fl); 153 cg[gl-1].fields[i][fl] = '\0'; 154 if (*p != '\t') 155 break; 156 c = p + 1; 157 } 158 } 159 s++; 160 *g = cg; 161 return gl; 162 } 163 164 ssize_t 165 gopher_removeentries(struct gopherentry *g, size_t l) 166 { 167 size_t i, j; 168 169 for (i = 0; i < l;) { 170 if (g[i].type == 'i' || g[i].type == '3' || !g[i].fields[2] || !g[i].fields[3]) { 171 l--; 172 memmove(&g[i], &g[i+1], (l - i) * sizeof(*g)); 173 continue; 174 } 175 for (j = 0; j < i; j++) { 176 if (!strcmp(g[i].fields[1], g[j].fields[1]) && 177 !strcmp(g[i].fields[2], g[j].fields[2]) && 178 !strcmp(g[i].fields[3], g[j].fields[3])) 179 break; 180 } 181 if (j < i) { 182 l--; 183 memmove(&g[i], &g[i+1], (l - i) * sizeof(*g)); 184 continue; 185 } 186 i++; 187 } 188 189 return l; 190 } 191 192 ssize_t 193 gopher_request(char *host, char *port, char *selector, char *buffer, size_t l) 194 { 195 int fd; 196 struct addrinfo hints; 197 struct addrinfo *r, *rp; 198 ssize_t n, result, rn; 199 200 memset(&hints, 0, sizeof(hints)); 201 hints.ai_family = AF_UNSPEC; 202 hints.ai_socktype = SOCK_STREAM; 203 204 if (getaddrinfo(host, port, &hints, &r) != 0) 205 return -1; 206 207 for (rp = r; rp; rp = rp->ai_next) { 208 if ((fd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol)) == -1) 209 continue; 210 if (connect(fd, rp->ai_addr, rp->ai_addrlen) != -1) 211 break; 212 close(fd); 213 } 214 freeaddrinfo(r); 215 if (!rp) 216 return -1; 217 218 result = -1; 219 if (write(fd, selector, strlen(selector)) != strlen(selector)) 220 goto cleanup; 221 222 if (write(fd, "\r\n", 2) != 2) 223 goto cleanup; 224 225 n = 0; 226 while (1) { 227 if (n + 512 > l - 1) 228 goto cleanup; 229 if ((rn = read(fd, &buffer[n], 512)) == -1) 230 goto cleanup; 231 else if (!rn) 232 break; 233 n += rn; 234 } 235 236 buffer[n] = '\0'; 237 result = n; 238 239 cleanup: 240 close(fd); 241 242 return result; 243 } 244 245 size_t 246 triangularnumber(size_t n) 247 { 248 return (n * n + n) / 2; 249 } 250 251 size_t 252 distanceindex(size_t i, size_t j) 253 { 254 size_t t; 255 256 if (i < j) { 257 t = i; 258 i = j; 259 j = t; 260 } 261 262 return triangularnumber(i) + j; 263 } 264 265 struct point { 266 double dims[2]; 267 }; 268 269 #define SAMMON_MAPPING_MAX_ITER (1 << 13) 270 #define SAMMON_MAPPING_LR (0.01) 271 void 272 sammon(double *distances, size_t l, uint32_t prng, struct point *points) 273 { 274 size_t i, j, k, t, tmax; 275 double *gnarf; 276 double c, lr; 277 double e1, e2, sum, lasterror; 278 struct point *newpoints; 279 280 gnarf = calloc(triangularnumber(l), sizeof(*gnarf)); 281 newpoints = calloc(l, sizeof(*newpoints)); 282 283 for (i = 0; i < l; i++) { 284 points[i].dims[0] = (double)xorshift(&prng) / UINT32_MAX - 0.5; 285 points[i].dims[1] = (double)xorshift(&prng) / UINT32_MAX - 0.5; 286 } 287 288 c = 0; 289 for (i = 0; i < l; i++) { 290 gnarf[distanceindex(i, i)] = 0; 291 for (j = 0; j < i; j++) { 292 gnarf[distanceindex(i, j)] = sqrt(pow(points[i].dims[0] - points[j].dims[0], 2) + pow(points[i].dims[1] - points[j].dims[1], 2)); 293 c += distances[distanceindex(i, j)]; 294 } 295 } 296 297 lasterror = 0; 298 for (t = 0, tmax = SAMMON_MAPPING_MAX_ITER; t < tmax; t++) { 299 lr = SAMMON_MAPPING_LR; 300 for (i = 0; i < l; i++) { 301 for (j = 0; j < 2; j++) { 302 e1 = -2. / c; 303 sum = 0; 304 for (k = 0; k < l; k++) { 305 if (i == k) 306 continue; 307 sum += (distances[distanceindex(i, k)] - gnarf[distanceindex(i, k)]) / (distances[distanceindex(i, k)] * gnarf[distanceindex(i, k)]) * (points[i].dims[j] - points[k].dims[j]); 308 } 309 e1 *= sum; 310 311 e2 = -2. / c; 312 sum = 0; 313 for (k = 0; k < l; k++) { 314 if (i == k) 315 continue; 316 sum += 1. / (distances[distanceindex(i, k)] * gnarf[distanceindex(i, k)]) * ((distances[distanceindex(i, k)] - gnarf[distanceindex(i, k)]) - pow(points[i].dims[j] - points[k].dims[j], 2) / gnarf[distanceindex(i, k)] * (1 + (distances[distanceindex(i, k)] - gnarf[distanceindex(i, k)]) / gnarf[distanceindex(i, k)])); 317 } 318 e2 *= sum; 319 320 newpoints[i].dims[j] = points[i].dims[j] - lr * (e1 / fabs(e2)); 321 } 322 } 323 324 sum = 0; 325 for (i = 0; i < l; i++) { 326 for (j = 0; j < i; j++) { 327 gnarf[distanceindex(i, j)] = sqrt(pow(newpoints[i].dims[0] - newpoints[j].dims[0], 2) + pow(newpoints[i].dims[1] - newpoints[j].dims[1], 2)); 328 sum += pow(distances[distanceindex(i, j)] - gnarf[distanceindex(i, j)], 2) / distances[distanceindex(i, j)]; 329 } 330 } 331 332 fprintf(stderr, "sammon mapping error: %lf\n", sum / c); 333 if (t > 0 && fabs(sum / c - lasterror) < 0.000000000001) 334 break; 335 lasterror = sum / c; 336 337 memcpy(points, newpoints, l * sizeof(*points)); 338 } 339 340 free(newpoints); 341 free(gnarf); 342 } 343 344 int 345 fastermsc(struct gopherentry *g, size_t l, size_t k, double *distances, size_t *assoc, uint32_t prng) 346 { 347 size_t i, j, n, ck, grimmel; 348 ssize_t m1, m, m2; 349 size_t ti, th, huh; 350 double dsum, d, d2, gnarf, tdsum; 351 struct { 352 size_t medoid; 353 size_t n; 354 double dsum; 355 } *clusters; 356 struct { 357 size_t n1, n2, n3; 358 double d1, d2, d3; 359 } *cache; 360 double *dS, *dS2; 361 double cdS, cdS2; 362 size_t cm, cx; 363 ssize_t lx; 364 int flag, result; 365 uint32_t r; 366 double *C; 367 368 if (!l) 369 return 0; 370 371 if (l < k) 372 k = l; 373 374 clusters = calloc(k, sizeof(*clusters)); 375 if (!clusters) 376 return -1; 377 378 result = -1; 379 380 /* 381 Seed the medoids randomly 382 */ 383 for (i = n = 0; i < l; i++) { 384 if ((l - i) * (xorshift(&prng) / (double)UINT32_MAX) < k - n) 385 clusters[n++].medoid = i; 386 if (n == k) 387 break; 388 } 389 390 cache = calloc(l, sizeof(*cache)); 391 if (!cache) 392 goto cleanup; 393 394 dS = calloc(k, sizeof(*dS)); 395 if (!dS) 396 goto cleanup; 397 398 dS2 = calloc(k, sizeof(*dS2)); 399 if (!dS2) 400 goto cleanup; 401 402 for (i = 0; i < l; i++) { 403 cache[i].d1 = cache[i].d2 = cache[i].d3 = INFINITY; 404 for (j = 0; j < k; j++) { 405 d = distances[distanceindex(i, clusters[j].medoid)]; 406 if (d < cache[i].d1) { 407 cache[i].d3 = cache[i].d2; 408 cache[i].d2 = cache[i].d1; 409 cache[i].d1 = d; 410 cache[i].n3 = cache[i].n2; 411 cache[i].n2 = cache[i].n1; 412 cache[i].n1 = j; 413 } else if (d < cache[i].d2) { 414 cache[i].d3 = cache[i].d2; 415 cache[i].d2 = d; 416 cache[i].n3 = cache[i].n2; 417 cache[i].n2 = j; 418 } else if (d < cache[i].d3) { 419 cache[i].d3 = d; 420 cache[i].n3 = j; 421 } 422 } 423 } 424 425 for (i = 0; i < k; i++) { 426 dS[i] = 0; 427 for (j = 0; j < l; j++) { 428 if (cache[j].n1 != i) 429 continue; 430 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d2 / cache[j].d3; 431 } 432 for (j = 0; j < l; j++) { 433 if (cache[j].n2 != i) 434 continue; 435 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d1 / cache[j].d3; 436 } 437 } 438 439 lx = -1; 440 for (grimmel = 0;grimmel < 32;grimmel++) { 441 cdS = 0; 442 for (i = 0; i < l; i++) { 443 if (i == lx) 444 goto fastermsc_finished; 445 446 for (j = 0; j < k; j++) { 447 if (clusters[j].medoid == i) 448 goto fastmsc_nexti; 449 } 450 451 memcpy(dS2, dS, k * sizeof(*dS2)); 452 cdS2 = 0; 453 for (j = 0; j < l; j++) { 454 d = distances[distanceindex(i, j)]; 455 if (d < cache[j].d1) { 456 cdS2 += cache[j].d1 / cache[j].d2 - d / cache[j].d1; 457 dS2[cache[j].n1] += d / cache[j].d1 + cache[j].d2 / cache[j].d3 - (cache[j].d1 + d) / cache[j].d2; 458 dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / cache[j].d2; 459 } else if (d < cache[j].d2) { 460 cdS2 += cache[j].d1 / cache[j].d2 - cache[j].d1 / d; 461 dS2[cache[j].n1] += cache[j].d1 / d + cache[j].d2 / cache[j].d3 - (cache[j].d1 + d) / cache[j].d2; 462 dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / cache[j].d2; 463 } else if (d < cache[j].d3) { 464 dS2[cache[j].n1] += cache[j].d2 / cache[j].d3 - cache[j].d2 / d; 465 dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / d; 466 } 467 } 468 d = -INFINITY; 469 for (j = 0; j < k; j++) { 470 if (dS2[j] > d) { 471 d = dS2[j]; 472 m = j; 473 } 474 } 475 dS2[m] += cdS2; 476 if (dS2[m] <= 0) 477 continue; 478 lx = i; 479 clusters[m].medoid = i; 480 for (j = 0; j < l; j++) { 481 if (cache[j].n1 != m && cache[j].n2 != m && cache[j].n3 != m) 482 continue; 483 cache[j].d1 = cache[j].d2 = cache[j].d3 = INFINITY; 484 for (n = 0; n < k; n++) { 485 d = distances[distanceindex(j, clusters[n].medoid)]; 486 if (d < cache[j].d1) { 487 cache[j].d3 = cache[j].d2; 488 cache[j].d2 = cache[j].d1; 489 cache[j].d1 = d; 490 cache[j].n3 = cache[j].n2; 491 cache[j].n2 = cache[j].n1; 492 cache[j].n1 = n; 493 } else if (d < cache[j].d2) { 494 cache[j].d3 = cache[j].d2; 495 cache[j].d2 = d; 496 cache[j].n3 = cache[j].n2; 497 cache[j].n2 = n; 498 } else if (d < cache[j].d3) { 499 cache[j].d3 = d; 500 cache[j].n3 = n; 501 } 502 } 503 } 504 for (j = 0; j < k; j++) { 505 dS[j] = 0; 506 for (n = 0; n < l; n++) { 507 if (cache[n].n1 != j) 508 continue; 509 dS[j] += cache[n].d1 / cache[n].d2 - cache[n].d2 / cache[n].d3; 510 } 511 for (n = 0; n < l; n++) { 512 if (cache[n].n2 != j) 513 continue; 514 dS[j] += cache[n].d1 / cache[n].d2 - cache[n].d1 / cache[n].d3; 515 } 516 } 517 fastmsc_nexti:; 518 } 519 /* 520 if (lx == -1) 521 break; 522 */ 523 } 524 fastermsc_finished:; 525 526 for (i = 0; i < k; i++) 527 clusters[i].n = 0; 528 529 for (i = 0; i < l; i++) { 530 d = INFINITY; 531 for (j = 0; j < k; j++) { 532 if (distances[distanceindex(clusters[j].medoid, i)] < d) { 533 assoc[i] = j; 534 d = distances[distanceindex(clusters[j].medoid, i)]; 535 } 536 } 537 clusters[assoc[i]].n++; 538 } 539 540 for (i = 0; i < l; i++) { 541 if (clusters[assoc[i]].medoid == i) 542 assoc[i] |= 128; 543 } 544 545 result = k; 546 cleanup: 547 free(clusters); 548 return result; 549 } 550 551 int 552 fastmsc(struct gopherentry *g, size_t l, size_t k, double *distances, size_t *assoc, uint32_t prng) 553 { 554 size_t i, j, n, ck; 555 ssize_t m1, m, m2; 556 size_t ti, th; 557 double dsum, d, d2, gnarf, tdsum; 558 struct { 559 size_t medoid; 560 size_t n; 561 double dsum; 562 } *clusters; 563 struct { 564 size_t n1, n2; 565 double d1, d2, d3; 566 } *cache; 567 double *dS, *dS2; 568 double cdS, cdS2; 569 double _e, _y, _z, _s; 570 size_t cm, cx; 571 int flag, result; 572 uint32_t r; 573 double *C; 574 575 if (!l) 576 return 0; 577 578 if (l < k) 579 k = l; 580 581 clusters = calloc(k, sizeof(*clusters)); 582 if (!clusters) 583 return -1; 584 585 result = -1; 586 587 /* 588 seed the medoids using PAM BUILD 589 d = INFINITY; 590 m = -1; 591 for (i = 0; i < l; i++) { 592 dsum = 0; 593 for (j = 0; j < l; j++) 594 dsum += distances[distanceindex(i, j)]; 595 if (dsum < d) { 596 m = i; 597 d = dsum; 598 } 599 } 600 if (m == -1) 601 goto cleanup; 602 ck = 0; 603 clusters[ck].medoid = m; 604 ck++; 605 606 C = xmalloc(l * l * sizeof(*C)); 607 608 for (; ck < k; ck++) { 609 memset(C, 0, l * l * sizeof(*C)); 610 for (i = 0; i < l; i++) { 611 for (j = 0; j < ck; j++) { 612 if (clusters[j].medoid == i) 613 goto build_nexti; 614 } 615 for (j = 0; j < l; j++) { 616 if (i == j) 617 continue; 618 for (n = 0; n < ck; n++) { 619 if (clusters[n].medoid == j) 620 goto build_nextj; 621 } 622 d = INFINITY; 623 for (n = 0; n < ck; n++) { 624 if (distances[distanceindex(j, clusters[n].medoid)] < d) 625 d = distances[distanceindex(j, clusters[n].medoid)]; 626 } 627 C[j * l + i] = d - distances[distanceindex(j, i)]; 628 if (C[j * l + i] < 0) 629 C[j * l + i] = 0; 630 build_nextj:; 631 } 632 build_nexti:; 633 } 634 d = -1; 635 m = -1; 636 for (i = 0; i < l; i++) { 637 for (j = 0; j < ck; j++) { 638 if (clusters[j].medoid == i) 639 goto build_nexti2; 640 } 641 dsum = 0; 642 for (j = 0; j < l; j++) 643 dsum += C[j * l + i]; 644 if (dsum > d) { 645 d = dsum; 646 m = i; 647 } 648 build_nexti2:; 649 } 650 clusters[ck].medoid = m; 651 } 652 free(C); 653 */ 654 655 /* 656 Seed the medoids randomly 657 */ 658 for (i = n = 0; i < l; i++) { 659 if ((l - i) * (xorshift(&prng) / (double)UINT32_MAX) < k - n) 660 clusters[n++].medoid = i; 661 if (n == k) 662 break; 663 } 664 665 cache = calloc(l, sizeof(*cache)); 666 if (!cache) 667 goto cleanup; 668 669 dS = calloc(k, sizeof(*dS)); 670 if (!dS) 671 goto cleanup; 672 673 dS2 = calloc(k, sizeof(*dS2)); 674 if (!dS2) 675 goto cleanup; 676 677 for (;;) { 678 for (i = 0; i < l; i++) { 679 cache[i].d1 = cache[i].d2 = cache[i].d3 = INFINITY; 680 for (j = 0; j < k; j++) { 681 d = distances[distanceindex(i, clusters[j].medoid)]; 682 if (d < cache[i].d1) { 683 cache[i].d3 = cache[i].d2; 684 cache[i].d2 = cache[i].d1; 685 cache[i].d1 = d; 686 cache[i].n2 = cache[i].n1; 687 cache[i].n1 = j; 688 } else if (d < cache[i].d2) { 689 cache[i].d3 = cache[i].d2; 690 cache[i].d2 = d; 691 cache[i].n2 = j; 692 } else if (d < cache[i].d3) { 693 cache[i].d3 = d; 694 } 695 } 696 } 697 698 for (i = 0; i < k; i++) { 699 dS[i] = 0; 700 for (j = 0; j < l; j++) { 701 if (cache[j].n1 != i) 702 continue; 703 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d2 / cache[j].d3; 704 } 705 for (j = 0; j < l; j++) { 706 if (cache[j].n2 != i) 707 continue; 708 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d1 / cache[j].d3; 709 } 710 } 711 712 cdS = 0; 713 for (i = 0; i < l; i++) { 714 for (j = 0; j < k; j++) { 715 if (clusters[j].medoid == i) 716 goto fastmsc_nexti; 717 } 718 memcpy(dS2, dS, k * sizeof(*dS2)); 719 cdS2 = _e = 0; 720 for (j = 0; j < l; j++) { 721 d = distances[distanceindex(i, j)]; 722 if (d < cache[j].d1) { 723 cdS2 += cache[j].d1 / cache[j].d2 - d / cache[j].d1; 724 dS2[cache[j].n1] += d / cache[j].d1 + cache[j].d2 / cache[j].d3 - (cache[j].d1 + d) / cache[j].d2; 725 dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / cache[j].d2; 726 } else if (d < cache[j].d2) { 727 cdS2 += cache[j].d1 / cache[j].d2 - cache[j].d1 / d; 728 dS2[cache[j].n1] += cache[j].d1 / d + cache[j].d2 / cache[j].d3 - (cache[j].d1 + d) / cache[j].d2; 729 dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / cache[j].d2; 730 } else if (d < cache[j].d3) { 731 dS2[cache[j].n1] += cache[j].d2 / cache[j].d3 - cache[j].d2 / d; 732 dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / d; 733 } 734 } 735 d = -INFINITY; 736 d2 = INFINITY; 737 for (j = 0; j < k; j++) { 738 if (dS2[j] > d) { 739 d = dS2[j]; 740 m = j; 741 } 742 if (dS2[j] < d2) { 743 d2 = dS2[j]; 744 m2 = j; 745 } 746 } 747 dS2[m] += cdS2; 748 if (dS2[m] > cdS) { 749 cdS = dS2[m]; 750 cm = m; 751 cx = i; 752 } 753 fastmsc_nexti:; 754 } 755 /* Should be 0. I guess some floating point problems? */ 756 if (cdS <= 0.00000000000001) 757 break; 758 clusters[cm].medoid = cx; 759 } 760 761 for (i = 0; i < k; i++) 762 clusters[i].n = 0; 763 764 for (i = 0; i < l; i++) { 765 d = INFINITY; 766 for (j = 0; j < k; j++) { 767 if (distances[distanceindex(clusters[j].medoid, i)] < d) { 768 assoc[i] = j; 769 d = distances[distanceindex(clusters[j].medoid, i)]; 770 } 771 } 772 clusters[assoc[i]].n++; 773 } 774 775 for (i = 0; i < l; i++) { 776 if (clusters[assoc[i]].medoid == i) 777 assoc[i] |= 128; 778 } 779 780 result = k; 781 cleanup: 782 free(clusters); 783 return result; 784 } 785 786 /* 787 Seeding of medoids based on the PAM BUILD description in https://www.cs.umb.edu/cs738/pam1.pdf 788 I was too stupid to understand other descriptions. 789 */ 790 int 791 kmedoids(struct gopherentry *g, size_t l, size_t k, double *distances, size_t *assoc, uint32_t prng) 792 { 793 size_t i, j, n, ck; 794 ssize_t m1, m; 795 size_t ti, th; 796 double dsum, d, d2, gnarf, tdsum; 797 struct { 798 size_t medoid; 799 size_t n; 800 double dsum; 801 } *clusters; 802 int flag, result; 803 uint32_t r; 804 double *C; 805 806 if (!l) 807 return 0; 808 809 if (l < k) 810 k = l; 811 812 clusters = calloc(k, sizeof(*clusters)); 813 if (!clusters) 814 return -1; 815 816 result = -1; 817 818 /* seed the medoids using PAM BUILD */ 819 d = INFINITY; 820 m = -1; 821 for (i = 0; i < l; i++) { 822 dsum = 0; 823 for (j = 0; j < l; j++) 824 dsum += distances[distanceindex(i, j)]; 825 if (dsum < d) { 826 m = i; 827 d = dsum; 828 } 829 } 830 if (m == -1) 831 goto cleanup; 832 ck = 0; 833 clusters[ck].medoid = m; 834 ck++; 835 836 C = xmalloc(l * l * sizeof(*C)); 837 838 for (; ck < k; ck++) { 839 memset(C, 0, l * l * sizeof(*C)); 840 for (i = 0; i < l; i++) { 841 for (j = 0; j < ck; j++) { 842 if (clusters[j].medoid == i) 843 goto build_nexti; 844 } 845 for (j = 0; j < l; j++) { 846 if (i == j) 847 continue; 848 for (n = 0; n < ck; n++) { 849 if (clusters[n].medoid == j) 850 goto build_nextj; 851 } 852 d = INFINITY; 853 for (n = 0; n < ck; n++) { 854 if (distances[distanceindex(j, clusters[n].medoid)] < d) 855 d = distances[distanceindex(j, clusters[n].medoid)]; 856 } 857 C[j * l + i] = d - distances[distanceindex(j, i)]; 858 if (C[j * l + i] < 0) 859 C[j * l + i] = 0; 860 build_nextj:; 861 } 862 build_nexti:; 863 } 864 d = -1; 865 m = -1; 866 for (i = 0; i < l; i++) { 867 for (j = 0; j < ck; j++) { 868 if (clusters[j].medoid == i) 869 goto build_nexti2; 870 } 871 dsum = 0; 872 for (j = 0; j < l; j++) 873 dsum += C[j * l + i]; 874 if (dsum > d) { 875 d = dsum; 876 m = i; 877 } 878 build_nexti2:; 879 } 880 clusters[ck].medoid = m; 881 } 882 free(C); 883 884 for (;;) { 885 tdsum = INFINITY; 886 for (i = 0; i < k; i++) { 887 for (j = 0; j < l; j++) { 888 for (n = 0; n < k; n++) { 889 if (j == clusters[n].medoid) 890 goto swap_nextj; 891 } 892 dsum = 0; 893 for (n = 0; n < l; n++) { 894 if (n == j) 895 continue; 896 for (m = 0; m < k; m++) { 897 if (clusters[m].medoid == n) 898 goto swap_nextn; 899 } 900 d = INFINITY; 901 d2 = INFINITY; 902 for (m = 0; m < k; m++) { 903 if (distances[distanceindex(clusters[m].medoid, n)] < d) { 904 d2 = d; 905 d = distances[distanceindex(clusters[m].medoid, n)]; 906 } else if (distances[distanceindex(clusters[m].medoid, n)] < d2) { 907 d2 = distances[distanceindex(clusters[m].medoid, n)]; 908 } 909 } 910 if (distances[distanceindex(clusters[i].medoid, n)] > d) { 911 if (distances[distanceindex(j, n)] >= d) 912 gnarf = 0; 913 else 914 gnarf = distances[distanceindex(j, n)] - d; 915 } else if (distances[distanceindex(clusters[i].medoid, n)] == d) { 916 if (distances[distanceindex(j, n)] < d2) 917 gnarf = distances[distanceindex(j, n)] - d; 918 else 919 gnarf = d2 - d; 920 } else { 921 continue; 922 } 923 dsum += gnarf; 924 swap_nextn:; 925 } 926 if (dsum < tdsum) { 927 ti = i; 928 th = j; 929 tdsum = dsum; 930 } 931 swap_nextj:; 932 } 933 } 934 if (tdsum >= 0) 935 break; 936 clusters[ti].medoid = th; 937 } 938 939 for (i = 0; i < k; i++) 940 clusters[i].n = 0; 941 942 for (i = 0; i < l; i++) { 943 d = INFINITY; 944 for (j = 0; j < k; j++) { 945 if (distances[distanceindex(clusters[j].medoid, i)] < d) { 946 assoc[i] = j; 947 d = distances[distanceindex(clusters[j].medoid, i)]; 948 } 949 } 950 clusters[assoc[i]].n++; 951 } 952 953 for (i = 0; i < l; i++) { 954 if (clusters[assoc[i]].medoid == i) 955 assoc[i] |= 128; 956 } 957 958 result = k; 959 cleanup: 960 free(clusters); 961 return result; 962 } 963 964 size_t 965 diff(size_t a, size_t b) 966 { 967 if (a < b) 968 return b - a; 969 return a - b; 970 } 971 972 /* 973 https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows 974 Which references https://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm-2 975 The code there is using the CPOL: https://www.codeproject.com/info/cpol10.aspx 976 */ 977 size_t 978 levenshteindistance(char *a, char *b) 979 { 980 size_t i, j, dc, ic, sc, r; 981 size_t *v0, *v1, *t; 982 983 v0 = xmalloc((strlen(b) + 1) * sizeof(*v0)); 984 v1 = xmalloc((strlen(b) + 1) * sizeof(*v1)); 985 986 for (i = 0; i <= strlen(b); i++) 987 v0[i] = i; 988 989 for (i = 0; i < strlen(a); i++) { 990 v1[0] = i + 1; 991 for (j = 0; j < strlen(b); j++) { 992 dc = v0[j + 1] + 1; 993 ic = v1[j] + 1; 994 if (a[i] == b[j]) 995 sc = v0[j]; 996 else 997 sc = v0[j] + 1; 998 if (dc < ic && dc < sc) 999 v1[j + 1] = dc; 1000 else if (ic < dc && ic < sc) 1001 v1[j + 1] = ic; 1002 else 1003 v1[j + 1] = sc; 1004 } 1005 t = v0; 1006 v0 = v1; 1007 v1 = t; 1008 } 1009 1010 r = v0[strlen(b)]; 1011 1012 free(v1); 1013 free(v0); 1014 1015 return r; 1016 } 1017 1018 /* 1019 Remove common prefix and suffix. The pazz0distance is the sum of the length of the leftover strings. 1020 */ 1021 size_t 1022 pazz0distance(char *a, char *b) 1023 { 1024 size_t pl, sl; 1025 size_t al, bl; 1026 1027 al = strlen(a); 1028 bl = strlen(b); 1029 1030 for (pl = 0; pl < al && pl < bl && a[pl] == b[pl]; pl++); 1031 for (sl = 0; al - sl > pl && bl - sl > pl && a[al - 1 - sl] == b[bl - 1 - sl]; sl++); 1032 1033 return al + bl - 2 * (pl + sl); 1034 } 1035 1036 /* 1037 https://en.wikipedia.org/wiki/Levenshtein_distance#Recursive 1038 */ 1039 size_t 1040 levenshteindistance_slow(char *a, char *b) 1041 { 1042 size_t k, l, m; 1043 1044 if (!*a) 1045 return strlen(b); 1046 else if (!*b) 1047 return strlen(a); 1048 else if (*a == *b) 1049 return levenshteindistance_slow(a + 1, b + 1); 1050 1051 k = levenshteindistance_slow(a + 1, b); 1052 l = levenshteindistance_slow(a, b + 1); 1053 m = levenshteindistance_slow(a + 1, b + 1); 1054 1055 if (k < l && k < m) 1056 return 1 + k; 1057 else if (l < k && l < m) 1058 return 1 + l; 1059 else 1060 return 1 + m; 1061 } 1062 1063 ssize_t 1064 clustering(char *host, char *port, char *selector, struct gopherentry *g, size_t l, size_t *assocs, size_t k) 1065 { 1066 size_t i, j; 1067 ssize_t r; 1068 double *distances, d; 1069 double *mdists; 1070 struct { 1071 double selector; 1072 double segment; 1073 double index; 1074 } *distanceparts, maxdistanceparts; 1075 uint32_t prng; 1076 struct point *points; 1077 1078 distanceparts = xmalloc(triangularnumber(l) * sizeof(*distanceparts)); 1079 1080 /* So the sammon mapping can be happy. */ 1081 prng = fnv1a(4, host, port, selector, "distancewiggle"); 1082 memset(&maxdistanceparts, 0, sizeof(maxdistanceparts)); 1083 for (i = 0; i < l; i++) { 1084 memset(&distanceparts[distanceindex(i, i)], 0, sizeof(*distanceparts)); 1085 for (j = 0; j < i; j++) { 1086 distanceparts[distanceindex(i, j)].selector = levenshteindistance(g[i].fields[1], g[j].fields[1]) + levenshteindistance(g[i].fields[2], g[j].fields[2]) + levenshteindistance(g[i].fields[3], g[j].fields[3]) + ((double)xorshift(&prng) / UINT32_MAX) * 0.000000000000000000000000000000000000001; 1087 distanceparts[distanceindex(i, j)].segment = diff(g[i].s, g[j].s); 1088 distanceparts[distanceindex(i, j)].index = diff(g[i].i, g[j].i); 1089 1090 if (distanceparts[distanceindex(i, j)].selector > maxdistanceparts.selector) 1091 maxdistanceparts.selector = distanceparts[distanceindex(i, j)].selector; 1092 if (distanceparts[distanceindex(i, j)].segment > maxdistanceparts.segment) 1093 maxdistanceparts.segment = distanceparts[distanceindex(i, j)].segment; 1094 if (distanceparts[distanceindex(i, j)].index > maxdistanceparts.index) 1095 maxdistanceparts.index = distanceparts[distanceindex(i, j)].index; 1096 } 1097 } 1098 for (i = 0; i < l; i++) { 1099 for (j = 0; j < i; j++) { 1100 distanceparts[distanceindex(i, j)].selector /= maxdistanceparts.selector; 1101 distanceparts[distanceindex(i, j)].segment /= maxdistanceparts.segment; 1102 distanceparts[distanceindex(i, j)].index /= maxdistanceparts.index; 1103 } 1104 } 1105 1106 distances = xmalloc(triangularnumber(l) * sizeof(*distances)); 1107 for (i = 0; i < l; i++) { 1108 distances[distanceindex(i, i)] = 0; 1109 for (j = 0; j < i; j++) 1110 distances[distanceindex(i, j)] = 0.75 * distanceparts[distanceindex(i, j)].selector + 0.125 * distanceparts[distanceindex(i, j)].segment + 0.125 * distanceparts[distanceindex(i, j)].index; //distanceparts[distanceindex(i, j)].selector; 1111 } 1112 free(distanceparts); 1113 1114 r = kmedoids(g, l, k, distances, assocs, fnv1a(4, host, port, selector, "seed")); 1115 1116 points = calloc(l, sizeof(*points)); 1117 sammon(distances, l, fnv1a(4, host, port, selector, "sammonseed"), points); 1118 1119 for (i = 0; i < l; i++) 1120 printf("%lf\t%lf\t%ld\t%lu\t%s\n", points[i].dims[0], points[i].dims[1], assocs[i] & 127, assocs[i] >> 7, g[i].fields[1]); 1121 1122 free(points); 1123 free(distances); 1124 1125 return r; 1126 } 1127 1128 char gopherbuffer[16 * 1024 * 1024]; 1129 1130 int 1131 main(int argc, char *argv[]) 1132 { 1133 ssize_t l; 1134 size_t k; 1135 struct gopherentry *g; 1136 char *host, *port, *selector; 1137 size_t *cassocs; 1138 1139 if (argc < 2) { 1140 printf("%s host [port [selector]]\n", argv[0]); 1141 return 1; 1142 } 1143 1144 host = argv[1]; 1145 port = (argc > 2) ? argv[2] : "70"; 1146 selector = (argc > 3) ? argv[3] : ""; 1147 1148 l = gopher_request(host, port, selector, gopherbuffer, sizeof(gopherbuffer)); 1149 if (l == -1) 1150 return 1; 1151 1152 l = gopher_parsemenu(gopherbuffer, &g); 1153 if (l == -1) 1154 return 1; 1155 1156 l = gopher_removeentries(g, l); 1157 1158 cassocs = calloc(l, sizeof(*cassocs)); 1159 if (!cassocs) 1160 return 1; 1161 k = 5; 1162 clustering(host, port, selector, g, l, cassocs, k); 1163 1164 return 0; 1165 }