URI: 
       gopher-clustering2.c - brcon2024-hackathons - Bitreichcon 2024 Hackathons
  HTML git clone git://bitreich.org/brcon2024-hackathons git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/brcon2024-hackathons
   DIR Log
   DIR Files
   DIR Refs
   DIR Tags
   DIR Submodules
       ---
       gopher-clustering2.c (24998B)
       ---
            1 #define _POSIX_C_SOURCE 200112L
            2 
            3 #include <assert.h>
            4 #include <stdarg.h>
            5 #include <stdint.h>
            6 #include <stdio.h>
            7 #include <stdlib.h>
            8 #include <string.h>
            9 
           10 #include <math.h>
           11 
           12 #include <netdb.h>
           13 #include <sys/socket.h>
           14 #include <sys/types.h>
           15 #include <unistd.h>
           16 
           17 uint32_t
           18 fnv1a(int n,...)
           19 {
           20         int i;
           21         char *s;
           22         va_list l;
           23         uint32_t h;
           24 
           25         h = 0x811c9dc5;
           26 
           27         va_start(l, n); 
           28         for (i = 0; i < n; i++) {
           29                 for (s = va_arg(l, char*); *s; s++) {
           30                         h ^= *s;
           31                         h *= 0x01000193;
           32                 }
           33         }
           34         va_end(l);
           35 
           36         return h;
           37 }
           38 
           39 uint32_t
           40 xorshift(uint32_t *s)
           41 {
           42         *s ^= *s << 13;
           43         *s ^= *s >> 17;
           44         *s ^= *s << 5;
           45         return *s;
           46 }
           47 
           48 /*
           49         https://github.com/skeeto/hash-prospector/issues/19#issuecomment-1120105785
           50 */
           51 uint32_t
           52 mix(uint32_t x)
           53 {
           54         x ^= x >> 16;
           55         x *= 0x21f0aaad;
           56         x ^= x >> 15;
           57         x *= 0x735a2d97;
           58         x ^= x >> 16;
           59         return x;
           60 }
           61 
           62 struct gopherentry {
           63         struct gopherentry *next;
           64         char type;
           65         char *fields[4];
           66         size_t i, s;
           67 };
           68 
           69 char *
           70 gopher_fieldend(char *s)
           71 {
           72         for (; *s; s++) {
           73                 if (*s == '\t')
           74                         break;
           75                 if (*s == '\r' && *(s+1) == '\n')
           76                         break;
           77                 if (*s == '\n')
           78                         break;
           79         }
           80         return s;
           81 }
           82 
           83 char *
           84 gopher_nextentry(char *s)
           85 {
           86         char *c;
           87 
           88         if (c = strchr(s, '\n'))
           89                 return c + 1;
           90         return NULL;
           91 }
           92 
           93 void *
           94 xmalloc(size_t s)
           95 {
           96         void *p;
           97 
           98         if (!(p = malloc(s))) {
           99                 perror(NULL);
          100                 exit(1);
          101         }
          102         return p;
          103 }
          104 
          105 void *
          106 xrealloc(void *p, size_t s)
          107 {
          108         if (!(p = realloc(p, s))) {
          109                 perror(NULL);
          110                 exit(1);
          111         }
          112         return p;
          113 }
          114 
          115 /* Allow a lot of ugly things... */
          116 size_t
          117 gopher_parsemenu(char *d, struct gopherentry **g)
          118 {
          119         ssize_t gl;
          120         size_t fl, i, s;
          121         char *c, *p, pt;
          122         struct gopherentry *cg;
          123 
          124         gl = 0;
          125         cg = NULL;
          126 
          127         s = 0;
          128         pt = 0;
          129         for (c = d; c && *c; c = gopher_nextentry(c)) {
          130                 if (*c == '.')
          131                         break;
          132                 if (*c == '\n')
          133                         continue;
          134 
          135                 gl++;
          136                 cg = xrealloc(cg, gl * sizeof(struct gopherentry));
          137                 memset(&cg[gl-1], 0, sizeof(struct gopherentry));
          138 
          139                 if (*c != 'i' && pt == 'i')
          140                         s++;
          141                 pt = *c;
          142 
          143                 cg[gl-1].type = *c;
          144                 cg[gl-1].i = gl-1;
          145                 cg[gl-1].s = s;
          146                 c++;
          147 
          148                 for (i = 0; i < 4; i++) {
          149                         p = gopher_fieldend(c);
          150                         fl = p - c;
          151                         cg[gl-1].fields[i] = xmalloc(fl + 1);
          152                         memcpy(cg[gl-1].fields[i], c, fl);
          153                         cg[gl-1].fields[i][fl] = '\0';
          154                         if (*p != '\t')
          155                                 break;
          156                         c = p + 1;
          157                 }
          158         }
          159         s++;
          160         *g = cg;
          161         return gl;
          162 }
          163 
          164 ssize_t
          165 gopher_removeentries(struct gopherentry *g, size_t l)
          166 {
          167         size_t i, j;
          168 
          169         for (i = 0; i < l;) {
          170                 if (g[i].type == 'i' || g[i].type == '3' || !g[i].fields[2] || !g[i].fields[3]) {
          171                         l--;
          172                         memmove(&g[i], &g[i+1], (l - i) * sizeof(*g));
          173                         continue;
          174                 }
          175                 for (j = 0; j < i; j++) {
          176                         if (!strcmp(g[i].fields[1], g[j].fields[1]) &&
          177                             !strcmp(g[i].fields[2], g[j].fields[2]) &&
          178                             !strcmp(g[i].fields[3], g[j].fields[3]))
          179                                 break;
          180                 }
          181                 if (j < i) {
          182                         l--;
          183                         memmove(&g[i], &g[i+1], (l - i) * sizeof(*g));
          184                         continue;
          185                 }
          186                 i++;
          187         }
          188 
          189         return l;
          190 }
          191 
          192 ssize_t
          193 gopher_request(char *host, char *port, char *selector, char *buffer, size_t l)
          194 {
          195         int fd;
          196         struct addrinfo hints;
          197         struct addrinfo *r, *rp;
          198         ssize_t n, result, rn;
          199 
          200         memset(&hints, 0, sizeof(hints));
          201         hints.ai_family = AF_UNSPEC;
          202         hints.ai_socktype = SOCK_STREAM;
          203 
          204         if (getaddrinfo(host, port, &hints, &r) != 0)
          205                 return -1;
          206 
          207         for (rp = r; rp; rp = rp->ai_next) {
          208                 if ((fd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol)) == -1)
          209                         continue;
          210                 if (connect(fd, rp->ai_addr, rp->ai_addrlen) != -1)
          211                         break;
          212                 close(fd);
          213         }
          214         freeaddrinfo(r);
          215         if (!rp)
          216                 return -1;
          217 
          218         result = -1;
          219         if (write(fd, selector, strlen(selector)) != strlen(selector))
          220                 goto cleanup;
          221 
          222         if (write(fd, "\r\n", 2) != 2)
          223                 goto cleanup;
          224 
          225         n = 0;
          226         while (1) {
          227                 if (n + 512 > l - 1)
          228                         goto cleanup;
          229                 if ((rn = read(fd, &buffer[n], 512)) == -1)
          230                         goto cleanup;
          231                 else if (!rn)
          232                         break;
          233                 n += rn;
          234         }
          235 
          236         buffer[n] = '\0';
          237         result = n;
          238 
          239 cleanup:
          240         close(fd);
          241 
          242         return result;
          243 }
          244 
          245 size_t
          246 triangularnumber(size_t n)
          247 {
          248         return (n * n + n) / 2;
          249 }
          250 
          251 size_t
          252 distanceindex(size_t i, size_t j)
          253 {
          254         size_t t;
          255 
          256         if (i < j) {
          257                 t = i;
          258                 i = j;
          259                 j = t;
          260         }
          261 
          262         return triangularnumber(i) + j;
          263 }
          264 
          265 struct point {
          266         double dims[2];
          267 };
          268 
          269 #define SAMMON_MAPPING_MAX_ITER (1 << 13)
          270 #define SAMMON_MAPPING_LR (0.01)
          271 void
          272 sammon(double *distances, size_t l, uint32_t prng, struct point *points)
          273 {
          274         size_t i, j, k, t, tmax;
          275         double *gnarf;
          276         double c, lr;
          277         double e1, e2, sum, lasterror;
          278         struct point *newpoints;
          279 
          280         gnarf = calloc(triangularnumber(l), sizeof(*gnarf));
          281         newpoints = calloc(l, sizeof(*newpoints));
          282 
          283         for (i = 0; i < l; i++) {
          284                 points[i].dims[0] = (double)xorshift(&prng) / UINT32_MAX - 0.5;
          285                 points[i].dims[1] = (double)xorshift(&prng) / UINT32_MAX - 0.5;
          286         }
          287 
          288         c = 0;
          289         for (i = 0; i < l; i++) {
          290                 gnarf[distanceindex(i, i)] = 0;
          291                 for (j = 0; j < i; j++) {
          292                         gnarf[distanceindex(i, j)] = sqrt(pow(points[i].dims[0] - points[j].dims[0], 2) + pow(points[i].dims[1] - points[j].dims[1], 2));
          293                         c += distances[distanceindex(i, j)];
          294                 }
          295         }
          296 
          297         lasterror = 0;
          298         for (t = 0, tmax = SAMMON_MAPPING_MAX_ITER; t < tmax; t++) {
          299                 lr = SAMMON_MAPPING_LR;
          300                 for (i = 0; i < l; i++) {
          301                         for (j = 0; j < 2; j++) {
          302                                 e1 = -2. / c;
          303                                 sum = 0;
          304                                 for (k = 0; k < l; k++) {
          305                                         if (i == k)
          306                                                 continue;
          307                                         sum += (distances[distanceindex(i, k)] - gnarf[distanceindex(i, k)]) / (distances[distanceindex(i, k)] * gnarf[distanceindex(i, k)]) * (points[i].dims[j] - points[k].dims[j]);
          308                                 }
          309                                 e1 *= sum;
          310 
          311                                 e2 = -2. / c;
          312                                 sum = 0;
          313                                 for (k = 0; k < l; k++) {
          314                                         if (i == k)
          315                                                 continue;
          316                                         sum += 1. / (distances[distanceindex(i, k)] * gnarf[distanceindex(i, k)]) * ((distances[distanceindex(i, k)] - gnarf[distanceindex(i, k)]) - pow(points[i].dims[j] - points[k].dims[j], 2) / gnarf[distanceindex(i, k)] * (1 + (distances[distanceindex(i, k)] - gnarf[distanceindex(i, k)]) / gnarf[distanceindex(i, k)]));
          317                                 }
          318                                 e2 *= sum;
          319 
          320                                 newpoints[i].dims[j] = points[i].dims[j] - lr * (e1 / fabs(e2));
          321                         }
          322                 }
          323 
          324                 sum = 0;
          325                 for (i = 0; i < l; i++) {
          326                         for (j = 0; j < i; j++) {
          327                                 gnarf[distanceindex(i, j)] = sqrt(pow(newpoints[i].dims[0] - newpoints[j].dims[0], 2) + pow(newpoints[i].dims[1] - newpoints[j].dims[1], 2));
          328                                 sum += pow(distances[distanceindex(i, j)] - gnarf[distanceindex(i, j)], 2) / distances[distanceindex(i, j)];
          329                         }
          330                 }
          331 
          332                 fprintf(stderr, "sammon mapping error: %lf\n", sum / c);
          333                 if (t > 0 && fabs(sum / c - lasterror) < 0.000000000001)
          334                         break;
          335                 lasterror = sum / c;
          336 
          337                 memcpy(points, newpoints, l * sizeof(*points));
          338         }
          339 
          340         free(newpoints);
          341         free(gnarf);
          342 }
          343 
          344 int
          345 fastermsc(struct gopherentry *g, size_t l, size_t k, double *distances, size_t *assoc, uint32_t prng)
          346 {
          347         size_t i, j, n, ck, grimmel;
          348         ssize_t m1, m, m2;
          349         size_t ti, th, huh;
          350         double dsum, d, d2, gnarf, tdsum;
          351         struct {
          352                 size_t medoid;
          353                 size_t n;
          354                 double dsum;
          355         } *clusters;
          356         struct {
          357                 size_t n1, n2, n3;
          358                 double d1, d2, d3;
          359         } *cache;
          360         double *dS, *dS2;
          361         double cdS, cdS2;
          362         size_t cm, cx;
          363         ssize_t lx;
          364         int flag, result;
          365         uint32_t r;
          366         double *C;
          367 
          368         if (!l)
          369                 return 0;
          370 
          371         if (l < k)
          372                 k = l;
          373 
          374         clusters = calloc(k, sizeof(*clusters));
          375         if (!clusters)
          376                 return -1;
          377 
          378         result = -1;
          379 
          380 /*
          381         Seed the medoids randomly
          382 */
          383         for (i = n = 0; i < l; i++) {
          384                 if ((l - i) * (xorshift(&prng) / (double)UINT32_MAX) < k - n)
          385                         clusters[n++].medoid = i;
          386                 if (n == k)
          387                         break;
          388         }
          389 
          390         cache = calloc(l, sizeof(*cache));
          391         if (!cache)
          392                 goto cleanup;
          393 
          394         dS = calloc(k, sizeof(*dS));
          395         if (!dS)
          396                 goto cleanup;
          397 
          398         dS2 = calloc(k, sizeof(*dS2));
          399         if (!dS2)
          400                 goto cleanup;
          401 
          402         for (i = 0; i < l; i++) {
          403                 cache[i].d1 = cache[i].d2 = cache[i].d3 = INFINITY;
          404                 for (j = 0; j < k; j++) {
          405                         d = distances[distanceindex(i, clusters[j].medoid)];
          406                         if (d < cache[i].d1) {
          407                                 cache[i].d3 = cache[i].d2;
          408                                 cache[i].d2 = cache[i].d1;
          409                                 cache[i].d1 = d;
          410                                 cache[i].n3 = cache[i].n2;
          411                                 cache[i].n2 = cache[i].n1;
          412                                 cache[i].n1 = j;
          413                         } else if (d < cache[i].d2) {
          414                                 cache[i].d3 = cache[i].d2;
          415                                 cache[i].d2 = d;
          416                                 cache[i].n3 = cache[i].n2;
          417                                 cache[i].n2 = j;
          418                         } else if (d < cache[i].d3) {
          419                                 cache[i].d3 = d;
          420                                 cache[i].n3 = j;
          421                         }
          422                 }
          423         }
          424 
          425         for (i = 0; i < k; i++) {
          426                 dS[i] = 0;
          427                 for (j = 0; j < l; j++) {
          428                         if (cache[j].n1 != i)
          429                                 continue;
          430                         dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d2 / cache[j].d3;
          431                 }
          432                 for (j = 0; j < l; j++) {
          433                         if (cache[j].n2 != i)
          434                                 continue;
          435                         dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d1 / cache[j].d3;
          436                 }
          437         }
          438 
          439         lx = -1;
          440         for (grimmel = 0;grimmel < 32;grimmel++) {
          441                 cdS = 0;
          442                 for (i = 0; i < l; i++) {
          443                         if (i == lx)
          444                                 goto fastermsc_finished;
          445 
          446                         for (j = 0; j < k; j++) {
          447                                 if (clusters[j].medoid == i)
          448                                         goto fastmsc_nexti;
          449                         }
          450 
          451                         memcpy(dS2, dS, k * sizeof(*dS2));
          452                         cdS2 = 0;
          453                         for (j = 0; j < l; j++) {
          454                                 d = distances[distanceindex(i, j)];
          455                                 if (d < cache[j].d1) {
          456                                         cdS2 += cache[j].d1 / cache[j].d2 - d / cache[j].d1;
          457                                         dS2[cache[j].n1] += d / cache[j].d1 + cache[j].d2 / cache[j].d3 - (cache[j].d1 + d) / cache[j].d2; 
          458                                         dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / cache[j].d2; 
          459                                 } else if (d < cache[j].d2) {
          460                                         cdS2 += cache[j].d1 / cache[j].d2 - cache[j].d1 / d;
          461                                         dS2[cache[j].n1] += cache[j].d1 / d + cache[j].d2 / cache[j].d3 - (cache[j].d1 + d) / cache[j].d2; 
          462                                         dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / cache[j].d2; 
          463                                 } else if (d < cache[j].d3) {
          464                                         dS2[cache[j].n1] += cache[j].d2 / cache[j].d3 - cache[j].d2 / d; 
          465                                         dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / d; 
          466                                 }
          467                         }
          468                         d = -INFINITY;
          469                         for (j = 0; j < k; j++) {
          470                                 if (dS2[j] > d) {
          471                                         d = dS2[j];
          472                                         m = j;
          473                                 }
          474                         }
          475                         dS2[m] += cdS2;
          476                         if (dS2[m] <= 0)
          477                                 continue;
          478                         lx = i;
          479                         clusters[m].medoid = i;
          480                         for (j = 0; j < l; j++) {
          481                                 if (cache[j].n1 != m && cache[j].n2 != m && cache[j].n3 != m)
          482                                         continue;
          483                                 cache[j].d1 = cache[j].d2 = cache[j].d3 = INFINITY;
          484                                 for (n = 0; n < k; n++) {
          485                                         d = distances[distanceindex(j, clusters[n].medoid)];
          486                                         if (d < cache[j].d1) {
          487                                                 cache[j].d3 = cache[j].d2;
          488                                                 cache[j].d2 = cache[j].d1;
          489                                                 cache[j].d1 = d;
          490                                                 cache[j].n3 = cache[j].n2;
          491                                                 cache[j].n2 = cache[j].n1;
          492                                                 cache[j].n1 = n;
          493                                         } else if (d < cache[j].d2) {
          494                                                 cache[j].d3 = cache[j].d2;
          495                                                 cache[j].d2 = d;
          496                                                 cache[j].n3 = cache[j].n2;
          497                                                 cache[j].n2 = n;
          498                                         } else if (d < cache[j].d3) {
          499                                                 cache[j].d3 = d;
          500                                                 cache[j].n3 = n;
          501                                         }
          502                                 }
          503                         }
          504                         for (j = 0; j < k; j++) {
          505                                 dS[j] = 0;
          506                                 for (n = 0; n < l; n++) {
          507                                         if (cache[n].n1 != j)
          508                                                 continue;
          509                                         dS[j] += cache[n].d1 / cache[n].d2 - cache[n].d2 / cache[n].d3;
          510                                 }
          511                                 for (n = 0; n < l; n++) {
          512                                         if (cache[n].n2 != j)
          513                                                 continue;
          514                                         dS[j] += cache[n].d1 / cache[n].d2 - cache[n].d1 / cache[n].d3;
          515                                 }
          516                         }
          517 fastmsc_nexti:;
          518                 }
          519 /*
          520                 if (lx == -1)
          521                         break;
          522 */
          523         }
          524 fastermsc_finished:;
          525 
          526         for (i = 0; i < k; i++)
          527                 clusters[i].n = 0;
          528 
          529         for (i = 0; i < l; i++) {
          530                 d = INFINITY;
          531                 for (j = 0; j < k; j++) {
          532                         if (distances[distanceindex(clusters[j].medoid, i)] < d) {
          533                                 assoc[i] = j;
          534                                 d = distances[distanceindex(clusters[j].medoid, i)];
          535                         }
          536                 }
          537                 clusters[assoc[i]].n++;
          538         }
          539 
          540         for (i = 0; i < l; i++) {
          541                 if (clusters[assoc[i]].medoid == i)
          542                         assoc[i] |= 128;
          543         }
          544 
          545         result = k;
          546 cleanup:
          547         free(clusters);
          548         return result;
          549 }
          550 
          551 int
          552 fastmsc(struct gopherentry *g, size_t l, size_t k, double *distances, size_t *assoc, uint32_t prng)
          553 {
          554         size_t i, j, n, ck;
          555         ssize_t m1, m, m2;
          556         size_t ti, th;
          557         double dsum, d, d2, gnarf, tdsum;
          558         struct {
          559                 size_t medoid;
          560                 size_t n;
          561                 double dsum;
          562         } *clusters;
          563         struct {
          564                 size_t n1, n2;
          565                 double d1, d2, d3;
          566         } *cache;
          567         double *dS, *dS2;
          568         double cdS, cdS2;
          569         double _e, _y, _z, _s;
          570         size_t cm, cx;
          571         int flag, result;
          572         uint32_t r;
          573         double *C;
          574 
          575         if (!l)
          576                 return 0;
          577 
          578         if (l < k)
          579                 k = l;
          580 
          581         clusters = calloc(k, sizeof(*clusters));
          582         if (!clusters)
          583                 return -1;
          584 
          585         result = -1;
          586 
          587 /*
          588         seed the medoids using PAM BUILD
          589         d = INFINITY;
          590         m = -1;
          591         for (i = 0; i < l; i++) {
          592                 dsum = 0;
          593                 for (j = 0; j < l; j++)
          594                         dsum += distances[distanceindex(i, j)];
          595                 if (dsum < d) {
          596                         m = i;
          597                         d = dsum;
          598                 }
          599         }
          600         if (m == -1)
          601                 goto cleanup;
          602         ck = 0;
          603         clusters[ck].medoid = m;
          604         ck++;
          605 
          606         C = xmalloc(l * l * sizeof(*C));
          607 
          608         for (; ck < k; ck++) {
          609                 memset(C, 0, l * l * sizeof(*C));
          610                 for (i = 0; i < l; i++) {
          611                         for (j = 0; j < ck; j++) {
          612                                 if (clusters[j].medoid == i)
          613                                         goto build_nexti;
          614                         }
          615                         for (j = 0; j < l; j++) {
          616                                 if (i == j)
          617                                         continue;
          618                                 for (n = 0; n < ck; n++) {
          619                                         if (clusters[n].medoid == j)
          620                                                 goto build_nextj;
          621                                 }
          622                                 d = INFINITY;
          623                                 for (n = 0; n < ck; n++) {
          624                                         if (distances[distanceindex(j, clusters[n].medoid)] < d)
          625                                                 d = distances[distanceindex(j, clusters[n].medoid)];
          626                                 }
          627                                 C[j * l + i] = d - distances[distanceindex(j, i)];
          628                                 if (C[j * l + i] < 0)
          629                                         C[j * l + i] = 0;
          630 build_nextj:;
          631                         }
          632 build_nexti:;
          633                 }
          634                 d = -1;
          635                 m = -1;
          636                 for (i = 0; i < l; i++) {
          637                         for (j = 0; j < ck; j++) {
          638                                 if (clusters[j].medoid == i)
          639                                         goto build_nexti2;
          640                         }
          641                         dsum = 0;
          642                         for (j = 0; j < l; j++)
          643                                 dsum += C[j * l + i];
          644                         if (dsum > d) {
          645                                 d = dsum;
          646                                 m = i;
          647                         }
          648 build_nexti2:;
          649                 }
          650                 clusters[ck].medoid = m;
          651         }
          652         free(C);
          653 */
          654 
          655 /*
          656         Seed the medoids randomly
          657 */
          658         for (i = n = 0; i < l; i++) {
          659                 if ((l - i) * (xorshift(&prng) / (double)UINT32_MAX) < k - n)
          660                         clusters[n++].medoid = i;
          661                 if (n == k)
          662                         break;
          663         }
          664 
          665         cache = calloc(l, sizeof(*cache));
          666         if (!cache)
          667                 goto cleanup;
          668 
          669         dS = calloc(k, sizeof(*dS));
          670         if (!dS)
          671                 goto cleanup;
          672 
          673         dS2 = calloc(k, sizeof(*dS2));
          674         if (!dS2)
          675                 goto cleanup;
          676 
          677         for (;;) {
          678                 for (i = 0; i < l; i++) {
          679                         cache[i].d1 = cache[i].d2 = cache[i].d3 = INFINITY;
          680                         for (j = 0; j < k; j++) {
          681                                 d = distances[distanceindex(i, clusters[j].medoid)];
          682                                 if (d < cache[i].d1) {
          683                                         cache[i].d3 = cache[i].d2;
          684                                         cache[i].d2 = cache[i].d1;
          685                                         cache[i].d1 = d;
          686                                         cache[i].n2 = cache[i].n1;
          687                                         cache[i].n1 = j;
          688                                 } else if (d < cache[i].d2) {
          689                                         cache[i].d3 = cache[i].d2;
          690                                         cache[i].d2 = d;
          691                                         cache[i].n2 = j;
          692                                 } else if (d < cache[i].d3) {
          693                                         cache[i].d3 = d;
          694                                 }
          695                         }
          696                 }
          697 
          698                 for (i = 0; i < k; i++) {
          699                         dS[i] = 0;
          700                         for (j = 0; j < l; j++) {
          701                                 if (cache[j].n1 != i)
          702                                         continue;
          703                                 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d2 / cache[j].d3;
          704                         }
          705                         for (j = 0; j < l; j++) {
          706                                 if (cache[j].n2 != i)
          707                                         continue;
          708                                 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d1 / cache[j].d3;
          709                         }
          710                 }
          711 
          712                 cdS = 0;
          713                 for (i = 0; i < l; i++) {
          714                         for (j = 0; j < k; j++) {
          715                                 if (clusters[j].medoid == i)
          716                                         goto fastmsc_nexti;
          717                         }
          718                         memcpy(dS2, dS, k * sizeof(*dS2));
          719                         cdS2 = _e = 0;
          720                         for (j = 0; j < l; j++) {
          721                                 d = distances[distanceindex(i, j)];
          722                                 if (d < cache[j].d1) {
          723                                         cdS2 += cache[j].d1 / cache[j].d2 - d / cache[j].d1;
          724                                         dS2[cache[j].n1] += d / cache[j].d1 + cache[j].d2 / cache[j].d3 - (cache[j].d1 + d) / cache[j].d2; 
          725                                         dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / cache[j].d2; 
          726                                 } else if (d < cache[j].d2) {
          727                                         cdS2 += cache[j].d1 / cache[j].d2 - cache[j].d1 / d;
          728                                         dS2[cache[j].n1] += cache[j].d1 / d + cache[j].d2 / cache[j].d3 - (cache[j].d1 + d) / cache[j].d2; 
          729                                         dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / cache[j].d2; 
          730                                 } else if (d < cache[j].d3) {
          731                                         dS2[cache[j].n1] += cache[j].d2 / cache[j].d3 - cache[j].d2 / d; 
          732                                         dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / d; 
          733                                 }
          734                         }
          735                         d = -INFINITY;
          736                         d2 = INFINITY;
          737                         for (j = 0; j < k; j++) {
          738                                 if (dS2[j] > d) {
          739                                         d = dS2[j];
          740                                         m = j;
          741                                 }
          742                                 if (dS2[j] < d2) {
          743                                         d2 = dS2[j];
          744                                         m2 = j;
          745                                 }
          746                         }
          747                         dS2[m] += cdS2;
          748                         if (dS2[m] > cdS) {
          749                                 cdS = dS2[m];
          750                                 cm = m;
          751                                 cx = i;
          752                         }
          753 fastmsc_nexti:;
          754                 }
          755                 /* Should be 0. I guess some floating point problems? */
          756                 if (cdS <= 0.00000000000001)
          757                         break;
          758                 clusters[cm].medoid = cx;
          759         }
          760 
          761         for (i = 0; i < k; i++)
          762                 clusters[i].n = 0;
          763 
          764         for (i = 0; i < l; i++) {
          765                 d = INFINITY;
          766                 for (j = 0; j < k; j++) {
          767                         if (distances[distanceindex(clusters[j].medoid, i)] < d) {
          768                                 assoc[i] = j;
          769                                 d = distances[distanceindex(clusters[j].medoid, i)];
          770                         }
          771                 }
          772                 clusters[assoc[i]].n++;
          773         }
          774 
          775         for (i = 0; i < l; i++) {
          776                 if (clusters[assoc[i]].medoid == i)
          777                         assoc[i] |= 128;
          778         }
          779 
          780         result = k;
          781 cleanup:
          782         free(clusters);
          783         return result;
          784 }
          785 
          786 /*
          787         Seeding of medoids based on the PAM BUILD description in https://www.cs.umb.edu/cs738/pam1.pdf
          788         I was too stupid to understand other descriptions.
          789 */
          790 int
          791 kmedoids(struct gopherentry *g, size_t l, size_t k, double *distances, size_t *assoc, uint32_t prng)
          792 {
          793         size_t i, j, n, ck;
          794         ssize_t m1, m;
          795         size_t ti, th;
          796         double dsum, d, d2, gnarf, tdsum;
          797         struct {
          798                 size_t medoid;
          799                 size_t n;
          800                 double dsum;
          801         } *clusters;
          802         int flag, result;
          803         uint32_t r;
          804         double *C;
          805 
          806         if (!l)
          807                 return 0;
          808 
          809         if (l < k)
          810                 k = l;
          811 
          812         clusters = calloc(k, sizeof(*clusters));
          813         if (!clusters)
          814                 return -1;
          815 
          816         result = -1;
          817 
          818         /* seed the medoids using PAM BUILD */
          819         d = INFINITY;
          820         m = -1;
          821         for (i = 0; i < l; i++) {
          822                 dsum = 0;
          823                 for (j = 0; j < l; j++)
          824                         dsum += distances[distanceindex(i, j)];
          825                 if (dsum < d) {
          826                         m = i;
          827                         d = dsum;
          828                 }
          829         }
          830         if (m == -1)
          831                 goto cleanup;
          832         ck = 0;
          833         clusters[ck].medoid = m;
          834         ck++;
          835 
          836         C = xmalloc(l * l * sizeof(*C));
          837 
          838         for (; ck < k; ck++) {
          839                 memset(C, 0, l * l * sizeof(*C));
          840                 for (i = 0; i < l; i++) {
          841                         for (j = 0; j < ck; j++) {
          842                                 if (clusters[j].medoid == i)
          843                                         goto build_nexti;
          844                         }
          845                         for (j = 0; j < l; j++) {
          846                                 if (i == j)
          847                                         continue;
          848                                 for (n = 0; n < ck; n++) {
          849                                         if (clusters[n].medoid == j)
          850                                                 goto build_nextj;
          851                                 }
          852                                 d = INFINITY;
          853                                 for (n = 0; n < ck; n++) {
          854                                         if (distances[distanceindex(j, clusters[n].medoid)] < d)
          855                                                 d = distances[distanceindex(j, clusters[n].medoid)];
          856                                 }
          857                                 C[j * l + i] = d - distances[distanceindex(j, i)];
          858                                 if (C[j * l + i] < 0)
          859                                         C[j * l + i] = 0;
          860 build_nextj:;
          861                         }
          862 build_nexti:;
          863                 }
          864                 d = -1;
          865                 m = -1;
          866                 for (i = 0; i < l; i++) {
          867                         for (j = 0; j < ck; j++) {
          868                                 if (clusters[j].medoid == i)
          869                                         goto build_nexti2;
          870                         }
          871                         dsum = 0;
          872                         for (j = 0; j < l; j++)
          873                                 dsum += C[j * l + i];
          874                         if (dsum > d) {
          875                                 d = dsum;
          876                                 m = i;
          877                         }
          878 build_nexti2:;
          879                 }
          880                 clusters[ck].medoid = m;
          881         }
          882         free(C);
          883 
          884         for (;;) {
          885                 tdsum = INFINITY;
          886                 for (i = 0; i < k; i++) {
          887                         for (j = 0; j < l; j++) {
          888                                 for (n = 0; n < k; n++) {
          889                                         if (j == clusters[n].medoid)
          890                                                 goto swap_nextj;
          891                                 }
          892                                 dsum = 0;
          893                                 for (n = 0; n < l; n++) {
          894                                         if (n == j)
          895                                                 continue;
          896                                         for (m = 0; m < k; m++) {
          897                                                 if (clusters[m].medoid == n)
          898                                                         goto swap_nextn;
          899                                         }
          900                                         d = INFINITY;
          901                                         d2 = INFINITY;
          902                                         for (m = 0; m < k; m++) {
          903                                                 if (distances[distanceindex(clusters[m].medoid, n)] < d) {
          904                                                         d2 = d;
          905                                                         d = distances[distanceindex(clusters[m].medoid, n)];
          906                                                 } else if (distances[distanceindex(clusters[m].medoid, n)] < d2) {
          907                                                         d2 = distances[distanceindex(clusters[m].medoid, n)];
          908                                                 }
          909                                         }
          910                                         if (distances[distanceindex(clusters[i].medoid, n)] > d) {
          911                                                 if (distances[distanceindex(j, n)] >= d)
          912                                                         gnarf = 0;
          913                                                 else
          914                                                         gnarf = distances[distanceindex(j, n)] - d;
          915                                         } else if (distances[distanceindex(clusters[i].medoid, n)] == d) {
          916                                                 if (distances[distanceindex(j, n)] < d2)
          917                                                         gnarf = distances[distanceindex(j, n)] - d;
          918                                                 else
          919                                                         gnarf = d2 - d;
          920                                         } else {
          921                                                 continue;
          922                                         }
          923                                         dsum += gnarf;
          924 swap_nextn:;
          925                                 }
          926                                 if (dsum < tdsum) {
          927                                         ti = i;
          928                                         th = j;
          929                                         tdsum = dsum;
          930                                 }
          931 swap_nextj:;
          932                         }
          933                 }
          934                 if (tdsum >= 0)
          935                         break;
          936                 clusters[ti].medoid = th;
          937         }
          938 
          939         for (i = 0; i < k; i++)
          940                 clusters[i].n = 0;
          941 
          942         for (i = 0; i < l; i++) {
          943                 d = INFINITY;
          944                 for (j = 0; j < k; j++) {
          945                         if (distances[distanceindex(clusters[j].medoid, i)] < d) {
          946                                 assoc[i] = j;
          947                                 d = distances[distanceindex(clusters[j].medoid, i)];
          948                         }
          949                 }
          950                 clusters[assoc[i]].n++;
          951         }
          952 
          953         for (i = 0; i < l; i++) {
          954                 if (clusters[assoc[i]].medoid == i)
          955                         assoc[i] |= 128;
          956         }
          957 
          958         result = k;
          959 cleanup:
          960         free(clusters);
          961         return result;
          962 }
          963 
          964 size_t
          965 diff(size_t a, size_t b)
          966 {
          967         if (a < b)
          968                 return b - a;
          969         return a - b;
          970 }
          971 
          972 /*
          973         https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows
          974         Which references https://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm-2
          975         The code there is using the CPOL: https://www.codeproject.com/info/cpol10.aspx
          976 */
          977 size_t
          978 levenshteindistance(char *a, char *b)
          979 {
          980         size_t i, j, dc, ic, sc, r;
          981         size_t *v0, *v1, *t;
          982 
          983         v0 = xmalloc((strlen(b) + 1) * sizeof(*v0));
          984         v1 = xmalloc((strlen(b) + 1) * sizeof(*v1));
          985 
          986         for (i = 0; i <= strlen(b); i++)
          987                 v0[i] = i;
          988 
          989         for (i = 0; i < strlen(a); i++) {
          990                 v1[0] = i + 1;
          991                 for (j = 0; j < strlen(b); j++) {
          992                         dc = v0[j + 1] + 1;
          993                         ic = v1[j] + 1;
          994                         if (a[i] == b[j])
          995                                 sc = v0[j];
          996                         else
          997                                 sc = v0[j] + 1;
          998                         if (dc < ic && dc < sc)
          999                                 v1[j + 1] = dc;
         1000                         else if (ic < dc && ic < sc)
         1001                                 v1[j + 1] = ic;
         1002                         else
         1003                                 v1[j + 1] = sc;
         1004                 }
         1005                 t = v0;
         1006                 v0 = v1;
         1007                 v1 = t;
         1008         }
         1009 
         1010         r = v0[strlen(b)];
         1011 
         1012         free(v1);
         1013         free(v0);
         1014 
         1015         return r;
         1016 }
         1017 
         1018 /*
         1019         Remove common prefix and suffix. The pazz0distance is the sum of the length of the leftover strings.
         1020 */
         1021 size_t
         1022 pazz0distance(char *a, char *b)
         1023 {
         1024         size_t pl, sl;
         1025         size_t al, bl;
         1026 
         1027         al = strlen(a);
         1028         bl = strlen(b);
         1029 
         1030         for (pl = 0; pl < al && pl < bl && a[pl] == b[pl]; pl++); 
         1031         for (sl = 0; al - sl > pl && bl - sl > pl && a[al - 1 - sl] == b[bl - 1 - sl]; sl++);
         1032 
         1033         return al + bl - 2 * (pl + sl);
         1034 }
         1035 
         1036 /*
         1037         https://en.wikipedia.org/wiki/Levenshtein_distance#Recursive
         1038 */
         1039 size_t
         1040 levenshteindistance_slow(char *a, char *b)
         1041 {
         1042         size_t k, l, m;
         1043 
         1044         if (!*a)
         1045                 return strlen(b);
         1046         else if (!*b)
         1047                 return strlen(a);
         1048         else if (*a == *b)
         1049                 return levenshteindistance_slow(a + 1, b + 1);
         1050 
         1051         k = levenshteindistance_slow(a + 1, b);
         1052         l = levenshteindistance_slow(a, b + 1);
         1053         m = levenshteindistance_slow(a + 1, b + 1);
         1054 
         1055         if (k < l && k < m)
         1056                 return 1 + k;
         1057         else if (l < k && l < m)
         1058                 return 1 + l;
         1059         else
         1060                 return 1 + m;
         1061 }
         1062 
         1063 ssize_t
         1064 clustering(char *host, char *port, char *selector, struct gopherentry *g, size_t l, size_t *assocs, size_t k)
         1065 {
         1066         size_t i, j;
         1067         ssize_t r;
         1068         double *distances, d;
         1069         double *mdists;
         1070         struct {
         1071                 double selector;
         1072                 double segment;
         1073                 double index;
         1074         } *distanceparts, maxdistanceparts;
         1075         uint32_t prng;
         1076         struct point *points;
         1077 
         1078         distanceparts = xmalloc(triangularnumber(l) * sizeof(*distanceparts));
         1079 
         1080         /* So the sammon mapping can be happy. */
         1081         prng = fnv1a(4, host, port, selector, "distancewiggle");
         1082         memset(&maxdistanceparts, 0, sizeof(maxdistanceparts));
         1083         for (i = 0; i < l; i++) {
         1084                 memset(&distanceparts[distanceindex(i, i)], 0, sizeof(*distanceparts));
         1085                 for (j = 0; j < i; j++) {
         1086                         distanceparts[distanceindex(i, j)].selector = levenshteindistance(g[i].fields[1], g[j].fields[1]) + levenshteindistance(g[i].fields[2], g[j].fields[2]) + levenshteindistance(g[i].fields[3], g[j].fields[3]) + ((double)xorshift(&prng) / UINT32_MAX) * 0.000000000000000000000000000000000000001;
         1087                         distanceparts[distanceindex(i, j)].segment = diff(g[i].s, g[j].s);
         1088                         distanceparts[distanceindex(i, j)].index = diff(g[i].i, g[j].i);
         1089 
         1090                         if (distanceparts[distanceindex(i, j)].selector > maxdistanceparts.selector)
         1091                                 maxdistanceparts.selector = distanceparts[distanceindex(i, j)].selector;
         1092                         if (distanceparts[distanceindex(i, j)].segment > maxdistanceparts.segment)
         1093                                 maxdistanceparts.segment = distanceparts[distanceindex(i, j)].segment;
         1094                         if (distanceparts[distanceindex(i, j)].index > maxdistanceparts.index)
         1095                                 maxdistanceparts.index = distanceparts[distanceindex(i, j)].index;
         1096                 }
         1097         }
         1098         for (i = 0; i < l; i++) {
         1099                 for (j = 0; j < i; j++) {
         1100                         distanceparts[distanceindex(i, j)].selector /= maxdistanceparts.selector;
         1101                         distanceparts[distanceindex(i, j)].segment /= maxdistanceparts.segment;
         1102                         distanceparts[distanceindex(i, j)].index /= maxdistanceparts.index;
         1103                 }
         1104         }
         1105 
         1106         distances = xmalloc(triangularnumber(l) * sizeof(*distances));
         1107         for (i = 0; i < l; i++) {
         1108                 distances[distanceindex(i, i)] = 0;
         1109                 for (j = 0; j < i; j++)
         1110                         distances[distanceindex(i, j)] = 0.75 * distanceparts[distanceindex(i, j)].selector + 0.125 * distanceparts[distanceindex(i, j)].segment + 0.125 * distanceparts[distanceindex(i, j)].index; //distanceparts[distanceindex(i, j)].selector;
         1111         }
         1112         free(distanceparts);
         1113 
         1114         r = kmedoids(g, l, k, distances, assocs, fnv1a(4, host, port, selector, "seed"));
         1115 
         1116         points = calloc(l, sizeof(*points));
         1117         sammon(distances, l, fnv1a(4, host, port, selector, "sammonseed"), points);
         1118 
         1119         for (i = 0; i < l; i++)
         1120                 printf("%lf\t%lf\t%ld\t%lu\t%s\n", points[i].dims[0], points[i].dims[1], assocs[i] & 127, assocs[i] >> 7, g[i].fields[1]);
         1121 
         1122         free(points);
         1123         free(distances);
         1124 
         1125         return r;
         1126 }
         1127 
         1128 char gopherbuffer[16 * 1024 * 1024];
         1129 
         1130 int
         1131 main(int argc, char *argv[])
         1132 {
         1133         ssize_t l;
         1134         size_t k;
         1135         struct gopherentry *g;
         1136         char *host, *port, *selector;
         1137         size_t *cassocs;
         1138 
         1139         if (argc < 2) {
         1140                 printf("%s host [port [selector]]\n", argv[0]);
         1141                 return 1;
         1142         }
         1143 
         1144         host = argv[1];
         1145         port = (argc > 2) ? argv[2] : "70";
         1146         selector = (argc > 3) ? argv[3] : "";
         1147 
         1148         l = gopher_request(host, port, selector, gopherbuffer, sizeof(gopherbuffer));
         1149         if (l == -1)
         1150                 return 1;
         1151 
         1152         l = gopher_parsemenu(gopherbuffer, &g);
         1153         if (l == -1)
         1154                 return 1;
         1155 
         1156         l = gopher_removeentries(g, l);
         1157 
         1158         cassocs = calloc(l, sizeof(*cassocs));
         1159         if (!cassocs)
         1160                 return 1;
         1161         k = 5;
         1162         clustering(host, port, selector, g, l, cassocs, k);
         1163 
         1164         return 0;
         1165 }