URI: 
       tdedup.c - dedup - data deduplication program
  HTML git clone git://bitreich.org/dedup/ git://hg6vgqziawt5s4dj.onion/dedup/
   DIR Log
   DIR Files
   DIR Refs
   DIR Tags
   DIR README
   DIR LICENSE
       ---
       tdedup.c (14186B)
       ---
            1 #include <sys/types.h>
            2 #include <sys/stat.h>
            3 #include <sys/file.h>
            4 
            5 #include <err.h>
            6 #include <fcntl.h>
            7 #include <stdio.h>
            8 #include <stdint.h>
            9 #include <stdlib.h>
           10 #include <string.h>
           11 #include <unistd.h>
           12 
           13 #include "arg.h"
           14 #include "blake2.h"
           15 #include "dedup.h"
           16 
           17 #define SNAPSF ".snapshots"
           18 #define STOREF ".store"
           19 
           20 enum {
           21         WALK_CONTINUE,
           22         WALK_STOP
           23 };
           24 
           25 struct extract_args {
           26         uint8_t *md;
           27         int fd;
           28         int ret;
           29 };
           30 
           31 static struct snap_hdr snap_hdr;
           32 static struct blk_hdr blk_hdr;
           33 static struct icache *icache;
           34 static int ifd;
           35 static int sfd;
           36 static int hash_algo = HASH_BLAKE2B;
           37 static int compr_algo = COMPR_LZ4;
           38 
           39 int verbose;
           40 char *argv0;
           41 
           42 static void
           43 print_md(FILE *fp, uint8_t *md, size_t size)
           44 {
           45         size_t i;
           46 
           47         for (i = 0; i < size; i++)
           48                 fprintf(fp, "%02x", md[i]);
           49 }
           50 
           51 static void
           52 print_stats(struct stats *st)
           53 {
           54         unsigned long long hits, misses;
           55         double hitratio;
           56 
           57         if (st->nr_blks == 0)
           58                 return;
           59 
           60         fprintf(stderr, "Original size: %llu bytes\n",
           61                 (unsigned long long)st->orig_size);
           62         fprintf(stderr, "Compressed size: %llu bytes\n",
           63                 (unsigned long long)st->compr_size);
           64         fprintf(stderr, "Deduplicated size: %llu bytes\n",
           65                 (unsigned long long)st->dedup_size);
           66         fprintf(stderr, "Deduplication ratio: %.2f\n",
           67                 (double)st->orig_size / st->dedup_size);
           68         fprintf(stderr, "Min/avg/max block size: %llu/%llu/%llu bytes\n",
           69                 (unsigned long long)st->min_blk_size,
           70                 (unsigned long long)st->dedup_size / st->nr_blks,
           71                 (unsigned long long)st->max_blk_size);
           72         fprintf(stderr, "Number of unique blocks: %llu\n",
           73                 (unsigned long long)st->nr_blks);
           74 
           75         icache_stats(icache, &hits, &misses);
           76         if (hits == 0 && misses == 0)
           77                 hitratio = 0;
           78         else
           79                 hitratio = (double)hits / (hits + misses);
           80 
           81         fprintf(stderr, "Index cache hit percentage: %.2f%%\n",
           82                 100 * hitratio);
           83 }
           84 
           85 static struct snap *
           86 alloc_snap(void)
           87 {
           88         struct snap *snap;
           89 
           90         snap = calloc(1, sizeof(*snap));
           91         if (snap == NULL)
           92                 err(1, "%s", __func__);
           93         return snap;
           94 }
           95 
           96 static void
           97 free_snap(struct snap *snap)
           98 {
           99         free(snap);
          100 }
          101 
          102 /*
          103  * The snapshot hash is calculated over the
          104  * hash of its block descriptors.
          105  */
          106 static void
          107 hash_snap(struct snap *snap, uint8_t *md)
          108 {
          109         struct hash_ctx ctx;
          110         uint64_t i;
          111 
          112         if (hash_init(&ctx, hash_algo, MD_SIZE) < 0)
          113                 errx(1, "hash_init failed");
          114         for (i = 0; i < snap->nr_blk_descs; i++) {
          115                 struct blk_desc *blk_desc;
          116 
          117                 blk_desc = &snap->blk_desc[i];
          118                 hash_update(&ctx, blk_desc->md, sizeof(blk_desc->md));
          119         }
          120         hash_final(&ctx, md, MD_SIZE);
          121 }
          122 
          123 static struct snap *
          124 grow_snap(struct snap *snap, uint64_t nr_blk_descs)
          125 {
          126         size_t size;
          127 
          128         if (nr_blk_descs > SIZE_MAX / sizeof(snap->blk_desc[0]))
          129                 errx(1, "%s: overflow", __func__);
          130         size = nr_blk_descs * sizeof(snap->blk_desc[0]);
          131 
          132         if (size > SIZE_MAX - sizeof(*snap))
          133                 errx(1, "%s: overflow", __func__);
          134         size += sizeof(*snap);
          135 
          136         snap = realloc(snap, size);
          137         if (snap == NULL)
          138                 err(1, "%s", __func__);
          139         return snap;
          140 }
          141 
          142 static void
          143 append_snap(struct snap *snap)
          144 {
          145         if (snap->nr_blk_descs > UINT64_MAX / BLK_DESC_SIZE)
          146                 errx(1, "%s: overflow", __func__);
          147         snap->size = snap->nr_blk_descs * BLK_DESC_SIZE;
          148 
          149         if (snap->size > UINT64_MAX - SNAPSHOT_SIZE)
          150                 errx(1, "%s: overflow", __func__);
          151         snap->size += SNAPSHOT_SIZE;
          152 
          153         xlseek(ifd, snap_hdr.size, SEEK_SET);
          154         write_snap(ifd, snap);
          155         write_snap_blk_descs(ifd, snap);
          156 
          157         if (snap_hdr.size > UINT64_MAX - snap->size)
          158                 errx(1, "%s: overflow", __func__);
          159         snap_hdr.size += snap->size;
          160 
          161         if (snap_hdr.nr_snaps > UINT64_MAX - 1)
          162                 errx(1, "%s: overflow", __func__);
          163         snap_hdr.nr_snaps++;
          164 }
          165 
          166 static uint8_t *
          167 alloc_buf(size_t size)
          168 {
          169         void *p;
          170 
          171         p = calloc(1, size);
          172         if (p == NULL)
          173                 err(1, "%s", __func__);
          174         return p;
          175 }
          176 
          177 static void
          178 free_buf(uint8_t *buf)
          179 {
          180         free(buf);
          181 }
          182 
          183 static void
          184 hash_blk(uint8_t *buf, size_t size, uint8_t *md)
          185 {
          186         struct hash_ctx ctx;
          187 
          188         if (hash_init(&ctx, hash_algo, MD_SIZE) < 0)
          189                 errx(1, "hash_init failed");
          190         hash_update(&ctx, buf, size);
          191         hash_final(&ctx, md, MD_SIZE);
          192 }
          193 
          194 static void
          195 read_blk(uint8_t *buf, struct blk_desc *blk_desc)
          196 {
          197         ssize_t n;
          198 
          199         xlseek(sfd, blk_desc->offset, SEEK_SET);
          200         n = xread(sfd, buf, blk_desc->size);
          201         if (n == 0)
          202                 errx(1, "%s: unexpected EOF", __func__);
          203         if (n != blk_desc->size)
          204                 errx(1, "%s: short read", __func__);
          205 }
          206 
          207 static void
          208 append_blk(uint8_t *buf, struct blk_desc *blk_desc)
          209 {
          210         xlseek(sfd, blk_hdr.size, SEEK_SET);
          211         xwrite(sfd, buf, blk_desc->size);
          212 
          213         if (blk_hdr.size > UINT64_MAX - blk_desc->size)
          214                 errx(1, "%s: overflow", __func__);
          215         blk_hdr.size += blk_desc->size;
          216 }
          217 
          218 static void
          219 dedup_chunk(struct snap *snap, uint8_t *chunkp, size_t chunk_size)
          220 {
          221         uint8_t md[MD_SIZE];
          222         struct blk_desc blk_desc;
          223         struct compr_ctx ctx;
          224         uint8_t *compr_buf;
          225         size_t n, csize;
          226 
          227         if (compr_init(&ctx, compr_algo) < 0)
          228                 errx(1, "compr_init failed");
          229         csize = compr_size(&ctx, BLKSIZE_MAX);
          230         compr_buf = alloc_buf(csize);
          231 
          232         n = compr(&ctx, chunkp, compr_buf, chunk_size, csize);
          233         hash_blk(compr_buf, n, md);
          234 
          235         snap_hdr.st.orig_size += chunk_size;
          236         snap_hdr.st.compr_size += n;
          237 
          238         memcpy(blk_desc.md, md, sizeof(blk_desc.md));
          239         if (lookup_icache(icache, &blk_desc) < 0) {
          240                 blk_desc.offset = blk_hdr.size;
          241                 blk_desc.size = n;
          242 
          243                 snap->blk_desc[snap->nr_blk_descs++] = blk_desc;
          244                 append_blk(compr_buf, &blk_desc);
          245 
          246                 insert_icache(icache, &blk_desc);
          247 
          248                 snap_hdr.st.dedup_size += blk_desc.size;
          249                 snap_hdr.st.nr_blks++;
          250 
          251                 if (blk_desc.size > snap_hdr.st.max_blk_size)
          252                         snap_hdr.st.max_blk_size = blk_desc.size;
          253                 if (blk_desc.size < snap_hdr.st.min_blk_size)
          254                         snap_hdr.st.min_blk_size = blk_desc.size;
          255         } else {
          256                 snap->blk_desc[snap->nr_blk_descs++] = blk_desc;
          257         }
          258 
          259         free(compr_buf);
          260         compr_final(&ctx);
          261 }
          262 
          263 static void
          264 dedup(int fd, char *msg)
          265 {
          266         struct snap *snap;
          267         struct chunker *chunker;
          268 
          269         snap = alloc_snap();
          270         chunker = alloc_chunker(fd, BLKSIZE_MIN, BLKSIZE_MAX,
          271                                 HASHMASK_BITS, WINSIZE);
          272 
          273         while (fill_chunker(chunker) > 0) {
          274                 uint8_t *chunkp;
          275                 size_t chunk_size;
          276 
          277                 chunkp = get_chunk(chunker, &chunk_size);
          278                 snap = grow_snap(snap, snap->nr_blk_descs + 1);
          279                 dedup_chunk(snap, chunkp, chunk_size);
          280                 drain_chunker(chunker);
          281         }
          282 
          283         if (snap->nr_blk_descs > 0) {
          284                 if (msg != NULL) {
          285                         size_t size;
          286 
          287                         size = strlen(msg) + 1;
          288                         if (size > sizeof(snap->msg))
          289                                 size = sizeof(snap->msg);
          290                         memcpy(snap->msg, msg, size);
          291                         snap->msg[size - 1] = '\0';
          292                 }
          293                 hash_snap(snap, snap->md);
          294                 append_snap(snap);
          295         }
          296 
          297         free_chunker(chunker);
          298         free_snap(snap);
          299 }
          300 
          301 static int
          302 extract(struct snap *snap, void *arg)
          303 {
          304         uint8_t *buf[2];
          305         struct extract_args *args = arg;
          306         struct compr_ctx ctx;
          307         uint64_t i;
          308 
          309         if (memcmp(snap->md, args->md, sizeof(snap->md)) != 0)
          310                 return WALK_CONTINUE;
          311 
          312         if (compr_init(&ctx, compr_algo) < 0)
          313                 errx(1, "compr_init failed");
          314         buf[0] = alloc_buf(BLKSIZE_MAX);
          315         buf[1] = alloc_buf(compr_size(&ctx, BLKSIZE_MAX));
          316         for (i = 0; i < snap->nr_blk_descs; i++) {
          317                 struct blk_desc *blk_desc;
          318                 size_t blksize;
          319 
          320                 blk_desc = &snap->blk_desc[i];
          321                 read_blk(buf[1], blk_desc);
          322                 blksize = decompr(&ctx, buf[1], buf[0], blk_desc->size, BLKSIZE_MAX);
          323                 xwrite(args->fd, buf[0], blksize);
          324         }
          325         free_buf(buf[1]);
          326         free_buf(buf[0]);
          327         compr_final(&ctx);
          328         args->ret = 0;
          329         return WALK_STOP;
          330 }
          331 
          332 /*
          333  * Hash every block referenced by the given snapshot
          334  * and compare its hash with the one stored in the corresponding
          335  * block descriptor.
          336  */
          337 static int
          338 check_snap(struct snap *snap, void *arg)
          339 {
          340         struct compr_ctx ctx;
          341         uint8_t *buf;
          342         int *ret = arg;
          343         uint64_t i;
          344 
          345         if (verbose > 0) {
          346                 fprintf(stderr, "Checking snapshot: ");
          347                 print_md(stderr, snap->md, sizeof(snap->md));
          348                 fputc('\n', stderr);
          349         }
          350 
          351         if (compr_init(&ctx, compr_algo) < 0)
          352                 errx(1, "compr_init failed");
          353         buf = alloc_buf(compr_size(&ctx, BLKSIZE_MAX));
          354         for (i = 0; i < snap->nr_blk_descs; i++) {
          355                 uint8_t md[MD_SIZE];
          356                 struct blk_desc *blk_desc;
          357 
          358                 blk_desc = &snap->blk_desc[i];
          359                 read_blk(buf, blk_desc);
          360                 hash_blk(buf, blk_desc->size, md);
          361 
          362                 if (memcmp(blk_desc->md, md, sizeof(blk_desc->md)) == 0)
          363                         continue;
          364 
          365                 fprintf(stderr, "Block hash mismatch\n");
          366                 fprintf(stderr, "  Expected hash: ");
          367                 print_md(stderr, blk_desc->md, sizeof(blk_desc->md));
          368                 fputc('\n', stderr);
          369                 fprintf(stderr, "  Actual hash: ");
          370                 print_md(stderr, md, sizeof(md));
          371                 fputc('\n', stderr);
          372                 fprintf(stderr, "  Offset: %llu\n",
          373                         (unsigned long long)blk_desc->offset);
          374                 fprintf(stderr, "  Size: %llu\n",
          375                         (unsigned long long)blk_desc->size);
          376                 *ret = -1;
          377         }
          378         free_buf(buf);
          379         compr_final(&ctx);
          380         return WALK_CONTINUE;
          381 }
          382 
          383 static int
          384 build_icache(struct snap *snap, void *arg)
          385 {
          386         struct compr_ctx ctx;
          387         uint8_t *buf;
          388         uint64_t i;
          389 
          390         if (compr_init(&ctx, compr_algo) < 0)
          391                 errx(1, "compr_init failed");
          392         buf = alloc_buf(compr_size(&ctx, BLKSIZE_MAX));
          393         for (i = 0; i < snap->nr_blk_descs; i++) {
          394                 struct blk_desc *blk_desc;
          395 
          396                 blk_desc = &snap->blk_desc[i];
          397                 insert_icache(icache, blk_desc);
          398         }
          399         free(buf);
          400         compr_final(&ctx);
          401         return WALK_CONTINUE;
          402 }
          403 
          404 static int
          405 list(struct snap *snap, void *arg)
          406 {
          407         print_md(stdout, snap->md, sizeof(snap->md));
          408         if (snap->msg[0] != '\0')
          409                 printf("\t%s\n", snap->msg);
          410         else
          411                 putchar('\n');
          412         return WALK_CONTINUE;
          413 }
          414 
          415 /* Walk through all snapshots and call fn() on each one */
          416 static void
          417 walk_snap(int (*fn)(struct snap *, void *), void *arg)
          418 {
          419         uint64_t i;
          420 
          421         xlseek(ifd, SNAP_HDR_SIZE, SEEK_SET);
          422         for (i = 0; i < snap_hdr.nr_snaps; i++) {
          423                 struct snap *snap;
          424                 int ret;
          425 
          426                 snap = alloc_snap();
          427                 read_snap(ifd, snap);
          428                 snap = grow_snap(snap, snap->nr_blk_descs);
          429                 read_snap_descs(ifd, snap);
          430 
          431                 ret = (*fn)(snap, arg);
          432                 free(snap);
          433                 if (ret == WALK_STOP)
          434                         break;
          435         }
          436 }
          437 
          438 static void
          439 match_ver(uint64_t v)
          440 {
          441         uint8_t maj, min;
          442 
          443         min = v & VER_MIN_MASK;
          444         maj = (v >> VER_MAJ_SHIFT) & VER_MAJ_MASK;
          445         if (maj == VER_MAJ && min == VER_MIN)
          446                 return;
          447         errx(1, "format version mismatch: expected %u.%u but got %u.%u",
          448              VER_MAJ, VER_MIN, maj, min);
          449 }
          450 
          451 static void
          452 init_blk_hdr(void)
          453 {
          454         blk_hdr.flags = (VER_MAJ << VER_MAJ_SHIFT) | VER_MIN;
          455         blk_hdr.flags |= compr_algo << COMPR_ALGO_SHIFT;
          456         blk_hdr.flags |= hash_algo << HASH_ALGO_SHIFT;
          457         blk_hdr.size = BLK_HDR_SIZE;
          458 }
          459 
          460 static void
          461 load_blk_hdr(void)
          462 {
          463         uint64_t v;
          464 
          465         xlseek(sfd, 0, SEEK_SET);
          466         read_blk_hdr(sfd, &blk_hdr);
          467         match_ver(blk_hdr.flags);
          468 
          469         v = blk_hdr.flags >> COMPR_ALGO_SHIFT;
          470         v &= COMPR_ALGO_MASK;
          471         compr_algo = v;
          472 
          473         if (compr_algo < 0 || compr_algo >= NR_COMPRS)
          474                 errx(1, "unsupported compression algorithm: %d", compr_algo);
          475 
          476         if (verbose > 0)
          477                 fprintf(stderr, "Compression algorithm: %s\n",
          478                         compr_type2name(compr_algo));
          479 
          480         v = blk_hdr.flags >> HASH_ALGO_SHIFT;
          481         v &= HASH_ALGO_MASK;
          482         hash_algo = v;
          483 
          484         if (hash_algo < 0 || hash_algo >= NR_HASHES)
          485                 errx(1, "unsupported hash algorithm: %d", hash_algo);
          486 
          487         if (verbose > 0)
          488                 fprintf(stderr, "Hash algorithm: %s\n",
          489                         hash_type2name(hash_algo));
          490 }
          491 
          492 static void
          493 save_blk_hdr(void)
          494 {
          495         xlseek(sfd, 0, SEEK_SET);
          496         write_blk_hdr(sfd, &blk_hdr);
          497 }
          498 
          499 static void
          500 init_snap_hdr(void)
          501 {
          502         struct compr_ctx ctx;
          503 
          504         if (compr_init(&ctx, compr_algo) < 0)
          505                 errx(1, "compr_init failed");
          506         snap_hdr.flags = (VER_MAJ << VER_MAJ_SHIFT) | VER_MIN;
          507         snap_hdr.size = SNAP_HDR_SIZE;
          508         snap_hdr.st.min_blk_size = compr_size(&ctx, BLKSIZE_MAX);
          509         compr_final(&ctx);
          510 }
          511 
          512 static void
          513 load_snap_hdr(void)
          514 {
          515         xlseek(ifd, 0, SEEK_SET);
          516         read_snap_hdr(ifd, &snap_hdr);
          517         match_ver(snap_hdr.flags);
          518 }
          519 
          520 static void
          521 save_snap_hdr(void)
          522 {
          523         xlseek(ifd, 0, SEEK_SET);
          524         write_snap_hdr(ifd, &snap_hdr);
          525 }
          526 
          527 static void
          528 init(int iflag)
          529 {
          530         int flags;
          531 
          532         flags = O_RDWR;
          533         if (iflag)
          534                 flags |= O_CREAT | O_EXCL;
          535 
          536         ifd = open(SNAPSF, flags, 0600);
          537         if (ifd < 0)
          538                 err(1, "open %s", SNAPSF);
          539 
          540         sfd = open(STOREF, flags, 0600);
          541         if (sfd < 0)
          542                 err(1, "open %s", STOREF);
          543 
          544         if (flock(ifd, LOCK_NB | LOCK_EX) < 0 ||
          545             flock(sfd, LOCK_NB | LOCK_EX) < 0)
          546                 err(1, "flock");
          547 
          548         if (iflag) {
          549                 init_snap_hdr();
          550                 init_blk_hdr();
          551         } else {
          552                 load_snap_hdr();
          553                 load_blk_hdr();
          554         }
          555 
          556         icache = alloc_icache();
          557         walk_snap(build_icache, NULL);
          558 }
          559 
          560 static void
          561 term(void)
          562 {
          563         if (verbose > 0)
          564                 print_stats(&snap_hdr.st);
          565 
          566         free_icache(icache);
          567 
          568         save_blk_hdr();
          569         save_snap_hdr();
          570 
          571         fsync(sfd);
          572         fsync(ifd);
          573 
          574         close(sfd);
          575         close(ifd);
          576 }
          577 
          578 static void
          579 usage(void)
          580 {
          581         fprintf(stderr, "usage: %s [-cilv] [-H hash] [-Z compressor] [-e id] [-r root] [-m message] [file]\n", argv0);
          582         exit(1);
          583 }
          584 
          585 int
          586 main(int argc, char *argv[])
          587 {
          588         uint8_t md[MD_SIZE];
          589         char *id = NULL, *root = NULL, *msg = NULL, *hash_name = NULL;
          590         char *compr_name;
          591         int iflag = 0, lflag = 0, cflag = 0;
          592         int fd = -1;
          593 
          594         ARGBEGIN {
          595         case 'H':
          596                 hash_name = EARGF(usage());
          597                 if (strcmp(hash_name, "?") == 0) {
          598                         hash_list(STDERR_FILENO);
          599                         return 0;
          600                 }
          601                 hash_algo = hash_name2type(hash_name);
          602                 if (hash_algo < 0)
          603                         errx(1, "unknown hash: %s", hash_name);
          604                 break;
          605         case 'Z':
          606                 compr_name = EARGF(usage());
          607                 if (strcmp(compr_name, "?") == 0) {
          608                         compr_list(STDERR_FILENO);
          609                         return 0;
          610                 }
          611                 compr_algo = compr_name2type(compr_name);
          612                 if (compr_algo < 0)
          613                         errx(1, "unknown compressor: %s", compr_name);
          614                 break;
          615         case 'c':
          616                 cflag = 1;
          617                 break;
          618         case 'e':
          619                 id = EARGF(usage());
          620                 break;
          621         case 'i':
          622                 iflag = 1;
          623                 break;
          624         case 'l':
          625                 lflag = 1;
          626                 break;
          627         case 'r':
          628                 root = EARGF(usage());
          629                 break;
          630         case 'm':
          631                 msg = EARGF(usage());
          632                 break;
          633         case 'v':
          634                 verbose++;
          635                 break;
          636         default:
          637                 usage();
          638         } ARGEND
          639 
          640         if (argc > 1) {
          641                 usage();
          642         } else if (argc == 1) {
          643                 if (id) {
          644                         fd = open(argv[0], O_RDWR | O_CREAT, 0600);
          645                         if (fd < 0)
          646                                 err(1, "open %s", argv[0]);
          647                 } else {
          648                         fd = open(argv[0], O_RDONLY);
          649                         if (fd < 0)
          650                                 err(1, "open %s", argv[0]);
          651                 }
          652         } else {
          653                 if (id)
          654                         fd = STDOUT_FILENO;
          655                 else
          656                         fd = STDIN_FILENO;
          657         }
          658 
          659         if (root != NULL) {
          660                 mkdir(root, 0700);
          661                 if (chdir(root) < 0)
          662                         err(1, "chdir: %s", root);
          663         }
          664 
          665         init(iflag);
          666 
          667         if (iflag) {
          668                 term();
          669                 return 0;
          670         }
          671 
          672         if (cflag) {
          673                 int ret;
          674 
          675                 ret = 0;
          676                 walk_snap(check_snap, &ret);
          677                 if (ret != 0)
          678                         errx(1, "%s or %s is corrupted", SNAPSF, STOREF);
          679 
          680                 term();
          681                 return 0;
          682         }
          683 
          684         if (lflag) {
          685                 walk_snap(list, NULL);
          686                 term();
          687                 return 0;
          688         }
          689 
          690         if (id) {
          691                 struct extract_args args;
          692 
          693                 str2bin(id, md);
          694                 args.md = md;
          695                 args.fd = fd;
          696                 args.ret = -1;
          697                 walk_snap(extract, &args);
          698                 if (args.ret != 0)
          699                         errx(1, "unknown snapshot: %s", id);
          700         } else {
          701                 dedup(fd, msg);
          702         }
          703 
          704         term();
          705         return 0;
          706 }