tdedup.c - dedup - data deduplication program HTML git clone git://bitreich.org/dedup/ git://hg6vgqziawt5s4dj.onion/dedup/ DIR Log DIR Files DIR Refs DIR Tags DIR README DIR LICENSE --- tdedup.c (14186B) --- 1 #include <sys/types.h> 2 #include <sys/stat.h> 3 #include <sys/file.h> 4 5 #include <err.h> 6 #include <fcntl.h> 7 #include <stdio.h> 8 #include <stdint.h> 9 #include <stdlib.h> 10 #include <string.h> 11 #include <unistd.h> 12 13 #include "arg.h" 14 #include "blake2.h" 15 #include "dedup.h" 16 17 #define SNAPSF ".snapshots" 18 #define STOREF ".store" 19 20 enum { 21 WALK_CONTINUE, 22 WALK_STOP 23 }; 24 25 struct extract_args { 26 uint8_t *md; 27 int fd; 28 int ret; 29 }; 30 31 static struct snap_hdr snap_hdr; 32 static struct blk_hdr blk_hdr; 33 static struct icache *icache; 34 static int ifd; 35 static int sfd; 36 static int hash_algo = HASH_BLAKE2B; 37 static int compr_algo = COMPR_LZ4; 38 39 int verbose; 40 char *argv0; 41 42 static void 43 print_md(FILE *fp, uint8_t *md, size_t size) 44 { 45 size_t i; 46 47 for (i = 0; i < size; i++) 48 fprintf(fp, "%02x", md[i]); 49 } 50 51 static void 52 print_stats(struct stats *st) 53 { 54 unsigned long long hits, misses; 55 double hitratio; 56 57 if (st->nr_blks == 0) 58 return; 59 60 fprintf(stderr, "Original size: %llu bytes\n", 61 (unsigned long long)st->orig_size); 62 fprintf(stderr, "Compressed size: %llu bytes\n", 63 (unsigned long long)st->compr_size); 64 fprintf(stderr, "Deduplicated size: %llu bytes\n", 65 (unsigned long long)st->dedup_size); 66 fprintf(stderr, "Deduplication ratio: %.2f\n", 67 (double)st->orig_size / st->dedup_size); 68 fprintf(stderr, "Min/avg/max block size: %llu/%llu/%llu bytes\n", 69 (unsigned long long)st->min_blk_size, 70 (unsigned long long)st->dedup_size / st->nr_blks, 71 (unsigned long long)st->max_blk_size); 72 fprintf(stderr, "Number of unique blocks: %llu\n", 73 (unsigned long long)st->nr_blks); 74 75 icache_stats(icache, &hits, &misses); 76 if (hits == 0 && misses == 0) 77 hitratio = 0; 78 else 79 hitratio = (double)hits / (hits + misses); 80 81 fprintf(stderr, "Index cache hit percentage: %.2f%%\n", 82 100 * hitratio); 83 } 84 85 static struct snap * 86 alloc_snap(void) 87 { 88 struct snap *snap; 89 90 snap = calloc(1, sizeof(*snap)); 91 if (snap == NULL) 92 err(1, "%s", __func__); 93 return snap; 94 } 95 96 static void 97 free_snap(struct snap *snap) 98 { 99 free(snap); 100 } 101 102 /* 103 * The snapshot hash is calculated over the 104 * hash of its block descriptors. 105 */ 106 static void 107 hash_snap(struct snap *snap, uint8_t *md) 108 { 109 struct hash_ctx ctx; 110 uint64_t i; 111 112 if (hash_init(&ctx, hash_algo, MD_SIZE) < 0) 113 errx(1, "hash_init failed"); 114 for (i = 0; i < snap->nr_blk_descs; i++) { 115 struct blk_desc *blk_desc; 116 117 blk_desc = &snap->blk_desc[i]; 118 hash_update(&ctx, blk_desc->md, sizeof(blk_desc->md)); 119 } 120 hash_final(&ctx, md, MD_SIZE); 121 } 122 123 static struct snap * 124 grow_snap(struct snap *snap, uint64_t nr_blk_descs) 125 { 126 size_t size; 127 128 if (nr_blk_descs > SIZE_MAX / sizeof(snap->blk_desc[0])) 129 errx(1, "%s: overflow", __func__); 130 size = nr_blk_descs * sizeof(snap->blk_desc[0]); 131 132 if (size > SIZE_MAX - sizeof(*snap)) 133 errx(1, "%s: overflow", __func__); 134 size += sizeof(*snap); 135 136 snap = realloc(snap, size); 137 if (snap == NULL) 138 err(1, "%s", __func__); 139 return snap; 140 } 141 142 static void 143 append_snap(struct snap *snap) 144 { 145 if (snap->nr_blk_descs > UINT64_MAX / BLK_DESC_SIZE) 146 errx(1, "%s: overflow", __func__); 147 snap->size = snap->nr_blk_descs * BLK_DESC_SIZE; 148 149 if (snap->size > UINT64_MAX - SNAPSHOT_SIZE) 150 errx(1, "%s: overflow", __func__); 151 snap->size += SNAPSHOT_SIZE; 152 153 xlseek(ifd, snap_hdr.size, SEEK_SET); 154 write_snap(ifd, snap); 155 write_snap_blk_descs(ifd, snap); 156 157 if (snap_hdr.size > UINT64_MAX - snap->size) 158 errx(1, "%s: overflow", __func__); 159 snap_hdr.size += snap->size; 160 161 if (snap_hdr.nr_snaps > UINT64_MAX - 1) 162 errx(1, "%s: overflow", __func__); 163 snap_hdr.nr_snaps++; 164 } 165 166 static uint8_t * 167 alloc_buf(size_t size) 168 { 169 void *p; 170 171 p = calloc(1, size); 172 if (p == NULL) 173 err(1, "%s", __func__); 174 return p; 175 } 176 177 static void 178 free_buf(uint8_t *buf) 179 { 180 free(buf); 181 } 182 183 static void 184 hash_blk(uint8_t *buf, size_t size, uint8_t *md) 185 { 186 struct hash_ctx ctx; 187 188 if (hash_init(&ctx, hash_algo, MD_SIZE) < 0) 189 errx(1, "hash_init failed"); 190 hash_update(&ctx, buf, size); 191 hash_final(&ctx, md, MD_SIZE); 192 } 193 194 static void 195 read_blk(uint8_t *buf, struct blk_desc *blk_desc) 196 { 197 ssize_t n; 198 199 xlseek(sfd, blk_desc->offset, SEEK_SET); 200 n = xread(sfd, buf, blk_desc->size); 201 if (n == 0) 202 errx(1, "%s: unexpected EOF", __func__); 203 if (n != blk_desc->size) 204 errx(1, "%s: short read", __func__); 205 } 206 207 static void 208 append_blk(uint8_t *buf, struct blk_desc *blk_desc) 209 { 210 xlseek(sfd, blk_hdr.size, SEEK_SET); 211 xwrite(sfd, buf, blk_desc->size); 212 213 if (blk_hdr.size > UINT64_MAX - blk_desc->size) 214 errx(1, "%s: overflow", __func__); 215 blk_hdr.size += blk_desc->size; 216 } 217 218 static void 219 dedup_chunk(struct snap *snap, uint8_t *chunkp, size_t chunk_size) 220 { 221 uint8_t md[MD_SIZE]; 222 struct blk_desc blk_desc; 223 struct compr_ctx ctx; 224 uint8_t *compr_buf; 225 size_t n, csize; 226 227 if (compr_init(&ctx, compr_algo) < 0) 228 errx(1, "compr_init failed"); 229 csize = compr_size(&ctx, BLKSIZE_MAX); 230 compr_buf = alloc_buf(csize); 231 232 n = compr(&ctx, chunkp, compr_buf, chunk_size, csize); 233 hash_blk(compr_buf, n, md); 234 235 snap_hdr.st.orig_size += chunk_size; 236 snap_hdr.st.compr_size += n; 237 238 memcpy(blk_desc.md, md, sizeof(blk_desc.md)); 239 if (lookup_icache(icache, &blk_desc) < 0) { 240 blk_desc.offset = blk_hdr.size; 241 blk_desc.size = n; 242 243 snap->blk_desc[snap->nr_blk_descs++] = blk_desc; 244 append_blk(compr_buf, &blk_desc); 245 246 insert_icache(icache, &blk_desc); 247 248 snap_hdr.st.dedup_size += blk_desc.size; 249 snap_hdr.st.nr_blks++; 250 251 if (blk_desc.size > snap_hdr.st.max_blk_size) 252 snap_hdr.st.max_blk_size = blk_desc.size; 253 if (blk_desc.size < snap_hdr.st.min_blk_size) 254 snap_hdr.st.min_blk_size = blk_desc.size; 255 } else { 256 snap->blk_desc[snap->nr_blk_descs++] = blk_desc; 257 } 258 259 free(compr_buf); 260 compr_final(&ctx); 261 } 262 263 static void 264 dedup(int fd, char *msg) 265 { 266 struct snap *snap; 267 struct chunker *chunker; 268 269 snap = alloc_snap(); 270 chunker = alloc_chunker(fd, BLKSIZE_MIN, BLKSIZE_MAX, 271 HASHMASK_BITS, WINSIZE); 272 273 while (fill_chunker(chunker) > 0) { 274 uint8_t *chunkp; 275 size_t chunk_size; 276 277 chunkp = get_chunk(chunker, &chunk_size); 278 snap = grow_snap(snap, snap->nr_blk_descs + 1); 279 dedup_chunk(snap, chunkp, chunk_size); 280 drain_chunker(chunker); 281 } 282 283 if (snap->nr_blk_descs > 0) { 284 if (msg != NULL) { 285 size_t size; 286 287 size = strlen(msg) + 1; 288 if (size > sizeof(snap->msg)) 289 size = sizeof(snap->msg); 290 memcpy(snap->msg, msg, size); 291 snap->msg[size - 1] = '\0'; 292 } 293 hash_snap(snap, snap->md); 294 append_snap(snap); 295 } 296 297 free_chunker(chunker); 298 free_snap(snap); 299 } 300 301 static int 302 extract(struct snap *snap, void *arg) 303 { 304 uint8_t *buf[2]; 305 struct extract_args *args = arg; 306 struct compr_ctx ctx; 307 uint64_t i; 308 309 if (memcmp(snap->md, args->md, sizeof(snap->md)) != 0) 310 return WALK_CONTINUE; 311 312 if (compr_init(&ctx, compr_algo) < 0) 313 errx(1, "compr_init failed"); 314 buf[0] = alloc_buf(BLKSIZE_MAX); 315 buf[1] = alloc_buf(compr_size(&ctx, BLKSIZE_MAX)); 316 for (i = 0; i < snap->nr_blk_descs; i++) { 317 struct blk_desc *blk_desc; 318 size_t blksize; 319 320 blk_desc = &snap->blk_desc[i]; 321 read_blk(buf[1], blk_desc); 322 blksize = decompr(&ctx, buf[1], buf[0], blk_desc->size, BLKSIZE_MAX); 323 xwrite(args->fd, buf[0], blksize); 324 } 325 free_buf(buf[1]); 326 free_buf(buf[0]); 327 compr_final(&ctx); 328 args->ret = 0; 329 return WALK_STOP; 330 } 331 332 /* 333 * Hash every block referenced by the given snapshot 334 * and compare its hash with the one stored in the corresponding 335 * block descriptor. 336 */ 337 static int 338 check_snap(struct snap *snap, void *arg) 339 { 340 struct compr_ctx ctx; 341 uint8_t *buf; 342 int *ret = arg; 343 uint64_t i; 344 345 if (verbose > 0) { 346 fprintf(stderr, "Checking snapshot: "); 347 print_md(stderr, snap->md, sizeof(snap->md)); 348 fputc('\n', stderr); 349 } 350 351 if (compr_init(&ctx, compr_algo) < 0) 352 errx(1, "compr_init failed"); 353 buf = alloc_buf(compr_size(&ctx, BLKSIZE_MAX)); 354 for (i = 0; i < snap->nr_blk_descs; i++) { 355 uint8_t md[MD_SIZE]; 356 struct blk_desc *blk_desc; 357 358 blk_desc = &snap->blk_desc[i]; 359 read_blk(buf, blk_desc); 360 hash_blk(buf, blk_desc->size, md); 361 362 if (memcmp(blk_desc->md, md, sizeof(blk_desc->md)) == 0) 363 continue; 364 365 fprintf(stderr, "Block hash mismatch\n"); 366 fprintf(stderr, " Expected hash: "); 367 print_md(stderr, blk_desc->md, sizeof(blk_desc->md)); 368 fputc('\n', stderr); 369 fprintf(stderr, " Actual hash: "); 370 print_md(stderr, md, sizeof(md)); 371 fputc('\n', stderr); 372 fprintf(stderr, " Offset: %llu\n", 373 (unsigned long long)blk_desc->offset); 374 fprintf(stderr, " Size: %llu\n", 375 (unsigned long long)blk_desc->size); 376 *ret = -1; 377 } 378 free_buf(buf); 379 compr_final(&ctx); 380 return WALK_CONTINUE; 381 } 382 383 static int 384 build_icache(struct snap *snap, void *arg) 385 { 386 struct compr_ctx ctx; 387 uint8_t *buf; 388 uint64_t i; 389 390 if (compr_init(&ctx, compr_algo) < 0) 391 errx(1, "compr_init failed"); 392 buf = alloc_buf(compr_size(&ctx, BLKSIZE_MAX)); 393 for (i = 0; i < snap->nr_blk_descs; i++) { 394 struct blk_desc *blk_desc; 395 396 blk_desc = &snap->blk_desc[i]; 397 insert_icache(icache, blk_desc); 398 } 399 free(buf); 400 compr_final(&ctx); 401 return WALK_CONTINUE; 402 } 403 404 static int 405 list(struct snap *snap, void *arg) 406 { 407 print_md(stdout, snap->md, sizeof(snap->md)); 408 if (snap->msg[0] != '\0') 409 printf("\t%s\n", snap->msg); 410 else 411 putchar('\n'); 412 return WALK_CONTINUE; 413 } 414 415 /* Walk through all snapshots and call fn() on each one */ 416 static void 417 walk_snap(int (*fn)(struct snap *, void *), void *arg) 418 { 419 uint64_t i; 420 421 xlseek(ifd, SNAP_HDR_SIZE, SEEK_SET); 422 for (i = 0; i < snap_hdr.nr_snaps; i++) { 423 struct snap *snap; 424 int ret; 425 426 snap = alloc_snap(); 427 read_snap(ifd, snap); 428 snap = grow_snap(snap, snap->nr_blk_descs); 429 read_snap_descs(ifd, snap); 430 431 ret = (*fn)(snap, arg); 432 free(snap); 433 if (ret == WALK_STOP) 434 break; 435 } 436 } 437 438 static void 439 match_ver(uint64_t v) 440 { 441 uint8_t maj, min; 442 443 min = v & VER_MIN_MASK; 444 maj = (v >> VER_MAJ_SHIFT) & VER_MAJ_MASK; 445 if (maj == VER_MAJ && min == VER_MIN) 446 return; 447 errx(1, "format version mismatch: expected %u.%u but got %u.%u", 448 VER_MAJ, VER_MIN, maj, min); 449 } 450 451 static void 452 init_blk_hdr(void) 453 { 454 blk_hdr.flags = (VER_MAJ << VER_MAJ_SHIFT) | VER_MIN; 455 blk_hdr.flags |= compr_algo << COMPR_ALGO_SHIFT; 456 blk_hdr.flags |= hash_algo << HASH_ALGO_SHIFT; 457 blk_hdr.size = BLK_HDR_SIZE; 458 } 459 460 static void 461 load_blk_hdr(void) 462 { 463 uint64_t v; 464 465 xlseek(sfd, 0, SEEK_SET); 466 read_blk_hdr(sfd, &blk_hdr); 467 match_ver(blk_hdr.flags); 468 469 v = blk_hdr.flags >> COMPR_ALGO_SHIFT; 470 v &= COMPR_ALGO_MASK; 471 compr_algo = v; 472 473 if (compr_algo < 0 || compr_algo >= NR_COMPRS) 474 errx(1, "unsupported compression algorithm: %d", compr_algo); 475 476 if (verbose > 0) 477 fprintf(stderr, "Compression algorithm: %s\n", 478 compr_type2name(compr_algo)); 479 480 v = blk_hdr.flags >> HASH_ALGO_SHIFT; 481 v &= HASH_ALGO_MASK; 482 hash_algo = v; 483 484 if (hash_algo < 0 || hash_algo >= NR_HASHES) 485 errx(1, "unsupported hash algorithm: %d", hash_algo); 486 487 if (verbose > 0) 488 fprintf(stderr, "Hash algorithm: %s\n", 489 hash_type2name(hash_algo)); 490 } 491 492 static void 493 save_blk_hdr(void) 494 { 495 xlseek(sfd, 0, SEEK_SET); 496 write_blk_hdr(sfd, &blk_hdr); 497 } 498 499 static void 500 init_snap_hdr(void) 501 { 502 struct compr_ctx ctx; 503 504 if (compr_init(&ctx, compr_algo) < 0) 505 errx(1, "compr_init failed"); 506 snap_hdr.flags = (VER_MAJ << VER_MAJ_SHIFT) | VER_MIN; 507 snap_hdr.size = SNAP_HDR_SIZE; 508 snap_hdr.st.min_blk_size = compr_size(&ctx, BLKSIZE_MAX); 509 compr_final(&ctx); 510 } 511 512 static void 513 load_snap_hdr(void) 514 { 515 xlseek(ifd, 0, SEEK_SET); 516 read_snap_hdr(ifd, &snap_hdr); 517 match_ver(snap_hdr.flags); 518 } 519 520 static void 521 save_snap_hdr(void) 522 { 523 xlseek(ifd, 0, SEEK_SET); 524 write_snap_hdr(ifd, &snap_hdr); 525 } 526 527 static void 528 init(int iflag) 529 { 530 int flags; 531 532 flags = O_RDWR; 533 if (iflag) 534 flags |= O_CREAT | O_EXCL; 535 536 ifd = open(SNAPSF, flags, 0600); 537 if (ifd < 0) 538 err(1, "open %s", SNAPSF); 539 540 sfd = open(STOREF, flags, 0600); 541 if (sfd < 0) 542 err(1, "open %s", STOREF); 543 544 if (flock(ifd, LOCK_NB | LOCK_EX) < 0 || 545 flock(sfd, LOCK_NB | LOCK_EX) < 0) 546 err(1, "flock"); 547 548 if (iflag) { 549 init_snap_hdr(); 550 init_blk_hdr(); 551 } else { 552 load_snap_hdr(); 553 load_blk_hdr(); 554 } 555 556 icache = alloc_icache(); 557 walk_snap(build_icache, NULL); 558 } 559 560 static void 561 term(void) 562 { 563 if (verbose > 0) 564 print_stats(&snap_hdr.st); 565 566 free_icache(icache); 567 568 save_blk_hdr(); 569 save_snap_hdr(); 570 571 fsync(sfd); 572 fsync(ifd); 573 574 close(sfd); 575 close(ifd); 576 } 577 578 static void 579 usage(void) 580 { 581 fprintf(stderr, "usage: %s [-cilv] [-H hash] [-Z compressor] [-e id] [-r root] [-m message] [file]\n", argv0); 582 exit(1); 583 } 584 585 int 586 main(int argc, char *argv[]) 587 { 588 uint8_t md[MD_SIZE]; 589 char *id = NULL, *root = NULL, *msg = NULL, *hash_name = NULL; 590 char *compr_name; 591 int iflag = 0, lflag = 0, cflag = 0; 592 int fd = -1; 593 594 ARGBEGIN { 595 case 'H': 596 hash_name = EARGF(usage()); 597 if (strcmp(hash_name, "?") == 0) { 598 hash_list(STDERR_FILENO); 599 return 0; 600 } 601 hash_algo = hash_name2type(hash_name); 602 if (hash_algo < 0) 603 errx(1, "unknown hash: %s", hash_name); 604 break; 605 case 'Z': 606 compr_name = EARGF(usage()); 607 if (strcmp(compr_name, "?") == 0) { 608 compr_list(STDERR_FILENO); 609 return 0; 610 } 611 compr_algo = compr_name2type(compr_name); 612 if (compr_algo < 0) 613 errx(1, "unknown compressor: %s", compr_name); 614 break; 615 case 'c': 616 cflag = 1; 617 break; 618 case 'e': 619 id = EARGF(usage()); 620 break; 621 case 'i': 622 iflag = 1; 623 break; 624 case 'l': 625 lflag = 1; 626 break; 627 case 'r': 628 root = EARGF(usage()); 629 break; 630 case 'm': 631 msg = EARGF(usage()); 632 break; 633 case 'v': 634 verbose++; 635 break; 636 default: 637 usage(); 638 } ARGEND 639 640 if (argc > 1) { 641 usage(); 642 } else if (argc == 1) { 643 if (id) { 644 fd = open(argv[0], O_RDWR | O_CREAT, 0600); 645 if (fd < 0) 646 err(1, "open %s", argv[0]); 647 } else { 648 fd = open(argv[0], O_RDONLY); 649 if (fd < 0) 650 err(1, "open %s", argv[0]); 651 } 652 } else { 653 if (id) 654 fd = STDOUT_FILENO; 655 else 656 fd = STDIN_FILENO; 657 } 658 659 if (root != NULL) { 660 mkdir(root, 0700); 661 if (chdir(root) < 0) 662 err(1, "chdir: %s", root); 663 } 664 665 init(iflag); 666 667 if (iflag) { 668 term(); 669 return 0; 670 } 671 672 if (cflag) { 673 int ret; 674 675 ret = 0; 676 walk_snap(check_snap, &ret); 677 if (ret != 0) 678 errx(1, "%s or %s is corrupted", SNAPSF, STOREF); 679 680 term(); 681 return 0; 682 } 683 684 if (lflag) { 685 walk_snap(list, NULL); 686 term(); 687 return 0; 688 } 689 690 if (id) { 691 struct extract_args args; 692 693 str2bin(id, md); 694 args.md = md; 695 args.fd = fd; 696 args.ret = -1; 697 walk_snap(extract, &args); 698 if (args.ret != 0) 699 errx(1, "unknown snapshot: %s", id); 700 } else { 701 dedup(fd, msg); 702 } 703 704 term(); 705 return 0; 706 }