tdedup.c - dedup - data deduplication program
HTML git clone git://bitreich.org/dedup/ git://hg6vgqziawt5s4dj.onion/dedup/
DIR Log
DIR Files
DIR Refs
DIR Tags
DIR README
DIR LICENSE
---
tdedup.c (14186B)
---
1 #include <sys/types.h>
2 #include <sys/stat.h>
3 #include <sys/file.h>
4
5 #include <err.h>
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdint.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <unistd.h>
12
13 #include "arg.h"
14 #include "blake2.h"
15 #include "dedup.h"
16
17 #define SNAPSF ".snapshots"
18 #define STOREF ".store"
19
20 enum {
21 WALK_CONTINUE,
22 WALK_STOP
23 };
24
25 struct extract_args {
26 uint8_t *md;
27 int fd;
28 int ret;
29 };
30
31 static struct snap_hdr snap_hdr;
32 static struct blk_hdr blk_hdr;
33 static struct icache *icache;
34 static int ifd;
35 static int sfd;
36 static int hash_algo = HASH_BLAKE2B;
37 static int compr_algo = COMPR_LZ4;
38
39 int verbose;
40 char *argv0;
41
42 static void
43 print_md(FILE *fp, uint8_t *md, size_t size)
44 {
45 size_t i;
46
47 for (i = 0; i < size; i++)
48 fprintf(fp, "%02x", md[i]);
49 }
50
51 static void
52 print_stats(struct stats *st)
53 {
54 unsigned long long hits, misses;
55 double hitratio;
56
57 if (st->nr_blks == 0)
58 return;
59
60 fprintf(stderr, "Original size: %llu bytes\n",
61 (unsigned long long)st->orig_size);
62 fprintf(stderr, "Compressed size: %llu bytes\n",
63 (unsigned long long)st->compr_size);
64 fprintf(stderr, "Deduplicated size: %llu bytes\n",
65 (unsigned long long)st->dedup_size);
66 fprintf(stderr, "Deduplication ratio: %.2f\n",
67 (double)st->orig_size / st->dedup_size);
68 fprintf(stderr, "Min/avg/max block size: %llu/%llu/%llu bytes\n",
69 (unsigned long long)st->min_blk_size,
70 (unsigned long long)st->dedup_size / st->nr_blks,
71 (unsigned long long)st->max_blk_size);
72 fprintf(stderr, "Number of unique blocks: %llu\n",
73 (unsigned long long)st->nr_blks);
74
75 icache_stats(icache, &hits, &misses);
76 if (hits == 0 && misses == 0)
77 hitratio = 0;
78 else
79 hitratio = (double)hits / (hits + misses);
80
81 fprintf(stderr, "Index cache hit percentage: %.2f%%\n",
82 100 * hitratio);
83 }
84
85 static struct snap *
86 alloc_snap(void)
87 {
88 struct snap *snap;
89
90 snap = calloc(1, sizeof(*snap));
91 if (snap == NULL)
92 err(1, "%s", __func__);
93 return snap;
94 }
95
96 static void
97 free_snap(struct snap *snap)
98 {
99 free(snap);
100 }
101
102 /*
103 * The snapshot hash is calculated over the
104 * hash of its block descriptors.
105 */
106 static void
107 hash_snap(struct snap *snap, uint8_t *md)
108 {
109 struct hash_ctx ctx;
110 uint64_t i;
111
112 if (hash_init(&ctx, hash_algo, MD_SIZE) < 0)
113 errx(1, "hash_init failed");
114 for (i = 0; i < snap->nr_blk_descs; i++) {
115 struct blk_desc *blk_desc;
116
117 blk_desc = &snap->blk_desc[i];
118 hash_update(&ctx, blk_desc->md, sizeof(blk_desc->md));
119 }
120 hash_final(&ctx, md, MD_SIZE);
121 }
122
123 static struct snap *
124 grow_snap(struct snap *snap, uint64_t nr_blk_descs)
125 {
126 size_t size;
127
128 if (nr_blk_descs > SIZE_MAX / sizeof(snap->blk_desc[0]))
129 errx(1, "%s: overflow", __func__);
130 size = nr_blk_descs * sizeof(snap->blk_desc[0]);
131
132 if (size > SIZE_MAX - sizeof(*snap))
133 errx(1, "%s: overflow", __func__);
134 size += sizeof(*snap);
135
136 snap = realloc(snap, size);
137 if (snap == NULL)
138 err(1, "%s", __func__);
139 return snap;
140 }
141
142 static void
143 append_snap(struct snap *snap)
144 {
145 if (snap->nr_blk_descs > UINT64_MAX / BLK_DESC_SIZE)
146 errx(1, "%s: overflow", __func__);
147 snap->size = snap->nr_blk_descs * BLK_DESC_SIZE;
148
149 if (snap->size > UINT64_MAX - SNAPSHOT_SIZE)
150 errx(1, "%s: overflow", __func__);
151 snap->size += SNAPSHOT_SIZE;
152
153 xlseek(ifd, snap_hdr.size, SEEK_SET);
154 write_snap(ifd, snap);
155 write_snap_blk_descs(ifd, snap);
156
157 if (snap_hdr.size > UINT64_MAX - snap->size)
158 errx(1, "%s: overflow", __func__);
159 snap_hdr.size += snap->size;
160
161 if (snap_hdr.nr_snaps > UINT64_MAX - 1)
162 errx(1, "%s: overflow", __func__);
163 snap_hdr.nr_snaps++;
164 }
165
166 static uint8_t *
167 alloc_buf(size_t size)
168 {
169 void *p;
170
171 p = calloc(1, size);
172 if (p == NULL)
173 err(1, "%s", __func__);
174 return p;
175 }
176
177 static void
178 free_buf(uint8_t *buf)
179 {
180 free(buf);
181 }
182
183 static void
184 hash_blk(uint8_t *buf, size_t size, uint8_t *md)
185 {
186 struct hash_ctx ctx;
187
188 if (hash_init(&ctx, hash_algo, MD_SIZE) < 0)
189 errx(1, "hash_init failed");
190 hash_update(&ctx, buf, size);
191 hash_final(&ctx, md, MD_SIZE);
192 }
193
194 static void
195 read_blk(uint8_t *buf, struct blk_desc *blk_desc)
196 {
197 ssize_t n;
198
199 xlseek(sfd, blk_desc->offset, SEEK_SET);
200 n = xread(sfd, buf, blk_desc->size);
201 if (n == 0)
202 errx(1, "%s: unexpected EOF", __func__);
203 if (n != blk_desc->size)
204 errx(1, "%s: short read", __func__);
205 }
206
207 static void
208 append_blk(uint8_t *buf, struct blk_desc *blk_desc)
209 {
210 xlseek(sfd, blk_hdr.size, SEEK_SET);
211 xwrite(sfd, buf, blk_desc->size);
212
213 if (blk_hdr.size > UINT64_MAX - blk_desc->size)
214 errx(1, "%s: overflow", __func__);
215 blk_hdr.size += blk_desc->size;
216 }
217
218 static void
219 dedup_chunk(struct snap *snap, uint8_t *chunkp, size_t chunk_size)
220 {
221 uint8_t md[MD_SIZE];
222 struct blk_desc blk_desc;
223 struct compr_ctx ctx;
224 uint8_t *compr_buf;
225 size_t n, csize;
226
227 if (compr_init(&ctx, compr_algo) < 0)
228 errx(1, "compr_init failed");
229 csize = compr_size(&ctx, BLKSIZE_MAX);
230 compr_buf = alloc_buf(csize);
231
232 n = compr(&ctx, chunkp, compr_buf, chunk_size, csize);
233 hash_blk(compr_buf, n, md);
234
235 snap_hdr.st.orig_size += chunk_size;
236 snap_hdr.st.compr_size += n;
237
238 memcpy(blk_desc.md, md, sizeof(blk_desc.md));
239 if (lookup_icache(icache, &blk_desc) < 0) {
240 blk_desc.offset = blk_hdr.size;
241 blk_desc.size = n;
242
243 snap->blk_desc[snap->nr_blk_descs++] = blk_desc;
244 append_blk(compr_buf, &blk_desc);
245
246 insert_icache(icache, &blk_desc);
247
248 snap_hdr.st.dedup_size += blk_desc.size;
249 snap_hdr.st.nr_blks++;
250
251 if (blk_desc.size > snap_hdr.st.max_blk_size)
252 snap_hdr.st.max_blk_size = blk_desc.size;
253 if (blk_desc.size < snap_hdr.st.min_blk_size)
254 snap_hdr.st.min_blk_size = blk_desc.size;
255 } else {
256 snap->blk_desc[snap->nr_blk_descs++] = blk_desc;
257 }
258
259 free(compr_buf);
260 compr_final(&ctx);
261 }
262
263 static void
264 dedup(int fd, char *msg)
265 {
266 struct snap *snap;
267 struct chunker *chunker;
268
269 snap = alloc_snap();
270 chunker = alloc_chunker(fd, BLKSIZE_MIN, BLKSIZE_MAX,
271 HASHMASK_BITS, WINSIZE);
272
273 while (fill_chunker(chunker) > 0) {
274 uint8_t *chunkp;
275 size_t chunk_size;
276
277 chunkp = get_chunk(chunker, &chunk_size);
278 snap = grow_snap(snap, snap->nr_blk_descs + 1);
279 dedup_chunk(snap, chunkp, chunk_size);
280 drain_chunker(chunker);
281 }
282
283 if (snap->nr_blk_descs > 0) {
284 if (msg != NULL) {
285 size_t size;
286
287 size = strlen(msg) + 1;
288 if (size > sizeof(snap->msg))
289 size = sizeof(snap->msg);
290 memcpy(snap->msg, msg, size);
291 snap->msg[size - 1] = '\0';
292 }
293 hash_snap(snap, snap->md);
294 append_snap(snap);
295 }
296
297 free_chunker(chunker);
298 free_snap(snap);
299 }
300
301 static int
302 extract(struct snap *snap, void *arg)
303 {
304 uint8_t *buf[2];
305 struct extract_args *args = arg;
306 struct compr_ctx ctx;
307 uint64_t i;
308
309 if (memcmp(snap->md, args->md, sizeof(snap->md)) != 0)
310 return WALK_CONTINUE;
311
312 if (compr_init(&ctx, compr_algo) < 0)
313 errx(1, "compr_init failed");
314 buf[0] = alloc_buf(BLKSIZE_MAX);
315 buf[1] = alloc_buf(compr_size(&ctx, BLKSIZE_MAX));
316 for (i = 0; i < snap->nr_blk_descs; i++) {
317 struct blk_desc *blk_desc;
318 size_t blksize;
319
320 blk_desc = &snap->blk_desc[i];
321 read_blk(buf[1], blk_desc);
322 blksize = decompr(&ctx, buf[1], buf[0], blk_desc->size, BLKSIZE_MAX);
323 xwrite(args->fd, buf[0], blksize);
324 }
325 free_buf(buf[1]);
326 free_buf(buf[0]);
327 compr_final(&ctx);
328 args->ret = 0;
329 return WALK_STOP;
330 }
331
332 /*
333 * Hash every block referenced by the given snapshot
334 * and compare its hash with the one stored in the corresponding
335 * block descriptor.
336 */
337 static int
338 check_snap(struct snap *snap, void *arg)
339 {
340 struct compr_ctx ctx;
341 uint8_t *buf;
342 int *ret = arg;
343 uint64_t i;
344
345 if (verbose > 0) {
346 fprintf(stderr, "Checking snapshot: ");
347 print_md(stderr, snap->md, sizeof(snap->md));
348 fputc('\n', stderr);
349 }
350
351 if (compr_init(&ctx, compr_algo) < 0)
352 errx(1, "compr_init failed");
353 buf = alloc_buf(compr_size(&ctx, BLKSIZE_MAX));
354 for (i = 0; i < snap->nr_blk_descs; i++) {
355 uint8_t md[MD_SIZE];
356 struct blk_desc *blk_desc;
357
358 blk_desc = &snap->blk_desc[i];
359 read_blk(buf, blk_desc);
360 hash_blk(buf, blk_desc->size, md);
361
362 if (memcmp(blk_desc->md, md, sizeof(blk_desc->md)) == 0)
363 continue;
364
365 fprintf(stderr, "Block hash mismatch\n");
366 fprintf(stderr, " Expected hash: ");
367 print_md(stderr, blk_desc->md, sizeof(blk_desc->md));
368 fputc('\n', stderr);
369 fprintf(stderr, " Actual hash: ");
370 print_md(stderr, md, sizeof(md));
371 fputc('\n', stderr);
372 fprintf(stderr, " Offset: %llu\n",
373 (unsigned long long)blk_desc->offset);
374 fprintf(stderr, " Size: %llu\n",
375 (unsigned long long)blk_desc->size);
376 *ret = -1;
377 }
378 free_buf(buf);
379 compr_final(&ctx);
380 return WALK_CONTINUE;
381 }
382
383 static int
384 build_icache(struct snap *snap, void *arg)
385 {
386 struct compr_ctx ctx;
387 uint8_t *buf;
388 uint64_t i;
389
390 if (compr_init(&ctx, compr_algo) < 0)
391 errx(1, "compr_init failed");
392 buf = alloc_buf(compr_size(&ctx, BLKSIZE_MAX));
393 for (i = 0; i < snap->nr_blk_descs; i++) {
394 struct blk_desc *blk_desc;
395
396 blk_desc = &snap->blk_desc[i];
397 insert_icache(icache, blk_desc);
398 }
399 free(buf);
400 compr_final(&ctx);
401 return WALK_CONTINUE;
402 }
403
404 static int
405 list(struct snap *snap, void *arg)
406 {
407 print_md(stdout, snap->md, sizeof(snap->md));
408 if (snap->msg[0] != '\0')
409 printf("\t%s\n", snap->msg);
410 else
411 putchar('\n');
412 return WALK_CONTINUE;
413 }
414
415 /* Walk through all snapshots and call fn() on each one */
416 static void
417 walk_snap(int (*fn)(struct snap *, void *), void *arg)
418 {
419 uint64_t i;
420
421 xlseek(ifd, SNAP_HDR_SIZE, SEEK_SET);
422 for (i = 0; i < snap_hdr.nr_snaps; i++) {
423 struct snap *snap;
424 int ret;
425
426 snap = alloc_snap();
427 read_snap(ifd, snap);
428 snap = grow_snap(snap, snap->nr_blk_descs);
429 read_snap_descs(ifd, snap);
430
431 ret = (*fn)(snap, arg);
432 free(snap);
433 if (ret == WALK_STOP)
434 break;
435 }
436 }
437
438 static void
439 match_ver(uint64_t v)
440 {
441 uint8_t maj, min;
442
443 min = v & VER_MIN_MASK;
444 maj = (v >> VER_MAJ_SHIFT) & VER_MAJ_MASK;
445 if (maj == VER_MAJ && min == VER_MIN)
446 return;
447 errx(1, "format version mismatch: expected %u.%u but got %u.%u",
448 VER_MAJ, VER_MIN, maj, min);
449 }
450
451 static void
452 init_blk_hdr(void)
453 {
454 blk_hdr.flags = (VER_MAJ << VER_MAJ_SHIFT) | VER_MIN;
455 blk_hdr.flags |= compr_algo << COMPR_ALGO_SHIFT;
456 blk_hdr.flags |= hash_algo << HASH_ALGO_SHIFT;
457 blk_hdr.size = BLK_HDR_SIZE;
458 }
459
460 static void
461 load_blk_hdr(void)
462 {
463 uint64_t v;
464
465 xlseek(sfd, 0, SEEK_SET);
466 read_blk_hdr(sfd, &blk_hdr);
467 match_ver(blk_hdr.flags);
468
469 v = blk_hdr.flags >> COMPR_ALGO_SHIFT;
470 v &= COMPR_ALGO_MASK;
471 compr_algo = v;
472
473 if (compr_algo < 0 || compr_algo >= NR_COMPRS)
474 errx(1, "unsupported compression algorithm: %d", compr_algo);
475
476 if (verbose > 0)
477 fprintf(stderr, "Compression algorithm: %s\n",
478 compr_type2name(compr_algo));
479
480 v = blk_hdr.flags >> HASH_ALGO_SHIFT;
481 v &= HASH_ALGO_MASK;
482 hash_algo = v;
483
484 if (hash_algo < 0 || hash_algo >= NR_HASHES)
485 errx(1, "unsupported hash algorithm: %d", hash_algo);
486
487 if (verbose > 0)
488 fprintf(stderr, "Hash algorithm: %s\n",
489 hash_type2name(hash_algo));
490 }
491
492 static void
493 save_blk_hdr(void)
494 {
495 xlseek(sfd, 0, SEEK_SET);
496 write_blk_hdr(sfd, &blk_hdr);
497 }
498
499 static void
500 init_snap_hdr(void)
501 {
502 struct compr_ctx ctx;
503
504 if (compr_init(&ctx, compr_algo) < 0)
505 errx(1, "compr_init failed");
506 snap_hdr.flags = (VER_MAJ << VER_MAJ_SHIFT) | VER_MIN;
507 snap_hdr.size = SNAP_HDR_SIZE;
508 snap_hdr.st.min_blk_size = compr_size(&ctx, BLKSIZE_MAX);
509 compr_final(&ctx);
510 }
511
512 static void
513 load_snap_hdr(void)
514 {
515 xlseek(ifd, 0, SEEK_SET);
516 read_snap_hdr(ifd, &snap_hdr);
517 match_ver(snap_hdr.flags);
518 }
519
520 static void
521 save_snap_hdr(void)
522 {
523 xlseek(ifd, 0, SEEK_SET);
524 write_snap_hdr(ifd, &snap_hdr);
525 }
526
527 static void
528 init(int iflag)
529 {
530 int flags;
531
532 flags = O_RDWR;
533 if (iflag)
534 flags |= O_CREAT | O_EXCL;
535
536 ifd = open(SNAPSF, flags, 0600);
537 if (ifd < 0)
538 err(1, "open %s", SNAPSF);
539
540 sfd = open(STOREF, flags, 0600);
541 if (sfd < 0)
542 err(1, "open %s", STOREF);
543
544 if (flock(ifd, LOCK_NB | LOCK_EX) < 0 ||
545 flock(sfd, LOCK_NB | LOCK_EX) < 0)
546 err(1, "flock");
547
548 if (iflag) {
549 init_snap_hdr();
550 init_blk_hdr();
551 } else {
552 load_snap_hdr();
553 load_blk_hdr();
554 }
555
556 icache = alloc_icache();
557 walk_snap(build_icache, NULL);
558 }
559
560 static void
561 term(void)
562 {
563 if (verbose > 0)
564 print_stats(&snap_hdr.st);
565
566 free_icache(icache);
567
568 save_blk_hdr();
569 save_snap_hdr();
570
571 fsync(sfd);
572 fsync(ifd);
573
574 close(sfd);
575 close(ifd);
576 }
577
578 static void
579 usage(void)
580 {
581 fprintf(stderr, "usage: %s [-cilv] [-H hash] [-Z compressor] [-e id] [-r root] [-m message] [file]\n", argv0);
582 exit(1);
583 }
584
585 int
586 main(int argc, char *argv[])
587 {
588 uint8_t md[MD_SIZE];
589 char *id = NULL, *root = NULL, *msg = NULL, *hash_name = NULL;
590 char *compr_name;
591 int iflag = 0, lflag = 0, cflag = 0;
592 int fd = -1;
593
594 ARGBEGIN {
595 case 'H':
596 hash_name = EARGF(usage());
597 if (strcmp(hash_name, "?") == 0) {
598 hash_list(STDERR_FILENO);
599 return 0;
600 }
601 hash_algo = hash_name2type(hash_name);
602 if (hash_algo < 0)
603 errx(1, "unknown hash: %s", hash_name);
604 break;
605 case 'Z':
606 compr_name = EARGF(usage());
607 if (strcmp(compr_name, "?") == 0) {
608 compr_list(STDERR_FILENO);
609 return 0;
610 }
611 compr_algo = compr_name2type(compr_name);
612 if (compr_algo < 0)
613 errx(1, "unknown compressor: %s", compr_name);
614 break;
615 case 'c':
616 cflag = 1;
617 break;
618 case 'e':
619 id = EARGF(usage());
620 break;
621 case 'i':
622 iflag = 1;
623 break;
624 case 'l':
625 lflag = 1;
626 break;
627 case 'r':
628 root = EARGF(usage());
629 break;
630 case 'm':
631 msg = EARGF(usage());
632 break;
633 case 'v':
634 verbose++;
635 break;
636 default:
637 usage();
638 } ARGEND
639
640 if (argc > 1) {
641 usage();
642 } else if (argc == 1) {
643 if (id) {
644 fd = open(argv[0], O_RDWR | O_CREAT, 0600);
645 if (fd < 0)
646 err(1, "open %s", argv[0]);
647 } else {
648 fd = open(argv[0], O_RDONLY);
649 if (fd < 0)
650 err(1, "open %s", argv[0]);
651 }
652 } else {
653 if (id)
654 fd = STDOUT_FILENO;
655 else
656 fd = STDIN_FILENO;
657 }
658
659 if (root != NULL) {
660 mkdir(root, 0700);
661 if (chdir(root) < 0)
662 err(1, "chdir: %s", root);
663 }
664
665 init(iflag);
666
667 if (iflag) {
668 term();
669 return 0;
670 }
671
672 if (cflag) {
673 int ret;
674
675 ret = 0;
676 walk_snap(check_snap, &ret);
677 if (ret != 0)
678 errx(1, "%s or %s is corrupted", SNAPSF, STOREF);
679
680 term();
681 return 0;
682 }
683
684 if (lflag) {
685 walk_snap(list, NULL);
686 term();
687 return 0;
688 }
689
690 if (id) {
691 struct extract_args args;
692
693 str2bin(id, md);
694 args.md = md;
695 args.fd = fd;
696 args.ret = -1;
697 walk_snap(extract, &args);
698 if (args.ret != 0)
699 errx(1, "unknown snapshot: %s", id);
700 } else {
701 dedup(fd, msg);
702 }
703
704 term();
705 return 0;
706 }