When matching a pattern check if bottom bits of hash are 0 - dedup - deduplicating backup program
HTML git clone git://bitreich.org/dedup/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/dedup/
DIR Log
DIR Files
DIR Refs
DIR Tags
DIR README
DIR LICENSE
---
DIR commit d60ace395a74a5efe067ee9cd5d85446c7facf43
DIR parent d8bfc3a69ce4c3c35dfa8c0d5cef3ce10e424300
HTML Author: sin <sin@2f30.org>
Date: Tue, 26 Feb 2019 09:48:57 +0000

    When matching a pattern check if bottom bits of hash are 0

    This approach is more efficient and easier to understand.

Diffstat:
  M chunker.c | 10 +---------
  M config.h | 1 +
  2 files changed, 2 insertions(+), 9 deletions(-)
---
DIR diff --git a/chunker.c b/chunker.c
@@ -14,7 +14,6 @@ struct chunker {
 	size_t cap;
 	size_t rpos;
 	size_t wpos;
-	size_t discr;
 	int fd;
 };
 
@@ -88,7 +87,7 @@ match_pattern(struct chunker *chunker, size_t chunk_size, uint32_t fp)
 		return 1;
 	if (chunk_size < BLKSIZE_MIN)
 		return 0;
-	return (fp % chunker->discr) == chunker->discr - 1;
+	return (fp & HASHMASK_BITS) == 0;
 }
 
 static size_t
@@ -123,12 +122,6 @@ get_chunk_size(struct chunker *chunker)
 	return chunk_size;
 }
 
-static size_t
-calc_discr(size_t avg)
-{
-	return avg / (-1.42888852e-7 * avg + 1.33237515);
-}
-
 struct chunker *
 alloc_chunker(size_t cap, int fd)
 {
@@ -145,7 +138,6 @@ alloc_chunker(size_t cap, int fd)
 	chunker->rpos = 0;
 	chunker->wpos = 0;
 	chunker->fd = fd;
-	chunker->discr = calc_discr(BLKSIZE_AVG);
 
 	return chunker;
 }
DIR diff --git a/config.h b/config.h
@@ -1,4 +1,5 @@
 #define BLKSIZE_AVG ((size_t)524288)
 #define BLKSIZE_MIN ((BLKSIZE_AVG) / 4)
 #define BLKSIZE_MAX ((BLKSIZE_AVG) * 4)
+#define HASHMASK_BITS (BLKSIZE_AVG - 1)
 #define WINSIZE 32