From d9967d8c4d532bb41c07358acb650f25132bf067 Mon Sep 17 00:00:00 2001 From: Avery Pennarun Date: Mon, 1 Feb 2010 20:40:30 -0500 Subject: [PATCH] cmd-margin: a command to find out the max bits of overlap between hashes. Run 'bup margin' to go through the list of all the objects in your bup directory and count the number of overlapping prefix bits between each two consecutive objects. That is, fine the longest hash length (in bits) that *would* have caused an overlap, if sha1 hashes had been that length. On my system with 111 gigs of packs, I get 44 bits. Out of a total of 160. That means I'm still safe from collisions for about 2^116 times over. Or is it only the square root of that? Anyway, it's such a large number that my brain explodes just thinking about it. Mark my words: 2^160 ought to be enough for anyone. --- Makefile | 2 +- _hashsplit.c | 28 ++++++++++++++++++++++++++++ cmd-margin.py | 29 +++++++++++++++++++++++++++++ cmd-midx.py | 33 ++++----------------------------- git.py | 29 +++++++++++++++++++++++++++++ helpers.py | 7 +++++++ 6 files changed, 98 insertions(+), 30 deletions(-) create mode 100755 cmd-margin.py diff --git a/Makefile b/Makefile index ccebfbe..8b3be55 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ endif default: all all: bup-split bup-join bup-save bup-init bup-server bup-index bup-tick \ - bup-midx bup-fuse bup-ls bup-damage bup-fsck \ + bup-midx bup-fuse bup-ls bup-damage bup-fsck bup-margin \ bup memtest randomgen$(EXT) _hashsplit$(SOEXT) randomgen$(EXT): randomgen.o diff --git a/_hashsplit.c b/_hashsplit.c index a213d00..933abe6 100644 --- a/_hashsplit.c +++ b/_hashsplit.c @@ -48,9 +48,37 @@ static PyObject *splitbuf(PyObject *self, PyObject *args) } +static PyObject *bitmatch(PyObject *self, PyObject *args) +{ + unsigned char *buf1 = NULL, *buf2 = NULL; + int len1 = 0, len2 = 0; + int byte, bit; + + if (!PyArg_ParseTuple(args, "t#t#", &buf1, &len1, &buf2, &len2)) + return NULL; + + bit = 0; + for (byte = 0; byte < len1 && byte < len2; byte++) + { + int b1 = buf1[byte], b2 = buf2[byte]; + if (b1 != b2) + { + for (bit = 0; bit < 8; bit++) + if ( (b1 & (0x80 >> bit)) != (b2 & (0x80 >> bit)) ) + break; + break; + } + } + + return Py_BuildValue("i", byte*8 + bit); +} + + static PyMethodDef hashsplit_methods[] = { { "splitbuf", splitbuf, METH_VARARGS, "Split a list of strings based on a rolling checksum." }, + { "bitmatch", bitmatch, METH_VARARGS, + "Count the number of matching prefix bits between two strings." }, { NULL, NULL, 0, NULL }, // sentinel }; diff --git a/cmd-margin.py b/cmd-margin.py new file mode 100755 index 0000000..010a840 --- /dev/null +++ b/cmd-margin.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +import sys +import options, git, _hashsplit +from helpers import * + + +optspec = """ +bup margin +""" +o = options.Options('bup margin', optspec) +(opt, flags, extra) = o.parse(sys.argv[1:]) + +if extra: + log("bup margin: no arguments expected\n") + o.usage() + +git.check_repo_or_die() +#git.ignore_midx = 1 + +mi = git.MultiPackIndex(git.repo('objects/pack')) +last = '\0'*20 +longmatch = 0 +for i in mi: + if i == last: + continue + pm = _hashsplit.bitmatch(last, i) + longmatch = max(longmatch, pm) + last = i +print longmatch diff --git a/cmd-midx.py b/cmd-midx.py index 68ff7db..97d01de 100755 --- a/cmd-midx.py +++ b/cmd-midx.py @@ -7,38 +7,13 @@ PAGE_SIZE=4096 SHA_PER_PAGE=PAGE_SIZE/200. -def next(it): - try: - return it.next() - except StopIteration: - return None - - -def merge(idxlist, total, bits, table): - iters = [iter(i) for i in idxlist] - iters = [[next(it), it] for it in iters] +def merge(idxlist, bits, table): count = 0 - iters.sort() - while iters: - if (count % 10000) == 0: - log('\rMerging: %.2f%% (%d/%d)' - % (count*100.0/total, count, total)) - e = iters[0][0] - yield e + for e in git.idxmerge(idxlist): count += 1 prefix = git.extract_bits(e, bits) table[prefix] = count - e = iters[0][0] = next(iters[0][1]) - if not e: - iters = iters[1:] - else: - i = 1 - while i < len(iters): - if iters[i][0] > e: - break - i += 1 - iters = iters[1:i] + [iters[0]] + iters[i:] - log('\rMerging: done. \n') + yield e def do_midx(outdir, outfilename, infilenames): @@ -76,7 +51,7 @@ def do_midx(outdir, outfilename, infilenames): assert(f.tell() == 12) f.write('\0'*8*entries) - for e in merge(inp, total, bits, table): + for e in merge(inp, bits, table): f.write(e) f.write('\0'.join([os.path.basename(p) for p in infilenames])) diff --git a/git.py b/git.py index 0f05248..d7d7466 100644 --- a/git.py +++ b/git.py @@ -207,6 +207,9 @@ class MultiPackIndex: _mpi_count -= 1 assert(_mpi_count == 0) + def __iter__(self): + return iter(idxmerge(self.packs)) + def exists(self, hash): if hash in self.also: return True @@ -267,6 +270,32 @@ def _shalist_sort_key(ent): return name +def idxmerge(idxlist): + total = sum([len(i) for i in idxlist]) + iters = [iter(i) for i in idxlist] + iters = [[next(it), it] for it in iters] + count = 0 + iters.sort() + while iters: + if (count % 10000) == 0: + log('Merging: %.2f%% (%d/%d)\r' + % (count*100.0/total, count, total)) + e = iters[0][0] + yield e + count += 1 + e = iters[0][0] = next(iters[0][1]) + if not e: + iters = iters[1:] + else: + i = 1 + while i < len(iters): + if iters[i][0] > e: + break + i += 1 + iters = iters[1:i] + [iters[0]] + iters[i:] + log('Merging: %.2f%% (%d/%d), done.\n' % (100, total, total)) + + class PackWriter: def __init__(self, objcache_maker=None): self.count = 0 diff --git a/helpers.py b/helpers.py index e6f0772..c7dfd5f 100644 --- a/helpers.py +++ b/helpers.py @@ -15,6 +15,13 @@ def mkdirp(d): raise +def next(it): + try: + return it.next() + except StopIteration: + return None + + def readpipe(argv): p = subprocess.Popen(argv, stdout=subprocess.PIPE) r = p.stdout.read() -- 2.39.5