From: Avery Pennarun
Date: Mon, 25 Jan 2010 06:24:16 +0000 (-0500)
Subject: MultiPackIndex: use .midx files if they exist.
X-Git-Url: https://git.michaelhowe.org/gitweb/?a=commitdiff_plain;h=d2258c284d9f34ee7a0e5ef101d778386afb6c62;p=packages%2Fb%2Fbup.git

MultiPackIndex: use .midx files if they exist.

Wow, using a single .midx file that merges my 435 megs of packfile
indexes (across 169 files) reduces memory churn in memtest.py by at
least two orders of magnitude.  (i.e. we need to map 100x fewer memory
pages in order to search for each nonexistent object when creating a
new backup.)

memtest.py runs *visibly* faster.

We can also remove the PackBitmap code now, since it's not nearly as
good as the PackMidx stuff and is just an unnecessary layer of
indirection.
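For anyone wondering why this helps so much: a .midx is essentially all
the per-pack .idx hash lists merged into one sorted table with a fanout
index, so a miss costs one bounded binary search instead of one search
per pack.  A rough sketch of the idea (Python 3 for readability; bup is
Python 2 here, and the real file is a packed, mmap'd table rather than
in-memory lists, so treat this as illustration only):

    import bisect

    def build_midx(idx_lists):
        # Merge already-sorted lists of 20-byte hashes into one sorted
        # list plus a 256-entry fanout table; table[b] counts hashes
        # whose first byte is <= b, so the candidates for a hash
        # starting with byte b live in merged[table[b-1]:table[b]].
        merged = sorted(h for lst in idx_lists for h in lst)
        table = [0] * 256
        for h in merged:
            table[h[0]] += 1
        for b in range(1, 256):
            table[b] += table[b-1]
        return table, merged

    def exists(table, merged, h):
        # One bounded binary search touches O(log N) pages per lookup,
        # instead of O(packs * log n) when probing every .idx in turn.
        lo = table[h[0]-1] if h[0] else 0
        hi = table[h[0]]
        i = bisect.bisect_left(merged, h, lo, hi)
        return i < hi and merged[i] == h

(The on-disk format generalizes this fanout to a wider bit prefix;
cmd-midx.py's `table`, written out with struct.pack below, plays the
same role.)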
---

diff --git a/cmd-midx.py b/cmd-midx.py
index d3e87a7..63000e0 100755
--- a/cmd-midx.py
+++ b/cmd-midx.py
@@ -52,8 +52,8 @@ def merge(idxlist):
     iters = [[next(it), it] for it in iters]
     count = 0
     while iters:
-        if (count % (total/100)) == 0:
-            log('\rMerging: %d%%' % (count*100/total))
+        if (count % 10000) == 0:
+            log('\rMerging: %d/%d' % (count, total))
         e = min(iters) # FIXME: very slow for long lists
         assert(e[0])
         yield e[0]
@@ -62,7 +62,7 @@ def merge(idxlist):
         table[prefix] = count
         e[0] = next(e[1])
         iters = filter(lambda x: x[0], iters)
-    log('\rMerging: done.\n')
+    log('\rMerging: done. \n')
 
 f = open(opt.output, 'w+')
 f.write('MIDX\0\0\0\1')
@@ -80,7 +80,7 @@
 f.write(struct.pack('!%dQ' % entries, *table))
 f.close()
 # this is just for testing
-if 1:
+if 0:
     p = git.PackMidx(opt.output)
     assert(len(p.idxnames) == len(extra))
     print p.idxnames
diff --git a/git.py b/git.py
index efe70a9..9dd9662 100644
--- a/git.py
+++ b/git.py
@@ -79,49 +79,6 @@ def _decode_packobj(buf):
     return (type, zlib.decompress(buf[i+1:]))
 
 
-MAP_BITS = 20
-
-
-class PackBitmap:
-    def __init__(self, idxname):
-        self.idxname = idxname
-        assert(idxname.endswith('.idx'))
-        self.mapname = idxname[:-4] + '.map'
-        if not os.path.exists(self.mapname):
-            self.gen_map()
-        self.num = 1 << (MAP_BITS-3)
-        self.map = mmap_read(open(self.mapname), self.num)
-
-    def gen_map(self):
-        (dir,fn) = os.path.split(self.idxname)
-        log('Generating map for %s...\n' % fn)
-        count = 0
-        a = ['\0']*((1 << MAP_BITS)/8)
-        for bin in PackIndex(self.idxname):
-            idx = self.bin_to_idx(bin)
-            byte = idx / 8
-            bit = idx % 8
-            a[byte] = chr(ord(a[byte]) | (1 << (7-bit)))
-            count += 1
-        open(self.mapname, 'w+').write(''.join(a))
-
-    def bin_to_idx(self, bin):
-        v = 0
-        for i in range(MAP_BITS/8):
-            v = (v << 8) | ord(bin[i])
-        rest = MAP_BITS - MAP_BITS/8*8
-        x = ord(bin[MAP_BITS/8]) >> (8-rest)
-        v = (v << rest) | x
-        return v
-
-    def might_exist(self, bin):
-        idx = self.bin_to_idx(bin)
-        byte = idx / 8
-        bit = idx % 8
-        v = ord(self.map[byte])
-        return (v >> (7-bit)) & 1
-
-
 class PackIndex:
     def __init__(self, filename):
         self.name = filename
@@ -242,7 +199,6 @@ class MultiPackIndex:
         self.dir = dir
         self.also = {}
         self.packs = []
-        self.maps = []
         self.refresh()
 
     def __del__(self):
@@ -254,25 +210,26 @@ class MultiPackIndex:
         if hash in self.also:
             return True
         for i in range(len(self.packs)):
-            m = self.maps[i]
-            if not m.might_exist(hash):
-                # FIXME: this test should perhaps be inside PackIndex?
-                continue
             p = self.packs[i]
             if p.exists(hash):
                 # reorder so most recently used packs are searched first
                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
-                self.maps = [m] + self.maps[:i] + self.maps[i+1:]
                 return p.name
         return None
 
     def refresh(self):
         d = dict([(p.name, 1) for p in self.packs])
         if os.path.exists(self.dir):
+            for f in os.listdir(self.dir):
+                full = os.path.join(self.dir, f)
+                if f.endswith('.midx') and not d.get(full):
+                    ix = PackMidx(full)
+                    self.packs.append(ix)
+                    for name in ix.idxnames:
+                        d[os.path.join(self.dir, name)] = 1
             for f in os.listdir(self.dir):
                 full = os.path.join(self.dir, f)
                 if f.endswith('.idx') and not d.get(full):
-                    self.maps.append(PackBitmap(full))
                     self.packs.append(PackIndex(full))
         #log('MultiPackIndex: using %d packs.\n' % len(self.packs))
 
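For completeness, a quick self-check of the sketch from the message
above, using made-up hashes rather than real pack contents:

    h1 = b'\x00' * 20
    h2 = b'\x03' + b'\x19' * 19
    h3 = b'\xff' * 20
    table, merged = build_midx([[h1, h3], [h2]])
    assert exists(table, merged, h2)
    assert not exists(table, merged, b'\x42' * 20)  # miss: one bounded search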