From: Avery Pennarun
Date: Mon, 25 Jan 2010 06:24:16 +0000 (-0500)
Subject: MultiPackIndex: use .midx files if they exist.
X-Git-Url: https://git.michaelhowe.org/gitweb/?a=commitdiff_plain;h=d2258c284d9f34ee7a0e5ef101d778386afb6c62;p=packages%2Fb%2Fbup.git

MultiPackIndex: use .midx files if they exist.

Wow, using a single .midx file that merges my 435 megs of packfile
indexes (across 169 files) reduces memory churn in memtest.py by at
least two orders of magnitude.  (i.e. we need to map 100x fewer memory
pages in order to search for each nonexistent object when creating a
new backup.)

memtest.py runs *visibly* faster.

We can also remove the PackBitmap code now, since it's not nearly as
good as the PackMidx stuff and is just an unnecessary layer of
indirection.
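For anyone wondering why this helps so much: a .midx is essentially all
the per-pack .idx hash lists merged into one sorted table with a fanout
index, so a miss costs one bounded binary search instead of one search
per pack.  A rough sketch of the idea (Python 3 for readability; bup is
Python 2 here, and the real file is a packed, mmap'd table rather than
in-memory lists, so treat this as illustration only):

    import bisect

    def build_midx(idx_lists):
        # Merge already-sorted lists of 20-byte hashes into one sorted
        # list plus a 256-entry fanout table; table[b] counts hashes
        # whose first byte is <= b, so the candidates for a hash
        # starting with byte b live in merged[table[b-1]:table[b]].
        merged = sorted(h for lst in idx_lists for h in lst)
        table = [0] * 256
        for h in merged:
            table[h[0]] += 1
        for b in range(1, 256):
            table[b] += table[b-1]
        return table, merged

    def exists(table, merged, h):
        # One bounded binary search touches O(log N) pages per lookup,
        # instead of O(packs * log n) when probing every .idx in turn.
        lo = table[h[0]-1] if h[0] else 0
        hi = table[h[0]]
        i = bisect.bisect_left(merged, h, lo, hi)
        return i < hi and merged[i] == h

(The on-disk format generalizes this fanout to a wider bit prefix;
cmd-midx.py's `table`, written out with struct.pack below, plays the
same role.)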
---

diff --git a/cmd-midx.py b/cmd-midx.py
index d3e87a7..63000e0 100755
--- a/cmd-midx.py
+++ b/cmd-midx.py
@@ -52,8 +52,8 @@ def merge(idxlist):
     iters = [[next(it), it] for it in iters]
     count = 0
     while iters:
-        if (count % (total/100)) == 0:
-            log('\rMerging: %d%%' % (count*100/total))
+        if (count % 10000) == 0:
+            log('\rMerging: %d/%d' % (count, total))
         e = min(iters) # FIXME: very slow for long lists
         assert(e[0])
         yield e[0]
@@ -62,7 +62,7 @@ def merge(idxlist):
         table[prefix] = count
         e[0] = next(e[1])
         iters = filter(lambda x: x[0], iters)
-    log('\rMerging: done.\n')
+    log('\rMerging: done. \n')
 
 f = open(opt.output, 'w+')
 f.write('MIDX\0\0\0\1')
@@ -80,7 +80,7 @@
 f.write(struct.pack('!%dQ' % entries, *table))
 f.close()
 # this is just for testing
-if 1:
+if 0:
     p = git.PackMidx(opt.output)
     assert(len(p.idxnames) == len(extra))
     print p.idxnames
diff --git a/git.py b/git.py
index efe70a9..9dd9662 100644
--- a/git.py
+++ b/git.py
@@ -79,49 +79,6 @@ def _decode_packobj(buf):
     return (type, zlib.decompress(buf[i+1:]))
 
 
-MAP_BITS = 20
-
-
-class PackBitmap:
-    def __init__(self, idxname):
-        self.idxname = idxname
-        assert(idxname.endswith('.idx'))
-        self.mapname = idxname[:-4] + '.map'
-        if not os.path.exists(self.mapname):
-            self.gen_map()
-        self.num = 1 << (MAP_BITS-3)
-        self.map = mmap_read(open(self.mapname), self.num)
-
-    def gen_map(self):
-        (dir,fn) = os.path.split(self.idxname)
-        log('Generating map for %s...\n' % fn)
-        count = 0
-        a = ['\0']*((1 << MAP_BITS)/8)
-        for bin in PackIndex(self.idxname):
-            idx = self.bin_to_idx(bin)
-            byte = idx / 8
-            bit = idx % 8
-            a[byte] = chr(ord(a[byte]) | (1 << (7-bit)))
-            count += 1
-        open(self.mapname, 'w+').write(''.join(a))
-
-    def bin_to_idx(self, bin):
-        v = 0
-        for i in range(MAP_BITS/8):
-            v = (v << 8) | ord(bin[i])
-        rest = MAP_BITS - MAP_BITS/8*8
-        x = ord(bin[MAP_BITS/8]) >> (8-rest)
-        v = (v << rest) | x
-        return v
-
-    def might_exist(self, bin):
-        idx = self.bin_to_idx(bin)
-        byte = idx / 8
-        bit = idx % 8
-        v = ord(self.map[byte])
-        return (v >> (7-bit)) & 1
-
-
 class PackIndex:
     def __init__(self, filename):
         self.name = filename
@@ -242,7 +199,6 @@ class MultiPackIndex:
         self.dir = dir
         self.also = {}
         self.packs = []
-        self.maps = []
         self.refresh()
 
     def __del__(self):
@@ -254,25 +210,26 @@ class MultiPackIndex:
         if hash in self.also:
             return True
         for i in range(len(self.packs)):
-            m = self.maps[i]
-            if not m.might_exist(hash):
-                # FIXME: this test should perhaps be inside PackIndex?
-                continue
             p = self.packs[i]
             if p.exists(hash):
                 # reorder so most recently used packs are searched first
                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
-                self.maps = [m] + self.maps[:i] + self.maps[i+1:]
                 return p.name
         return None
 
     def refresh(self):
         d = dict([(p.name, 1) for p in self.packs])
         if os.path.exists(self.dir):
+            for f in os.listdir(self.dir):
+                full = os.path.join(self.dir, f)
+                if f.endswith('.midx') and not d.get(full):
+                    ix = PackMidx(full)
+                    self.packs.append(ix)
+                    for name in ix.idxnames:
+                        d[os.path.join(self.dir, name)] = 1
             for f in os.listdir(self.dir):
                 full = os.path.join(self.dir, f)
                 if f.endswith('.idx') and not d.get(full):
-                    self.maps.append(PackBitmap(full))
                     self.packs.append(PackIndex(full))
         #log('MultiPackIndex: using %d packs.\n' % len(self.packs))
 
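For completeness, a quick self-check of the sketch from the message
above, using made-up hashes rather than real pack contents:

    h1 = b'\x00' * 20
    h2 = b'\x03' + b'\x19' * 19
    h3 = b'\xff' * 20
    table, merged = build_midx([[h1, h3], [h2]])
    assert exists(table, merged, h2)
    assert not exists(table, merged, b'\x42' * 20)  # miss: one bounded search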