]> git.michaelhowe.org Git - packages/b/bup.git/commitdiff
MultiPackIndex: use .midx files if they exist.
authorAvery Pennarun <apenwarr@gmail.com>
Mon, 25 Jan 2010 06:24:16 +0000 (01:24 -0500)
committerAvery Pennarun <apenwarr@gmail.com>
Mon, 25 Jan 2010 06:29:55 +0000 (01:29 -0500)
Wow, using a single .midx file that merges my 435 megs of packfile indexes
(across 169 files) reduces memory churn in memtest.py by at least two orders
of magnitude.  (i.e., we need to map 100x fewer memory pages in order to
search for each nonexistent object when creating a new backup.)  memtest.py
runs *visibly* faster.

We can also remove the PackBitmap code now, since it's not nearly as good as
the PackMidx stuff and is now an unnecessary layer of indirection.

cmd-midx.py
git.py

index d3e87a741a4a5a31a2e7604c2cfc90f417e1e815..63000e050620f9ecf09beb1caeea50819cb45f95 100755 (executable)
@@ -52,8 +52,8 @@ def merge(idxlist):
     iters = [[next(it), it] for it in iters]
     count = 0
     while iters:
-        if (count % (total/100)) == 0:
-            log('\rMerging: %d%%' % (count*100/total))
+        if (count % 10000) == 0:
+            log('\rMerging: %d/%d' % (count, total))
         e = min(iters)  # FIXME: very slow for long lists
         assert(e[0])
         yield e[0]
@@ -62,7 +62,7 @@ def merge(idxlist):
         table[prefix] = count
         e[0] = next(e[1])
         iters = filter(lambda x: x[0], iters)
-    log('\rMerging: done.\n')
+    log('\rMerging: done.                                    \n')
 
 f = open(opt.output, 'w+')
 f.write('MIDX\0\0\0\1')
@@ -80,7 +80,7 @@ f.write(struct.pack('!%dQ' % entries, *table))
 f.close()
 
 # this is just for testing
-if 1:
+if 0:
     p = git.PackMidx(opt.output)
     assert(len(p.idxnames) == len(extra))
     print p.idxnames
diff --git a/git.py b/git.py
index efe70a9204bb5ffbd8c8cc370c4c3d46e9b7423e..9dd9662efe4c6a382de0b3ccdb2326494bffbbdb 100644 (file)
--- a/git.py
+++ b/git.py
@@ -79,49 +79,6 @@ def _decode_packobj(buf):
     return (type, zlib.decompress(buf[i+1:]))
 
 
-MAP_BITS = 20
-
-
-class PackBitmap:
-    def __init__(self, idxname):
-        self.idxname = idxname
-        assert(idxname.endswith('.idx'))
-        self.mapname = idxname[:-4] + '.map'
-        if not os.path.exists(self.mapname):
-            self.gen_map()
-        self.num = 1 << (MAP_BITS-3)
-        self.map = mmap_read(open(self.mapname), self.num)
-
-    def gen_map(self):
-        (dir,fn) = os.path.split(self.idxname)
-        log('Generating map for %s...\n' % fn)
-        count = 0
-        a = ['\0']*((1 << MAP_BITS)/8)
-        for bin in PackIndex(self.idxname):
-            idx = self.bin_to_idx(bin)
-            byte = idx / 8
-            bit = idx % 8
-            a[byte] = chr(ord(a[byte]) | (1 << (7-bit)))
-            count += 1
-        open(self.mapname, 'w+').write(''.join(a))
-
-    def bin_to_idx(self, bin):
-        v = 0
-        for i in range(MAP_BITS/8):
-            v = (v << 8) | ord(bin[i])
-        rest = MAP_BITS - MAP_BITS/8*8
-        x = ord(bin[MAP_BITS/8]) >> (8-rest)
-        v = (v << rest) | x
-        return v
-
-    def might_exist(self, bin):
-        idx = self.bin_to_idx(bin)
-        byte = idx / 8
-        bit = idx % 8
-        v = ord(self.map[byte])
-        return (v >> (7-bit)) & 1
-
-
 class PackIndex:
     def __init__(self, filename):
         self.name = filename
@@ -242,7 +199,6 @@ class MultiPackIndex:
         self.dir = dir
         self.also = {}
         self.packs = []
-        self.maps = []
         self.refresh()
 
     def __del__(self):
@@ -254,25 +210,26 @@ class MultiPackIndex:
         if hash in self.also:
             return True
         for i in range(len(self.packs)):
-            m = self.maps[i]
-            if not m.might_exist(hash):
-                # FIXME: this test should perhaps be inside PackIndex?
-                continue
             p = self.packs[i]
             if p.exists(hash):
                 # reorder so most recently used packs are searched first
                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
-                self.maps =  [m] + self.maps[:i]  + self.maps[i+1:]
                 return p.name
         return None
 
     def refresh(self):
         d = dict([(p.name, 1) for p in self.packs])
         if os.path.exists(self.dir):
+            for f in os.listdir(self.dir):
+                full = os.path.join(self.dir, f)
+                if f.endswith('.midx') and not d.get(full):
+                    ix = PackMidx(full)
+                    self.packs.append(ix)
+                    for name in ix.idxnames:
+                        d[os.path.join(self.dir, name)] = 1
             for f in os.listdir(self.dir):
                 full = os.path.join(self.dir, f)
                 if f.endswith('.idx') and not d.get(full):
-                    self.maps.append(PackBitmap(full))
                     self.packs.append(PackIndex(full))
         #log('MultiPackIndex: using %d packs.\n' % len(self.packs))