From: Avery Pennarun
Date: Tue, 8 Feb 2011 01:41:00 +0000 (-0800)
Subject: cmd/bloom: map only one .idx file at a time.
X-Git-Url: https://git.michaelhowe.org/gitweb/?a=commitdiff_plain;h=8cd1f6daf982365858702a0fb5f2c74bcba31d2c;p=packages%2Fb%2Fbup.git

cmd/bloom: map only one .idx file at a time.

This massively decreases virtual memory allocation since we only ever need
to look at a single idx at once. In theory, VM doesn't cost us anything,
but on 32-bit systems we can actually run out of address space if we try to
map all the idx files at once on a very large repo.

Signed-off-by: Avery Pennarun
---

diff --git a/cmd/bloom-cmd.py b/cmd/bloom-cmd.py
index 92cdf8b..8ee7586 100755
--- a/cmd/bloom-cmd.py
+++ b/cmd/bloom-cmd.py
@@ -30,11 +30,11 @@ def do_bloom(path, outfilename):
     for name in glob.glob('%s/*.idx' % path):
         ix = git.open_idx(name)
         ixbase = os.path.basename(name)
-        if b is not None and ixbase in b.idxnames:
-            rest.append(ix)
+        if b and (ixbase in b.idxnames):
+            rest.append(name)
             rest_count += len(ix)
         else:
-            add.append(ix)
+            add.append(name)
             add_count += len(ix)
     total = add_count + rest_count
 
@@ -68,7 +68,8 @@ def do_bloom(path, outfilename):
         b = git.ShaBloom.create(
                 tfname, f=tf, readwrite=True, expected=add_count, k=opt.k)
     count = 0
-    for ix in add:
+    for name in add:
+        ix = git.open_idx(name)
         progress('Writing bloom: %d/%d\r' % (count, len(add)))
         b.add_idx(ix)
         count += 1