From: Avery Pennarun Date: Sat, 2 Jan 2010 09:16:25 +0000 (-0500) Subject: Write git pack files instead of loose object files. X-Git-Url: https://git.michaelhowe.org/gitweb/?a=commitdiff_plain;h=5965d416da3fd138996f22cf234dece23c7bd19c;p=packages%2Fb%2Fbup.git Write git pack files instead of loose object files. This causes much, much less disk grinding than creating zillions of files, plus it's even more disk space efficient. We could theoretically make it go even faster by generating the .idx file ourselves, but for now, we just call "git index-pack" to do it. That helpfully also confirms that the data was written in a git-compatible way. --- diff --git a/cmd-save.py b/cmd-save.py index 458dc1a..e63054b 100755 --- a/cmd-save.py +++ b/cmd-save.py @@ -124,5 +124,7 @@ if opt.commit or opt.name: if opt.commit: print commit +git.flush_pack() + if saved_errors: log('WARNING: %d errors encountered while saving.\n' % len(saved_errors)) diff --git a/cmd-split.py b/cmd-split.py index 18f2e05..157b36c 100755 --- a/cmd-split.py +++ b/cmd-split.py @@ -25,6 +25,7 @@ hashsplit.split_verbosely = opt.verbose start_time = time.time() (shalist,tree) = hashsplit.split_to_tree(hashsplit.autofiles(extra)) + if opt.blobs: for (mode,name,sum) in shalist: print sum @@ -42,3 +43,5 @@ size = hashsplit.total_split if opt.bench: log('\nbup: %.2fkbytes in %.2f secs = %.2f kbytes/sec\n' % (size/1024., secs, size/1024./secs)) + +git.flush_pack() diff --git a/git.py b/git.py index 1e1c79f..1ff1b74 100644 --- a/git.py +++ b/git.py @@ -1,16 +1,10 @@ -import os, errno, zlib, time, sha, subprocess +import os, errno, zlib, time, sha, subprocess, struct from helpers import * -_objcache = {} -def hash_raw(type, s): - global _objcache - header = '%s %d\0' % (type, len(s)) - sum = sha.sha(header) - sum.update(s) - bin = sum.digest() - hex = sum.hexdigest() - if bin in _objcache: - return hex + +def _old_write_object(bin, type, content): + hex = bin.encode('hex') + header = '%s %d\0' % (type, len(content)) dir = '.git/objects/%s' % hex[0:2] fn = '%s/%s' % (dir, hex[2:]) if not os.path.exists(fn): @@ -20,19 +14,105 @@ def hash_raw(type, s): except OSError, e: if e.errno != errno.EEXIST: raise - tfn = '%s.%d' % (fn, os.getpid()) + tfn = '.git/objects/bup%d.tmp' % os.getpid() f = open(tfn, 'w') z = zlib.compressobj(1) f.write(z.compress(header)) - f.write(z.compress(s)) + f.write(z.compress(content)) f.write(z.flush()) f.close() os.rename(tfn, fn) + + +_typemap = dict(blob=3, tree=2, commit=1, tag=8) +class PackWriter: + def __init__(self): + self.count = 0 + self.binlist = [] + self.filename = '.git/objects/bup%d' % os.getpid() + self.file = open(self.filename + '.pack', 'w+') + self.file.write('PACK\0\0\0\2\0\0\0\0') + + def write(self, bin, type, content): + global _typemap + f = self.file + + sz = len(content) + szbits = (sz & 0x0f) | (_typemap[type]<<4) + sz >>= 4 + while 1: + if sz: szbits |= 0x80 + f.write(chr(szbits)) + if not sz: + break + szbits = sz & 0x7f + sz >>= 7 + + z = zlib.compressobj(1) + f.write(z.compress(content)) + f.write(z.flush()) + + self.count += 1 + self.binlist.append(bin) + + def close(self): + f = self.file + + # update object count + f.seek(8) + cp = struct.pack('!i', self.count) + assert(len(cp) == 4) + f.write(cp) + + # calculate the pack sha1sum + f.seek(0) + sum = sha.sha() + while 1: + b = f.read(65536) + sum.update(b) + if not b: break + f.write(sum.digest()) + + f.close() + + p = subprocess.Popen(['git', 'index-pack', '-v', + self.filename + '.pack'], + preexec_fn = lambda: _gitenv('.git'), + stdout = subprocess.PIPE) + out = p.stdout.read().strip() + if p.wait() or not out: + raise Exception('git index-pack returned an error') + os.rename(self.filename + '.pack', '.git/objects/pack/%s.pack' % out) + os.rename(self.filename + '.idx', '.git/objects/pack/%s.idx' % out) + +_packout = None +def _write_object(bin, type, content): + global _packout + if not _packout: + _packout = PackWriter() + _packout.write(bin, type, content) + + +def flush_pack(): + global _packout + if _packout: + _packout.close() + + +_objcache = {} +def hash_raw(type, s): + global _objcache + header = '%s %d\0' % (type, len(s)) + sum = sha.sha(header) + sum.update(s) + bin = sum.digest() + hex = sum.hexdigest() + if bin in _objcache: + return hex else: - #log('exists %s' % fn) - pass - _objcache[bin] = 1 - return hex + _write_object(bin, type, s) + _objcache[bin] = 1 + return hex def hash_blob(blob):