From: Avery Pennarun Date: Sun, 14 Feb 2010 06:16:43 +0000 (-0500) Subject: hashsplit: smallish files (less than BLOB_MAX) weren't getting split. X-Git-Url: https://git.michaelhowe.org/gitweb/?a=commitdiff_plain;h=459c44f95f7a2ca5d71fc024b94171be13123269;p=packages%2Fb%2Fbup.git hashsplit: smallish files (less than BLOB_MAX) weren't getting split. This buglet was introduced when doing my new fanout cleanups. It's relatively unimportant, but it would cause a bit of space wastage for smallish files that changed by a bit, since we couldn't take advantage of deduplication for their blocks. This also explains why the --fanout argument test broke earlier. I thought I was going crazy (since the whole fanout implementation had changed and the number now means something slightly different), so I just removed it. But now we can bring it back and it passes again. --- diff --git a/hashsplit.py b/hashsplit.py index ca5682f..f9bc48d 100644 --- a/hashsplit.py +++ b/hashsplit.py @@ -54,26 +54,35 @@ def blobiter(files): yield b -def hashsplit_iter(files): - assert(BLOB_HWM > BLOB_MAX) - buf = Buf() - fi = blobiter(files) +def drainbuf(buf, finalize): while 1: (blob, bits) = splitbuf(buf) if blob: yield (blob, bits) else: - if buf.used() >= BLOB_MAX: - # limit max blob size - yield (buf.get(buf.used()), 0) - while buf.used() < BLOB_HWM: - bnew = next(fi) - if not bnew: - # eof - if buf.used(): - yield (buf.get(buf.used()), 0) - return - buf.put(bnew) + break + if buf.used() > BLOB_MAX: + # limit max blob size + yield (buf.get(buf.used()), 0) + elif finalize and buf.used(): + yield (buf.get(buf.used()), 0) + + +def hashsplit_iter(files): + assert(BLOB_HWM > BLOB_MAX) + buf = Buf() + fi = blobiter(files) + while 1: + for i in drainbuf(buf, finalize=False): + yield i + while buf.used() < BLOB_HWM: + bnew = next(fi) + if not bnew: + # eof + for i in drainbuf(buf, finalize=True): + yield i + return + buf.put(bnew) total_split = 0 diff --git a/t/test.sh b/t/test.sh index f1ba8ca..40230a5 
100755 --- a/t/test.sh +++ b/t/test.sh @@ -106,6 +106,7 @@ WVPASS bup margin WVPASS bup midx -f WVPASS bup margin WVPASS bup split -t t/testfile2 >tags2t.tmp +WVPASS bup split -t t/testfile2 --fanout 3 >tags2tf.tmp WVPASS bup split -r "$BUP_DIR" -c t/testfile2 >tags2c.tmp WVPASS ls -lR \ | WVPASS bup split -r "$BUP_DIR" -c --fanout 3 --max-pack-objects 3 -n lslr @@ -115,6 +116,8 @@ WVPASS bup ls /lslr WVPASS bup ls /lslr/1971-01-01 # all dates always exist WVFAIL diff -u tags1.tmp tags2.tmp +# fanout must be different from non-fanout +WVFAIL diff -q tags2t.tmp tags2tf.tmp wc -c t/testfile1 t/testfile2 wc -l tags1.tmp tags2.tmp