about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Taylor Blau <me@ttaylorr.com> 2024-04-01 17:16:44 -0400
committer: Junio C Hamano <gitster@pobox.com> 2024-04-01 14:18:17 -0700
commit: b7d6f23a17110d597d58f4a8e1b34b7a72c43fe1 (patch)
tree: 5ddf23f5e96103e8af00d10f94a7161644301ba1
parent: 440e470edb183d020d2982d90d27113f05189727 (diff)
download: git-b7d6f23a17110d597d58f4a8e1b34b7a72c43fe1.tar.gz
midx-write.c: use `--stdin-packs` when repacking
When constructing a new pack, `git multi-pack-index repack` provides a
list of objects which is the union of objects in all MIDX'd packs which
were "included" in the repack.

Though correct, this typically yields a poorly structured pack, since
providing the objects list over stdin does not give pack-objects a
chance to discover the namehash values for each object, leading to
sub-optimal delta selection.

We can use `--stdin-packs` instead, which has a couple of benefits:

  - it does a supplemental walk over objects in the supplied list of
    packs to discover their namehash, leading to higher-quality delta
    selection

  - it requires us to list far less data over stdin; instead of listing
    each object in the resulting pack, we need only list the constituent
    packs from which those objects were selected in the MIDX

Of course, this comes at a slight cost: though we save time on listing
packs versus objects over stdin[^1] (roughly 650 milliseconds), we add a
non-trivial amount of time walking over the given objects in order to
find better deltas.

In general, this is likely to more closely match the user's expectations
(i.e. that packs generated via `git multi-pack-index repack` are written
with high-quality deltas). But if not, we can always introduce a new
option in pack-objects to disable the supplemental object walk, which
would yield a pure CPU-time savings, at the cost of the on-disk size of
the resulting pack.

[^1]: In a patched version of Git that doesn't perform the supplemental
  object walk in `pack-objects --stdin-packs`, we save roughly 650ms
  (from 5.968 to 5.325 seconds) when running `git multi-pack-index
  repack --batch-size=0` on git.git with all objects packed, and all
  packs in a MIDX.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
-rw-r--r-- midx-write.c 18
1 files changed, 9 insertions, 9 deletions
diff --git a/midx-write.c b/midx-write.c
index 960cc46250..65e69d2de7 100644
--- a/midx-write.c
+++ b/midx-write.c
@@ -1474,7 +1474,8 @@ int midx_repack(struct repository *r, const char *object_dir, size_t batch_size,
repo_config_get_bool(r, "repack.usedeltabaseoffset", &delta_base_offset);
repo_config_get_bool(r, "repack.usedeltaislands", &use_delta_islands);
- strvec_push(&cmd.args, "pack-objects");
+ strvec_pushl(&cmd.args, "pack-objects", "--stdin-packs", "--non-empty",
+ NULL);
strvec_pushf(&cmd.args, "%s/pack/pack", object_dir);
@@ -1498,16 +1499,15 @@ int midx_repack(struct repository *r, const char *object_dir, size_t batch_size,
}
cmd_in = xfdopen(cmd.in, "w");
-
- for (i = 0; i < m->num_objects; i++) {
- struct object_id oid;
- uint32_t pack_int_id = nth_midxed_pack_int_id(m, i);
-
- if (!include_pack[pack_int_id])
+ for (i = 0; i < m->num_packs; i++) {
+ struct packed_git *p = m->packs[i];
+ if (!p)
continue;
- nth_midxed_object_oid(&oid, m, i);
- fprintf(cmd_in, "%s\n", oid_to_hex(&oid));
+ if (include_pack[i])
+ fprintf(cmd_in, "%s\n", pack_basename(p));
+ else
+ fprintf(cmd_in, "^%s\n", pack_basename(p));
}
fclose(cmd_in);