aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJunio C Hamano <gitster@pobox.com>2024-04-12 11:31:39 -0700
committerJunio C Hamano <gitster@pobox.com>2024-04-12 11:31:39 -0700
commit7fbe3ead1989bed1ea1fc8754d13aa7af16dd946 (patch)
treeef381d996e8cb285936a5245b50e5ee08c2227dd
parent847af43a3afb39394d5fe58192f94b993ca18f9f (diff)
parentd51d8cc36831bdabbbcec8553a7e83d9f5a3be4d (diff)
downloadgit-7fbe3ead1989bed1ea1fc8754d13aa7af16dd946.tar.gz
Merge branch 'ps/reftable-binsearch-updates'
Reftable code clean-up and some bugfixes. * ps/reftable-binsearch-updates: reftable/block: avoid decoding keys when searching restart points reftable/record: extract function to decode key lengths reftable/block: fix error handling when searching restart points reftable/block: refactor binary search over restart points reftable/refname: refactor binary search over refnames reftable/basics: improve `binsearch()` test reftable/basics: fix return type of `binsearch()` to be `size_t`
-rw-r--r--reftable/basics.c7
-rw-r--r--reftable/basics.h7
-rw-r--r--reftable/basics_test.c55
-rw-r--r--reftable/block.c117
-rw-r--r--reftable/record.c34
-rw-r--r--reftable/record.h6
-rw-r--r--reftable/refname.c53
7 files changed, 182 insertions, 97 deletions
diff --git a/reftable/basics.c b/reftable/basics.c
index 0785aff941..fea711db7e 100644
--- a/reftable/basics.c
+++ b/reftable/basics.c
@@ -27,7 +27,7 @@ void put_be16(uint8_t *out, uint16_t i)
out[1] = (uint8_t)(i & 0xff);
}
-int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
+size_t binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
{
size_t lo = 0;
size_t hi = sz;
@@ -39,8 +39,11 @@ int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
*/
while (hi - lo > 1) {
size_t mid = lo + (hi - lo) / 2;
+ int ret = f(mid, args);
+ if (ret < 0)
+ return sz;
- if (f(mid, args))
+ if (ret > 0)
hi = mid;
else
lo = mid;
diff --git a/reftable/basics.h b/reftable/basics.h
index 91f3533efe..523ecd5307 100644
--- a/reftable/basics.h
+++ b/reftable/basics.h
@@ -22,13 +22,14 @@ uint32_t get_be24(uint8_t *in);
void put_be16(uint8_t *out, uint16_t i);
/*
- * find smallest index i in [0, sz) at which f(i) is true, assuming
- * that f is ascending. Return sz if f(i) is false for all indices.
+ * find smallest index i in [0, sz) at which `f(i) > 0`, assuming that f is
+ * ascending. Return sz if `f(i) == 0` for all indices. The search is aborted
+ * and `sz` is returned in case `f(i) < 0`.
*
* Contrary to bsearch(3), this returns something useful if the argument is not
* found.
*/
-int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args);
+size_t binsearch(size_t sz, int (*f)(size_t k, void *args), void *args);
/*
* Frees a NULL terminated array of malloced strings. The array itself is also
diff --git a/reftable/basics_test.c b/reftable/basics_test.c
index 1fcd229725..997c4d9e01 100644
--- a/reftable/basics_test.c
+++ b/reftable/basics_test.c
@@ -12,40 +12,47 @@ https://developers.google.com/open-source/licenses/bsd
#include "test_framework.h"
#include "reftable-tests.h"
-struct binsearch_args {
- int key;
- int *arr;
+struct integer_needle_lesseq_args {
+ int needle;
+ int *haystack;
};
-static int binsearch_func(size_t i, void *void_args)
+static int integer_needle_lesseq(size_t i, void *_args)
{
- struct binsearch_args *args = void_args;
-
- return args->key < args->arr[i];
+ struct integer_needle_lesseq_args *args = _args;
+ return args->needle <= args->haystack[i];
}
static void test_binsearch(void)
{
- int arr[] = { 2, 4, 6, 8, 10 };
- size_t sz = ARRAY_SIZE(arr);
- struct binsearch_args args = {
- .arr = arr,
+ int haystack[] = { 2, 4, 6, 8, 10 };
+ struct {
+ int needle;
+ size_t expected_idx;
+ } testcases[] = {
+ {-9000, 0},
+ {-1, 0},
+ {0, 0},
+ {2, 0},
+ {3, 1},
+ {4, 1},
+ {7, 3},
+ {9, 4},
+ {10, 4},
+ {11, 5},
+ {9000, 5},
};
+ size_t i = 0;
- int i = 0;
- for (i = 1; i < 11; i++) {
- int res;
- args.key = i;
- res = binsearch(sz, &binsearch_func, &args);
+ for (i = 0; i < ARRAY_SIZE(testcases); i++) {
+ struct integer_needle_lesseq_args args = {
+ .haystack = haystack,
+ .needle = testcases[i].needle,
+ };
+ size_t idx;
- if (res < sz) {
- EXPECT(args.key < arr[res]);
- if (res > 0) {
- EXPECT(args.key >= arr[res - 1]);
- }
- } else {
- EXPECT(args.key == 10 || args.key == 11);
- }
+ idx = binsearch(ARRAY_SIZE(haystack), &integer_needle_lesseq, &args);
+ EXPECT(idx == testcases[i].expected_idx);
}
}
diff --git a/reftable/block.c b/reftable/block.c
index e2a2cee58d..298e8c56b9 100644
--- a/reftable/block.c
+++ b/reftable/block.c
@@ -273,35 +273,46 @@ void block_reader_start(struct block_reader *br, struct block_iter *it)
it->next_off = br->header_off + 4;
}
-struct restart_find_args {
+struct restart_needle_less_args {
int error;
- struct strbuf key;
- struct block_reader *r;
+ struct strbuf needle;
+ struct block_reader *reader;
};
-static int restart_key_less(size_t idx, void *args)
+static int restart_needle_less(size_t idx, void *_args)
{
- struct restart_find_args *a = args;
- uint32_t off = block_reader_restart_offset(a->r, idx);
+ struct restart_needle_less_args *args = _args;
+ uint32_t off = block_reader_restart_offset(args->reader, idx);
struct string_view in = {
- .buf = a->r->block.data + off,
- .len = a->r->block_len - off,
+ .buf = args->reader->block.data + off,
+ .len = args->reader->block_len - off,
};
+ uint64_t prefix_len, suffix_len;
+ uint8_t extra;
+ int n;
+
+ /*
+ * Records at restart points are stored without prefix compression, so
+ * there is no need to fully decode the record key here. This removes
+ * the need for allocating memory.
+ */
+ n = reftable_decode_keylen(in, &prefix_len, &suffix_len, &extra);
+ if (n < 0 || prefix_len) {
+ args->error = 1;
+ return -1;
+ }
- /* the restart key is verbatim in the block, so this could avoid the
- alloc for decoding the key */
- struct strbuf rkey = STRBUF_INIT;
- uint8_t unused_extra;
- int n = reftable_decode_key(&rkey, &unused_extra, in);
- int result;
- if (n < 0) {
- a->error = 1;
+ string_view_consume(&in, n);
+ if (suffix_len > in.len) {
+ args->error = 1;
return -1;
}
- result = strbuf_cmp(&a->key, &rkey);
- strbuf_release(&rkey);
- return result < 0;
+ n = memcmp(args->needle.buf, in.buf,
+ args->needle.len < suffix_len ? args->needle.len : suffix_len);
+ if (n)
+ return n < 0;
+ return args->needle.len < suffix_len;
}
void block_iter_copy_from(struct block_iter *dest, struct block_iter *src)
@@ -376,20 +387,51 @@ void block_iter_close(struct block_iter *it)
int block_reader_seek(struct block_reader *br, struct block_iter *it,
struct strbuf *want)
{
- struct restart_find_args args = {
- .key = *want,
- .r = br,
+ struct restart_needle_less_args args = {
+ .needle = *want,
+ .reader = br,
};
struct block_iter next = BLOCK_ITER_INIT;
struct reftable_record rec;
- int err = 0, i;
-
+ int err = 0;
+ size_t i;
+
+ /*
+ * Perform a binary search over the block's restart points, which
+ * avoids doing a linear scan over the whole block. Like this, we
+ * identify the section of the block that should contain our key.
+ *
+ * Note that we explicitly search for the first restart point _greater_
+ * than the sought-after record, not _greater or equal_ to it. In case
+ * the sought-after record is located directly at the restart point we
+ * would otherwise start doing the linear search at the preceding
+ * restart point. While that works alright, we would end up scanning
+ * too many record.
+ */
+ i = binsearch(br->restart_count, &restart_needle_less, &args);
if (args.error) {
err = REFTABLE_FORMAT_ERROR;
goto done;
}
- i = binsearch(br->restart_count, &restart_key_less, &args);
+ /*
+ * Now there are multiple cases:
+ *
+ * - `i == 0`: The wanted record is smaller than the record found at
+ * the first restart point. As the first restart point is the first
+ * record in the block, our wanted record cannot be located in this
+ * block at all. We still need to position the iterator so that the
+ * next call to `block_iter_next()` will yield an end-of-iterator
+ * signal.
+ *
+ * - `i == restart_count`: The wanted record was not found at any of
+ * the restart points. As there is no restart point at the end of
+ * the section the record may thus be contained in the last block.
+ *
+ * - `i > 0`: The wanted record must be contained in the section
+ * before the found restart point. We thus do a linear search
+ * starting from the preceding restart point.
+ */
if (i > 0)
it->next_off = block_reader_restart_offset(br, i - 1);
else
@@ -398,21 +440,34 @@ int block_reader_seek(struct block_reader *br, struct block_iter *it,
reftable_record_init(&rec, block_reader_type(br));
- /* We're looking for the last entry less/equal than the wanted key, so
- we have to go one entry too far and then back up.
- */
+ /*
+ * We're looking for the last entry less than the wanted key so that
+ * the next call to `block_reader_next()` would yield the wanted
+ * record. We thus don't want to position our reader at the sought
+ * after record, but one before. To do so, we have to go one entry too
+ * far and then back up.
+ */
while (1) {
block_iter_copy_from(&next, it);
err = block_iter_next(&next, &rec);
if (err < 0)
goto done;
-
- reftable_record_key(&rec, &it->last_key);
- if (err > 0 || strbuf_cmp(&it->last_key, want) >= 0) {
+ if (err > 0) {
err = 0;
goto done;
}
+ /*
+ * Check whether the current key is greater or equal to the
+ * sought-after key. In case it is greater we know that the
+ * record does not exist in the block and can thus abort early.
+ * In case it is equal to the sought-after key we have found
+ * the desired record.
+ */
+ reftable_record_key(&rec, &it->last_key);
+ if (strbuf_cmp(&it->last_key, want) >= 0)
+ goto done;
+
block_iter_copy_from(it, &next);
}
diff --git a/reftable/record.c b/reftable/record.c
index 23b497adab..5506f3e913 100644
--- a/reftable/record.c
+++ b/reftable/record.c
@@ -159,26 +159,42 @@ int reftable_encode_key(int *restart, struct string_view dest,
return start.len - dest.len;
}
-int reftable_decode_key(struct strbuf *last_key, uint8_t *extra,
- struct string_view in)
+int reftable_decode_keylen(struct string_view in,
+ uint64_t *prefix_len,
+ uint64_t *suffix_len,
+ uint8_t *extra)
{
- int start_len = in.len;
- uint64_t prefix_len = 0;
- uint64_t suffix_len = 0;
+ size_t start_len = in.len;
int n;
- n = get_var_int(&prefix_len, &in);
+ n = get_var_int(prefix_len, &in);
if (n < 0)
return -1;
string_view_consume(&in, n);
- n = get_var_int(&suffix_len, &in);
+ n = get_var_int(suffix_len, &in);
if (n <= 0)
return -1;
string_view_consume(&in, n);
- *extra = (uint8_t)(suffix_len & 0x7);
- suffix_len >>= 3;
+ *extra = (uint8_t)(*suffix_len & 0x7);
+ *suffix_len >>= 3;
+
+ return start_len - in.len;
+}
+
+int reftable_decode_key(struct strbuf *last_key, uint8_t *extra,
+ struct string_view in)
+{
+ int start_len = in.len;
+ uint64_t prefix_len = 0;
+ uint64_t suffix_len = 0;
+ int n;
+
+ n = reftable_decode_keylen(in, &prefix_len, &suffix_len, extra);
+ if (n < 0)
+ return -1;
+ string_view_consume(&in, n);
if (in.len < suffix_len ||
prefix_len > last_key->len)
diff --git a/reftable/record.h b/reftable/record.h
index 826ee1c55c..d778133e6e 100644
--- a/reftable/record.h
+++ b/reftable/record.h
@@ -86,6 +86,12 @@ int reftable_encode_key(int *is_restart, struct string_view dest,
struct strbuf prev_key, struct strbuf key,
uint8_t extra);
+/* Decode a record's key lengths. */
+int reftable_decode_keylen(struct string_view in,
+ uint64_t *prefix_len,
+ uint64_t *suffix_len,
+ uint8_t *extra);
+
/*
* Decode into `last_key` and `extra` from `in`. `last_key` is expected to
* contain the decoded key of the preceding record, if any.
diff --git a/reftable/refname.c b/reftable/refname.c
index 7570e4acf9..bbfde15754 100644
--- a/reftable/refname.c
+++ b/reftable/refname.c
@@ -12,15 +12,15 @@
#include "refname.h"
#include "reftable-iterator.h"
-struct find_arg {
- char **names;
- const char *want;
+struct refname_needle_lesseq_args {
+ char **haystack;
+ const char *needle;
};
-static int find_name(size_t k, void *arg)
+static int refname_needle_lesseq(size_t k, void *_args)
{
- struct find_arg *f_arg = arg;
- return strcmp(f_arg->names[k], f_arg->want) >= 0;
+ struct refname_needle_lesseq_args *args = _args;
+ return strcmp(args->needle, args->haystack[k]) <= 0;
}
static int modification_has_ref(struct modification *mod, const char *name)
@@ -29,25 +29,23 @@ static int modification_has_ref(struct modification *mod, const char *name)
int err = 0;
if (mod->add_len > 0) {
- struct find_arg arg = {
- .names = mod->add,
- .want = name,
+ struct refname_needle_lesseq_args args = {
+ .haystack = mod->add,
+ .needle = name,
};
- int idx = binsearch(mod->add_len, find_name, &arg);
- if (idx < mod->add_len && !strcmp(mod->add[idx], name)) {
+ size_t idx = binsearch(mod->add_len, refname_needle_lesseq, &args);
+ if (idx < mod->add_len && !strcmp(mod->add[idx], name))
return 0;
- }
}
if (mod->del_len > 0) {
- struct find_arg arg = {
- .names = mod->del,
- .want = name,
+ struct refname_needle_lesseq_args args = {
+ .haystack = mod->del,
+ .needle = name,
};
- int idx = binsearch(mod->del_len, find_name, &arg);
- if (idx < mod->del_len && !strcmp(mod->del[idx], name)) {
+ size_t idx = binsearch(mod->del_len, refname_needle_lesseq, &args);
+ if (idx < mod->del_len && !strcmp(mod->del[idx], name))
return 1;
- }
}
err = reftable_table_read_ref(&mod->tab, name, &ref);
@@ -73,11 +71,11 @@ static int modification_has_ref_with_prefix(struct modification *mod,
int err = 0;
if (mod->add_len > 0) {
- struct find_arg arg = {
- .names = mod->add,
- .want = prefix,
+ struct refname_needle_lesseq_args args = {
+ .haystack = mod->add,
+ .needle = prefix,
};
- int idx = binsearch(mod->add_len, find_name, &arg);
+ size_t idx = binsearch(mod->add_len, refname_needle_lesseq, &args);
if (idx < mod->add_len &&
!strncmp(prefix, mod->add[idx], strlen(prefix)))
goto done;
@@ -92,15 +90,14 @@ static int modification_has_ref_with_prefix(struct modification *mod,
goto done;
if (mod->del_len > 0) {
- struct find_arg arg = {
- .names = mod->del,
- .want = ref.refname,
+ struct refname_needle_lesseq_args args = {
+ .haystack = mod->del,
+ .needle = ref.refname,
};
- int idx = binsearch(mod->del_len, find_name, &arg);
+ size_t idx = binsearch(mod->del_len, refname_needle_lesseq, &args);
if (idx < mod->del_len &&
- !strcmp(ref.refname, mod->del[idx])) {
+ !strcmp(ref.refname, mod->del[idx]))
continue;
- }
}
if (strncmp(ref.refname, prefix, strlen(prefix))) {