diff options
author | Junio C Hamano <gitster@pobox.com> | 2024-04-12 11:31:39 -0700 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2024-04-12 11:31:39 -0700 |
commit | 7fbe3ead1989bed1ea1fc8754d13aa7af16dd946 (patch) | |
tree | ef381d996e8cb285936a5245b50e5ee08c2227dd | |
parent | 847af43a3afb39394d5fe58192f94b993ca18f9f (diff) | |
parent | d51d8cc36831bdabbbcec8553a7e83d9f5a3be4d (diff) | |
download | git-7fbe3ead1989bed1ea1fc8754d13aa7af16dd946.tar.gz |
Merge branch 'ps/reftable-binsearch-updates'
Reftable code clean-up and some bugfixes.
* ps/reftable-binsearch-updates:
reftable/block: avoid decoding keys when searching restart points
reftable/record: extract function to decode key lengths
reftable/block: fix error handling when searching restart points
reftable/block: refactor binary search over restart points
reftable/refname: refactor binary search over refnames
reftable/basics: improve `binsearch()` test
reftable/basics: fix return type of `binsearch()` to be `size_t`
-rw-r--r-- | reftable/basics.c | 7 | ||||
-rw-r--r-- | reftable/basics.h | 7 | ||||
-rw-r--r-- | reftable/basics_test.c | 55 | ||||
-rw-r--r-- | reftable/block.c | 117 | ||||
-rw-r--r-- | reftable/record.c | 34 | ||||
-rw-r--r-- | reftable/record.h | 6 | ||||
-rw-r--r-- | reftable/refname.c | 53 |
7 files changed, 182 insertions, 97 deletions
diff --git a/reftable/basics.c b/reftable/basics.c index 0785aff941..fea711db7e 100644 --- a/reftable/basics.c +++ b/reftable/basics.c @@ -27,7 +27,7 @@ void put_be16(uint8_t *out, uint16_t i) out[1] = (uint8_t)(i & 0xff); } -int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args) +size_t binsearch(size_t sz, int (*f)(size_t k, void *args), void *args) { size_t lo = 0; size_t hi = sz; @@ -39,8 +39,11 @@ int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args) */ while (hi - lo > 1) { size_t mid = lo + (hi - lo) / 2; + int ret = f(mid, args); + if (ret < 0) + return sz; - if (f(mid, args)) + if (ret > 0) hi = mid; else lo = mid; diff --git a/reftable/basics.h b/reftable/basics.h index 91f3533efe..523ecd5307 100644 --- a/reftable/basics.h +++ b/reftable/basics.h @@ -22,13 +22,14 @@ uint32_t get_be24(uint8_t *in); void put_be16(uint8_t *out, uint16_t i); /* - * find smallest index i in [0, sz) at which f(i) is true, assuming - * that f is ascending. Return sz if f(i) is false for all indices. + * find smallest index i in [0, sz) at which `f(i) > 0`, assuming that f is + * ascending. Return sz if `f(i) == 0` for all indices. The search is aborted + * and `sz` is returned in case `f(i) < 0`. * * Contrary to bsearch(3), this returns something useful if the argument is not * found. */ -int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args); +size_t binsearch(size_t sz, int (*f)(size_t k, void *args), void *args); /* * Frees a NULL terminated array of malloced strings. The array itself is also diff --git a/reftable/basics_test.c b/reftable/basics_test.c index 1fcd229725..997c4d9e01 100644 --- a/reftable/basics_test.c +++ b/reftable/basics_test.c @@ -12,40 +12,47 @@ https://developers.google.com/open-source/licenses/bsd #include "test_framework.h" #include "reftable-tests.h" -struct binsearch_args { - int key; - int *arr; +struct integer_needle_lesseq_args { + int needle; + int *haystack; }; -static int binsearch_func(size_t i, void *void_args) +static int integer_needle_lesseq(size_t i, void *_args) { - struct binsearch_args *args = void_args; - - return args->key < args->arr[i]; + struct integer_needle_lesseq_args *args = _args; + return args->needle <= args->haystack[i]; } static void test_binsearch(void) { - int arr[] = { 2, 4, 6, 8, 10 }; - size_t sz = ARRAY_SIZE(arr); - struct binsearch_args args = { - .arr = arr, + int haystack[] = { 2, 4, 6, 8, 10 }; + struct { + int needle; + size_t expected_idx; + } testcases[] = { + {-9000, 0}, + {-1, 0}, + {0, 0}, + {2, 0}, + {3, 1}, + {4, 1}, + {7, 3}, + {9, 4}, + {10, 4}, + {11, 5}, + {9000, 5}, }; + size_t i = 0; - int i = 0; - for (i = 1; i < 11; i++) { - int res; - args.key = i; - res = binsearch(sz, &binsearch_func, &args); + for (i = 0; i < ARRAY_SIZE(testcases); i++) { + struct integer_needle_lesseq_args args = { + .haystack = haystack, + .needle = testcases[i].needle, + }; + size_t idx; - if (res < sz) { - EXPECT(args.key < arr[res]); - if (res > 0) { - EXPECT(args.key >= arr[res - 1]); - } - } else { - EXPECT(args.key == 10 || args.key == 11); - } + idx = binsearch(ARRAY_SIZE(haystack), &integer_needle_lesseq, &args); + EXPECT(idx == testcases[i].expected_idx); } } diff --git a/reftable/block.c b/reftable/block.c index e2a2cee58d..298e8c56b9 100644 --- a/reftable/block.c +++ b/reftable/block.c @@ -273,35 +273,46 @@ void block_reader_start(struct block_reader *br, struct block_iter *it) it->next_off = br->header_off + 4; } -struct restart_find_args { +struct restart_needle_less_args { int error; - struct strbuf key; - struct block_reader *r; + struct strbuf needle; + struct block_reader *reader; }; -static int restart_key_less(size_t idx, void *args) +static int restart_needle_less(size_t idx, void *_args) { - struct restart_find_args *a = args; - uint32_t off = block_reader_restart_offset(a->r, idx); + struct restart_needle_less_args *args = _args; + uint32_t off = block_reader_restart_offset(args->reader, idx); struct string_view in = { - .buf = a->r->block.data + off, - .len = a->r->block_len - off, + .buf = args->reader->block.data + off, + .len = args->reader->block_len - off, }; + uint64_t prefix_len, suffix_len; + uint8_t extra; + int n; + + /* + * Records at restart points are stored without prefix compression, so + * there is no need to fully decode the record key here. This removes + * the need for allocating memory. + */ + n = reftable_decode_keylen(in, &prefix_len, &suffix_len, &extra); + if (n < 0 || prefix_len) { + args->error = 1; + return -1; + } - /* the restart key is verbatim in the block, so this could avoid the - alloc for decoding the key */ - struct strbuf rkey = STRBUF_INIT; - uint8_t unused_extra; - int n = reftable_decode_key(&rkey, &unused_extra, in); - int result; - if (n < 0) { - a->error = 1; + string_view_consume(&in, n); + if (suffix_len > in.len) { + args->error = 1; return -1; } - result = strbuf_cmp(&a->key, &rkey); - strbuf_release(&rkey); - return result < 0; + n = memcmp(args->needle.buf, in.buf, + args->needle.len < suffix_len ? args->needle.len : suffix_len); + if (n) + return n < 0; + return args->needle.len < suffix_len; } void block_iter_copy_from(struct block_iter *dest, struct block_iter *src) @@ -376,20 +387,51 @@ void block_iter_close(struct block_iter *it) int block_reader_seek(struct block_reader *br, struct block_iter *it, struct strbuf *want) { - struct restart_find_args args = { - .key = *want, - .r = br, + struct restart_needle_less_args args = { + .needle = *want, + .reader = br, }; struct block_iter next = BLOCK_ITER_INIT; struct reftable_record rec; - int err = 0, i; - + int err = 0; + size_t i; + + /* + * Perform a binary search over the block's restart points, which + * avoids doing a linear scan over the whole block. Like this, we + * identify the section of the block that should contain our key. + * + * Note that we explicitly search for the first restart point _greater_ + * than the sought-after record, not _greater or equal_ to it. In case + * the sought-after record is located directly at the restart point we + * would otherwise start doing the linear search at the preceding + * restart point. While that works alright, we would end up scanning + * too many record. + */ + i = binsearch(br->restart_count, &restart_needle_less, &args); if (args.error) { err = REFTABLE_FORMAT_ERROR; goto done; } - i = binsearch(br->restart_count, &restart_key_less, &args); + /* + * Now there are multiple cases: + * + * - `i == 0`: The wanted record is smaller than the record found at + * the first restart point. As the first restart point is the first + * record in the block, our wanted record cannot be located in this + * block at all. We still need to position the iterator so that the + * next call to `block_iter_next()` will yield an end-of-iterator + * signal. + * + * - `i == restart_count`: The wanted record was not found at any of + * the restart points. As there is no restart point at the end of + * the section the record may thus be contained in the last block. + * + * - `i > 0`: The wanted record must be contained in the section + * before the found restart point. We thus do a linear search + * starting from the preceding restart point. + */ if (i > 0) it->next_off = block_reader_restart_offset(br, i - 1); else @@ -398,21 +440,34 @@ int block_reader_seek(struct block_reader *br, struct block_iter *it, reftable_record_init(&rec, block_reader_type(br)); - /* We're looking for the last entry less/equal than the wanted key, so - we have to go one entry too far and then back up. - */ + /* + * We're looking for the last entry less than the wanted key so that + * the next call to `block_reader_next()` would yield the wanted + * record. We thus don't want to position our reader at the sought + * after record, but one before. To do so, we have to go one entry too + * far and then back up. + */ while (1) { block_iter_copy_from(&next, it); err = block_iter_next(&next, &rec); if (err < 0) goto done; - - reftable_record_key(&rec, &it->last_key); - if (err > 0 || strbuf_cmp(&it->last_key, want) >= 0) { + if (err > 0) { err = 0; goto done; } + /* + * Check whether the current key is greater or equal to the + * sought-after key. In case it is greater we know that the + * record does not exist in the block and can thus abort early. + * In case it is equal to the sought-after key we have found + * the desired record. + */ + reftable_record_key(&rec, &it->last_key); + if (strbuf_cmp(&it->last_key, want) >= 0) + goto done; + block_iter_copy_from(it, &next); } diff --git a/reftable/record.c b/reftable/record.c index 23b497adab..5506f3e913 100644 --- a/reftable/record.c +++ b/reftable/record.c @@ -159,26 +159,42 @@ int reftable_encode_key(int *restart, struct string_view dest, return start.len - dest.len; } -int reftable_decode_key(struct strbuf *last_key, uint8_t *extra, - struct string_view in) +int reftable_decode_keylen(struct string_view in, + uint64_t *prefix_len, + uint64_t *suffix_len, + uint8_t *extra) { - int start_len = in.len; - uint64_t prefix_len = 0; - uint64_t suffix_len = 0; + size_t start_len = in.len; int n; - n = get_var_int(&prefix_len, &in); + n = get_var_int(prefix_len, &in); if (n < 0) return -1; string_view_consume(&in, n); - n = get_var_int(&suffix_len, &in); + n = get_var_int(suffix_len, &in); if (n <= 0) return -1; string_view_consume(&in, n); - *extra = (uint8_t)(suffix_len & 0x7); - suffix_len >>= 3; + *extra = (uint8_t)(*suffix_len & 0x7); + *suffix_len >>= 3; + + return start_len - in.len; +} + +int reftable_decode_key(struct strbuf *last_key, uint8_t *extra, + struct string_view in) +{ + int start_len = in.len; + uint64_t prefix_len = 0; + uint64_t suffix_len = 0; + int n; + + n = reftable_decode_keylen(in, &prefix_len, &suffix_len, extra); + if (n < 0) + return -1; + string_view_consume(&in, n); if (in.len < suffix_len || prefix_len > last_key->len) diff --git a/reftable/record.h b/reftable/record.h index 826ee1c55c..d778133e6e 100644 --- a/reftable/record.h +++ b/reftable/record.h @@ -86,6 +86,12 @@ int reftable_encode_key(int *is_restart, struct string_view dest, struct strbuf prev_key, struct strbuf key, uint8_t extra); +/* Decode a record's key lengths. */ +int reftable_decode_keylen(struct string_view in, + uint64_t *prefix_len, + uint64_t *suffix_len, + uint8_t *extra); + /* * Decode into `last_key` and `extra` from `in`. `last_key` is expected to * contain the decoded key of the preceding record, if any. diff --git a/reftable/refname.c b/reftable/refname.c index 7570e4acf9..bbfde15754 100644 --- a/reftable/refname.c +++ b/reftable/refname.c @@ -12,15 +12,15 @@ #include "refname.h" #include "reftable-iterator.h" -struct find_arg { - char **names; - const char *want; +struct refname_needle_lesseq_args { + char **haystack; + const char *needle; }; -static int find_name(size_t k, void *arg) +static int refname_needle_lesseq(size_t k, void *_args) { - struct find_arg *f_arg = arg; - return strcmp(f_arg->names[k], f_arg->want) >= 0; + struct refname_needle_lesseq_args *args = _args; + return strcmp(args->needle, args->haystack[k]) <= 0; } static int modification_has_ref(struct modification *mod, const char *name) @@ -29,25 +29,23 @@ static int modification_has_ref(struct modification *mod, const char *name) int err = 0; if (mod->add_len > 0) { - struct find_arg arg = { - .names = mod->add, - .want = name, + struct refname_needle_lesseq_args args = { + .haystack = mod->add, + .needle = name, }; - int idx = binsearch(mod->add_len, find_name, &arg); - if (idx < mod->add_len && !strcmp(mod->add[idx], name)) { + size_t idx = binsearch(mod->add_len, refname_needle_lesseq, &args); + if (idx < mod->add_len && !strcmp(mod->add[idx], name)) return 0; - } } if (mod->del_len > 0) { - struct find_arg arg = { - .names = mod->del, - .want = name, + struct refname_needle_lesseq_args args = { + .haystack = mod->del, + .needle = name, }; - int idx = binsearch(mod->del_len, find_name, &arg); - if (idx < mod->del_len && !strcmp(mod->del[idx], name)) { + size_t idx = binsearch(mod->del_len, refname_needle_lesseq, &args); + if (idx < mod->del_len && !strcmp(mod->del[idx], name)) return 1; - } } err = reftable_table_read_ref(&mod->tab, name, &ref); @@ -73,11 +71,11 @@ static int modification_has_ref_with_prefix(struct modification *mod, int err = 0; if (mod->add_len > 0) { - struct find_arg arg = { - .names = mod->add, - .want = prefix, + struct refname_needle_lesseq_args args = { + .haystack = mod->add, + .needle = prefix, }; - int idx = binsearch(mod->add_len, find_name, &arg); + size_t idx = binsearch(mod->add_len, refname_needle_lesseq, &args); if (idx < mod->add_len && !strncmp(prefix, mod->add[idx], strlen(prefix))) goto done; @@ -92,15 +90,14 @@ static int modification_has_ref_with_prefix(struct modification *mod, goto done; if (mod->del_len > 0) { - struct find_arg arg = { - .names = mod->del, - .want = ref.refname, + struct refname_needle_lesseq_args args = { + .haystack = mod->del, + .needle = ref.refname, }; - int idx = binsearch(mod->del_len, find_name, &arg); + size_t idx = binsearch(mod->del_len, refname_needle_lesseq, &args); if (idx < mod->del_len && - !strcmp(ref.refname, mod->del[idx])) { + !strcmp(ref.refname, mod->del[idx])) continue; - } } if (strncmp(ref.refname, prefix, strlen(prefix))) { |