From aef0147fa19ea6627dd3e01f7e8c6994e928e8cb Mon Sep 17 00:00:00 2001 From: Maxime Daniel Date: Mon, 20 Dec 2021 11:53:03 +0100 Subject: [PATCH 1/7] libzdb: rewrite index memory management Previous index memory management was basically one large array of 128 MB and each block of 8 bytes was a pointer to a linked-list of entries. Each entry was the object entry with an extra link for the namespace because all namespaces were sharing the same master-array. To find the correct list index, the crc32 of the key was used and the first 24 bits were the array-id, this method was really fast but needed to maintain a unique full large shared buffer. When removing a namespace, the deleting process could take very long, iterating over all keys (for all namespaces) even if the namespace contains few keys. Real memory usage was quite efficient actually, thanks Linux, only effectively used pages were allocated, so the large 128 MB array was dynamically allocated by the kernel but on really large databases, the limit can quickly be reached and sharing the array was anyway not a good idea IMO. -- Here is a new implementation, in test. This new implementation uses a dedicated index map per namespace, the namespace pointer per entry is thus not needed anymore, it's more logical and less error prone, no sharing across namespaces anymore. When deleting a namespace, the complete isolated map can be free'd without interfering with other keys/namespaces. There is still a crc32 computed on the key and that value will point to an array entry as well, but this array becomes multi-level and levels are allocated dynamically. Each level is composed of an array of 16 pointers. This version uses 5 levels (20 bits out of 32). This is configurable. -- Tests were made on a namespace of 2 million keys, memory usage can be reduced up to 6 times compared to before and is more predictable (eg: 2 times the same namespace will use 2 times the memory). 
More test to come, performance doesn't seems to be affected a lot, but it's slower (lookup and allocation are slower, but difference is not really noticable). --- libzdb/index.c | 87 ++-------- libzdb/index.h | 56 +++---- libzdb/index_branch.c | 380 ++++++++++++++++++++++++++++-------------- libzdb/index_branch.h | 29 ++-- libzdb/index_loader.c | 87 ++++------ libzdb/index_loader.h | 4 +- libzdb/index_set.c | 6 +- libzdb/namespace.c | 31 +--- libzdb/namespace.h | 8 - 9 files changed, 354 insertions(+), 334 deletions(-) diff --git a/libzdb/index.c b/libzdb/index.c index d780468..f0ebfe9 100644 --- a/libzdb/index.c +++ b/libzdb/index.c @@ -39,7 +39,6 @@ void index_entry_dump(index_entry_t *entry) { #ifdef RELEASE (void) entry; #else - zdb_debug("[+] index: entry dump: namespace : %p\n", entry->namespace); zdb_debug("[+] index: entry dump: id length : %" PRIu8 "\n", entry->idlength); zdb_debug("[+] index: entry dump: idx offset : %" PRIu32 "\n", entry->idxoffset); zdb_debug("[+] index: entry dump: idx fileid : %" PRIu32 "\n", entry->indexid); @@ -417,30 +416,28 @@ uint32_t index_next_objectid(index_root_t *root) { // perform the basic "hashing" (crc based) used to point to the expected branch // we only keep partial amount of the result to not fill the memory too fast uint32_t index_key_hash(unsigned char *id, uint8_t idlength) { - return zdb_crc32((const uint8_t *) id, idlength) & buckets_mask; + return zdb_crc32((const uint8_t *) id, idlength); } // main look-up function, used to get an entry from the memory index index_entry_t *index_entry_get(index_root_t *root, unsigned char *id, uint8_t idlength) { uint32_t branchkey = index_key_hash(id, idlength); - index_branch_t *branch = index_branch_get(root->branches, branchkey); - index_entry_t *entry; + index_entry_t *list; - // branch not exists - if(!branch) + // no list found, entry not found + if(!(list = index_hash_lookup(root->hash, branchkey))) return NULL; - for(entry = branch->list; entry; entry = entry->next) { 
+ // walk over the list + for(index_entry_t *entry = list; entry; entry = entry->next) { if(entry->idlength != idlength) continue; - if(entry->namespace != root->namespace) - continue; - if(memcmp(entry->id, id, idlength) == 0) return entry; } + // entry not found return NULL; } @@ -509,23 +506,14 @@ int index_entry_delete_memory(index_root_t *root, index_entry_t *entry) { root->stats.size -= sizeof(index_entry_t) + entry->idlength; // running in a mode without index, let's just skip this - if(root->branches == NULL) + if(root->hash == NULL) return 0; - uint32_t branchkey = index_key_hash(entry->id, entry->idlength); - index_branch_t *branch = index_branch_get(root->branches, branchkey); - index_entry_t *previous = index_branch_get_previous(branch, entry); - zdb_debug("[+] index: delete memory: removing entry from memory\n"); - if(previous == entry) { - zdb_danger("[-] index: entry delete memory: something wrong happens"); - zdb_danger("[-] index: entry delete memory: branches seems buggy"); + uint32_t hashkey = index_key_hash(entry->id, entry->idlength); + if(!index_hash_remove(root->hash, hashkey, entry)) return 1; - } - - // removing entry from global branch - index_branch_remove(branch, entry, previous); // cleaning memory object free(entry); @@ -710,63 +698,14 @@ size_t index_offset_objectid(uint32_t objectid) { return offset; } -// iterate over all entries in a single branch -// and remove if this entry is related to requested namespace -static inline size_t index_clean_namespace_branch(index_branch_t *branch, void *namespace) { - index_entry_t *entry = branch->list; - index_entry_t *previous = NULL; - size_t deleted = 0; - - while(entry) { - if(entry->namespace != namespace) { - // keeping this key, looking forward - previous = entry; - entry = entry->next; - continue; - } - - #ifndef RELEASE - zdb_log("[+] index: namespace cleaner: free: "); - zdb_hexdump(entry->id, entry->idlength); - printf("\n"); // FIXME - #endif - - // okay, we need to remove this key 
- index_entry_t *next = entry->next; - index_entry_t *removed = index_branch_remove(branch, entry, previous); - - free(removed); - deleted += 1; - - entry = next; - } - - return deleted; -} - // remove specific namespace from the index // // we use a global index for everything, when removing a // namespace, we walk over all the keys and remove keys matching // to this namespace -int index_clean_namespace(index_root_t *root, void *namespace) { - index_branch_t **branches = root->branches; - size_t deleted = 0; - - if(!branches) - return 0; - - zdb_debug("[+] index: starting namespace cleaner\n"); - - for(uint32_t b = 0; b < buckets_branches; b++) { - if(!branches[b]) - continue; - - deleted += index_clean_namespace_branch(branches[b], namespace); - } - - zdb_debug("[+] index: namespace cleaner: %lu keys removed\n", deleted); - +int index_clean_namespace(index_root_t *root) { + index_hash_free(root->hash); + root->hash = NULL; return 0; } diff --git a/libzdb/index.h b/libzdb/index.h index 4ef104d..7673362 100644 --- a/libzdb/index.h +++ b/libzdb/index.h @@ -78,16 +78,6 @@ // linked list pointer struct index_entry_t *next; - // pointer to source namespace - // index should not be aware of his namespace - // but since we use a single big index, we need to - // be able to make namespace distinction - // note: another approch could be separate branch-list per namespace - // note 2: we keep a void pointer, we will only compare address and not - // the object itself, this make some opacity later if we change - // and reduce issue with circular inclusion - void *namespace; - uint8_t idlength; // length of the id, here uint8_t limits to 256 bytes uint32_t offset; // offset on the corresponding datafile uint32_t idxoffset; // offset on the index file (index file id is the same as data file) @@ -103,27 +93,33 @@ } index_entry_t; - // WARNING: this should be on index_branch.h - // but we can't due to circular dependencies - // in order to fix this, we should put all structs in 
a dedicated file // - // the current implementation of the index use rudimental index memory system - // it's basicly just linked-list of entries - // to improve performance without changing this basic implementation, - // which is really slow, of course, we use a "branch" system which simply - // splits all the arrays based on an id + // new index memory hash use a multi-level indirection + // array, based on crc32 entry // - // the id is specified on the implementation file, with the length, etc. + // more information can be found on index-branch files // - // - id 0000: [...........] - // - id 0001: [...................] - // - id 0002: [...] - typedef struct index_branch_t { - size_t length; // length of this branch (count of entries) - index_entry_t *list; // entry point of the linked list - index_entry_t *last; // pointer to the last item, quicker to append - } index_branch_t; + typedef struct index_hash_t { + char type; + union { + struct index_hash_t **sub; + index_entry_t *list; + }; + + } index_hash_t; + + typedef struct index_hash_stats_t { + size_t subs; + size_t subsubs; + size_t entries; + size_t max_entries; + size_t lists; + size_t entries_size; + size_t ids_size; + + } index_hash_stats_t; + // index status flags // keep some heatly status of the index @@ -189,10 +185,8 @@ int updated; // does current index changed since opened int secure; // enable some safety (see secure zdb_settings_t) - void *namespace; // see index_entry_t, same reason - index_seqid_t *seqid; // sequential fileid mapping - index_branch_t **branches; // list of branches (explained later) + index_hash_t *hash; // index keys hashmap index_status_t status; // index health index_stats_t stats; // index statistics index_dirty_t dirty; // bitmap of dirty index files @@ -281,7 +275,7 @@ int index_entry_delete_memory(index_root_t *root, index_entry_t *entry); int index_entry_is_deleted(index_entry_t *entry); - int index_clean_namespace(index_root_t *root, void *namespace); + int 
index_clean_namespace(index_root_t *root); extern index_entry_t *index_reusable_entry; diff --git a/libzdb/index_branch.c b/libzdb/index_branch.c index a1a2229..cc20697 100644 --- a/libzdb/index_branch.c +++ b/libzdb/index_branch.c @@ -6,169 +6,301 @@ #include "libzdb.h" #include "libzdb_private.h" -// maximum allowed branch in memory -// -// this settings is mainly the most important to -// determine the keys lookup time -// -// the more bits you allows here, the more buckets -// can be used for lookup without collision +#define INDEX_HASH_SUB 1 +#define INDEX_HASH_LIST 2 + +#define BITS_PER_ROWS 4 // 4 bits per entry (0x00 -> 0x0f) +#define KEY_LENGTH 20 // using crc32 but only using 20 bits +#define DEEP_LEVEL KEY_LENGTH / BITS_PER_ROWS // 5 levels (20 bits total, 4 bits per entry) +#define ENTRIES_PER_ROWS 1 << BITS_PER_ROWS // 0x00 -> 0x0f = 16 + // -// the index works like a hash-table and uses crc32 'hash' -// algorithm, the result of the crc32 is used to point to -// the bucket, but using a full 32-bits hashlist would -// consume more than (2^32 * 8) bytes of memory (on 64-bits) +// CRC32 => 0x10320af +// => 0x10320.. # we only use 20 bits // -// the default settings sets this to 24 bits, which allows -// 16 millions direct entries, collisions uses linked-list +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F| +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// ^ 0x1xxxxxxx +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F| +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// ^ 0x10xxxxxx +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F| +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// ^ 0x103xxxxx +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F| +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// ^ 0x1032xxxx +// ... 
// -// makes sur mask and amount of branch are always in relation -// use 'index_set_buckets_bits' to be sure -uint32_t buckets_branches = (1 << 24); -uint32_t buckets_mask = (1 << 24) - 1; - -// WARNING: this doesn't resize anything, you should calls this -// only before initialization -int index_set_buckets_bits(uint8_t bits) { - buckets_branches = 1 << bits; - buckets_mask = (1 << bits) - 1; - - return buckets_branches; +// when the last level is reached, list object point to the +// head of a linked-list of entries + +index_hash_t *index_hash_new(int type) { + index_hash_t *root; + + if(!(root = calloc(sizeof(index_hash_t), 1))) + zdb_diep("index: hash: root calloc"); + + if(type == INDEX_HASH_SUB) { + root->type = INDEX_HASH_SUB; + if(!(root->sub = calloc(sizeof(index_hash_t **), ENTRIES_PER_ROWS))) + zdb_diep("index: hash: sub calloc"); + } + + if(type == INDEX_HASH_LIST) + root->type = INDEX_HASH_LIST; + + return root; } -// -// index branch -// this implementation uses a lazy load of branches -// this allows us to use a lot of branches (buckets_branches) in this case) -// without consuming all the memory if we don't need it -// -index_branch_t **index_buckets_init() { - return (index_branch_t **) calloc(sizeof(index_branch_t *), buckets_branches); +index_hash_t *index_hash_init() { + return index_hash_new(INDEX_HASH_SUB); } -index_branch_t *index_branch_init(index_branch_t **branches, uint32_t branchid) { - // zdb_debug("[+] initializing branch id 0x%x\n", branchid); +void *index_hash_push(index_hash_t *root, uint32_t lookup, index_entry_t *entry) { + // start with mask 0x0000000f (with 4 bits per rows) + uint32_t shift = ~(0xffffffff << BITS_PER_ROWS); - branches[branchid] = malloc(sizeof(index_branch_t)); - index_branch_t *branch = branches[branchid]; + // same algorythm than lookup, but with allocation + for(int i = 0; i < DEEP_LEVEL; i++) { + unsigned int mask = (lookup & shift); + unsigned int check = mask >> (i * BITS_PER_ROWS); - branch->length = 0; 
- branch->last = NULL; - branch->list = NULL; + if(root->sub[check] == NULL) { + if(i < DEEP_LEVEL - 1) + root->sub[check] = index_hash_new(INDEX_HASH_SUB); - return branch; -} + if(i == DEEP_LEVEL - 1) + root->sub[check] = index_hash_new(INDEX_HASH_LIST); + } -void index_branch_free(index_branch_t **branches, uint32_t branchid) { - // this branch was not allocated - if(!branches[branchid]) - return; + if(i == DEEP_LEVEL - 1) { + entry->next = root->sub[check]->list; + root->sub[check]->list = entry; - index_entry_t *entry = branches[branchid]->list; - index_entry_t *next = NULL; + return entry; + } - // deleting branch content by - // iterate over the linked-list - for(; entry; entry = next) { - next = entry->next; - free(entry); + root = root->sub[check]; + shift <<= BITS_PER_ROWS; } - // deleting branch - free(branches[branchid]); -} + // insertion failed, should never happen + return NULL; -// returns branch from rootindex, if branch is not allocated yet, returns NULL -// useful for any read on the index in memory -index_branch_t *index_branch_get(index_branch_t **branches, uint32_t branchid) { - if(!branches) - return NULL; - - return branches[branchid]; } -// returns branch from rootindex, if branch doesn't exists, it will be allocated -// (useful for any write in the index in memory) -index_branch_t *index_branch_get_allocate(index_branch_t **branches, uint32_t branchid) { - if(!branches[branchid]) - return index_branch_init(branches, branchid); +static index_hash_t *index_hash_lookup_member(index_hash_t *root, uint32_t lookup) { + // BITS_PER_ROWS specifies how many bits we use to compare each level + // we need to use a mask we shift for each level, we hardcode maximum + // to 32 bits mask + // + // starting from 0xffffffff (all bits sets) + // + // with 4 bits: + // Shifting with amount of bits: 0xfffffff0 + // Then negate that : 0x0000000f + // + // with 16 bits: + // Shifting with amount of bits: 0xffff0000 + // Then negate that : 0x0000ffff + + // 
start with mask 0x0000000f (with 4 bits per rows) + uint32_t shift = ~(0xffffffff << BITS_PER_ROWS); + + // printf(">> %x\n", lookup); + + for(int i = 0; i < DEEP_LEVEL; i++) { + unsigned int mask = (lookup & shift); + unsigned int check = mask >> (i * BITS_PER_ROWS); + + if(root->sub[check] == NULL) + return NULL; + + root = root->sub[check]; + shift <<= BITS_PER_ROWS; + } - // zdb_debug("[+] branch: exists: %lu entries\n", branches[branchid]->length); - return branches[branchid]; + return root; } -// append an entry (item) to the memory list -// since we use a linked-list, the logic of appending -// only occures here -// -// if there is no index, we just skip the appending -index_entry_t *index_branch_append(index_branch_t **branches, uint32_t branchid, index_entry_t *entry) { - index_branch_t *branch; +index_entry_t *index_hash_lookup(index_hash_t *root, uint32_t lookup) { + index_hash_t *member; - if(!branches) + if(!(member = index_hash_lookup_member(root, lookup))) return NULL; - // grabbing the branch - branch = index_branch_get_allocate(branches, branchid); - branch->length += 1; + // point to the head of the list + return member->list; +} - // adding this item and pointing previous last one - // to this new one - if(!branch->list) - branch->list = entry; +index_entry_t *index_hash_remove(index_hash_t *root, uint32_t lookup, index_entry_t *entry) { + index_hash_t *member = index_hash_lookup_member(root, lookup); + if(!member) + return NULL; - if(branch->last) - branch->last->next = entry; + // entry is the list head, replace + // head with next entry and we are done + if(member->list == entry) { + member->list = entry->next; + return entry; + } - branch->last = entry; - entry->next = NULL; + // looking for the entry in the list + index_entry_t *previous = member->list; + while(previous->next != entry) + previous = previous->next; + + // update linked list + previous->next = entry->next; return entry; } -// remove one entry on this branch -// since it's a 
linked-list, we need to know which entry was the previous one -// we use a single-direction linked-list -// -// removing an entry from the list don't free this entry, is just re-order -// list to keep it coherent -index_entry_t *index_branch_remove(index_branch_t *branch, index_entry_t *entry, index_entry_t *previous) { - // removing the first entry - if(branch->list == entry) - branch->list = entry->next; +// call user function pointer (with user argument) for +// each entries available on the index, the order follow memory +// order and is not related to entries +int index_hash_walk(index_hash_t *root, int (*callback)(index_entry_t *, void *), void *userptr) { + index_entry_t *entry; + int value; + + for(int i = 0; i < ENTRIES_PER_ROWS; i++) { + // ignore unallocated sub + if(!root->sub[i]) + continue; + + if(root->sub[i]->type == INDEX_HASH_LIST) { + for(entry = root->sub[i]->list; entry; entry = entry->next) { + if((value = callback(entry, userptr)) != 0) { + // callback interruption + return value; + } + } + } + + if(root->sub[i]->type == INDEX_HASH_SUB) { + if((value = index_hash_walk(root->sub[i], callback, userptr)) != 0) { + // callback interruption + return value; + } + } + } + + return 0; +} + +// compute statistics on index entries and size +static index_hash_stats_t index_hash_stats_level(index_hash_t *root) { + index_hash_stats_t stats = { + .subs = 0, + .subsubs = 0, + .entries = 0, + .max_entries = 0, + .lists = 0, + .entries_size = 0, + .ids_size = 0, + }; + + for(int i = 0; i < ENTRIES_PER_ROWS; i++) { + if(root->sub[i]) { + stats.subs += 1; + + if(root->sub[i]->type == INDEX_HASH_LIST) { + size_t localent = 0; + stats.lists += 1; + + for(index_entry_t *entry = root->sub[i]->list; entry; entry = entry->next) { + stats.entries_size += sizeof(index_entry_t) + entry->idlength; + stats.ids_size += entry->idlength; + localent += 1; + } + + if(localent > stats.max_entries) + stats.max_entries = localent; + + stats.entries += localent; + } + + 
if(root->sub[i]->type == INDEX_HASH_SUB) { + stats.subsubs += 1; + index_hash_stats_t extra = index_hash_stats_level(root->sub[i]); + + stats.subs += extra.subs; + stats.subsubs += extra.subsubs; + stats.entries += extra.entries; + stats.lists += extra.lists; + stats.entries_size += extra.entries_size; + stats.ids_size += extra.ids_size; + + if(extra.max_entries > stats.max_entries) + stats.max_entries = extra.max_entries; + } + } + } - // skipping this entry, linking next from previous - // to our next one - if(previous) - previous->next = entry->next; + return stats; +} - // if our entry was the last one - // the new last one is the previous one - if(branch->last == entry) - branch->last = previous; +void index_hash_stats(index_hash_t *root) { + index_hash_stats_t stats = index_hash_stats_level(root); + size_t subs_size = stats.subs * sizeof(index_hash_t); + size_t lists_size = stats.lists * sizeof(index_entry_t *); + size_t arrays_size = stats.subsubs * sizeof(index_hash_t **) * ENTRIES_PER_ROWS; + + zdb_debug("[+] index: metrics: subs alloc : %lu\n", stats.subs); + zdb_debug("[+] index: metrics: lists alloc: %lu\n", stats.lists); + zdb_debug("[+] index: metrics: subsubs : %lu\n", stats.subsubs); + zdb_verbose("[+] index: metrics: entries : %lu\n", stats.entries); + zdb_debug("[+] index: metrics: max entries: %lu\n", stats.max_entries); + zdb_verbose("[+] index: metrics: items size : %lu (%.2f MB)\n", stats.entries_size, MB(stats.entries_size)); + zdb_verbose("[+] index: metrics: items ids : %lu (%.2f MB)\n", stats.ids_size, MB(stats.ids_size)); + zdb_verbose("[+] index: metrics: subs size : %lu (%.2f MB)\n", subs_size, MB(subs_size)); + zdb_verbose("[+] index: metrics: lists size : %lu (%.2f MB)\n", lists_size, MB(lists_size)); + zdb_verbose("[+] index: metrics: subs array : %lu (%.2f MB)\n", arrays_size, MB(arrays_size)); + + if(stats.lists) { + zdb_debug("[+] index: metrics: avg entries: %lu\n", stats.entries / stats.lists); + } - branch->length -= 1; + 
size_t total = stats.entries_size + subs_size + lists_size + arrays_size; - return entry; + zdb_verbose("[+] index: metrics: total size : %lu (%.2f MB)\n", total, MB(total)); } -// iterate over a branch and try to find the previous entry of the given entry -// if by mystake, the entry was not found on the branch, we returns the entry itself -// if entry was the first entry, previous will also be NULL -index_entry_t *index_branch_get_previous(index_branch_t *branch, index_entry_t *entry) { - index_entry_t *previous = NULL; - index_entry_t *iterator = branch->list; +static void index_hash_free_list(index_entry_t *head) { + index_entry_t *entry = head; + + while(entry) { + // copy current entry and saving next address + // before freeing the object + index_entry_t *current = entry; + entry = current->next; - while(iterator && iterator != entry) { - previous = iterator; - iterator = iterator->next; + // free object + free(current); } +} - // we reached the end of the list, without finding - // a matching entry, this is mostly a mistake from caller - // let's notify it by replying with it's own object - if(!iterator) - return entry; +void index_hash_free(index_hash_t *root) { + for(int i = 0; i < ENTRIES_PER_ROWS; i++) { + if(root->sub[i]) { + // clean the linked list + if(root->sub[i]->type == INDEX_HASH_LIST) { + index_hash_free_list(root->sub[i]->list); + free(root->sub[i]); + continue; + } + + if(root->sub[i]->type == INDEX_HASH_SUB) + index_hash_free(root->sub[i]); + } + } - return previous; + free(root->sub); + free(root); } + diff --git a/libzdb/index_branch.h b/libzdb/index_branch.h index 4195cb8..e6ed419 100644 --- a/libzdb/index_branch.h +++ b/libzdb/index_branch.h @@ -1,21 +1,22 @@ #ifndef __ZDB_INDEX_BRANCH_H #define __ZDB_INDEX_BRANCH_H - // buckets - extern uint32_t buckets_branches; - extern uint32_t buckets_mask; + // initializers + index_hash_t *index_hash_init(); + index_hash_t *index_hash_new(int type); - int index_set_buckets_bits(uint8_t bits); - 
index_branch_t **index_buckets_init(); + // cleaner + void index_hash_free(index_hash_t *root); - // initializers - index_branch_t *index_branch_init(index_branch_t **branches, uint32_t branchid); - void index_branch_free(index_branch_t **branches, uint32_t branchid); + // list manipulation + void *index_hash_push(index_hash_t *root, uint32_t lookup, index_entry_t *entry); + index_entry_t *index_hash_lookup(index_hash_t *root, uint32_t lookup); + index_entry_t *index_hash_remove(index_hash_t *root, uint32_t lookup, index_entry_t *entry); + + // inspection + int index_hash_walk(index_hash_t *root, int (*callback)(index_entry_t *, void *), void *userptr); + + // statistics + void index_hash_stats(index_hash_t *root); - // accessors - index_branch_t *index_branch_get(index_branch_t **branches, uint32_t branchid); - index_branch_t *index_branch_get_allocate(index_branch_t **branches, uint32_t branchid); - index_entry_t *index_branch_append(index_branch_t **branches, uint32_t branchid, index_entry_t *entry); - index_entry_t *index_branch_remove(index_branch_t *branch, index_entry_t *entry, index_entry_t *previous); - index_entry_t *index_branch_get_previous(index_branch_t *branch, index_entry_t *entry); #endif diff --git a/libzdb/index_loader.c b/libzdb/index_loader.c index 2c6017a..fb04021 100644 --- a/libzdb/index_loader.c +++ b/libzdb/index_loader.c @@ -16,57 +16,31 @@ // // index initializer and dumper // -static inline void index_dump_entry(index_entry_t *entry) { - zdb_log("[+] key ["); - zdb_hexdump(entry->id, entry->idlength); - zdb_log("] offset %" PRIu32 ", length: %" PRIu32 "\n", entry->offset, entry->length); -} - -// dumps the current index load -// fulldump flags enable printing each entry -static void index_dump(index_root_t *root, int fulldump) { - size_t branches = 0; - - zdb_log("[+] index: verifyfing populated keys\n"); - - if(fulldump) - zdb_log("[+] ===========================\n"); - - // iterating over each buckets - for(uint32_t b = 0; b < 
buckets_branches; b++) { - index_branch_t *branch = index_branch_get(root->branches, b); +static int index_dump_full_callback(index_entry_t *entry, void *userptr) { + (void) userptr; - // skipping empty branch - if(!branch) - continue; - - branches += 1; - index_entry_t *entry = branch->list; - - if(!fulldump) - continue; + zdb_log("[+] key: "); + zdb_hexdump(entry->id, entry->idlength); - // iterating over the linked-list - for(; entry; entry = entry->next) - index_dump_entry(entry); - } + zdb_log("[+] offset %" PRIu32 ", length: %" PRIu32 "\n", entry->offset, entry->length); - if(fulldump) { - if(root->stats.entries == 0) - zdb_log("[+] index is empty\n"); + return 0; +} - zdb_log("[+] ===========================\n"); - } +static void index_dump_full(index_root_t *root) { + zdb_log("[+] ===========================\n"); - zdb_verbose("[+] index: uses: %lu branches\n", branches); + // walk over all keys and dump some information + index_hash_walk(root->hash, index_dump_full_callback, NULL); - // overhead contains: - // - the buffer allocated to hold each (future) branches pointer - // - the branch struct itself for each branch - size_t overhead = (buckets_branches * sizeof(index_branch_t **)) + - (branches * sizeof(index_branch_t)); + zdb_log("[+] ===========================\n"); +} - zdb_verbose("[+] index: memory overhead: %.2f KB (%lu bytes)\n", KB(overhead), overhead); +// dumps the current index load +// fulldump flags enable printing each entry +static void index_dump(index_root_t *root) { + zdb_log("[+] index: verifyfing populated keys\n"); + index_hash_stats(root->hash); } static void index_dump_statistics(index_root_t *root) { @@ -506,7 +480,7 @@ index_seqid_t *index_allocate_seqid() { return seqid; } -index_root_t *index_init_lazy(zdb_settings_t *settings, char *indexdir, void *namespace) { +index_root_t *index_init_lazy(zdb_settings_t *settings, char *indexdir) { index_root_t *root; if(!(root = calloc(sizeof(index_root_t), 1))) { @@ -524,12 +498,14 @@ 
index_root_t *index_init_lazy(zdb_settings_t *settings, char *indexdir, void *na root->synctime = settings->synctime; root->lastsync = 0; root->status = INDEX_NOT_LOADED | INDEX_HEALTHY; - root->branches = NULL; - root->namespace = namespace; root->mode = settings->mode; root->rotate = time(NULL); root->secure = settings->secure; + // allocate index hash + if(!(root->hash = index_hash_init())) + zdb_diep("index: init: hash"); + index_dirty_resize(root, 1); // switching to default mode when mix enabled @@ -564,18 +540,22 @@ index_root_t *index_rehash(index_root_t *root) { } // create an index and load files -index_root_t *index_init(zdb_settings_t *settings, char *indexdir, void *namespace, index_branch_t **branches) { +index_root_t *index_init(zdb_settings_t *settings, char *indexdir) { zdb_debug("[+] index: initializing\n"); - index_root_t *root = index_init_lazy(settings, indexdir, namespace); - root->branches = branches; + index_root_t *root = index_init_lazy(settings, indexdir); // initialize internal pointers index_rehash(root); index_internal_load(root); - if(root->mode == ZDB_MODE_KEY_VALUE) - index_dump(root, settings->dump); + if(root->mode == ZDB_MODE_KEY_VALUE) { + if(settings->dump) + index_dump_full(root); + + // dump internal statistics + index_dump(root); + } index_dump_statistics(root); @@ -602,6 +582,9 @@ void index_destroy(index_root_t *root) { free(root->seqid); } + // clean hashmap + index_hash_free(root->hash); + free(root); } diff --git a/libzdb/index_loader.h b/libzdb/index_loader.h index 5f1fc68..3dcafc0 100644 --- a/libzdb/index_loader.h +++ b/libzdb/index_loader.h @@ -5,8 +5,8 @@ index_header_t index_initialize(int fd, fileid_t indexid, index_root_t *root); // initialize the whole index system - index_root_t *index_init(zdb_settings_t *settings, char *indexdir, void *namespace, index_branch_t **branches); - index_root_t *index_init_lazy(zdb_settings_t *settings, char *indexdir, void *namespace); + index_root_t *index_init(zdb_settings_t 
*settings, char *indexdir); + index_root_t *index_init_lazy(zdb_settings_t *settings, char *indexdir); // internal functions index_root_t *index_rehash(index_root_t *root); diff --git a/libzdb/index_set.c b/libzdb/index_set.c index 70f492d..57c74a7 100644 --- a/libzdb/index_set.c +++ b/libzdb/index_set.c @@ -182,7 +182,6 @@ index_entry_t *index_insert_memory_handler_memkey(index_root_t *root, index_set_ memcpy(entry->id, set->id, new->idlength); entry->idlength = new->idlength; - entry->namespace = root->namespace; entry->offset = new->offset; entry->length = new->length; entry->dataid = root->indexid; // WARNING: check this @@ -197,7 +196,10 @@ index_entry_t *index_insert_memory_handler_memkey(index_root_t *root, index_set_ uint32_t branchkey = index_key_hash(entry->id, entry->idlength); // commit entry into memory - index_branch_append(root->branches, branchkey, entry); + if(!index_hash_push(root->hash, branchkey, entry)) { + free(entry); + return NULL; + } // update statistics (if the key exists) // maybe it doesn't exists if it comes from a replay diff --git a/libzdb/namespace.c b/libzdb/namespace.c index 7cc2cd5..2ea7969 100644 --- a/libzdb/namespace.c +++ b/libzdb/namespace.c @@ -227,7 +227,7 @@ namespace_t *namespace_ensure(namespace_t *namespace) { static int namespace_load_lazy(ns_root_t *nsroot, namespace_t *namespace) { // now, we are sure the namespace exists, but it could be empty // let's call index and data initializer, they will take care of that - namespace->index = index_init(nsroot->settings, namespace->indexpath, namespace, nsroot->branches); + namespace->index = index_init(nsroot->settings, namespace->indexpath); namespace->data = data_init(nsroot->settings, namespace->datapath, namespace->index->indexid); return 0; @@ -447,20 +447,10 @@ ns_root_t *namespaces_allocate(zdb_settings_t *settings) { root->length = 1; // we start with the default one, only root->effective = 1; // no namespace has been loaded yet root->settings = settings; // keep 
the reference to the settings, needed for paths - root->branches = NULL; // maybe we don't need the branches, see below if(!(root->namespaces = (namespace_t **) malloc(sizeof(namespace_t *) * root->length))) zdb_diep("namespace malloc"); - // allocating (if needed, only some modes need it) the big (single) index branches - if(settings->mode == ZDB_MODE_KEY_VALUE || settings->mode == ZDB_MODE_MIX) { - zdb_debug("[+] namespaces: pre-allocating index (%d lazy branches)\n", buckets_branches); - - // allocating minimal branches array - if(!(root->branches = index_buckets_init())) - zdb_diep("buckets allocation"); - } - return root; } @@ -506,19 +496,6 @@ void namespace_free(namespace_t *namespace) { // this is called when we receive a graceful exit request // let's clean all indices, data and namespace arrays int namespaces_destroy() { - // freeing the big index buffer - // since branches want an index as argument, let's use - // the first namespace (default), since they all share - // the same buffer - if(nsroot->branches) { - zdb_debug("[+] namespaces: cleaning branches\n"); - for(uint32_t b = 0; b < buckets_branches; b++) - index_branch_free(nsroot->namespaces[0]->index->branches, b); - - // freeing the big index array - free(nsroot->branches); - } - // calling emergency to ensure we flushed everything namespaces_emergency(); @@ -606,7 +583,7 @@ int namespace_reload(namespace_t *namespace) { zdb_debug("[+] namespace: reloading: %s\n", namespace->name); zdb_debug("[+] namespace: reload: cleaning index\n"); - index_clean_namespace(namespace->index, namespace); + index_clean_namespace(namespace->index); zdb_debug("[+] namespace: reload: destroying objects\n"); index_destroy(namespace->index); @@ -630,7 +607,7 @@ int namespace_flush(namespace_t *namespace) { zdb_debug("[+] namespace: flushing: %s\n", namespace->name); zdb_debug("[+] namespace: flushing: cleaning index\n"); - index_clean_namespace(namespace->index, namespace); + index_clean_namespace(namespace->index); 
char *indexpath = strdup(namespace->index->indexdir); char *datapath = strdup(namespace->data->datadir); @@ -675,7 +652,7 @@ int namespace_delete(namespace_t *namespace) { // redis_detach_clients(namespace); // unallocating keys attached to this namespace - index_clean_namespace(namespace->index, namespace); + index_clean_namespace(namespace->index); // cleaning and closing namespace links index_destroy(namespace->index); diff --git a/libzdb/namespace.h b/libzdb/namespace.h index 5bafc73..55ceffb 100644 --- a/libzdb/namespace.h +++ b/libzdb/namespace.h @@ -58,14 +58,6 @@ size_t effective; // amount of namespaces currently loaded namespace_t **namespaces; // pointers to namespaces zdb_settings_t *settings; // global settings reminder - index_branch_t **branches; // unique global branches list - - // as explained in namespace.c, we keep a single big - // index which that contains everything (all namespaces together) - // - // for each index structure, we will point the branches to the - // same big index branches all the time, this is why we keep - // this one here, as the 'original one' } ns_root_t; From 4016d4b61ac42fb11fdd21fab2ac1fc7c5438263 Mon Sep 17 00:00:00 2001 From: Maxime Daniel Date: Mon, 20 Dec 2021 12:25:05 +0100 Subject: [PATCH 2/7] libzdb: apply index changes to the api --- libzdb/api.c | 8 ++++---- libzdb/api.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libzdb/api.c b/libzdb/api.c index 3fbbdb1..0029543 100644 --- a/libzdb/api.c +++ b/libzdb/api.c @@ -416,12 +416,12 @@ zdb_api_t *zdb_api_del(namespace_t *ns, void *key, size_t ksize) { return zdb_api_reply_success(); } -index_root_t *zdb_index_init_lazy(zdb_settings_t *settings, char *indexdir, void *namespace) { - return index_init_lazy(settings, indexdir, namespace); +index_root_t *zdb_index_init_lazy(zdb_settings_t *settings, char *indexdir) { + return index_init_lazy(settings, indexdir); } -index_root_t *zdb_index_init(zdb_settings_t *settings, char *indexdir, void 
*namespace, index_branch_t **branches) { - return index_init(settings, indexdir, namespace, branches); +index_root_t *zdb_index_init(zdb_settings_t *settings, char *indexdir) { + return index_init(settings, indexdir); } uint64_t zdb_index_availity_check(index_root_t *root) { diff --git a/libzdb/api.h b/libzdb/api.h index 0c5621f..4641d26 100644 --- a/libzdb/api.h +++ b/libzdb/api.h @@ -55,8 +55,8 @@ int zdb_index_open_readwrite(index_root_t *root, fileid_t fileid); void zdb_index_close(index_root_t *zdbindex); - index_root_t *zdb_index_init_lazy(zdb_settings_t *settings, char *indexdir, void *namespace); - index_root_t *zdb_index_init(zdb_settings_t *settings, char *indexdir, void *namespace, index_branch_t **branches); + index_root_t *zdb_index_init_lazy(zdb_settings_t *settings, char *indexdir); + index_root_t *zdb_index_init(zdb_settings_t *settings, char *indexdir); uint64_t zdb_index_availity_check(index_root_t *root); // index header validity From 93437a2d4b8eacc43f3b5c9ce0336f5497df5cdf Mon Sep 17 00:00:00 2001 From: Maxime Daniel Date: Mon, 20 Dec 2021 12:25:24 +0100 Subject: [PATCH 3/7] zdbd: update kscan to use new index handlers --- zdbd/commands_scan.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/zdbd/commands_scan.c b/zdbd/commands_scan.c index 311cbc7..4bc1f44 100644 --- a/zdbd/commands_scan.c +++ b/zdbd/commands_scan.c @@ -448,6 +448,24 @@ static int command_kscan_send_list(redis_client_t *client, list_t *list) { return 0; } +struct kscan_ptr { + list_t *keys; + resp_object_t *key; +}; + +// callback which build the list +static int command_kscan_callback(index_entry_t *entry, void *ptr) { + struct kscan_ptr *kscan = (struct kscan_ptr *) ptr; + + if(entry->idlength < kscan->key->length) + return 0; + + if(memcmp(entry->id, kscan->key->buffer, kscan->key->length) == 0) + list_append(kscan->keys, entry); + + return 0; +} + int command_kscan(redis_client_t *client) { resp_request_t 
*request = client->request; index_root_t *index = client->ns->index; @@ -469,27 +487,13 @@ int command_kscan(redis_client_t *client) { resp_object_t *key = request->argv[1]; list_t keys = list_init(NULL); - for(size_t i = 0; i < buckets_branches; i++) { - index_branch_t *branch = index->branches[i]; - - // skipping not allocated branches - if(!branch) - continue; + struct kscan_ptr kscan = { + .keys = &keys, + .key = key, + }; - for(index_entry_t *entry = branch->list; entry; entry = entry->next) { - // this key doesn't belong to the current namespace - if(entry->namespace != client->ns) - continue; - - // key is shorter than requested prefix - // it won't match at all - if(entry->idlength < key->length) - continue; - - if(memcmp(entry->id, key->buffer, key->length) == 0) - list_append(&keys, entry); - } - } + // build a list via index walk callback + index_hash_walk(client->ns->index->hash, command_kscan_callback, &kscan); command_kscan_send_list(client, &keys); list_free(&keys); From d29ea65df0bb6e6ac3f1d3b196a5fc6d40c9e05b Mon Sep 17 00:00:00 2001 From: Maxime Daniel Date: Mon, 20 Dec 2021 12:28:01 +0100 Subject: [PATCH 4/7] utilities: update code to new index arguments --- tools/index-dump/index-dump.c | 2 +- tools/index-rebuild/index-rebuild.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/index-dump/index-dump.c b/tools/index-dump/index-dump.c index a6881b3..c9b116f 100644 --- a/tools/index-dump/index-dump.c +++ b/tools/index-dump/index-dump.c @@ -108,7 +108,7 @@ int main(int argc, char *argv[]) { // zdb_open(zdb_settings); index_root_t *zdbindex; - if(!(zdbindex = zdb_index_init_lazy(zdb_settings, dirname, NULL))) { + if(!(zdbindex = zdb_index_init_lazy(zdb_settings, dirname))) { fprintf(stderr, "[-] index-dump: cannot load index\n"); exit(EXIT_FAILURE); } diff --git a/tools/index-rebuild/index-rebuild.c b/tools/index-rebuild/index-rebuild.c index 931ea73..829241e 100644 --- a/tools/index-rebuild/index-rebuild.c +++ 
b/tools/index-rebuild/index-rebuild.c @@ -293,7 +293,7 @@ int main(int argc, char *argv[]) { exit(EXIT_FAILURE); } - if(!(zdbindex = zdb_index_init(zdb_settings, namespace->indexpath, namespace, nsroot->branches))) { + if(!(zdbindex = zdb_index_init(zdb_settings, namespace->indexpath))) { fprintf(stderr, "[-] index-rebuild: cannot initialize index\n"); exit(EXIT_FAILURE); } From 22cd845fbbc576f5819e95e54cd48e15d7ba9849 Mon Sep 17 00:00:00 2001 From: Maxime Daniel Date: Mon, 20 Dec 2021 14:57:24 +0100 Subject: [PATCH 5/7] libzdb: avoid freeing a non-allocated index hash --- libzdb/index_branch.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libzdb/index_branch.c b/libzdb/index_branch.c index cc20697..cc14443 100644 --- a/libzdb/index_branch.c +++ b/libzdb/index_branch.c @@ -286,6 +286,9 @@ static void index_hash_free_list(index_entry_t *head) { } void index_hash_free(index_hash_t *root) { + if(!root) + return; + for(int i = 0; i < ENTRIES_PER_ROWS; i++) { if(root->sub[i]) { // clean the linked list From 2774213e9079df91ab3acb147235cdd4fa21082a Mon Sep 17 00:00:00 2001 From: Maxime Daniel Date: Mon, 20 Dec 2021 14:57:49 +0100 Subject: [PATCH 6/7] libzdb: avoid emergency on non-allocated namespaces --- libzdb/namespace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libzdb/namespace.c b/libzdb/namespace.c index 2ea7969..1c3d1d0 100644 --- a/libzdb/namespace.c +++ b/libzdb/namespace.c @@ -448,7 +448,7 @@ ns_root_t *namespaces_allocate(zdb_settings_t *settings) { root->effective = 1; // no namespace has been loaded yet root->settings = settings; // keep the reference to the settings, needed for paths - if(!(root->namespaces = (namespace_t **) malloc(sizeof(namespace_t *) * root->length))) + if(!(root->namespaces = (namespace_t **) calloc(sizeof(namespace_t *), root->length))) zdb_diep("namespace malloc"); return root; @@ -699,6 +699,10 @@ static void namespace_flushing_hook(namespace_t *namespace) { int namespaces_emergency() { namespace_t
*ns; + // namespace not allocated yet + if(namespace_iter() == NULL) + return 0; + for(ns = namespace_iter(); ns; ns = namespace_iter_next(ns)) { zdb_log("[+] namespaces: flushing: %s\n", ns->name); namespace_flushing_hook(ns); From b7721fb3e8edce1422526fd121cc92a5c4c678ad Mon Sep 17 00:00:00 2001 From: Maxime Daniel Date: Wed, 2 Feb 2022 10:33:42 +0100 Subject: [PATCH 7/7] namespace: fix clients not detached when deleting a namespace When removing a namespace, clients attached to it in the server should be notified that the namespace is not available anymore. This was supported but not backported when the server and library code were separated. This fixes #130 --- libzdb/namespace.c | 3 --- zdbd/commands_namespace.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libzdb/namespace.c b/libzdb/namespace.c index 1c3d1d0..b7f0c2f 100644 --- a/libzdb/namespace.c +++ b/libzdb/namespace.c @@ -648,9 +648,6 @@ static void namespace_delete_hook(namespace_t *namespace) { int namespace_delete(namespace_t *namespace) { zdb_log("[+] namespace: removing: %s\n", namespace->name); - // detach all clients attached to this namespace - // redis_detach_clients(namespace); - // unallocating keys attached to this namespace index_clean_namespace(namespace->index); diff --git a/zdbd/commands_namespace.c b/zdbd/commands_namespace.c index f510698..98aa317 100644 --- a/zdbd/commands_namespace.c +++ b/zdbd/commands_namespace.c @@ -94,6 +94,9 @@ int command_nsdel(redis_client_t *client) { return 1; } + // detach all clients attached to this namespace + redis_detach_clients(namespace); + // delete the new namespace if(namespace_delete(namespace)) { redis_hardsend(client, "-Could not delete this namespace");