From 1015b20feae98de13ba6cd810a7d1530615d3604 Mon Sep 17 00:00:00 2001 From: Konstantin Date: Sat, 20 Sep 2025 02:00:07 +0000 Subject: [PATCH 1/3] accommodate more bytes when doing a bulk-search. Sometimes (for an unknown reason) the function does not allocate enough space for the whole process; we fix that by allocating n*2. This commit will be complemented by a change in mdict_extern.cc that also changes how memory allocation is done there --- src/encode/char_decoder.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/encode/char_decoder.h b/src/encode/char_decoder.h index 805a225..c14d55c 100644 --- a/src/encode/char_decoder.h +++ b/src/encode/char_decoder.h @@ -134,11 +134,17 @@ inline ssize_t utf16le_to_utf8(const unsigned char *utf16le_data, size_t utf16le // Check buffer space *before* writing // Need space for the bytes + 1 for potential null terminator later - if (utf8_idx + bytes_needed >= utf8_buf_len) { // Use >= to ensure space for null term - fprintf(stderr, "Error: Output UTF-8 buffer (size %zu) too small. 
Needs at least %zu bytes (+ null).\n", - utf8_buf_len, utf8_idx + bytes_needed + 1); - return -1; // Buffer overflow - } + if (utf8_idx + bytes_needed >= utf8_buf_len) { + size_t new_len = (utf8_idx + bytes_needed) * 2; // grow a bit more to reduce future reallocs + char* new_buf = (char*)realloc(utf8_buf, new_len); + if (!new_buf) { + fprintf(stderr, "Error: Failed to allocate memory for UTF-8 buffer\n"); + return -1; + } + utf8_buf = new_buf; + utf8_buf_len = new_len; + } + // Write the UTF-8 bytes if (bytes_needed == 1) { From 371d550871585e9a35ced92b818a269bff519439 Mon Sep 17 00:00:00 2001 From: Konstantin Date: Sat, 20 Sep 2025 02:07:43 +0000 Subject: [PATCH 2/3] use vectors for memory management vectors can grow automatically in c++, so that's a better way of holding a large amount of data --- src/mdict_extern.cc | 46 +++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/src/mdict_extern.cc b/src/mdict_extern.cc index 4a92f8b..985d79f 100644 --- a/src/mdict_extern.cc +++ b/src/mdict_extern.cc @@ -70,30 +70,48 @@ void *mdict_init(const char *dictionary_path) { /** lookup a word */ + void mdict_lookup(void *dict, const char *word, char **result) { - auto *self = (mdict::Mdict *)dict; - std::string queryWord(word); - std::string s = self->lookup(queryWord); + auto *self = (mdict::Mdict *)dict; + std::string queryWord(word); - (*result) = (char *)calloc(sizeof(char), s.size() + 1); - std::copy(s.begin(), s.end(), (*result)); - (*result)[s.size()] = '\0'; + std::string s = self->lookup(queryWord); + + // Create vector with null terminator + std::vector buf(s.begin(), s.end()); + buf.push_back('\0'); + + // Allocate result buffer once, copy vector content + *result = (char*)malloc(buf.size()); + if (!*result) { + perror("malloc"); + return; + } + memcpy(*result, buf.data(), buf.size()); } + /** locate a word */ -void mdict_locate(void *dict, const char *word, char **result, - mdict_encoding_t encoding) { - 
auto *self = (mdict::Mdict *)dict; - std::string queryWord(word); - std::string s = self->locate(queryWord, encoding); +void mdict_locate(void *dict, const char *word, char **result, mdict_encoding_t encoding) { + auto *self = (mdict::Mdict *)dict; + std::string queryWord(word); - (*result) = (char *)calloc(sizeof(char), s.size() + 1); - std::copy(s.begin(), s.end(), (*result)); - (*result)[s.size()] = '\0'; + std::string s = self->locate(queryWord, encoding); + + std::vector buf(s.begin(), s.end()); + buf.push_back('\0'); + + *result = (char*)malloc(buf.size()); + if (!*result) { + perror("malloc"); + return; + } + memcpy(*result, buf.data(), buf.size()); } + void mdict_parse_definition(void *dict, const char *word, unsigned long record_start, char **result) { auto *self = (mdict::Mdict *)dict; From 635d74a7745b98c5344280a246a854caf27e9bb5 Mon Sep 17 00:00:00 2001 From: Konstantin Date: Sat, 20 Sep 2025 02:12:57 +0000 Subject: [PATCH 3/3] fix utf8_buf error --- src/encode/char_decoder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encode/char_decoder.h b/src/encode/char_decoder.h index c14d55c..6206b63 100644 --- a/src/encode/char_decoder.h +++ b/src/encode/char_decoder.h @@ -141,7 +141,7 @@ inline ssize_t utf16le_to_utf8(const unsigned char *utf16le_data, size_t utf16le fprintf(stderr, "Error: Failed to allocate memory for UTF-8 buffer\n"); return -1; } - utf8_buf = new_buf; + utf8_buf = reinterpret_cast(new_buf); utf8_buf_len = new_len; }