From 967a4b239acbccb3d545981b962c4ee93d6ed9db Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sat, 20 Dec 2025 01:40:43 +1100 Subject: [PATCH 01/19] Fix: Take into account HOME env var in path_to_user_home --- coresdk/src/backend/utility_functions.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/coresdk/src/backend/utility_functions.cpp b/coresdk/src/backend/utility_functions.cpp index 0e4cd5cd..6985c168 100644 --- a/coresdk/src/backend/utility_functions.cpp +++ b/coresdk/src/backend/utility_functions.cpp @@ -73,6 +73,10 @@ namespace splashkit_lib string path_to_user_home() { #ifndef WINDOWS + string home = get_env_var("HOME"); + if (home != "") + return home; + struct passwd *pw = getpwuid(getuid()); return string(pw->pw_dir); #else From abe59463c4443807d139f672badd3fded2522100 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sat, 20 Dec 2025 02:33:43 +1100 Subject: [PATCH 02/19] Add llama.cpp as build dependency --- .gitignore | 2 ++ coresdk/external | 2 +- projects/cmake/CMakeLists.txt | 44 +++++++++++++++++++++++++++++++++-- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index cbf9a5b3..0864e9d0 100644 --- a/.gitignore +++ b/.gitignore @@ -77,6 +77,7 @@ Makefile cmake_install.cmake splashkit_test projects/cmake/Resources +llama_ext-prefix .ninja_deps .ninja_log build.ninja @@ -101,6 +102,7 @@ out/lib/ tools/scripts/nuget-pkg/obj tools/scripts/test/obj + ### Debian packaging ### tools/scripts/debian/libsplashkit-dev* tools/scripts/debian/data.tar.xz diff --git a/coresdk/external b/coresdk/external index e089bc3c..d9c7ca08 160000 --- a/coresdk/external +++ b/coresdk/external @@ -1 +1 @@ -Subproject commit e089bc3ccbd7ff11027a790be44f6ab6038b5c58 +Subproject commit d9c7ca08ca9dbb0051bf57ceadb1d7a2d0f8d536 diff --git a/projects/cmake/CMakeLists.txt b/projects/cmake/CMakeLists.txt index 0780489a..d14d57f7 100644 --- a/projects/cmake/CMakeLists.txt +++ b/projects/cmake/CMakeLists.txt @@ -5,6 +5,7 @@ set(CMAKE_BUILD_TYPE Debug) cmake_policy(SET CMP0083 NEW) include(CheckPIESupported) +include(ExternalProject) check_pie_supported() # SK Directories relative to cmake project @@ -245,6 +246,8 @@ include_directories("${SK_EXT}/hash-library") include_directories("${SK_EXT}/json") include_directories("${SK_EXT}/catch") include_directories("${SK_EXT}/microui/src") +include_directories("${SK_EXT}/llama.cpp/include") +include_directories("${SK_EXT}/llama.cpp/ggml/include") # MAC OS DIRECTORY INCLUDES if (APPLE) @@ -257,13 +260,50 @@ if (APPLE) include_directories("${SK_EXT}/SDL_image/external/libpng-1.6.2") endif() +# INCLUDE LLAMA.CPP + +# Included as an external project so that it can be configured +# as Release, independently of the main project. 
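+# (Note: an unoptimised llama.cpp build typically makes inference far too slow
+# to be usable, which is why it stays Release even though CMAKE_BUILD_TYPE
+# above is set to Debug for the rest of the project.)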
+ +# Compiled as CPU only +# TODO: Decide on minimum architecture requirements +ExternalProject_Add( + llama_ext + SOURCE_DIR "${SK_EXT}/llama.cpp" + CMAKE_ARGS + -DLLAMA_BUILD_TESTS=OFF + -DLLAMA_BUILD_TOOLS=OFF + -DLLAMA_BUILD_EXAMPLES=OFF + -DLLAMA_BUILD_SERVER=OFF + -DLLAMA_CUBLAS=OFF + -DLLAMA_CLBLAST=OFF + -DLLAMA_METAL=OFF + -DLLAMA_OPENCL=OFF + -DBUILD_SHARED_LIBS=OFF + -DLLAMA_BUILD_COMMON=OFF + -DLLAMA_TOOLS_INSTALL=OFF + -DCMAKE_BUILD_TYPE=Release + -DGGML_STATIC=ON + -DLLAMA_STATIC=ON + -DCMAKE_INSTALL_PREFIX= +) + +find_package(OpenMP REQUIRED) + +ExternalProject_Get_Property(llama_ext INSTALL_DIR) + +set(LLAMA_LIB_FLAGS "${INSTALL_DIR}/lib/libllama.a" + "${INSTALL_DIR}/lib/libggml.a" + "${INSTALL_DIR}/lib/libggml-cpu.a" + "${INSTALL_DIR}/lib/libggml-base.a") + # MACRO DEFINITIONS # add_definitions(-DELPP_THREAD_SAFE) #### END SETUP #### #### SplashKitBackend STATIC LIBRARY #### add_library(SplashKitBackend STATIC ${SOURCE_FILES} ${INCLUDE_FILES}) -target_link_libraries(SplashKitBackend ${LIB_FLAGS}) +target_link_libraries(SplashKitBackend ${LIB_FLAGS} ${LLAMA_LIB_FLAGS} OpenMP::OpenMP_CXX) if(RASPBERRY_PI) if(RASPBERRY_PI_5) @@ -373,4 +413,4 @@ catch_discover_tests(skunit_tests) #### END skunit_tests EXECUTABLE #### install(TARGETS SplashKitBackend DESTINATION lib) -install(FILES ${INCLUDE_FILES} DESTINATION include/SplashKitBackend) \ No newline at end of file +install(FILES ${INCLUDE_FILES} DESTINATION include/SplashKitBackend) From 43baf0569fab9549c0fd87d016241abfc792d787 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sat, 20 Dec 2025 02:34:25 +1100 Subject: [PATCH 03/19] Add initial GenAI driver and user facing `generate_reply` --- coresdk/src/backend/genai_driver.cpp | 229 +++++++++++++++++++++++++++ coresdk/src/backend/genai_driver.h | 60 +++++++ coresdk/src/coresdk/genai.cpp | 46 ++++++ coresdk/src/coresdk/genai.h | 34 ++++ 4 files changed, 369 insertions(+) create mode 100644 coresdk/src/backend/genai_driver.cpp create mode 100644 coresdk/src/backend/genai_driver.h create mode 100644 coresdk/src/coresdk/genai.cpp create mode 100644 coresdk/src/coresdk/genai.h diff --git a/coresdk/src/backend/genai_driver.cpp b/coresdk/src/backend/genai_driver.cpp new file mode 100644 index 00000000..4ca64d86 --- /dev/null +++ b/coresdk/src/backend/genai_driver.cpp @@ -0,0 +1,229 @@ +// +// genai_driver.cpp +// sk +// +// Created by Sean Boettger on 19/12/2025. 
+// +#include +#include +#include + +#include "genai_driver.h" +#include "core_driver.h" +#include "utility_functions.h" + +namespace splashkit_lib +{ + namespace llamacpp { + + static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data){/* nothing, avoid unnecessary logging*/} + + void init() + { + static bool initialized = false; + if (!initialized) + { + llama_log_set(llama_log_callback_null, NULL); + + ggml_backend_load_all(); + + initialized = true; + } + } + + model create_model(std::string path) + { + ggml_backend_load_all(); + + // initialize the model + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = 0; // cpu-only + + llama_model * model = llama_model_load_from_file(path.c_str(), model_params); + + if (model == NULL) + { + LOG(ERROR) << "Unable to load language model from " << path << " - please check if it exists."; + return {false}; + } + + const llama_vocab * vocab = llama_model_get_vocab(model); + const char* tmpl = llama_model_chat_template(model, /* name */ nullptr); + + return { + true, + model, + vocab, + tmpl + }; + } + + void delete_model(model mdl) + { + if (!mdl.valid) + return; + + if (!mdl.model) + return; + + llama_model_free(mdl.model); + } + + std::string format_chat(model& mdl, const std::vector& messages) + { + std::vector llama_formatted; + std::vector formatted(0); + + llama_formatted.reserve(messages.size()); + + for (const message& msg : messages) + { + llama_formatted.push_back({msg.role.c_str(), msg.content.c_str()}); + } + + int new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), true, formatted.data(), formatted.size()); + if (new_len > (int)formatted.size()) + { + formatted.resize(new_len); + new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), true, formatted.data(), formatted.size()); + } + + return std::string(formatted.begin(), formatted.end()); + } + + llama_tokens tokenize_string(model& mdl, const std::string& prompt) + { + // get token count + // note: returns a negative number, the count of tokens it would have returned if the buffer was large enough + const int n_prompt = -llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), NULL, 0, true, true); + + // create buffer + std::vector prompt_tokens(n_prompt); + + // recieve the tokens + if (llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) + { + LOG(ERROR) << "Failed to tokenize the prompt."; + return {}; + } + + return prompt_tokens; + } + + context start_context(model& mdl, llama_tokens& starting_context, int max_length) + { + // Create the context + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = starting_context.size() + max_length - 1; + ctx_params.n_batch = starting_context.size(); + ctx_params.no_perf = true; + + llama_context * ctx = llama_init_from_model(mdl.model, ctx_params); + + if (ctx == NULL) + { + LOG(ERROR) << "Failed to create the language model context."; + return {nullptr}; + } + + // Create the sampler + auto sparams = llama_sampler_chain_default_params(); + sparams.no_perf = true; + llama_sampler * smpl = llama_sampler_chain_init(sparams); + + // Setup some reasonable defaults + // TODO: Make these user adjustable + llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.00f, 1)); + llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.6f)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_k(20)); + 
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.95, 0)); + //llama_sampler_chain_add(smpl, llama_sampler_init_penalties(64, 1, 0, 0)); + llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); + + // Prepare batch and encode starting context + llama_batch batch = llama_batch_get_one(starting_context.data(), starting_context.size()); + + if (llama_model_has_encoder(mdl.model)) + { + if (llama_encode(ctx, batch)) + { + llama_free(ctx); + llama_sampler_free(smpl); + LOG(ERROR) << "Failed to encode prompt."; + return {nullptr}; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token(mdl.model); + if (decoder_start_token_id == LLAMA_TOKEN_NULL) + { + decoder_start_token_id = llama_vocab_bos(mdl.vocab); + } + + batch = llama_batch_get_one(&decoder_start_token_id, 1); + } + + return + { + ctx, + smpl, + batch, + (int)ctx_params.n_ctx, + mdl.vocab, + 0, + "" + }; + } + + int context_step(context& ctx) + { + if (!ctx.ctx) + return -1; + + // Decode current batch with the model + if (llama_decode(ctx.ctx, ctx.batch)) + { + LOG(ERROR) << "Failed to process response from language model."; + return -1; + } + + ctx.n_pos += ctx.batch.n_tokens; + + // Sample next token + llama_token new_token_id = llama_sampler_sample(ctx.smpl, ctx.ctx, -1); + + // Has the model finished its response? + if (llama_vocab_is_eog(ctx.vocab, new_token_id)) + return 1; + + char buf[128]; + int n = llama_token_to_piece(ctx.vocab, new_token_id, buf, sizeof(buf), 0, true); + if (n < 0) + { + LOG(ERROR) << "Failed to convert response token from language model."; + return -1; + } + + std::string s(buf, n); + ctx.ctx_string += s; + + // prepare the next batch with the sampled token + ctx.batch = llama_batch_get_one(&new_token_id, 1); + + // Have we reached the end of the context? + // If so, stop now. + if (ctx.n_pos + ctx.batch.n_tokens >= ctx.ctx_size) + return 1; + + return 0; + } + + void delete_context(context& ctx) + { + if (ctx.smpl) + llama_sampler_free(ctx.smpl); + + if (ctx.ctx) + llama_free(ctx.ctx); + } + } +} diff --git a/coresdk/src/backend/genai_driver.h b/coresdk/src/backend/genai_driver.h new file mode 100644 index 00000000..8f175017 --- /dev/null +++ b/coresdk/src/backend/genai_driver.h @@ -0,0 +1,60 @@ +// +// genai_driver.h +// sk +// +// Created by Sean Boettger on 19/12/2025. 
+// + +#ifndef genai_driver_h +#define genai_driver_h + +#include "backend_types.h" + +#include "llama.h" + +namespace splashkit_lib +{ + typedef unsigned int uint; + + namespace llamacpp { + struct model { + bool valid; + llama_model* model; + const llama_vocab* vocab; + const char* tmpl; + }; + + struct message { + std::string role; + std::string content; + }; + + struct context { + llama_context* ctx; + llama_sampler* smpl; + llama_batch batch; + int ctx_size = 0; + + const llama_vocab* vocab; + + int n_pos; + std::string ctx_string; + }; + + typedef std::vector llama_tokens; + + void init(); + + model create_model(std::string path); + void delete_model(model mdl); + + std::string format_chat(model& mdl, const std::vector& messages); + llama_tokens tokenize_string(model& mdl, const std::string& prompt); + + context start_context(model& mdl, llama_tokens& starting_context, int max_length); + int context_step(context& ctx); + void delete_context(context& ctx); + } +} + +#endif /* defined(graphics_driver) */ diff --git a/coresdk/src/coresdk/genai.cpp b/coresdk/src/coresdk/genai.cpp new file mode 100644 index 00000000..ed408251 --- /dev/null +++ b/coresdk/src/coresdk/genai.cpp @@ -0,0 +1,46 @@ +// +// genai.cpp +// splashkit +// +// Created by Sean Boettger on 20/12/25. +// + +#include "genai_driver.h" +#include "utility_functions.h" + +namespace splashkit_lib +{ + + string generate_reply(string prompt) + { + llamacpp::init(); + + string path = path_from( {path_to_user_home(), ".splashkit", "models"} ); + + // TODO: add auto download & choices for at least the following + //"Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf" + //"Qwen3-1.7B-Q8_0.gguf" + //"Qwen3-0.6B-Q8_0.gguf" + llamacpp::model model = llamacpp::create_model(path + "Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf"); + + if (!model.valid) return ""; + + std::string formatted = llamacpp::format_chat(model, { + {"user", prompt} + }); + llamacpp::llama_tokens tokens = llamacpp::tokenize_string(model, formatted); + + llamacpp::context ctx = llamacpp::start_context(model, tokens, 4096); + while (!llamacpp::context_step(ctx)){ + // just wait until it completes + // we could also stream the text to the user through a callback + }; + + std::string result = ctx.ctx_string; + + llamacpp::delete_context(ctx); + llamacpp::delete_model(model); + + return result; + } +} diff --git a/coresdk/src/coresdk/genai.h b/coresdk/src/coresdk/genai.h new file mode 100644 index 00000000..d1175872 --- /dev/null +++ b/coresdk/src/coresdk/genai.h @@ -0,0 +1,34 @@ +/** + * @header genai + * @author Sean Boettger + * @brief SplashKit gives you a simple way to use and embed local AIs in your projects, + * that run on your own computer. + * + * @attribute group generative_ai + * @attribute static generative_ai + */ + +#ifndef genai_hpp +#define genai_hpp + +#include +#include + +using std::string; + +namespace splashkit_lib +{ + + /** + * @brief Generates a reply to a textual prompt by a language model + * + * The language model will respond to the textual prompt in a chat style format. It will follow instructions and answer questions. + * + * @param prompt The prompt for the language model to reply to. + * + * @returns The generated reply. 
+ */ + string generate_reply(string prompt); + +} +#endif /* genai_hpp */ From 8c1f662fc08fbc3d0f575c480ce7a9f6591e5894 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sat, 20 Dec 2025 02:35:57 +1100 Subject: [PATCH 04/19] Add simple GenAI test --- coresdk/src/test/test_genai.cpp | 30 ++++++++++++++++++++++++++++++ coresdk/src/test/test_main.cpp | 1 + coresdk/src/test/test_main.h | 1 + 3 files changed, 32 insertions(+) create mode 100644 coresdk/src/test/test_genai.cpp diff --git a/coresdk/src/test/test_genai.cpp b/coresdk/src/test/test_genai.cpp new file mode 100644 index 00000000..2cb16fca --- /dev/null +++ b/coresdk/src/test/test_genai.cpp @@ -0,0 +1,30 @@ +// +// test_genai.cpp +// splashkit +// +// Created by Sean Boettger on 20/12/2025. +// + +#include "genai.h" +#include "terminal.h" +#include "utils.h" +#include +#include + +using namespace std; +using namespace splashkit_lib; + +void run_genai_test() +{ + write("User\n> "); + string prompt = read_line(); + + write("LLM\n> (generating...)"); + string response = generate_reply(prompt); + write_line("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\033[K" + response); + + delay(300); + + write_line("-- Press enter to end --"); + read_line(); +} diff --git a/coresdk/src/test/test_main.cpp b/coresdk/src/test/test_main.cpp index 0b6e8ab9..e4b7750a 100644 --- a/coresdk/src/test/test_main.cpp +++ b/coresdk/src/test/test_main.cpp @@ -68,6 +68,7 @@ void setup_tests() add_test("GPIO - SPI MAX7219 LED matrix Tests", run_gpio_spi_led_matrix_tests); add_test("GPIO - I2C HT16K33 LED matrix Tests", run_gpio_i2c_led_matrix_tests); add_test("GPIO - I2C HT16K33 LED 14 Segment Tests", run_gpio_i2c_quad_14_seg_test); + add_test("Gen AI", run_genai_test); } int main(int argv, char **args) diff --git a/coresdk/src/test/test_main.h b/coresdk/src/test/test_main.h index 1beddfc8..89f42267 100644 --- a/coresdk/src/test/test_main.h +++ b/coresdk/src/test/test_main.h @@ -44,5 +44,6 @@ void run_gpio_i2c_quad_14_seg_test(); void run_terminal_test(); void run_logging_test(); void run_ui_test(); +void run_genai_test(); #endif /* test_main_h */ From f172f7b76d7952c7f27acf966e81004b68ee3580 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sun, 21 Dec 2025 21:03:19 +1100 Subject: [PATCH 05/19] GenAI add custom logger --- coresdk/src/backend/genai_driver.cpp | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/coresdk/src/backend/genai_driver.cpp b/coresdk/src/backend/genai_driver.cpp index 4ca64d86..a290e801 100644 --- a/coresdk/src/backend/genai_driver.cpp +++ b/coresdk/src/backend/genai_driver.cpp @@ -27,6 +27,18 @@ namespace splashkit_lib ggml_backend_load_all(); + // Create custom logger with colouring + el::Configurations conf; + conf.setToDefault(); + conf.setGlobally(el::ConfigurationType::Format, "%level -> %msg"); + conf.setGlobally(el::ConfigurationType::Filename, "logs/splashkit.log"); + + // `el::Loggers::addFlag(el::LoggingFlag::ColoredTerminalOutput);` would be better but has global effect + conf.set(el::Level::Warning, el::ConfigurationType::Format, "\x1b[33m%level -> %msg\x1b[0m"); + conf.set(el::Level::Error, el::ConfigurationType::Format, "\x1b[31m%level -> %msg\x1b[0m"); + + el::Loggers::reconfigureLogger("GenAI", conf); + initialized = true; } } @@ -43,7 +55,7 @@ namespace splashkit_lib if (model == NULL) { - LOG(ERROR) << "Unable to load language model from " << path << " - please check if it exists."; + CLOG(ERROR, "GenAI") << "Unable to load language model from " << path << " - it may be corrupted or missing."; return 
{false}; } @@ -103,7 +115,7 @@ namespace splashkit_lib // recieve the tokens if (llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { - LOG(ERROR) << "Failed to tokenize the prompt."; + CLOG(ERROR, "GenAI") << "Failed to tokenize the prompt."; return {}; } @@ -122,7 +134,7 @@ namespace splashkit_lib if (ctx == NULL) { - LOG(ERROR) << "Failed to create the language model context."; + CLOG(ERROR, "GenAI") << "Failed to create the language model context."; return {nullptr}; } @@ -149,7 +161,7 @@ namespace splashkit_lib { llama_free(ctx); llama_sampler_free(smpl); - LOG(ERROR) << "Failed to encode prompt."; + CLOG(ERROR, "GenAI") << "Failed to encode prompt."; return {nullptr}; } @@ -182,7 +194,7 @@ namespace splashkit_lib // Decode current batch with the model if (llama_decode(ctx.ctx, ctx.batch)) { - LOG(ERROR) << "Failed to process response from language model."; + CLOG(ERROR, "GenAI") << "Failed to process response from language model."; return -1; } @@ -199,7 +211,7 @@ namespace splashkit_lib int n = llama_token_to_piece(ctx.vocab, new_token_id, buf, sizeof(buf), 0, true); if (n < 0) { - LOG(ERROR) << "Failed to convert response token from language model."; + CLOG(ERROR, "GenAI") << "Failed to convert response token from language model."; return -1; } From ddac4c96929c54247a8e6e763ed06a5e10f4855c Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sun, 21 Dec 2025 21:18:42 +1100 Subject: [PATCH 06/19] Add sk_http_get_file function - better for large files/handles resuming --- coresdk/src/backend/web_driver.cpp | 61 ++++++++++++++++++++++++++++++ coresdk/src/backend/web_driver.h | 1 + 2 files changed, 62 insertions(+) diff --git a/coresdk/src/backend/web_driver.cpp b/coresdk/src/backend/web_driver.cpp index f388e083..2305e91b 100644 --- a/coresdk/src/backend/web_driver.cpp +++ b/coresdk/src/backend/web_driver.cpp @@ -206,6 +206,67 @@ namespace splashkit_lib return _create_response(curl_handle, res, data_read); } + struct _sk_http_get_file_callback_data + { + void (*user_callback)(unsigned long, unsigned long); + int resuming_from; + }; + + int _sk_http_get_file_callback(_sk_http_get_file_callback_data* data, curl_off_t dltotal, curl_off_t dlnow, curl_off_t ultotal, curl_off_t ulnow) + { + data->user_callback(dltotal == 0 ? 0 : (data->resuming_from + dltotal), data->resuming_from + dlnow); + return 0; + } + + sk_http_response *sk_http_get_file(const string &filename, const string &host, unsigned short port, void (*user_callback)(unsigned long, unsigned long)) + { + const string temp_extension = ".temp"; + string temp_filename = filename+temp_extension; + + FILE *file = fopen(temp_filename.c_str(), "ab+"); + + // find resume point + fseek(file, 0L, SEEK_END); + curl_off_t resume_from = ftell(file); + + // init the curl session + CURL *curl_handle = curl_easy_init(); + CURLcode res; + + _init_curl(curl_handle, host, port); + + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, write_data); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, file); + + _sk_http_get_file_callback_data callback_data; + if (user_callback) + { + curl_easy_setopt(curl_handle, CURLOPT_XFERINFOFUNCTION, _sk_http_get_file_callback); + curl_easy_setopt(curl_handle, CURLOPT_XFERINFODATA, &callback_data); + curl_easy_setopt(curl_handle, CURLOPT_NOPROGRESS, 0); + + callback_data.user_callback = user_callback; + callback_data.resuming_from = resume_from; + } + + curl_easy_setopt(curl_handle, CURLOPT_RESUME_FROM_LARGE, resume_from); + + // get it! 
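+        // (If a partial .temp file already exists from an earlier attempt, the
+        //  transfer resumes from its current size via CURLOPT_RESUME_FROM_LARGE
+        //  instead of starting again from byte zero.)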
+ res = curl_easy_perform(curl_handle); + + fclose(file); + + // try renaming the temp file if the download was okay - rename returns 0 on success + if (res == CURLE_OK && rename(temp_filename.c_str(), filename.c_str())) + { + LOG(WARNING) << "Failed to rename temporary download file " << temp_filename << " to " << filename; + return nullptr; + } + + request_stream data_read = { nullptr, 0 }; + return _create_response(curl_handle, res, data_read); + } + sk_http_response *sk_http_put(const string &host, unsigned short port, const string &body) { request_stream data_read = { nullptr, 0 }; diff --git a/coresdk/src/backend/web_driver.h b/coresdk/src/backend/web_driver.h index e5e54de2..f5e4810a 100644 --- a/coresdk/src/backend/web_driver.h +++ b/coresdk/src/backend/web_driver.h @@ -17,6 +17,7 @@ namespace splashkit_lib sk_http_response *sk_http_post(const string &host, unsigned short port, const string &body); sk_http_response *sk_http_get(const string &host, unsigned short port); + sk_http_response *sk_http_get_file(const string &filename, const string &host, unsigned short port, void (*user_callback)(unsigned long, unsigned long)); sk_http_response *sk_http_put(const string &host, unsigned short port, const string &body); sk_http_response *sk_http_delete(const string &host, unsigned short port, const string &body); sk_http_response *sk_http_make_request(const sk_http_request &request); From 97b3fa8ec32384bdc4838b9a5d4c772e7c838633 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sun, 21 Dec 2025 22:43:47 +1100 Subject: [PATCH 07/19] Add GenAI model downloading --- coresdk/src/coresdk/genai.cpp | 104 +++++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/coresdk/src/coresdk/genai.cpp b/coresdk/src/coresdk/genai.cpp index ed408251..881a1455 100644 --- a/coresdk/src/coresdk/genai.cpp +++ b/coresdk/src/coresdk/genai.cpp @@ -7,21 +7,123 @@ #include "genai_driver.h" #include "utility_functions.h" +#include "web_driver.h" +#include "terminal.h" + +#include + +using std::to_string; namespace splashkit_lib { + /* terminal util functions in lieu of ncurses*/ + void terminal_erase_left(int count /* -1 for all */) + { + if (count == 0) + return; + + if (count == -1) + write("\r\033[K"); + else + write("\033["+to_string(count)+"D\033[K"); + } + + std::vector terminal_stack; + + void terminal_push(const string &str) + { + write(str); + terminal_stack.push_back(str.size()); + } + + void terminal_pop() + { + terminal_erase_left(terminal_stack.back()); + terminal_stack.pop_back(); + } + + bool download_with_progress_bar(string filename, string url) + { + auto callback = [](unsigned long expected_size, unsigned long current_size) + { + terminal_pop(); + + if (expected_size == 0) + { + terminal_push(""); + return; + } + + static int spinner_index = 0; + + const int progress_bar_length = 10; + const string spinner = "|/-\\"; + + int expected_mb = expected_size / (1024 * 1024); + int current_mb = current_size / (1024 * 1024); + + // construct progress bar + int progress_bar_filled = 0; + if (expected_size > 0) + progress_bar_filled = (int)(progress_bar_length * current_size/(double)expected_size); + if (progress_bar_filled > progress_bar_length) + progress_bar_filled = progress_bar_length; + + string progress_bar = string(progress_bar_filled, '=') + string(progress_bar_length-progress_bar_filled, ' '); + if (progress_bar_filled <= progress_bar_length) + progress_bar[progress_bar_filled] = spinner[spinner_index++ % spinner.size()]; + + // write message + 
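+            // e.g. "====/     | (450mb / 1024mb)" part-way through a ~1GB download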
terminal_push(progress_bar + "| (" + to_string(current_mb) + "mb / " + to_string(expected_mb) + "mb)"); + }; + + terminal_push(""); + + sk_http_response * resp = sk_http_get_file(filename, url, 443, callback); + + terminal_pop(); + + return resp != nullptr && resp->code >= 200 && resp->code < 300; + } + + bool ensure_exists_or_download(string path, string url, string message) + { + if (std::filesystem::exists(path)) + return true; + + terminal_push(message); + + bool result = download_with_progress_bar(path, url); + + terminal_pop(); + + return result; + } + string generate_reply(string prompt) { llamacpp::init(); string path = path_from( {path_to_user_home(), ".splashkit", "models"} ); + path += "Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf"; + + string model_name = "Qwen3 4B Instruct"; + + if (!ensure_exists_or_download(path, + "https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/resolve/main/Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf?download=true", + " ::: Downloading Language Model: "+model_name + " |" + )) + { + CLOG(ERROR, "GenAI") << "Failed to download language model - see error above."; + return ""; + } // TODO: add auto download & choices for at least the following //"Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf" //"Qwen3-1.7B-Q8_0.gguf" //"Qwen3-0.6B-Q8_0.gguf" - llamacpp::model model = llamacpp::create_model(path + "Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf"); + llamacpp::model model = llamacpp::create_model(path); if (!model.valid) return ""; From bedecf29f7226219d8c968af9b6b251e72cd380d Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Mon, 22 Dec 2025 06:20:48 +1100 Subject: [PATCH 08/19] Add language_model and language_model_options struct/enum --- coresdk/src/coresdk/types.h | 69 +++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/coresdk/src/coresdk/types.h b/coresdk/src/coresdk/types.h index 3573af9f..78ff4cae 100644 --- a/coresdk/src/coresdk/types.h +++ b/coresdk/src/coresdk/types.h @@ -548,5 +548,74 @@ namespace splashkit_lib BUBBLE = 4, BUBBLE_MULTICOLORED = 5 }; + + /** + * Language Models: + * Choose between different language models to trade off speed and intelligence + * Each model is scaled to fit within 1~2GB and will be automatically downloaded when needed - feel free to try them out! + * + * @constant QWEN3_0_6B_BASE Qwen3 0.6B Base model - small, extremely fast and good for text commpletion. Very limited world knowledge. + * @constant QWEN3_0_6B_INSTRUCT Qwen3 0.6B Instruct model (default) - small, extremely fast and can follow simple instructions. Very limited world knowledge. + * @constant QWEN3_0_6B_THINKING Qwen3 0.6B Thinking model - small, extremely fast and can follow more specific instructions, but has a short delay before starting to reply. Very limited world knowledge. + * @constant QWEN3_1_7B_BASE Qwen3 1.7B Base model - decently fast and good for text commpletion. Limited world knowledge. + * @constant QWEN3_1_7B_INSTRUCT Qwen3 1.7B Instruct model - decently fast and can follow instructions. Limited world knowledge. + * @constant QWEN3_1_7B_THINKING Qwen3 1.7B Thinking model - decently fast and can follow more difficult instructions, but has a delay before starting to reply. Limited world knowledge. 
+ * @constant QWEN3_4B_BASE Qwen3 4B Base model - slower but excellent for text commpletion/pattern based completion + * @constant QWEN3_4B_INSTRUCT Qwen3 4B Instruct model - slower but can follow complex instructions + * @constant QWEN3_4B_THINKING Qwen3 4B Thinking model - slower but can follow complex and specific instructions, but has a potentially long delay before starting to reply + * @constant GEMMA_270M_BASE Gemma3 270M Base model - tiny, extremely fast, and good for text completion. Very limited world knowledge. + * @constant GEMMA_270M_BASE Gemma3 270M Instruct model - tiny, extremely fast, and good for very simple instructions. Very limited world knowledge. + * @constant GEMMA_1B_BASE Gemma3 1B Base model - fast and good for text completion. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA_1B_INSTRUCT Gemma3 1B Instruct model - fast and can follow instructions. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA_4B_BASE Gemma3 4B Base model - slower but good for text commpletion/pattern based completion. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA_4B_INSTRUCT Gemma3 4B Instruct model - slower but can follow complex instructions. Has decent world knowledge and multi-lingual abilities. + */ + enum language_model + { + QWEN3_0_6B_BASE = 4, + QWEN3_0_6B_INSTRUCT = 5, + QWEN3_0_6B_THINKING = 6, + QWEN3_1_7B_BASE = 8, + QWEN3_1_7B_INSTRUCT = 9, + QWEN3_1_7B_THINKING = 10, + QWEN3_4B_BASE = 12, + QWEN3_4B_INSTRUCT = 13, + QWEN3_4B_THINKING = 14, + GEMMA3_270M_BASE = 16, + GEMMA3_270M_INSTRUCT = 17, + GEMMA3_1B_BASE = 20, + GEMMA3_1B_INSTRUCT = 21, + GEMMA3_4B_BASE = 24, + GEMMA3_4B_INSTRUCT = 25, + }; + + /** + * Language model options allow you to customize the language model used. These should be + * initialised using functions such as `option_language_model`. + * + * @field name The name of the model (used in diagnostic messages). + * @field url A URL to download a model from. + * @field path A path to a custom language model (.gguf) file on your computer/a place to download it to. + * @field max_tokens The maximum number of tokens to output when replying. One word is approximately two tokens. + * @field temperature Increases the likelihood of unlikely tokens to be chosen. + * @field top_p Only choose from the top P most likely tokens. + * @field top_k Only choose from the top K most likely tokens. + * @field min_p Remove tokens less likely than P. + * @field presence_penalty Penalizes words that have been used once, making them less likely. Can reduce repetition. + * @field prompt_append A string to append to prompts automatically. 
+ */ + struct language_model_options + { + string name; + string url; + string path; + int max_tokens; + double temperature; + double top_p; + int top_k; + double min_p; + double presence_penalty; + string prompt_append; + }; } #endif /* types_hpp */ From 7d1ad32de14371ec57c56c6522fc196aa0fc4762 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Mon, 22 Dec 2025 07:09:35 +1100 Subject: [PATCH 09/19] Add default models and allow choosing model - also add various overloads --- coresdk/src/coresdk/genai.cpp | 205 ++++++++++++++++++++++++++++---- coresdk/src/coresdk/genai.h | 77 ++++++++++++ coresdk/src/test/test_genai.cpp | 2 +- 3 files changed, 263 insertions(+), 21 deletions(-) diff --git a/coresdk/src/coresdk/genai.cpp b/coresdk/src/coresdk/genai.cpp index 881a1455..1c923fd0 100644 --- a/coresdk/src/coresdk/genai.cpp +++ b/coresdk/src/coresdk/genai.cpp @@ -6,6 +6,7 @@ // #include "genai_driver.h" +#include "genai.h" #include "utility_functions.h" #include "web_driver.h" #include "terminal.h" @@ -16,6 +17,13 @@ using std::to_string; namespace splashkit_lib { + const language_model DEFAULT_LANGUAGE_MODEL = QWEN3_0_6B_INSTRUCT; + + const int default_max_tokens_base = 256; // base has a higher likelihood of running forever for no reason, better to limit it early + const int default_max_tokens_instruct = 4096; + const int default_max_tokens_thinking = 4096; + + extern const std::array models; // defined at end of file /* terminal util functions in lieu of ncurses*/ void terminal_erase_left(int count /* -1 for all */) @@ -72,7 +80,7 @@ namespace splashkit_lib string progress_bar = string(progress_bar_filled, '=') + string(progress_bar_length-progress_bar_filled, ' '); if (progress_bar_filled <= progress_bar_length) - progress_bar[progress_bar_filled] = spinner[spinner_index++ % spinner.size()]; + progress_bar[progress_bar_filled] = spinner[(spinner_index++)/2 % spinner.size()]; // write message terminal_push(progress_bar + "| (" + to_string(current_mb) + "mb / " + to_string(expected_mb) + "mb)"); @@ -101,38 +109,33 @@ namespace splashkit_lib return result; } - string generate_reply(string prompt) + string __generate_common(string prompt, language_model_options options, bool format_chat) { llamacpp::init(); - string path = path_from( {path_to_user_home(), ".splashkit", "models"} ); - path += "Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf"; - - string model_name = "Qwen3 4B Instruct"; - - if (!ensure_exists_or_download(path, - "https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/resolve/main/Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf?download=true", - " ::: Downloading Language Model: "+model_name + " |" - )) + if (options.url != "" && !ensure_exists_or_download(options.path, options.url, " ::: Downloading Language Model: " + options.name + " |")) { CLOG(ERROR, "GenAI") << "Failed to download language model - see error above."; return ""; } - // TODO: add auto download & choices for at least the following - //"Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf" - //"Qwen3-1.7B-Q8_0.gguf" - //"Qwen3-0.6B-Q8_0.gguf" - llamacpp::model model = llamacpp::create_model(path); + llamacpp::model model = llamacpp::create_model(options.path); if (!model.valid) return ""; - std::string formatted = llamacpp::format_chat(model, { - {"user", prompt} - }); + std::string formatted = prompt; + + if (format_chat) + { + llamacpp::format_chat(model, { + { + "user", prompt + options.prompt_append + } + }); + } llamacpp::llama_tokens tokens = llamacpp::tokenize_string(model, formatted); - llamacpp::context ctx = 
llamacpp::start_context(model, tokens, 4096); + llamacpp::context ctx = llamacpp::start_context(model, tokens, options.max_tokens); while (!llamacpp::context_step(ctx)){ // just wait until it completes // we could also stream the text to the user through a callback @@ -145,4 +148,166 @@ namespace splashkit_lib return result; } + + + string generate_reply(string prompt) + { + return generate_reply(DEFAULT_LANGUAGE_MODEL, prompt); + } + + string generate_reply(language_model model, string prompt) + { + return generate_reply(prompt, option_language_model(model)); + } + + string generate_reply(string prompt, language_model_options options) + { + return __generate_common(prompt, options, true); + } + + string generate_text(string text) + { + return generate_text(DEFAULT_LANGUAGE_MODEL, text); + } + + string generate_text(language_model model, string text) + { + return generate_text(text, option_language_model(model)); + } + + string generate_text(string text, language_model_options options) + { + return __generate_common(text, options, false); + } + + language_model_options option_language_model(language_model model) + { + if (model < 0 || model >= models.size() || models[model].name == "") + { + model = DEFAULT_LANGUAGE_MODEL; + CLOG(WARNING, "GenAI") << "Invalid model selected, defaulting to '" << models[model].name << "'"; + } + + string home_path = path_from( {path_to_user_home(), ".splashkit", "models"} ); + + language_model_options options = models[model]; + options.path = home_path + options.path; + + return options; + } + + // -------------------------------------------------------------- + + + // default model definitions + + const std::array models = {{ + [0]={}, [1]={}, [2]={}, [3]={}, + + [QWEN3_0_6B_BASE] = { + "Qwen3 0.6B Base", + "https://huggingface.co/mradermacher/Qwen3-0.6B-Base-GGUF/resolve/main/Qwen3-0.6B-Base.Q8_0.gguf?download=true", + "Qwen3-0.6B-Base.Q8_0.gguf", + default_max_tokens_base, 0.7, 0.8, 20, 0, 1.5 + }, + [QWEN3_0_6B_INSTRUCT] = { + "Qwen3 0.6B Instruct", + "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf?download=true", + "Qwen3-0.6B-Q8_0.gguf", + default_max_tokens_instruct, 0.7, 0.8, 20, 0, 1.5, " /no_think" + }, + [QWEN3_0_6B_THINKING] = { + "Qwen3 0.6B Thinking", + "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf?download=true", + "Qwen3-0.6B-Q8_0.gguf", + default_max_tokens_thinking, 0.6, 0.95, 20, 0, 1.5 + }, + + [7]={}, + + [QWEN3_1_7B_BASE] = { + "Qwen3 1.7B Base", + "https://huggingface.co/mradermacher/Qwen3-1.7B-Base-GGUF/resolve/main/Qwen3-1.7B-Base.Q8_0.gguf?download=true", + "Qwen3-1.7B-Base.Q8_0.gguf", + default_max_tokens_base, 0.7, 0.8, 20, 0, 1.5 + }, + [QWEN3_1_7B_INSTRUCT] = { + "Qwen3 1.7B Instruct", + "https://huggingface.co/Qwen/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q8_0.gguf?download=true", + "Qwen3-1.7B-Q8_0.gguf", + default_max_tokens_instruct, 0.7, 0.8, 20, 0, 1.5, " /no_think" + }, + [QWEN3_1_7B_THINKING] = { + "Qwen3 1.7B Thinking", + "https://huggingface.co/Qwen/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q8_0.gguf?download=true", + "Qwen3-1.7B-Q8_0.gguf", + default_max_tokens_thinking, 0.6, 0.95, 20, 0, 1.5 + }, + + [11]={}, + + [QWEN3_4B_BASE] = { + "Qwen3 4B Base", + "https://huggingface.co/mradermacher/Qwen3-4B-Base-GGUF/resolve/main/Qwen3-4B-Base.Q2_K.gguf?download=true", + "Qwen3-4B-Base.Q2_K.gguf", + default_max_tokens_base, 0.7, 0.8, 20, 0, 0 + }, + [QWEN3_4B_INSTRUCT] = { + "Qwen3 4B Instruct", + 
"https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/resolve/main/Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf?download=true", + "Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf", + default_max_tokens_instruct, 0.7, 0.8, 20, 0, 0 + }, + [QWEN3_4B_THINKING] = { + "Qwen3 4B Thinking", + "https://huggingface.co/unsloth/Qwen3-4B-Thinking-2507-GGUF/resolve/main/Qwen3-4B-Thinking-2507-UD-Q2_K_XL.gguf?download=true", + "Qwen3-4B-Thinking-2507-UD-Q2_K_XL.gguf", + default_max_tokens_thinking, 0.6, 0.95, 20, 0, 0 + }, + + [15]={}, + + [GEMMA3_270M_BASE] = { + "Gemma3 270M Base", + "https://huggingface.co/ggml-org/gemma-3-270m-GGUF/resolve/main/gemma-3-270m-Q8_0.gguf?download=true", + "gemma-3-270m-Q8_0.gguf", + default_max_tokens_base, 1.0, 0.95, 64, 0, 0 + }, + [GEMMA3_270M_INSTRUCT] = { + "Gemma3 270M Instruct", + "https://huggingface.co/unsloth/gemma-3-270m-it-GGUF/resolve/main/gemma-3-270m-it-Q8_0.gguf?download=true", + "gemma-3-270m-it-Q8_0.gguf", + default_max_tokens_instruct, 1.0, 0.95, 64, 0, 0 + }, + + [18]={}, [19]={}, + + [GEMMA3_1B_BASE] = { + "Gemma3 1B Base", + "https://huggingface.co/mradermacher/gemma-3-1b-pt-GGUF/resolve/main/gemma-3-1b-pt.Q8_0.gguf?download=true", + "gemma-3-1b-pt.Q8_0.gguf", + default_max_tokens_base, 1.0, 0.95, 64, 0, 0 + }, + [GEMMA3_1B_INSTRUCT] = { + "Gemma3 1B Instruct", + "https://huggingface.co/unsloth/gemma-3-1b-it-GGUF/resolve/main/gemma-3-1b-it-Q8_0.gguf?download=true", + "gemma-3-1b-it-Q8_0.gguf", + default_max_tokens_instruct, 1.0, 0.95, 64, 0, 0 + }, + + [22]={}, [23]={}, + + [GEMMA3_4B_BASE] = { + "Gemma3 4B Base", + "https://huggingface.co/mradermacher/gemma-3-4b-pt-GGUF/resolve/main/gemma-3-4b-pt.Q2_K.gguf?download=true", + "gemma-3-4b-pt.Q2_K.gguf", + default_max_tokens_base, 1.0, 0.95, 64, 0, 0 + }, + [GEMMA3_4B_INSTRUCT] = { + "Gemma3 4B Instruct", + "https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/resolve/main/gemma-3-4b-it-UD-IQ3_XXS.gguf?download=true", + "gemma-3-4b-it-UD-IQ3_XXS.gguf", + default_max_tokens_instruct, 1.0, 0.95, 64, 0, 0 + } + }}; } diff --git a/coresdk/src/coresdk/genai.h b/coresdk/src/coresdk/genai.h index d1175872..c8e90eb6 100644 --- a/coresdk/src/coresdk/genai.h +++ b/coresdk/src/coresdk/genai.h @@ -11,6 +11,8 @@ #ifndef genai_hpp #define genai_hpp +#include "types.h" + #include #include @@ -23,6 +25,7 @@ namespace splashkit_lib * @brief Generates a reply to a textual prompt by a language model * * The language model will respond to the textual prompt in a chat style format. It will follow instructions and answer questions. + * Instruct or Thinking models are recommended. Base models likely won't output sensible results. * * @param prompt The prompt for the language model to reply to. * @@ -30,5 +33,79 @@ namespace splashkit_lib */ string generate_reply(string prompt); + /** + * @brief Generates a reply to a textual prompt by a language model + * + * The language model will respond to the textual prompt in a chat style format. It will follow instructions and answer questions. + * Instruct or Thinking models are recommended. Base models likely won't output sensible results. + * + * @param model The language model to use + * @param prompt The prompt for the language model to reply to. + * + * @returns The generated reply. + */ + string generate_reply(language_model model, string prompt); + + /** + * @brief Generates a reply to a textual prompt by a language model + * + * The language model will respond to the textual prompt in a chat style format. It will follow instructions and answer questions. 
+ * Instruct or Thinking models are recommended. Base models likely won't output sensible results. + * + * @param prompt The prompt for the language model to reply to. + * @param options The generation options - use the `option_` functions to create this, for instance `option_language_model` + * + * @returns The generated reply. + */ + string generate_reply(string prompt, language_model_options options); + + + /** + * @brief Generates text that continues from a prompt + * + * The language model will continue predicting text based on patterns in the prompt - it will not directly follow instructions or answer questions. + * Base models are recommended; Instruct and Thinking models may work. + * + * @param text The input text for the language model to continue. + * + * @returns The generated reply. + */ + string generate_text(string text); + + /** + * @brief Generates text that continues from a prompt + * + * The language model will continue predicting text based on patterns in the prompt - it will not directly follow instructions or answer questions. + * Base models are recommended; Instruct and Thinking models may work. + * + * @param model The language model to use + * @param text The input text for the language model to continue. + * + * @returns The generated reply. + */ + string generate_text(language_model model, string text); + + /** + * @brief Generates text that continues from a prompt + * + * The language model will continue predicting text based on patterns in the prompt - it will not directly follow instructions or answer questions. + * Base models are recommended; Instruct and Thinking models may work. + * + * @param text The input text for the language model to continue. + * @param options The generation options - use the `option_` functions to create this, for instance `option_language_model` + * + * @returns The generated reply. + */ + string generate_text(string text, language_model_options options); + + /** + * Use this option to choose which language model to use, and initialize its default settings + * + * @param model The language model to use + * + * @return Language model options that will use that model and its default settings. 
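+     *
+     * Example (illustrative):
+     *
+     *     language_model_options opts = option_language_model(QWEN3_1_7B_INSTRUCT);
+     *     opts.max_tokens = 512; // keep replies short
+     *     string reply = generate_reply("Name three colours.", opts);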
+ */ + language_model_options option_language_model(language_model model); + } #endif /* genai_hpp */ diff --git a/coresdk/src/test/test_genai.cpp b/coresdk/src/test/test_genai.cpp index 2cb16fca..84aba027 100644 --- a/coresdk/src/test/test_genai.cpp +++ b/coresdk/src/test/test_genai.cpp @@ -20,7 +20,7 @@ void run_genai_test() string prompt = read_line(); write("LLM\n> (generating...)"); - string response = generate_reply(prompt); + string response = generate_reply(QWEN3_0_6B_INSTRUCT, prompt); write_line("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\033[K" + response); delay(300); From 3f94760fb2797e52f3d8826a3ef87b89c3336ce9 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Mon, 22 Dec 2025 07:31:06 +1100 Subject: [PATCH 10/19] Pass inference settings to genai_driver --- coresdk/src/backend/genai_driver.cpp | 13 +++++++------ coresdk/src/backend/genai_driver.h | 10 +++++++++- coresdk/src/coresdk/genai.cpp | 8 +++++++- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/coresdk/src/backend/genai_driver.cpp b/coresdk/src/backend/genai_driver.cpp index a290e801..3e8e56c5 100644 --- a/coresdk/src/backend/genai_driver.cpp +++ b/coresdk/src/backend/genai_driver.cpp @@ -122,7 +122,7 @@ namespace splashkit_lib return prompt_tokens; } - context start_context(model& mdl, llama_tokens& starting_context, int max_length) + context start_context(model& mdl, llama_tokens& starting_context, int max_length, inference_settings settings) { // Create the context llama_context_params ctx_params = llama_context_default_params(); @@ -145,11 +145,12 @@ namespace splashkit_lib // Setup some reasonable defaults // TODO: Make these user adjustable - llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.00f, 1)); - llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.6f)); - llama_sampler_chain_add(smpl, llama_sampler_init_top_k(20)); - llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.95, 0)); - //llama_sampler_chain_add(smpl, llama_sampler_init_penalties(64, 1, 0, 0)); + llama_sampler_chain_add(smpl, llama_sampler_init_min_p(settings.min_p, 1)); + llama_sampler_chain_add(smpl, llama_sampler_init_temp(settings.temperature)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_k(settings.top_k)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_p(settings.top_p, 0)); + if (settings.presence_penalty > 0) + llama_sampler_chain_add(smpl, llama_sampler_init_penalties(64, 0, 0, settings.presence_penalty)); llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); // Prepare batch and encode starting context diff --git a/coresdk/src/backend/genai_driver.h b/coresdk/src/backend/genai_driver.h index 8f175017..658c5155 100644 --- a/coresdk/src/backend/genai_driver.h +++ b/coresdk/src/backend/genai_driver.h @@ -24,6 +24,14 @@ namespace splashkit_lib const char* tmpl; }; + struct inference_settings { + double temperature = 0.6; + double top_p = 0.95; + int top_k = 20; + double min_p = 0; + double presence_penalty = 0; + }; + struct message { std::string role; std::string content; @@ -51,7 +59,7 @@ namespace splashkit_lib std::string format_chat(model& mdl, const std::vector& messages); llama_tokens tokenize_string(model& mdl, const std::string& prompt); - context start_context(model& mdl, llama_tokens& starting_context, int max_length); + context start_context(model& mdl, llama_tokens& starting_context, int max_length, inference_settings settings); int context_step(context& ctx); void delete_context(context& ctx); } diff --git a/coresdk/src/coresdk/genai.cpp 
b/coresdk/src/coresdk/genai.cpp index 1c923fd0..e7f8435d 100644 --- a/coresdk/src/coresdk/genai.cpp +++ b/coresdk/src/coresdk/genai.cpp @@ -135,7 +135,13 @@ namespace splashkit_lib } llamacpp::llama_tokens tokens = llamacpp::tokenize_string(model, formatted); - llamacpp::context ctx = llamacpp::start_context(model, tokens, options.max_tokens); + llamacpp::context ctx = llamacpp::start_context(model, tokens, options.max_tokens, { + options.temperature, + options.top_p, + options.top_k, + options.min_p, + options.presence_penalty + }); while (!llamacpp::context_step(ctx)){ // just wait until it completes // we could also stream the text to the user through a callback From f5da09f9104d4ff32f4f87de50955d3b71c1414b Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Mon, 22 Dec 2025 07:31:23 +1100 Subject: [PATCH 11/19] genai_driver formatting fixes --- coresdk/src/backend/genai_driver.cpp | 6 +++--- coresdk/src/backend/genai_driver.h | 15 ++++++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/coresdk/src/backend/genai_driver.cpp b/coresdk/src/backend/genai_driver.cpp index 3e8e56c5..7b7814ec 100644 --- a/coresdk/src/backend/genai_driver.cpp +++ b/coresdk/src/backend/genai_driver.cpp @@ -14,7 +14,8 @@ namespace splashkit_lib { - namespace llamacpp { + namespace llamacpp + { static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data){/* nothing, avoid unnecessary logging*/} @@ -143,8 +144,7 @@ namespace splashkit_lib sparams.no_perf = true; llama_sampler * smpl = llama_sampler_chain_init(sparams); - // Setup some reasonable defaults - // TODO: Make these user adjustable + // Set up sampler llama_sampler_chain_add(smpl, llama_sampler_init_min_p(settings.min_p, 1)); llama_sampler_chain_add(smpl, llama_sampler_init_temp(settings.temperature)); llama_sampler_chain_add(smpl, llama_sampler_init_top_k(settings.top_k)); diff --git a/coresdk/src/backend/genai_driver.h b/coresdk/src/backend/genai_driver.h index 658c5155..a65d624e 100644 --- a/coresdk/src/backend/genai_driver.h +++ b/coresdk/src/backend/genai_driver.h @@ -16,15 +16,18 @@ namespace splashkit_lib { typedef unsigned int uint; - namespace llamacpp { - struct model { + namespace llamacpp + { + struct model + { bool valid; llama_model* model; const llama_vocab* vocab; const char* tmpl; }; - struct inference_settings { + struct inference_settings + { double temperature = 0.6; double top_p = 0.95; int top_k = 20; @@ -32,12 +35,14 @@ namespace splashkit_lib double presence_penalty = 0; }; - struct message { + struct message + { std::string role; std::string content; }; - struct context { + struct context + { llama_context* ctx; llama_sampler* smpl; llama_batch batch; From 1742e9b38c9f5b57e1b6f44deddef7c97a3ccba6 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Mon, 22 Dec 2025 08:53:08 +1100 Subject: [PATCH 12/19] Fix CMakeLists so llama.cpp links correctly on first make --- projects/cmake/CMakeLists.txt | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/projects/cmake/CMakeLists.txt b/projects/cmake/CMakeLists.txt index d14d57f7..6877d3cf 100644 --- a/projects/cmake/CMakeLists.txt +++ b/projects/cmake/CMakeLists.txt @@ -292,10 +292,16 @@ find_package(OpenMP REQUIRED) ExternalProject_Get_Property(llama_ext INSTALL_DIR) -set(LLAMA_LIB_FLAGS "${INSTALL_DIR}/lib/libllama.a" - "${INSTALL_DIR}/lib/libggml.a" - "${INSTALL_DIR}/lib/libggml-cpu.a" - "${INSTALL_DIR}/lib/libggml-base.a") +foreach(lib libllama libggml libggml-cpu libggml-base) + add_library(${lib} 
STATIC IMPORTED GLOBAL) + set_target_properties(${lib} PROPERTIES + IMPORTED_LOCATION + ${INSTALL_DIR}/lib/${lib}.a + ) + add_dependencies(${lib} llama_ext) +endforeach() + +set(LLAMA_LIB_FLAGS libllama libggml libggml-cpu libggml-base) # MACRO DEFINITIONS # add_definitions(-DELPP_THREAD_SAFE) From 47ddd98f7fd71ee168be4c61c015f2dbebeb6258 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Mon, 22 Dec 2025 23:55:58 +1100 Subject: [PATCH 13/19] Remove OpenMP dependency --- projects/cmake/CMakeLists.txt | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/projects/cmake/CMakeLists.txt b/projects/cmake/CMakeLists.txt index 6877d3cf..30a3ce95 100644 --- a/projects/cmake/CMakeLists.txt +++ b/projects/cmake/CMakeLists.txt @@ -276,20 +276,17 @@ ExternalProject_Add( -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_CUBLAS=OFF - -DLLAMA_CLBLAST=OFF -DLLAMA_METAL=OFF - -DLLAMA_OPENCL=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=OFF -DLLAMA_TOOLS_INSTALL=OFF -DCMAKE_BUILD_TYPE=Release -DGGML_STATIC=ON - -DLLAMA_STATIC=ON + -DGGML_OPENMP=OFF + -DLLAMA_OPENMP=OFF -DCMAKE_INSTALL_PREFIX= ) -find_package(OpenMP REQUIRED) - ExternalProject_Get_Property(llama_ext INSTALL_DIR) foreach(lib libllama libggml libggml-cpu libggml-base) @@ -309,7 +306,7 @@ add_definitions(-DELPP_THREAD_SAFE) #### END SETUP #### #### SplashKitBackend STATIC LIBRARY #### add_library(SplashKitBackend STATIC ${SOURCE_FILES} ${INCLUDE_FILES}) -target_link_libraries(SplashKitBackend ${LIB_FLAGS} ${LLAMA_LIB_FLAGS} OpenMP::OpenMP_CXX) +target_link_libraries(SplashKitBackend ${LIB_FLAGS} ${LLAMA_LIB_FLAGS}) if(RASPBERRY_PI) if(RASPBERRY_PI_5) From 7af66dcba18ec010e2585668fffe3caa1290bd1b Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Tue, 23 Dec 2025 07:46:13 +1100 Subject: [PATCH 14/19] Fix llama.cpp linking on windows --- projects/cmake/CMakeLists.txt | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/projects/cmake/CMakeLists.txt b/projects/cmake/CMakeLists.txt index 30a3ce95..84cc863c 100644 --- a/projects/cmake/CMakeLists.txt +++ b/projects/cmake/CMakeLists.txt @@ -289,16 +289,23 @@ ExternalProject_Add( ExternalProject_Get_Property(llama_ext INSTALL_DIR) -foreach(lib libllama libggml libggml-cpu libggml-base) +foreach(lib llama ggml ggml-cpu ggml-base) add_library(${lib} STATIC IMPORTED GLOBAL) - set_target_properties(${lib} PROPERTIES - IMPORTED_LOCATION - ${INSTALL_DIR}/lib/${lib}.a - ) + if (MSYS AND NOT "${lib}" STREQUAL "llama") # llama still ends up as libllama.a on Windows, unsure why + set_target_properties(${lib} PROPERTIES + IMPORTED_LOCATION + ${INSTALL_DIR}/lib/${lib}.a # no lib prefix + ) + else() + set_target_properties(${lib} PROPERTIES + IMPORTED_LOCATION + ${INSTALL_DIR}/lib/lib${lib}.a # lib prefix + ) + endif() add_dependencies(${lib} llama_ext) endforeach() -set(LLAMA_LIB_FLAGS libllama libggml libggml-cpu libggml-base) +set(LLAMA_LIB_FLAGS llama ggml ggml-cpu ggml-base) # MACRO DEFINITIONS # add_definitions(-DELPP_THREAD_SAFE) From b7503f0628237fe6b407bdd7d7f50f3ba360f0b3 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Tue, 23 Dec 2025 07:46:51 +1100 Subject: [PATCH 15/19] Fix llama.cpp flags for MacOS --- projects/cmake/CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/projects/cmake/CMakeLists.txt b/projects/cmake/CMakeLists.txt index 84cc863c..017e2a27 100644 --- a/projects/cmake/CMakeLists.txt +++ b/projects/cmake/CMakeLists.txt @@ -45,6 +45,7 @@ if (APPLE) -framework AudioToolbox \ -framework 
CoreAudio \ -framework CoreVideo \ + -framework Accelerate \ -lSDL2 \ -lSDL2_mixer \ -lSDL2_ttf \ @@ -275,15 +276,15 @@ ExternalProject_Add( -DLLAMA_BUILD_TOOLS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF - -DLLAMA_CUBLAS=OFF - -DLLAMA_METAL=OFF + -DGGML_BLAS=OFF + -DGGML_METAL=OFF + -DGGML_VULKAN=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=OFF -DLLAMA_TOOLS_INSTALL=OFF -DCMAKE_BUILD_TYPE=Release -DGGML_STATIC=ON -DGGML_OPENMP=OFF - -DLLAMA_OPENMP=OFF -DCMAKE_INSTALL_PREFIX= ) From 2af41de74e540df83221d88feb042ee3dae89213 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Tue, 23 Dec 2025 02:28:38 +1100 Subject: [PATCH 16/19] Fix genai enum header docs --- coresdk/src/coresdk/types.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/coresdk/src/coresdk/types.h b/coresdk/src/coresdk/types.h index 78ff4cae..9df35575 100644 --- a/coresdk/src/coresdk/types.h +++ b/coresdk/src/coresdk/types.h @@ -563,12 +563,12 @@ namespace splashkit_lib * @constant QWEN3_4B_BASE Qwen3 4B Base model - slower but excellent for text commpletion/pattern based completion * @constant QWEN3_4B_INSTRUCT Qwen3 4B Instruct model - slower but can follow complex instructions * @constant QWEN3_4B_THINKING Qwen3 4B Thinking model - slower but can follow complex and specific instructions, but has a potentially long delay before starting to reply - * @constant GEMMA_270M_BASE Gemma3 270M Base model - tiny, extremely fast, and good for text completion. Very limited world knowledge. - * @constant GEMMA_270M_BASE Gemma3 270M Instruct model - tiny, extremely fast, and good for very simple instructions. Very limited world knowledge. - * @constant GEMMA_1B_BASE Gemma3 1B Base model - fast and good for text completion. Has decent world knowledge and multi-lingual abilities. - * @constant GEMMA_1B_INSTRUCT Gemma3 1B Instruct model - fast and can follow instructions. Has decent world knowledge and multi-lingual abilities. - * @constant GEMMA_4B_BASE Gemma3 4B Base model - slower but good for text commpletion/pattern based completion. Has decent world knowledge and multi-lingual abilities. - * @constant GEMMA_4B_INSTRUCT Gemma3 4B Instruct model - slower but can follow complex instructions. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA3_270M_BASE Gemma3 270M Base model - tiny, extremely fast, and good for text completion. Very limited world knowledge. + * @constant GEMMA3_270M_INSTRUCT Gemma3 270M Instruct model - tiny, extremely fast, and good for very simple instructions. Very limited world knowledge. + * @constant GEMMA3_1B_BASE Gemma3 1B Base model - fast and good for text completion. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA3_1B_INSTRUCT Gemma3 1B Instruct model - fast and can follow instructions. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA3_4B_BASE Gemma3 4B Base model - slower but good for text commpletion/pattern based completion. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA3_4B_INSTRUCT Gemma3 4B Instruct model - slower but can follow complex instructions. Has decent world knowledge and multi-lingual abilities. 
*/ enum language_model { From 5afa6f0048d6bed7b55b995dcbba8186ce2705e9 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Tue, 23 Dec 2025 02:28:47 +1100 Subject: [PATCH 17/19] Fix genai function header docs --- coresdk/src/coresdk/genai.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/coresdk/src/coresdk/genai.h b/coresdk/src/coresdk/genai.h index c8e90eb6..cc51ce70 100644 --- a/coresdk/src/coresdk/genai.h +++ b/coresdk/src/coresdk/genai.h @@ -43,6 +43,8 @@ namespace splashkit_lib * @param prompt The prompt for the language model to reply to. * * @returns The generated reply. + * + * @attribute suffix with_model */ string generate_reply(language_model model, string prompt); @@ -56,6 +58,8 @@ namespace splashkit_lib * @param options The generation options - use the `option_` functions to create this, for instance `option_language_model` * * @returns The generated reply. + * + * @attribute suffix with_options */ string generate_reply(string prompt, language_model_options options); @@ -82,6 +86,8 @@ namespace splashkit_lib * @param text The input text for the language model to continue. * * @returns The generated reply. + * + * @attribute suffix with_model */ string generate_text(language_model model, string text); @@ -95,6 +101,8 @@ namespace splashkit_lib * @param options The generation options - use the `option_` functions to create this, for instance `option_language_model` * * @returns The generated reply. + * + * @attribute suffix with_options */ string generate_text(string text, language_model_options options); From 9341ffbc8aecb4991a6a718bdce9d0d379ea8908 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Tue, 23 Dec 2025 00:26:25 +1100 Subject: [PATCH 18/19] Make generations reproducible, fixed seed --- coresdk/src/backend/genai_driver.cpp | 6 +++--- coresdk/src/backend/genai_driver.h | 4 +++- coresdk/src/coresdk/genai.cpp | 7 +++++-- coresdk/src/coresdk/types.h | 1 + 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/coresdk/src/backend/genai_driver.cpp b/coresdk/src/backend/genai_driver.cpp index 7b7814ec..3b586fc7 100644 --- a/coresdk/src/backend/genai_driver.cpp +++ b/coresdk/src/backend/genai_driver.cpp @@ -123,11 +123,11 @@ namespace splashkit_lib return prompt_tokens; } - context start_context(model& mdl, llama_tokens& starting_context, int max_length, inference_settings settings) + context start_context(model& mdl, llama_tokens& starting_context, inference_settings settings) { // Create the context llama_context_params ctx_params = llama_context_default_params(); - ctx_params.n_ctx = starting_context.size() + max_length - 1; + ctx_params.n_ctx = starting_context.size() + settings.max_length - 1; ctx_params.n_batch = starting_context.size(); ctx_params.no_perf = true; @@ -151,7 +151,7 @@ namespace splashkit_lib llama_sampler_chain_add(smpl, llama_sampler_init_top_p(settings.top_p, 0)); if (settings.presence_penalty > 0) llama_sampler_chain_add(smpl, llama_sampler_init_penalties(64, 0, 0, settings.presence_penalty)); - llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); + llama_sampler_chain_add(smpl, llama_sampler_init_dist(settings.seed)); // Prepare batch and encode starting context llama_batch batch = llama_batch_get_one(starting_context.data(), starting_context.size()); diff --git a/coresdk/src/backend/genai_driver.h b/coresdk/src/backend/genai_driver.h index a65d624e..bda8a46e 100644 --- a/coresdk/src/backend/genai_driver.h +++ b/coresdk/src/backend/genai_driver.h @@ -33,6 +33,8 @@ namespace splashkit_lib int top_k = 20; 
double min_p = 0; double presence_penalty = 0; + int max_length = 256; + uint32_t seed = 42; }; struct message @@ -64,7 +66,7 @@ namespace splashkit_lib std::string format_chat(model& mdl, const std::vector& messages); llama_tokens tokenize_string(model& mdl, const std::string& prompt); - context start_context(model& mdl, llama_tokens& starting_context, int max_length, inference_settings settings); + context start_context(model& mdl, llama_tokens& starting_context, inference_settings settings); int context_step(context& ctx); void delete_context(context& ctx); } diff --git a/coresdk/src/coresdk/genai.cpp b/coresdk/src/coresdk/genai.cpp index e7f8435d..362084fb 100644 --- a/coresdk/src/coresdk/genai.cpp +++ b/coresdk/src/coresdk/genai.cpp @@ -135,12 +135,14 @@ namespace splashkit_lib } llamacpp::llama_tokens tokens = llamacpp::tokenize_string(model, formatted); - llamacpp::context ctx = llamacpp::start_context(model, tokens, options.max_tokens, { + llamacpp::context ctx = llamacpp::start_context(model, tokens, { options.temperature, options.top_p, options.top_k, options.min_p, - options.presence_penalty + options.presence_penalty, + options.max_tokens, + (uint32_t)options.seed }); while (!llamacpp::context_step(ctx)){ // just wait until it completes @@ -198,6 +200,7 @@ namespace splashkit_lib language_model_options options = models[model]; options.path = home_path + options.path; + options.seed = 0; return options; } diff --git a/coresdk/src/coresdk/types.h b/coresdk/src/coresdk/types.h index 9df35575..c46ce9f6 100644 --- a/coresdk/src/coresdk/types.h +++ b/coresdk/src/coresdk/types.h @@ -616,6 +616,7 @@ namespace splashkit_lib double min_p; double presence_penalty; string prompt_append; + int seed; }; } #endif /* types_hpp */ From dbb176291a52a7094f9ce8b285156420084fd93c Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Tue, 23 Dec 2025 07:42:03 +1100 Subject: [PATCH 19/19] Conversation & Streaming support + some refactoring + updated test program --- coresdk/src/backend/backend_types.h | 1 + coresdk/src/backend/genai_driver.cpp | 115 ++++++++++----- coresdk/src/backend/genai_driver.h | 47 ++++++- coresdk/src/coresdk/genai.cpp | 201 +++++++++++++++++++++++++-- coresdk/src/coresdk/genai.h | 139 ++++++++++++++++++ coresdk/src/test/test_genai.cpp | 59 ++++++-- 6 files changed, 501 insertions(+), 61 deletions(-) diff --git a/coresdk/src/backend/backend_types.h b/coresdk/src/backend/backend_types.h index 03083a86..2eeead1e 100644 --- a/coresdk/src/backend/backend_types.h +++ b/coresdk/src/backend/backend_types.h @@ -64,6 +64,7 @@ namespace splashkit_lib ADC_PTR= 0x41444350, //'ADCP'; MOTOR_DRIVER_PTR = 0x4d444950, //'MDIP'; SERVO_DRIVER_PTR = 0x53455256, //'SERV'; + CONVERSATION_PTR = 0x434f4e56, //'CONV'; NONE_PTR = 0x4e4f4e45 //'NONE'; }; diff --git a/coresdk/src/backend/genai_driver.cpp b/coresdk/src/backend/genai_driver.cpp index 3b586fc7..c6f3a05b 100644 --- a/coresdk/src/backend/genai_driver.cpp +++ b/coresdk/src/backend/genai_driver.cpp @@ -60,6 +60,13 @@ namespace splashkit_lib return {false}; } + if (llama_model_has_encoder(model)) + { + llama_model_free(model); + CLOG(ERROR, "GenAI") << "Unsupported model, requires encoder-decoder support."; + return {false}; + } + const llama_vocab * vocab = llama_model_get_vocab(model); const char* tmpl = llama_model_chat_template(model, /* name */ nullptr); @@ -82,7 +89,7 @@ namespace splashkit_lib llama_model_free(mdl.model); } - std::string format_chat(model& mdl, const std::vector& messages) + std::string format_chat(model& mdl, const 
std::vector<message>& messages, bool add_assistant)
     {
         std::vector<llama_chat_message> llama_formatted;
         std::vector<char> formatted(0);
@@ -94,27 +101,27 @@ namespace splashkit_lib
             llama_formatted.push_back({msg.role.c_str(), msg.content.c_str()});
         }
 
-        int new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), true, formatted.data(), formatted.size());
+        int new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), add_assistant, formatted.data(), formatted.size());
         if (new_len > (int)formatted.size())
         {
             formatted.resize(new_len);
-            new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), true, formatted.data(), formatted.size());
+            new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), add_assistant, formatted.data(), formatted.size());
         }
 
         return std::string(formatted.begin(), formatted.end());
     }
 
-    llama_tokens tokenize_string(model& mdl, const std::string& prompt)
+    llama_tokens tokenize_string(model& mdl, const std::string& prompt, bool is_first)
     {
         // get token count
         // note: returns a negative number, the count of tokens it would have returned if the buffer was large enough
-        const int n_prompt = -llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), NULL, 0, true, true);
+        const int n_prompt = -llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), NULL, 0, is_first, true);
 
         // create buffer
         std::vector<llama_token> prompt_tokens(n_prompt);
 
         // receive the tokens
-        if (llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0)
+        if (llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first, true) < 0)
         {
             CLOG(ERROR, "GenAI") << "Failed to tokenize the prompt.";
             return {};
@@ -128,7 +135,7 @@ namespace splashkit_lib
         // Create the context
         llama_context_params ctx_params = llama_context_default_params();
         ctx_params.n_ctx = starting_context.size() + settings.max_length - 1;
-        ctx_params.n_batch = starting_context.size();
+        ctx_params.n_batch = ctx_params.n_ctx;
         ctx_params.no_perf = true;
 
         llama_context * ctx = llama_init_from_model(mdl.model, ctx_params);
@@ -153,60 +160,58 @@ namespace splashkit_lib
             llama_sampler_chain_add(smpl, llama_sampler_init_penalties(64, 0, 0, settings.presence_penalty));
         llama_sampler_chain_add(smpl, llama_sampler_init_dist(settings.seed));
 
-        // Prepare batch and encode starting context
-        llama_batch batch = llama_batch_get_one(starting_context.data(), starting_context.size());
+        // Prepare batch for starting context
+        llama_tokens next_batch = starting_context;
 
-        if (llama_model_has_encoder(mdl.model))
-        {
-            if (llama_encode(ctx, batch))
-            {
-                llama_free(ctx);
-                llama_sampler_free(smpl);
-                CLOG(ERROR, "GenAI") << "Failed to encode prompt.";
-                return {nullptr};
-            }
-
-            llama_token decoder_start_token_id = llama_model_decoder_start_token(mdl.model);
-            if (decoder_start_token_id == LLAMA_TOKEN_NULL)
-            {
-                decoder_start_token_id = llama_vocab_bos(mdl.vocab);
-            }
-
-            batch = llama_batch_get_one(&decoder_start_token_id, 1);
-        }
+        // Cache newline token - we use this manually in some spots
+        llama_token newline_token;
+        llama_tokenize(mdl.vocab, "\n", 1, &newline_token, 1, false, true);
 
         return {
             ctx,
             smpl,
-            batch,
+            next_batch,
             (int)ctx_params.n_ctx,
             mdl.vocab,
+            newline_token,
             0,
-            ""
+            {},
+            false
         };
     }
 
-    int context_step(context& ctx)
+    int context_step(context& ctx, token_result* token)
     {
+        const string THINKING_START = "<think>";
+        const string THINKING_END = "</think>";
+
         if (!ctx.ctx) return -1;
 
+        llama_batch batch = llama_batch_get_one(ctx.next_batch.data(), ctx.next_batch.size());
+
         // Decode current batch with the model
-        if (llama_decode(ctx.ctx, ctx.batch))
+        if (llama_decode(ctx.ctx, batch))
         {
             CLOG(ERROR, "GenAI") << "Failed to process response from language model.";
+            if (token)
+                token->type = token_result::NONE;
             return -1;
         }
-        ctx.n_pos += ctx.batch.n_tokens;
+        ctx.total_context.insert(ctx.total_context.end(), ctx.next_batch.begin(), ctx.next_batch.end());
+        ctx.n_pos += batch.n_tokens;
 
         // Sample next token
         llama_token new_token_id = llama_sampler_sample(ctx.smpl, ctx.ctx, -1);
 
         // Has the model finished its response?
         if (llama_vocab_is_eog(ctx.vocab, new_token_id))
+        {
+            if (token)
+                token->type = token_result::NONE;
             return 1;
+        }
 
         char buf[128];
         int n = llama_token_to_piece(ctx.vocab, new_token_id, buf, sizeof(buf), 0, true);
@@ -217,19 +222,46 @@
         }
 
         std::string s(buf, n);
-        ctx.ctx_string += s;
+
+        if (token)
+        {
+            bool is_meta = s == THINKING_START || s == THINKING_END;
+            token->text = s;
+            if (is_meta)
+                token->type = token_result::META;
+            else if (ctx.in_thinking)
+                token->type = token_result::THINKING;
+            else
+                token->type = token_result::CONTENT;
+        }
+
+        if (s == THINKING_START)
+            ctx.in_thinking = true;
+        else if (s == THINKING_END)
+            ctx.in_thinking = false;
 
         // prepare the next batch with the sampled token
-        ctx.batch = llama_batch_get_one(&new_token_id, 1);
+        ctx.next_batch = {new_token_id};
 
         // Have we reached the end of the context?
         // If so, stop now.
-        if (ctx.n_pos + ctx.batch.n_tokens >= ctx.ctx_size)
+        if (ctx.n_pos + ctx.next_batch.size() >= ctx.ctx_size)
             return 1;
 
         return 0;
     }
 
+    void add_to_context(context& ctx, llama_tokens& message)
+    {
+        ctx.next_batch.insert(ctx.next_batch.end(), message.begin(), message.end());
+    }
+
+    void manual_end_message(context& ctx)
+    {
+        ctx.next_batch.push_back(llama_vocab_eot(ctx.vocab));
+        ctx.next_batch.push_back(ctx.newline_token);
+    }
+
     void delete_context(context& ctx)
     {
         if (ctx.smpl)
@@ -238,5 +270,18 @@
         if (ctx.ctx)
             llama_free(ctx.ctx);
     }
+
+    void __print_debug_context(context& ctx)
+    {
+        for (auto& x : ctx.total_context)
+        {
+            char buf[128];
+            int n = llama_token_to_piece(ctx.vocab, x, buf, sizeof(buf), 0, true);
+
+            std::string s(buf, n);
+            std::cout << "|" << s;
+        }
+        std::cout << std::endl;
+    }
 }
 }
diff --git a/coresdk/src/backend/genai_driver.h b/coresdk/src/backend/genai_driver.h
index bda8a46e..b24c3e91 100644
--- a/coresdk/src/backend/genai_driver.h
+++ b/coresdk/src/backend/genai_driver.h
@@ -18,6 +18,8 @@ namespace splashkit_lib
 
     namespace llamacpp
     {
+        typedef std::vector<llama_token> llama_tokens;
+
         struct model
         {
             bool valid;
@@ -47,29 +49,62 @@ namespace splashkit_lib
         {
             llama_context* ctx;
             llama_sampler* smpl;
-            llama_batch batch;
+            llama_tokens next_batch;
             int ctx_size = 0;
             const llama_vocab* vocab;
+            llama_token newline_token;
 
             int n_pos;
-            std::string ctx_string;
+            llama_tokens total_context;
+
+            bool in_thinking = false;
         };
 
-        typedef std::vector<llama_token> llama_tokens;
+        struct token_result
+        {
+            enum token_type {
+                NONE,
+                CONTENT,
+                THINKING,
+                META
+            };
+            string text;
+            token_type type;
+        };
 
         void init();
 
         model create_model(std::string path);
         void delete_model(model mdl);
 
-        std::string format_chat(model& mdl, const std::vector<message>& messages);
-        llama_tokens tokenize_string(model& mdl, const std::string& prompt);
+        std::string format_chat(model& mdl, const std::vector<message>& messages, bool add_assistant);
+        llama_tokens tokenize_string(model& mdl, const std::string&
prompt, bool is_first); context start_context(model& mdl, llama_tokens& starting_context, inference_settings settings); - int context_step(context& ctx); void delete_context(context& ctx); + + int context_step(context& ctx, token_result* token); + void add_to_context(context& ctx, llama_tokens& message); + void manual_end_message(context& ctx); + + void __print_debug_context(context& ctx); } + + struct sk_conversation + { + pointer_identifier id; + + llamacpp::model model; + llamacpp::context context; + + bool was_generating; + bool is_generating; + + string prompt_append; + + llamacpp::token_result next_token; + }; } #endif /* defined(graphics_driver) */ diff --git a/coresdk/src/coresdk/genai.cpp b/coresdk/src/coresdk/genai.cpp index 362084fb..cde93610 100644 --- a/coresdk/src/coresdk/genai.cpp +++ b/coresdk/src/coresdk/genai.cpp @@ -10,6 +10,7 @@ #include "utility_functions.h" #include "web_driver.h" #include "terminal.h" +#include "core_driver.h" #include @@ -17,6 +18,8 @@ using std::to_string; namespace splashkit_lib { + static vector objects; + const language_model DEFAULT_LANGUAGE_MODEL = QWEN3_0_6B_INSTRUCT; const int default_max_tokens_base = 256; // base has a higher likelihood of running forever for no reason, better to limit it early @@ -109,17 +112,22 @@ namespace splashkit_lib return result; } - string __generate_common(string prompt, language_model_options options, bool format_chat) + llamacpp::model __get_model(language_model_options options) { llamacpp::init(); if (options.url != "" && !ensure_exists_or_download(options.path, options.url, " ::: Downloading Language Model: " + options.name + " |")) { CLOG(ERROR, "GenAI") << "Failed to download language model - see error above."; - return ""; + return {false}; } - llamacpp::model model = llamacpp::create_model(options.path); + return llamacpp::create_model(options.path); + } + + string __generate_common(string prompt, language_model_options options, bool format_chat) + { + llamacpp::model model = __get_model(options); if (!model.valid) return ""; @@ -127,13 +135,13 @@ namespace splashkit_lib if (format_chat) { - llamacpp::format_chat(model, { + formatted = llamacpp::format_chat(model, { { "user", prompt + options.prompt_append - } - }); + }, + }, true); } - llamacpp::llama_tokens tokens = llamacpp::tokenize_string(model, formatted); + llamacpp::llama_tokens tokens = llamacpp::tokenize_string(model, formatted, true); llamacpp::context ctx = llamacpp::start_context(model, tokens, { options.temperature, @@ -144,12 +152,15 @@ namespace splashkit_lib options.max_tokens, (uint32_t)options.seed }); - while (!llamacpp::context_step(ctx)){ - // just wait until it completes - // we could also stream the text to the user through a callback - }; - std::string result = ctx.ctx_string; + std::string result = ""; + llamacpp::token_result token; + + while (!llamacpp::context_step(ctx, &token)) + { + if (token.type == llamacpp::token_result::CONTENT) + result += token.text; + }; llamacpp::delete_context(ctx); llamacpp::delete_model(model); @@ -188,6 +199,172 @@ namespace splashkit_lib return __generate_common(text, options, false); } + // -------------------------------------------------------------- + + // Streaming conversation + + #define CONVERSATION_CHECK(x, val) \ + if (INVALID_PTR(c, CONVERSATION_PTR))\ + {\ + LOG(WARNING) << "Passed an invalid conversation object to " x;\ + return val;\ + } + + conversation create_conversation() + { + return create_conversation(option_language_model(DEFAULT_LANGUAGE_MODEL)); + } + + conversation 
create_conversation(language_model model) + { + return create_conversation(option_language_model(model)); + } + + conversation create_conversation(language_model_options options) + { + internal_sk_init(); + + llamacpp::model model = __get_model(options); + + if (!model.valid) return nullptr; + + llamacpp::llama_tokens initial_tokens = llamacpp::tokenize_string(model, "", true); + + sk_conversation* c = new sk_conversation(); + c->id = CONVERSATION_PTR; + c->model = model; + c->context = llamacpp::start_context(model, initial_tokens, { + options.temperature, + options.top_p, + options.top_k, + options.min_p, + options.presence_penalty, + options.max_tokens, + (uint32_t)options.seed + });; + + c->was_generating = false; + c->is_generating = true; + + c->prompt_append = options.prompt_append; + + objects.push_back(c); + + return c; + }; + + void conversation_add_message(conversation c, const string& message) + { + CONVERSATION_CHECK("conversation_add_message", ) + + // end the language model's turn + if (c->was_generating) + { + c->was_generating = false; + llamacpp::manual_end_message(c->context); + } + + // tokenize user's prompt and add to context + llamacpp::llama_tokens tokens = llamacpp::tokenize_string(c->model, llamacpp::format_chat(c->model, { + {"user", message + c->prompt_append} + }, true), false); + llamacpp::add_to_context(c->context, tokens); + + // the model is ready to generate again + c->is_generating = true; + } + + void __buffer_next_token(conversation c) + { + if (c->next_token.type != llamacpp::token_result::token_type::NONE) + return; // already buffered + + // attempt to get next token that is non-meta + do + { + // if we reach the end of the message, return even if a meta token (shouldn't happen though) + if (llamacpp::context_step(c->context, &c->next_token)) + { + c->is_generating = false; + return; + } + } while (c->next_token.type == llamacpp::token_result::token_type::META); + } + + // These next three functions buffer the next token so that they can + // return information about it + bool conversation_is_replying(conversation c) + { + CONVERSATION_CHECK("conversation_is_replying", false) + + __buffer_next_token(c); + + return c->is_generating; + } + + bool conversation_is_thinking(conversation c) + { + CONVERSATION_CHECK("conversation_is_thinking", false) + + __buffer_next_token(c); + + return c->next_token.type == llamacpp::token_result::token_type::THINKING; + } + + string conversation_get_reply_piece(conversation c) + { + CONVERSATION_CHECK("conversation_get_reply_piece", "") + + // if the user wants a token, we can resume generating even if we already finished + c->is_generating = true; + c->was_generating = true; + + __buffer_next_token(c); + + // token is consumed + c->next_token.type = llamacpp::token_result::token_type::NONE; + + return c->next_token.text; + } + + void __free_conversation_resource(conversation c) + { + llamacpp::delete_context(c->context); + llamacpp::delete_model(c->model); + } + + void free_conversation(conversation c) + { + CONVERSATION_CHECK("free_conversation", ) + + __free_conversation_resource(c); + + for (auto it = objects.begin(); it != objects.end(); it++) + { + if (*it == c) + { + notify_of_free(c); + + delete *it; + + it = objects.erase(it); + return; + } + } + } + + void free_all_conversations() + { + for (conversation c : objects) + { + __free_conversation_resource(c); + } + + objects.clear(); + } + + // -------------------------------------------------------------- + language_model_options 
option_language_model(language_model model)
     {
         if (model < 0 || model >= models.size() || models[model].name == "")
diff --git a/coresdk/src/coresdk/genai.h b/coresdk/src/coresdk/genai.h
index cc51ce70..c57f1fa8 100644
--- a/coresdk/src/coresdk/genai.h
+++ b/coresdk/src/coresdk/genai.h
@@ -20,6 +20,26 @@ using std::string;
 
 namespace splashkit_lib
 {
+    /**
+     * The `conversation` type is used to refer to conversations between the user
+     * and a language model. You can use it to send messages to the language model,
+     * and stream responses back.
+     *
+     *
+     * All `conversation` objects are:
+     *
+     *
+     * - created with `create_conversation()`, `create_conversation(language_model model)` or
+     * `create_conversation(language_model_options options)`
+     *
+     *
+     * - and must be released using `free_conversation()` (to release a specific `conversation` object)
+     * or `free_all_conversations()` (to release all created `conversation` objects).
+     *
+     *
+     * @attribute class conversation
+     */
+    typedef struct sk_conversation *conversation;
 
     /**
      * @brief Generates a reply to a textual prompt by a language model
@@ -106,6 +126,125 @@ namespace splashkit_lib
      */
     string generate_text(string text, language_model_options options);
 
+    /**
+     * @brief Creates a new `conversation` object that uses the default language model.
+     *
+     * The `conversation` object can have messages added to it, and responses streamed back from it via the other Conversation functions and procedures.
+     *
+     * @returns Returns a new `conversation` object.
+     *
+     * @attribute class conversation
+     * @attribute constructor true
+     */
+    conversation create_conversation();
+
+    /**
+     * @brief Creates a new `conversation` object that uses a chosen language model.
+     *
+     * The `conversation` object can have messages added to it, and responses streamed back from it via the other Conversation functions and procedures.
+     *
+     * @param model The language model to use.
+     *
+     * @returns Returns a new `conversation` object.
+     *
+     * @attribute class conversation
+     * @attribute constructor true
+     *
+     * @attribute suffix with_model
+     */
+    conversation create_conversation(language_model model);
+
+    /**
+     * @brief Creates a new `conversation` object that uses a chosen language model, among other options.
+     *
+     * The `conversation` object can have messages added to it, and responses streamed back from it via the other Conversation functions and procedures.
+     *
+     * @param options The options to use - use this to choose the language model, and change various parameters.
+     *
+     * @returns Returns a new `conversation` object.
+     *
+     * @attribute class conversation
+     * @attribute constructor true
+     *
+     * @attribute suffix with_options
+     */
+    conversation create_conversation(language_model_options options);
+
+    /**
+     * Checks if a language model is currently generating a reply within a `conversation`.
+     * If so, you can continue to receive the message with `conversation_get_reply_piece(conversation c)`.
+     *
+     * @param c The `conversation` object to check
+     *
+     * @returns Returns whether the language model is still generating a reply
+     *
+     * @attribute class conversation
+     * @attribute method is_replying
+     * @attribute self c
+     */
+    bool conversation_is_replying(conversation c);
+
+    /**
+     * Checks if a language model is currently "thinking" while generating a reply within a `conversation`.
+     * You can use this to filter out the "thoughts" and display them differently (or hide them entirely).
+     *
+     * @param c The `conversation` object to check
+     *
+     * @returns Returns whether the language model is currently thinking while generating a reply
+     *
+     * @attribute class conversation
+     * @attribute method is_thinking
+     * @attribute self c
+     */
+    bool conversation_is_thinking(conversation c);
+
+    /**
+     * Adds a message to a `conversation`, which the language model will begin replying to.
+     * You can receive the reply one piece at a time by calling `conversation_get_reply_piece(conversation c)` in a loop.
+     *
+     * @param c The `conversation` object to add the message to
+     * @param message The user message to add to the conversation - the language model will reply to this
+     *
+     * @attribute class conversation
+     * @attribute method add_message
+     * @attribute self c
+     */
+    void conversation_add_message(conversation c, const string& message);
+
+    /**
+     * Returns a single piece of a reply (generally one word at a time) from the `conversation`.
+     * You can use a loop while checking `conversation_is_replying` to retrieve the reply as it generates.
+     *
+     * @param c The `conversation` object to receive the reply from
+     *
+     * @returns Returns a small piece of the reply (generally 1 word or less)
+     *
+     * @attribute class conversation
+     * @attribute method get_reply_piece
+     * @attribute self c
+     */
+    string conversation_get_reply_piece(conversation c);
+
+    /**
+     * Frees the resources associated with the `conversation` object.
+     *
+     * @param c The `conversation` object whose resources should be released.
+     *
+     * @attribute class conversation
+     * @attribute destructor true
+     * @attribute self c
+     * @attribute method free
+     */
+    void free_conversation(conversation c);
+
+    /**
+     * Releases all of the `conversation` objects which have been loaded.
+     *
+     * @attribute static conversations
+     * @attribute method free_all
+     */
+    void free_all_conversations();
+
     /**
      * Use this option to choose which language model to use, and initialize its default settings
      *
diff --git a/coresdk/src/test/test_genai.cpp b/coresdk/src/test/test_genai.cpp
index 84aba027..669bea23 100644
--- a/coresdk/src/test/test_genai.cpp
+++ b/coresdk/src/test/test_genai.cpp
@@ -7,6 +7,7 @@
 #include "genai.h"
 #include "terminal.h"
+#include "basics.h"
 #include "utils.h"
 #include
 #include
 
@@ -16,15 +17,57 @@
 using namespace splashkit_lib;
 
 void run_genai_test()
 {
-    write("User\n> ");
-    string prompt = read_line();
+    const string THINKING_STYLE = "\033[37;3m";
+    const string RESET_STYLE = "\033[0m";
 
-    write("LLM\n> (generating...)");
-    string response = generate_reply(QWEN3_0_6B_INSTRUCT, prompt);
-    write_line("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\033[K" + response);
+    conversation conv = create_conversation(QWEN3_1_7B_THINKING);
 
-    delay(300);
+    while(true)
+    {
+        write("\n> ");
+        string prompt = read_line();
 
-    write_line("-- Press enter to end --");
-    read_line();
+        // See if the user wants to exit
+        string exit = trim(generate_reply(QWEN3_1_7B_INSTRUCT, "User A: "+prompt+"\nDoes user A want to end the conversation? 
Answer with one word, either CONTINUE or END:")); + + write_line("["+exit+"]"); + + if (exit == "END") + break; + + // otherwise continue the conversation + conversation_add_message(conv, prompt); + + bool thinking = false; + string last_piece = "\n"; + while(conversation_is_replying(conv)) + { + if (conversation_is_thinking(conv) != thinking) + { + thinking = conversation_is_thinking(conv); + + if (thinking) + write(THINKING_STYLE); + else + write(RESET_STYLE); + } + + string piece = conversation_get_reply_piece(conv); + + // avoid double newlines - ideally this will be filtered on SplashKit's side instead + if (piece == "\n" && last_piece == "\n") + continue; + + if (piece == "\n\n") + piece = "\n"; + + write(piece); + last_piece = piece; + } + + if (last_piece != "\n") + write("\n"); + } + + free_conversation(conv); }
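For reference, here is a minimal usage sketch of the API added in this series. It is not taken from the patches themselves; it assumes the `language_model` enum values, the `max_tokens`/`seed` fields of `language_model_options`, and the conversation functions exactly as they appear in the diffs above.

    #include "genai.h"
    #include "terminal.h"

    using namespace splashkit_lib;

    int main()
    {
        // One-shot generation with explicit options.
        // The fixed seed added in PATCH 18 makes repeated runs reproducible.
        language_model_options options = option_language_model(GEMMA3_1B_INSTRUCT);
        options.max_tokens = 64;
        options.seed = 1234;
        write_line(generate_reply("Name three fruits.", options));

        // Streamed conversation (PATCH 19): pull the reply one piece at a time.
        conversation conv = create_conversation(QWEN3_0_6B_INSTRUCT);
        conversation_add_message(conv, "Hello! What can you do?");

        while (conversation_is_replying(conv))
        {
            // Check the buffered token before consuming it, so "thinking"
            // tokens can be skipped and only the visible reply is printed.
            bool thinking = conversation_is_thinking(conv);
            string piece = conversation_get_reply_piece(conv);
            if (!thinking)
                write(piece);
        }
        write_line("");

        free_conversation(conv);
        return 0;
    }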