From 967a4b239acbccb3d545981b962c4ee93d6ed9db Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sat, 20 Dec 2025 01:40:43 +1100 Subject: [PATCH 01/19] Fix: Take into account HOME env var in path_to_user_home --- coresdk/src/backend/utility_functions.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/coresdk/src/backend/utility_functions.cpp b/coresdk/src/backend/utility_functions.cpp index 0e4cd5cd..6985c168 100644 --- a/coresdk/src/backend/utility_functions.cpp +++ b/coresdk/src/backend/utility_functions.cpp @@ -73,6 +73,10 @@ namespace splashkit_lib string path_to_user_home() { #ifndef WINDOWS + string home = get_env_var("HOME"); + if (home != "") + return home; + struct passwd *pw = getpwuid(getuid()); return string(pw->pw_dir); #else From abe59463c4443807d139f672badd3fded2522100 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sat, 20 Dec 2025 02:33:43 +1100 Subject: [PATCH 02/19] Add llama.cpp as build dependency --- .gitignore | 2 ++ coresdk/external | 2 +- projects/cmake/CMakeLists.txt | 44 +++++++++++++++++++++++++++++++++-- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index cbf9a5b3..0864e9d0 100644 --- a/.gitignore +++ b/.gitignore @@ -77,6 +77,7 @@ Makefile cmake_install.cmake splashkit_test projects/cmake/Resources +llama_ext-prefix .ninja_deps .ninja_log build.ninja @@ -101,6 +102,7 @@ out/lib/ tools/scripts/nuget-pkg/obj tools/scripts/test/obj + ### Debian packaging ### tools/scripts/debian/libsplashkit-dev* tools/scripts/debian/data.tar.xz diff --git a/coresdk/external b/coresdk/external index e089bc3c..d9c7ca08 160000 --- a/coresdk/external +++ b/coresdk/external @@ -1 +1 @@ -Subproject commit e089bc3ccbd7ff11027a790be44f6ab6038b5c58 +Subproject commit d9c7ca08ca9dbb0051bf57ceadb1d7a2d0f8d536 diff --git a/projects/cmake/CMakeLists.txt b/projects/cmake/CMakeLists.txt index 0780489a..d14d57f7 100644 --- a/projects/cmake/CMakeLists.txt +++ b/projects/cmake/CMakeLists.txt @@ -5,6 +5,7 @@ set(CMAKE_BUILD_TYPE Debug) cmake_policy(SET CMP0083 NEW) include(CheckPIESupported) +include(ExternalProject) check_pie_supported() # SK Directories relative to cmake project @@ -245,6 +246,8 @@ include_directories("${SK_EXT}/hash-library") include_directories("${SK_EXT}/json") include_directories("${SK_EXT}/catch") include_directories("${SK_EXT}/microui/src") +include_directories("${SK_EXT}/llama.cpp/include") +include_directories("${SK_EXT}/llama.cpp/ggml/include") # MAC OS DIRECTORY INCLUDES if (APPLE) @@ -257,13 +260,50 @@ if (APPLE) include_directories("${SK_EXT}/SDL_image/external/libpng-1.6.2") endif() +# INCLUDE LLAMA.CPP + +# Included as an external project so that it can be configured +# as Release, independently of the main project. 
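+# (Note: an unoptimised llama.cpp build typically makes inference far too slow
+# to be usable, which is why it stays Release even though CMAKE_BUILD_TYPE
+# above is set to Debug for the rest of the project.)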
+ +# Compiled as CPU only +# TODO: Decide on minimum architecture requirements +ExternalProject_Add( + llama_ext + SOURCE_DIR "${SK_EXT}/llama.cpp" + CMAKE_ARGS + -DLLAMA_BUILD_TESTS=OFF + -DLLAMA_BUILD_TOOLS=OFF + -DLLAMA_BUILD_EXAMPLES=OFF + -DLLAMA_BUILD_SERVER=OFF + -DLLAMA_CUBLAS=OFF + -DLLAMA_CLBLAST=OFF + -DLLAMA_METAL=OFF + -DLLAMA_OPENCL=OFF + -DBUILD_SHARED_LIBS=OFF + -DLLAMA_BUILD_COMMON=OFF + -DLLAMA_TOOLS_INSTALL=OFF + -DCMAKE_BUILD_TYPE=Release + -DGGML_STATIC=ON + -DLLAMA_STATIC=ON + -DCMAKE_INSTALL_PREFIX= +) + +find_package(OpenMP REQUIRED) + +ExternalProject_Get_Property(llama_ext INSTALL_DIR) + +set(LLAMA_LIB_FLAGS "${INSTALL_DIR}/lib/libllama.a" + "${INSTALL_DIR}/lib/libggml.a" + "${INSTALL_DIR}/lib/libggml-cpu.a" + "${INSTALL_DIR}/lib/libggml-base.a") + # MACRO DEFINITIONS # add_definitions(-DELPP_THREAD_SAFE) #### END SETUP #### #### SplashKitBackend STATIC LIBRARY #### add_library(SplashKitBackend STATIC ${SOURCE_FILES} ${INCLUDE_FILES}) -target_link_libraries(SplashKitBackend ${LIB_FLAGS}) +target_link_libraries(SplashKitBackend ${LIB_FLAGS} ${LLAMA_LIB_FLAGS} OpenMP::OpenMP_CXX) if(RASPBERRY_PI) if(RASPBERRY_PI_5) @@ -373,4 +413,4 @@ catch_discover_tests(skunit_tests) #### END skunit_tests EXECUTABLE #### install(TARGETS SplashKitBackend DESTINATION lib) -install(FILES ${INCLUDE_FILES} DESTINATION include/SplashKitBackend) \ No newline at end of file +install(FILES ${INCLUDE_FILES} DESTINATION include/SplashKitBackend) From 43baf0569fab9549c0fd87d016241abfc792d787 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sat, 20 Dec 2025 02:34:25 +1100 Subject: [PATCH 03/19] Add initial GenAI driver and user facing `generate_reply` --- coresdk/src/backend/genai_driver.cpp | 229 +++++++++++++++++++++++++++ coresdk/src/backend/genai_driver.h | 60 +++++++ coresdk/src/coresdk/genai.cpp | 46 ++++++ coresdk/src/coresdk/genai.h | 34 ++++ 4 files changed, 369 insertions(+) create mode 100644 coresdk/src/backend/genai_driver.cpp create mode 100644 coresdk/src/backend/genai_driver.h create mode 100644 coresdk/src/coresdk/genai.cpp create mode 100644 coresdk/src/coresdk/genai.h diff --git a/coresdk/src/backend/genai_driver.cpp b/coresdk/src/backend/genai_driver.cpp new file mode 100644 index 00000000..4ca64d86 --- /dev/null +++ b/coresdk/src/backend/genai_driver.cpp @@ -0,0 +1,229 @@ +// +// genai_driver.cpp +// sk +// +// Created by Sean Boettger on 19/12/2025. 
+// +#include +#include +#include + +#include "genai_driver.h" +#include "core_driver.h" +#include "utility_functions.h" + +namespace splashkit_lib +{ + namespace llamacpp { + + static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data){/* nothing, avoid unnecessary logging*/} + + void init() + { + static bool initialized = false; + if (!initialized) + { + llama_log_set(llama_log_callback_null, NULL); + + ggml_backend_load_all(); + + initialized = true; + } + } + + model create_model(std::string path) + { + ggml_backend_load_all(); + + // initialize the model + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = 0; // cpu-only + + llama_model * model = llama_model_load_from_file(path.c_str(), model_params); + + if (model == NULL) + { + LOG(ERROR) << "Unable to load language model from " << path << " - please check if it exists."; + return {false}; + } + + const llama_vocab * vocab = llama_model_get_vocab(model); + const char* tmpl = llama_model_chat_template(model, /* name */ nullptr); + + return { + true, + model, + vocab, + tmpl + }; + } + + void delete_model(model mdl) + { + if (!mdl.valid) + return; + + if (!mdl.model) + return; + + llama_model_free(mdl.model); + } + + std::string format_chat(model& mdl, const std::vector& messages) + { + std::vector llama_formatted; + std::vector formatted(0); + + llama_formatted.reserve(messages.size()); + + for (const message& msg : messages) + { + llama_formatted.push_back({msg.role.c_str(), msg.content.c_str()}); + } + + int new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), true, formatted.data(), formatted.size()); + if (new_len > (int)formatted.size()) + { + formatted.resize(new_len); + new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), true, formatted.data(), formatted.size()); + } + + return std::string(formatted.begin(), formatted.end()); + } + + llama_tokens tokenize_string(model& mdl, const std::string& prompt) + { + // get token count + // note: returns a negative number, the count of tokens it would have returned if the buffer was large enough + const int n_prompt = -llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), NULL, 0, true, true); + + // create buffer + std::vector prompt_tokens(n_prompt); + + // recieve the tokens + if (llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) + { + LOG(ERROR) << "Failed to tokenize the prompt."; + return {}; + } + + return prompt_tokens; + } + + context start_context(model& mdl, llama_tokens& starting_context, int max_length) + { + // Create the context + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = starting_context.size() + max_length - 1; + ctx_params.n_batch = starting_context.size(); + ctx_params.no_perf = true; + + llama_context * ctx = llama_init_from_model(mdl.model, ctx_params); + + if (ctx == NULL) + { + LOG(ERROR) << "Failed to create the language model context."; + return {nullptr}; + } + + // Create the sampler + auto sparams = llama_sampler_chain_default_params(); + sparams.no_perf = true; + llama_sampler * smpl = llama_sampler_chain_init(sparams); + + // Setup some reasonable defaults + // TODO: Make these user adjustable + llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.00f, 1)); + llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.6f)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_k(20)); + 
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.95, 0)); + //llama_sampler_chain_add(smpl, llama_sampler_init_penalties(64, 1, 0, 0)); + llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); + + // Prepare batch and encode starting context + llama_batch batch = llama_batch_get_one(starting_context.data(), starting_context.size()); + + if (llama_model_has_encoder(mdl.model)) + { + if (llama_encode(ctx, batch)) + { + llama_free(ctx); + llama_sampler_free(smpl); + LOG(ERROR) << "Failed to encode prompt."; + return {nullptr}; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token(mdl.model); + if (decoder_start_token_id == LLAMA_TOKEN_NULL) + { + decoder_start_token_id = llama_vocab_bos(mdl.vocab); + } + + batch = llama_batch_get_one(&decoder_start_token_id, 1); + } + + return + { + ctx, + smpl, + batch, + (int)ctx_params.n_ctx, + mdl.vocab, + 0, + "" + }; + } + + int context_step(context& ctx) + { + if (!ctx.ctx) + return -1; + + // Decode current batch with the model + if (llama_decode(ctx.ctx, ctx.batch)) + { + LOG(ERROR) << "Failed to process response from language model."; + return -1; + } + + ctx.n_pos += ctx.batch.n_tokens; + + // Sample next token + llama_token new_token_id = llama_sampler_sample(ctx.smpl, ctx.ctx, -1); + + // Has the model finished its response? + if (llama_vocab_is_eog(ctx.vocab, new_token_id)) + return 1; + + char buf[128]; + int n = llama_token_to_piece(ctx.vocab, new_token_id, buf, sizeof(buf), 0, true); + if (n < 0) + { + LOG(ERROR) << "Failed to convert response token from language model."; + return -1; + } + + std::string s(buf, n); + ctx.ctx_string += s; + + // prepare the next batch with the sampled token + ctx.batch = llama_batch_get_one(&new_token_id, 1); + + // Have we reached the end of the context? + // If so, stop now. + if (ctx.n_pos + ctx.batch.n_tokens >= ctx.ctx_size) + return 1; + + return 0; + } + + void delete_context(context& ctx) + { + if (ctx.smpl) + llama_sampler_free(ctx.smpl); + + if (ctx.ctx) + llama_free(ctx.ctx); + } + } +} diff --git a/coresdk/src/backend/genai_driver.h b/coresdk/src/backend/genai_driver.h new file mode 100644 index 00000000..8f175017 --- /dev/null +++ b/coresdk/src/backend/genai_driver.h @@ -0,0 +1,60 @@ +// +// genai_driver.h +// sk +// +// Created by Sean Boettger on 19/12/2025. 
+// + +#ifndef genai_driver_h +#define genai_driver_h + +#include "backend_types.h" + +#include "llama.h" + +namespace splashkit_lib +{ + typedef unsigned int uint; + + namespace llamacpp { + struct model { + bool valid; + llama_model* model; + const llama_vocab* vocab; + const char* tmpl; + }; + + struct message { + std::string role; + std::string content; + }; + + struct context { + llama_context* ctx; + llama_sampler* smpl; + llama_batch batch; + int ctx_size = 0; + + const llama_vocab* vocab; + + int n_pos; + std::string ctx_string; + }; + + typedef std::vector llama_tokens; + + void init(); + + model create_model(std::string path); + void delete_model(model mdl); + + std::string format_chat(model& mdl, const std::vector& messages); + llama_tokens tokenize_string(model& mdl, const std::string& prompt); + + context start_context(model& mdl, llama_tokens& starting_context, int max_length); + int context_step(context& ctx); + void delete_context(context& ctx); + } +} + +#endif /* defined(graphics_driver) */ diff --git a/coresdk/src/coresdk/genai.cpp b/coresdk/src/coresdk/genai.cpp new file mode 100644 index 00000000..ed408251 --- /dev/null +++ b/coresdk/src/coresdk/genai.cpp @@ -0,0 +1,46 @@ +// +// genai.cpp +// splashkit +// +// Created by Sean Boettger on 20/12/25. +// + +#include "genai_driver.h" +#include "utility_functions.h" + +namespace splashkit_lib +{ + + string generate_reply(string prompt) + { + llamacpp::init(); + + string path = path_from( {path_to_user_home(), ".splashkit", "models"} ); + + // TODO: add auto download & choices for at least the following + //"Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf" + //"Qwen3-1.7B-Q8_0.gguf" + //"Qwen3-0.6B-Q8_0.gguf" + llamacpp::model model = llamacpp::create_model(path + "Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf"); + + if (!model.valid) return ""; + + std::string formatted = llamacpp::format_chat(model, { + {"user", prompt} + }); + llamacpp::llama_tokens tokens = llamacpp::tokenize_string(model, formatted); + + llamacpp::context ctx = llamacpp::start_context(model, tokens, 4096); + while (!llamacpp::context_step(ctx)){ + // just wait until it completes + // we could also stream the text to the user through a callback + }; + + std::string result = ctx.ctx_string; + + llamacpp::delete_context(ctx); + llamacpp::delete_model(model); + + return result; + } +} diff --git a/coresdk/src/coresdk/genai.h b/coresdk/src/coresdk/genai.h new file mode 100644 index 00000000..d1175872 --- /dev/null +++ b/coresdk/src/coresdk/genai.h @@ -0,0 +1,34 @@ +/** + * @header genai + * @author Sean Boettger + * @brief SplashKit gives you a simple way to use and embed local AIs in your projects, + * that run on your own computer. + * + * @attribute group generative_ai + * @attribute static generative_ai + */ + +#ifndef genai_hpp +#define genai_hpp + +#include +#include + +using std::string; + +namespace splashkit_lib +{ + + /** + * @brief Generates a reply to a textual prompt by a language model + * + * The language model will respond to the textual prompt in a chat style format. It will follow instructions and answer questions. + * + * @param prompt The prompt for the language model to reply to. + * + * @returns The generated reply. 
+ */ + string generate_reply(string prompt); + +} +#endif /* genai_hpp */ From 8c1f662fc08fbc3d0f575c480ce7a9f6591e5894 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sat, 20 Dec 2025 02:35:57 +1100 Subject: [PATCH 04/19] Add simple GenAI test --- coresdk/src/test/test_genai.cpp | 30 ++++++++++++++++++++++++++++++ coresdk/src/test/test_main.cpp | 1 + coresdk/src/test/test_main.h | 1 + 3 files changed, 32 insertions(+) create mode 100644 coresdk/src/test/test_genai.cpp diff --git a/coresdk/src/test/test_genai.cpp b/coresdk/src/test/test_genai.cpp new file mode 100644 index 00000000..2cb16fca --- /dev/null +++ b/coresdk/src/test/test_genai.cpp @@ -0,0 +1,30 @@ +// +// test_genai.cpp +// splashkit +// +// Created by Sean Boettger on 20/12/2025. +// + +#include "genai.h" +#include "terminal.h" +#include "utils.h" +#include +#include + +using namespace std; +using namespace splashkit_lib; + +void run_genai_test() +{ + write("User\n> "); + string prompt = read_line(); + + write("LLM\n> (generating...)"); + string response = generate_reply(prompt); + write_line("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\033[K" + response); + + delay(300); + + write_line("-- Press enter to end --"); + read_line(); +} diff --git a/coresdk/src/test/test_main.cpp b/coresdk/src/test/test_main.cpp index 0b6e8ab9..e4b7750a 100644 --- a/coresdk/src/test/test_main.cpp +++ b/coresdk/src/test/test_main.cpp @@ -68,6 +68,7 @@ void setup_tests() add_test("GPIO - SPI MAX7219 LED matrix Tests", run_gpio_spi_led_matrix_tests); add_test("GPIO - I2C HT16K33 LED matrix Tests", run_gpio_i2c_led_matrix_tests); add_test("GPIO - I2C HT16K33 LED 14 Segment Tests", run_gpio_i2c_quad_14_seg_test); + add_test("Gen AI", run_genai_test); } int main(int argv, char **args) diff --git a/coresdk/src/test/test_main.h b/coresdk/src/test/test_main.h index 1beddfc8..89f42267 100644 --- a/coresdk/src/test/test_main.h +++ b/coresdk/src/test/test_main.h @@ -44,5 +44,6 @@ void run_gpio_i2c_quad_14_seg_test(); void run_terminal_test(); void run_logging_test(); void run_ui_test(); +void run_genai_test(); #endif /* test_main_h */ From f172f7b76d7952c7f27acf966e81004b68ee3580 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sun, 21 Dec 2025 21:03:19 +1100 Subject: [PATCH 05/19] GenAI add custom logger --- coresdk/src/backend/genai_driver.cpp | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/coresdk/src/backend/genai_driver.cpp b/coresdk/src/backend/genai_driver.cpp index 4ca64d86..a290e801 100644 --- a/coresdk/src/backend/genai_driver.cpp +++ b/coresdk/src/backend/genai_driver.cpp @@ -27,6 +27,18 @@ namespace splashkit_lib ggml_backend_load_all(); + // Create custom logger with colouring + el::Configurations conf; + conf.setToDefault(); + conf.setGlobally(el::ConfigurationType::Format, "%level -> %msg"); + conf.setGlobally(el::ConfigurationType::Filename, "logs/splashkit.log"); + + // `el::Loggers::addFlag(el::LoggingFlag::ColoredTerminalOutput);` would be better but has global effect + conf.set(el::Level::Warning, el::ConfigurationType::Format, "\x1b[33m%level -> %msg\x1b[0m"); + conf.set(el::Level::Error, el::ConfigurationType::Format, "\x1b[31m%level -> %msg\x1b[0m"); + + el::Loggers::reconfigureLogger("GenAI", conf); + initialized = true; } } @@ -43,7 +55,7 @@ namespace splashkit_lib if (model == NULL) { - LOG(ERROR) << "Unable to load language model from " << path << " - please check if it exists."; + CLOG(ERROR, "GenAI") << "Unable to load language model from " << path << " - it may be corrupted or missing."; return 
{false}; } @@ -103,7 +115,7 @@ namespace splashkit_lib // recieve the tokens if (llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { - LOG(ERROR) << "Failed to tokenize the prompt."; + CLOG(ERROR, "GenAI") << "Failed to tokenize the prompt."; return {}; } @@ -122,7 +134,7 @@ namespace splashkit_lib if (ctx == NULL) { - LOG(ERROR) << "Failed to create the language model context."; + CLOG(ERROR, "GenAI") << "Failed to create the language model context."; return {nullptr}; } @@ -149,7 +161,7 @@ namespace splashkit_lib { llama_free(ctx); llama_sampler_free(smpl); - LOG(ERROR) << "Failed to encode prompt."; + CLOG(ERROR, "GenAI") << "Failed to encode prompt."; return {nullptr}; } @@ -182,7 +194,7 @@ namespace splashkit_lib // Decode current batch with the model if (llama_decode(ctx.ctx, ctx.batch)) { - LOG(ERROR) << "Failed to process response from language model."; + CLOG(ERROR, "GenAI") << "Failed to process response from language model."; return -1; } @@ -199,7 +211,7 @@ namespace splashkit_lib int n = llama_token_to_piece(ctx.vocab, new_token_id, buf, sizeof(buf), 0, true); if (n < 0) { - LOG(ERROR) << "Failed to convert response token from language model."; + CLOG(ERROR, "GenAI") << "Failed to convert response token from language model."; return -1; } From ddac4c96929c54247a8e6e763ed06a5e10f4855c Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sun, 21 Dec 2025 21:18:42 +1100 Subject: [PATCH 06/19] Add sk_http_get_file function - better for large files/handles resuming --- coresdk/src/backend/web_driver.cpp | 61 ++++++++++++++++++++++++++++++ coresdk/src/backend/web_driver.h | 1 + 2 files changed, 62 insertions(+) diff --git a/coresdk/src/backend/web_driver.cpp b/coresdk/src/backend/web_driver.cpp index f388e083..2305e91b 100644 --- a/coresdk/src/backend/web_driver.cpp +++ b/coresdk/src/backend/web_driver.cpp @@ -206,6 +206,67 @@ namespace splashkit_lib return _create_response(curl_handle, res, data_read); } + struct _sk_http_get_file_callback_data + { + void (*user_callback)(unsigned long, unsigned long); + int resuming_from; + }; + + int _sk_http_get_file_callback(_sk_http_get_file_callback_data* data, curl_off_t dltotal, curl_off_t dlnow, curl_off_t ultotal, curl_off_t ulnow) + { + data->user_callback(dltotal == 0 ? 0 : (data->resuming_from + dltotal), data->resuming_from + dlnow); + return 0; + } + + sk_http_response *sk_http_get_file(const string &filename, const string &host, unsigned short port, void (*user_callback)(unsigned long, unsigned long)) + { + const string temp_extension = ".temp"; + string temp_filename = filename+temp_extension; + + FILE *file = fopen(temp_filename.c_str(), "ab+"); + + // find resume point + fseek(file, 0L, SEEK_END); + curl_off_t resume_from = ftell(file); + + // init the curl session + CURL *curl_handle = curl_easy_init(); + CURLcode res; + + _init_curl(curl_handle, host, port); + + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, write_data); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, file); + + _sk_http_get_file_callback_data callback_data; + if (user_callback) + { + curl_easy_setopt(curl_handle, CURLOPT_XFERINFOFUNCTION, _sk_http_get_file_callback); + curl_easy_setopt(curl_handle, CURLOPT_XFERINFODATA, &callback_data); + curl_easy_setopt(curl_handle, CURLOPT_NOPROGRESS, 0); + + callback_data.user_callback = user_callback; + callback_data.resuming_from = resume_from; + } + + curl_easy_setopt(curl_handle, CURLOPT_RESUME_FROM_LARGE, resume_from); + + // get it! 
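+        // (If a partial .temp file already exists from an earlier attempt, the
+        //  transfer resumes from its current size via CURLOPT_RESUME_FROM_LARGE
+        //  instead of starting again from byte zero.)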
+ res = curl_easy_perform(curl_handle); + + fclose(file); + + // try renaming the temp file if the download was okay - rename returns 0 on success + if (res == CURLE_OK && rename(temp_filename.c_str(), filename.c_str())) + { + LOG(WARNING) << "Failed to rename temporary download file " << temp_filename << " to " << filename; + return nullptr; + } + + request_stream data_read = { nullptr, 0 }; + return _create_response(curl_handle, res, data_read); + } + sk_http_response *sk_http_put(const string &host, unsigned short port, const string &body) { request_stream data_read = { nullptr, 0 }; diff --git a/coresdk/src/backend/web_driver.h b/coresdk/src/backend/web_driver.h index e5e54de2..f5e4810a 100644 --- a/coresdk/src/backend/web_driver.h +++ b/coresdk/src/backend/web_driver.h @@ -17,6 +17,7 @@ namespace splashkit_lib sk_http_response *sk_http_post(const string &host, unsigned short port, const string &body); sk_http_response *sk_http_get(const string &host, unsigned short port); + sk_http_response *sk_http_get_file(const string &filename, const string &host, unsigned short port, void (*user_callback)(unsigned long, unsigned long)); sk_http_response *sk_http_put(const string &host, unsigned short port, const string &body); sk_http_response *sk_http_delete(const string &host, unsigned short port, const string &body); sk_http_response *sk_http_make_request(const sk_http_request &request); From 97b3fa8ec32384bdc4838b9a5d4c772e7c838633 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Sun, 21 Dec 2025 22:43:47 +1100 Subject: [PATCH 07/19] Add GenAI model downloading --- coresdk/src/coresdk/genai.cpp | 104 +++++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/coresdk/src/coresdk/genai.cpp b/coresdk/src/coresdk/genai.cpp index ed408251..881a1455 100644 --- a/coresdk/src/coresdk/genai.cpp +++ b/coresdk/src/coresdk/genai.cpp @@ -7,21 +7,123 @@ #include "genai_driver.h" #include "utility_functions.h" +#include "web_driver.h" +#include "terminal.h" + +#include + +using std::to_string; namespace splashkit_lib { + /* terminal util functions in lieu of ncurses*/ + void terminal_erase_left(int count /* -1 for all */) + { + if (count == 0) + return; + + if (count == -1) + write("\r\033[K"); + else + write("\033["+to_string(count)+"D\033[K"); + } + + std::vector terminal_stack; + + void terminal_push(const string &str) + { + write(str); + terminal_stack.push_back(str.size()); + } + + void terminal_pop() + { + terminal_erase_left(terminal_stack.back()); + terminal_stack.pop_back(); + } + + bool download_with_progress_bar(string filename, string url) + { + auto callback = [](unsigned long expected_size, unsigned long current_size) + { + terminal_pop(); + + if (expected_size == 0) + { + terminal_push(""); + return; + } + + static int spinner_index = 0; + + const int progress_bar_length = 10; + const string spinner = "|/-\\"; + + int expected_mb = expected_size / (1024 * 1024); + int current_mb = current_size / (1024 * 1024); + + // construct progress bar + int progress_bar_filled = 0; + if (expected_size > 0) + progress_bar_filled = (int)(progress_bar_length * current_size/(double)expected_size); + if (progress_bar_filled > progress_bar_length) + progress_bar_filled = progress_bar_length; + + string progress_bar = string(progress_bar_filled, '=') + string(progress_bar_length-progress_bar_filled, ' '); + if (progress_bar_filled <= progress_bar_length) + progress_bar[progress_bar_filled] = spinner[spinner_index++ % spinner.size()]; + + // write message + 
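+            // e.g. "====/     | (450mb / 1024mb)" part-way through a ~1GB download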
terminal_push(progress_bar + "| (" + to_string(current_mb) + "mb / " + to_string(expected_mb) + "mb)"); + }; + + terminal_push(""); + + sk_http_response * resp = sk_http_get_file(filename, url, 443, callback); + + terminal_pop(); + + return resp != nullptr && resp->code >= 200 && resp->code < 300; + } + + bool ensure_exists_or_download(string path, string url, string message) + { + if (std::filesystem::exists(path)) + return true; + + terminal_push(message); + + bool result = download_with_progress_bar(path, url); + + terminal_pop(); + + return result; + } + string generate_reply(string prompt) { llamacpp::init(); string path = path_from( {path_to_user_home(), ".splashkit", "models"} ); + path += "Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf"; + + string model_name = "Qwen3 4B Instruct"; + + if (!ensure_exists_or_download(path, + "https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/resolve/main/Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf?download=true", + " ::: Downloading Language Model: "+model_name + " |" + )) + { + CLOG(ERROR, "GenAI") << "Failed to download language model - see error above."; + return ""; + } // TODO: add auto download & choices for at least the following //"Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf" //"Qwen3-1.7B-Q8_0.gguf" //"Qwen3-0.6B-Q8_0.gguf" - llamacpp::model model = llamacpp::create_model(path + "Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf"); + llamacpp::model model = llamacpp::create_model(path); if (!model.valid) return ""; From bedecf29f7226219d8c968af9b6b251e72cd380d Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Mon, 22 Dec 2025 06:20:48 +1100 Subject: [PATCH 08/19] Add language_model and language_model_options struct/enum --- coresdk/src/coresdk/types.h | 69 +++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/coresdk/src/coresdk/types.h b/coresdk/src/coresdk/types.h index 3573af9f..78ff4cae 100644 --- a/coresdk/src/coresdk/types.h +++ b/coresdk/src/coresdk/types.h @@ -548,5 +548,74 @@ namespace splashkit_lib BUBBLE = 4, BUBBLE_MULTICOLORED = 5 }; + + /** + * Language Models: + * Choose between different language models to trade off speed and intelligence + * Each model is scaled to fit within 1~2GB and will be automatically downloaded when needed - feel free to try them out! + * + * @constant QWEN3_0_6B_BASE Qwen3 0.6B Base model - small, extremely fast and good for text commpletion. Very limited world knowledge. + * @constant QWEN3_0_6B_INSTRUCT Qwen3 0.6B Instruct model (default) - small, extremely fast and can follow simple instructions. Very limited world knowledge. + * @constant QWEN3_0_6B_THINKING Qwen3 0.6B Thinking model - small, extremely fast and can follow more specific instructions, but has a short delay before starting to reply. Very limited world knowledge. + * @constant QWEN3_1_7B_BASE Qwen3 1.7B Base model - decently fast and good for text commpletion. Limited world knowledge. + * @constant QWEN3_1_7B_INSTRUCT Qwen3 1.7B Instruct model - decently fast and can follow instructions. Limited world knowledge. + * @constant QWEN3_1_7B_THINKING Qwen3 1.7B Thinking model - decently fast and can follow more difficult instructions, but has a delay before starting to reply. Limited world knowledge. 
+ * @constant QWEN3_4B_BASE Qwen3 4B Base model - slower but excellent for text commpletion/pattern based completion + * @constant QWEN3_4B_INSTRUCT Qwen3 4B Instruct model - slower but can follow complex instructions + * @constant QWEN3_4B_THINKING Qwen3 4B Thinking model - slower but can follow complex and specific instructions, but has a potentially long delay before starting to reply + * @constant GEMMA_270M_BASE Gemma3 270M Base model - tiny, extremely fast, and good for text completion. Very limited world knowledge. + * @constant GEMMA_270M_BASE Gemma3 270M Instruct model - tiny, extremely fast, and good for very simple instructions. Very limited world knowledge. + * @constant GEMMA_1B_BASE Gemma3 1B Base model - fast and good for text completion. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA_1B_INSTRUCT Gemma3 1B Instruct model - fast and can follow instructions. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA_4B_BASE Gemma3 4B Base model - slower but good for text commpletion/pattern based completion. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA_4B_INSTRUCT Gemma3 4B Instruct model - slower but can follow complex instructions. Has decent world knowledge and multi-lingual abilities. + */ + enum language_model + { + QWEN3_0_6B_BASE = 4, + QWEN3_0_6B_INSTRUCT = 5, + QWEN3_0_6B_THINKING = 6, + QWEN3_1_7B_BASE = 8, + QWEN3_1_7B_INSTRUCT = 9, + QWEN3_1_7B_THINKING = 10, + QWEN3_4B_BASE = 12, + QWEN3_4B_INSTRUCT = 13, + QWEN3_4B_THINKING = 14, + GEMMA3_270M_BASE = 16, + GEMMA3_270M_INSTRUCT = 17, + GEMMA3_1B_BASE = 20, + GEMMA3_1B_INSTRUCT = 21, + GEMMA3_4B_BASE = 24, + GEMMA3_4B_INSTRUCT = 25, + }; + + /** + * Language model options allow you to customize the language model used. These should be + * initialised using functions such as `option_language_model`. + * + * @field name The name of the model (used in diagnostic messages). + * @field url A URL to download a model from. + * @field path A path to a custom language model (.gguf) file on your computer/a place to download it to. + * @field max_tokens The maximum number of tokens to output when replying. One word is approximately two tokens. + * @field temperature Increases the likelihood of unlikely tokens to be chosen. + * @field top_p Only choose from the top P most likely tokens. + * @field top_k Only choose from the top K most likely tokens. + * @field min_p Remove tokens less likely than P. + * @field presence_penalty Penalizes words that have been used once, making them less likely. Can reduce repetition. + * @field prompt_append A string to append to prompts automatically. 
+ */ + struct language_model_options + { + string name; + string url; + string path; + int max_tokens; + double temperature; + double top_p; + int top_k; + double min_p; + double presence_penalty; + string prompt_append; + }; } #endif /* types_hpp */ From 7d1ad32de14371ec57c56c6522fc196aa0fc4762 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Mon, 22 Dec 2025 07:09:35 +1100 Subject: [PATCH 09/19] Add default models and allow choosing model - also add various overloads --- coresdk/src/coresdk/genai.cpp | 205 ++++++++++++++++++++++++++++---- coresdk/src/coresdk/genai.h | 77 ++++++++++++ coresdk/src/test/test_genai.cpp | 2 +- 3 files changed, 263 insertions(+), 21 deletions(-) diff --git a/coresdk/src/coresdk/genai.cpp b/coresdk/src/coresdk/genai.cpp index 881a1455..1c923fd0 100644 --- a/coresdk/src/coresdk/genai.cpp +++ b/coresdk/src/coresdk/genai.cpp @@ -6,6 +6,7 @@ // #include "genai_driver.h" +#include "genai.h" #include "utility_functions.h" #include "web_driver.h" #include "terminal.h" @@ -16,6 +17,13 @@ using std::to_string; namespace splashkit_lib { + const language_model DEFAULT_LANGUAGE_MODEL = QWEN3_0_6B_INSTRUCT; + + const int default_max_tokens_base = 256; // base has a higher likelihood of running forever for no reason, better to limit it early + const int default_max_tokens_instruct = 4096; + const int default_max_tokens_thinking = 4096; + + extern const std::array models; // defined at end of file /* terminal util functions in lieu of ncurses*/ void terminal_erase_left(int count /* -1 for all */) @@ -72,7 +80,7 @@ namespace splashkit_lib string progress_bar = string(progress_bar_filled, '=') + string(progress_bar_length-progress_bar_filled, ' '); if (progress_bar_filled <= progress_bar_length) - progress_bar[progress_bar_filled] = spinner[spinner_index++ % spinner.size()]; + progress_bar[progress_bar_filled] = spinner[(spinner_index++)/2 % spinner.size()]; // write message terminal_push(progress_bar + "| (" + to_string(current_mb) + "mb / " + to_string(expected_mb) + "mb)"); @@ -101,38 +109,33 @@ namespace splashkit_lib return result; } - string generate_reply(string prompt) + string __generate_common(string prompt, language_model_options options, bool format_chat) { llamacpp::init(); - string path = path_from( {path_to_user_home(), ".splashkit", "models"} ); - path += "Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf"; - - string model_name = "Qwen3 4B Instruct"; - - if (!ensure_exists_or_download(path, - "https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/resolve/main/Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf?download=true", - " ::: Downloading Language Model: "+model_name + " |" - )) + if (options.url != "" && !ensure_exists_or_download(options.path, options.url, " ::: Downloading Language Model: " + options.name + " |")) { CLOG(ERROR, "GenAI") << "Failed to download language model - see error above."; return ""; } - // TODO: add auto download & choices for at least the following - //"Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf" - //"Qwen3-1.7B-Q8_0.gguf" - //"Qwen3-0.6B-Q8_0.gguf" - llamacpp::model model = llamacpp::create_model(path); + llamacpp::model model = llamacpp::create_model(options.path); if (!model.valid) return ""; - std::string formatted = llamacpp::format_chat(model, { - {"user", prompt} - }); + std::string formatted = prompt; + + if (format_chat) + { + llamacpp::format_chat(model, { + { + "user", prompt + options.prompt_append + } + }); + } llamacpp::llama_tokens tokens = llamacpp::tokenize_string(model, formatted); - llamacpp::context ctx = 
llamacpp::start_context(model, tokens, 4096); + llamacpp::context ctx = llamacpp::start_context(model, tokens, options.max_tokens); while (!llamacpp::context_step(ctx)){ // just wait until it completes // we could also stream the text to the user through a callback @@ -145,4 +148,166 @@ namespace splashkit_lib return result; } + + + string generate_reply(string prompt) + { + return generate_reply(DEFAULT_LANGUAGE_MODEL, prompt); + } + + string generate_reply(language_model model, string prompt) + { + return generate_reply(prompt, option_language_model(model)); + } + + string generate_reply(string prompt, language_model_options options) + { + return __generate_common(prompt, options, true); + } + + string generate_text(string text) + { + return generate_text(DEFAULT_LANGUAGE_MODEL, text); + } + + string generate_text(language_model model, string text) + { + return generate_text(text, option_language_model(model)); + } + + string generate_text(string text, language_model_options options) + { + return __generate_common(text, options, false); + } + + language_model_options option_language_model(language_model model) + { + if (model < 0 || model >= models.size() || models[model].name == "") + { + model = DEFAULT_LANGUAGE_MODEL; + CLOG(WARNING, "GenAI") << "Invalid model selected, defaulting to '" << models[model].name << "'"; + } + + string home_path = path_from( {path_to_user_home(), ".splashkit", "models"} ); + + language_model_options options = models[model]; + options.path = home_path + options.path; + + return options; + } + + // -------------------------------------------------------------- + + + // default model definitions + + const std::array models = {{ + [0]={}, [1]={}, [2]={}, [3]={}, + + [QWEN3_0_6B_BASE] = { + "Qwen3 0.6B Base", + "https://huggingface.co/mradermacher/Qwen3-0.6B-Base-GGUF/resolve/main/Qwen3-0.6B-Base.Q8_0.gguf?download=true", + "Qwen3-0.6B-Base.Q8_0.gguf", + default_max_tokens_base, 0.7, 0.8, 20, 0, 1.5 + }, + [QWEN3_0_6B_INSTRUCT] = { + "Qwen3 0.6B Instruct", + "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf?download=true", + "Qwen3-0.6B-Q8_0.gguf", + default_max_tokens_instruct, 0.7, 0.8, 20, 0, 1.5, " /no_think" + }, + [QWEN3_0_6B_THINKING] = { + "Qwen3 0.6B Thinking", + "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf?download=true", + "Qwen3-0.6B-Q8_0.gguf", + default_max_tokens_thinking, 0.6, 0.95, 20, 0, 1.5 + }, + + [7]={}, + + [QWEN3_1_7B_BASE] = { + "Qwen3 1.7B Base", + "https://huggingface.co/mradermacher/Qwen3-1.7B-Base-GGUF/resolve/main/Qwen3-1.7B-Base.Q8_0.gguf?download=true", + "Qwen3-1.7B-Base.Q8_0.gguf", + default_max_tokens_base, 0.7, 0.8, 20, 0, 1.5 + }, + [QWEN3_1_7B_INSTRUCT] = { + "Qwen3 1.7B Instruct", + "https://huggingface.co/Qwen/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q8_0.gguf?download=true", + "Qwen3-1.7B-Q8_0.gguf", + default_max_tokens_instruct, 0.7, 0.8, 20, 0, 1.5, " /no_think" + }, + [QWEN3_1_7B_THINKING] = { + "Qwen3 1.7B Thinking", + "https://huggingface.co/Qwen/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q8_0.gguf?download=true", + "Qwen3-1.7B-Q8_0.gguf", + default_max_tokens_thinking, 0.6, 0.95, 20, 0, 1.5 + }, + + [11]={}, + + [QWEN3_4B_BASE] = { + "Qwen3 4B Base", + "https://huggingface.co/mradermacher/Qwen3-4B-Base-GGUF/resolve/main/Qwen3-4B-Base.Q2_K.gguf?download=true", + "Qwen3-4B-Base.Q2_K.gguf", + default_max_tokens_base, 0.7, 0.8, 20, 0, 0 + }, + [QWEN3_4B_INSTRUCT] = { + "Qwen3 4B Instruct", + 
"https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/resolve/main/Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf?download=true", + "Qwen3-4B-Instruct-2507-UD-Q2_K_XL.gguf", + default_max_tokens_instruct, 0.7, 0.8, 20, 0, 0 + }, + [QWEN3_4B_THINKING] = { + "Qwen3 4B Thinking", + "https://huggingface.co/unsloth/Qwen3-4B-Thinking-2507-GGUF/resolve/main/Qwen3-4B-Thinking-2507-UD-Q2_K_XL.gguf?download=true", + "Qwen3-4B-Thinking-2507-UD-Q2_K_XL.gguf", + default_max_tokens_thinking, 0.6, 0.95, 20, 0, 0 + }, + + [15]={}, + + [GEMMA3_270M_BASE] = { + "Gemma3 270M Base", + "https://huggingface.co/ggml-org/gemma-3-270m-GGUF/resolve/main/gemma-3-270m-Q8_0.gguf?download=true", + "gemma-3-270m-Q8_0.gguf", + default_max_tokens_base, 1.0, 0.95, 64, 0, 0 + }, + [GEMMA3_270M_INSTRUCT] = { + "Gemma3 270M Instruct", + "https://huggingface.co/unsloth/gemma-3-270m-it-GGUF/resolve/main/gemma-3-270m-it-Q8_0.gguf?download=true", + "gemma-3-270m-it-Q8_0.gguf", + default_max_tokens_instruct, 1.0, 0.95, 64, 0, 0 + }, + + [18]={}, [19]={}, + + [GEMMA3_1B_BASE] = { + "Gemma3 1B Base", + "https://huggingface.co/mradermacher/gemma-3-1b-pt-GGUF/resolve/main/gemma-3-1b-pt.Q8_0.gguf?download=true", + "gemma-3-1b-pt.Q8_0.gguf", + default_max_tokens_base, 1.0, 0.95, 64, 0, 0 + }, + [GEMMA3_1B_INSTRUCT] = { + "Gemma3 1B Instruct", + "https://huggingface.co/unsloth/gemma-3-1b-it-GGUF/resolve/main/gemma-3-1b-it-Q8_0.gguf?download=true", + "gemma-3-1b-it-Q8_0.gguf", + default_max_tokens_instruct, 1.0, 0.95, 64, 0, 0 + }, + + [22]={}, [23]={}, + + [GEMMA3_4B_BASE] = { + "Gemma3 4B Base", + "https://huggingface.co/mradermacher/gemma-3-4b-pt-GGUF/resolve/main/gemma-3-4b-pt.Q2_K.gguf?download=true", + "gemma-3-4b-pt.Q2_K.gguf", + default_max_tokens_base, 1.0, 0.95, 64, 0, 0 + }, + [GEMMA3_4B_INSTRUCT] = { + "Gemma3 4B Instruct", + "https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/resolve/main/gemma-3-4b-it-UD-IQ3_XXS.gguf?download=true", + "gemma-3-4b-it-UD-IQ3_XXS.gguf", + default_max_tokens_instruct, 1.0, 0.95, 64, 0, 0 + } + }}; } diff --git a/coresdk/src/coresdk/genai.h b/coresdk/src/coresdk/genai.h index d1175872..c8e90eb6 100644 --- a/coresdk/src/coresdk/genai.h +++ b/coresdk/src/coresdk/genai.h @@ -11,6 +11,8 @@ #ifndef genai_hpp #define genai_hpp +#include "types.h" + #include #include @@ -23,6 +25,7 @@ namespace splashkit_lib * @brief Generates a reply to a textual prompt by a language model * * The language model will respond to the textual prompt in a chat style format. It will follow instructions and answer questions. + * Instruct or Thinking models are recommended. Base models likely won't output sensible results. * * @param prompt The prompt for the language model to reply to. * @@ -30,5 +33,79 @@ namespace splashkit_lib */ string generate_reply(string prompt); + /** + * @brief Generates a reply to a textual prompt by a language model + * + * The language model will respond to the textual prompt in a chat style format. It will follow instructions and answer questions. + * Instruct or Thinking models are recommended. Base models likely won't output sensible results. + * + * @param model The language model to use + * @param prompt The prompt for the language model to reply to. + * + * @returns The generated reply. + */ + string generate_reply(language_model model, string prompt); + + /** + * @brief Generates a reply to a textual prompt by a language model + * + * The language model will respond to the textual prompt in a chat style format. It will follow instructions and answer questions. 
+ * Instruct or Thinking models are recommended. Base models likely won't output sensible results. + * + * @param prompt The prompt for the language model to reply to. + * @param options The generation options - use the `option_` functions to create this, for instance `option_language_model` + * + * @returns The generated reply. + */ + string generate_reply(string prompt, language_model_options options); + + + /** + * @brief Generates text that continues from a prompt + * + * The language model will continue predicting text based on patterns in the prompt - it will not directly follow instructions or answer questions. + * Base models are recommended; Instruct and Thinking models may work. + * + * @param text The input text for the language model to continue. + * + * @returns The generated reply. + */ + string generate_text(string text); + + /** + * @brief Generates text that continues from a prompt + * + * The language model will continue predicting text based on patterns in the prompt - it will not directly follow instructions or answer questions. + * Base models are recommended; Instruct and Thinking models may work. + * + * @param model The language model to use + * @param text The input text for the language model to continue. + * + * @returns The generated reply. + */ + string generate_text(language_model model, string text); + + /** + * @brief Generates text that continues from a prompt + * + * The language model will continue predicting text based on patterns in the prompt - it will not directly follow instructions or answer questions. + * Base models are recommended; Instruct and Thinking models may work. + * + * @param text The input text for the language model to continue. + * @param options The generation options - use the `option_` functions to create this, for instance `option_language_model` + * + * @returns The generated reply. + */ + string generate_text(string text, language_model_options options); + + /** + * Use this option to choose which language model to use, and initialize its default settings + * + * @param model The language model to use + * + * @return Language model options that will use that model and its default settings. 
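+     *
+     * Example (illustrative):
+     *
+     *     language_model_options opts = option_language_model(QWEN3_1_7B_INSTRUCT);
+     *     opts.max_tokens = 512; // keep replies short
+     *     string reply = generate_reply("Name three colours.", opts);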
+ */ + language_model_options option_language_model(language_model model); + } #endif /* genai_hpp */ diff --git a/coresdk/src/test/test_genai.cpp b/coresdk/src/test/test_genai.cpp index 2cb16fca..84aba027 100644 --- a/coresdk/src/test/test_genai.cpp +++ b/coresdk/src/test/test_genai.cpp @@ -20,7 +20,7 @@ void run_genai_test() string prompt = read_line(); write("LLM\n> (generating...)"); - string response = generate_reply(prompt); + string response = generate_reply(QWEN3_0_6B_INSTRUCT, prompt); write_line("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\033[K" + response); delay(300); From 3f94760fb2797e52f3d8826a3ef87b89c3336ce9 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Mon, 22 Dec 2025 07:31:06 +1100 Subject: [PATCH 10/19] Pass inference settings to genai_driver --- coresdk/src/backend/genai_driver.cpp | 13 +++++++------ coresdk/src/backend/genai_driver.h | 10 +++++++++- coresdk/src/coresdk/genai.cpp | 8 +++++++- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/coresdk/src/backend/genai_driver.cpp b/coresdk/src/backend/genai_driver.cpp index a290e801..3e8e56c5 100644 --- a/coresdk/src/backend/genai_driver.cpp +++ b/coresdk/src/backend/genai_driver.cpp @@ -122,7 +122,7 @@ namespace splashkit_lib return prompt_tokens; } - context start_context(model& mdl, llama_tokens& starting_context, int max_length) + context start_context(model& mdl, llama_tokens& starting_context, int max_length, inference_settings settings) { // Create the context llama_context_params ctx_params = llama_context_default_params(); @@ -145,11 +145,12 @@ namespace splashkit_lib // Setup some reasonable defaults // TODO: Make these user adjustable - llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.00f, 1)); - llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.6f)); - llama_sampler_chain_add(smpl, llama_sampler_init_top_k(20)); - llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.95, 0)); - //llama_sampler_chain_add(smpl, llama_sampler_init_penalties(64, 1, 0, 0)); + llama_sampler_chain_add(smpl, llama_sampler_init_min_p(settings.min_p, 1)); + llama_sampler_chain_add(smpl, llama_sampler_init_temp(settings.temperature)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_k(settings.top_k)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_p(settings.top_p, 0)); + if (settings.presence_penalty > 0) + llama_sampler_chain_add(smpl, llama_sampler_init_penalties(64, 0, 0, settings.presence_penalty)); llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); // Prepare batch and encode starting context diff --git a/coresdk/src/backend/genai_driver.h b/coresdk/src/backend/genai_driver.h index 8f175017..658c5155 100644 --- a/coresdk/src/backend/genai_driver.h +++ b/coresdk/src/backend/genai_driver.h @@ -24,6 +24,14 @@ namespace splashkit_lib const char* tmpl; }; + struct inference_settings { + double temperature = 0.6; + double top_p = 0.95; + int top_k = 20; + double min_p = 0; + double presence_penalty = 0; + }; + struct message { std::string role; std::string content; @@ -51,7 +59,7 @@ namespace splashkit_lib std::string format_chat(model& mdl, const std::vector& messages); llama_tokens tokenize_string(model& mdl, const std::string& prompt); - context start_context(model& mdl, llama_tokens& starting_context, int max_length); + context start_context(model& mdl, llama_tokens& starting_context, int max_length, inference_settings settings); int context_step(context& ctx); void delete_context(context& ctx); } diff --git a/coresdk/src/coresdk/genai.cpp 
b/coresdk/src/coresdk/genai.cpp index 1c923fd0..e7f8435d 100644 --- a/coresdk/src/coresdk/genai.cpp +++ b/coresdk/src/coresdk/genai.cpp @@ -135,7 +135,13 @@ namespace splashkit_lib } llamacpp::llama_tokens tokens = llamacpp::tokenize_string(model, formatted); - llamacpp::context ctx = llamacpp::start_context(model, tokens, options.max_tokens); + llamacpp::context ctx = llamacpp::start_context(model, tokens, options.max_tokens, { + options.temperature, + options.top_p, + options.top_k, + options.min_p, + options.presence_penalty + }); while (!llamacpp::context_step(ctx)){ // just wait until it completes // we could also stream the text to the user through a callback From f5da09f9104d4ff32f4f87de50955d3b71c1414b Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Mon, 22 Dec 2025 07:31:23 +1100 Subject: [PATCH 11/19] genai_driver formatting fixes --- coresdk/src/backend/genai_driver.cpp | 6 +++--- coresdk/src/backend/genai_driver.h | 15 ++++++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/coresdk/src/backend/genai_driver.cpp b/coresdk/src/backend/genai_driver.cpp index 3e8e56c5..7b7814ec 100644 --- a/coresdk/src/backend/genai_driver.cpp +++ b/coresdk/src/backend/genai_driver.cpp @@ -14,7 +14,8 @@ namespace splashkit_lib { - namespace llamacpp { + namespace llamacpp + { static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data){/* nothing, avoid unnecessary logging*/} @@ -143,8 +144,7 @@ namespace splashkit_lib sparams.no_perf = true; llama_sampler * smpl = llama_sampler_chain_init(sparams); - // Setup some reasonable defaults - // TODO: Make these user adjustable + // Set up sampler llama_sampler_chain_add(smpl, llama_sampler_init_min_p(settings.min_p, 1)); llama_sampler_chain_add(smpl, llama_sampler_init_temp(settings.temperature)); llama_sampler_chain_add(smpl, llama_sampler_init_top_k(settings.top_k)); diff --git a/coresdk/src/backend/genai_driver.h b/coresdk/src/backend/genai_driver.h index 658c5155..a65d624e 100644 --- a/coresdk/src/backend/genai_driver.h +++ b/coresdk/src/backend/genai_driver.h @@ -16,15 +16,18 @@ namespace splashkit_lib { typedef unsigned int uint; - namespace llamacpp { - struct model { + namespace llamacpp + { + struct model + { bool valid; llama_model* model; const llama_vocab* vocab; const char* tmpl; }; - struct inference_settings { + struct inference_settings + { double temperature = 0.6; double top_p = 0.95; int top_k = 20; @@ -32,12 +35,14 @@ namespace splashkit_lib double presence_penalty = 0; }; - struct message { + struct message + { std::string role; std::string content; }; - struct context { + struct context + { llama_context* ctx; llama_sampler* smpl; llama_batch batch; From 1742e9b38c9f5b57e1b6f44deddef7c97a3ccba6 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Mon, 22 Dec 2025 08:53:08 +1100 Subject: [PATCH 12/19] Fix CMakeLists so llama.cpp links correctly on first make --- projects/cmake/CMakeLists.txt | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/projects/cmake/CMakeLists.txt b/projects/cmake/CMakeLists.txt index d14d57f7..6877d3cf 100644 --- a/projects/cmake/CMakeLists.txt +++ b/projects/cmake/CMakeLists.txt @@ -292,10 +292,16 @@ find_package(OpenMP REQUIRED) ExternalProject_Get_Property(llama_ext INSTALL_DIR) -set(LLAMA_LIB_FLAGS "${INSTALL_DIR}/lib/libllama.a" - "${INSTALL_DIR}/lib/libggml.a" - "${INSTALL_DIR}/lib/libggml-cpu.a" - "${INSTALL_DIR}/lib/libggml-base.a") +foreach(lib libllama libggml libggml-cpu libggml-base) + add_library(${lib} 
STATIC IMPORTED GLOBAL) + set_target_properties(${lib} PROPERTIES + IMPORTED_LOCATION + ${INSTALL_DIR}/lib/${lib}.a + ) + add_dependencies(${lib} llama_ext) +endforeach() + +set(LLAMA_LIB_FLAGS libllama libggml libggml-cpu libggml-base) # MACRO DEFINITIONS # add_definitions(-DELPP_THREAD_SAFE) From 47ddd98f7fd71ee168be4c61c015f2dbebeb6258 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Mon, 22 Dec 2025 23:55:58 +1100 Subject: [PATCH 13/19] Remove OpenMP dependency --- projects/cmake/CMakeLists.txt | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/projects/cmake/CMakeLists.txt b/projects/cmake/CMakeLists.txt index 6877d3cf..30a3ce95 100644 --- a/projects/cmake/CMakeLists.txt +++ b/projects/cmake/CMakeLists.txt @@ -276,20 +276,17 @@ ExternalProject_Add( -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_CUBLAS=OFF - -DLLAMA_CLBLAST=OFF -DLLAMA_METAL=OFF - -DLLAMA_OPENCL=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=OFF -DLLAMA_TOOLS_INSTALL=OFF -DCMAKE_BUILD_TYPE=Release -DGGML_STATIC=ON - -DLLAMA_STATIC=ON + -DGGML_OPENMP=OFF + -DLLAMA_OPENMP=OFF -DCMAKE_INSTALL_PREFIX= ) -find_package(OpenMP REQUIRED) - ExternalProject_Get_Property(llama_ext INSTALL_DIR) foreach(lib libllama libggml libggml-cpu libggml-base) @@ -309,7 +306,7 @@ add_definitions(-DELPP_THREAD_SAFE) #### END SETUP #### #### SplashKitBackend STATIC LIBRARY #### add_library(SplashKitBackend STATIC ${SOURCE_FILES} ${INCLUDE_FILES}) -target_link_libraries(SplashKitBackend ${LIB_FLAGS} ${LLAMA_LIB_FLAGS} OpenMP::OpenMP_CXX) +target_link_libraries(SplashKitBackend ${LIB_FLAGS} ${LLAMA_LIB_FLAGS}) if(RASPBERRY_PI) if(RASPBERRY_PI_5) From 7af66dcba18ec010e2585668fffe3caa1290bd1b Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Tue, 23 Dec 2025 07:46:13 +1100 Subject: [PATCH 14/19] Fix llama.cpp linking on windows --- projects/cmake/CMakeLists.txt | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/projects/cmake/CMakeLists.txt b/projects/cmake/CMakeLists.txt index 30a3ce95..84cc863c 100644 --- a/projects/cmake/CMakeLists.txt +++ b/projects/cmake/CMakeLists.txt @@ -289,16 +289,23 @@ ExternalProject_Add( ExternalProject_Get_Property(llama_ext INSTALL_DIR) -foreach(lib libllama libggml libggml-cpu libggml-base) +foreach(lib llama ggml ggml-cpu ggml-base) add_library(${lib} STATIC IMPORTED GLOBAL) - set_target_properties(${lib} PROPERTIES - IMPORTED_LOCATION - ${INSTALL_DIR}/lib/${lib}.a - ) + if (MSYS AND NOT "${lib}" STREQUAL "llama") # llama still ends up as libllama.a on Windows, unsure why + set_target_properties(${lib} PROPERTIES + IMPORTED_LOCATION + ${INSTALL_DIR}/lib/${lib}.a # no lib prefix + ) + else() + set_target_properties(${lib} PROPERTIES + IMPORTED_LOCATION + ${INSTALL_DIR}/lib/lib${lib}.a # lib prefix + ) + endif() add_dependencies(${lib} llama_ext) endforeach() -set(LLAMA_LIB_FLAGS libllama libggml libggml-cpu libggml-base) +set(LLAMA_LIB_FLAGS llama ggml ggml-cpu ggml-base) # MACRO DEFINITIONS # add_definitions(-DELPP_THREAD_SAFE) From b7503f0628237fe6b407bdd7d7f50f3ba360f0b3 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Tue, 23 Dec 2025 07:46:51 +1100 Subject: [PATCH 15/19] Fix llama.cpp flags for MacOS --- projects/cmake/CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/projects/cmake/CMakeLists.txt b/projects/cmake/CMakeLists.txt index 84cc863c..017e2a27 100644 --- a/projects/cmake/CMakeLists.txt +++ b/projects/cmake/CMakeLists.txt @@ -45,6 +45,7 @@ if (APPLE) -framework AudioToolbox \ -framework 
CoreAudio \ -framework CoreVideo \ + -framework Accelerate \ -lSDL2 \ -lSDL2_mixer \ -lSDL2_ttf \ @@ -275,15 +276,15 @@ ExternalProject_Add( -DLLAMA_BUILD_TOOLS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF - -DLLAMA_CUBLAS=OFF - -DLLAMA_METAL=OFF + -DGGML_BLAS=OFF + -DGGML_METAL=OFF + -DGGML_VULKAN=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=OFF -DLLAMA_TOOLS_INSTALL=OFF -DCMAKE_BUILD_TYPE=Release -DGGML_STATIC=ON -DGGML_OPENMP=OFF - -DLLAMA_OPENMP=OFF -DCMAKE_INSTALL_PREFIX= ) From 2af41de74e540df83221d88feb042ee3dae89213 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Tue, 23 Dec 2025 02:28:38 +1100 Subject: [PATCH 16/19] Fix genai enum header docs --- coresdk/src/coresdk/types.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/coresdk/src/coresdk/types.h b/coresdk/src/coresdk/types.h index 78ff4cae..9df35575 100644 --- a/coresdk/src/coresdk/types.h +++ b/coresdk/src/coresdk/types.h @@ -563,12 +563,12 @@ namespace splashkit_lib * @constant QWEN3_4B_BASE Qwen3 4B Base model - slower but excellent for text commpletion/pattern based completion * @constant QWEN3_4B_INSTRUCT Qwen3 4B Instruct model - slower but can follow complex instructions * @constant QWEN3_4B_THINKING Qwen3 4B Thinking model - slower but can follow complex and specific instructions, but has a potentially long delay before starting to reply - * @constant GEMMA_270M_BASE Gemma3 270M Base model - tiny, extremely fast, and good for text completion. Very limited world knowledge. - * @constant GEMMA_270M_BASE Gemma3 270M Instruct model - tiny, extremely fast, and good for very simple instructions. Very limited world knowledge. - * @constant GEMMA_1B_BASE Gemma3 1B Base model - fast and good for text completion. Has decent world knowledge and multi-lingual abilities. - * @constant GEMMA_1B_INSTRUCT Gemma3 1B Instruct model - fast and can follow instructions. Has decent world knowledge and multi-lingual abilities. - * @constant GEMMA_4B_BASE Gemma3 4B Base model - slower but good for text commpletion/pattern based completion. Has decent world knowledge and multi-lingual abilities. - * @constant GEMMA_4B_INSTRUCT Gemma3 4B Instruct model - slower but can follow complex instructions. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA3_270M_BASE Gemma3 270M Base model - tiny, extremely fast, and good for text completion. Very limited world knowledge. + * @constant GEMMA3_270M_INSTRUCT Gemma3 270M Instruct model - tiny, extremely fast, and good for very simple instructions. Very limited world knowledge. + * @constant GEMMA3_1B_BASE Gemma3 1B Base model - fast and good for text completion. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA3_1B_INSTRUCT Gemma3 1B Instruct model - fast and can follow instructions. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA3_4B_BASE Gemma3 4B Base model - slower but good for text commpletion/pattern based completion. Has decent world knowledge and multi-lingual abilities. + * @constant GEMMA3_4B_INSTRUCT Gemma3 4B Instruct model - slower but can follow complex instructions. Has decent world knowledge and multi-lingual abilities. 
*/ enum language_model { From 5afa6f0048d6bed7b55b995dcbba8186ce2705e9 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Tue, 23 Dec 2025 02:28:47 +1100 Subject: [PATCH 17/19] Fix genai function header docs --- coresdk/src/coresdk/genai.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/coresdk/src/coresdk/genai.h b/coresdk/src/coresdk/genai.h index c8e90eb6..cc51ce70 100644 --- a/coresdk/src/coresdk/genai.h +++ b/coresdk/src/coresdk/genai.h @@ -43,6 +43,8 @@ namespace splashkit_lib * @param prompt The prompt for the language model to reply to. * * @returns The generated reply. + * + * @attribute suffix with_model */ string generate_reply(language_model model, string prompt); @@ -56,6 +58,8 @@ namespace splashkit_lib * @param options The generation options - use the `option_` functions to create this, for instance `option_language_model` * * @returns The generated reply. + * + * @attribute suffix with_options */ string generate_reply(string prompt, language_model_options options); @@ -82,6 +86,8 @@ namespace splashkit_lib * @param text The input text for the language model to continue. * * @returns The generated reply. + * + * @attribute suffix with_model */ string generate_text(language_model model, string text); @@ -95,6 +101,8 @@ namespace splashkit_lib * @param options The generation options - use the `option_` functions to create this, for instance `option_language_model` * * @returns The generated reply. + * + * @attribute suffix with_options */ string generate_text(string text, language_model_options options); From 9341ffbc8aecb4991a6a718bdce9d0d379ea8908 Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Tue, 23 Dec 2025 00:26:25 +1100 Subject: [PATCH 18/19] Make generations reproducible, fixed seed --- coresdk/src/backend/genai_driver.cpp | 6 +++--- coresdk/src/backend/genai_driver.h | 4 +++- coresdk/src/coresdk/genai.cpp | 7 +++++-- coresdk/src/coresdk/types.h | 1 + 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/coresdk/src/backend/genai_driver.cpp b/coresdk/src/backend/genai_driver.cpp index 7b7814ec..3b586fc7 100644 --- a/coresdk/src/backend/genai_driver.cpp +++ b/coresdk/src/backend/genai_driver.cpp @@ -123,11 +123,11 @@ namespace splashkit_lib return prompt_tokens; } - context start_context(model& mdl, llama_tokens& starting_context, int max_length, inference_settings settings) + context start_context(model& mdl, llama_tokens& starting_context, inference_settings settings) { // Create the context llama_context_params ctx_params = llama_context_default_params(); - ctx_params.n_ctx = starting_context.size() + max_length - 1; + ctx_params.n_ctx = starting_context.size() + settings.max_length - 1; ctx_params.n_batch = starting_context.size(); ctx_params.no_perf = true; @@ -151,7 +151,7 @@ namespace splashkit_lib llama_sampler_chain_add(smpl, llama_sampler_init_top_p(settings.top_p, 0)); if (settings.presence_penalty > 0) llama_sampler_chain_add(smpl, llama_sampler_init_penalties(64, 0, 0, settings.presence_penalty)); - llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); + llama_sampler_chain_add(smpl, llama_sampler_init_dist(settings.seed)); // Prepare batch and encode starting context llama_batch batch = llama_batch_get_one(starting_context.data(), starting_context.size()); diff --git a/coresdk/src/backend/genai_driver.h b/coresdk/src/backend/genai_driver.h index a65d624e..bda8a46e 100644 --- a/coresdk/src/backend/genai_driver.h +++ b/coresdk/src/backend/genai_driver.h @@ -33,6 +33,8 @@ namespace splashkit_lib int top_k = 20; 
double min_p = 0; double presence_penalty = 0; + int max_length = 256; + uint32_t seed = 42; }; struct message @@ -64,7 +66,7 @@ namespace splashkit_lib std::string format_chat(model& mdl, const std::vector& messages); llama_tokens tokenize_string(model& mdl, const std::string& prompt); - context start_context(model& mdl, llama_tokens& starting_context, int max_length, inference_settings settings); + context start_context(model& mdl, llama_tokens& starting_context, inference_settings settings); int context_step(context& ctx); void delete_context(context& ctx); } diff --git a/coresdk/src/coresdk/genai.cpp b/coresdk/src/coresdk/genai.cpp index e7f8435d..362084fb 100644 --- a/coresdk/src/coresdk/genai.cpp +++ b/coresdk/src/coresdk/genai.cpp @@ -135,12 +135,14 @@ namespace splashkit_lib } llamacpp::llama_tokens tokens = llamacpp::tokenize_string(model, formatted); - llamacpp::context ctx = llamacpp::start_context(model, tokens, options.max_tokens, { + llamacpp::context ctx = llamacpp::start_context(model, tokens, { options.temperature, options.top_p, options.top_k, options.min_p, - options.presence_penalty + options.presence_penalty, + options.max_tokens, + (uint32_t)options.seed }); while (!llamacpp::context_step(ctx)){ // just wait until it completes @@ -198,6 +200,7 @@ namespace splashkit_lib language_model_options options = models[model]; options.path = home_path + options.path; + options.seed = 0; return options; } diff --git a/coresdk/src/coresdk/types.h b/coresdk/src/coresdk/types.h index 9df35575..c46ce9f6 100644 --- a/coresdk/src/coresdk/types.h +++ b/coresdk/src/coresdk/types.h @@ -616,6 +616,7 @@ namespace splashkit_lib double min_p; double presence_penalty; string prompt_append; + int seed; }; } #endif /* types_hpp */ From dbb176291a52a7094f9ce8b285156420084fd93c Mon Sep 17 00:00:00 2001 From: Sean Boettger Date: Tue, 23 Dec 2025 07:42:03 +1100 Subject: [PATCH 19/19] Conversation & Streaming support + some refactoring + updated test program --- coresdk/src/backend/backend_types.h | 1 + coresdk/src/backend/genai_driver.cpp | 115 ++++++++++----- coresdk/src/backend/genai_driver.h | 47 ++++++- coresdk/src/coresdk/genai.cpp | 201 +++++++++++++++++++++++++-- coresdk/src/coresdk/genai.h | 139 ++++++++++++++++++ coresdk/src/test/test_genai.cpp | 59 ++++++-- 6 files changed, 501 insertions(+), 61 deletions(-) diff --git a/coresdk/src/backend/backend_types.h b/coresdk/src/backend/backend_types.h index 03083a86..2eeead1e 100644 --- a/coresdk/src/backend/backend_types.h +++ b/coresdk/src/backend/backend_types.h @@ -64,6 +64,7 @@ namespace splashkit_lib ADC_PTR= 0x41444350, //'ADCP'; MOTOR_DRIVER_PTR = 0x4d444950, //'MDIP'; SERVO_DRIVER_PTR = 0x53455256, //'SERV'; + CONVERSATION_PTR = 0x434f4e56, //'CONV'; NONE_PTR = 0x4e4f4e45 //'NONE'; }; diff --git a/coresdk/src/backend/genai_driver.cpp b/coresdk/src/backend/genai_driver.cpp index 3b586fc7..c6f3a05b 100644 --- a/coresdk/src/backend/genai_driver.cpp +++ b/coresdk/src/backend/genai_driver.cpp @@ -60,6 +60,13 @@ namespace splashkit_lib return {false}; } + if (llama_model_has_encoder(model)) + { + llama_model_free(model); + CLOG(ERROR, "GenAI") << "Unsupported model, requires encoder-decoder support."; + return {false}; + } + const llama_vocab * vocab = llama_model_get_vocab(model); const char* tmpl = llama_model_chat_template(model, /* name */ nullptr); @@ -82,7 +89,7 @@ namespace splashkit_lib llama_model_free(mdl.model); } - std::string format_chat(model& mdl, const std::vector& messages) + std::string format_chat(model& mdl, const 
std::vector<message>& messages, bool add_assistant)
     {
         std::vector<llama_chat_message> llama_formatted;
         std::vector<char> formatted(0);
@@ -94,27 +101,27 @@ namespace splashkit_lib
             llama_formatted.push_back({msg.role.c_str(), msg.content.c_str()});
         }
 
-        int new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), true, formatted.data(), formatted.size());
+        int new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), add_assistant, formatted.data(), formatted.size());
         if (new_len > (int)formatted.size())
         {
             formatted.resize(new_len);
-            new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), true, formatted.data(), formatted.size());
+            new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), add_assistant, formatted.data(), formatted.size());
         }
 
         return std::string(formatted.begin(), formatted.end());
     }
 
-    llama_tokens tokenize_string(model& mdl, const std::string& prompt)
+    llama_tokens tokenize_string(model& mdl, const std::string& prompt, bool is_first)
     {
         // get token count
         // note: returns a negative number, the count of tokens it would have returned if the buffer was large enough
-        const int n_prompt = -llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), NULL, 0, true, true);
+        const int n_prompt = -llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), NULL, 0, is_first, true);
 
         // create buffer
         std::vector<llama_token> prompt_tokens(n_prompt);
 
         // receive the tokens
-        if (llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0)
+        if (llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first, true) < 0)
         {
             CLOG(ERROR, "GenAI") << "Failed to tokenize the prompt.";
             return {};
@@ -128,7 +135,7 @@ namespace splashkit_lib
         // Create the context
         llama_context_params ctx_params = llama_context_default_params();
         ctx_params.n_ctx = starting_context.size() + settings.max_length - 1;
-        ctx_params.n_batch = starting_context.size();
+        ctx_params.n_batch = ctx_params.n_ctx;
         ctx_params.no_perf = true;
 
         llama_context * ctx = llama_init_from_model(mdl.model, ctx_params);
@@ -153,60 +160,58 @@ namespace splashkit_lib
             llama_sampler_chain_add(smpl, llama_sampler_init_penalties(64, 0, 0, settings.presence_penalty));
         llama_sampler_chain_add(smpl, llama_sampler_init_dist(settings.seed));
 
-        // Prepare batch and encode starting context
-        llama_batch batch = llama_batch_get_one(starting_context.data(), starting_context.size());
+        // Prepare batch for starting context
+        llama_tokens next_batch = starting_context;
 
-        if (llama_model_has_encoder(mdl.model))
-        {
-            if (llama_encode(ctx, batch))
-            {
-                llama_free(ctx);
-                llama_sampler_free(smpl);
-                CLOG(ERROR, "GenAI") << "Failed to encode prompt.";
-                return {nullptr};
-            }
-
-            llama_token decoder_start_token_id = llama_model_decoder_start_token(mdl.model);
-            if (decoder_start_token_id == LLAMA_TOKEN_NULL)
-            {
-                decoder_start_token_id = llama_vocab_bos(mdl.vocab);
-            }
-
-            batch = llama_batch_get_one(&decoder_start_token_id, 1);
-        }
+        // Cache newline token - we use this manually in some spots
+        llama_token newline_token;
+        llama_tokenize(mdl.vocab, "\n", 1, &newline_token, 1, false, true);
 
         return {
             ctx,
             smpl,
-            batch,
+            next_batch,
             (int)ctx_params.n_ctx,
             mdl.vocab,
+            newline_token,
             0,
-            ""
+            {},
+            false
         };
     }
 
-    int context_step(context& ctx)
+    int context_step(context& ctx, token_result* token)
     {
+        const string THINKING_START = "<think>";
+        const string THINKING_END = "</think>";
+
         if (!ctx.ctx) return -1;
 
+        llama_batch batch = llama_batch_get_one(ctx.next_batch.data(), ctx.next_batch.size());
+
         // Decode current batch with the model
-        if (llama_decode(ctx.ctx, ctx.batch))
+        if (llama_decode(ctx.ctx, batch))
         {
             CLOG(ERROR, "GenAI") << "Failed to process response from language model.";
+            if (token)
+                token->type = token_result::NONE;
             return -1;
         }
-        ctx.n_pos += ctx.batch.n_tokens;
+        ctx.total_context.insert(ctx.total_context.end(), ctx.next_batch.begin(), ctx.next_batch.end());
+        ctx.n_pos += batch.n_tokens;
 
         // Sample next token
         llama_token new_token_id = llama_sampler_sample(ctx.smpl, ctx.ctx, -1);
 
         // Has the model finished its response?
         if (llama_vocab_is_eog(ctx.vocab, new_token_id))
+        {
+            if (token)
+                token->type = token_result::NONE;
             return 1;
+        }
 
         char buf[128];
         int n = llama_token_to_piece(ctx.vocab, new_token_id, buf, sizeof(buf), 0, true);
@@ -217,19 +222,46 @@
         }
 
         std::string s(buf, n);
-        ctx.ctx_string += s;
+
+        if (token)
+        {
+            bool is_meta = s == THINKING_START || s == THINKING_END;
+            token->text = s;
+            if (is_meta)
+                token->type = token_result::META;
+            else if (ctx.in_thinking)
+                token->type = token_result::THINKING;
+            else
+                token->type = token_result::CONTENT;
+        }
+
+        if (s == THINKING_START)
+            ctx.in_thinking = true;
+        else if (s == THINKING_END)
+            ctx.in_thinking = false;
 
         // prepare the next batch with the sampled token
-        ctx.batch = llama_batch_get_one(&new_token_id, 1);
+        ctx.next_batch = {new_token_id};
 
         // Have we reached the end of the context?
         // If so, stop now.
-        if (ctx.n_pos + ctx.batch.n_tokens >= ctx.ctx_size)
+        if (ctx.n_pos + ctx.next_batch.size() >= ctx.ctx_size)
             return 1;
 
         return 0;
     }
 
+    void add_to_context(context& ctx, llama_tokens& message)
+    {
+        ctx.next_batch.insert(ctx.next_batch.end(), message.begin(), message.end());
+    }
+
+    void manual_end_message(context& ctx)
+    {
+        ctx.next_batch.push_back(llama_vocab_eot(ctx.vocab));
+        ctx.next_batch.push_back(ctx.newline_token);
+    }
+
     void delete_context(context& ctx)
     {
         if (ctx.smpl)
@@ -238,5 +270,18 @@
         if (ctx.ctx)
             llama_free(ctx.ctx);
     }
+
+    void __print_debug_context(context& ctx)
+    {
+        for (auto& x : ctx.total_context)
+        {
+            char buf[128];
+            int n = llama_token_to_piece(ctx.vocab, x, buf, sizeof(buf), 0, true);
+
+            std::string s(buf, n);
+            std::cout << "|" << s;
+        }
+        std::cout << std::endl;
+    }
 }
 }
diff --git a/coresdk/src/backend/genai_driver.h b/coresdk/src/backend/genai_driver.h
index bda8a46e..b24c3e91 100644
--- a/coresdk/src/backend/genai_driver.h
+++ b/coresdk/src/backend/genai_driver.h
@@ -18,6 +18,8 @@ namespace splashkit_lib
 
     namespace llamacpp
     {
+        typedef std::vector<llama_token> llama_tokens;
+
         struct model
         {
             bool valid;
@@ -47,29 +49,62 @@ namespace splashkit_lib
         {
             llama_context* ctx;
             llama_sampler* smpl;
-            llama_batch batch;
+            llama_tokens next_batch;
             int ctx_size = 0;
             const llama_vocab* vocab;
+            llama_token newline_token;
 
             int n_pos;
-            std::string ctx_string;
+            llama_tokens total_context;
+
+            bool in_thinking = false;
         };
 
-        typedef std::vector<llama_token> llama_tokens;
+        struct token_result
+        {
+            enum token_type {
+                NONE,
+                CONTENT,
+                THINKING,
+                META
+            };
+            string text;
+            token_type type;
+        };
 
         void init();
 
         model create_model(std::string path);
         void delete_model(model mdl);
 
-        std::string format_chat(model& mdl, const std::vector<message>& messages);
-        llama_tokens tokenize_string(model& mdl, const std::string& prompt);
+        std::string format_chat(model& mdl, const std::vector<message>& messages, bool add_assistant);
+        llama_tokens tokenize_string(model& mdl, const std::string&
prompt, bool is_first); context start_context(model& mdl, llama_tokens& starting_context, inference_settings settings); - int context_step(context& ctx); void delete_context(context& ctx); + + int context_step(context& ctx, token_result* token); + void add_to_context(context& ctx, llama_tokens& message); + void manual_end_message(context& ctx); + + void __print_debug_context(context& ctx); } + + struct sk_conversation + { + pointer_identifier id; + + llamacpp::model model; + llamacpp::context context; + + bool was_generating; + bool is_generating; + + string prompt_append; + + llamacpp::token_result next_token; + }; } #endif /* defined(graphics_driver) */ diff --git a/coresdk/src/coresdk/genai.cpp b/coresdk/src/coresdk/genai.cpp index 362084fb..cde93610 100644 --- a/coresdk/src/coresdk/genai.cpp +++ b/coresdk/src/coresdk/genai.cpp @@ -10,6 +10,7 @@ #include "utility_functions.h" #include "web_driver.h" #include "terminal.h" +#include "core_driver.h" #include @@ -17,6 +18,8 @@ using std::to_string; namespace splashkit_lib { + static vector objects; + const language_model DEFAULT_LANGUAGE_MODEL = QWEN3_0_6B_INSTRUCT; const int default_max_tokens_base = 256; // base has a higher likelihood of running forever for no reason, better to limit it early @@ -109,17 +112,22 @@ namespace splashkit_lib return result; } - string __generate_common(string prompt, language_model_options options, bool format_chat) + llamacpp::model __get_model(language_model_options options) { llamacpp::init(); if (options.url != "" && !ensure_exists_or_download(options.path, options.url, " ::: Downloading Language Model: " + options.name + " |")) { CLOG(ERROR, "GenAI") << "Failed to download language model - see error above."; - return ""; + return {false}; } - llamacpp::model model = llamacpp::create_model(options.path); + return llamacpp::create_model(options.path); + } + + string __generate_common(string prompt, language_model_options options, bool format_chat) + { + llamacpp::model model = __get_model(options); if (!model.valid) return ""; @@ -127,13 +135,13 @@ namespace splashkit_lib if (format_chat) { - llamacpp::format_chat(model, { + formatted = llamacpp::format_chat(model, { { "user", prompt + options.prompt_append - } - }); + }, + }, true); } - llamacpp::llama_tokens tokens = llamacpp::tokenize_string(model, formatted); + llamacpp::llama_tokens tokens = llamacpp::tokenize_string(model, formatted, true); llamacpp::context ctx = llamacpp::start_context(model, tokens, { options.temperature, @@ -144,12 +152,15 @@ namespace splashkit_lib options.max_tokens, (uint32_t)options.seed }); - while (!llamacpp::context_step(ctx)){ - // just wait until it completes - // we could also stream the text to the user through a callback - }; - std::string result = ctx.ctx_string; + std::string result = ""; + llamacpp::token_result token; + + while (!llamacpp::context_step(ctx, &token)) + { + if (token.type == llamacpp::token_result::CONTENT) + result += token.text; + }; llamacpp::delete_context(ctx); llamacpp::delete_model(model); @@ -188,6 +199,172 @@ namespace splashkit_lib return __generate_common(text, options, false); } + // -------------------------------------------------------------- + + // Streaming conversation + + #define CONVERSATION_CHECK(x, val) \ + if (INVALID_PTR(c, CONVERSATION_PTR))\ + {\ + LOG(WARNING) << "Passed an invalid conversation object to " x;\ + return val;\ + } + + conversation create_conversation() + { + return create_conversation(option_language_model(DEFAULT_LANGUAGE_MODEL)); + } + + conversation 
create_conversation(language_model model) + { + return create_conversation(option_language_model(model)); + } + + conversation create_conversation(language_model_options options) + { + internal_sk_init(); + + llamacpp::model model = __get_model(options); + + if (!model.valid) return nullptr; + + llamacpp::llama_tokens initial_tokens = llamacpp::tokenize_string(model, "", true); + + sk_conversation* c = new sk_conversation(); + c->id = CONVERSATION_PTR; + c->model = model; + c->context = llamacpp::start_context(model, initial_tokens, { + options.temperature, + options.top_p, + options.top_k, + options.min_p, + options.presence_penalty, + options.max_tokens, + (uint32_t)options.seed + });; + + c->was_generating = false; + c->is_generating = true; + + c->prompt_append = options.prompt_append; + + objects.push_back(c); + + return c; + }; + + void conversation_add_message(conversation c, const string& message) + { + CONVERSATION_CHECK("conversation_add_message", ) + + // end the language model's turn + if (c->was_generating) + { + c->was_generating = false; + llamacpp::manual_end_message(c->context); + } + + // tokenize user's prompt and add to context + llamacpp::llama_tokens tokens = llamacpp::tokenize_string(c->model, llamacpp::format_chat(c->model, { + {"user", message + c->prompt_append} + }, true), false); + llamacpp::add_to_context(c->context, tokens); + + // the model is ready to generate again + c->is_generating = true; + } + + void __buffer_next_token(conversation c) + { + if (c->next_token.type != llamacpp::token_result::token_type::NONE) + return; // already buffered + + // attempt to get next token that is non-meta + do + { + // if we reach the end of the message, return even if a meta token (shouldn't happen though) + if (llamacpp::context_step(c->context, &c->next_token)) + { + c->is_generating = false; + return; + } + } while (c->next_token.type == llamacpp::token_result::token_type::META); + } + + // These next three functions buffer the next token so that they can + // return information about it + bool conversation_is_replying(conversation c) + { + CONVERSATION_CHECK("conversation_is_replying", false) + + __buffer_next_token(c); + + return c->is_generating; + } + + bool conversation_is_thinking(conversation c) + { + CONVERSATION_CHECK("conversation_is_thinking", false) + + __buffer_next_token(c); + + return c->next_token.type == llamacpp::token_result::token_type::THINKING; + } + + string conversation_get_reply_piece(conversation c) + { + CONVERSATION_CHECK("conversation_get_reply_piece", "") + + // if the user wants a token, we can resume generating even if we already finished + c->is_generating = true; + c->was_generating = true; + + __buffer_next_token(c); + + // token is consumed + c->next_token.type = llamacpp::token_result::token_type::NONE; + + return c->next_token.text; + } + + void __free_conversation_resource(conversation c) + { + llamacpp::delete_context(c->context); + llamacpp::delete_model(c->model); + } + + void free_conversation(conversation c) + { + CONVERSATION_CHECK("free_conversation", ) + + __free_conversation_resource(c); + + for (auto it = objects.begin(); it != objects.end(); it++) + { + if (*it == c) + { + notify_of_free(c); + + delete *it; + + it = objects.erase(it); + return; + } + } + } + + void free_all_conversations() + { + for (conversation c : objects) + { + __free_conversation_resource(c); + } + + objects.clear(); + } + + // -------------------------------------------------------------- + language_model_options 
option_language_model(language_model model)
     {
         if (model < 0 || model >= models.size() || models[model].name == "")
diff --git a/coresdk/src/coresdk/genai.h b/coresdk/src/coresdk/genai.h
index cc51ce70..c57f1fa8 100644
--- a/coresdk/src/coresdk/genai.h
+++ b/coresdk/src/coresdk/genai.h
@@ -20,6 +20,26 @@ using std::string;
 
 namespace splashkit_lib
 {
+    /**
+     * The `conversation` type is used to refer to conversations between the user
+     * and a language model. You can use it to send messages to the language model,
+     * and stream responses back.
+     *
+     *
+     * All `conversation` objects are:
+     *
+     *
+     * - created with `create_conversation()`, `create_conversation(language_model model)` or
+     * `create_conversation(language_model_options options)`
+     *
+     *
+     * - and must be released using `free_conversation()` (to release a specific `conversation` object)
+     * or `free_all_conversations()` (to release all created `conversation` objects).
+     *
+     *
+     * @attribute class conversation
+     */
+    typedef struct sk_conversation *conversation;
 
     /**
      * @brief Generates a reply to a textual prompt by a language model
@@ -106,6 +126,125 @@ namespace splashkit_lib
      */
     string generate_text(string text, language_model_options options);
 
+    /**
+     * @brief Creates a new `conversation` object that uses the default language model.
+     *
+     * The `conversation` object can have messages added to it, and responses streamed back from it via the other Conversation functions and procedures.
+     *
+     * @returns Returns a new `conversation` object.
+     *
+     * @attribute class conversation
+     * @attribute constructor true
+     */
+    conversation create_conversation();
+
+    /**
+     * @brief Creates a new `conversation` object that uses a chosen language model.
+     *
+     * The `conversation` object can have messages added to it, and responses streamed back from it via the other Conversation functions and procedures.
+     *
+     * @param model The language model to use.
+     *
+     * @returns Returns a new `conversation` object.
+     *
+     * @attribute class conversation
+     * @attribute constructor true
+     *
+     * @attribute suffix with_model
+     */
+    conversation create_conversation(language_model model);
+
+    /**
+     * @brief Creates a new `conversation` object that uses a chosen language model, among other options.
+     *
+     * The `conversation` object can have messages added to it, and responses streamed back from it via the other Conversation functions and procedures.
+     *
+     * @param options The options to use - use this to choose the language model, and change various parameters.
+     *
+     * @returns Returns a new `conversation` object.
+     *
+     * @attribute class conversation
+     * @attribute constructor true
+     *
+     * @attribute suffix with_options
+     */
+    conversation create_conversation(language_model_options options);
+
+    /**
+     * Checks if a language model is currently generating a reply within a `conversation`.
+     * If so, you can continue to receive the message with `conversation_get_reply_piece(conversation c)`.
+     *
+     * @param c The `conversation` object to check
+     *
+     * @returns Returns whether the language model is still generating a reply
+     *
+     * @attribute class conversation
+     * @attribute method is_replying
+     * @attribute self c
+     */
+    bool conversation_is_replying(conversation c);
+
+    /**
+     * Checks if a language model is currently "thinking" while generating a reply within a `conversation`.
+     * You can use this to filter out the "thoughts" and display them differently (or hide them entirely).
+     *
+     * @param c The `conversation` object to check
+     *
+     * @returns Returns whether the language model is currently thinking while generating a reply
+     *
+     * @attribute class conversation
+     * @attribute method is_thinking
+     * @attribute self c
+     */
+    bool conversation_is_thinking(conversation c);
+
+    /**
+     * Adds a message to a `conversation`, which the language model will begin replying to.
+     * You can receive the reply one piece at a time by calling `conversation_get_reply_piece(conversation c)` in a loop.
+     *
+     * @param c The `conversation` object to add the message to
+     * @param message The user message to add to the conversation - the language model will reply to this
+     *
+     * @attribute class conversation
+     * @attribute method add_message
+     * @attribute self c
+     */
+    void conversation_add_message(conversation c, const string& message);
+
+    /**
+     * Returns a single piece of a reply (generally one word at a time) from the `conversation`.
+     * You can use a loop while checking `conversation_is_replying` to retrieve the reply as it generates.
+     *
+     * @param c The `conversation` object to receive the reply from
+     *
+     * @returns Returns a small piece of the reply (generally 1 word or less)
+     *
+     * @attribute class conversation
+     * @attribute method get_reply_piece
+     * @attribute self c
+     */
+    string conversation_get_reply_piece(conversation c);
+
+    /**
+     * Frees the resources associated with the `conversation` object.
+     *
+     * @param c The `conversation` object whose resources should be released.
+     *
+     * @attribute class conversation
+     * @attribute destructor true
+     * @attribute self c
+     * @attribute method free
+     */
+    void free_conversation(conversation c);
+
+    /**
+     * Releases all of the `conversation` objects which have been loaded.
+     *
+     * @attribute static conversations
+     * @attribute method free_all
+     */
+    void free_all_conversations();
+
     /**
      * Use this option to choose which language model to use, and initialize its default settings
      *
diff --git a/coresdk/src/test/test_genai.cpp b/coresdk/src/test/test_genai.cpp
index 84aba027..669bea23 100644
--- a/coresdk/src/test/test_genai.cpp
+++ b/coresdk/src/test/test_genai.cpp
@@ -7,6 +7,7 @@
 #include "genai.h"
 #include "terminal.h"
+#include "basics.h"
 #include "utils.h"
 #include
 #include
 
@@ -16,15 +17,57 @@
 using namespace splashkit_lib;
 
 void run_genai_test()
 {
-    write("User\n> ");
-    string prompt = read_line();
+    const string THINKING_STYLE = "\033[37;3m";
+    const string RESET_STYLE = "\033[0m";
 
-    write("LLM\n> (generating...)");
-    string response = generate_reply(QWEN3_0_6B_INSTRUCT, prompt);
-    write_line("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\033[K" + response);
+    conversation conv = create_conversation(QWEN3_1_7B_THINKING);
 
-    delay(300);
+    while(true)
+    {
+        write("\n> ");
+        string prompt = read_line();
 
-    write_line("-- Press enter to end --");
-    read_line();
+        // See if the user wants to exit
+        string exit = trim(generate_reply(QWEN3_1_7B_INSTRUCT, "User A: "+prompt+"\nDoes user A want to end the conversation? 
Answer with one word, either CONTINUE or END:")); + + write_line("["+exit+"]"); + + if (exit == "END") + break; + + // otherwise continue the conversation + conversation_add_message(conv, prompt); + + bool thinking = false; + string last_piece = "\n"; + while(conversation_is_replying(conv)) + { + if (conversation_is_thinking(conv) != thinking) + { + thinking = conversation_is_thinking(conv); + + if (thinking) + write(THINKING_STYLE); + else + write(RESET_STYLE); + } + + string piece = conversation_get_reply_piece(conv); + + // avoid double newlines - ideally this will be filtered on SplashKit's side instead + if (piece == "\n" && last_piece == "\n") + continue; + + if (piece == "\n\n") + piece = "\n"; + + write(piece); + last_piece = piece; + } + + if (last_piece != "\n") + write("\n"); + } + + free_conversation(conv); }
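For reference, here is a minimal usage sketch of the API added in this series. It is not taken from the patches themselves; it assumes the `language_model` enum values, the `max_tokens`/`seed` fields of `language_model_options`, and the conversation functions exactly as they appear in the diffs above.

    #include "genai.h"
    #include "terminal.h"

    using namespace splashkit_lib;

    int main()
    {
        // One-shot generation with explicit options.
        // The fixed seed added in PATCH 18 makes repeated runs reproducible.
        language_model_options options = option_language_model(GEMMA3_1B_INSTRUCT);
        options.max_tokens = 64;
        options.seed = 1234;
        write_line(generate_reply("Name three fruits.", options));

        // Streamed conversation (PATCH 19): pull the reply one piece at a time.
        conversation conv = create_conversation(QWEN3_0_6B_INSTRUCT);
        conversation_add_message(conv, "Hello! What can you do?");

        while (conversation_is_replying(conv))
        {
            // Check the buffered token before consuming it, so "thinking"
            // tokens can be skipped and only the visible reply is printed.
            bool thinking = conversation_is_thinking(conv);
            string piece = conversation_get_reply_piece(conv);
            if (!thinking)
                write(piece);
        }
        write_line("");

        free_conversation(conv);
        return 0;
    }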