From 37c3ad1ae21025a444f077adc6fd83a183191b51 Mon Sep 17 00:00:00 2001
From: Uche Ogbuji
Date: Tue, 2 Sep 2025 12:04:32 -0600
Subject: [PATCH 1/4] Claude coding base enhancements for prompt caching and 4-bit KV quantization

---
 demo/mlx_enhanced_features.py | 87 +++++++++++++++++++++++++++++++++++
 pylib/schema_helper.py        | 24 +++++++---
 pyproject.toml                |  4 +-
 3 files changed, 106 insertions(+), 9 deletions(-)
 create mode 100644 demo/mlx_enhanced_features.py

diff --git a/demo/mlx_enhanced_features.py b/demo/mlx_enhanced_features.py
new file mode 100644
index 0000000..a6952e8
--- /dev/null
+++ b/demo/mlx_enhanced_features.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+'''
+Demo of MLX 0.29.0 and MLX-LM 0.27.0 enhanced features:
+- Prompt caching for faster repeated queries
+- 4-bit KV cache quantization for memory efficiency
+'''
+
+import sys
+sys.path.insert(0, '..')
+
+from pylib.schema_helper import Model
+
+def demo_enhanced_features():
+    print("MLX Enhanced Features Demo")
+    print("=" * 50)
+
+    # Initialize model with prompt cache support
+    model = Model()
+
+    # Load model with max_kv_size for prompt caching
+    print("\n1. Loading model with prompt cache support...")
+    model.load(
+        "mlx-community/Qwen2.5-1.5B-Instruct-4bit",
+        max_kv_size=4096 # Enable prompt cache with 4K tokens
+    )
+    print(" ✓ Model loaded with prompt caching enabled")
+
+    # Example schema for structured output
+    schema = {
+        "type": "object",
+        "properties": {
+            "name": {"type": "string"},
+            "age": {"type": "integer"},
+            "skills": {
+                "type": "array",
+                "items": {"type": "string"}
+            }
+        },
+        "required": ["name", "age", "skills"]
+    }
+
+    # Test 1: Basic generation with prompt caching
+    print("\n2. Testing prompt caching...")
+    messages = [
+        {"role": "user", "content": "Generate a profile for a software developer"}
+    ]
+
+    print(" First generation (cold cache):")
+    result = model.completion(
+        messages=messages,
+        schema=schema,
+        cache_prompt=True, # Enable prompt caching
+        max_tokens=100
+    )
+    for chunk in result:
+        print(f" {chunk}", end="")
+    print()
+
+    # Test 2: Generation with 4-bit KV quantization
+    print("\n3. Testing 4-bit KV cache quantization...")
+    messages2 = [
+        {"role": "user", "content": "Generate a profile for a data scientist"}
+    ]
+
+    result = model.completion(
+        messages=messages2,
+        schema=schema,
+        kv_bits=4, # Enable 4-bit quantization
+        kv_group_size=64,
+        quantized_kv_start=100, # Start quantizing after 100 tokens
+        max_tokens=100
+    )
+
+    print(" Generation with 4-bit KV cache:")
+    for chunk in result:
+        print(f" {chunk}", end="")
+    print()
+
+    print("\n" + "=" * 50)
+    print("Enhanced features successfully demonstrated!")
+    print("\nKey improvements with MLX 0.29.0 & MLX-LM 0.27.0:")
+    print("• Prompt caching reduces latency for repeated queries")
+    print("• 4-bit KV quantization reduces memory by ~50%")
+    print("• Quantized attention maintains quality while saving memory")
+
+if __name__ == "__main__":
+    demo_enhanced_features()
\ No newline at end of file
diff --git a/pylib/schema_helper.py b/pylib/schema_helper.py
index d84d77c..3137ed1 100644
--- a/pylib/schema_helper.py
+++ b/pylib/schema_helper.py
@@ -12,8 +12,9 @@ from typing import Iterable
 
 import mlx.core as mx
 
-# from mlx_lm.models.cache import KVCache, _BaseCache
-# from mlx_lm.models.cache import make_prompt_cache
+from mlx_lm.models.cache import KVCache, _BaseCache
+from mlx_lm.models.cache import make_prompt_cache
+from mlx_lm.models.base import QuantizedKVCache
 from mlx_lm.generate import load, stream_generate # , GenerationResponse
 
 from toolio.vendor.llm_structured_output import JsonSchemaAcceptorDriver
@@ -27,11 +28,11 @@
     'sampler': None,
     'logits_processors': None,
     'max_kv_size': None,
-    # 'prompt_cache': None,
+    'prompt_cache': None,
     'prefill_step_size': 512,
-    'kv_bits': None,
+    'kv_bits': None, # Set to 4 for mxfp4 quantization, 8 for int8
     'kv_group_size': 64,
-    'quantized_kv_start': 0,
+    'quantized_kv_start': 0, # Start quantizing after N tokens
     'prompt_progress_callback': None
 }
 
@@ -80,9 +81,9 @@ def __init__(self):
         self.eos_id = None
         self.json_schema_acceptor_driver_factory = None
         # Note: If for example the user loads a cache from a file, and we support prompt caching that way, they should not have to re-specify init params such as max_kv_size
-        # self._prompt_cache = make_prompt_cache(self.model, max_kv_size)
+        self._prompt_cache = None
 
-    def load(self, model_path: str):
+    def load(self, model_path: str, max_kv_size: int | None = None):
         '''
         Load locally or download from Huggingface hub.
         '''
@@ -94,6 +95,11 @@ def load(self, model_path: str):
                 self.vocabulary, self.eos_id
             )
         )
+        # Initialize prompt cache if requested
+        if max_kv_size:
+            self._prompt_cache = make_prompt_cache(self.model, max_kv_size)
+        else:
+            self._prompt_cache = None
 
     def completion(
             self,
@@ -125,6 +131,10 @@ def completion(
         prompt_tokens = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True)
         self._prompt_length = len(prompt_tokens) # Store prompt length
 
+        # Use prompt cache if available and requested
+        if cache_prompt and self._prompt_cache:
+            kwargs['prompt_cache'] = self._prompt_cache
+
         logits_generator = stream_generate(self.model, self.tokenizer, prompt_tokens, **kwargs)
 
         self._step_count = 0
diff --git a/pyproject.toml b/pyproject.toml
index e3d344e..a6f37c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,8 +24,8 @@ classifiers = [
 dependencies = [
     # MLX libraries are only installed on Apple Silicon Macs running macOS 13.5+ with Python 3.8+, as required
     # For more on environment markers see: https://hatch.pypa.io/dev/config/dependency/#environment-markers
-    "mlx>=0.23.1; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version >= '3.8'",
-    "mlx_lm>=0.21.4; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version >= '3.8'",
+    "mlx>=0.29.0; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version >= '3.8'",
+    "mlx_lm>=0.27.0; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version >= '3.8'",
     # Rather than a marker such as platform_release >= '22.6.0' (corresponding to macOS 13.5 or later) we'll make this a runtime check
     # This is the former logic, but in some scenarios, e.g. running within Docker, we can get platform_release markers such as Linux kernel version "6.10.14-linuxkit", which are not valid version strings according to Python's packaging.version

From 6fe088db2f99be304ec0d2c0d7c5d4ee8ee21bfe Mon Sep 17 00:00:00 2001
From: Uche Ogbuji
Date: Fri, 7 Nov 2025 07:56:40 -0700
Subject: [PATCH 2/4] Dev setup in package

---
 pyproject.toml | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a6f37c5..507696c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,18 +40,28 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
+# Extra requirements for the built-in slate of tools
 tools = [
-    # Extra requirements for the built-in slate of tools
     "google-re2"
 ]
 
+# Extra requirements for project contributors & other developers
 dev = [
-    # Extra requirements for project contributors & other developers
     "build",
+    "twine",
+    "hatch",
+    "pipdeptree",
+    "ruff",
+    "pytest",
+    "pytest-mock",
     "pytest-asyncio",
+    "pytest-cov",
+    "respx",
+    "pgvector",
+    "asyncpg",
     "pytest-httpserver",
-    # "pytest-mock"
 ]
+
 [project.urls]
 Documentation = "https://OoriData.github.io/Toolio/"
 Issues = "https://github.com/OoriData/Toolio/issues"

From e3791768daa872eed920ef83917129d6dc659ec0 Mon Sep 17 00:00:00 2001
From: Uche Ogbuji
Date: Mon, 8 Dec 2025 08:00:44 -0700
Subject: [PATCH 3/4] Sundry tweaks

---
 README.md                     |  2 +-
 demo/mlx_enhanced_features.py | 18 +++++++++---------
 pylib/schema_helper.py        |  3 +--
 pyproject.toml                |  2 --
 4 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 77ff0f3..17550f8 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ In either case you get better results if you've trained or fine-tuned the model
-  <td>Toolio is primarily developed by the crew at Oori Data. We offer data pipelines and software engineering services around AI/LLM applications.</td>
+  <td>Toolio is primarily developed by the crew at Oori Data. We offer LLMOps, data pipelines and software engineering services around AI/LLM applications.</td>
 We'd love your help, though! [Click to learn how to make contributions to the project](https://github.com/OoriData/Toolio/wiki/Notes-for-contributors).
diff --git a/demo/mlx_enhanced_features.py b/demo/mlx_enhanced_features.py
index a6952e8..00ca99d 100644
--- a/demo/mlx_enhanced_features.py
+++ b/demo/mlx_enhanced_features.py
@@ -13,10 +13,10 @@
 def demo_enhanced_features():
     print("MLX Enhanced Features Demo")
     print("=" * 50)
-    
+
     # Initialize model with prompt cache support
     model = Model()
-    
+
     # Load model with max_kv_size for prompt caching
     print("\n1. Loading model with prompt cache support...")
     model.load(
@@ -24,7 +24,7 @@ def demo_enhanced_features():
         max_kv_size=4096 # Enable prompt cache with 4K tokens
     )
     print(" ✓ Model loaded with prompt caching enabled")
-    
+
     # Example schema for structured output
     schema = {
         "type": "object",
@@ -38,13 +38,13 @@ def demo_enhanced_features():
         },
         "required": ["name", "age", "skills"]
     }
-    
+
     # Test 1: Basic generation with prompt caching
     print("\n2. Testing prompt caching...")
     messages = [
         {"role": "user", "content": "Generate a profile for a software developer"}
     ]
-    
+
     print(" First generation (cold cache):")
     result = model.completion(
         messages=messages,
@@ -55,13 +55,13 @@ def demo_enhanced_features():
     for chunk in result:
         print(f" {chunk}", end="")
     print()
-    
+
     # Test 2: Generation with 4-bit KV quantization
     print("\n3. Testing 4-bit KV cache quantization...")
     messages2 = [
         {"role": "user", "content": "Generate a profile for a data scientist"}
     ]
-    
+
     result = model.completion(
         messages=messages2,
         schema=schema,
@@ -70,12 +70,12 @@ def demo_enhanced_features():
         quantized_kv_start=100, # Start quantizing after 100 tokens
         max_tokens=100
     )
-    
+
     print(" Generation with 4-bit KV cache:")
     for chunk in result:
         print(f" {chunk}", end="")
     print()
-    
+
     print("\n" + "=" * 50)
     print("Enhanced features successfully demonstrated!")
     print("\nKey improvements with MLX 0.29.0 & MLX-LM 0.27.0:")
diff --git a/pylib/schema_helper.py b/pylib/schema_helper.py
index 3137ed1..7e7f8ff 100644
--- a/pylib/schema_helper.py
+++ b/pylib/schema_helper.py
@@ -14,7 +14,6 @@ import mlx.core as mx
 
 from mlx_lm.models.cache import KVCache, _BaseCache
 from mlx_lm.models.cache import make_prompt_cache
-from mlx_lm.models.base import QuantizedKVCache
 from mlx_lm.generate import load, stream_generate # , GenerationResponse
 
 from toolio.vendor.llm_structured_output import JsonSchemaAcceptorDriver
@@ -134,7 +133,7 @@ def completion(
         # Use prompt cache if available and requested
         if cache_prompt and self._prompt_cache:
             kwargs['prompt_cache'] = self._prompt_cache
-        
+
         logits_generator = stream_generate(self.model, self.tokenizer, prompt_tokens, **kwargs)
 
         self._step_count = 0
diff --git a/pyproject.toml b/pyproject.toml
index 507696c..cb1ebf9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,8 +56,6 @@ dev = [
     "pytest-asyncio",
     "pytest-cov",
     "respx",
-    "pgvector",
-    "asyncpg",
     "pytest-httpserver",
 ]
 

From b6a0669ec5827a87190a5895709eef45cdf8d488 Mon Sep 17 00:00:00 2001
From: Uche Ogbuji
Date: Mon, 8 Dec 2025 08:22:42 -0700
Subject: [PATCH 4/4] Sundry tweaks

---
 pylib/common.py        | 4 ++--
 pylib/schema_helper.py | 2 --
 pyproject.toml         | 3 ++-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/pylib/common.py b/pylib/common.py
index 545ef6c..32219b5 100644
--- a/pylib/common.py
+++ b/pylib/common.py
@@ -20,7 +20,7 @@
 
 # from mlx_lm.models import olmo # Will say:: To run olmo install ai2-olmo: pip install ai2-olmo
 
-from ogbujipt import word_loom
+import wordloom
 
 
 class model_flag(int, Flag):
@@ -89,7 +89,7 @@ def obj_file_path_parent(obj):
 HERE = obj_file_path_parent(lambda: 0)
 
 with open(HERE / Path('resource/language.toml'), mode='rb') as fp:
-    LANG = word_loom.load(fp)
+    LANG = wordloom.load(fp)
 
 
 class model_runner_base:
diff --git a/pylib/schema_helper.py b/pylib/schema_helper.py
index 7e7f8ff..54ccf2c 100644
--- a/pylib/schema_helper.py
+++ b/pylib/schema_helper.py
@@ -134,8 +134,6 @@ def completion(
         if cache_prompt and self._prompt_cache:
             kwargs['prompt_cache'] = self._prompt_cache
 
-        logits_generator = stream_generate(self.model, self.tokenizer, prompt_tokens, **kwargs)
-
         self._step_count = 0
         for generation_resp in stream_generate(self.model, self.tokenizer, prompt_tokens, **kwargs):
             yield generation_resp
diff --git a/pyproject.toml b/pyproject.toml
index cb1ebf9..dc7a628 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,8 @@ dependencies = [
     # This is the former logic, but in some scenarios, e.g. running within Docker, we can get platform_release markers such as Linux kernel version "6.10.14-linuxkit", which are not valid version strings according to Python's packaging.version
     # "mlx>=0.23.1; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version >= '3.8' and platform_release >= '22.6.0'",
     # "mlx_lm>=0.21.4; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version >= '3.8' and platform_release >= '22.6.0'",
-    "ogbujipt>=0.9.3",
+    "wordloom",
+    # "ogbujipt>=0.10.0",
     "fastapi>=0.115.3",
     "click",
     "httpx>=0.27.2",