From a104cff339a6f9ee9603250912122c7352b9ce7e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 2 Feb 2026 08:29:43 +0200
Subject: [PATCH 1/2] spec : fix the check-rate logic of ngram-simple

---
 common/ngram-map.cpp | 12 +++++-------
 common/ngram-map.h   |  2 +-
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp
index cab231bad70..b22e35b9960 100644
--- a/common/ngram-map.cpp
+++ b/common/ngram-map.cpp
@@ -53,11 +53,12 @@ llama_tokens common_ngram_simple_draft(
     // Simple implementation of self-speculative decoding without a draft model.
     //
     const size_t cur_len = tokens.size();
+
     // Only check every check_rate tokens to save compute
-    // i.e., perform check if (cur_len - idx_last_check) >= check_rate
-    if (state.idx_last_check + state.config.check_rate > cur_len) {
-        llama_tokens draft_tokens;
-        return draft_tokens;
+    if (state.check_id++ >= state.config.check_rate) {
+        state.check_id = 0;
+
+        return {};
     }
 
     size_t n_draft_min = state.config.size_ngram; // size of n-gram to lookup in token history
@@ -80,9 +81,6 @@ llama_tokens common_ngram_simple_draft(
     }
     pattern.push_back(sampled); // add the last token to the pattern
 
-    // We do a search in the token history.
-    state.idx_last_check = cur_len;
-
     size_t match_pos = 0; // we ignore position 0, position 0 == no match
                           // search backwards, but skip the current match (we are currently there)
     for (size_t j = cur_len - n_draft_min - 1; j > 0; --j) {
diff --git a/common/ngram-map.h b/common/ngram-map.h
index c094d513d5d..1d87d4b5419 100644
--- a/common/ngram-map.h
+++ b/common/ngram-map.h
@@ -31,7 +31,7 @@ struct common_ngram_simple_config {
 struct common_ngram_simple_state {
     common_ngram_simple_config config;
 
-    size_t idx_last_check = 0; // index of last check in context history (mutable)
+    size_t check_id = 0; // used to control the frequency of generating drafts
 
     common_ngram_simple_state(const common_ngram_simple_config & config)
         : config(config) {}

From b3fa165a64213a69f77cb2e599039368630b42f6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 2 Feb 2026 09:03:49 +0200
Subject: [PATCH 2/2] cont : refactor + fix checks

---
 common/ngram-map.cpp   | 13 +++----------
 common/ngram-map.h     | 16 +---------------
 common/speculative.cpp | 20 ++++++++++++++------
 3 files changed, 18 insertions(+), 31 deletions(-)

diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp
index b22e35b9960..c5b8fc75ed8 100644
--- a/common/ngram-map.cpp
+++ b/common/ngram-map.cpp
@@ -47,22 +47,15 @@ static std::string common_tokens_to_str(const llama_tokens & inp, size_t start,
  * @return Vector of draft tokens, empty if no matching pattern is found
  */
 llama_tokens common_ngram_simple_draft(
-        common_ngram_simple_state & state,
+        const common_ngram_simple_config & config,
         const llama_tokens & tokens, llama_token sampled) {
 
     // Simple implementation of self-speculative decoding without a draft model.
     //
     const size_t cur_len = tokens.size();
 
-    // Only check every check_rate tokens to save compute
-    if (state.check_id++ >= state.config.check_rate) {
-        state.check_id = 0;
-
-        return {};
-    }
-
-    size_t n_draft_min = state.config.size_ngram; // size of n-gram to lookup in token history
-    size_t n_draft_max = state.config.size_mgram; // the m-gram following the found n-gram is used for draft
+    const size_t n_draft_min = config.size_ngram; // size of n-gram to lookup in token history
+    const size_t n_draft_max = config.size_mgram; // the m-gram following the found n-gram is used for draft
 
     // vector for tokens we want to verify.
     // return empty vector if there is no match.
diff --git a/common/ngram-map.h b/common/ngram-map.h
index 1d87d4b5419..9668bd5a7c5 100644
--- a/common/ngram-map.h
+++ b/common/ngram-map.h
@@ -27,23 +27,9 @@ struct common_ngram_simple_config {
     uint16_t   check_rate;      // check for speculative decoding without draft model for each check_rate token
 };
 
-// current state (and config) of n-gram simple.
-struct common_ngram_simple_state {
-    common_ngram_simple_config config;
-
-    size_t check_id = 0; // used to control the frequency of generating drafts
-
-    common_ngram_simple_state(const common_ngram_simple_config & config)
-        : config(config) {}
-};
-
 // Searches for a n-gram in the history and checks whether a draft sequence should be generated.
-// state:              the ngram simple state to search in.
-// inp:                the tokens generated so far.
-// sampled:            the token that was just sampled.
-// draft:              vector to store the draft tokens, initially empty.
 llama_tokens common_ngram_simple_draft(
-        common_ngram_simple_state & state,
+        const common_ngram_simple_config & config,
         const llama_tokens & tokens, llama_token sampled);
 
 
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 152aaa48d44..127c969d47f 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -463,12 +463,14 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
 
 // state of self-speculation (simple implementation, not ngram-map)
 struct common_speculative_state_ngram_simple : public common_speculative_state {
-    common_ngram_simple_state state;
+    common_ngram_simple_config config;
+
+    uint16_t check_id = 0; // draft() calls since the last n-gram lookup; a lookup runs (and this resets) once it reaches config.check_rate
 
     common_speculative_state_ngram_simple(
             enum common_speculative_type type,
-            common_ngram_simple_state state)
-        : common_speculative_state(type), state(state) {}
+            common_ngram_simple_config config)
+        : common_speculative_state(type), config(config) {}
 
     void begin(const llama_tokens & prompt) override {
         GGML_UNUSED(prompt);
@@ -479,7 +481,13 @@ struct common_speculative_state_ngram_simple : public common_speculative_state {
             const llama_tokens & prompt_tgt,
             llama_token id_last,
             llama_tokens & result) override {
-        result = common_ngram_simple_draft(state, prompt_tgt, id_last);
+        ++check_id;
+        if (check_id < config.check_rate) {
+            return;
+        }
+        check_id = 0;
+
+        result = common_ngram_simple_draft(config, prompt_tgt, id_last);
         GGML_UNUSED(params);
     }
 
@@ -889,14 +897,14 @@ common_speculative * common_speculative_init(
                 uint16_t mgram_size_value = ngram_map.size_value;
                 uint16_t check_rate       = ngram_map.check_rate;
 
-                auto config_simple = common_ngram_simple_config{
+                auto config_simple = common_ngram_simple_config {
                     /* .size_ngram      = */ ngram_size_key,
                     /* .size_mgram      = */ mgram_size_value,
                     /* .check_rate      = */ check_rate
                 };
                 auto state = std::make_unique<common_speculative_state_ngram_simple>(
                     /* .type            = */ config.type,
-                    /* .state           = */ common_ngram_simple_state(config_simple)
+                    /* .config          = */ config_simple
                 );
                 impls.push_back(std::move(state));
                 break;