5 changes: 4 additions & 1 deletion configs/llm_config.py
@@ -12,7 +12,10 @@ class BlueberryConfig:

# GQA parameters
n_kv_heads: int = 4

# DeepDeltaRes parameters
beta_dim: int = 128
k_eps: float = 1e-6
sigmoid_scale: float = 4.0
# Data params
# ⚠️ WARNING: For simplicity, I recommend not changing max_seq_len
# If you change max_seq_len, you MUST re-run data preparation!
30 changes: 30 additions & 0 deletions models/deepdelta.py
@@ -0,0 +1,30 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class DeepDeltaRes(nn.Module):
    """Delta-rule residual: nudges the residual stream x toward a learned scalar
    target along the (normalised) direction of the sub-layer output k_in."""

    def __init__(
        self,
        d_model: int,
        beta_dim: int,
        k_eps: float = 1e-6,
        sigmoid_scale: float = 4.0,
    ):
        super().__init__()
        self.k_eps = k_eps
        self.sigmoid_scale = sigmoid_scale

        # Two-layer gate producing the per-token rate beta.
        self.beta_in = nn.Linear(d_model, beta_dim, bias=False)
        self.beta_out = nn.Linear(beta_dim, 1, bias=True)
        # Scalar target value predicted from the residual stream.
        self.v_proj = nn.Linear(d_model, 1, bias=True)

    def forward(self, x: torch.Tensor, k_in: torch.Tensor, context: torch.Tensor):
        # Normalise the key along the last (feature) dimension.
        k = F.normalize(k_in, p=1, dim=-1, eps=self.k_eps)
        # Per-token rate beta in (0, 2), computed from the context input.
        beta_logits = self.beta_out(torch.tanh(self.beta_in(context)))
        beta = 2.0 * torch.sigmoid(beta_logits)
        # Inner product of x with k, and the learned target in (0, sigmoid_scale).
        proj = torch.sum(k * x, dim=-1, keepdim=True)
        v = torch.sigmoid(self.v_proj(x)) * self.sigmoid_scale
        # Delta-rule residual update: move x toward the target along k.
        delta = beta * (v - proj)
        update = delta * k
        return x + update
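The module implements a delta-rule-style residual: the stream is updated as x ← x + β · (v − k·x) · k, where k is the normalised sub-layer output, β ∈ (0, 2) is a learned per-token rate, and v is a learned scalar target. A minimal usage sketch follows; the dimensions are illustrative rather than the repo's actual config values, and the import path assumes the repository root is on sys.path.

import torch
import torch.nn as nn
from models.deepdelta import DeepDeltaRes  # assumed import path (repo root on sys.path)

d_model, beta_dim = 384, 128               # illustrative sizes, not the real config values
ddr = DeepDeltaRes(d_model, beta_dim)

x = torch.randn(2, 16, d_model)            # residual stream: (batch, seq, d_model)
sub_out = torch.randn(2, 16, d_model)      # stand-in for an attention/FFN output (key direction)
context = nn.RMSNorm(d_model)(x)           # normed input, as passed in models/layers.py

y = ddr(x, sub_out, context)               # x + beta * (v - k·x) * k
assert y.shape == x.shape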
18 changes: 13 additions & 5 deletions models/layers.py
@@ -3,7 +3,7 @@
import torch.nn.functional as F
from torchtune.modules import RotaryPositionalEmbeddings
from .components import SquaredReLUFeedForward

from .deepdelta import DeepDeltaRes

class Rotary(nn.Module):
def __init__(self, dim: int, max_seq_len: int):
@@ -114,13 +114,19 @@ def __init__(
n_heads: int,
d_ff: int,
max_seq_len: int,
# deep delta res
beta_dim: int,
k_eps: float = 1e-6,
sigmoid_scale: float = 4.0,
dropout: float = 0.1,
n_kv_heads: int | None = None,
):
super().__init__()

self.attention = MultiHeadAttention(d_model, n_heads, max_seq_len, dropout, n_kv_heads)
self.feed_forward = SquaredReLUFeedForward(d_model, d_ff, dropout)
self.ddr1 = DeepDeltaRes(d_model, beta_dim, k_eps, sigmoid_scale)
self.ddr2 = DeepDeltaRes(d_model, beta_dim, k_eps, sigmoid_scale)

# Normalization layers
self.norm1 = nn.RMSNorm(d_model)
@@ -129,10 +135,12 @@ def __init__(

def forward(self, x):
# Self-attention
attn_out = self.attention(self.norm1(x))
x = x + self.dropout(attn_out)
norm = self.norm1(x)
attn_out = self.attention(norm)
x = self.dropout(self.ddr1(x, attn_out, norm))

# Feed-forward
ff_out = self.feed_forward(self.norm2(x))
x = x + self.dropout(ff_out)
norm = self.norm2(x)
ff_out = self.feed_forward(norm)
x = self.dropout(self.ddr2(x, ff_out, norm))
return x
3 changes: 3 additions & 0 deletions models/llm.py
@@ -25,6 +25,9 @@ def __init__(self, config: BlueberryConfig):
config.n_heads,
config.d_ff,
config.max_seq_len,
config.beta_dim,
config.k_eps,
config.sigmoid_scale,
config.dropout,
n_kv_heads=config.n_kv_heads,
)