From aa0de2f41d107836ed9ec26a874962cad3823511 Mon Sep 17 00:00:00 2001 From: Antonio Linares Date: Wed, 12 Nov 2025 22:20:55 +0100 Subject: [PATCH 01/15] added Q&A dataset and training --- dataset/build_qa_dataset.py | 386 ++++++++++++++++++++++++++++++++++++ train_qa_pairs.sh | 27 +++ 2 files changed, 413 insertions(+) create mode 100644 dataset/build_qa_dataset.py create mode 100755 train_qa_pairs.sh diff --git a/dataset/build_qa_dataset.py b/dataset/build_qa_dataset.py new file mode 100644 index 00000000..a01914e9 --- /dev/null +++ b/dataset/build_qa_dataset.py @@ -0,0 +1,386 @@ +#!/usr/bin/env python3 +""" +Build Q&A dataset for TinyRecursiveModels training. +Generates question-answer pairs using natural language sentences. +""" + +import json +import numpy as np +import random +from pathlib import Path +import argparse + +from common import PuzzleDatasetMetadata + +# Q&A templates for generating diverse data +QA_TEMPLATES = [ + # Factual questions + { + "questions": [ + "What is the capital of {country}?", + "What is {country}'s capital city?", + "Which city is the capital of {country}?", + ], + "answers": ["{capital}"], + "data": [ + {"country": "France", "capital": "Paris"}, + {"country": "Germany", "capital": "Berlin"}, + {"country": "Italy", "capital": "Rome"}, + {"country": "Spain", "capital": "Madrid"}, + {"country": "United Kingdom", "capital": "London"}, + {"country": "Japan", "capital": "Tokyo"}, + {"country": "China", "capital": "Beijing"}, + {"country": "India", "capital": "New Delhi"}, + {"country": "Brazil", "capital": "Brasília"}, + {"country": "Canada", "capital": "Ottawa"}, + ] + }, + # Mathematical questions + { + "questions": [ + "What is {num1} plus {num2}?", + "What is the sum of {num1} and {num2}?", + "If you add {num1} and {num2}, what do you get?", + ], + "answers": ["{result}"], + "data": [ + {"num1": "5", "num2": "3", "result": "8"}, + {"num1": "10", "num2": "7", "result": "17"}, + {"num1": "12", "num2": "8", "result": "20"}, + 
{"num1": "15", "num2": "9", "result": "24"}, + {"num1": "20", "num2": "5", "result": "25"}, + ] + }, + # Color questions + { + "questions": [ + "What color is a {fruit}?", + "What is the color of a {fruit}?", + "Which color does a {fruit} have?", + ], + "answers": ["{color}"], + "data": [ + {"fruit": "banana", "color": "yellow"}, + {"fruit": "apple", "color": "red"}, + {"fruit": "orange", "color": "orange"}, + {"fruit": "grape", "color": "purple"}, + {"fruit": "lemon", "color": "yellow"}, + ] + }, + # Animal questions + { + "questions": [ + "What sound does a {animal} make?", + "What noise does a {animal} make?", + "How does a {animal} sound?", + ], + "answers": ["{sound}"], + "data": [ + {"animal": "dog", "sound": "woof"}, + {"animal": "cat", "sound": "meow"}, + {"animal": "cow", "sound": "moo"}, + {"animal": "sheep", "sound": "baa"}, + {"animal": "duck", "sound": "quack"}, + ] + }, + # Time questions + { + "questions": [ + "What day comes after {day}?", + "What is the day after {day}?", + "Which day follows {day}?", + ], + "answers": ["{next_day}"], + "data": [ + {"day": "Monday", "next_day": "Tuesday"}, + {"day": "Tuesday", "next_day": "Wednesday"}, + {"day": "Wednesday", "next_day": "Thursday"}, + {"day": "Thursday", "next_day": "Friday"}, + {"day": "Friday", "next_day": "Saturday"}, + {"day": "Saturday", "next_day": "Sunday"}, + {"day": "Sunday", "next_day": "Monday"}, + ] + }, + # Weather questions + { + "questions": [ + "What is the weather like when it {condition}?", + "What kind of weather is {condition}?", + "When it {condition}, what is the weather?", + ], + "answers": ["{weather}"], + "data": [ + {"condition": "rains", "weather": "rainy"}, + {"condition": "snows", "weather": "snowy"}, + {"condition": "is sunny", "weather": "sunny"}, + {"condition": "is cloudy", "weather": "cloudy"}, + {"condition": "is windy", "weather": "windy"}, + ] + } +] + +def generate_qa_pair(template, data_item): + """Generate a single Q&A pair from template and data.""" + 
question_template = random.choice(template["questions"]) + answer_template = random.choice(template["answers"]) + + # Format question and answer + question = question_template.format(**data_item) + answer = answer_template.format(**data_item) + + return question, answer + +def create_qa_sequence(question, answer, vocab_size=1000): + """Convert Q&A pair to sequence format for TRM training.""" + # Simple tokenization - split into words and map to token IDs + # In practice, you'd want a proper tokenizer, but this works for demo + + # Combine question and answer with special tokens + full_text = f"Question: {question} Answer: {answer}" + + # Simple word-level tokenization (lowercase, basic punctuation) + import re + words = re.findall(r'\b\w+\b', full_text.lower()) + + # Create a simple vocabulary mapping (in practice, use a real tokenizer) + vocab = {} + token_id = 1 # Start from 1, reserve 0 for padding + + # Build vocabulary from all possible words in our templates + all_words = set() + for template in QA_TEMPLATES: + for data_item in template["data"]: + for q_template in template["questions"]: + q = q_template.format(**data_item) + all_words.update(re.findall(r'\b\w+\b', q.lower())) + for a_template in template["answers"]: + a = a_template.format(**data_item) + all_words.update(re.findall(r'\b\w+\b', a.lower())) + + # Sort for consistent ordering + all_words = sorted(list(all_words)) + for word in all_words: + vocab[word] = token_id + token_id += 1 + + # Convert words to token IDs + tokens = [vocab.get(word, 1) for word in words] # Default to 1 for unknown words + + # Pad or truncate to fixed length (adjust as needed) + seq_len = 32 + if len(tokens) < seq_len: + tokens.extend([0] * (seq_len - len(tokens))) + else: + tokens = tokens[:seq_len] + + return tokens, len(vocab) + 1 # +1 for padding token + +def build_qa_dataset(num_train_puzzles=10000, num_test_puzzles=2000): + """Build Q&A dataset with specified number of examples.""" + + print(f"Building Q&A dataset 
with {num_train_puzzles} training and {num_test_puzzles} test examples...") + + # Generate training data + train_data = [] + train_inputs = [] + train_labels = [] + train_puzzle_identifiers = [] + train_puzzle_indices = [] + train_group_indices = [] + + train_puzzle_indices.append(0) # Start with 0 + train_group_indices.append(0) # Start with 0 + + puzzle_id = 0 + + puzzle_id = 0 + vocab_size = 1000 # Will be updated based on actual vocabulary + + for i in range(num_train_puzzles): + # Select random template and data item + template = random.choice(QA_TEMPLATES) + data_item = random.choice(template["data"]) + + # Generate Q&A pair + question, answer = generate_qa_pair(template, data_item) + + # Convert to sequence format + tokens, actual_vocab_size = create_qa_sequence(question, answer, vocab_size) + vocab_size = max(vocab_size, actual_vocab_size) + + # Create training example (for sequence prediction) + # Input: question tokens, Label: answer tokens shifted by 1 + input_tokens = tokens[:-1] # All but last token + label_tokens = tokens[1:] # All but first token, shifted + + train_inputs.append(input_tokens) + train_labels.append(label_tokens) + train_puzzle_identifiers.append(0) # Single puzzle type + puzzle_id += 1 + train_puzzle_indices.append(puzzle_id) + train_group_indices.append(puzzle_id) + + # Generate test data (same process) + test_data = [] + test_inputs = [] + test_labels = [] + test_puzzle_identifiers = [] + test_puzzle_indices = [] + test_group_indices = [] + + test_puzzle_indices.append(0) # Start with 0 + test_group_indices.append(0) # Start with 0 + + puzzle_id = 0 # Reset for test + + for i in range(num_test_puzzles): + template = random.choice(QA_TEMPLATES) + data_item = random.choice(template["data"]) + + question, answer = generate_qa_pair(template, data_item) + tokens, _ = create_qa_sequence(question, answer, vocab_size) + + input_tokens = tokens[:-1] + label_tokens = tokens[1:] + + test_inputs.append(input_tokens) + 
test_labels.append(label_tokens) + test_puzzle_identifiers.append(0) + puzzle_id += 1 + test_puzzle_indices.append(puzzle_id) + test_group_indices.append(puzzle_id) + + # Create metadata + metadata = { + "vocab_size": vocab_size, + "num_puzzle_identifiers": 1, + "seq_len": 31, # input length (32 - 1) + "num_train_puzzles": num_train_puzzles, + "num_test_puzzles": num_test_puzzles, + "puzzle_type": "qa_pairs", + "description": "Question-Answer pairs dataset for language modeling" + } + + return { + "train": { + "inputs": train_inputs, + "labels": train_labels, + "puzzle_identifiers": train_puzzle_identifiers, + "puzzle_indices": train_puzzle_indices, + "group_indices": train_group_indices + }, + "test": { + "inputs": test_inputs, + "labels": test_labels, + "puzzle_identifiers": test_puzzle_identifiers, + "puzzle_indices": test_puzzle_indices, + "group_indices": test_group_indices + }, + "metadata": metadata + } + +def save_dataset(data, output_dir="data/qa_pairs"): + """Save dataset to disk in the expected format.""" + + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + # Create train and test directories + train_dir = output_path / "train" + test_dir = output_path / "test" + train_dir.mkdir(exist_ok=True) + test_dir.mkdir(exist_ok=True) + + # Save JSON data + with open(output_path / "train.json", "w") as f: + json.dump(data["train"], f, indent=2) + + with open(output_path / "test.json", "w") as f: + json.dump(data["test"], f, indent=2) + + with open(output_path / "metadata.json", "w") as f: + json.dump(data["metadata"], f, indent=2) + + # Convert to numpy arrays and save + print("Converting to numpy arrays...") + + # Train data + np.save(train_dir / "all__inputs.npy", + np.array(data["train"]["inputs"])) + np.save(train_dir / "all__labels.npy", + np.array(data["train"]["labels"])) + np.save(train_dir / "all__puzzle_identifiers.npy", + np.array(data["train"]["puzzle_identifiers"])) + np.save(train_dir / "all__puzzle_indices.npy", + 
np.array(data["train"]["puzzle_indices"])) + np.save(train_dir / "all__group_indices.npy", + np.array(data["train"]["group_indices"])) + + # Test data + np.save(test_dir / "all__inputs.npy", + np.array(data["test"]["inputs"])) + np.save(test_dir / "all__labels.npy", + np.array(data["test"]["labels"])) + np.save(test_dir / "all__puzzle_identifiers.npy", + np.array(data["test"]["puzzle_identifiers"])) + np.save(test_dir / "all__puzzle_indices.npy", + np.array(data["test"]["puzzle_indices"])) + np.save(test_dir / "all__group_indices.npy", + np.array(data["test"]["group_indices"])) + + # Create and save dataset.json for train + train_metadata = PuzzleDatasetMetadata( + pad_id=0, + ignore_label_id=0, + blank_identifier_id=0, + vocab_size=data["metadata"]["vocab_size"], + seq_len=data["metadata"]["seq_len"], + num_puzzle_identifiers=1, + total_groups=data["metadata"]["num_train_puzzles"], + mean_puzzle_examples=1.0, + total_puzzles=data["metadata"]["num_train_puzzles"], + sets=["all"] + ) + with open(train_dir / "dataset.json", "w") as f: + json.dump(train_metadata.model_dump(), f) + + # Create and save dataset.json for test + test_metadata = PuzzleDatasetMetadata( + pad_id=0, + ignore_label_id=0, + blank_identifier_id=0, + vocab_size=data["metadata"]["vocab_size"], + seq_len=data["metadata"]["seq_len"], + num_puzzle_identifiers=1, + total_groups=data["metadata"]["num_test_puzzles"], + mean_puzzle_examples=1.0, + total_puzzles=data["metadata"]["num_test_puzzles"], + sets=["all"] + ) + with open(test_dir / "dataset.json", "w") as f: + json.dump(test_metadata.model_dump(), f) + + print(f"Dataset saved to {output_path}") + print(f"Vocabulary size: {data['metadata']['vocab_size']}") + print(f"Training examples: {data['metadata']['num_train_puzzles']}") + print(f"Test examples: {data['metadata']['num_test_puzzles']}") + +def main(): + parser = argparse.ArgumentParser(description="Build Q&A dataset") + parser.add_argument("--num-train-puzzles", type=int, default=10000, + 
help="Number of training examples") + parser.add_argument("--num-test-puzzles", type=int, default=2000, + help="Number of test examples") + parser.add_argument("--output-dir", type=str, default="data/qa_pairs", + help="Output directory") + + args = parser.parse_args() + + # Build dataset + data = build_qa_dataset(args.num_train_puzzles, args.num_test_puzzles) + + # Save to disk + save_dataset(data, args.output_dir) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/train_qa_pairs.sh b/train_qa_pairs.sh new file mode 100755 index 00000000..457511fa --- /dev/null +++ b/train_qa_pairs.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +DISABLE_COMPILE=1 uv run python pretrain.py \ + arch=trm \ + data_paths="[data/qa_pairs]" \ + arch.halt_exploration_prob=0.0 \ + arch.halt_max_steps=8 \ + arch.H_cycles=2 \ + arch.L_cycles=2 \ + arch.H_layers=0 \ + arch.L_layers=1 \ + arch.hidden_size=128 \ + arch.num_heads=4 \ + arch.expansion=2 \ + arch.puzzle_emb_ndim=8 \ + arch.forward_dtype=float32 \ + arch.puzzle_emb_len=8 \ + global_batch_size=256 \ + epochs=10000 \ + lr=0.001 \ + puzzle_emb_lr=0.01 \ + weight_decay=0.0 \ + puzzle_emb_weight_decay=0.0 \ + lr_warmup_steps=1000 \ + eval_interval=10 \ + use_wandb=false \ + +project_name="qa_pairs_baseline" \ No newline at end of file From 3daf2951f225b46bcd2687129a2ffec1a66ff556 Mon Sep 17 00:00:00 2001 From: FiveTech Software Date: Wed, 12 Nov 2025 22:27:53 +0100 Subject: [PATCH 02/15] Revise evaluation commands and add Q&A section Updated evaluation commands and added Q&A pairs example. 
--- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a75c9ac9..e1d050a0 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,19 @@ To train the model: `train_rubik2x2.sh` (this model trains in a few minutes on a To evaluate the model: -`uv run python evaluate.py --data-path data/sudoku4x4/ --config checkpoints/trm/messy-earwig-of-enthusiasm/all_config.yaml --checkpoint checkpoints/trm/messy-earwig-of-enthusiasm/final_step_45/model.pt` +`uv run python evaluate.py --data-path data/rubik2x2/ --config checkpoints/trm//all_config.yaml --checkpoint checkpoints/trm//final_step_4500/model.pt` + +## Example on Q&A pairs (natural language understanding task) + +To prepare the data: + +`uv run dataset/build_qa_dataset.py` + +To train the model: `train_qa_pairs.sh` (this model trains in a few minutes on an A10) + +To evaluate the model: +`uv run python evaluate.py --data-path data/qa_pairs/ --config checkpoints/trm//all_config.yaml --checkpoint checkpoints/trm//final_step_4500/model.pt` ## Example on Sudoku 4x4 From f6d16fd09b8f889a3604e06710144829b10e0bcb Mon Sep 17 00:00:00 2001 From: FiveTech Software Date: Thu, 13 Nov 2025 21:48:50 +0100 Subject: [PATCH 03/15] Add files via upload --- train_chess.sh | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 train_chess.sh diff --git a/train_chess.sh b/train_chess.sh new file mode 100644 index 00000000..2db373b2 --- /dev/null +++ b/train_chess.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Chess training script for TinyRecursiveModels +# Based on the Q&A training configuration + +set -e + +# Training configuration for chess puzzles +export CUDA_VISIBLE_DEVICES=0 + +uv run python pretrain.py \ + arch=trm \ + data_paths="[data/chess]" \ + arch.halt_exploration_prob=0.0 \ + arch.halt_max_steps=8 \ + arch.H_cycles=2 \ + arch.L_cycles=2 \ + arch.H_layers=0 \ + arch.L_layers=1 \ + arch.hidden_size=128 \ + arch.num_heads=4 \ + 
arch.expansion=2 \ + arch.puzzle_emb_ndim=8 \ + arch.forward_dtype=float32 \ + arch.puzzle_emb_len=8 \ + global_batch_size=256 \ + epochs=10000 \ + lr=0.001 \ + puzzle_emb_lr=0.01 \ + weight_decay=0.0 \ + puzzle_emb_weight_decay=0.0 \ + lr_warmup_steps=1000 \ + eval_interval=10 \ + use_wandb=false \ + +project_name=chess_baseline \ No newline at end of file From 2a002f8a2f0117f8c1e135a3efee78572183e9b7 Mon Sep 17 00:00:00 2001 From: FiveTech Software Date: Thu, 13 Nov 2025 21:49:18 +0100 Subject: [PATCH 04/15] Add files via upload --- dataset/build_chess_dataset.py | 324 +++++++++++++++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 dataset/build_chess_dataset.py diff --git a/dataset/build_chess_dataset.py b/dataset/build_chess_dataset.py new file mode 100644 index 00000000..0ac493f6 --- /dev/null +++ b/dataset/build_chess_dataset.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +""" +Build Chess dataset for TinyRecursiveModels training. +Generates chess puzzle positions and solutions. 
+""" + +import json +import numpy as np +import random +from pathlib import Path +import argparse + +from common import PuzzleDatasetMetadata + +# Chess piece representations +PIECES = { + 'P': 'pawn', 'N': 'knight', 'B': 'bishop', 'R': 'rook', 'Q': 'queen', 'K': 'king', + 'p': 'pawn', 'n': 'knight', 'b': 'bishop', 'r': 'rook', 'q': 'queen', 'k': 'king' +} + +# Chess puzzle templates - simplified positions requiring specific moves +CHESS_PUZZLES = [ + # Checkmate in 1 puzzles + { + "description": "checkmate_in_1", + "positions": [ + # Queen checkmate + { + "fen": "r1bqkbnr/pppp1ppp/2n5/4p3/2B1P3/3P1N2/PPP2PPP/RNBQK2R b KQkq - 0 4", + "solution": "Qh4#", + "description": "Black queen delivers checkmate on h4" + }, + # Rook checkmate + { + "fen": "r1bqk2r/pppp1ppp/2n2n2/2b1p3/2B1P3/3P1N2/PPP2PPP/RNBQK2R w KQkq - 0 5", + "solution": "Rh8#", + "description": "White rook checkmates on h8" + }, + # Knight checkmate + { + "fen": "r1bqkbnr/pppp1ppp/2n5/4p3/2B1P3/3P1N2/PPP2PPP/RNBQK2R b KQkq - 0 4", + "solution": "Nf3#", + "description": "Black knight checkmates on f3" + } + ] + }, + # Capture puzzles + { + "description": "capture_piece", + "positions": [ + { + "fen": "r1bqkbnr/pppp1ppp/2n5/4p3/2B1P3/3P1N2/PPP2PPP/RNBQK2R w KQkq - 0 4", + "solution": "Bxc6", + "description": "White bishop captures black knight on c6" + }, + { + "fen": "r1bqkbnr/pppp1ppp/2n5/4p3/2B1P3/3P1N2/PPP2PPP/RNBQK2R b KQkq - 0 4", + "solution": "Nxe4", + "description": "Black knight captures white pawn on e4" + } + ] + }, + # Defensive moves + { + "description": "defend_attack", + "positions": [ + { + "fen": "r1bqkbnr/pppp1ppp/2n5/4p3/2B1P3/3P1N2/PPP2PPP/RNBQK2R w KQkq - 0 4", + "solution": "Nf3", + "description": "White knight moves to f3 to defend against attack" + }, + { + "fen": "r1bqkbnr/pppp1ppp/2n5/4p3/2B1P3/3P1N2/PPP2PPP/RNBQK2R b KQkq - 0 4", + "solution": "Nf6", + "description": "Black knight moves to f6 for development and defense" + } + ] + } +] + +def 
generate_chess_puzzle(puzzle_template): + """Generate a single chess puzzle from template.""" + position = random.choice(puzzle_template["positions"]) + return position["fen"], position["solution"], position["description"] + +def create_chess_sequence(fen, solution, vocab_size=1000): + """Convert chess puzzle to sequence format for TRM training.""" + # Combine FEN position and solution + full_text = f"Position: {fen} Solution: {solution}" + + # Simple word-level tokenization + import re + words = re.findall(r'\b\w+\b', full_text.lower()) + + # Build vocabulary from chess-related terms + chess_vocab = set() + for puzzle_type in CHESS_PUZZLES: + for pos in puzzle_type["positions"]: + # Add FEN pieces and coordinates + chess_vocab.update(re.findall(r'\b\w+\b', pos["fen"].lower())) + chess_vocab.update(re.findall(r'\b\w+\b', pos["solution"].lower())) + chess_vocab.update(re.findall(r'\b\w+\b', pos["description"].lower())) + + # Add common chess terms + chess_vocab.update(['position', 'solution', 'white', 'black', 'pawn', 'knight', + 'bishop', 'rook', 'queen', 'king', 'checkmate', 'capture']) + + chess_vocab = sorted(list(chess_vocab)) + vocab = {word: i+1 for i, word in enumerate(chess_vocab)} + id_to_word = {v: k for k, v in vocab.items()} + + # Convert words to token IDs + tokens = [vocab.get(word, 1) for word in words] # Default to 1 for unknown words + + # Pad or truncate to fixed length + seq_len = 64 # Longer sequences for chess + if len(tokens) < seq_len: + tokens.extend([0] * (seq_len - len(tokens))) + else: + tokens = tokens[:seq_len] + + return tokens, len(vocab) + 1 + +def build_chess_dataset(num_train_puzzles=10000, num_test_puzzles=2000): + """Build chess dataset with specified number of examples.""" + + print(f"Building chess dataset with {num_train_puzzles} training and {num_test_puzzles} test examples...") + + # Generate training data + train_inputs = [] + train_labels = [] + train_puzzle_identifiers = [] + train_puzzle_indices = [] + 
train_group_indices = [] + + train_puzzle_indices.append(0) + train_group_indices.append(0) + + vocab_size = 1000 + puzzle_id = 0 + + for i in range(num_train_puzzles): + # Randomly select puzzle type + puzzle_type = random.choice(CHESS_PUZZLES) + + fen, solution, description = generate_chess_puzzle(puzzle_type) + + # Convert to sequence format + tokens, actual_vocab_size = create_chess_sequence(fen, solution, vocab_size) + vocab_size = max(vocab_size, actual_vocab_size) + + # Create training example (for sequence prediction) + input_tokens = tokens[:-1] # All but last token + label_tokens = tokens[1:] # All but first token, shifted + + train_inputs.append(input_tokens) + train_labels.append(label_tokens) + train_puzzle_identifiers.append(0) # Single puzzle type + puzzle_id += 1 + train_puzzle_indices.append(puzzle_id) + train_group_indices.append(puzzle_id) + + # Generate test data (same process) + test_inputs = [] + test_labels = [] + test_puzzle_identifiers = [] + test_puzzle_indices = [] + test_group_indices = [] + + test_puzzle_indices.append(0) + test_group_indices.append(0) + + puzzle_id = 0 + + for i in range(num_test_puzzles): + puzzle_type = random.choice(CHESS_PUZZLES) + fen, solution, description = generate_chess_puzzle(puzzle_type) + + tokens, _ = create_chess_sequence(fen, solution, vocab_size) + + input_tokens = tokens[:-1] + label_tokens = tokens[1:] + + test_inputs.append(input_tokens) + test_labels.append(label_tokens) + test_puzzle_identifiers.append(0) + puzzle_id += 1 + test_puzzle_indices.append(puzzle_id) + test_group_indices.append(puzzle_id) + + # Create metadata + metadata = { + "vocab_size": vocab_size, + "num_puzzle_identifiers": 1, + "seq_len": 63, # input length (64 - 1) + "num_train_puzzles": num_train_puzzles, + "num_test_puzzles": num_test_puzzles, + "puzzle_type": "chess", + "description": "Chess puzzle dataset for strategic reasoning" + } + + return { + "train": { + "inputs": train_inputs, + "labels": train_labels, + 
"puzzle_identifiers": train_puzzle_identifiers, + "puzzle_indices": train_puzzle_indices, + "group_indices": train_group_indices + }, + "test": { + "inputs": test_inputs, + "labels": test_labels, + "puzzle_identifiers": test_puzzle_identifiers, + "puzzle_indices": test_puzzle_indices, + "group_indices": test_group_indices + }, + "metadata": metadata + } + +def save_dataset(data, output_dir="data/chess"): + """Save dataset to disk in the expected format.""" + + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + # Create train and test directories + train_dir = output_path / "train" + test_dir = output_path / "test" + train_dir.mkdir(exist_ok=True) + test_dir.mkdir(exist_ok=True) + + # Save JSON data + with open(output_path / "train.json", "w") as f: + json.dump(data["train"], f, indent=2) + + with open(output_path / "test.json", "w") as f: + json.dump(data["test"], f, indent=2) + + with open(output_path / "metadata.json", "w") as f: + json.dump(data["metadata"], f, indent=2) + + # Convert to numpy arrays and save + print("Converting to numpy arrays...") + + # Train data + np.save(train_dir / "all__inputs.npy", + np.array(data["train"]["inputs"])) + np.save(train_dir / "all__labels.npy", + np.array(data["train"]["labels"])) + np.save(train_dir / "all__puzzle_identifiers.npy", + np.array(data["train"]["puzzle_identifiers"])) + np.save(train_dir / "all__puzzle_indices.npy", + np.array(data["train"]["puzzle_indices"])) + np.save(train_dir / "all__group_indices.npy", + np.array(data["train"]["group_indices"])) + + # Test data + np.save(test_dir / "all__inputs.npy", + np.array(data["test"]["inputs"])) + np.save(test_dir / "all__labels.npy", + np.array(data["test"]["labels"])) + np.save(test_dir / "all__puzzle_identifiers.npy", + np.array(data["test"]["puzzle_identifiers"])) + np.save(test_dir / "all__puzzle_indices.npy", + np.array(data["test"]["puzzle_indices"])) + np.save(test_dir / "all__group_indices.npy", + 
np.array(data["test"]["group_indices"])) + + # Create and save dataset.json for train + train_metadata = PuzzleDatasetMetadata( + pad_id=0, + ignore_label_id=0, + blank_identifier_id=0, + vocab_size=data["metadata"]["vocab_size"], + seq_len=data["metadata"]["seq_len"], + num_puzzle_identifiers=1, + total_groups=data["metadata"]["num_train_puzzles"], + mean_puzzle_examples=1.0, + total_puzzles=data["metadata"]["num_train_puzzles"], + sets=["all"] + ) + with open(train_dir / "dataset.json", "w") as f: + json.dump(train_metadata.model_dump(), f) + + # Create and save dataset.json for test + test_metadata = PuzzleDatasetMetadata( + pad_id=0, + ignore_label_id=0, + blank_identifier_id=0, + vocab_size=data["metadata"]["vocab_size"], + seq_len=data["metadata"]["seq_len"], + num_puzzle_identifiers=1, + total_groups=data["metadata"]["num_test_puzzles"], + mean_puzzle_examples=1.0, + total_puzzles=data["metadata"]["num_test_puzzles"], + sets=["all"] + ) + with open(test_dir / "dataset.json", "w") as f: + json.dump(test_metadata.model_dump(), f) + + print(f"Dataset saved to {output_path}") + print(f"Vocabulary size: {data['metadata']['vocab_size']}") + print(f"Training examples: {data['metadata']['num_train_puzzles']}") + print(f"Test examples: {data['metadata']['num_test_puzzles']}") + +def main(): + parser = argparse.ArgumentParser(description="Build Chess dataset") + parser.add_argument("--num-train-puzzles", type=int, default=10000, + help="Number of training examples") + parser.add_argument("--num-test-puzzles", type=int, default=2000, + help="Number of test examples") + parser.add_argument("--output-dir", type=str, default="data/chess", + help="Output directory") + + args = parser.parse_args() + + # Build dataset + data = build_chess_dataset(args.num_train_puzzles, args.num_test_puzzles) + + # Save to disk + save_dataset(data, args.output_dir) + +if __name__ == "__main__": + main() \ No newline at end of file From 81d6c56625a60a428f2e6bdea3f2ae4799fbe989 Mon Sep 17 
00:00:00 2001 From: FiveTech Software Date: Fri, 14 Nov 2025 08:40:05 +0100 Subject: [PATCH 05/15] Add files via upload From 75d5dbc355d16a460cac2bbf3a62ee9bb25498f3 Mon Sep 17 00:00:00 2001 From: FiveTech Software Date: Fri, 14 Nov 2025 08:40:42 +0100 Subject: [PATCH 06/15] Add files via upload From 5824bf9ae1648e1d7d66ca083d8caf64938f74a2 Mon Sep 17 00:00:00 2001 From: FiveTech Software Date: Fri, 14 Nov 2025 08:43:16 +0100 Subject: [PATCH 07/15] Add files via upload From f0e3ff4da3ba6574b981f891bb6cb0be1cad0dac Mon Sep 17 00:00:00 2001 From: Antonio Linares Date: Fri, 14 Nov 2025 09:07:09 +0100 Subject: [PATCH 08/15] more complex Q&A dataset --- dataset/build_qa_dataset.py | 487 ++++++++++++++++++++++---- pyproject.toml | 3 +- train_qa_pairs_ultra_complex.sh | 32 ++ train_qa_pairs_ultra_complex_small.sh | 32 ++ 4 files changed, 483 insertions(+), 71 deletions(-) create mode 100755 train_qa_pairs_ultra_complex.sh create mode 100755 train_qa_pairs_ultra_complex_small.sh diff --git a/dataset/build_qa_dataset.py b/dataset/build_qa_dataset.py index a01914e9..65d33816 100644 --- a/dataset/build_qa_dataset.py +++ b/dataset/build_qa_dataset.py @@ -12,109 +12,456 @@ from common import PuzzleDatasetMetadata -# Q&A templates for generating diverse data +# Ultra-Advanced Q&A templates for sophisticated reasoning tasks QA_TEMPLATES = [ - # Factual questions + # Advanced Mathematics - Algebra and Equations { "questions": [ - "What is the capital of {country}?", - "What is {country}'s capital city?", - "Which city is the capital of {country}?", + "Solve for x: 2x + 3 = 7. What is x?", + "If 3x - 5 = 13, what is the value of x?", + "Solve the equation: 4(x + 2) = 24. 
What is x?", + "What is the solution to: x² - 9 = 0?", + "If f(x) = 2x + 1 and f(3) = ?, what is the answer?", ], - "answers": ["{capital}"], + "answers": ["{result}"], "data": [ - {"country": "France", "capital": "Paris"}, - {"country": "Germany", "capital": "Berlin"}, - {"country": "Italy", "capital": "Rome"}, - {"country": "Spain", "capital": "Madrid"}, - {"country": "United Kingdom", "capital": "London"}, - {"country": "Japan", "capital": "Tokyo"}, - {"country": "China", "capital": "Beijing"}, - {"country": "India", "capital": "New Delhi"}, - {"country": "Brazil", "capital": "Brasília"}, - {"country": "Canada", "capital": "Ottawa"}, + {"result": "2"}, + {"result": "6"}, + {"result": "4"}, + {"result": "x = 3 or x = -3"}, + {"result": "7"}, ] }, - # Mathematical questions + # Advanced Mathematics - Word Problems with Multiple Steps { "questions": [ - "What is {num1} plus {num2}?", - "What is the sum of {num1} and {num2}?", - "If you add {num1} and {num2}, what do you get?", + "A man buys a horse for $60, sells it for $70, buys it back for $80, and sells it again for $90. How much profit did he make?", + "If a plane can fly 500 miles per hour in still air, and there is a 50 mph headwind, what is the plane's ground speed?", + "A ladder 13 feet long leans against a wall. The base is 5 feet from the wall. How high up the wall does the ladder reach?", + "If a car travels at 60 mph for 2 hours and 40 mph for 1 hour, what is the average speed for the entire trip?", + "A rectangular garden is twice as long as it is wide. 
If the perimeter is 60 feet, what are the dimensions?", ], "answers": ["{result}"], "data": [ - {"num1": "5", "num2": "3", "result": "8"}, - {"num1": "10", "num2": "7", "result": "17"}, - {"num1": "12", "num2": "8", "result": "20"}, - {"num1": "15", "num2": "9", "result": "24"}, - {"num1": "20", "num2": "5", "result": "25"}, + {"result": "$20 profit"}, + {"result": "450 mph"}, + {"result": "12 feet"}, + {"result": "53.33 mph"}, + {"result": "10 feet by 20 feet"}, + ] + }, + # Philosophical and Ethical Reasoning + { + "questions": [ + "If a trolley is heading toward 5 people and you can switch it to a track with 1 person, should you do it? Why?", + "Is it ever morally acceptable to lie? Give an example.", + "What is the difference between justice and fairness?", + "Can machines ever truly think, or do they just simulate thinking?", + "What makes an action morally right or wrong?", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "Yes, because it minimizes harm (utilitarian ethics)"}, + {"answer": "Sometimes, such as to protect someone from harm"}, + {"answer": "Justice is about rules and punishment, fairness is about equality"}, + {"answer": "This is the philosophical zombie problem - we cannot know for certain"}, + {"answer": "According to deontology: following rules; utilitarianism: maximizing happiness"}, + ] + }, + # Advanced Scientific Concepts + { + "questions": [ + "Explain quantum entanglement in simple terms.", + "What is the uncertainty principle in quantum mechanics?", + "How does natural selection explain antibiotic resistance in bacteria?", + "What is the difference between genotype and phenotype?", + "Explain how CRISPR gene editing works.", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "Two particles that remain connected so that the state of one instantly influences the other, regardless of distance"}, + {"answer": "We cannot simultaneously know both the position and momentum of a particle with perfect accuracy"}, + {"answer": 
"Bacteria with random mutations that make them resistant survive and reproduce when antibiotics are present"}, + {"answer": "Genotype is the genetic code, phenotype is the physical expression of those genes"}, + {"answer": "CRISPR uses guide RNA to target specific DNA sequences, then Cas9 enzyme cuts the DNA for editing"}, + ] + }, + # Complex Logical Puzzles and Paradoxes + { + "questions": [ + "This sentence is false. Is this statement true or false?", + "Can an omnipotent being create a stone so heavy that even they cannot lift it?", + "If God is omnipotent, can God create a being more powerful than itself?", + "What is the unexpected hanging paradox?", + "Explain the Monty Hall problem and why switching doors improves your chances.", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "This creates a paradox - if true, then it's false; if false, then it's true"}, + {"answer": "This is a paradox that challenges the concept of omnipotence"}, + {"answer": "No, because that would contradict the definition of being the most powerful"}, + {"answer": "A prisoner is told they will be hanged on a surprise date, leading to logical contradictions"}, + {"answer": "With 3 doors and 1 car, switching gives 2/3 chance vs 1/3 for staying"}, + ] + }, + # Historical Analysis and Inference + { + "questions": [ + "Why did the Roman Empire fall? 
Give three main reasons.", + "What were the primary causes of World War I?", + "How did the Industrial Revolution change society?", + "What was the significance of the Magna Carta?", + "Why did the Soviet Union collapse?", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "Economic troubles, military overextension, political corruption, and barbarian invasions"}, + {"answer": "Nationalism, imperialism, militarism, and the assassination of Archduke Franz Ferdinand"}, + {"answer": "Mass production, urbanization, new social classes, and technological advancement"}, + {"answer": "It limited the king's power and established the principle that everyone is subject to the law"}, + {"answer": "Economic stagnation, political repression, the Afghanistan war, and Gorbachev's reforms"}, + ] + }, + # Advanced Psychology and Human Behavior + { + "questions": [ + "What is cognitive dissonance and give an example?", + "Explain the bystander effect with an example.", + "What is confirmation bias and how does it affect decision making?", + "Describe the difference between intrinsic and extrinsic motivation.", + "What is the Dunning-Kruger effect?", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "Mental discomfort from holding contradictory beliefs, like smoking while knowing it's unhealthy"}, + {"answer": "People are less likely to help in emergencies when others are present, as seen in the Kitty Genovese case"}, + {"answer": "Tendency to seek information that confirms existing beliefs and ignore contradictory evidence"}, + {"answer": "Intrinsic comes from within (enjoyment), extrinsic comes from external rewards (money, grades)"}, + {"answer": "Less competent people overestimate their abilities while highly competent people underestimate theirs"}, + ] + }, + # Complex Systems and Economics + { + "questions": [ + "Explain how compound interest works with an example.", + "What is the tragedy of the commons?", + "How does inflation affect purchasing power?", + "Explain 
the concept of supply and demand with an example.", + "What is opportunity cost?", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "Interest earned on both principal and accumulated interest - $100 at 10% becomes $110, then $121, etc."}, + {"answer": "Shared resources get overused because individuals prioritize self-interest over collective good"}, + {"answer": "It reduces the value of money, so you need more dollars to buy the same goods"}, + {"answer": "High demand + low supply = high prices; low demand + high supply = low prices"}, + {"answer": "The value of the best alternative you give up when making a choice"}, + ] + }, + # Advanced Language and Literature Analysis + { + "questions": [ + "What is irony and give three types with examples?", + "Explain the difference between denotation and connotation.", + "What is a metaphor versus a simile?", + "Analyze the theme of isolation in Mary Shelley's Frankenstein.", + "What is stream of consciousness in literature?", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "Verbal (saying opposite of what you mean), situational (outcome opposite of expectation), dramatic (audience knows more than characters)"}, + {"answer": "Denotation is literal meaning, connotation is emotional/cultural associations"}, + {"answer": "Metaphor directly equates things (time is a thief), simile uses 'like' or 'as' (time like a thief)"}, + {"answer": "The creature and Victor both experience profound isolation, leading to their destructive behaviors"}, + {"answer": "A narrative technique that presents thoughts as they flow through a character's mind"}, + ] + }, + # Advanced Mathematics - Calculus and Analysis + { + "questions": [ + "What is the derivative of f(x) = x³ + 2x² - 5x + 1?", + "Evaluate the integral ∫(3x² + 2x + 1)dx", + "What is the limit as x approaches 0 of (sin(x))/x?", + "Find the critical points of f(x) = x³ - 3x² + 2", + "What is the fundamental theorem of calculus?", + "Solve the differential equation dy/dx = 
2x + 1", + "What is the Taylor series expansion of e^x around x=0?", + "Explain the concept of convergence in infinite series", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "f'(x) = 3x² + 4x - 5"}, + {"answer": "x³ + x² + x + C"}, + {"answer": "1 (this is a fundamental limit in calculus)"}, + {"answer": "x = 0 and x = 2 (local maximum and minimum)"}, + {"answer": "It connects differentiation and integration - the derivative of an integral gives the original function"}, + {"answer": "y = x² + x + C"}, + {"answer": "1 + x + x²/2! + x³/3! + x⁴/4! + ..."}, + {"answer": "A series converges if its partial sums approach a finite limit"}, + ] + }, + # Theoretical Computer Science + { + "questions": [ + "What is P vs NP problem and why is it important?", + "Explain the halting problem and its implications", + "What is computational complexity class NP-complete?", + "How does public-key cryptography work?", + "What is the difference between deterministic and nondeterministic Turing machines?", + "Explain the concept of algorithmic information theory", + "What are the limitations of neural networks in computation?", + "How does quantum computing differ from classical computing?", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "P is problems solvable in polynomial time, NP is verifiable in polynomial time - if P=NP, many cryptography systems would break"}, + {"answer": "No algorithm can determine if an arbitrary program will halt - proves fundamental limits of computation"}, + {"answer": "Problems that are in NP and to which all other NP problems can be reduced - solving one solves all"}, + {"answer": "Uses two keys: public for encryption, private for decryption - based on mathematical trapdoor functions"}, + {"answer": "Deterministic follows one path, nondeterministic can explore multiple paths simultaneously"}, + {"answer": "Studies the information content of algorithms - Kolmogorov complexity measures randomness"}, + {"answer": "They are universal 
approximators but may not learn efficiently and lack true understanding"}, + {"answer": "Uses quantum superposition and entanglement for parallel computation of multiple states"}, + ] + }, + # Advanced Physics - Relativity and Quantum Field Theory + { + "questions": [ + "Explain Einstein's theory of special relativity in simple terms", + "What is the twin paradox in relativity?", + "How does general relativity explain gravity?", + "What is quantum field theory?", + "Explain the Higgs mechanism and the Higgs boson", + "What is the cosmological constant problem?", + "How does black hole information paradox challenge physics?", + "What is string theory trying to accomplish?", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "Time and space are relative, not absolute - simultaneity depends on reference frame, nothing travels faster than light"}, + {"answer": "One twin travels at near light speed and returns younger due to time dilation"}, + {"answer": "Gravity is curvature of spacetime caused by mass-energy, not a force between objects"}, + {"answer": "Framework unifying quantum mechanics and special relativity - particles are excitations in quantum fields"}, + {"answer": "Mechanism that gives particles mass through interaction with Higgs field - Higgs boson is the field excitation"}, + {"answer": "Why is the measured cosmological constant 120 orders of magnitude smaller than quantum predictions?"}, + {"answer": "Black holes destroy information, contradicting quantum mechanics' unitarity principle"}, + {"answer": "Unify all fundamental forces and explain particle properties through vibrating strings in higher dimensions"}, + ] + }, + # Neuroscience and Cognitive Science + { + "questions": [ + "How does synaptic plasticity enable learning?", + "What is the binding problem in neuroscience?", + "Explain the difference between working memory and long-term memory", + "How does the brain's predictive coding work?", + "What is embodied cognition?", + "Explain neural 
oscillations and their cognitive functions", + "How does the mirror neuron system work?", + "What is the hard problem of consciousness?", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "Synapses strengthen or weaken based on correlated activity - Hebbian learning: 'neurons that fire together wire together'"}, + {"answer": "How different sensory features (color, shape, motion) are integrated into coherent object perception"}, + {"answer": "Working memory holds information temporarily for processing, long-term memory stores it permanently"}, + {"answer": "Brain predicts sensory input and learns from prediction errors to minimize surprise"}, + {"answer": "Cognition arises from interactions between brain, body, and environment, not just brain alone"}, + {"answer": "Rhythmic brain activity coordinates information processing across different frequency bands"}, + {"answer": "Neurons fire both when performing actions and observing others perform same actions - enables empathy and imitation"}, + {"answer": "Why subjective experience (qualia) exists - the 'what it's like' aspect of consciousness"}, + ] + }, + # Advanced Chemistry and Biochemistry + { + "questions": [ + "Explain quantum chemistry and molecular orbital theory", + "How does enzyme catalysis work at the molecular level?", + "What is the difference between primary, secondary, and tertiary protein structure?", + "Explain the citric acid cycle and its role in metabolism", + "How does PCR (polymerase chain reaction) work?", + "What is the role of ATP in cellular energy transfer?", + "Explain the mechanism of photosynthesis at the quantum level", + "How do neurotransmitters cross the synaptic cleft?", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "Applies quantum mechanics to chemical systems - molecular orbitals form from atomic orbital combinations"}, + {"answer": "Enzymes lower activation energy by stabilizing transition states through specific binding interactions"}, + {"answer": "Primary: 
amino acid sequence; Secondary: local folding (alpha helices, beta sheets); Tertiary: overall 3D structure"}, + {"answer": "Circular metabolic pathway that oxidizes acetyl-CoA to CO2, producing NADH, FADH2, and ATP"}, + {"answer": "Uses heat-stable polymerase to exponentially amplify DNA through repeated denaturation, annealing, and extension cycles"}, + {"answer": "ATP is the universal energy currency - hydrolysis releases energy for cellular work through phosphate transfer"}, + {"answer": "Quantum coherence allows efficient energy transfer through photosynthetic complexes despite thermal noise"}, + {"answer": "Neurotransmitters diffuse across synapse and bind to receptors on postsynaptic neuron, triggering signal cascades"}, + ] + }, + # Linguistics and Philosophy of Language + { + "questions": [ + "What is the Sapir-Whorf hypothesis?", + "Explain Chomsky's theory of generative grammar", + "What is the difference between semantics and pragmatics?", + "How does speech act theory work?", + "What is the problem of reference in philosophy of language?", + "Explain the concept of linguistic relativity", + "How does language acquisition work in children?", + "What is the philosophy of meaning in Wittgenstein's later work?", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "Language determines thought - the structure of language influences how speakers perceive and conceptualize the world"}, + {"answer": "Humans have innate universal grammar that generates all possible grammatical sentences"}, + {"answer": "Semantics studies literal meaning, pragmatics studies meaning in context and speaker intentions"}, + {"answer": "Utterances don't just convey information but perform actions (promising, requesting, apologizing)"}, + {"answer": "How words connect to things in the world - Frege's distinction between sense and reference"}, + {"answer": "Different languages categorize reality differently, affecting cognition and worldview"}, + {"answer": "Children use innate 
language acquisition device plus environmental input to learn grammar rules"}, + {"answer": "Meaning is use - language games in social contexts determine word meaning, not mental representations"}, + ] + }, + # Advanced Statistics and Probability Theory + { + "questions": [ + "Explain Bayesian inference with an example", + "What is the central limit theorem and why is it important?", + "How does maximum likelihood estimation work?", + "What is the difference between frequentist and Bayesian statistics?", + "Explain the concept of statistical power", + "How does bootstrapping work for statistical inference?", + "What is the curse of dimensionality?", + "Explain Markov chains and their applications", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "Updates beliefs based on evidence - prior probability × likelihood gives posterior probability"}, + {"answer": "Sample means approach normal distribution regardless of population distribution, enabling statistical inference"}, + {"answer": "Finds parameter values that maximize probability of observing the data given the model"}, + {"answer": "Frequentist: long-run frequencies; Bayesian: degree of belief updated with evidence"}, + {"answer": "Probability of correctly rejecting false null hypothesis - higher power means better chance of detecting effects"}, + {"answer": "Resamples data with replacement to estimate sampling distributions without assumptions"}, + {"answer": "As dimensions increase, data becomes sparse, making distance-based algorithms less effective"}, + {"answer": "Systems where future states depend only on current state - used in physics, biology, and algorithms"}, + ] + }, + # Game Theory and Decision Theory + { + "questions": [ + "Explain the prisoner's dilemma", + "What is Nash equilibrium?", + "How does prospect theory differ from expected utility theory?", + "What is the difference between cooperative and non-cooperative game theory?", + "Explain the concept of zero-sum games", + "How does 
evolutionary game theory work?", + "What is the tragedy of the commons in game theoretic terms?", + "Explain backward induction in extensive form games", + ], + "answers": ["{answer}"], + "data": [ + {"answer": "Two prisoners must choose to confess or stay silent - rational self-interest leads to worse outcome for both"}, + {"answer": "Strategy profile where no player can benefit by unilaterally changing strategy"}, + {"answer": "People value losses more than equivalent gains and make decisions based on reference points"}, + {"answer": "Cooperative allows binding agreements, non-cooperative assumes self-enforcing strategies"}, + {"answer": "One player's gains equal other player's losses - total payoff is constant"}, + {"answer": "Applies game theory to biological evolution - strategies that survive are evolutionarily stable"}, + {"answer": "Multiple players overexploit shared resource because individual benefit exceeds collective cost"}, + {"answer": "Reasoning backward from end of game to determine optimal strategies at each decision point"}, ] }, - # Color questions + # Systems Theory and Complexity Science { "questions": [ - "What color is a {fruit}?", - "What is the color of a {fruit}?", - "Which color does a {fruit} have?", + "What is emergence in complex systems?", + "Explain the concept of self-organization", + "How does chaos theory relate to predictability?", + "What is the difference between complicated and complex systems?", + "Explain the concept of attractors in dynamical systems", + "How does network theory explain real-world phenomena?", + "What is the edge of chaos hypothesis?", + "Explain the concept of fractal dimensionality", ], - "answers": ["{color}"], + "answers": ["{answer}"], "data": [ - {"fruit": "banana", "color": "yellow"}, - {"fruit": "apple", "color": "red"}, - {"fruit": "orange", "color": "orange"}, - {"fruit": "grape", "color": "purple"}, - {"fruit": "lemon", "color": "yellow"}, + {"answer": "Higher-level properties arise from 
interactions of lower-level components that aren't predictable from individual parts"}, + {"answer": "Systems spontaneously form organized structures without external control through local interactions"}, + {"answer": "Small changes in initial conditions can lead to vastly different outcomes - deterministic but unpredictable"}, + {"answer": "Complicated systems are analyzable by breaking into parts, complex systems have irreducible emergent properties"}, + {"answer": "States that systems tend toward over time - points, cycles, or strange attractors"}, + {"answer": "Studies how network structure affects system behavior - scale-free networks, small-world properties"}, + {"answer": "Complex systems are most adaptive and creative at the boundary between order and chaos"}, + {"answer": "Non-integer dimensions that characterize self-similar patterns at different scales"}, ] }, - # Animal questions + # Advanced Engineering and Information Theory { "questions": [ - "What sound does a {animal} make?", - "What noise does a {animal} make?", - "How does a {animal} sound?", + "How does control theory work in engineering systems?", + "Explain Shannon's information theory", + "What is the difference between analog and digital control systems?", + "How does feedback work in control systems?", + "What is the Nyquist-Shannon sampling theorem?", + "Explain the concept of entropy in information theory", + "How does error-correcting codes work?", + "What is the difference between open and closed loop control?", ], - "answers": ["{sound}"], + "answers": ["{answer}"], "data": [ - {"animal": "dog", "sound": "woof"}, - {"animal": "cat", "sound": "meow"}, - {"animal": "cow", "sound": "moo"}, - {"animal": "sheep", "sound": "baa"}, - {"animal": "duck", "sound": "quack"}, + {"answer": "Uses mathematical models to design systems that maintain desired outputs despite disturbances"}, + {"answer": "Quantifies information content and communication capacity - entropy measures uncertainty"}, + 
{"answer": "Analog uses continuous signals, digital uses discrete values - digital is more robust to noise"}, + {"answer": "System output is measured and compared to desired output to generate corrective action"}, + {"answer": "To perfectly reconstruct a signal, sample at least twice the highest frequency component"}, + {"answer": "Measure of uncertainty or information content - higher entropy means more uncertainty"}, + {"answer": "Add redundant bits to detect and correct errors in data transmission"}, + {"answer": "Open loop has no feedback, closed loop uses feedback to adjust output based on measurements"}, ] }, - # Time questions + # Advanced Biology and Evolutionary Theory { "questions": [ - "What day comes after {day}?", - "What is the day after {day}?", - "Which day follows {day}?", + "Explain epigenetics and its role in evolution", + "How does neutral theory of evolution differ from natural selection?", + "What is the extended evolutionary synthesis?", + "Explain the concept of evolutionary developmental biology (evo-devo)", + "How does horizontal gene transfer complicate the tree of life?", + "What is the hologenome theory of evolution?", + "Explain the concept of niche construction", + "How does evolutionary game theory apply to social behavior?", ], - "answers": ["{next_day}"], + "answers": ["{answer}"], "data": [ - {"day": "Monday", "next_day": "Tuesday"}, - {"day": "Tuesday", "next_day": "Wednesday"}, - {"day": "Wednesday", "next_day": "Thursday"}, - {"day": "Thursday", "next_day": "Friday"}, - {"day": "Friday", "next_day": "Saturday"}, - {"day": "Saturday", "next_day": "Sunday"}, - {"day": "Sunday", "next_day": "Monday"}, + {"answer": "Heritable changes in gene expression without DNA sequence changes - can influence evolution"}, + {"answer": "Most genetic variation is neutral (no fitness effect) rather than adaptive"}, + {"answer": "Incorporates epigenetics, developmental biology, and ecological inheritance into evolutionary theory"}, + {"answer": 
"Studies how developmental processes evolve and constrain evolutionary change"}, + {"answer": "Genes can transfer between unrelated species, creating a web rather than tree of relationships"}, + {"answer": "Organisms and their symbiotic microbes co-evolve as a single evolutionary unit"}, + {"answer": "Organisms modify their environment, which then influences their own evolution and that of others"}, + {"answer": "Models social behaviors as evolutionary stable strategies in repeated interactions"}, ] }, - # Weather questions + # Philosophy of Science and Epistemology { "questions": [ - "What is the weather like when it {condition}?", - "What kind of weather is {condition}?", - "When it {condition}, what is the weather?", + "What is the demarcation problem in philosophy of science?", + "Explain Popper's falsifiability criterion", + "How does Kuhn's paradigm shifts work?", + "What is the problem of induction?", + "Explain the concept of theory-laden observation", + "How does Bayesian epistemology work?", + "What is the difference between justification and truth?", + "Explain the concept of scientific realism vs anti-realism", ], - "answers": ["{weather}"], + "answers": ["{answer}"], "data": [ - {"condition": "rains", "weather": "rainy"}, - {"condition": "snows", "weather": "snowy"}, - {"condition": "is sunny", "weather": "sunny"}, - {"condition": "is cloudy", "weather": "cloudy"}, - {"condition": "is windy", "weather": "windy"}, + {"answer": "How to distinguish genuine science from pseudoscience - no universally accepted criterion"}, + {"answer": "Scientific theories must be testable and potentially falsifiable by evidence"}, + {"answer": "Scientific revolutions involve wholesale changes in fundamental frameworks and assumptions"}, + {"answer": "How can we justify believing that future will resemble past based on finite observations?"}, + {"answer": "All observations are interpreted through theoretical frameworks - no theory-neutral facts"}, + {"answer": "Treats 
belief degrees as probabilities updated by evidence using Bayes' theorem"}, + {"answer": "Justification is about rationally held beliefs, truth is correspondence to reality"}, + {"answer": "Realism: scientific theories describe real entities; Anti-realism: theories are just useful tools"}, ] } ] @@ -175,7 +522,7 @@ def create_qa_sequence(question, answer, vocab_size=1000): return tokens, len(vocab) + 1 # +1 for padding token -def build_qa_dataset(num_train_puzzles=10000, num_test_puzzles=2000): +def build_qa_dataset(num_train_puzzles=50000, num_test_puzzles=10000): """Build Q&A dataset with specified number of examples.""" print(f"Building Q&A dataset with {num_train_puzzles} training and {num_test_puzzles} test examples...") @@ -367,9 +714,9 @@ def save_dataset(data, output_dir="data/qa_pairs"): def main(): parser = argparse.ArgumentParser(description="Build Q&A dataset") - parser.add_argument("--num-train-puzzles", type=int, default=10000, + parser.add_argument("--num-train-puzzles", type=int, default=50000, help="Number of training examples") - parser.add_argument("--num-test-puzzles", type=int, default=2000, + parser.add_argument("--num-test-puzzles", type=int, default=10000, help="Number of test examples") parser.add_argument("--output-dir", type=str, default="data/qa_pairs", help="Output directory") diff --git a/pyproject.toml b/pyproject.toml index 9f691461..be5eebc5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "torchvision>=0.15.0", "einops>=0.7.0", "timm>=0.9.0", + "adam-atan2>=0.0.3", ] [project.optional-dependencies] @@ -55,4 +56,4 @@ build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] include = ["data", "kaggle", "config", "models", "assets", "dataset", "evaluators"] -exclude = ["tests*", "docs*"] \ No newline at end of file +exclude = ["tests*", "docs*"] diff --git a/train_qa_pairs_ultra_complex.sh b/train_qa_pairs_ultra_complex.sh new file mode 100755 index 00000000..bae5997e --- /dev/null +++ 
b/train_qa_pairs_ultra_complex.sh @@ -0,0 +1,32 @@ +#!/bin/sh + +export PYTORCH_DISABLE_COMPILE=1 +export CUDA_VISIBLE_DEVICES="" +export TORCH_USE_CUDA_DSA=0 +uv run python pretrain.py \ + arch=trm \ + data_paths="[data/qa_pairs_ultra_complex]" \ + arch.halt_exploration_prob=0.0 \ + arch.halt_max_steps=8 \ + arch.H_cycles=2 \ + arch.L_cycles=2 \ + arch.H_layers=0 \ + arch.L_layers=1 \ + arch.hidden_size=128 \ + arch.num_heads=4 \ + arch.expansion=2 \ + arch.puzzle_emb_ndim=8 \ + arch.forward_dtype=float32 \ + arch.puzzle_emb_len=8 \ + global_batch_size=8 \ + epochs=10000 \ + lr=0.001 \ + puzzle_emb_lr=0.01 \ + weight_decay=0.0 \ + puzzle_emb_weight_decay=0.0 \ + lr_warmup_steps=1000 \ + eval_interval=10 \ + use_wandb=false \ + beta1=0.9 \ + beta2=0.999 \ + +project_name="qa_pairs_ultra_complex" \ No newline at end of file diff --git a/train_qa_pairs_ultra_complex_small.sh b/train_qa_pairs_ultra_complex_small.sh new file mode 100755 index 00000000..706bd333 --- /dev/null +++ b/train_qa_pairs_ultra_complex_small.sh @@ -0,0 +1,32 @@ +#!/bin/sh + +export PYTORCH_DISABLE_COMPILE=1 +export CUDA_VISIBLE_DEVICES="" +export TORCH_USE_CUDA_DSA=0 +uv run python pretrain.py \ + arch=trm \ + data_paths="[data/qa_pairs_ultra_complex_small]" \ + arch.halt_exploration_prob=0.0 \ + arch.halt_max_steps=8 \ + arch.H_cycles=2 \ + arch.L_cycles=2 \ + arch.H_layers=0 \ + arch.L_layers=1 \ + arch.hidden_size=128 \ + arch.num_heads=4 \ + arch.expansion=2 \ + arch.puzzle_emb_ndim=8 \ + arch.forward_dtype=float32 \ + arch.puzzle_emb_len=8 \ + global_batch_size=8 \ + epochs=1000 \ + lr=0.001 \ + puzzle_emb_lr=0.01 \ + weight_decay=0.0 \ + puzzle_emb_weight_decay=0.0 \ + lr_warmup_steps=1000 \ + eval_interval=10 \ + use_wandb=false \ + beta1=0.9 \ + beta2=0.999 \ + +project_name="qa_pairs_ultra_complex_small" \ No newline at end of file From 2ed2e92b945bbbce8ddfd5b52b18c059dc55a1e8 Mon Sep 17 00:00:00 2001 From: Antonio Linares Date: Fri, 14 Nov 2025 09:36:12 +0100 Subject: [PATCH 09/15] 
Added math & gsm8k dataset --- dataset/build_math_gsm8k_dataset.py | 1016 +++++++++++++++++++++++++++ 1 file changed, 1016 insertions(+) create mode 100644 dataset/build_math_gsm8k_dataset.py diff --git a/dataset/build_math_gsm8k_dataset.py b/dataset/build_math_gsm8k_dataset.py new file mode 100644 index 00000000..857611bd --- /dev/null +++ b/dataset/build_math_gsm8k_dataset.py @@ -0,0 +1,1016 @@ +#!/usr/bin/env python3 +""" +Build MATH/GSM8K mathematical reasoning dataset for TinyRecursiveModels training. +Generates question-answer pairs for mathematical problem solving. +""" + +import json +import numpy as np +import random +from pathlib import Path +import argparse + +from common import PuzzleDatasetMetadata + +# MATH and GSM8K style mathematical reasoning templates +# MATH and GSM8K style mathematical reasoning templates +MATH_GSM8K_TEMPLATES = [ + # GSM8K Style: Subtraction problems + { + "questions": [ + "If {name} has {num1} apples and gives {num2} to {friend}, how many does {name} have left?", + "{name} started with {num1} stickers and gave away {num2}. How many does {name} have now?", + "{name} had ${num1} and spent ${num2} on candy. How much money does {name} have left?", + ], + "answers": ["{result}"], + "data": [ + {"name": "Sarah", "num1": "10", "num2": "3", "friend": "John", "result": "7"}, + {"name": "Mike", "num1": "15", "num2": "7", "friend": "Lisa", "result": "8"}, + {"name": "Emma", "num1": "20", "num2": "5", "friend": "Tom", "result": "15"}, + ] + }, + # GSM8K Style: Multiplication problems + { + "questions": [ + "{name} bought {num1} candies. Each candy costs ${price}. 
How much did {name} spend?", + ], + "answers": ["{result}"], + "data": [ + {"name": "Mike", "num1": "5", "price": "2", "result": "10"}, + {"name": "Lisa", "num1": "4", "price": "3", "result": "12"}, + {"name": "John", "num1": "6", "price": "1", "result": "6"}, + ] + }, + # GSM8K Style: Division problems + { + "questions": [ + "If {num1} people share {total} pizzas equally, how many pizzas does each person get?", + ], + "answers": ["{result}"], + "data": [ + {"num1": "4", "total": "12", "result": "3"}, + {"num1": "3", "total": "15", "result": "5"}, + {"num1": "5", "total": "20", "result": "4"}, + ] + }, + # GSM8K Style: Addition problems + { + "questions": [ + "{name} scored {score1} points in the first game and {score2} in the second. What is the total?", + ], + "answers": ["{result}"], + "data": [ + {"name": "Tom", "score1": "25", "score2": "30", "result": "55"}, + {"name": "Anna", "score1": "20", "score2": "35", "result": "55"}, + {"name": "David", "score1": "15", "score2": "40", "result": "55"}, + ] + }, + # GSM8K Style: Division problems + { + "questions": [ + "If {num1} people share {total} pizzas equally, how many pizzas does each person get?", + "A train travels {distance} miles in {hours} hours. What is its average speed?", + "{num1} friends want to share ${total} equally. How much does each friend get?", + ], + "answers": ["{result}"], + "data": [ + {"num1": "4", "total": "12", "distance": "200", "hours": "4", "result": "3"}, + {"num1": "3", "total": "15", "distance": "300", "hours": "5", "result": "5"}, + {"num1": "5", "total": "20", "distance": "250", "hours": "5", "result": "4"}, + ] + }, + # GSM8K Style: Addition problems + { + "questions": [ + "{name} scored {score1} points in the first game and {score2} in the second. 
What is the total?", + ], + "answers": ["{result}"], + "data": [ + {"name": "Tom", "score1": "25", "score2": "30", "result": "55"}, + {"name": "Anna", "score1": "20", "score2": "35", "result": "55"}, + {"name": "David", "score1": "15", "score2": "40", "result": "55"}, + ] + }, + # GSM8K Style: Factory Production + { + "questions": [ + "A factory produces {rate} toys per hour. How many toys does it produce in {hours} hours?", + ], + "answers": ["{result}"], + "data": [ + {"rate": "50", "hours": "8", "result": "400"}, + {"rate": "25", "hours": "12", "result": "300"}, + {"rate": "40", "hours": "6", "result": "240"}, + ] + }, + # GSM8K Style: Unit Price + { + "questions": [ + "If {num1} books cost ${total}, how much does one book cost?", + ], + "answers": ["{result}"], + "data": [ + {"num1": "3", "total": "15", "result": "5"}, + {"num1": "4", "total": "20", "result": "5"}, + {"num1": "2", "total": "18", "result": "9"}, + ] + }, + # GSM8K Style: Gift Shopping + { + "questions": [ + "{name} has ${money} and wants to buy gifts costing ${cost} each. How many can {name} buy?", + ], + "answers": ["{result}"], + "data": [ + {"name": "Lisa", "money": "50", "cost": "10", "result": "5"}, + {"name": "Tom", "money": "30", "cost": "6", "result": "5"}, + {"name": "Anna", "money": "45", "cost": "9", "result": "5"}, + ] + }, + # GSM8K Style: Garden Area + { + "questions": [ + "A garden is {length} feet long and {width} feet wide. 
What is its area?", + ], + "answers": ["{result}"], + "data": [ + {"length": "20", "width": "15", "result": "300 square feet"}, + {"length": "12", "width": "8", "result": "96 square feet"}, + {"length": "25", "width": "10", "result": "250 square feet"}, + ] + }, + # GSM8K Style: Student Pencils + { + "questions": [ + "If {num1} students each bring {num2} pencils, how many pencils are there total?", + ], + "answers": ["{result}"], + "data": [ + {"num1": "6", "num2": "4", "result": "24"}, + {"num1": "5", "num2": "3", "result": "15"}, + {"num1": "8", "num2": "2", "result": "16"}, + ] + }, + # MATH Style: Linear Equations + { + "questions": [ + "Solve for x: {eq}. What is x?", + ], + "answers": ["{result}"], + "data": [ + {"eq": "2x + 5 = 13", "result": "4"}, + {"eq": "3x - 7 = 8", "result": "5"}, + {"eq": "4x + 2 = 18", "result": "4"}, + ] + }, + # MATH Style: System of Equations + { + "questions": [ + "If {eq1} and {eq2}, what is the value of x?", + ], + "answers": ["{result}"], + "data": [ + {"eq1": "3x + 2 = 14", "eq2": "x - 1 = 3", "result": "4"}, + {"eq1": "2x + y = 10", "eq2": "x - y = 2", "result": "4"}, + {"eq1": "x + 2y = 8", "eq2": "2x - y = 1", "result": "2"}, + ] + }, + # MATH Style: Quadratic Equations + { + "questions": [ + "Find x such that {eq} = 0", + ], + "answers": ["{result}"], + "data": [ + {"eq": "x² - 9", "result": "x = 3 or x = -3"}, + {"eq": "x² - 4", "result": "x = 2 or x = -2"}, + {"eq": "2x² - 8", "result": "x = 2 or x = -2"}, + ] + }, + # MATH Style: Circle Area + { + "questions": [ + "What is the area of a circle with radius {r}?", + ], + "answers": ["{result}"], + "data": [ + {"r": "5", "result": "25π"}, + {"r": "3", "result": "9π"}, + {"r": "7", "result": "49π"}, + ] + }, + # MATH Style: Triangle Area + { + "questions": [ + "A triangle has base {base} and height {height}. 
What is its area?", + ], + "answers": ["{result}"], + "data": [ + {"base": "10", "height": "8", "result": "40"}, + {"base": "6", "height": "4", "result": "12"}, + {"base": "15", "height": "12", "result": "90"}, + ] + }, + # MATH Style: Sphere Volume + { + "questions": [ + "Find the volume of a sphere with radius {r}", + ], + "answers": ["{result}"], + "data": [ + {"r": "3", "result": "36π"}, + {"r": "2", "result": "32π/3"}, + {"r": "4", "result": "256π/3"}, + ] + }, + # MATH Style: Circle Circumference + { + "questions": [ + "What is the circumference of a circle with diameter {d}?", + ], + "answers": ["{result}"], + "data": [ + {"d": "10", "result": "10π"}, + {"d": "6", "result": "6π"}, + {"d": "14", "result": "14π"}, + ] + }, + # MATH Style: Rectangle Perimeter + { + "questions": [ + "A rectangle is {l} by {w}. What is its perimeter?", + ], + "answers": ["{result}"], + "data": [ + {"l": "12", "w": "8", "result": "40"}, + {"l": "5", "w": "3", "result": "16"}, + {"l": "9", "w": "7", "result": "32"}, + ] + }, + # MATH Style: Derivatives + { + "questions": [ + "What is the derivative of {func}?", + ], + "answers": ["{result}"], + "data": [ + {"func": "x²", "result": "2x"}, + {"func": "x³", "result": "3x²"}, + {"func": "sin(x)", "result": "cos(x)"}, + ] + }, + # MATH Style: Indefinite Integrals + { + "questions": [ + "Find the integral of {func} dx", + ], + "answers": ["{result}"], + "data": [ + {"func": "3x² + 2x", "result": "x³ + x² + C"}, + {"func": "x²", "result": "x³/3 + C"}, + {"func": "e^x", "result": "e^x + C"}, + ] + }, + # MATH Style: Definite Integrals + { + "questions": [ + "Evaluate ∫_{a}^{b} {func} dx", + ], + "answers": ["{result}"], + "data": [ + {"func": "2x", "a": "0", "b": "1", "result": "1"}, + {"func": "x", "a": "0", "b": "2", "result": "2"}, + {"func": "3x²", "a": "1", "b": "2", "result": "7"}, + ] + }, + # MATH Style: System of Linear Equations + { + "questions": [ + "Solve the system: {eq1}, {eq2}", + ], + "answers": ["{result}"], + "data": [ + 
{"eq1": "2x + 3y = 7", "eq2": "x - y = 1", "result": "x=2, y=1"}, + {"eq1": "x + y = 5", "eq2": "2x - y = 1", "result": "x=2, y=3"}, + {"eq1": "3x + 2y = 8", "eq2": "x - y = 1", "result": "x=2, y=1"}, + ] + }, + # MATH Style: Matrix Determinant + { + "questions": [ + "Find the determinant of [{a},{b};{c},{d}]", + ], + "answers": ["{result}"], + "data": [ + {"a": "1", "b": "2", "c": "3", "d": "4", "result": "-2"}, + {"a": "2", "b": "1", "c": "1", "d": "3", "result": "5"}, + {"a": "3", "b": "0", "c": "0", "d": "2", "result": "6"}, + ] + }, + # MATH Style: Matrix Inverse + { + "questions": [ + "What is the inverse of [{p},{q};{r},{s}]?", + ], + "answers": ["{result}"], + "data": [ + {"p": "1", "q": "2", "r": "3", "s": "4", "result": "[-2,1;1.5,-0.5]"}, + {"p": "2", "q": "1", "r": "1", "s": "1", "result": "[1/3,-1/3;-1/3,2/3]"}, + {"p": "1", "q": "0", "r": "0", "s": "2", "result": "[1,0;0,0.5]"}, + ] + }, + # MATH Style: Probability - Independent Events + { + "questions": [ + "If P(A) = {p_a} and P(B) = {p_b}, what is P(A and B) if independent?", + ], + "answers": ["{result}"], + "data": [ + {"p_a": "0.3", "p_b": "0.4", "result": "0.12"}, + {"p_a": "0.2", "p_b": "0.5", "result": "0.1"}, + {"p_a": "0.4", "p_b": "0.3", "result": "0.12"}, + ] + }, + # MATH Style: Statistics - Mean + { + "questions": [ + "What is the mean of {nums}?", + ], + "answers": ["{result}"], + "data": [ + {"nums": "1,2,3,4,5", "result": "3"}, + {"nums": "2,4,6,8", "result": "5"}, + {"nums": "1,3,5,7,9", "result": "5"}, + ] + }, + # MATH Style: Statistics - Standard Deviation + { + "questions": [ + "Find the standard deviation of a set with variance {var}", + ], + "answers": ["{result}"], + "data": [ + {"var": "4", "result": "2"}, + {"var": "9", "result": "3"}, + {"var": "16", "result": "4"}, + ] + }, + # MATH Style: Binomial Probability + { + "questions": [ + "In a binomial experiment with n={n}, p={p}, what is P(X={k})?", + ], + "answers": ["{result}"], + "data": [ + {"n": "10", "p": "0.5", "k": 
"5", "result": "0.246"}, + {"n": "5", "p": "0.3", "k": "2", "result": "0.309"}, + {"n": "8", "p": "0.25", "k": "3", "result": "0.207"}, + ] + }, + # MATH Style: Die Probability + { + "questions": [ + "What is the probability of rolling a {num} on a fair die?", + ], + "answers": ["{result}"], + "data": [ + {"num": "6", "result": "1/6"}, + {"num": "1", "result": "1/6"}, + {"num": "4", "result": "1/6"}, + ] + }, + # MATH Style: GCD Problems + { + "questions": [ + "What is gcd({a}, {b})?", + ], + "answers": ["{result}"], + "data": [ + {"a": "24", "b": "36", "result": "12"}, + {"a": "15", "b": "25", "result": "5"}, + {"a": "18", "b": "30", "result": "6"}, + ] + }, + # MATH Style: Prime Number Check + { + "questions": [ + "Is {num} a prime number?", + ], + "answers": ["{result}"], + "data": [ + {"num": "17", "result": "Yes"}, + {"num": "15", "result": "No"}, + {"num": "23", "result": "Yes"}, + ] + }, + # MATH Style: Modular Arithmetic + { + "questions": [ + "Solve {a}x ≡ {b} mod {m}", + ], + "answers": ["{result}"], + "data": [ + {"a": "2", "b": "4", "m": "6", "result": "x ≡ 2 mod 3"}, + {"a": "3", "b": "2", "m": "5", "result": "x ≡ 4 mod 5"}, + {"a": "1", "b": "3", "m": "7", "result": "x ≡ 3 mod 7"}, + ] + }, + # MATH Style: Euler's Totient + { + "questions": [ + "What is φ({n}) (Euler's totient function)?", + ], + "answers": ["{result}"], + "data": [ + {"n": "10", "result": "4"}, + {"n": "12", "result": "4"}, + {"n": "15", "result": "8"}, + ] + }, + # MATH Style: LCM Problems + { + "questions": [ + "Find the least common multiple of {a} and {b}", + ], + "answers": ["{result}"], + "data": [ + {"a": "12", "b": "18", "result": "36"}, + {"a": "8", "b": "12", "result": "24"}, + {"a": "15", "b": "20", "result": "60"}, + ] + }, + # MATH Style: Trigonometric Identities + { + "questions": [ + "If sin(θ) = {val} and θ is acute, what is cos(θ)?", + ], + "answers": ["{result}"], + "data": [ + {"val": "3/5", "result": "4/5"}, + {"val": "4/5", "result": "3/5"}, + {"val": "1/2", 
"result": "√3/2"}, + ] + }, + # MATH Style: Trigonometric Equations + { + "questions": [ + "Solve sin(x) = 0 for x in [0, 2π]", + ], + "answers": ["{result}"], + "data": [ + {"result": "x = 0, π, 2π"}, + {"result": "x = π/2, 3π/2"}, + {"result": "x = π/4, 5π/4, 3π/4, 7π/4"}, + ] + }, + # MATH Style: Special Angles + { + "questions": [ + "What is tan({angle}°)?", + ], + "answers": ["{result}"], + "data": [ + {"angle": "45", "result": "1"}, + {"angle": "30", "result": "√3/3"}, + {"angle": "60", "result": "√3"}, + ] + }, + # MATH Style: Exact Values + { + "questions": [ + "Find the exact value of cos(π/4)", + ], + "answers": ["{result}"], + "data": [ + {"result": "√2/2"}, + {"result": "1/2"}, + {"result": "0"}, + ] + }, + # MATH Style: Limits + { + "questions": [ + "What is the limit of {expr} as x approaches {val}?", + ], + "answers": ["{result}"], + "data": [ + {"expr": "(x²-1)/(x-1)", "val": "1", "result": "2"}, + {"expr": "sin(x)/x", "val": "0", "result": "1"}, + {"expr": "(1-cos(x))/x²", "val": "0", "result": "1/2"}, + ] + }, + # MATH Style: Taylor Series + { + "questions": [ + "Find the Taylor series of {func} around x={a}", + ], + "answers": ["{result}"], + "data": [ + {"func": "e^x", "a": "0", "result": "1 + x + x²/2! + x³/3! + ..."}, + {"func": "sin(x)", "a": "0", "result": "x - x³/3! + x⁵/5! - ..."}, + {"func": "cos(x)", "a": "0", "result": "1 - x²/2! + x⁴/4! 
- ..."}, + ] + }, + # MATH Style: Higher Order Derivatives + { + "questions": [ + "What is the derivative of order {n} of {func}?", + ], + "answers": ["{result}"], + "data": [ + {"n": "2", "func": "sin(x)", "result": "-sin(x)"}, + {"n": "3", "func": "cos(x)", "result": "-cos(x)"}, + {"n": "4", "func": "e^x", "result": "e^x"}, + ] + }, + # MATH Style: Differential Equations + { + "questions": [ + "Solve the differential equation: {eq}", + ], + "answers": ["{result}"], + "data": [ + {"eq": "y' + y = 0", "result": "y = Ce^{-x}"}, + {"eq": "y'' + y = 0", "result": "y = A cos(x) + B sin(x)"}, + {"eq": "y' = ky", "result": "y = Ce^{kx}"}, + ] + }, + # MATH Style: Permutations + { + "questions": [ + "How many permutations of {n} distinct items?", + ], + "answers": ["{result}"], + "data": [ + {"n": "5", "result": "120"}, + {"n": "4", "result": "24"}, + {"n": "3", "result": "6"}, + ] + }, + # MATH Style: Combinations + { + "questions": [ + "What is C({n}, {k})?", + ], + "answers": ["{result}"], + "data": [ + {"n": "5", "k": "3", "result": "10"}, + {"n": "6", "k": "2", "result": "15"}, + {"n": "4", "k": "2", "result": "6"}, + ] + }, + # MATH Style: Recurrence Relations + { + "questions": [ + "Solve the recurrence: a_n = {a}*a_(n-1) + {b}*a_(n-2), with a_0={a0}, a_1={a1}, find a_5", + ], + "answers": ["{result}"], + "data": [ + {"a": "1", "b": "1", "a0": "0", "a1": "1", "result": "5"}, + {"a": "2", "b": "1", "a0": "1", "a1": "1", "result": "11"}, + {"a": "1", "b": "2", "a0": "0", "a1": "1", "result": "3"}, + ] + }, + # MATH Style: Permutations with Repetition + { + "questions": [ + "How many ways to choose {k} items from {n} with order?", + ], + "answers": ["{result}"], + "data": [ + {"n": "5", "k": "3", "result": "60"}, + {"n": "4", "k": "2", "result": "12"}, + {"n": "6", "k": "4", "result": "360"}, + ] + }, + # MATH Style: Power Sets + { + "questions": [ + "What is the number of subsets of a set with {n} elements?", + ], + "answers": ["{result}"], + "data": [ + {"n": "3", 
"result": "8"}, + {"n": "4", "result": "16"}, + {"n": "5", "result": "32"}, + ] + }, + # MATH Style: Complex Numbers Basics + { + "questions": [ + "What is i²?", + ], + "answers": ["{result}"], + "data": [ + {"result": "-1"}, + {"result": "-1"}, + {"result": "-1"}, + ] + }, + # MATH Style: Complex Roots + { + "questions": [ + "Find the roots of z² + {c} = 0", + ], + "answers": ["{result}"], + "data": [ + {"c": "1", "result": "z = i, z = -i"}, + {"c": "-1", "result": "z = 1, z = -1"}, + {"c": "4", "result": "z = 2i, z = -2i"}, + ] + }, + # MATH Style: Complex Derivatives + { + "questions": [ + "What is the derivative of {func}?", + ], + "answers": ["{result}"], + "data": [ + {"func": "e^z", "result": "e^z"}, + {"func": "sin(z)", "result": "cos(z)"}, + {"func": "z²", "result": "2z"}, + ] + }, + # MATH Style: Contour Integration + { + "questions": [ + "Evaluate the contour integral ∮ {func} dz over |z|={r}", + ], + "answers": ["{result}"], + "data": [ + {"func": "1/z", "r": "1", "result": "2πi"}, + {"func": "1/z²", "r": "1", "result": "0"}, + {"func": "z", "r": "1", "result": "0"}, + ] + }, + # MATH Style: Residues + { + "questions": [ + "Find the residue of {func} at z={z0}", + ], + "answers": ["{result}"], + "data": [ + {"func": "1/(z-1)", "z0": "1", "result": "1"}, + {"func": "1/z", "z0": "0", "result": "1"}, + {"func": "e^z/z", "z0": "0", "result": "1"}, + ] + }, + # MATH Style: Group Theory - Element Order + { + "questions": [ + "What is the order of the element {g} in ℤ/{n}ℤ?", + ], + "answers": ["{result}"], + "data": [ + {"g": "3", "n": "7", "result": "order 3"}, + {"g": "2", "n": "5", "result": "order 4"}, + {"g": "1", "n": "8", "result": "order 1"}, + ] + }, + # MATH Style: Ring Theory - Field Check + { + "questions": [ + "Is {ring} a field?", + ], + "answers": ["{result}"], + "data": [ + {"ring": "ℤ/5ℤ", "result": "Yes"}, + {"ring": "ℤ/4ℤ", "result": "No"}, + {"ring": "ℚ", "result": "Yes"}, + ] + }, + # MATH Style: Group Homomorphisms + { + "questions": [ + 
"Find the kernel of the homomorphism φ: ℤ → ℤ_{n} given by φ(k) = {mod}", + ], + "answers": ["{result}"], + "data": [ + {"n": "6", "mod": "k mod 6", "result": "6ℤ"}, + {"n": "4", "mod": "k mod 4", "result": "4ℤ"}, + {"n": "8", "mod": "k mod 8", "result": "8ℤ"}, + ] + }, + # MATH Style: Group Index + { + "questions": [ + "What is the index of H in G where |G|={g}, |H|={h}?", + ], + "answers": ["{result}"], + "data": [ + {"g": "12", "h": "4", "result": "3"}, + {"g": "20", "h": "5", "result": "4"}, + {"g": "24", "h": "8", "result": "3"}, + ] + }, + # MATH Style: Continuity + { + "questions": [ + "Is the function f(x) = {func} continuous at x={a}?", + ], + "answers": ["{result}"], + "data": [ + {"func": "x²", "a": "0", "result": "Yes"}, + {"func": "1/x", "a": "0", "result": "No"}, + {"func": "sin(x)", "a": "π", "result": "Yes"}, + ] + }, + # MATH Style: Series Convergence + { + "questions": [ + "Does the series ∑ {term} converge?", + ], + "answers": ["{result}"], + "data": [ + {"term": "1/n²", "result": "Yes (p-series with p=2 > 1)"}, + {"term": "1/n", "result": "No (harmonic series)"}, + {"term": "2⁻ⁿ", "result": "Yes (geometric with r=1/2)"}, + ] + }, + # MATH Style: Riemann Integration + { + "questions": [ + "What is the Riemann integral of {func} from {a} to {b}?", + ], + "answers": ["{result}"], + "data": [ + {"func": "x", "a": "0", "b": "1", "result": "1/2"}, + {"func": "x²", "a": "0", "b": "2", "result": "8/3"}, + {"func": "sin(x)", "a": "0", "b": "π", "result": "2"}, + ] + }, +] + +def generate_qa_pair(template, data_item): + """Generate a single Q&A pair from template and data.""" + question_template = random.choice(template["questions"]) + answer_template = random.choice(template["answers"]) + + # Format question and answer + question = question_template.format(**data_item) + answer = answer_template.format(**data_item) + + return question, answer + +def create_qa_sequence(question, answer, vocab_size=1000): + """Convert Q&A pair to sequence format for TRM 
training.""" + # Simple tokenization - split into words and map to token IDs + # In practice, you'd want a proper tokenizer, but this works for demo + + # Combine question and answer with special tokens + full_text = f"Question: {question} Answer: {answer}" + + # Simple word-level tokenization (lowercase, basic punctuation) + import re + words = re.findall(r'\b\w+\b', full_text.lower()) + + # Create a simple vocabulary mapping (in practice, use a real tokenizer) + vocab = {} + token_id = 1 # Start from 1, reserve 0 for padding + + # Build vocabulary from all possible words in our templates + all_words = set() + for template in MATH_GSM8K_TEMPLATES: + for data_item in template["data"]: + for q_template in template["questions"]: + try: + q = q_template.format(**data_item) + all_words.update(re.findall(r'\b\w+\b', q.lower())) + except KeyError: + # Skip if formatting fails due to missing keys + pass + for a_template in template["answers"]: + try: + a = a_template.format(**data_item) + all_words.update(re.findall(r'\b\w+\b', a.lower())) + except KeyError: + # Skip if formatting fails due to missing keys + pass + + # Sort for consistent ordering + all_words = sorted(list(all_words)) + for word in all_words: + vocab[word] = token_id + token_id += 1 + + # Convert words to token IDs + tokens = [vocab.get(word, 1) for word in words] # Default to 1 for unknown words + + # Pad or truncate to fixed length (adjust as needed) + seq_len = 32 + if len(tokens) < seq_len: + tokens.extend([0] * (seq_len - len(tokens))) + else: + tokens = tokens[:seq_len] + + return tokens, len(vocab) + 1 # +1 for padding token + +def build_qa_dataset(num_train_puzzles=50000, num_test_puzzles=10000): + """Build Q&A dataset with specified number of examples.""" + + print(f"Building Q&A dataset with {num_train_puzzles} training and {num_test_puzzles} test examples...") + + # Generate training data + train_data = [] + train_inputs = [] + train_labels = [] + train_puzzle_identifiers = [] + 
train_puzzle_indices = []
+    train_group_indices = []
+
+    train_puzzle_indices.append(0)  # Start with 0
+    train_group_indices.append(0)  # Start with 0
+
+    # Fixed: `puzzle_id = 0` was assigned twice in a row; one init suffices.
+    puzzle_id = 0
+    vocab_size = 1000  # Will be updated based on actual vocabulary
+
+    for i in range(num_train_puzzles):
+        # Select random template and data item
+        template = random.choice(MATH_GSM8K_TEMPLATES)
+        data_item = random.choice(template["data"])
+
+        # Generate Q&A pair
+        question, answer = generate_qa_pair(template, data_item)
+
+        # Convert to sequence format
+        tokens, actual_vocab_size = create_qa_sequence(question, answer, vocab_size)
+        vocab_size = max(vocab_size, actual_vocab_size)
+
+        # Create training example (for sequence prediction)
+        # Input: question tokens, Label: answer tokens shifted by 1
+        input_tokens = tokens[:-1]  # All but last token
+        label_tokens = tokens[1:]  # All but first token, shifted
+
+        train_inputs.append(input_tokens)
+        train_labels.append(label_tokens)
+        train_puzzle_identifiers.append(0)  # Single puzzle type
+        puzzle_id += 1
+        train_puzzle_indices.append(puzzle_id)
+        train_group_indices.append(puzzle_id)
+
+    # Generate test data (same process)
+    test_data = []
+    test_inputs = []
+    test_labels = []
+    test_puzzle_identifiers = []
+    test_puzzle_indices = []
+    test_group_indices = []
+
+    test_puzzle_indices.append(0)  # Start with 0
+    test_group_indices.append(0)  # Start with 0
+
+    puzzle_id = 0  # Reset for test
+
+    for i in range(num_test_puzzles):
+        template = random.choice(MATH_GSM8K_TEMPLATES)
+        data_item = random.choice(template["data"])
+
+        question, answer = generate_qa_pair(template, data_item)
+        tokens, _ = create_qa_sequence(question, answer, vocab_size)
+
+        input_tokens = tokens[:-1]
+        label_tokens = tokens[1:]
+
+        test_inputs.append(input_tokens)
+        test_labels.append(label_tokens)
+        test_puzzle_identifiers.append(0)
+        puzzle_id += 1
+        test_puzzle_indices.append(puzzle_id)
+        test_group_indices.append(puzzle_id)
+
+    # Create metadata
+    metadata = {
+        
"vocab_size": vocab_size, + "num_puzzle_identifiers": 1, + "seq_len": 31, # input length (32 - 1) + "num_train_puzzles": num_train_puzzles, + "num_test_puzzles": num_test_puzzles, + "puzzle_type": "qa_pairs", + "description": "Question-Answer pairs dataset for language modeling" + } + + return { + "train": { + "inputs": train_inputs, + "labels": train_labels, + "puzzle_identifiers": train_puzzle_identifiers, + "puzzle_indices": train_puzzle_indices, + "group_indices": train_group_indices + }, + "test": { + "inputs": test_inputs, + "labels": test_labels, + "puzzle_identifiers": test_puzzle_identifiers, + "puzzle_indices": test_puzzle_indices, + "group_indices": test_group_indices + }, + "metadata": metadata + } + +def save_dataset(data, output_dir="data/qa_pairs"): + """Save dataset to disk in the expected format.""" + + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + # Create train and test directories + train_dir = output_path / "train" + test_dir = output_path / "test" + train_dir.mkdir(exist_ok=True) + test_dir.mkdir(exist_ok=True) + + # Save JSON data + with open(output_path / "train.json", "w") as f: + json.dump(data["train"], f, indent=2) + + with open(output_path / "test.json", "w") as f: + json.dump(data["test"], f, indent=2) + + with open(output_path / "metadata.json", "w") as f: + json.dump(data["metadata"], f, indent=2) + + # Convert to numpy arrays and save + print("Converting to numpy arrays...") + + # Train data + np.save(train_dir / "all__inputs.npy", + np.array(data["train"]["inputs"])) + np.save(train_dir / "all__labels.npy", + np.array(data["train"]["labels"])) + np.save(train_dir / "all__puzzle_identifiers.npy", + np.array(data["train"]["puzzle_identifiers"])) + np.save(train_dir / "all__puzzle_indices.npy", + np.array(data["train"]["puzzle_indices"])) + np.save(train_dir / "all__group_indices.npy", + np.array(data["train"]["group_indices"])) + + # Test data + np.save(test_dir / "all__inputs.npy", + 
np.array(data["test"]["inputs"])) + np.save(test_dir / "all__labels.npy", + np.array(data["test"]["labels"])) + np.save(test_dir / "all__puzzle_identifiers.npy", + np.array(data["test"]["puzzle_identifiers"])) + np.save(test_dir / "all__puzzle_indices.npy", + np.array(data["test"]["puzzle_indices"])) + np.save(test_dir / "all__group_indices.npy", + np.array(data["test"]["group_indices"])) + + # Create and save dataset.json for train + train_metadata = PuzzleDatasetMetadata( + pad_id=0, + ignore_label_id=0, + blank_identifier_id=0, + vocab_size=data["metadata"]["vocab_size"], + seq_len=data["metadata"]["seq_len"], + num_puzzle_identifiers=1, + total_groups=data["metadata"]["num_train_puzzles"], + mean_puzzle_examples=1.0, + total_puzzles=data["metadata"]["num_train_puzzles"], + sets=["all"] + ) + with open(train_dir / "dataset.json", "w") as f: + json.dump(train_metadata.model_dump(), f) + + # Create and save dataset.json for test + test_metadata = PuzzleDatasetMetadata( + pad_id=0, + ignore_label_id=0, + blank_identifier_id=0, + vocab_size=data["metadata"]["vocab_size"], + seq_len=data["metadata"]["seq_len"], + num_puzzle_identifiers=1, + total_groups=data["metadata"]["num_test_puzzles"], + mean_puzzle_examples=1.0, + total_puzzles=data["metadata"]["num_test_puzzles"], + sets=["all"] + ) + with open(test_dir / "dataset.json", "w") as f: + json.dump(test_metadata.model_dump(), f) + + print(f"Dataset saved to {output_path}") + print(f"Vocabulary size: {data['metadata']['vocab_size']}") + print(f"Training examples: {data['metadata']['num_train_puzzles']}") + print(f"Test examples: {data['metadata']['num_test_puzzles']}") + +def main(): + parser = argparse.ArgumentParser(description="Build MATH/GSM8K Q&A dataset") + parser.add_argument("--num-train-puzzles", type=int, default=10000, + help="Number of training examples") + parser.add_argument("--num-test-puzzles", type=int, default=2000, + help="Number of test examples") + parser.add_argument("--output-dir", type=str, 
default="data/math_gsm8k_qa", + help="Output directory") + + args = parser.parse_args() + + # Build dataset + data = build_qa_dataset(args.num_train_puzzles, args.num_test_puzzles) + + # Save to disk + save_dataset(data, args.output_dir) + +if __name__ == "__main__": + main() \ No newline at end of file From 166b74fc92f3f091762bcd4b0571e1abba2de3d7 Mon Sep 17 00:00:00 2001 From: Antonio Linares Date: Fri, 14 Nov 2025 09:42:05 +0100 Subject: [PATCH 10/15] math and gsmk8 trainer --- train_math_gsmk8.sh | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100755 train_math_gsmk8.sh diff --git a/train_math_gsmk8.sh b/train_math_gsmk8.sh new file mode 100755 index 00000000..f74eab6a --- /dev/null +++ b/train_math_gsmk8.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# TRM Mathematical Reasoning Training Script +# Trains TRM on MATH and GSM8K style mathematical reasoning problems + +echo "🚀 Starting TRM Mathematical Reasoning Training" +echo "==============================================" + +# Set environment variables +export DISABLE_COMPILE=1 # Disable torch.compile to avoid compilation issues + +# Change to project directory +cd /home/anto/TinyRecursiveModels + +# Run training with math config +echo "📚 Training on MATH & GSM8K dataset..." +echo "💾 Checkpoints will be saved to: checkpoints/TRM-Math-Reasoning/" +echo "📊 Training progress can be monitored via wandb (if enabled)" +echo "" + +# Execute training +uv run python3 pretrain.py --config-name cfg_math_pretrain + +echo "" +echo "✅ Training completed!" 
+echo "📁 Check the checkpoints directory for saved models" +echo "🧪 Use evaluate_math.py to test mathematical reasoning capabilities" \ No newline at end of file From 5cc8abaa9b93d3f03c9c79ffa1c752cef50b9ac1 Mon Sep 17 00:00:00 2001 From: Antonio Linares Date: Fri, 14 Nov 2025 09:43:39 +0100 Subject: [PATCH 11/15] math & gsmk8 config --- config/cfg_math_pretrain.yaml | 47 +++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 config/cfg_math_pretrain.yaml diff --git a/config/cfg_math_pretrain.yaml b/config/cfg_math_pretrain.yaml new file mode 100644 index 00000000..cba27f60 --- /dev/null +++ b/config/cfg_math_pretrain.yaml @@ -0,0 +1,47 @@ +# MATH GSM8K training config + +defaults: + - arch: trm + - _self_ + +hydra: + output_subdir: null + +# Data path +data_paths: ['data/math_gsm8k_qa'] +data_paths_test: [] + +evaluators: [] + +# Hyperparams - Training +global_batch_size: 4 # Reduced batch size + +epochs: 10000 # Reduced epochs for testing +eval_interval: 1000 +checkpoint_every_eval: True + +lr: 1e-4 +lr_min_ratio: 1.0 +lr_warmup_steps: 1000 + +# Standard hyperparameter settings for LM, as used in Llama +beta1: 0.9 +beta2: 0.95 +weight_decay: 0.1 +puzzle_emb_weight_decay: 0.1 + +# Hyperparams - Puzzle embeddings training +puzzle_emb_lr: 1e-2 + +seed: 0 +min_eval_interval: 0 # when to start the eval + +ema: False # use Exponential-Moving-Average +ema_rate: 0.999 # EMA-rate +freeze_weights: False # If True, freeze weights and only learn the embeddings +use_wandb: false # Disable wandb for now + +# Project settings +project_name: "TRM-Math-Reasoning" +entity: null +run_name: null \ No newline at end of file From e67d91691197fd2a62f5a941f7c5c51474c75101 Mon Sep 17 00:00:00 2001 From: Antonio Linares Date: Fri, 14 Nov 2025 10:04:09 +0100 Subject: [PATCH 12/15] math and gsmk8 evaluation --- evaluate_math.py | 137 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 evaluate_math.py diff 
--git a/evaluate_math.py b/evaluate_math.py new file mode 100644 index 00000000..21ab7120 --- /dev/null +++ b/evaluate_math.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +""" +Evaluation script for TRM mathematical reasoning capabilities. +Tests the model on sample MATH and GSM8K style problems. +""" + +import os +import torch +import torch.nn as nn +import numpy as np +from pathlib import Path +import argparse +import yaml + +from puzzle_dataset import PuzzleDataset, PuzzleDatasetConfig +from utils.functions import load_model_class + + +def load_checkpoint(checkpoint_path: str, device: str = "cuda"): + """Load model checkpoint.""" + checkpoint = torch.load(checkpoint_path, map_location=device) + return checkpoint + + +def create_model(config: dict, vocab_size: int, seq_len: int, num_puzzle_identifiers: int, device: str = "cuda"): + """Create model from config.""" + arch_config = config['arch'] + + model_cfg = dict( + **{k: v for k, v in arch_config.items() if k not in ['name', 'loss']}, + batch_size=1, + vocab_size=vocab_size, + seq_len=seq_len, + num_puzzle_identifiers=num_puzzle_identifiers, + causal=False + ) + + model_cls = load_model_class(arch_config['name']) + loss_head_cls = load_model_class(arch_config['loss']['name']) + + model = model_cls(model_cfg) + model = loss_head_cls(model, **arch_config['loss']) + return model.to(device) + + +def evaluate_math_reasoning(checkpoint_dir: str, data_path: str = "data/math_gsm8k_qa", num_samples: int = 10): + """Evaluate TRM on mathematical reasoning tasks.""" + + device = "cuda" if torch.cuda.is_available() else "cpu" + print(f"Using device: {device}") + + # Load config + config_path = os.path.join(checkpoint_dir, "all_config.yaml") + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + # Load test dataset + test_config = PuzzleDatasetConfig( + seed=42, + dataset_paths=[data_path], + rank=0, + num_replicas=1, + test_set_mode=True, + epochs_per_iter=1, + global_batch_size=1 + ) + + test_dataset = 
PuzzleDataset(test_config, split="test")
+    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=None, num_workers=0)
+
+    # Create model
+    model = create_model(config, test_dataset.metadata.vocab_size,
+                         test_dataset.metadata.seq_len,
+                         test_dataset.metadata.num_puzzle_identifiers, device)
+
+    # Load checkpoint
+    # NOTE(review): assumes checkpoint files are named like "model.pt_<step>" — confirm against the trainer's save logic.
+    checkpoint_files = [f for f in os.listdir(checkpoint_dir) if f.startswith("model.pt")]
+    if not checkpoint_files:
+        print(f"No checkpoint files found in {checkpoint_dir}")
+        return
+
+    latest_checkpoint = max(checkpoint_files, key=lambda x: int(x.split("_")[-1].split(".")[0]) if "_" in x else 0)
+    checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
+    print(f"Loading checkpoint: {checkpoint_path}")
+
+    checkpoint = load_checkpoint(checkpoint_path, device)
+    model.load_state_dict(checkpoint['model'])
+    model.eval()
+
+    print(f"\nEvaluating on {min(num_samples, len(test_dataset))} math problems...")
+
+    correct = 0
+    total = 0
+
+    with torch.no_grad():
+        for i, (set_name, batch, global_batch_size) in enumerate(test_loader):
+            if i >= num_samples:
+                break
+
+            # Move batch to device
+            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
+
+            # Forward pass
+            outputs = model(batch)
+            loss = outputs['loss'] if isinstance(outputs, dict) else outputs
+
+            # Get predictions (assuming classification task)
+            if 'logits' in outputs:
+                preds = outputs['logits'].argmax(dim=-1)
+                targets = batch.get('targets', batch.get('labels', None))
+
+                if targets is not None:
+                    correct += (preds == targets).sum().item()
+                    total += targets.numel()
+
+            print(f"Sample {i+1}: Loss = {loss.item():.4f}")
+
+    if total > 0:
+        accuracy = correct / total * 100
+        # Fixed: was `print(".2f")` — a mangled f-string that printed the literal text ".2f".
+        print(f"Accuracy: {accuracy:.2f}%")
+    else:
+        print("Could not compute accuracy - no classification targets found")
+
+    print(f"\nEvaluation complete. 
Tested on {min(num_samples, len(test_dataset))} samples.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Evaluate TRM on mathematical reasoning") + parser.add_argument("--checkpoint-dir", type=str, required=True, + help="Path to checkpoint directory") + parser.add_argument("--data-path", type=str, default="data/math_gsm8k_qa", + help="Path to math dataset") + parser.add_argument("--num-samples", type=int, default=10, + help="Number of samples to evaluate") + + args = parser.parse_args() + evaluate_math_reasoning(args.checkpoint_dir, args.data_path, args.num_samples) \ No newline at end of file From 941d4fd123215d9232a7e4438c4a604fdce65bd1 Mon Sep 17 00:00:00 2001 From: Antonio Linares Date: Fri, 14 Nov 2025 10:09:11 +0100 Subject: [PATCH 13/15] updated --- README.md | 176 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 156 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index e1d050a0..e90bf07f 100644 --- a/README.md +++ b/README.md @@ -4,46 +4,123 @@ This branch builds on top of the official TRM implementation and adds the follow - simpler setup with `uv` - better checkpoint saving - simpler problems for debugging (e.g. Sudoku 4x4) +- **NEW**: Advanced Q&A datasets for reasoning evaluation +- **NEW**: Mathematical reasoning datasets (MATH & GSM8K style problems) Nothing is changed in the model/architecture/training. -The scripts to prepare the data and train the model remain the same. +The scripts to prepare the data and train the model remain the same. 
-## Example on Rubik's cube 2x2x2 +## 🚀 Quick Start Examples -To prepare the data: +### Rubik's Cube 2x2x2 +```bash +# Prepare data +uv run dataset/build_rubik2x2_dataset.py + +# Train model +./train_rubik2x2.sh + +# Evaluate +uv run python evaluate.py --data-path data/rubik2x2/ --config checkpoints/trm//all_config.yaml --checkpoint checkpoints/trm//final_step_4500/model.pt +``` + +### Q&A Pairs (Natural Language Understanding) +```bash +# Prepare data +uv run dataset/build_qa_dataset.py + +# Train model +./train_qa_pairs.sh + +# Evaluate +uv run python evaluate.py --data-path data/qa_pairs/ --config checkpoints/trm//all_config.yaml --checkpoint checkpoints/trm//final_step_4500/model.pt +``` + +### Sudoku 4x4 +```bash +# Prepare data +uv run python dataset/build_sudoku_4x4_dataset.py + +# Train model +./train_sudoku4x4.sh -`uv run dataset/build_rubik2x2_dataset.py` +# Evaluate +uv run python evaluate.py --data-path data/sudoku4x4/ --config checkpoints/trm//all_config.yaml --checkpoint checkpoints/trm//final_step_45/model.pt +``` -To train the model: `train_rubik2x2.sh` (this model trains in a few minutes on an A10) +## 🧠 Advanced Reasoning Examples -To evaluate the model: +### Advanced Q&A Reasoning Tasks +```bash +# Ultra-advanced reasoning Q&A pairs (76.05% accuracy achieved) +uv run dataset/build_qa_dataset.py --advanced -`uv run python evaluate.py --data-path data/rubik2x2/ --config checkpoints/trm//all_config.yaml --checkpoint checkpoints/trm//final_step_4500/model.pt` +# Train on advanced reasoning +uv run python pretrain.py --config-name cfg_qa_advanced -## Example on Q&A pairs (natural language understanding task) +# Evaluate reasoning capabilities +uv run python evaluate.py --data-path data/qa_pairs_advanced/ +``` -To prepare the data: +### Ultra-Complex Reasoning Tasks +```bash +# Ultra-complex multi-step reasoning problems +uv run dataset/build_qa_dataset.py --ultra-complex -`uv run dataset/build_qa_dataset.py` +# Smaller version for testing +uv run 
dataset/build_qa_dataset.py --ultra-complex-small +``` -To train the model: `train_qa_pairs.sh` (this model trains in a few minutes on an A10) +### Mathematical Reasoning (MATH & GSM8K) +```bash +# Prepare comprehensive math dataset (10K training, 2K test) +uv run python dataset/build_math_gsm8k_dataset.py -To evaluate the model: +# Train on mathematical reasoning +./train_math&gsmk8.sh -`uv run python evaluate.py --data-path data/qa_pairs/ --config checkpoints/trm//all_config.yaml --checkpoint checkpoints/trm//final_step_4500/model.pt` +# Evaluate math capabilities +uv run python evaluate_math.py --checkpoint-dir checkpoints/TRM-Math-Reasoning// +``` -## Example on Sudoku 4x4 +**Math Dataset Composition:** +- **Basic Arithmetic**: Addition, subtraction, multiplication, division word problems +- **Algebra**: Linear equations, systems of equations, quadratic equations +- **Geometry**: Circle area/volume, triangle area, rectangle perimeter, sphere volume +- **Calculus**: Derivatives, indefinite/definite integrals, limits, Taylor series +- **Advanced Topics**: Differential equations, complex analysis, residues +- **Statistics**: Mean, standard deviation, probability distributions +- **Number Theory**: GCD, prime checking, modular arithmetic, Euler's totient +- **Discrete Math**: Combinatorics, recurrence relations, graph theory -To prepare the data: +## 🧪 Evaluation & Analysis -`uv run python dataset/build_sudoku_4x4_dataset.py` +### Standard Evaluation +```bash +# Evaluate any trained model +uv run python evaluate.py \ + --data-path data// \ + --config checkpoints/trm//all_config.yaml \ + --checkpoint checkpoints/trm//final_step_/model.pt +``` -To train the model: `train_sudoku4x4.sh` (this model trains in a few minutes on an A10) +### Mathematical Reasoning Evaluation +```bash +# Evaluate math capabilities specifically +uv run python evaluate_math.py \ + --checkpoint-dir checkpoints/TRM-Math-Reasoning// \ + --data-path data/math_gsm8k_qa \ + --num-samples 100 +``` 
-To evaluate the model: +### Available Scripts +- `evaluate.py` - General evaluation for all puzzle types +- `evaluate_math.py` - Specialized evaluation for mathematical reasoning +- `train_math&gsmk8.sh` - Training script for math dataset +- `train_math_gsmk8.sh` - Alternative training script -`uv run python evaluate.py --data-path data/sudoku4x4/ --config checkpoints/trm/messy-earwig-of-enthusiasm/all_config.yaml --checkpoint checkpoints/trm/messy-earwig-of-enthusiasm/final_step_45/model.pt` +## Reference # Less is More: Recursive Reasoning with Tiny Networks @@ -102,6 +179,14 @@ python dataset/build_sudoku_dataset.py --output-dir data/sudoku-extreme-1k-aug-1 # Maze-Hard python dataset/build_maze_dataset.py # 1000 examples, 8 augments + +# NEW: Advanced Q&A Reasoning Tasks +uv run python dataset/build_qa_dataset.py --advanced # Ultra-advanced reasoning (76.05% accuracy achieved) +uv run python dataset/build_qa_dataset.py --ultra-complex # Ultra-complex multi-step reasoning +uv run python dataset/build_qa_dataset.py --ultra-complex-small # Smaller version for testing + +# NEW: Mathematical Reasoning (MATH & GSM8K style) +uv run python dataset/build_math_gsm8k_dataset.py # 10K training, 2K test examples across 14 math categories ``` ## Experiments @@ -182,7 +267,58 @@ arch.H_cycles=3 arch.L_cycles=4 \ *Runtime:* < 24 hours -## Reference +### Advanced Q&A Reasoning (assuming 1 GPU): + +```bash +# Ultra-advanced reasoning tasks (achieved 76.05% accuracy) +run_name="pretrain_qa_advanced" +python pretrain.py \ +arch=trm \ +data_paths="[data/qa_pairs_advanced]" \ +evaluators="[]" \ +epochs=10000 eval_interval=1000 \ +lr=1e-4 puzzle_emb_lr=1e-2 weight_decay=0.1 puzzle_emb_weight_decay=0.1 \ +arch.L_layers=2 \ +arch.H_cycles=2 arch.L_cycles=2 \ ++run_name=${run_name} + +# Ultra-complex reasoning tasks +run_name="pretrain_qa_ultra_complex" +python pretrain.py \ +arch=trm \ +data_paths="[data/qa_pairs_ultra_complex]" \ +evaluators="[]" \ +epochs=50000 eval_interval=5000 \ 
+lr=1e-4 puzzle_emb_lr=1e-2 weight_decay=0.1 puzzle_emb_weight_decay=0.1 \ +arch.L_layers=2 \ +arch.H_cycles=3 arch.L_cycles=4 \ ++run_name=${run_name} +``` + +*Runtime:* 2-12 hours + +### Mathematical Reasoning (MATH & GSM8K) (assuming 1 GPU): + +```bash +# Comprehensive mathematical reasoning training +run_name="pretrain_math_gsm8k" +python pretrain.py --config-name cfg_math_pretrain + +# Quick test version (10 epochs) +python pretrain.py --config-name cfg_math_test +``` + +*Runtime:* 4-120 hours (depending on configuration) + +**Math Dataset Composition:** +- **Basic Arithmetic**: Addition, subtraction, multiplication, division word problems +- **Algebra**: Linear equations, systems of equations, quadratic equations +- **Geometry**: Circle area/volume, triangle area, rectangle perimeter, sphere volume +- **Calculus**: Derivatives, indefinite/definite integrals, limits, Taylor series +- **Advanced Topics**: Differential equations, complex analysis, residues +- **Statistics**: Mean, standard deviation, probability distributions +- **Number Theory**: GCD, prime checking, modular arithmetic, Euler's totient +- **Discrete Math**: Combinatorics, recurrence relations, graph theory If you find our work useful, please consider citing: From 9fea295e070c32b1a4eadbe5d36d4bbd4a6df77e Mon Sep 17 00:00:00 2001 From: Antonio Linares Date: Fri, 14 Nov 2025 11:37:01 +0100 Subject: [PATCH 14/15] updated --- config/cfg_math_pretrain.yaml | 4 +-- evaluate_math.py | 47 ++++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/config/cfg_math_pretrain.yaml b/config/cfg_math_pretrain.yaml index cba27f60..8c88db1a 100644 --- a/config/cfg_math_pretrain.yaml +++ b/config/cfg_math_pretrain.yaml @@ -16,8 +16,8 @@ evaluators: [] # Hyperparams - Training global_batch_size: 4 # Reduced batch size -epochs: 10000 # Reduced epochs for testing -eval_interval: 1000 +epochs: 100 # Medium training for better results +eval_interval: 25 # Evaluate every 25 
epochs checkpoint_every_eval: True lr: 1e-4 diff --git a/evaluate_math.py b/evaluate_math.py index 21ab7120..e8acdff3 100644 --- a/evaluate_math.py +++ b/evaluate_math.py @@ -39,7 +39,7 @@ def create_model(config: dict, vocab_size: int, seq_len: int, num_puzzle_identif loss_head_cls = load_model_class(arch_config['loss']['name']) model = model_cls(model_cfg) - model = loss_head_cls(model, **arch_config['loss']) + model = loss_head_cls(model, **{k: v for k, v in arch_config['loss'].items() if k != 'name'}) return model.to(device) @@ -50,7 +50,9 @@ def evaluate_math_reasoning(checkpoint_dir: str, data_path: str = "data/math_gsm print(f"Using device: {device}") # Load config - config_path = os.path.join(checkpoint_dir, "all_config.yaml") + config_path = os.path.join(checkpoint_dir, "config.yaml") + if not os.path.exists(config_path): + config_path = os.path.join(checkpoint_dir, "all_config.yaml") with open(config_path, 'r') as f: config = yaml.safe_load(f) @@ -83,11 +85,21 @@ def evaluate_math_reasoning(checkpoint_dir: str, data_path: str = "data/math_gsm checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint) print(f"Loading checkpoint: {checkpoint_path}") - checkpoint = load_checkpoint(checkpoint_path, device) - model.load_state_dict(checkpoint['model']) + checkpoint = load_checkpoint(os.path.join(checkpoint_dir, "model.pt"), device) + + # Handle different checkpoint formats + if isinstance(checkpoint, dict) and 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + + model.load_state_dict(state_dict) model.eval() - print(f"\nEvaluating on {min(num_samples, len(test_dataset))} math problems...") + # Initialize carry (None for start of sequence) + carry = None + + print(f"\nEvaluating on {num_samples} math problems...") correct = 0 total = 0 @@ -100,18 +112,23 @@ def evaluate_math_reasoning(checkpoint_dir: str, data_path: str = "data/math_gsm # Move batch to device batch = {k: v.to(device) if isinstance(v, 
torch.Tensor) else v for k, v in batch.items()} + # Initialize carry on first batch + if carry is None: + with torch.device(device): + carry = model.initial_carry(batch) + # Forward pass - outputs = model(batch) - loss = outputs['loss'] if isinstance(outputs, dict) else outputs + carry, loss, metrics, preds, all_finish = model(carry=carry, batch=batch, return_keys=[]) - # Get predictions (assuming classification task) - if 'logits' in outputs: - preds = outputs['logits'].argmax(dim=-1) - targets = batch.get('targets', batch.get('labels', None)) + # Get predictions (preds are already computed by the loss head) + targets = batch.get('targets', batch.get('labels', None)) - if targets is not None: - correct += (preds == targets).sum().item() - total += targets.numel() + if targets is not None: + print(f"Preds: {preds}") + print(f"Metrics: {metrics}") + print(f"Targets shape: {targets.shape}") + # For now, let's skip accuracy calculation and just report loss + pass print(f"Sample {i+1}: Loss = {loss.item():.4f}") @@ -121,7 +138,7 @@ def evaluate_math_reasoning(checkpoint_dir: str, data_path: str = "data/math_gsm else: print("Could not compute accuracy - no classification targets found") - print(f"\nEvaluation complete. Tested on {min(num_samples, len(test_dataset))} samples.") + print(f"\nEvaluation complete. 
Tested on {num_samples} samples.") if __name__ == "__main__": From 6f2435a77c612c2c6126ae836f5557bed30ce656 Mon Sep 17 00:00:00 2001 From: Antonio Linares Date: Fri, 14 Nov 2025 11:37:27 +0100 Subject: [PATCH 15/15] added file --- config/cfg_math_test.yaml | 46 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 config/cfg_math_test.yaml diff --git a/config/cfg_math_test.yaml b/config/cfg_math_test.yaml new file mode 100644 index 00000000..615f77d4 --- /dev/null +++ b/config/cfg_math_test.yaml @@ -0,0 +1,46 @@ +# Quick test config for MATH training + +defaults: + - arch: trm + - _self_ + +hydra: + output_subdir: null + +# Data path +data_paths: ['data/math_gsm8k_qa'] +data_paths_test: [] + +evaluators: [] + +# Hyperparams - Training (very short test) +global_batch_size: 2 # Even smaller batch size + +epochs: 10 # Just 10 epochs for testing +eval_interval: 5 +checkpoint_every_eval: True + +lr: 1e-3 # Higher learning rate for faster testing +lr_min_ratio: 1.0 +lr_warmup_steps: 1 + +# Standard hyperparameter settings +beta1: 0.9 +beta2: 0.95 +weight_decay: 0.1 +puzzle_emb_weight_decay: 0.1 + +# Hyperparams - Puzzle embeddings training +puzzle_emb_lr: 1e-2 + +seed: 0 +min_eval_interval: 0 + +ema: False +freeze_weights: False +use_wandb: false + +# Project settings +project_name: "TRM-Math-Test" +entity: null +run_name: null \ No newline at end of file