From 6d3e229577380cfa5187cb2916417c9914ff682e Mon Sep 17 00:00:00 2001 From: Dzmitry Pihulski Date: Tue, 24 Feb 2026 15:00:04 +0100 Subject: [PATCH] leaderboard data added --- .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../Llama-3.2-1B-Instruct/5fewshots/run.yaml | 57 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../Llama-3.2-3B-Instruct/5fewshots/run.yaml | 57 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../Llama-3.3-70B-Instruct/5fewshots/run.yaml | 57 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../5fewshots/run.yaml | 56 ++++++ .../5fewshots/inference_script.py | 26 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../5fewshots/run.yaml | 59 ++++++ .../5fewshots/inference_script.py | 25 +++ .../PLLuM-12B-chat/5fewshots/requirements.txt | 172 ++++++++++++++++++ leaderboard/PLLuM-12B-chat/5fewshots/run.yaml | 57 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../PLLuM-12B-nc-chat/5fewshots/run.yaml | 57 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../Qwen2.5-1.5B-Instruct/5fewshots/run.yaml | 57 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../Qwen2.5-7B-Instruct/5fewshots/run.yaml | 56 ++++++ .../Qwen3-0.6B/5fewshots/inference_script.py | 26 +++ .../Qwen3-0.6B/5fewshots/requirements.txt | 172 ++++++++++++++++++ leaderboard/Qwen3-0.6B/5fewshots/run.yaml | 60 ++++++ .../5fewshots/inference_script.py | 26 +++ .../gpt-oss-120b/5fewshots/requirements.txt | 172 ++++++++++++++++++ leaderboard/gpt-oss-120b/5fewshots/run.yaml | 60 ++++++ .../gpt-oss-20b/5fewshots/inference_script.py | 26 +++ .../gpt-oss-20b/5fewshots/requirements.txt | 172 
++++++++++++++++++ leaderboard/gpt-oss-20b/5fewshots/run.yaml | 59 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../5fewshots/run.yaml | 56 ++++++ 39 files changed, 3313 insertions(+) create mode 100644 leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py create mode 100644 leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt create mode 100644 leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml create mode 100644 leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py create mode 100644 leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt create mode 100644 leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml create mode 100644 leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py create mode 100644 leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt create mode 100644 leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml create mode 100644 leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py create mode 100644 leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt create mode 100644 leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml create mode 100644 leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py create mode 100644 leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt create mode 100644 leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml create mode 100644 leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py create mode 100644 leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt create mode 100644 leaderboard/PLLuM-12B-chat/5fewshots/run.yaml create mode 100644 leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py create mode 100644 leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt create mode 100644 leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml create mode 100644 leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py 
create mode 100644 leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt create mode 100644 leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml create mode 100644 leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py create mode 100644 leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt create mode 100644 leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml create mode 100644 leaderboard/Qwen3-0.6B/5fewshots/inference_script.py create mode 100644 leaderboard/Qwen3-0.6B/5fewshots/requirements.txt create mode 100644 leaderboard/Qwen3-0.6B/5fewshots/run.yaml create mode 100644 leaderboard/gpt-oss-120b/5fewshots/inference_script.py create mode 100644 leaderboard/gpt-oss-120b/5fewshots/requirements.txt create mode 100644 leaderboard/gpt-oss-120b/5fewshots/run.yaml create mode 100644 leaderboard/gpt-oss-20b/5fewshots/inference_script.py create mode 100644 leaderboard/gpt-oss-20b/5fewshots/requirements.txt create mode 100644 leaderboard/gpt-oss-20b/5fewshots/run.yaml create mode 100644 leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py create mode 100644 leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt create mode 100644 leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml diff --git a/leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py new file mode 100644 index 0000000..1049e44 --- /dev/null +++ b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + 
+evaluate(results) diff --git a/leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 
+nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml new file 
mode 100644 index 0000000..7aaea2e --- /dev/null +++ b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml @@ -0,0 +1,57 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: meta-llama/Llama-3.2-1B-Instruct + revision: main + commit_hash: 9213176726f574b556790deb65791e0c5aa438b6 + parameter_count: 1B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.2678 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.2-1B-Instruct/5fewshots/Llama-3.2-1B-Instruct_outputs.jsonl diff --git a/leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py new file mode 100644 index 0000000..a3400c8 --- /dev/null +++ b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct" + +results = inference_vllm( + model_name=MODEL_NAME, + 
output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 
+multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 
+uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml new file mode 100644 index 0000000..57616a3 --- /dev/null +++ b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml @@ -0,0 +1,57 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: meta-llama/Llama-3.2-3B-Instruct + revision: main + commit_hash: 0cb88a4f764b7a12671c53f0838cd831a0843b95 + parameter_count: 3B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.5415 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.2-3B-Instruct/5fewshots/Llama-3.2-3B-Instruct_outputs.jsonl diff --git a/leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py new file mode 100644 index 0000000..4efc97e --- /dev/null +++ b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py @@ -0,0 +1,25 
@@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 
+lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 
+torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml new file mode 100644 index 0000000..1e966f9 --- /dev/null +++ b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml @@ -0,0 +1,57 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: meta-llama/Llama-3.3-70B-Instruct + revision: main + commit_hash: 6f6073b423013f6a7d4d9f39144961bfbfbc386b + parameter_count: 70B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.8607 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.3-70B-Instruct/5fewshots/Llama-3.3-70B-Instruct_outputs.jsonl diff --git 
a/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py new file mode 100644 index 0000000..ef0562c --- /dev/null +++ b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "mistralai/Mistral-Nemo-Instruct-2407" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 
+httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 
+rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml new file mode 100644 index 0000000..7914a99 --- /dev/null +++ b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml @@ -0,0 +1,56 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: mistralai/Mistral-Nemo-Instruct-2407 + revision: main + commit_hash: 04d8a90549d23fc6bd7f642064003592df51e9b3 + parameter_count: 12B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + +# 
===================== +# Results +# ===================== +results: + execution_accuracy: 0.7599 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Mistral-Nemo-Instruct-2407/5fewshots/Mistral-Nemo-Instruct-2407_outputs.jsonl diff --git a/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py new file mode 100644 index 0000000..fa7cfb7 --- /dev/null +++ b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py @@ -0,0 +1,26 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=True, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=1024, + temperature=1.0, + sampling_kwargs={"top_p": 1.0}, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 
+dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 
+pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml new file mode 100644 index 0000000..a2d5154 --- /dev/null +++ b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml @@ -0,0 +1,59 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 + revision: main + commit_hash: 5a48de7e98cce824b3456eb9857ded839c3b6475 + parameter_count: 30B + dtype: bfloat16 + thinking: true + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 
24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: true + max_new_tokens: 1024 + temperature: 1.0 + sampling_kwargs: + top_p: 1.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.8519 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16_outputs.jsonl diff --git a/leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py b/leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py new file mode 100644 index 0000000..59bfa27 --- /dev/null +++ b/leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "CYFRAGOVPL/PLLuM-12B-chat" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt b/leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1
+cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 
+pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/PLLuM-12B-chat/5fewshots/run.yaml b/leaderboard/PLLuM-12B-chat/5fewshots/run.yaml new file mode 100644 index 0000000..6a2ab5f --- /dev/null +++ b/leaderboard/PLLuM-12B-chat/5fewshots/run.yaml @@ -0,0 +1,57 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: CYFRAGOVPL/PLLuM-12B-chat + revision: main + commit_hash: 74d80ff96552d9555f6f6f28321433da3895d2ec + parameter_count: 12B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 
79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.5224 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/PLLuM-12B-chat/5fewshots/PLLuM-12B-chat_outputs.jsonl diff --git a/leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py b/leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py new file mode 100644 index 0000000..372e696 --- /dev/null +++ b/leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "CYFRAGOVPL/PLLuM-12B-nc-chat" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt b/leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3
+aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 
+nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml b/leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml new file mode 100644 index 0000000..819021e --- /dev/null +++ b/leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml @@ -0,0 +1,57 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: CYFRAGOVPL/PLLuM-12B-nc-chat + revision: main + commit_hash: 7089352cfc2efbd2d3c64cc8cd5c97cd2c4fc013 + parameter_count: 12B + dtype: bfloat16 + 
thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.4044 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/PLLuM-12B-nc-chat/5fewshots/PLLuM-12B-nc-chat_outputs.jsonl diff --git a/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py new file mode 100644 index 0000000..c8af571 --- /dev/null +++ b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt new file
mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 
+nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml new file mode 100644 index 0000000..47c8f25 --- /dev/null +++ b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml @@ -0,0 +1,57 @@ +date: 2026-02-24 +# ===================== +# Model 
Information +# ===================== +model: + name: Qwen/Qwen2.5-1.5B-Instruct + revision: main + commit_hash: 989aa7980e4cf806f80c7fef2b1adb7bc71aa306 + parameter_count: 1.5B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.6401 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Qwen2.5-1.5B-Instruct/5fewshots/Qwen2.5-1.5B-Instruct_outputs.jsonl diff --git a/leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py new file mode 100644 index 0000000..e463467 --- /dev/null +++ b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + 
llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 
+nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml 
b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml new file mode 100644 index 0000000..0492a50 --- /dev/null +++ b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml @@ -0,0 +1,56 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: Qwen/Qwen2.5-7B-Instruct + revision: main + commit_hash: a09a35458c702b33eeacc393d103063234e8bc28 + parameter_count: 7B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.7940 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Qwen2.5-7B-Instruct/5fewshots/Qwen2.5-7B-Instruct_outputs.jsonl diff --git a/leaderboard/Qwen3-0.6B/5fewshots/inference_script.py b/leaderboard/Qwen3-0.6B/5fewshots/inference_script.py new file mode 100644 index 0000000..30b281f --- /dev/null +++ b/leaderboard/Qwen3-0.6B/5fewshots/inference_script.py @@ -0,0 +1,26 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "Qwen/Qwen3-0.6B" + +results = inference_vllm( + model_name=MODEL_NAME, + 
output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=True, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=1024, + temperature=0.6, + sampling_kwargs={"top_p": 0.95, "top_k": 20, "min_p": 0}, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/Qwen3-0.6B/5fewshots/requirements.txt b/leaderboard/Qwen3-0.6B/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Qwen3-0.6B/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 
+msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 
+urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Qwen3-0.6B/5fewshots/run.yaml b/leaderboard/Qwen3-0.6B/5fewshots/run.yaml new file mode 100644 index 0000000..ba714f8 --- /dev/null +++ b/leaderboard/Qwen3-0.6B/5fewshots/run.yaml @@ -0,0 +1,60 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: Qwen/Qwen3-0.6B + revision: main + commit_hash: c1899de289a04d12100db370d81485cdf75e47ca + parameter_count: 0.6B + dtype: bfloat16 + thinking: true + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: true + max_new_tokens: 1024 + temperature: 0.6 + sampling_kwargs: + top_p: 0.95 + top_k: 20 + min_p: 0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.4983 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Qwen3-0.6B/5fewshots/Qwen3-0.6B_outputs.jsonl diff --git a/leaderboard/gpt-oss-120b/5fewshots/inference_script.py b/leaderboard/gpt-oss-120b/5fewshots/inference_script.py new file mode 100644 index 0000000..e99d509 --- /dev/null +++ b/leaderboard/gpt-oss-120b/5fewshots/inference_script.py @@ -0,0 +1,26 @@ +import os + 
+from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "openai/gpt-oss-120b" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=True, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=1024, + temperature=1.0, + sampling_kwargs={"top_p": 0.95, "repetition_penalty": 1.0}, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/gpt-oss-120b/5fewshots/requirements.txt b/leaderboard/gpt-oss-120b/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/gpt-oss-120b/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 
+lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 
+torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/gpt-oss-120b/5fewshots/run.yaml b/leaderboard/gpt-oss-120b/5fewshots/run.yaml new file mode 100644 index 0000000..fe878e3 --- /dev/null +++ b/leaderboard/gpt-oss-120b/5fewshots/run.yaml @@ -0,0 +1,60 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: openai/gpt-oss-120b + revision: main + commit_hash: b5c939de8f754692c1647ca79fbf85e8c1e70f8a + parameter_count: 120B + dtype: bfloat16 + thinking: true + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: true + max_new_tokens: 1024 + temperature: 1.0 + sampling_kwargs: + top_p: 0.95 + repetition_penalty: 1.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.9049 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/gpt-oss-120b/5fewshots/gpt-oss-120b_outputs.jsonl diff --git 
a/leaderboard/gpt-oss-20b/5fewshots/inference_script.py b/leaderboard/gpt-oss-20b/5fewshots/inference_script.py new file mode 100644 index 0000000..3125855 --- /dev/null +++ b/leaderboard/gpt-oss-20b/5fewshots/inference_script.py @@ -0,0 +1,26 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "openai/gpt-oss-20b" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=True, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=1024, + temperature=1.0, + sampling_kwargs={"top_p": 0.95, "repetition_penalty": 1.0}, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/gpt-oss-20b/5fewshots/requirements.txt b/leaderboard/gpt-oss-20b/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/gpt-oss-20b/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 
+httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 
+sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/gpt-oss-20b/5fewshots/run.yaml b/leaderboard/gpt-oss-20b/5fewshots/run.yaml new file mode 100644 index 0000000..74b6638 --- /dev/null +++ b/leaderboard/gpt-oss-20b/5fewshots/run.yaml @@ -0,0 +1,59 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: openai/gpt-oss-20b + revision: main + commit_hash: 6cee5e81ee83917806bbde320786a8fb61efebee + parameter_count: 20B + dtype: bfloat16 + thinking: true + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: true + max_new_tokens: 1024 + temperature: 1.0 + sampling_kwargs: + top_p: 0.95 + repetition_penalty: 1.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + +# ===================== +# Results +# 
===================== +results: + execution_accuracy: 0.8871 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/gpt-oss-20b/5fewshots/gpt-oss-20b_outputs.jsonl diff --git a/leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py new file mode 100644 index 0000000..c2e8457 --- /dev/null +++ b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "CYFRAGOVPL/pllum-12b-nc-chat-250715" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 
+fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 
+python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml new file mode 100644 index 0000000..6b48d39 --- /dev/null +++ b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml @@ -0,0 +1,56 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: CYFRAGOVPL/pllum-12b-nc-chat-250715 + revision: main + commit_hash: 025e26b3fc5ac1fa8714298e671a6cf2418123d7 + parameter_count: 12B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm 
# vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.3727 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/pllum-12b-nc-chat-250715/5fewshots/pllum-12b-nc-chat-250715_outputs.jsonl