Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os

from dotenv import load_dotenv

from llmsql import evaluate, inference_vllm

# Load HF_TOKEN (and any other secrets) from a local .env file.
load_dotenv()

MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"

# Use only the model's basename for the output file: keeping the full
# HF id would embed "meta-llama/" in the path and try to write into a
# subdirectory that may not exist. This also matches the uploaded
# artifact name recorded in run.yaml (Llama-3.2-1B-Instruct_outputs.jsonl).
OUTPUT_FILE = f"{MODEL_NAME.split('/')[-1]}_outputs.jsonl"

# Greedy decoding (do_sample=False, temperature=0.0) with 5 few-shot
# examples; seed fixed for reproducibility across runs.
results = inference_vllm(
    model_name=MODEL_NAME,
    output_file=OUTPUT_FILE,
    batch_size=20000,
    tensor_parallel_size=4,
    do_sample=False,
    hf_token=os.environ["HF_TOKEN"],  # fail fast (KeyError) if the token is missing
    max_new_tokens=256,
    temperature=0.0,
    num_fewshots=5,
    seed=42,
    llm_kwargs={"dtype": "bfloat16"},
)

# Score the generated SQL answers and report execution accuracy.
evaluate(results)
172 changes: 172 additions & 0 deletions leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
accelerate==1.12.0
aiohappyeyeballs==2.6.1
aiohttp==3.13.3
aiosignal==1.4.0
annotated-doc==0.0.4
annotated-types==0.7.0
anthropic==0.83.0
anyio==4.12.1
apache-tvm-ffi==0.1.8.post2
astor==0.8.1
attrs==25.4.0
blake3==1.0.8
cachetools==7.0.1
cbor2==5.8.0
certifi==2026.1.4
cffi==2.0.0
charset-normalizer==3.4.4
click==8.3.1
cloudpickle==3.1.2
compressed-tensors==0.13.0
cryptography==46.0.5
cuda-bindings==13.1.1
cuda-pathfinder==1.3.5
cuda-python==13.1.1
cupy-cuda12x==14.0.1
datasets==4.5.0
depyf==0.20.0
dill==0.4.0
diskcache==5.6.3
distro==1.9.0
dnspython==2.8.0
docstring_parser==0.17.0
einops==0.8.2
email-validator==2.3.0
fastapi==0.132.0
fastapi-cli==0.0.23
fastapi-cloud-cli==0.13.0
fastar==0.8.0
filelock==3.24.3
flashinfer-python==0.6.1
frozenlist==1.8.0
fsspec==2025.10.0
gguf==0.17.1
grpcio==1.78.1
grpcio-reflection==1.78.1
h11==0.16.0
hf-xet==1.3.0
httpcore==1.0.9
httptools==0.7.1
httpx==0.28.1
httpx-sse==0.4.3
huggingface_hub==0.36.2
idna==3.11
ijson==3.5.0
interegular==0.3.3
Jinja2==3.1.6
jiter==0.13.0
jmespath==1.1.0
jsonschema==4.26.0
jsonschema-specifications==2025.9.1
lark==1.2.2
llguidance==1.3.0
llmsql==0.1.15
llvmlite==0.44.0
lm-format-enforcer==0.11.3
loguru==0.7.3
markdown-it-py==4.0.0
MarkupSafe==3.0.3
mcp==1.26.0
mdurl==0.1.2
mistral_common==1.9.1
model-hosting-container-standards==0.1.13
mpmath==1.3.0
msgpack==1.1.2
msgspec==0.20.0
multidict==6.7.1
multiprocess==0.70.18
networkx==3.6.1
ninja==1.13.0
numba==0.61.2
numpy==2.2.6
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cudnn-frontend==1.18.0
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-cutlass-dsl==4.4.0
nvidia-cutlass-dsl-libs-base==4.4.0
nvidia-ml-py==13.590.48
nvidia-nccl-cu12==2.27.5
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.8.90
openai==2.23.0
openai-harmony==0.0.8
opencv-python-headless==4.13.0.92
outlines_core==0.2.11
packaging==26.0
pandas==3.0.1
partial-json-parser==0.2.1.1.post7
pillow==12.1.1
prometheus-fastapi-instrumentator==7.1.0
prometheus_client==0.24.1
propcache==0.4.1
protobuf==6.33.5
psutil==7.2.2
py-cpuinfo==9.0.0
pyarrow==23.0.1
pybase64==1.4.3
pycountry==26.2.16
pycparser==3.0
pydantic==2.12.5
pydantic-extra-types==2.11.0
pydantic-settings==2.13.1
pydantic_core==2.41.5
Pygments==2.19.2
PyJWT==2.11.0
python-dateutil==2.9.0.post0
python-dotenv==1.2.1
python-json-logger==4.0.0
python-multipart==0.0.22
PyYAML==6.0.3
pyzmq==27.1.0
ray==2.54.0
referencing==0.37.0
regex==2026.2.19
requests==2.32.5
rich==14.3.3
rich-toolkit==0.19.4
rignore==0.7.6
rpds-py==0.30.0
safetensors==0.7.0
sentencepiece==0.2.1
sentry-sdk==2.53.0
setproctitle==1.3.7
setuptools==80.10.2
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
sse-starlette==3.2.0
starlette==0.52.1
supervisor==4.3.0
sympy==1.14.0
tabulate==0.9.0
tiktoken==0.12.0
tokenizers==0.22.2
torch==2.9.1
torchaudio==2.9.1
torchvision==0.24.1
tqdm==4.67.3
transformers==4.57.6
triton==3.5.1
typer==0.24.1
typer-slim==0.24.0
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.6.3
uvicorn==0.41.0
uvloop==0.22.1
vllm==0.15.1
watchfiles==1.1.1
websockets==16.0
xgrammar==0.1.29
xxhash==3.6.0
yarl==1.22.0
57 changes: 57 additions & 0 deletions leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
date: 2026-02-24
# =====================
# Model Information
# =====================
model:
name: meta-llama/Llama-3.2-1B-Instruct
revision: main
commit_hash: 9213176726f574b556790deb65791e0c5aa438b6
parameter_count: 1B
dtype: bfloat16
thinking: false

type: open-source # open-source | proprietary

# =====================
# Package Information
# =====================
llmsql:
version: 0.1.15
commit_hash: 79175212c90b1fc094abd2c9666c23d903060014

# =====================
# Benchmark Information
# =====================
version: 2.0

# =====================
# Environment Information
# =====================
os_name: Ubuntu 24.04.3 LTS
python_version: 3.12.12
pip_freeze: requirements.txt
device: 4xH200

# =====================
# Function Inputs / Inference Backend
# =====================
inference:
backend: vllm # vllm | transformers
arguments:
batch_size: 20000
tensor_parallel_size: 4
do_sample: false
max_new_tokens: 256
temperature: 0.0
num_fewshots: 5
seed: 42
llm_kwargs:
dtype: bfloat16


# =====================
# Results
# =====================
results:
execution_accuracy: 0.2678
answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.2-1B-Instruct/5fewshots/Llama-3.2-1B-Instruct_outputs.jsonl
25 changes: 25 additions & 0 deletions leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os

from dotenv import load_dotenv

from llmsql import evaluate, inference_vllm

# Load HF_TOKEN (and any other secrets) from a local .env file.
load_dotenv()

MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

# Use only the model's basename for the output file: keeping the full
# HF id would embed "meta-llama/" in the path and try to write into a
# subdirectory that may not exist; the leaderboard artifact is named
# without the org prefix.
OUTPUT_FILE = f"{MODEL_NAME.split('/')[-1]}_outputs.jsonl"

# Greedy decoding (do_sample=False, temperature=0.0) with 5 few-shot
# examples; seed fixed for reproducibility across runs.
results = inference_vllm(
    model_name=MODEL_NAME,
    output_file=OUTPUT_FILE,
    batch_size=20000,
    tensor_parallel_size=4,
    do_sample=False,
    hf_token=os.environ["HF_TOKEN"],  # fail fast (KeyError) if the token is missing
    max_new_tokens=256,
    temperature=0.0,
    num_fewshots=5,
    seed=42,
    llm_kwargs={"dtype": "bfloat16"},
)

# Score the generated SQL answers and report execution accuracy.
evaluate(results)
Loading