From 6d3e229577380cfa5187cb2916417c9914ff682e Mon Sep 17 00:00:00 2001 From: Dzmitry Pihulski Date: Tue, 24 Feb 2026 15:00:04 +0100 Subject: [PATCH] leaderboard data added --- .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../Llama-3.2-1B-Instruct/5fewshots/run.yaml | 57 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../Llama-3.2-3B-Instruct/5fewshots/run.yaml | 57 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../Llama-3.3-70B-Instruct/5fewshots/run.yaml | 57 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../5fewshots/run.yaml | 56 ++++++ .../5fewshots/inference_script.py | 26 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../5fewshots/run.yaml | 59 ++++++ .../5fewshots/inference_script.py | 25 +++ .../PLLuM-12B-chat/5fewshots/requirements.txt | 172 ++++++++++++++++++ leaderboard/PLLuM-12B-chat/5fewshots/run.yaml | 57 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../PLLuM-12B-nc-chat/5fewshots/run.yaml | 57 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../Qwen2.5-1.5B-Instruct/5fewshots/run.yaml | 57 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../Qwen2.5-7B-Instruct/5fewshots/run.yaml | 56 ++++++ .../Qwen3-0.6B/5fewshots/inference_script.py | 26 +++ .../Qwen3-0.6B/5fewshots/requirements.txt | 172 ++++++++++++++++++ leaderboard/Qwen3-0.6B/5fewshots/run.yaml | 60 ++++++ .../5fewshots/inference_script.py | 26 +++ .../gpt-oss-120b/5fewshots/requirements.txt | 172 ++++++++++++++++++ leaderboard/gpt-oss-120b/5fewshots/run.yaml | 60 ++++++ .../gpt-oss-20b/5fewshots/inference_script.py | 26 +++ .../gpt-oss-20b/5fewshots/requirements.txt | 172 
++++++++++++++++++ leaderboard/gpt-oss-20b/5fewshots/run.yaml | 59 ++++++ .../5fewshots/inference_script.py | 25 +++ .../5fewshots/requirements.txt | 172 ++++++++++++++++++ .../5fewshots/run.yaml | 56 ++++++ 39 files changed, 3313 insertions(+) create mode 100644 leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py create mode 100644 leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt create mode 100644 leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml create mode 100644 leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py create mode 100644 leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt create mode 100644 leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml create mode 100644 leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py create mode 100644 leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt create mode 100644 leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml create mode 100644 leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py create mode 100644 leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt create mode 100644 leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml create mode 100644 leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py create mode 100644 leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt create mode 100644 leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml create mode 100644 leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py create mode 100644 leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt create mode 100644 leaderboard/PLLuM-12B-chat/5fewshots/run.yaml create mode 100644 leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py create mode 100644 leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt create mode 100644 leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml create mode 100644 leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py 
create mode 100644 leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt create mode 100644 leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml create mode 100644 leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py create mode 100644 leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt create mode 100644 leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml create mode 100644 leaderboard/Qwen3-0.6B/5fewshots/inference_script.py create mode 100644 leaderboard/Qwen3-0.6B/5fewshots/requirements.txt create mode 100644 leaderboard/Qwen3-0.6B/5fewshots/run.yaml create mode 100644 leaderboard/gpt-oss-120b/5fewshots/inference_script.py create mode 100644 leaderboard/gpt-oss-120b/5fewshots/requirements.txt create mode 100644 leaderboard/gpt-oss-120b/5fewshots/run.yaml create mode 100644 leaderboard/gpt-oss-20b/5fewshots/inference_script.py create mode 100644 leaderboard/gpt-oss-20b/5fewshots/requirements.txt create mode 100644 leaderboard/gpt-oss-20b/5fewshots/run.yaml create mode 100644 leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py create mode 100644 leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt create mode 100644 leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml diff --git a/leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py new file mode 100644 index 0000000..1049e44 --- /dev/null +++ b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + 
+evaluate(results) diff --git a/leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 
+nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml new file 
mode 100644 index 0000000..7aaea2e --- /dev/null +++ b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml @@ -0,0 +1,57 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: meta-llama/Llama-3.2-1B-Instruct + revision: main + commit_hash: 9213176726f574b556790deb65791e0c5aa438b6 + parameter_count: 1B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.2678 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.2-1B-Instruct/5fewshots/Llama-3.2-1B-Instruct_outputs.jsonl diff --git a/leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py new file mode 100644 index 0000000..a3400c8 --- /dev/null +++ b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct" + +results = inference_vllm( + model_name=MODEL_NAME, + 
output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 
+multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 
+uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml new file mode 100644 index 0000000..57616a3 --- /dev/null +++ b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml @@ -0,0 +1,57 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: meta-llama/Llama-3.2-3B-Instruct + revision: main + commit_hash: 0cb88a4f764b7a12671c53f0838cd831a0843b95 + parameter_count: 3B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.5415 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.2-3B-Instruct/5fewshots/Llama-3.2-3B-Instruct_outputs.jsonl diff --git a/leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py new file mode 100644 index 0000000..4efc97e --- /dev/null +++ b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py @@ -0,0 +1,25 
@@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 
+lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 
+torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml new file mode 100644 index 0000000..1e966f9 --- /dev/null +++ b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml @@ -0,0 +1,57 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: meta-llama/Llama-3.3-70B-Instruct + revision: main + commit_hash: 6f6073b423013f6a7d4d9f39144961bfbfbc386b + parameter_count: 70B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.8607 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.3-70B-Instruct/5fewshots/Llama-3.3-70B-Instruct_outputs.jsonl diff --git 
a/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py new file mode 100644 index 0000000..ef0562c --- /dev/null +++ b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "mistralai/Mistral-Nemo-Instruct-2407" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 
+httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 
+rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml new file mode 100644 index 0000000..7914a99 --- /dev/null +++ b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml @@ -0,0 +1,56 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: mistralai/Mistral-Nemo-Instruct-2407 + revision: main + commit_hash: 04d8a90549d23fc6bd7f642064003592df51e9b3 + parameter_count: 12B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + +# 
===================== +# Results +# ===================== +results: + execution_accuracy: 0.7599 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Mistral-Nemo-Instruct-2407/5fewshots/Mistral-Nemo-Instruct-2407_outputs.jsonl diff --git a/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py new file mode 100644 index 0000000..fa7cfb7 --- /dev/null +++ b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py @@ -0,0 +1,26 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=True, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=1024, + temperature=1.0, + sampling_kwargs={"top_p": 1.0}, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 
+dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 
+pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml new file mode 100644 index 0000000..a2d5154 --- /dev/null +++ b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml @@ -0,0 +1,59 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 + revision: main + commit_hash: 5a48de7e98cce824b3456eb9857ded839c3b6475 + parameter_count: 30B + dtype: bfloat16 + thinking: true + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 
24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: true + max_new_tokens: 1024 + temperature: 1.0 + sampling_kwargs: + top_p: 1.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.8519 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16_outputs.jsonl diff --git a/leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py b/leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py new file mode 100644 index 0000000..59bfa27 --- /dev/null +++ b/leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "CYFRAGOVPL/PLLuM-12B-chat" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt b/leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1
+cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 
+pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/PLLuM-12B-chat/5fewshots/run.yaml b/leaderboard/PLLuM-12B-chat/5fewshots/run.yaml new file mode 100644 index 0000000..6a2ab5f --- /dev/null +++ b/leaderboard/PLLuM-12B-chat/5fewshots/run.yaml @@ -0,0 +1,57 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: CYFRAGOVPL/PLLuM-12B-chat + revision: main + commit_hash: 74d80ff96552d9555f6f6f28321433da3895d2ec + parameter_count: 12B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 
79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.5224 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/PLLuM-12B-chat/5fewshots/PLLuM-12B-chat_outputs.jsonl diff --git a/leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py b/leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py new file mode 100644 index 0000000..372e696 --- /dev/null +++ b/leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "CYFRAGOVPL/PLLuM-12B-nc-chat" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt b/leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3
+aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 
+nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml b/leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml new file mode 100644 index 0000000..819021e --- /dev/null +++ b/leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml @@ -0,0 +1,57 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: CYFRAGOVPL/PLLuM-12B-nc-chat + revision: main + commit_hash: 7089352cfc2efbd2d3c64cc8cd5c97cd2c4fc013 + parameter_count: 12B + dtype: bfloat16 + 
thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.4044 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/PLLuM-12B-nc-chat/5fewshots/PLLuM-12B-nc-chat_outputs.jsonl diff --git a/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py new file mode 100644 index 0000000..c8af571 --- /dev/null +++ b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt new file
mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 
+nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml new file mode 100644 index 0000000..47c8f25 --- /dev/null +++ b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml @@ -0,0 +1,57 @@ +date: 2026-02-24 +# ===================== +# Model 
Information +# ===================== +model: + name: Qwen/Qwen2.5-1.5B-Instruct + revision: main + commit_hash: 989aa7980e4cf806f80c7fef2b1adb7bc71aa306 + parameter_count: 1.5B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.6401 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Qwen2.5-1.5B-Instruct/5fewshots/Qwen2.5-1.5B-Instruct_outputs.jsonl diff --git a/leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py new file mode 100644 index 0000000..e463467 --- /dev/null +++ b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + 
llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 
+nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml 
b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml new file mode 100644 index 0000000..0492a50 --- /dev/null +++ b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml @@ -0,0 +1,56 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: Qwen/Qwen2.5-7B-Instruct + revision: main + commit_hash: a09a35458c702b33eeacc393d103063234e8bc28 + parameter_count: 7B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.7940 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Qwen2.5-7B-Instruct/5fewshots/Qwen2.5-7B-Instruct_outputs.jsonl diff --git a/leaderboard/Qwen3-0.6B/5fewshots/inference_script.py b/leaderboard/Qwen3-0.6B/5fewshots/inference_script.py new file mode 100644 index 0000000..30b281f --- /dev/null +++ b/leaderboard/Qwen3-0.6B/5fewshots/inference_script.py @@ -0,0 +1,26 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "Qwen/Qwen3-0.6B" + +results = inference_vllm( + model_name=MODEL_NAME, + 
output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=True, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=1024, + temperature=0.6, + sampling_kwargs={"top_p": 0.95, "top_k": 20, "min_p": 0}, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/Qwen3-0.6B/5fewshots/requirements.txt b/leaderboard/Qwen3-0.6B/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/Qwen3-0.6B/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 
+msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 
+urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/Qwen3-0.6B/5fewshots/run.yaml b/leaderboard/Qwen3-0.6B/5fewshots/run.yaml new file mode 100644 index 0000000..ba714f8 --- /dev/null +++ b/leaderboard/Qwen3-0.6B/5fewshots/run.yaml @@ -0,0 +1,60 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: Qwen/Qwen3-0.6B + revision: main + commit_hash: c1899de289a04d12100db370d81485cdf75e47ca + parameter_count: 0.6B + dtype: bfloat16 + thinking: true + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: true + max_new_tokens: 1024 + temperature: 0.6 + sampling_kwargs: + top_p: 0.95 + top_k: 20 + min_p: 0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.4983 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Qwen3-0.6B/5fewshots/Qwen3-0.6B_outputs.jsonl diff --git a/leaderboard/gpt-oss-120b/5fewshots/inference_script.py b/leaderboard/gpt-oss-120b/5fewshots/inference_script.py new file mode 100644 index 0000000..e99d509 --- /dev/null +++ b/leaderboard/gpt-oss-120b/5fewshots/inference_script.py @@ -0,0 +1,26 @@ +import os + 
+from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "openai/gpt-oss-120b" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=True, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=1024, + temperature=1.0, + sampling_kwargs={"top_p": 0.95, "repetition_penalty": 1.0}, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/gpt-oss-120b/5fewshots/requirements.txt b/leaderboard/gpt-oss-120b/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/gpt-oss-120b/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 
+lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 
+torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/gpt-oss-120b/5fewshots/run.yaml b/leaderboard/gpt-oss-120b/5fewshots/run.yaml new file mode 100644 index 0000000..fe878e3 --- /dev/null +++ b/leaderboard/gpt-oss-120b/5fewshots/run.yaml @@ -0,0 +1,60 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: openai/gpt-oss-120b + revision: main + commit_hash: b5c939de8f754692c1647ca79fbf85e8c1e70f8a + parameter_count: 120B + dtype: bfloat16 + thinking: true + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: true + max_new_tokens: 1024 + temperature: 1.0 + sampling_kwargs: + top_p: 0.95 + repetition_penalty: 1.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.9049 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/gpt-oss-120b/5fewshots/gpt-oss-120b_outputs.jsonl diff --git 
a/leaderboard/gpt-oss-20b/5fewshots/inference_script.py b/leaderboard/gpt-oss-20b/5fewshots/inference_script.py new file mode 100644 index 0000000..3125855 --- /dev/null +++ b/leaderboard/gpt-oss-20b/5fewshots/inference_script.py @@ -0,0 +1,26 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "openai/gpt-oss-20b" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=True, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=1024, + temperature=1.0, + sampling_kwargs={"top_p": 0.95, "repetition_penalty": 1.0}, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/gpt-oss-20b/5fewshots/requirements.txt b/leaderboard/gpt-oss-20b/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/gpt-oss-20b/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 +fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 
+httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 
+sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/gpt-oss-20b/5fewshots/run.yaml b/leaderboard/gpt-oss-20b/5fewshots/run.yaml new file mode 100644 index 0000000..74b6638 --- /dev/null +++ b/leaderboard/gpt-oss-20b/5fewshots/run.yaml @@ -0,0 +1,59 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: openai/gpt-oss-20b + revision: main + commit_hash: 6cee5e81ee83917806bbde320786a8fb61efebee + parameter_count: 20B + dtype: bfloat16 + thinking: true + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm # vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: true + max_new_tokens: 1024 + temperature: 1.0 + sampling_kwargs: + top_p: 0.95 + repetition_penalty: 1.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + +# ===================== +# Results +# 
===================== +results: + execution_accuracy: 0.8871 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/gpt-oss-20b/5fewshots/gpt-oss-20b_outputs.jsonl diff --git a/leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py new file mode 100644 index 0000000..c2e8457 --- /dev/null +++ b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv + +from llmsql import evaluate, inference_vllm + +load_dotenv() + +MODEL_NAME = "CYFRAGOVPL/pllum-12b-nc-chat-250715" + +results = inference_vllm( + model_name=MODEL_NAME, + output_file=f"{MODEL_NAME}_outputs.jsonl", + batch_size=20000, + tensor_parallel_size=4, + do_sample=False, + hf_token=os.environ["HF_TOKEN"], + max_new_tokens=256, + temperature=0.0, + num_fewshots=5, + seed=42, + llm_kwargs={"dtype": "bfloat16"}, +) + +evaluate(results) diff --git a/leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt new file mode 100644 index 0000000..929f583 --- /dev/null +++ b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt @@ -0,0 +1,172 @@ +accelerate==1.12.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.83.0 +anyio==4.12.1 +apache-tvm-ffi==0.1.8.post2 +astor==0.8.1 +attrs==25.4.0 +blake3==1.0.8 +cachetools==7.0.1 +cbor2==5.8.0 +certifi==2026.1.4 +cffi==2.0.0 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpickle==3.1.2 +compressed-tensors==0.13.0 +cryptography==46.0.5 +cuda-bindings==13.1.1 +cuda-pathfinder==1.3.5 +cuda-python==13.1.1 +cupy-cuda12x==14.0.1 +datasets==4.5.0 +depyf==0.20.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +einops==0.8.2 +email-validator==2.3.0 +fastapi==0.132.0 +fastapi-cli==0.0.23 
+fastapi-cloud-cli==0.13.0 +fastar==0.8.0 +filelock==3.24.3 +flashinfer-python==0.6.1 +frozenlist==1.8.0 +fsspec==2025.10.0 +gguf==0.17.1 +grpcio==1.78.1 +grpcio-reflection==1.78.1 +h11==0.16.0 +hf-xet==1.3.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==0.36.2 +idna==3.11 +ijson==3.5.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.13.0 +jmespath==1.1.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==1.3.0 +llmsql==0.1.15 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +loguru==0.7.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +mcp==1.26.0 +mdurl==0.1.2 +mistral_common==1.9.1 +model-hosting-container-standards==0.1.13 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.20.0 +multidict==6.7.1 +multiprocess==0.70.18 +networkx==3.6.1 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cudnn-frontend==1.18.0 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-cutlass-dsl==4.4.0 +nvidia-cutlass-dsl-libs-base==4.4.0 +nvidia-ml-py==13.590.48 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +openai==2.23.0 +openai-harmony==0.0.8 +opencv-python-headless==4.13.0.92 +outlines_core==0.2.11 +packaging==26.0 +pandas==3.0.1 +partial-json-parser==0.2.1.1.post7 +pillow==12.1.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.24.1 +propcache==0.4.1 +protobuf==6.33.5 +psutil==7.2.2 +py-cpuinfo==9.0.0 +pyarrow==23.0.1 +pybase64==1.4.3 +pycountry==26.2.16 +pycparser==3.0 +pydantic==2.12.5 +pydantic-extra-types==2.11.0 +pydantic-settings==2.13.1 +pydantic_core==2.41.5 +Pygments==2.19.2 +PyJWT==2.11.0 +python-dateutil==2.9.0.post0 
+python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.22 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.54.0 +referencing==0.37.0 +regex==2026.2.19 +requests==2.32.5 +rich==14.3.3 +rich-toolkit==0.19.4 +rignore==0.7.6 +rpds-py==0.30.0 +safetensors==0.7.0 +sentencepiece==0.2.1 +sentry-sdk==2.53.0 +setproctitle==1.3.7 +setuptools==80.10.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sse-starlette==3.2.0 +starlette==0.52.1 +supervisor==4.3.0 +sympy==1.14.0 +tabulate==0.9.0 +tiktoken==0.12.0 +tokenizers==0.22.2 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +tqdm==4.67.3 +transformers==4.57.6 +triton==3.5.1 +typer==0.24.1 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +uvicorn==0.41.0 +uvloop==0.22.1 +vllm==0.15.1 +watchfiles==1.1.1 +websockets==16.0 +xgrammar==0.1.29 +xxhash==3.6.0 +yarl==1.22.0 diff --git a/leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml new file mode 100644 index 0000000..6b48d39 --- /dev/null +++ b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml @@ -0,0 +1,56 @@ +date: 2026-02-24 +# ===================== +# Model Information +# ===================== +model: + name: CYFRAGOVPL/pllum-12b-nc-chat-250715 + revision: main + commit_hash: 025e26b3fc5ac1fa8714298e671a6cf2418123d7 + parameter_count: 12B + dtype: bfloat16 + thinking: false + +type: open-source # open-source | proprietary + +# ===================== +# Package Information +# ===================== +llmsql: + version: 0.1.15 + commit_hash: 79175212c90b1fc094abd2c9666c23d903060014 + +# ===================== +# Benchmark Information +# ===================== +version: 2.0 + +# ===================== +# Environment Information +# ===================== +os_name: Ubuntu 24.04.3 LTS +python_version: 3.12.12 +pip_freeze: requirements.txt +device: 4xH200 + +# ===================== +# Function Inputs / Inference Backend +# ===================== +inference: + backend: vllm 
# vllm | transformers + arguments: + batch_size: 20000 + tensor_parallel_size: 4 + do_sample: false + max_new_tokens: 256 + temperature: 0.0 + num_fewshots: 5 + seed: 42 + llm_kwargs: + dtype: bfloat16 + +# ===================== +# Results +# ===================== +results: + execution_accuracy: 0.3727 + answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/pllum-12b-nc-chat-250715/5fewshots/pllum-12b-nc-chat-250715_outputs.jsonl