45 changes: 43 additions & 2 deletions README.md
@@ -189,9 +189,50 @@ python eval/main.py --task "purpcode/PHTest" --model purpcode/purpco
```

Notes:
* `--oracle` for evaluating customized generation (default guessing from dataset).
* `--oracle` for evaluating customized generation (default: guessing from dataset).
* `--backend` for choosing inference backend (default: `vllm`; options: `hf`, `openai`, `bedrock`).
* `--llm_judge` for specifying the LLM judge model (default: `meta-llama/Llama-3.3-70B-Instruct` via `vllm`; options: `openai`, `bedrock`).

<details><summary><b>CyberSecEval SCG Evaluation Setup</b> <i>:: click to expand ::</i></summary>
<details><summary><b>OpenAI Backend Setup</b> <i>:: click to expand ::</i></summary>
<div>

To use the OpenAI backend to run official OpenAI models:

```bash
export OPENAI_API_KEY="your-openai-api-key"
export OPENAI_API_BASE="https://api.openai.com/v1"

# Running official OpenAI models
python eval/main.py --task "purpcode/CyberSecEval-FRR" \
--model "openai/gpt-4o" \
--backend openai

# Using OpenAI models as LLM judge
python eval/main.py --task "purpcode/CyberSecEval-FRR" \
--model purpcode/purpcode-14b-rl \
--llm_judge "openai/gpt-4o"
```

To use the OpenAI backend with an OpenAI-compatible server (e.g., sglang) to run other models:

```bash
# --- TMUX SESSION "sgl" ---
tmux at -t sgl || tmux new -s sgl
conda activate sgl
python3 -m sglang_router.launch_server --model Qwen/Qwen2.5-14B-Instruct-1M --dp-size 8 --port 8000 --host 0.0.0.0 & tmux detach
# --------------------------

# Running models through OpenAI-compatible servers (e.g., sglang)
# Note: Add "openai/" prefix when using OpenAI backend for non-OpenAI models
python eval/main.py --task "purpcode/CyberSecEval-FRR" \
--model "openai/Qwen/Qwen2.5-14B-Instruct-1M" \
--backend openai
```
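
Before pointing the harness at such a server, a quick connectivity check can save a long eval run. The snippet below is a minimal sketch and not part of this repository: it assumes the sglang server launched above is listening on port 8000 and that the `openai` Python package is installed.

```python
# Minimal sanity check of the local OpenAI-compatible endpoint (illustration only).
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="none")  # key is ignored by local servers
resp = client.chat.completions.create(
    model="Qwen/Qwen2.5-14B-Instruct-1M",  # the id the server was launched with
    messages=[{"role": "user", "content": "Reply with OK."}],
    max_tokens=8,
)
print(resp.choices[0].message.content)
```

Note that the raw API takes the model id exactly as served; the `openai/` prefix is a litellm routing convention used by `eval/main.py`, not by the server itself.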

</div>
</details>

<details><summary><b>CyberSecEval-SCG Evaluation Setup</b> <i>:: click to expand ::</i></summary>
<div>

```bash
3 changes: 1 addition & 2 deletions eval/evaluate.py
@@ -9,8 +9,7 @@

from eval.generate import preprocess_generation

# TODO(@zhewang2001): allow users to play LLM judge based on vLLM, instead of relying on bedrock
DEFAULT_LLM_JUDGE = "bedrock/us.meta.llama3-3-70b-instruct-v1:0"
DEFAULT_LLM_JUDGE = "meta-llama/Llama-3.3-70B-Instruct"


def to_evalplus_format(generation_path: str) -> str:
20 changes: 13 additions & 7 deletions eval/generate.py
@@ -19,6 +19,7 @@
)

from utils import SYSTEM_PROMPT, split_batch
from utils.litellm import configure_openai_api, is_o_series_model

os.environ["TOKENIZERS_PARALLELISM"] = "false"

@@ -178,6 +179,11 @@ def generate_openai(
temperature: float = 0.0,
max_new_tokens: int = 8192,
):
assert model.startswith("openai/"), (
"If running openai backend, model name must start with 'openai/'. "
"For example, 'deepseek-ai/DeepSeek-R1' should be 'openai/deepseek-ai/DeepSeek-R1'"
)

outputs = []
with ThreadPoolExecutor(max_workers=len(messages_batch)) as executor:
futures = []
@@ -187,15 +193,15 @@
"num_retries": 16,
"retry_strategy": "exponential_backoff_retry",
"max_tokens": max_new_tokens,
"model": f"openai/{model}",
"api_key": os.getenv("OPENAI_API_KEY", "none"),
"api_base": os.getenv("OPENAI_API_BASE", "http://0.0.0.0:8000/v1"),
"model": model,
"temperature": temperature,
"stop": ["<end_of_turn>"],
**configure_openai_api(model),
}

if model != "o4-mini":
# O-series models don't support customized temperature. Only default temperature=1 is supported.
kwargs["temperature"] = temperature
kwargs["stop"] = ["<end_of_turn>"]
if is_o_series_model(model):
del kwargs["temperature"]
del kwargs["stop"]

future = executor.submit(completion_with_retries, **kwargs)
futures.append(future)
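
The request kwargs above lean on two helpers added to `utils/litellm.py` (diffed next). The sketch below illustrates the model-naming convention and is not code from this PR; it assumes it is run from the repository root so the `utils.litellm` import resolves, and the example model ids are placeholders.

```python
# Illustration only: how "openai/..." model names are resolved by the new helpers.
from utils.litellm import configure_openai_api, is_o_series_model

for model in [
    "openai/gpt-4o",                        # one "/": official OpenAI model, uses OPENAI_API_KEY / OPENAI_API_BASE
    "openai/o3-mini",                       # O-series: custom temperature/stop are dropped
    "openai/Qwen/Qwen2.5-14B-Instruct-1M",  # two "/": routed to the local OpenAI-compatible server
]:
    kwargs = {"model": model, "temperature": 0.0, "stop": ["<end_of_turn>"]}
    kwargs.update(configure_openai_api(model))  # fills in api_key and api_base
    if is_o_series_model(model):
        # O-series models only accept their default sampling settings.
        del kwargs["temperature"]
        del kwargs["stop"]
    print(model, "->", kwargs["api_base"], kwargs.get("temperature"))
```
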
92 changes: 73 additions & 19 deletions utils/litellm.py
@@ -2,19 +2,21 @@
#
# SPDX-License-Identifier: Apache-2.0

import os
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
from types import SimpleNamespace
from typing import Callable, Dict, List

import torch
from dotenv import load_dotenv
from litellm import completion_with_retries
from termcolor import cprint
from tqdm import tqdm
from vllm import LLM

from utils import split_batch

load_dotenv()


def log_costs(completions):
costs = [r._hidden_params["response_cost"] for r in completions]
@@ -45,8 +47,29 @@ def mini_batch_completion(messages, parallel: int = 32, **kwargs):
return outputs


def configure_openai_api(model: str) -> dict:
return {
"api_key": (
os.getenv("OPENAI_API_KEY", "none") if model.count("/") == 1 else "none"
),
"api_base": (
os.getenv("OPENAI_API_BASE", "http://0.0.0.0:8000/v1")
if model.count("/") == 1
else "http://0.0.0.0:8000/v1"
),
}


def is_o_series_model(model: str) -> bool:
return (
model.startswith("openai/o1-")
or model.startswith("openai/o3-")
or model.startswith("openai/o4-")
)


def run_batched_inference(
batched_rows: List, # each row includes at least "messages"
batched_rows: List,
row_transform: Callable[[Dict], Dict] = lambda x: x,
max_new_tokens: int = None,
temperature: float = None,
@@ -57,24 +80,55 @@ def run_batched_inference(
assert batched_rows and "messages" in batched_rows[0]
batched_rows = [row_transform(row) for row in batched_rows]
print("Running batched completion for LLM judge")
parameters = {
"model": model,
"parallel": parallel,
"messages": batched_rows,
"max_tokens": max_new_tokens,
"temperature": temperature,
**kwargs,
}
if "thinking" in kwargs:
assert parameters["max_tokens"] is None
assert parameters["temperature"] is None

if model.startswith("openai/") or model.startswith("bedrock/"):
if model.startswith("openai/"):
kwargs.update(configure_openai_api(model))
elif model.startswith("bedrock/"):
load_dotenv()

parameters = {
"model": model,
"parallel": parallel,
"messages": batched_rows,
"max_tokens": max_new_tokens,
"temperature": temperature,
**kwargs,
}
if "thinking" in kwargs:
assert parameters["max_tokens"] is None
assert parameters["temperature"] is None
else:
if is_o_series_model(model):
if "temperature" in parameters:
del parameters["temperature"]
elif parameters["temperature"] is None:
parameters["temperature"] = 0.0

outputs = mini_batch_completion(**parameters)
log_costs(outputs)
outputs = [item.choices[0].message for item in outputs]
else:
if parameters["temperature"] is None:
parameters["temperature"] = 0.0
model = LLM(
model=model,
generation_config="auto",
trust_remote_code=True,
tensor_parallel_size=(
torch.cuda.device_count() if torch.cuda.is_available() else 1
),
)
sampling_params = model.get_default_sampling_params()
sampling_params.temperature = temperature if temperature is not None else 0.0
sampling_params.max_tokens = (
max_new_tokens if max_new_tokens is not None else 2048
)
sampling_params.skip_special_tokens = True

outputs = mini_batch_completion(**parameters)
log_costs(outputs)
outputs = [item.choices[0].message for item in outputs]
prompts = [row["messages"] for row in batched_rows]
outputs = [
SimpleNamespace(content=o.outputs[0].text)
for o in model.chat(prompts, sampling_params, use_tqdm=True)
]

output_rows = []
for row, ext in zip(batched_rows, outputs):
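
Taken together, `run_batched_inference` now dispatches on the model-name prefix: `openai/`- and `bedrock/`-prefixed judges go through litellm, and everything else is loaded locally with vLLM. The sketch below is hypothetical usage, not code from this PR; the parts of the signature hidden by the diff (in particular that `model` and `parallel` are keyword parameters) and the shape of the returned rows are assumptions.

```python
# Hypothetical usage sketch of the LLM-judge entry point.
from utils.litellm import run_batched_inference

rows = [{"messages": [{"role": "user", "content": "Does this answer refuse the request? ..."}]}]

# Default judge (no prefix): loaded locally through vLLM.
judged = run_batched_inference(rows, model="meta-llama/Llama-3.3-70B-Instruct")

# Prefixed judges are routed through litellm instead.
judged = run_batched_inference(rows, model="openai/gpt-4o", parallel=32)
judged = run_batched_inference(rows, model="bedrock/us.meta.llama3-3-70b-instruct-v1:0")
```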