diff --git a/README.md b/README.md
index 328b948..8afaecf 100644
--- a/README.md
+++ b/README.md
@@ -189,9 +189,50 @@ python eval/main.py --task "purpcode/PHTest" --model purpcode/purpco
 ```
 
 Notes:
-* `--oracle` for evaluating customized generation (default guessing from dataset).
+* `--oracle` for evaluating customized generation (default: guessing from dataset).
+* `--backend` for choosing inference backend (default: `vllm`; options: `hf`, `openai`, `bedrock`).
+* `--llm_judge` for specifying the LLM judge model (default: `meta-llama/Llama-3.3-70B-Instruct` via `vllm`; options: `openai`, `bedrock`).
 
-<details><summary>CyberSecEval SCG Evaluation Setup :: click to expand ::</summary>
+<details><summary>OpenAI Backend Setup :: click to expand ::</summary>
+<div>
+
+To use the OpenAI backend for running OpenAI models:
+
+```bash
+export OPENAI_API_KEY="your-openai-api-key"
+export OPENAI_API_BASE="https://api.openai.com/v1"
+
+# Running official OpenAI models
+python eval/main.py --task "purpcode/CyberSecEval-FRR" \
+    --model "openai/gpt-4o" \
+    --backend openai
+
+# Using OpenAI models as LLM judge
+python eval/main.py --task "purpcode/CyberSecEval-FRR" \
+    --model purpcode/purpcode-14b-rl \
+    --llm_judge "openai/gpt-4o"
+```
+
+To use the OpenAI backend with OpenAI-compatible servers (e.g., sglang) for running models:
+
+```bash
+# --- TMUX SESSION "sgl" ---
+tmux at -t sgl || tmux new -s sgl
+conda activate sgl
+python3 -m sglang_router.launch_server --model Qwen/Qwen2.5-14B-Instruct-1M --dp-size 8 --port 8000 --host 0.0.0.0 & tmux detach
+# --------------------------
+
+# Running models through OpenAI-compatible servers (e.g., sglang)
+# Note: add the "openai/" prefix when using the OpenAI backend for non-OpenAI models
+python eval/main.py --task "purpcode/CyberSecEval-FRR" \
+    --model "openai/Qwen/Qwen2.5-14B-Instruct-1M" \
+    --backend openai
+```
+
+</div>
+</details>
+
+<details><summary>CyberSecEval-SCG Evaluation Setup :: click to expand ::</summary>
 <div>
 
 ```bash
diff --git a/eval/evaluate.py b/eval/evaluate.py
index 966422c..ce790dc 100755
--- a/eval/evaluate.py
+++ b/eval/evaluate.py
@@ -9,8 +9,7 @@
 from eval.generate import preprocess_generation
 
 
-# TODO(@zhewang2001): allow users to play LLM judge based on vLLM, instead of relying on bedrock
-DEFAULT_LLM_JUDGE = "bedrock/us.meta.llama3-3-70b-instruct-v1:0"
+DEFAULT_LLM_JUDGE = "meta-llama/Llama-3.3-70B-Instruct"
 
 
 def to_evalplus_format(generation_path: str) -> str:
diff --git a/eval/generate.py b/eval/generate.py
index f7ac24a..2b217e0 100644
--- a/eval/generate.py
+++ b/eval/generate.py
@@ -19,6 +19,7 @@
 )
 
 from utils import SYSTEM_PROMPT, split_batch
+from utils.litellm import configure_openai_api, is_o_series_model
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
@@ -178,6 +179,11 @@ def generate_openai(
     temperature: float = 0.0,
     max_new_tokens: int = 8192,
 ):
+    assert model.startswith("openai/"), (
+        "If running openai backend, model name must start with 'openai/'. "
+        "For example, 'deepseek-ai/DeepSeek-R1' should be 'openai/deepseek-ai/DeepSeek-R1'"
+    )
+
     outputs = []
     with ThreadPoolExecutor(max_workers=len(messages_batch)) as executor:
         futures = []
@@ -187,15 +193,15 @@
                 "num_retries": 16,
                 "retry_strategy": "exponential_backoff_retry",
                 "max_tokens": max_new_tokens,
-                "model": f"openai/{model}",
-                "api_key": os.getenv("OPENAI_API_KEY", "none"),
-                "api_base": os.getenv("OPENAI_API_BASE", "http://0.0.0.0:8000/v1"),
+                "model": model,
+                "temperature": temperature,
+                "stop": [""],
+                **configure_openai_api(model),
             }
 
-            if model != "o4-mini":
-                # O-series models don't support customized temperature. Only default temperature=1 is supported.
-                kwargs["temperature"] = temperature
-                kwargs["stop"] = [""]
+            if is_o_series_model(model):
+                del kwargs["temperature"]
+                del kwargs["stop"]
 
             future = executor.submit(completion_with_retries, **kwargs)
             futures.append(future)
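The `generate_openai` changes above delegate API routing to two helpers added in `utils/litellm.py` (next diff below). As a quick illustration of that routing, the hypothetical snippet here, not part of the patch, prints the resolved API settings for an official OpenAI model, an o-series model (whose `temperature`/`stop` kwargs get dropped), and a model served behind a local OpenAI-compatible endpoint. It assumes the patch is applied, the repo's dependencies (including vLLM) are installed, and it is run from the repository root.

```python
# Illustrative only -- not part of the patch. Model names are examples.
from utils.litellm import configure_openai_api, is_o_series_model

examples = [
    "openai/gpt-4o",                        # one "/": official OpenAI API, env-configurable
    "openai/o4-mini",                       # o-series: generate_openai drops temperature/stop
    "openai/Qwen/Qwen2.5-14B-Instruct-1M",  # two "/": local OpenAI-compatible server
]
for name in examples:
    print(name, configure_openai_api(name), "o-series:", is_o_series_model(name))
```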
diff --git a/utils/litellm.py b/utils/litellm.py
index d50064a..6934680 100644
--- a/utils/litellm.py
+++ b/utils/litellm.py
@@ -2,19 +2,21 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import os
 from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
+from types import SimpleNamespace
 from typing import Callable, Dict, List
 
+import torch
 from dotenv import load_dotenv
 from litellm import completion_with_retries
 from termcolor import cprint
 from tqdm import tqdm
+from vllm import LLM
 
 from utils import split_batch
 
-load_dotenv()
-
 
 def log_costs(completions):
     costs = [r._hidden_params["response_cost"] for r in completions]
@@ -45,8 +47,29 @@
     return outputs
 
 
+def configure_openai_api(model: str) -> dict:
+    return {
+        "api_key": (
+            os.getenv("OPENAI_API_KEY", "none") if model.count("/") == 1 else "none"
+        ),
+        "api_base": (
+            os.getenv("OPENAI_API_BASE", "http://0.0.0.0:8000/v1")
+            if model.count("/") == 1
+            else "http://0.0.0.0:8000/v1"
+        ),
+    }
+
+
+def is_o_series_model(model: str) -> bool:
+    return (
+        model.startswith("openai/o1-")
+        or model.startswith("openai/o3-")
+        or model.startswith("openai/o4-")
+    )
+
+
 def run_batched_inference(
-    batched_rows: List,  # each row includes at least "messages"
+    batched_rows: List,
     row_transform: Callable[[Dict], Dict] = lambda x: x,
     max_new_tokens: int = None,
     temperature: float = None,
@@ -57,24 +80,55 @@
     assert batched_rows and "messages" in batched_rows[0]
     batched_rows = [row_transform(row) for row in batched_rows]
     print("Running batched completion for LLM judge")
-    parameters = {
-        "model": model,
-        "parallel": parallel,
-        "messages": batched_rows,
-        "max_tokens": max_new_tokens,
-        "temperature": temperature,
-        **kwargs,
-    }
-    if "thinking" in kwargs:
-        assert parameters["max_tokens"] is None
-        assert parameters["temperature"] is None
+
+    if model.startswith("openai/") or model.startswith("bedrock/"):
+        if model.startswith("openai/"):
+            kwargs.update(configure_openai_api(model))
+        elif model.startswith("bedrock/"):
+            load_dotenv()
+
+        parameters = {
+            "model": model,
+            "parallel": parallel,
+            "messages": batched_rows,
+            "max_tokens": max_new_tokens,
+            "temperature": temperature,
+            **kwargs,
+        }
+        if "thinking" in kwargs:
+            assert parameters["max_tokens"] is None
+            assert parameters["temperature"] is None
+        else:
+            if is_o_series_model(model):
+                if "temperature" in parameters:
+                    del parameters["temperature"]
+            elif parameters["temperature"] is None:
+                parameters["temperature"] = 0.0
+
+        outputs = mini_batch_completion(**parameters)
+        log_costs(outputs)
+        outputs = [item.choices[0].message for item in outputs]
     else:
-        if parameters["temperature"] is None:
-            parameters["temperature"] = 0.0
+        model = LLM(
+            model=model,
+            generation_config="auto",
+            trust_remote_code=True,
+            tensor_parallel_size=(
+                torch.cuda.device_count() if torch.cuda.is_available() else 1
+            ),
+        )
+        sampling_params = model.get_default_sampling_params()
+        sampling_params.temperature = temperature if temperature is not None else 0.0
+        sampling_params.max_tokens = (
+            max_new_tokens if max_new_tokens is not None else 2048
+        )
+        sampling_params.skip_special_tokens = True
 
-    outputs = mini_batch_completion(**parameters)
-    log_costs(outputs)
-    outputs = [item.choices[0].message for item in outputs]
+        prompts = [row["messages"] for row in batched_rows]
+        outputs = [
+            SimpleNamespace(content=o.outputs[0].text)
+            for o in model.chat(prompts, sampling_params, use_tqdm=True)
+        ]
 
     output_rows = []
     for row, ext in zip(batched_rows, outputs):
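With the bedrock-only default gone, judge models whose names carry an `openai/` or `bedrock/` prefix keep the previous litellm path, and any other model name now takes the new local vLLM branch. The sketch below mirrors that `else:` branch as a standalone smoke test; it is not part of the patch, and the small `Qwen/Qwen2.5-0.5B-Instruct` model is a placeholder chosen purely for illustration, assuming vLLM, torch, and a GPU are available.

```python
# Standalone sketch of the new local vLLM judge path -- illustrative only.
# Assumes vllm and torch are installed; the tiny model below is a placeholder,
# not the evaluation default (meta-llama/Llama-3.3-70B-Instruct).
from types import SimpleNamespace

import torch
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    generation_config="auto",
    trust_remote_code=True,
    tensor_parallel_size=torch.cuda.device_count() if torch.cuda.is_available() else 1,
)
sampling_params = llm.get_default_sampling_params()
sampling_params.temperature = 0.0   # judge decodes greedily when no temperature is given
sampling_params.max_tokens = 256    # the patch defaults to 2048 when unset
sampling_params.skip_special_tokens = True

# One conversation per row, matching the "messages" format used by run_batched_inference
prompts = [[{"role": "user", "content": "Answer with a single word: is 2 + 2 equal to 4?"}]]
outputs = [
    SimpleNamespace(content=o.outputs[0].text)
    for o in llm.chat(prompts, sampling_params, use_tqdm=True)
]
print(outputs[0].content)
```

Wrapping each generation in a `SimpleNamespace` with a `content` attribute keeps the vLLM branch interchangeable with the litellm branch, whose outputs expose `.content` on the returned message objects.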