From feac32311642a5aff1cb7bc8e4d01d923659189d Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Mon, 11 Aug 2025 16:53:57 +0000 Subject: [PATCH 1/8] feat(litellm): add OpenAI provider support --- utils/litellm.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/utils/litellm.py b/utils/litellm.py index d50064a..d9a1235 100644 --- a/utils/litellm.py +++ b/utils/litellm.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import os from concurrent.futures import ThreadPoolExecutor from copy import deepcopy from typing import Callable, Dict, List @@ -13,8 +14,6 @@ from utils import split_batch -load_dotenv() - def log_costs(completions): costs = [r._hidden_params["response_cost"] for r in completions] @@ -57,6 +56,19 @@ def run_batched_inference( assert batched_rows and "messages" in batched_rows[0] batched_rows = [row_transform(row) for row in batched_rows] print("Running batched completion for LLM judge") + + if model.startswith("openai"): + kwargs["api_key"] = ( + os.getenv("OPENAI_API_KEY", "none") if model.count("/") == 1 else "none" + ) + kwargs["api_base"] = ( + os.getenv("OPENAI_API_BASE", "http://0.0.0.0:8000/v1") + if model.count("/") == 1 + else "http://0.0.0.0:8000/v1" + ) + elif model.startswith("bedrock"): + load_dotenv() + parameters = { "model": model, "parallel": parallel, @@ -69,7 +81,14 @@ def run_batched_inference( assert parameters["max_tokens"] is None assert parameters["temperature"] is None else: - if parameters["temperature"] is None: + if ( + model.startswith("openai/o1-") + or model.startswith("openai/o3-") + or model.startswith("openai/o4-") + ): + if "temperature" in parameters: + del parameters["temperature"] + elif parameters["temperature"] is None: parameters["temperature"] = 0.0 outputs = mini_batch_completion(**parameters) From caf2cc4610376e211418d45fa8aede321dde00ed Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Mon, 11 Aug 2025 16:57:09 +0000 Subject: [PATCH 2/8] fix(openai): resolve model naming and API configuration conflicts --- eval/generate.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/eval/generate.py b/eval/generate.py index f7ac24a..bac720f 100644 --- a/eval/generate.py +++ b/eval/generate.py @@ -178,6 +178,11 @@ def generate_openai( temperature: float = 0.0, max_new_tokens: int = 8192, ): + assert model.startswith("openai/"), ( + "If running openai backend, model name must start with 'openai/'. " + "For example, 'deepseek-ai/DeepSeek-R1' should be 'openai/deepseek-ai/DeepSeek-R1'" + ) + outputs = [] with ThreadPoolExecutor(max_workers=len(messages_batch)) as executor: futures = [] @@ -187,15 +192,29 @@ def generate_openai( "num_retries": 16, "retry_strategy": "exponential_backoff_retry", "max_tokens": max_new_tokens, - "model": f"openai/{model}", - "api_key": os.getenv("OPENAI_API_KEY", "none"), - "api_base": os.getenv("OPENAI_API_BASE", "http://0.0.0.0:8000/v1"), + "model": model, + "api_key": ( + os.getenv("OPENAI_API_KEY", "none") + if model.count("/") == 1 + else "none" + ), + "api_base": ( + os.getenv("OPENAI_API_BASE", "http://0.0.0.0:8000/v1") + if model.count("/") == 1 + else "http://0.0.0.0:8000/v1" + ), + "temperature": temperature, + "stop": [""], } - if model != "o4-mini": + if ( + model.startswith("openai/o1-") + or model.startswith("openai/o3-") + or model.startswith("openai/o4-") + ): # O-series models don't support customized temperature. Only default temperature=1 is supported. 
- kwargs["temperature"] = temperature - kwargs["stop"] = [""] + del kwargs["temperature"] + del kwargs["stop"] future = executor.submit(completion_with_retries, **kwargs) futures.append(future) From a426911d8e496bb84f6a36698ec6d57a8bd3539c Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Mon, 11 Aug 2025 17:22:33 +0000 Subject: [PATCH 3/8] fix: gemini comments --- eval/generate.py | 19 +++---------------- utils/litellm.py | 36 +++++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/eval/generate.py b/eval/generate.py index bac720f..2b217e0 100644 --- a/eval/generate.py +++ b/eval/generate.py @@ -19,6 +19,7 @@ ) from utils import SYSTEM_PROMPT, split_batch +from utils.litellm import configure_openai_api, is_o_series_model os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -193,26 +194,12 @@ def generate_openai( "retry_strategy": "exponential_backoff_retry", "max_tokens": max_new_tokens, "model": model, - "api_key": ( - os.getenv("OPENAI_API_KEY", "none") - if model.count("/") == 1 - else "none" - ), - "api_base": ( - os.getenv("OPENAI_API_BASE", "http://0.0.0.0:8000/v1") - if model.count("/") == 1 - else "http://0.0.0.0:8000/v1" - ), "temperature": temperature, "stop": [""], + **configure_openai_api(model), } - if ( - model.startswith("openai/o1-") - or model.startswith("openai/o3-") - or model.startswith("openai/o4-") - ): - # O-series models don't support customized temperature. Only default temperature=1 is supported. + if is_o_series_model(model): del kwargs["temperature"] del kwargs["stop"] diff --git a/utils/litellm.py b/utils/litellm.py index d9a1235..255632b 100644 --- a/utils/litellm.py +++ b/utils/litellm.py @@ -44,6 +44,27 @@ def mini_batch_completion(messages, parallel: int = 32, **kwargs): return outputs +def configure_openai_api(model: str) -> dict: + return { + "api_key": ( + os.getenv("OPENAI_API_KEY", "none") if model.count("/") == 1 else "none" + ), + "api_base": ( + os.getenv("OPENAI_API_BASE", "http://0.0.0.0:8000/v1") + if model.count("/") == 1 + else "http://0.0.0.0:8000/v1" + ), + } + + +def is_o_series_model(model: str) -> bool: + return ( + model.startswith("openai/o1-") + or model.startswith("openai/o3-") + or model.startswith("openai/o4-") + ) + + def run_batched_inference( batched_rows: List, # each row includes at least "messages" row_transform: Callable[[Dict], Dict] = lambda x: x, @@ -58,14 +79,7 @@ def run_batched_inference( print("Running batched completion for LLM judge") if model.startswith("openai"): - kwargs["api_key"] = ( - os.getenv("OPENAI_API_KEY", "none") if model.count("/") == 1 else "none" - ) - kwargs["api_base"] = ( - os.getenv("OPENAI_API_BASE", "http://0.0.0.0:8000/v1") - if model.count("/") == 1 - else "http://0.0.0.0:8000/v1" - ) + kwargs.update(configure_openai_api(model)) elif model.startswith("bedrock"): load_dotenv() @@ -81,11 +95,7 @@ def run_batched_inference( assert parameters["max_tokens"] is None assert parameters["temperature"] is None else: - if ( - model.startswith("openai/o1-") - or model.startswith("openai/o3-") - or model.startswith("openai/o4-") - ): + if is_o_series_model(model): if "temperature" in parameters: del parameters["temperature"] elif parameters["temperature"] is None: From f39e6bdd6e88ad18f6d87a83abc23f43f4e66c19 Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Mon, 11 Aug 2025 18:55:24 +0000 Subject: [PATCH 4/8] refactor: change default llm judge to gpt-4o --- eval/evaluate.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/eval/evaluate.py 
b/eval/evaluate.py index 966422c..371f155 100755 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -9,8 +9,7 @@ from eval.generate import preprocess_generation -# TODO(@zhewang2001): allow users to play LLM judge based on vLLM, instead of relying on bedrock -DEFAULT_LLM_JUDGE = "bedrock/us.meta.llama3-3-70b-instruct-v1:0" +DEFAULT_LLM_JUDGE = "openai/gpt-4o" def to_evalplus_format(generation_path: str) -> str: From f0a0f5d023b139f11d8e6aa459c1cbe7e487c93d Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Mon, 11 Aug 2025 19:08:50 +0000 Subject: [PATCH 5/8] feat(litellm): add vllm server support --- eval/evaluate.py | 2 +- utils/litellm.py | 40 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/eval/evaluate.py b/eval/evaluate.py index 371f155..ce790dc 100755 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -9,7 +9,7 @@ from eval.generate import preprocess_generation -DEFAULT_LLM_JUDGE = "openai/gpt-4o" +DEFAULT_LLM_JUDGE = "meta-llama/Llama-3.3-70B-Instruct" def to_evalplus_format(generation_path: str) -> str: diff --git a/utils/litellm.py b/utils/litellm.py index 255632b..39d4d14 100644 --- a/utils/litellm.py +++ b/utils/litellm.py @@ -5,12 +5,14 @@ import os from concurrent.futures import ThreadPoolExecutor from copy import deepcopy +from types import SimpleNamespace from typing import Callable, Dict, List from dotenv import load_dotenv from litellm import completion_with_retries from termcolor import cprint from tqdm import tqdm +from vllm import LLM from utils import split_batch @@ -78,10 +80,44 @@ def run_batched_inference( batched_rows = [row_transform(row) for row in batched_rows] print("Running batched completion for LLM judge") - if model.startswith("openai"): + if model.startswith("openai/"): kwargs.update(configure_openai_api(model)) - elif model.startswith("bedrock"): + elif model.startswith("bedrock/"): load_dotenv() + else: + model = LLM( + model=model, + generation_config="auto", + trust_remote_code=True, + tensor_parallel_size=8, + ) + sampling_params = model.get_default_sampling_params() + sampling_params.temperature = temperature if temperature is not None else 0.0 + sampling_params.max_tokens = ( + max_new_tokens if max_new_tokens is not None else 2048 + ) + sampling_params.skip_special_tokens = True + + prompts = [row["messages"] for row in batched_rows] + vllm_outputs = model.chat(prompts, sampling_params, use_tqdm=True) + + outputs = [SimpleNamespace(content=o.outputs[0].text) for o in vllm_outputs] + + output_rows = [] + for row, ext in zip(batched_rows, outputs): + row = deepcopy(row) + reasoning_content = ( + "\n" + ext.reasoning_content + "\n\n" + if hasattr(ext, "reasoning_content") + and ext.reasoning_content + or "thinking" in kwargs + else "" + ) + row["messages"].append( + {"role": "assistant", "content": reasoning_content + ext.content} + ) + output_rows.append(row) + return output_rows parameters = { "model": model, From 7a066b6b7ee389dca8a1b51859602ba65163461e Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Mon, 11 Aug 2025 19:27:51 +0000 Subject: [PATCH 6/8] refactor: simplify logic --- utils/litellm.py | 78 ++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 46 deletions(-) diff --git a/utils/litellm.py b/utils/litellm.py index 39d4d14..30ece70 100644 --- a/utils/litellm.py +++ b/utils/litellm.py @@ -68,7 +68,7 @@ def is_o_series_model(model: str) -> bool: def run_batched_inference( - batched_rows: List, # each row includes at least "messages" + batched_rows: List, 
row_transform: Callable[[Dict], Dict] = lambda x: x, max_new_tokens: int = None, temperature: float = None, @@ -80,10 +80,33 @@ def run_batched_inference( batched_rows = [row_transform(row) for row in batched_rows] print("Running batched completion for LLM judge") - if model.startswith("openai/"): - kwargs.update(configure_openai_api(model)) - elif model.startswith("bedrock/"): - load_dotenv() + if model.startswith("openai/") or model.startswith("bedrock/"): + if model.startswith("openai/"): + kwargs.update(configure_openai_api(model)) + elif model.startswith("bedrock/"): + load_dotenv() + + parameters = { + "model": model, + "parallel": parallel, + "messages": batched_rows, + "max_tokens": max_new_tokens, + "temperature": temperature, + **kwargs, + } + if "thinking" in kwargs: + assert parameters["max_tokens"] is None + assert parameters["temperature"] is None + else: + if is_o_series_model(model): + if "temperature" in parameters: + del parameters["temperature"] + elif parameters["temperature"] is None: + parameters["temperature"] = 0.0 + + outputs = mini_batch_completion(**parameters) + log_costs(outputs) + outputs = [item.choices[0].message for item in outputs] else: model = LLM( model=model, @@ -99,47 +122,10 @@ def run_batched_inference( sampling_params.skip_special_tokens = True prompts = [row["messages"] for row in batched_rows] - vllm_outputs = model.chat(prompts, sampling_params, use_tqdm=True) - - outputs = [SimpleNamespace(content=o.outputs[0].text) for o in vllm_outputs] - - output_rows = [] - for row, ext in zip(batched_rows, outputs): - row = deepcopy(row) - reasoning_content = ( - "\n" + ext.reasoning_content + "\n\n" - if hasattr(ext, "reasoning_content") - and ext.reasoning_content - or "thinking" in kwargs - else "" - ) - row["messages"].append( - {"role": "assistant", "content": reasoning_content + ext.content} - ) - output_rows.append(row) - return output_rows - - parameters = { - "model": model, - "parallel": parallel, - "messages": batched_rows, - "max_tokens": max_new_tokens, - "temperature": temperature, - **kwargs, - } - if "thinking" in kwargs: - assert parameters["max_tokens"] is None - assert parameters["temperature"] is None - else: - if is_o_series_model(model): - if "temperature" in parameters: - del parameters["temperature"] - elif parameters["temperature"] is None: - parameters["temperature"] = 0.0 - - outputs = mini_batch_completion(**parameters) - log_costs(outputs) - outputs = [item.choices[0].message for item in outputs] + outputs = [ + SimpleNamespace(content=o.outputs[0].text) + for o in model.chat(prompts, sampling_params, use_tqdm=True) + ] output_rows = [] for row, ext in zip(batched_rows, outputs): From faf59873723910b5820d965ed69a0e816b7b6184 Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Mon, 11 Aug 2025 20:23:58 +0000 Subject: [PATCH 7/8] docs: add instructions for openai backend setup --- README.md | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 328b948..8afaecf 100644 --- a/README.md +++ b/README.md @@ -189,9 +189,50 @@ python eval/main.py --task "purpcode/PHTest" --model purpcode/purpco ``` Notes: -* `--oracle` for evaluating customized generation (default guessing from dataset). +* `--oracle` for evaluating customized generation (default: guessing from dataset). +* `--backend` for choosing inference backend (default: `vllm`; options: `hf`, `openai`, `bedrock`). 
+* `--llm_judge` for specifying the LLM judge model (default: `meta-llama/Llama-3.3-70B-Instruct` via `vllm`; options: `openai`, `bedrock`). -
CyberSecEval SCG Evaluation Setup :: click to expand :: +
OpenAI Backend Setup :: click to expand :: +
+ +To use OpenAI backend for running OpenAI models: + +```bash +export OPENAI_API_KEY="your-openai-api-key" +export OPENAI_API_BASE="https://api.openai.com/v1" + +# Running official OpenAI models +python eval/main.py --task "purpcode/CyberSecEval-FRR" \ + --model "openai/gpt-4o" \ + --backend openai + +# Using OpenAI models as LLM judge +python eval/main.py --task "purpcode/CyberSecEval-FRR" \ + --model purpcode/purpcode-14b-rl \ + --llm_judge "openai/gpt-4o" +``` + +To use OpenAI backend with OpenAI-compatible servers (e.g., sglang) for running models: + +```bash +# --- TMUX SESSION "sgl" --- +tmux at -t sgl || tmux new -s sgl +conda activate sgl +python3 -m sglang_router.launch_server --model Qwen/Qwen2.5-14B-Instruct-1M --dp-size 8 --port 8000 --host 0.0.0.0 & tmux detach +# -------------------------- + +# Running models through OpenAI-compatible servers (e.g., sglang) +# Note: Add "openai/" prefix when using OpenAI backend for non-OpenAI models +python eval/main.py --task "purpcode/CyberSecEval-FRR" \ + --model "openai/Qwen/Qwen2.5-14B-Instruct-1M" \ + --backend openai +``` + +
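+The same `--backend openai` invocation should also work against other OpenAI-compatible servers. As a minimal sketch (not verified in this series; adjust the model, port, and parallelism to your hardware), a vLLM server can stand in for the sglang launch shown above:
+
+```bash
+# Hypothetical vLLM-based alternative to the sglang server above;
+# it exposes the same OpenAI-compatible API on http://0.0.0.0:8000/v1.
+vllm serve Qwen/Qwen2.5-14B-Instruct-1M --tensor-parallel-size 8 --port 8000 --host 0.0.0.0
+```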
+
+ +
CyberSecEval-SCG Evaluation Setup :: click to expand ::
```bash From 41277235a0441ec3c23dd9610f8e783a4fd88d96 Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Mon, 11 Aug 2025 20:47:57 +0000 Subject: [PATCH 8/8] refactor: change default tensor parallel size in vllm --- utils/litellm.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utils/litellm.py b/utils/litellm.py index 30ece70..6934680 100644 --- a/utils/litellm.py +++ b/utils/litellm.py @@ -8,6 +8,7 @@ from types import SimpleNamespace from typing import Callable, Dict, List +import torch from dotenv import load_dotenv from litellm import completion_with_retries from termcolor import cprint @@ -112,7 +113,9 @@ def run_batched_inference( model=model, generation_config="auto", trust_remote_code=True, - tensor_parallel_size=8, + tensor_parallel_size=( + torch.cuda.device_count() if torch.cuda.is_available() else 1 + ), ) sampling_params = model.get_default_sampling_params() sampling_params.temperature = temperature if temperature is not None else 0.0
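
Usage note: with this series applied, an LLM judge whose model name carries no `openai/` or `bedrock/` prefix is loaded locally through vLLM and sharded across all visible GPUs (patch 8), so GPU usage can be limited via `CUDA_VISIBLE_DEVICES`. A hypothetical invocation with the new defaults, reusing the task and model names from the README examples:

```bash
# Hypothetical end-to-end run with the new defaults: the judge
# (meta-llama/Llama-3.3-70B-Instruct) is loaded by vLLM with
# tensor_parallel_size equal to the number of visible GPUs.
CUDA_VISIBLE_DEVICES=0,1,2,3 python eval/main.py \
    --task "purpcode/CyberSecEval-FRR" \
    --model purpcode/purpcode-14b-rl
```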