From 66516dcaef41fc54414ab3fd96fa2c96cfadae4e Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 12 Jan 2026 21:33:21 +0000 Subject: [PATCH] feat: Support LLM_API_KEY environment variable override for benchmark configs Add support for overriding the api_key in LLM configuration files via the LLM_API_KEY environment variable. This allows cloud environments to inject the API key via secrets (e.g., secrets.LLM_API_KEY_EVAL) without modifying the config files. Changes: - Add benchmarks/utils/llm_config.py with load_llm_config() utility function - Update all run_infer.py files to use the new utility - Update validate_cfg.py to use the new utility - Add comprehensive tests for the new functionality Co-authored-by: openhands --- benchmarks/commit0/run_infer.py | 10 +- benchmarks/gaia/run_infer.py | 9 +- benchmarks/multiswebench/run_infer.py | 10 +- benchmarks/openagentsafety/run_infer.py | 10 +- benchmarks/scripts/validate_cfg.py | 6 +- benchmarks/swebench/run_infer.py | 10 +- benchmarks/swebenchmultimodal/run_infer.py | 9 +- benchmarks/swtbench/run_infer.py | 10 +- benchmarks/utils/llm_config.py | 41 ++++++++ tests/test_llm_config.py | 109 +++++++++++++++++++++ 10 files changed, 171 insertions(+), 53 deletions(-) create mode 100644 benchmarks/utils/llm_config.py create mode 100644 tests/test_llm_config.py diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 2c1a704f..7451a87f 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -23,13 +23,14 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) from benchmarks.utils.version import SDK_SHORT_SHA -from openhands.sdk import LLM, Agent, Conversation, get_logger +from openhands.sdk import Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace @@ -600,12 +601,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index dbfb48f4..3f5ed347 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -24,10 +24,10 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput from benchmarks.utils.version import SDK_SHORT_SHA from openhands.sdk import ( - LLM, Agent, Conversation, Event, @@ -557,12 +557,7 @@ def main() -> None: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") # Load LLM config - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = 
LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) # Construct dataset description diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py index 392cb50f..964cf581 100644 --- a/benchmarks/multiswebench/run_infer.py +++ b/benchmarks/multiswebench/run_infer.py @@ -25,13 +25,14 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) from benchmarks.utils.version import SDK_SHORT_SHA -from openhands.sdk import LLM, Agent, Conversation, get_logger +from openhands.sdk import Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools from openhands.workspace import APIRemoteWorkspace, DockerWorkspace @@ -435,12 +436,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py index 5d883d8f..b7378325 100644 --- a/benchmarks/openagentsafety/run_infer.py +++ b/benchmarks/openagentsafety/run_infer.py @@ -19,8 +19,9 @@ from benchmarks.utils.dataset import get_dataset from benchmarks.utils.evaluation import Evaluation from benchmarks.utils.evaluation_utils import construct_eval_output_dir +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput -from openhands.sdk import LLM, Agent, Conversation, get_logger +from openhands.sdk import Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools from openhands.workspace import DockerWorkspace @@ -540,12 +541,7 @@ def main() -> None: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") # Load LLM config - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) # Construct output directory diff --git a/benchmarks/scripts/validate_cfg.py b/benchmarks/scripts/validate_cfg.py index 335672d8..4500fcc9 100644 --- a/benchmarks/scripts/validate_cfg.py +++ b/benchmarks/scripts/validate_cfg.py @@ -1,6 +1,6 @@ import argparse -from openhands.sdk import LLM +from benchmarks.utils.llm_config import load_llm_config def main(): @@ -8,9 +8,7 @@ def main(): parser.add_argument("config_path", type=str, help="Path to JSON LLM configuration") args = parser.parse_args() - with open(args.config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.config_path) print("LLM configuration is valid:") 
print(llm.model_dump_json(indent=2)) diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 37023ffa..33e13ff5 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -22,13 +22,14 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) from benchmarks.utils.version import SDK_SHORT_SHA -from openhands.sdk import LLM, Agent, Conversation, get_logger +from openhands.sdk import Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools from openhands.workspace import APIRemoteWorkspace, DockerWorkspace @@ -326,12 +327,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index b7d3e375..d6e0833d 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -21,6 +21,7 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, @@ -28,7 +29,6 @@ ) from benchmarks.utils.version import SDK_SHORT_SHA from openhands.sdk import ( - LLM, Agent, Conversation, ImageContent, @@ -391,12 +391,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index 02ec8a6e..3e88d365 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -15,6 +15,7 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, @@ -22,7 +23,7 @@ ) from benchmarks.utils.version import SDK_SHORT_SHA from openhands.agent_server.docker.build import _base_slug -from openhands.sdk import LLM, Agent, Conversation, __version__, get_logger +from openhands.sdk import Agent, Conversation, __version__, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace @@ -355,12 +356,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - 
llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( diff --git a/benchmarks/utils/llm_config.py b/benchmarks/utils/llm_config.py new file mode 100644 index 00000000..a925564f --- /dev/null +++ b/benchmarks/utils/llm_config.py @@ -0,0 +1,41 @@ +""" +Utilities for loading LLM configuration with environment variable overrides. +""" + +import json +import os + +from openhands.sdk import LLM + + +LLM_API_KEY_ENV_VAR = "LLM_API_KEY" + + +def load_llm_config(config_path: str) -> LLM: + """Load LLM configuration from a JSON file with environment variable override. + + If the LLM_API_KEY environment variable is set, it will override the api_key + value in the JSON configuration file. This allows cloud environments to inject + the API key via secrets without modifying the config file. + + Args: + config_path: Path to the JSON LLM configuration file. + + Returns: + LLM instance with the loaded configuration. + + Raises: + ValueError: If the config file does not exist. + """ + if not os.path.isfile(config_path): + raise ValueError(f"LLM config file {config_path} does not exist") + + with open(config_path, "r") as f: + config_data = json.load(f) + + # Override api_key with environment variable if set + env_api_key = os.getenv(LLM_API_KEY_ENV_VAR) + if env_api_key: + config_data["api_key"] = env_api_key + + return LLM.model_validate(config_data) diff --git a/tests/test_llm_config.py b/tests/test_llm_config.py new file mode 100644 index 00000000..cc2e6af7 --- /dev/null +++ b/tests/test_llm_config.py @@ -0,0 +1,109 @@ +"""Tests for LLM configuration loading with environment variable override.""" + +import json +import os +import tempfile +from unittest.mock import patch + +import pytest +from pydantic import SecretStr + +from benchmarks.utils.llm_config import LLM_API_KEY_ENV_VAR, load_llm_config + + +def get_api_key_value(api_key: str | SecretStr | None) -> str | None: + """Extract the actual value from api_key which can be str, SecretStr, or None.""" + if api_key is None: + return None + if isinstance(api_key, SecretStr): + return api_key.get_secret_value() + return api_key + + +@pytest.fixture +def sample_config(): + """Create a sample LLM config dict.""" + return { + "model": "test-model", + "base_url": "https://api.example.com", + "api_key": "config-api-key", + } + + +@pytest.fixture +def config_file(sample_config): + """Create a temporary config file.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(sample_config, f) + f.flush() + yield f.name + os.unlink(f.name) + + +def test_load_llm_config_from_file(config_file, sample_config): + """Test loading LLM config from file without env var override.""" + with patch.dict(os.environ, {}, clear=True): + # Ensure LLM_API_KEY is not set + os.environ.pop(LLM_API_KEY_ENV_VAR, None) + + llm = load_llm_config(config_file) + + assert llm.model == sample_config["model"] + assert llm.base_url == sample_config["base_url"] + assert get_api_key_value(llm.api_key) == sample_config["api_key"] + + +def test_load_llm_config_with_env_override(config_file, sample_config): + """Test that LLM_API_KEY env var overrides the config file api_key.""" + env_api_key = "env-override-api-key" + + 
with patch.dict(os.environ, {LLM_API_KEY_ENV_VAR: env_api_key}): + llm = load_llm_config(config_file) + + assert llm.model == sample_config["model"] + assert llm.base_url == sample_config["base_url"] + # api_key should be overridden by env var + assert get_api_key_value(llm.api_key) == env_api_key + + +def test_load_llm_config_env_override_empty_string(config_file, sample_config): + """Test that empty string env var does not override config.""" + with patch.dict(os.environ, {LLM_API_KEY_ENV_VAR: ""}): + llm = load_llm_config(config_file) + + # Empty string is falsy, so config value should be used + assert get_api_key_value(llm.api_key) == sample_config["api_key"] + + +def test_load_llm_config_file_not_found(): + """Test that ValueError is raised when config file doesn't exist.""" + with pytest.raises(ValueError, match="does not exist"): + load_llm_config("/nonexistent/path/config.json") + + +def test_load_llm_config_without_api_key_in_file(): + """Test loading config without api_key in file, with env var set.""" + config_without_key = { + "model": "test-model", + "base_url": "https://api.example.com", + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(config_without_key, f) + f.flush() + config_path = f.name + + try: + env_api_key = "env-api-key" + with patch.dict(os.environ, {LLM_API_KEY_ENV_VAR: env_api_key}): + llm = load_llm_config(config_path) + + assert llm.model == config_without_key["model"] + assert get_api_key_value(llm.api_key) == env_api_key + finally: + os.unlink(config_path) + + +def test_llm_api_key_env_var_constant(): + """Test that the env var constant is correctly defined.""" + assert LLM_API_KEY_ENV_VAR == "LLM_API_KEY"
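
Usage note (illustrative sketch, not part of the patch): with this change, callers only
need to export LLM_API_KEY before invoking any run_infer.py or validate_cfg.py; the
config file itself stays untouched. A minimal Python sketch of the expected behavior,
assuming a JSON config with an "api_key" field (the file name and key values below are
hypothetical):

    import json
    import os
    import tempfile

    from benchmarks.utils.llm_config import LLM_API_KEY_ENV_VAR, load_llm_config

    # Write a throwaway config file with an api_key baked in (hypothetical values).
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
        json.dump({"model": "test-model", "api_key": "from-config"}, f)
        config_path = f.name

    # Without the env var, the api_key from the file is used.
    os.environ.pop(LLM_API_KEY_ENV_VAR, None)
    llm = load_llm_config(config_path)  # uses the api_key from the config file

    # With the env var set (e.g. injected from secrets.LLM_API_KEY_EVAL in CI),
    # it takes precedence over the value in the file.
    os.environ[LLM_API_KEY_ENV_VAR] = "from-environment"
    llm = load_llm_config(config_path)  # api_key now reflects the environment value

    os.unlink(config_path)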