From 66516dcaef41fc54414ab3fd96fa2c96cfadae4e Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 12 Jan 2026 21:33:21 +0000 Subject: [PATCH] feat: Support LLM_API_KEY environment variable override for benchmark configs Add support for overriding the api_key in LLM configuration files via the LLM_API_KEY environment variable. This allows cloud environments to inject the API key via secrets (e.g., secrets.LLM_API_KEY_EVAL) without modifying the config files. Changes: - Add benchmarks/utils/llm_config.py with load_llm_config() utility function - Update all run_infer.py files to use the new utility - Update validate_cfg.py to use the new utility - Add comprehensive tests for the new functionality Co-authored-by: openhands --- benchmarks/commit0/run_infer.py | 10 +- benchmarks/gaia/run_infer.py | 9 +- benchmarks/multiswebench/run_infer.py | 10 +- benchmarks/openagentsafety/run_infer.py | 10 +- benchmarks/scripts/validate_cfg.py | 6 +- benchmarks/swebench/run_infer.py | 10 +- benchmarks/swebenchmultimodal/run_infer.py | 9 +- benchmarks/swtbench/run_infer.py | 10 +- benchmarks/utils/llm_config.py | 41 ++++++++ tests/test_llm_config.py | 109 +++++++++++++++++++++ 10 files changed, 171 insertions(+), 53 deletions(-) create mode 100644 benchmarks/utils/llm_config.py create mode 100644 tests/test_llm_config.py diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 2c1a704f..7451a87f 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -23,13 +23,14 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) from benchmarks.utils.version import SDK_SHORT_SHA -from openhands.sdk import LLM, Agent, Conversation, get_logger +from openhands.sdk import Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace @@ -600,12 +601,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index dbfb48f4..3f5ed347 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -24,10 +24,10 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput from benchmarks.utils.version import SDK_SHORT_SHA from openhands.sdk import ( - LLM, Agent, Conversation, Event, @@ -557,12 +557,7 @@ def main() -> None: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") # Load LLM config - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = 
LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) # Construct dataset description diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py index 392cb50f..964cf581 100644 --- a/benchmarks/multiswebench/run_infer.py +++ b/benchmarks/multiswebench/run_infer.py @@ -25,13 +25,14 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) from benchmarks.utils.version import SDK_SHORT_SHA -from openhands.sdk import LLM, Agent, Conversation, get_logger +from openhands.sdk import Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools from openhands.workspace import APIRemoteWorkspace, DockerWorkspace @@ -435,12 +436,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py index 5d883d8f..b7378325 100644 --- a/benchmarks/openagentsafety/run_infer.py +++ b/benchmarks/openagentsafety/run_infer.py @@ -19,8 +19,9 @@ from benchmarks.utils.dataset import get_dataset from benchmarks.utils.evaluation import Evaluation from benchmarks.utils.evaluation_utils import construct_eval_output_dir +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput -from openhands.sdk import LLM, Agent, Conversation, get_logger +from openhands.sdk import Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools from openhands.workspace import DockerWorkspace @@ -540,12 +541,7 @@ def main() -> None: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") # Load LLM config - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) # Construct output directory diff --git a/benchmarks/scripts/validate_cfg.py b/benchmarks/scripts/validate_cfg.py index 335672d8..4500fcc9 100644 --- a/benchmarks/scripts/validate_cfg.py +++ b/benchmarks/scripts/validate_cfg.py @@ -1,6 +1,6 @@ import argparse -from openhands.sdk import LLM +from benchmarks.utils.llm_config import load_llm_config def main(): @@ -8,9 +8,7 @@ def main(): parser.add_argument("config_path", type=str, help="Path to JSON LLM configuration") args = parser.parse_args() - with open(args.config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.config_path) print("LLM configuration is valid:") 
print(llm.model_dump_json(indent=2)) diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 37023ffa..33e13ff5 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -22,13 +22,14 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) from benchmarks.utils.version import SDK_SHORT_SHA -from openhands.sdk import LLM, Agent, Conversation, get_logger +from openhands.sdk import Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools from openhands.workspace import APIRemoteWorkspace, DockerWorkspace @@ -326,12 +327,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index b7d3e375..d6e0833d 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -21,6 +21,7 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, @@ -28,7 +29,6 @@ ) from benchmarks.utils.version import SDK_SHORT_SHA from openhands.sdk import ( - LLM, Agent, Conversation, ImageContent, @@ -391,12 +391,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index 02ec8a6e..3e88d365 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -15,6 +15,7 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, @@ -22,7 +23,7 @@ ) from benchmarks.utils.version import SDK_SHORT_SHA from openhands.agent_server.docker.build import _base_slug -from openhands.sdk import LLM, Agent, Conversation, __version__, get_logger +from openhands.sdk import Agent, Conversation, __version__, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace @@ -355,12 +356,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - 
llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( diff --git a/benchmarks/utils/llm_config.py b/benchmarks/utils/llm_config.py new file mode 100644 index 00000000..a925564f --- /dev/null +++ b/benchmarks/utils/llm_config.py @@ -0,0 +1,41 @@ +""" +Utilities for loading LLM configuration with environment variable overrides. +""" + +import json +import os + +from openhands.sdk import LLM + + +LLM_API_KEY_ENV_VAR = "LLM_API_KEY" + + +def load_llm_config(config_path: str) -> LLM: + """Load LLM configuration from a JSON file with environment variable override. + + If the LLM_API_KEY environment variable is set, it will override the api_key + value in the JSON configuration file. This allows cloud environments to inject + the API key via secrets without modifying the config file. + + Args: + config_path: Path to the JSON LLM configuration file. + + Returns: + LLM instance with the loaded configuration. + + Raises: + ValueError: If the config file does not exist. + """ + if not os.path.isfile(config_path): + raise ValueError(f"LLM config file {config_path} does not exist") + + with open(config_path, "r") as f: + config_data = json.load(f) + + # Override api_key with environment variable if set + env_api_key = os.getenv(LLM_API_KEY_ENV_VAR) + if env_api_key: + config_data["api_key"] = env_api_key + + return LLM.model_validate(config_data) diff --git a/tests/test_llm_config.py b/tests/test_llm_config.py new file mode 100644 index 00000000..cc2e6af7 --- /dev/null +++ b/tests/test_llm_config.py @@ -0,0 +1,109 @@ +"""Tests for LLM configuration loading with environment variable override.""" + +import json +import os +import tempfile +from unittest.mock import patch + +import pytest +from pydantic import SecretStr + +from benchmarks.utils.llm_config import LLM_API_KEY_ENV_VAR, load_llm_config + + +def get_api_key_value(api_key: str | SecretStr | None) -> str | None: + """Extract the actual value from api_key which can be str, SecretStr, or None.""" + if api_key is None: + return None + if isinstance(api_key, SecretStr): + return api_key.get_secret_value() + return api_key + + +@pytest.fixture +def sample_config(): + """Create a sample LLM config dict.""" + return { + "model": "test-model", + "base_url": "https://api.example.com", + "api_key": "config-api-key", + } + + +@pytest.fixture +def config_file(sample_config): + """Create a temporary config file.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(sample_config, f) + f.flush() + yield f.name + os.unlink(f.name) + + +def test_load_llm_config_from_file(config_file, sample_config): + """Test loading LLM config from file without env var override.""" + with patch.dict(os.environ, {}, clear=True): + # Ensure LLM_API_KEY is not set + os.environ.pop(LLM_API_KEY_ENV_VAR, None) + + llm = load_llm_config(config_file) + + assert llm.model == sample_config["model"] + assert llm.base_url == sample_config["base_url"] + assert get_api_key_value(llm.api_key) == sample_config["api_key"] + + +def test_load_llm_config_with_env_override(config_file, sample_config): + """Test that LLM_API_KEY env var overrides the config file api_key.""" + env_api_key = "env-override-api-key" + + 
with patch.dict(os.environ, {LLM_API_KEY_ENV_VAR: env_api_key}): + llm = load_llm_config(config_file) + + assert llm.model == sample_config["model"] + assert llm.base_url == sample_config["base_url"] + # api_key should be overridden by env var + assert get_api_key_value(llm.api_key) == env_api_key + + +def test_load_llm_config_env_override_empty_string(config_file, sample_config): + """Test that empty string env var does not override config.""" + with patch.dict(os.environ, {LLM_API_KEY_ENV_VAR: ""}): + llm = load_llm_config(config_file) + + # Empty string is falsy, so config value should be used + assert get_api_key_value(llm.api_key) == sample_config["api_key"] + + +def test_load_llm_config_file_not_found(): + """Test that ValueError is raised when config file doesn't exist.""" + with pytest.raises(ValueError, match="does not exist"): + load_llm_config("/nonexistent/path/config.json") + + +def test_load_llm_config_without_api_key_in_file(): + """Test loading config without api_key in file, with env var set.""" + config_without_key = { + "model": "test-model", + "base_url": "https://api.example.com", + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(config_without_key, f) + f.flush() + config_path = f.name + + try: + env_api_key = "env-api-key" + with patch.dict(os.environ, {LLM_API_KEY_ENV_VAR: env_api_key}): + llm = load_llm_config(config_path) + + assert llm.model == config_without_key["model"] + assert get_api_key_value(llm.api_key) == env_api_key + finally: + os.unlink(config_path) + + +def test_llm_api_key_env_var_constant(): + """Test that the env var constant is correctly defined.""" + assert LLM_API_KEY_ENV_VAR == "LLM_API_KEY"
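
Usage note (illustrative sketch, not part of the patch): with this change, callers only
need to export LLM_API_KEY before invoking any run_infer.py or validate_cfg.py; the
config file itself stays untouched. A minimal Python sketch of the expected behavior,
assuming a JSON config with an "api_key" field (the file name and key values below are
hypothetical):

    import json
    import os
    import tempfile

    from benchmarks.utils.llm_config import LLM_API_KEY_ENV_VAR, load_llm_config

    # Write a throwaway config file with an api_key baked in (hypothetical values).
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
        json.dump({"model": "test-model", "api_key": "from-config"}, f)
        config_path = f.name

    # Without the env var, the api_key from the file is used.
    os.environ.pop(LLM_API_KEY_ENV_VAR, None)
    llm = load_llm_config(config_path)  # uses the api_key from the config file

    # With the env var set (e.g. injected from secrets.LLM_API_KEY_EVAL in CI),
    # it takes precedence over the value in the file.
    os.environ[LLM_API_KEY_ENV_VAR] = "from-environment"
    llm = load_llm_config(config_path)  # api_key now reflects the environment value

    os.unlink(config_path)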