10 changes: 3 additions & 7 deletions benchmarks/commit0/run_infer.py
@@ -23,13 +23,14 @@
get_default_on_result_writer,
)
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import (
EvalInstance,
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import SDK_SHORT_SHA
from openhands.sdk import LLM, Agent, Conversation, get_logger
from openhands.sdk import Agent, Conversation, get_logger
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.preset.default import get_default_tools
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
@@ -600,12 +601,7 @@ def main() -> None:
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")

llm_config_path = args.llm_config_path
if not os.path.isfile(llm_config_path):
raise ValueError(f"LLM config file {llm_config_path} does not exist")
with open(llm_config_path, "r") as f:
llm_config = f.read()
llm = LLM.model_validate_json(llm_config)
llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

dataset_description = (
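Each run_infer.py in this diff applies the same one-line replacement, delegating to the helper added in benchmarks/utils/llm_config.py later in this diff. A minimal sketch of the behavior the entrypoints now rely on, assuming a hypothetical config file and key value: the api_key can be left out of the JSON entirely and injected through the LLM_API_KEY environment variable instead.

    import json
    import os
    import tempfile

    from benchmarks.utils.llm_config import load_llm_config

    # Hypothetical config with no api_key; the credential arrives via the environment.
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump({"model": "test-model", "base_url": "https://api.example.com"}, f)

    os.environ["LLM_API_KEY"] = "env-api-key"  # e.g. injected as a CI/cloud secret
    llm = load_llm_config(f.name)
    # llm.api_key now carries "env-api-key" rather than anything from the file.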
9 changes: 2 additions & 7 deletions benchmarks/gaia/run_infer.py
@@ -24,10 +24,10 @@
get_default_on_result_writer,
)
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
from benchmarks.utils.version import SDK_SHORT_SHA
from openhands.sdk import (
LLM,
Agent,
Conversation,
Event,
@@ -557,12 +557,7 @@ def main() -> None:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")

# Load LLM config
llm_config_path = args.llm_config_path
if not os.path.isfile(llm_config_path):
raise ValueError(f"LLM config file {llm_config_path} does not exist")
with open(llm_config_path, "r") as f:
llm_config = f.read()
llm = LLM.model_validate_json(llm_config)
llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

# Construct dataset description
10 changes: 3 additions & 7 deletions benchmarks/multiswebench/run_infer.py
@@ -25,13 +25,14 @@
get_default_on_result_writer,
)
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import (
EvalInstance,
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import SDK_SHORT_SHA
from openhands.sdk import LLM, Agent, Conversation, get_logger
from openhands.sdk import Agent, Conversation, get_logger
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.preset.default import get_default_tools
from openhands.workspace import APIRemoteWorkspace, DockerWorkspace
@@ -435,12 +436,7 @@ def main() -> None:
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")

llm_config_path = args.llm_config_path
if not os.path.isfile(llm_config_path):
raise ValueError(f"LLM config file {llm_config_path} does not exist")
with open(llm_config_path, "r") as f:
llm_config = f.read()
llm = LLM.model_validate_json(llm_config)
llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

dataset_description = (
10 changes: 3 additions & 7 deletions benchmarks/openagentsafety/run_infer.py
@@ -19,8 +19,9 @@
from benchmarks.utils.dataset import get_dataset
from benchmarks.utils.evaluation import Evaluation
from benchmarks.utils.evaluation_utils import construct_eval_output_dir
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
from openhands.sdk import LLM, Agent, Conversation, get_logger
from openhands.sdk import Agent, Conversation, get_logger
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.preset.default import get_default_tools
from openhands.workspace import DockerWorkspace
@@ -540,12 +541,7 @@ def main() -> None:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")

# Load LLM config
llm_config_path = args.llm_config_path
if not os.path.isfile(llm_config_path):
raise ValueError(f"LLM config file {llm_config_path} does not exist")
with open(llm_config_path, "r") as f:
llm_config = f.read()
llm = LLM.model_validate_json(llm_config)
llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

# Construct output directory
6 changes: 2 additions & 4 deletions benchmarks/scripts/validate_cfg.py
@@ -1,16 +1,14 @@
import argparse

from openhands.sdk import LLM
from benchmarks.utils.llm_config import load_llm_config


def main():
parser = argparse.ArgumentParser(description="Validate LLM configuration")
parser.add_argument("config_path", type=str, help="Path to JSON LLM configuration")
args = parser.parse_args()

with open(args.config_path, "r") as f:
llm_config = f.read()
llm = LLM.model_validate_json(llm_config)
llm = load_llm_config(args.config_path)

print("LLM configuration is valid:")
print(llm.model_dump_json(indent=2))
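The validator now picks up the same environment override. A sketch of invoking it against a config that omits api_key; the field names mirror the fixture in tests/test_llm_config.py, and the subprocess call assumes the script's existing __main__ entrypoint, which sits outside this hunk.

    import json
    import os
    import subprocess
    import tempfile

    # Hypothetical keyless config; LLM_API_KEY supplies the credential at runtime.
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump({"model": "test-model", "base_url": "https://api.example.com"}, f)

    subprocess.run(
        ["python", "benchmarks/scripts/validate_cfg.py", f.name],
        env={**os.environ, "LLM_API_KEY": "env-api-key"},
        check=True,
    )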
10 changes: 3 additions & 7 deletions benchmarks/swebench/run_infer.py
@@ -22,13 +22,14 @@
get_default_on_result_writer,
)
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import (
EvalInstance,
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import SDK_SHORT_SHA
from openhands.sdk import LLM, Agent, Conversation, get_logger
from openhands.sdk import Agent, Conversation, get_logger
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.preset.default import get_default_tools
from openhands.workspace import APIRemoteWorkspace, DockerWorkspace
@@ -326,12 +327,7 @@ def main() -> None:
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")

llm_config_path = args.llm_config_path
if not os.path.isfile(llm_config_path):
raise ValueError(f"LLM config file {llm_config_path} does not exist")
with open(llm_config_path, "r") as f:
llm_config = f.read()
llm = LLM.model_validate_json(llm_config)
llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

dataset_description = (
9 changes: 2 additions & 7 deletions benchmarks/swebenchmultimodal/run_infer.py
@@ -21,14 +21,14 @@
get_default_on_result_writer,
)
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import (
EvalInstance,
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import SDK_SHORT_SHA
from openhands.sdk import (
LLM,
Agent,
Conversation,
ImageContent,
@@ -391,12 +391,7 @@ def main() -> None:
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")

llm_config_path = args.llm_config_path
if not os.path.isfile(llm_config_path):
raise ValueError(f"LLM config file {llm_config_path} does not exist")
with open(llm_config_path, "r") as f:
llm_config = f.read()
llm = LLM.model_validate_json(llm_config)
llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

dataset_description = (
10 changes: 3 additions & 7 deletions benchmarks/swtbench/run_infer.py
@@ -15,14 +15,15 @@
get_default_on_result_writer,
)
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import (
EvalInstance,
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import SDK_SHORT_SHA
from openhands.agent_server.docker.build import _base_slug
from openhands.sdk import LLM, Agent, Conversation, __version__, get_logger
from openhands.sdk import Agent, Conversation, __version__, get_logger
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.preset.default import get_default_tools
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace
@@ -355,12 +356,7 @@ def main() -> None:
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")

llm_config_path = args.llm_config_path
if not os.path.isfile(llm_config_path):
raise ValueError(f"LLM config file {llm_config_path} does not exist")
with open(llm_config_path, "r") as f:
llm_config = f.read()
llm = LLM.model_validate_json(llm_config)
llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

dataset_description = (
41 changes: 41 additions & 0 deletions benchmarks/utils/llm_config.py
@@ -0,0 +1,41 @@
"""
Utilities for loading LLM configuration with environment variable overrides.
"""

import json
import os

from openhands.sdk import LLM


LLM_API_KEY_ENV_VAR = "LLM_API_KEY"


def load_llm_config(config_path: str) -> LLM:
"""Load LLM configuration from a JSON file with environment variable override.

If the LLM_API_KEY environment variable is set, it will override the api_key
value in the JSON configuration file. This allows cloud environments to inject
the API key via secrets without modifying the config file.

Args:
config_path: Path to the JSON LLM configuration file.

Returns:
LLM instance with the loaded configuration.

Raises:
ValueError: If the config file does not exist.
"""
if not os.path.isfile(config_path):
raise ValueError(f"LLM config file {config_path} does not exist")

with open(config_path, "r") as f:
config_data = json.load(f)

# Override api_key with environment variable if set
env_api_key = os.getenv(LLM_API_KEY_ENV_VAR)
if env_api_key:
config_data["api_key"] = env_api_key

return LLM.model_validate(config_data)
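A quick illustration of the precedence rules described in the docstring above; the config path and key values are hypothetical, and the behavior mirrors the tests added below.

    import json
    import os
    import tempfile

    from benchmarks.utils.llm_config import LLM_API_KEY_ENV_VAR, load_llm_config

    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump({"model": "test-model", "api_key": "config-api-key"}, f)

    os.environ.pop(LLM_API_KEY_ENV_VAR, None)
    llm = load_llm_config(f.name)      # no env var: api_key comes from the file

    os.environ[LLM_API_KEY_ENV_VAR] = "env-override-api-key"
    llm = load_llm_config(f.name)      # env var set: it overrides the file value

    os.environ[LLM_API_KEY_ENV_VAR] = ""
    llm = load_llm_config(f.name)      # empty string is falsy: file value is kept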
109 changes: 109 additions & 0 deletions tests/test_llm_config.py
@@ -0,0 +1,109 @@
"""Tests for LLM configuration loading with environment variable override."""

import json
import os
import tempfile
from unittest.mock import patch

import pytest
from pydantic import SecretStr

from benchmarks.utils.llm_config import LLM_API_KEY_ENV_VAR, load_llm_config


def get_api_key_value(api_key: str | SecretStr | None) -> str | None:
"""Extract the actual value from api_key which can be str, SecretStr, or None."""
if api_key is None:
return None
if isinstance(api_key, SecretStr):
return api_key.get_secret_value()
return api_key


@pytest.fixture
def sample_config():
"""Create a sample LLM config dict."""
return {
"model": "test-model",
"base_url": "https://api.example.com",
"api_key": "config-api-key",
}


@pytest.fixture
def config_file(sample_config):
"""Create a temporary config file."""
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
json.dump(sample_config, f)
f.flush()
yield f.name
os.unlink(f.name)


def test_load_llm_config_from_file(config_file, sample_config):
"""Test loading LLM config from file without env var override."""
with patch.dict(os.environ, {}, clear=True):
# Ensure LLM_API_KEY is not set
os.environ.pop(LLM_API_KEY_ENV_VAR, None)

llm = load_llm_config(config_file)

assert llm.model == sample_config["model"]
assert llm.base_url == sample_config["base_url"]
assert get_api_key_value(llm.api_key) == sample_config["api_key"]


def test_load_llm_config_with_env_override(config_file, sample_config):
"""Test that LLM_API_KEY env var overrides the config file api_key."""
env_api_key = "env-override-api-key"

with patch.dict(os.environ, {LLM_API_KEY_ENV_VAR: env_api_key}):
llm = load_llm_config(config_file)

assert llm.model == sample_config["model"]
assert llm.base_url == sample_config["base_url"]
# api_key should be overridden by env var
assert get_api_key_value(llm.api_key) == env_api_key


def test_load_llm_config_env_override_empty_string(config_file, sample_config):
"""Test that empty string env var does not override config."""
with patch.dict(os.environ, {LLM_API_KEY_ENV_VAR: ""}):
llm = load_llm_config(config_file)

# Empty string is falsy, so config value should be used
assert get_api_key_value(llm.api_key) == sample_config["api_key"]


def test_load_llm_config_file_not_found():
"""Test that ValueError is raised when config file doesn't exist."""
with pytest.raises(ValueError, match="does not exist"):
load_llm_config("/nonexistent/path/config.json")


def test_load_llm_config_without_api_key_in_file():
"""Test loading config without api_key in file, with env var set."""
config_without_key = {
"model": "test-model",
"base_url": "https://api.example.com",
}

with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
json.dump(config_without_key, f)
f.flush()
config_path = f.name

try:
env_api_key = "env-api-key"
with patch.dict(os.environ, {LLM_API_KEY_ENV_VAR: env_api_key}):
llm = load_llm_config(config_path)

assert llm.model == config_without_key["model"]
assert get_api_key_value(llm.api_key) == env_api_key
finally:
os.unlink(config_path)


def test_llm_api_key_env_var_constant():
"""Test that the env var constant is correctly defined."""
assert LLM_API_KEY_ENV_VAR == "LLM_API_KEY"