diff --git a/vlmrun/cli/_cli/chat.py b/vlmrun/cli/_cli/chat.py
index 7790a4f..0a63364 100644
--- a/vlmrun/cli/_cli/chat.py
+++ b/vlmrun/cli/_cli/chat.py
@@ -273,6 +273,28 @@ def upload_files(
     return file_responses
 
 
+def _completion_create_kwargs(
+    model: str,
+    messages: List[Dict[str, Any]],
+    stream: bool,
+    session_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Build kwargs for agent.completions.create() (CLI-only).
+
+    session_id is a VLM Run-specific parameter. The OpenAI client does not accept
+    it as a top-level argument; we pass it via extra_body so the backend gets it
+    without changing the OpenAI client.
+    """
+    kwargs: Dict[str, Any] = {
+        "model": model,
+        "messages": messages,
+        "stream": stream,
+    }
+    if session_id is not None:
+        kwargs["extra_body"] = {"session_id": session_id}
+    return kwargs
+
+
 def build_messages(
     prompt: str, file_responses: Optional[List[FileResponse]] = None
 ) -> List[Dict[str, Any]]:
@@ -613,19 +635,17 @@ def chat(
             handle_api_errors(),
         ):
             response = client.agent.completions.create(
-                model=model,
-                messages=messages,
-                stream=False,
-                session_id=session_id,
+                **_completion_create_kwargs(
+                    model, messages, False, session_id
+                )
             )
     else:
         # JSON output: no status messages, just make the API call
         with handle_api_errors():
             response = client.agent.completions.create(
-                model=model,
-                messages=messages,
-                stream=False,
-                session_id=session_id,
+                **_completion_create_kwargs(
+                    model, messages, False, session_id
+                )
             )
 
     latency_s = time.time() - start_time
@@ -680,10 +700,9 @@ def chat(
             handle_api_errors(),
         ):
             stream = client.agent.completions.create(
-                model=model,
-                messages=messages,
-                stream=True,
-                session_id=session_id,
+                **_completion_create_kwargs(
+                    model, messages, True, session_id
+                )
             )
 
         # Collect streaming content and usage data
diff --git a/vlmrun/cli/cli.py b/vlmrun/cli/cli.py
index 5d3139b..ec923bf 100644
--- a/vlmrun/cli/cli.py
+++ b/vlmrun/cli/cli.py
@@ -20,7 +20,7 @@ from vlmrun.cli._cli.config import app as config_app, get_config
 from vlmrun.cli._cli.chat import chat as chat_command
 from vlmrun.cli._cli.models import app as models_app
-from vlmrun.constants import DEFAULT_BASE_URL
+from vlmrun.constants import DEFAULT_AGENT_BASE_URL, DEFAULT_BASE_URL
 
 
 app = typer.Typer(
     name="vlmrun",
@@ -119,7 +119,11 @@ def main(
 
     if ctx.invoked_subcommand is not None:
         check_credentials(ctx, api_key, base_url)
-        ctx.obj = VLMRun(api_key=api_key, base_url=base_url)
+        # Chat subcommand uses the Agent API; use agent base URL when none set
+        client_base_url = base_url
+        if ctx.invoked_subcommand == "chat" and client_base_url is None:
+            client_base_url = DEFAULT_AGENT_BASE_URL
+        ctx.obj = VLMRun(api_key=api_key, base_url=client_base_url)
 
 
 # Add subcommands
diff --git a/vlmrun/constants.py b/vlmrun/constants.py
index a6670bf..fea8c2c 100644
--- a/vlmrun/constants.py
+++ b/vlmrun/constants.py
@@ -2,6 +2,8 @@ import os
 
 
 DEFAULT_BASE_URL = "https://api.vlm.run/v1"
+# Agent API base URL (used by the chat subcommand for OpenAI-compatible completions)
+DEFAULT_AGENT_BASE_URL = "https://agent.vlm.run/v1"
 
 # Cache directories - use VLMRUN_CACHE_DIR env var if set, otherwise default to ~/.vlmrun/cache
 VLMRUN_HOME = Path.home() / ".vlmrun"
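
Usage sketch (not part of the patch): a minimal, self-contained illustration of the kwargs that _completion_create_kwargs builds and how they are splatted into the OpenAI-compatible completions call. The helper body mirrors the one added to chat.py above; the model name, message, and session id are invented for illustration.

    # Standalone sketch: shows the dict the CLI helper produces.
    from typing import Any, Dict, List, Optional


    def _completion_create_kwargs(
        model: str,
        messages: List[Dict[str, Any]],
        stream: bool,
        session_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        kwargs: Dict[str, Any] = {"model": model, "messages": messages, "stream": stream}
        if session_id is not None:
            kwargs["extra_body"] = {"session_id": session_id}
        return kwargs


    kwargs = _completion_create_kwargs(
        model="example-model",  # hypothetical model name
        messages=[{"role": "user", "content": "hello"}],
        stream=False,
        session_id="sess_123",  # hypothetical session id
    )
    assert kwargs == {
        "model": "example-model",
        "messages": [{"role": "user", "content": "hello"}],
        "stream": False,
        "extra_body": {"session_id": "sess_123"},
    }
    # At the call site the dict is splatted into the OpenAI-compatible client:
    #     client.agent.completions.create(**kwargs)
    # The OpenAI Python client merges extra_body into the request JSON, so the
    # backend still receives session_id without it being a top-level argument.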