27 changes: 27 additions & 0 deletions docs/providers/ollama.md
@@ -1,5 +1,7 @@
# Ollama Configuration

## Ollama Local (Self-hosted)

### [Ollama Integration](https://ollama.com/)
For local models using Ollama, add the following to your .env:

@@ -9,3 +11,28 @@ OLLAMA_API_BASE=http://localhost:8000/v1 # note, maybe you have a different endp
```

Make sure that the Ollama server is running and accessible at the specified base URL. You can swap the model for any other model supported by your local Ollama instance.
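
To confirm the server is reachable before launching CAI, you can poll the same `/api/tags` endpoint that CAI itself uses. A minimal sketch, assuming the `requests` package and the base URL from your .env:

```python
import requests

# Base URL from .env; the tags endpoint lives on the server root, not under /v1
api_base = "http://localhost:8000/v1"
response = requests.get(f"{api_base.replace('/v1', '')}/api/tags", timeout=2)
response.raise_for_status()

# Each entry describes one locally available model
print([model["name"] for model in response.json().get("models", [])])
```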

## Ollama Cloud

For cloud models using Ollama Cloud (no GPU required), add the following to your .env:

```bash
# API Key from ollama.com
OLLAMA_API_KEY=your_api_key_here
OLLAMA_API_BASE=https://ollama.com

# Cloud model (note the ollama_cloud/ prefix)
CAI_MODEL=ollama_cloud/gpt-oss:120b
```

**Requirements:**
1. Create an account at [ollama.com](https://ollama.com)
2. Generate an API key from your profile
3. Use models with `ollama_cloud/` prefix (e.g., `ollama_cloud/gpt-oss:120b`)

**Key differences:**
- Prefix: `ollama_cloud/` (cloud) vs `ollama/` (local)
- API Key: Required for cloud, not needed for local
- Endpoint: set `OLLAMA_API_BASE=https://ollama.com` for cloud (CAI appends `/v1` automatically) vs `http://localhost:8000/v1` for local
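
To make the prefix distinction concrete, here is a rough sketch of the routing logic; the helper name `resolve_ollama_target` is invented for this example and is not part of CAI's API:

```python
import os

def resolve_ollama_target(model: str) -> dict:
    """Illustrative only: map a CAI_MODEL value to an endpoint and credentials."""
    if model.startswith("ollama_cloud/"):
        base = os.getenv("OLLAMA_API_BASE", "https://ollama.com")
        if not base.endswith("/v1"):
            base += "/v1"  # Ollama Cloud is OpenAI-compatible under /v1
        return {
            "model": model.removeprefix("ollama_cloud/"),
            "base_url": base,
            "api_key": os.getenv("OLLAMA_API_KEY"),  # required for cloud
        }
    # Local Ollama needs no API key
    return {
        "model": model.removeprefix("ollama/"),
        "base_url": os.getenv("OLLAMA_API_BASE", "http://localhost:8000/v1"),
        "api_key": None,
    }

print(resolve_ollama_target("ollama_cloud/gpt-oss:120b"))
```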

See [Ollama Cloud documentation](ollama_cloud.md) for detailed setup instructions.
79 changes: 79 additions & 0 deletions docs/providers/ollama_cloud.md
@@ -0,0 +1,79 @@
# Ollama Cloud

Run large language models without a local GPU using Ollama's cloud service.

## Quick Start

### 1. Get API Key

- Create an account at [ollama.com](https://ollama.com)
- Generate an API key from your profile

### 2. Configure `.env`

```bash
OLLAMA_API_KEY=your_api_key_here
OLLAMA_API_BASE=https://ollama.com
CAI_MODEL=ollama_cloud/gpt-oss:120b
```

### 3. Run

```bash
cai
```

## Available Models

View them in CAI with `/model-show`; they appear under the "Ollama Cloud" category:

- `ollama_cloud/gpt-oss:120b` - General purpose 120B model
- `ollama_cloud/llama3.3:70b` - Llama 3.3 70B
- `ollama_cloud/qwen2.5:72b` - Qwen 2.5 72B
- `ollama_cloud/deepseek-v3:671b` - DeepSeek V3 671B

More models at [ollama.com/library](https://ollama.com/library).

## Model Selection

```bash
# By name
CAI> /model ollama_cloud/gpt-oss:120b

# By number (after /model-show)
CAI> /model 3
```

## Local vs Cloud

| Feature | Local | Cloud |
|---------|-------|-------|
| Prefix | `ollama/` | `ollama_cloud/` |
| API Key | Not required | Required |
| Endpoint | `http://localhost:8000/v1` | `https://ollama.com` (CAI appends `/v1`) |
| GPU | Required | Not required |

## Troubleshooting

**Unauthorized error**: Verify that `OLLAMA_API_KEY` is set correctly

**Path not found**: Ensure `OLLAMA_API_BASE=https://ollama.com` (without `/v1`)

**Model not listed**: Check that the model prefix is `ollama_cloud/`, not `ollama/`
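
These checks are easy to script; a rough self-check along these lines, assuming `requests` (the function name is illustrative):

```python
import os
import requests

def check_ollama_cloud_config():
    """Illustrative self-check covering the three items above."""
    key = os.getenv("OLLAMA_API_KEY")
    base = os.getenv("OLLAMA_API_BASE", "")
    model = os.getenv("CAI_MODEL", "")

    assert key, "OLLAMA_API_KEY is not set"
    assert not base.endswith("/v1"), "OLLAMA_API_BASE must not include /v1"
    assert model.startswith("ollama_cloud/"), "CAI_MODEL needs the ollama_cloud/ prefix"

    # A 200 from /api/tags confirms the key is accepted
    response = requests.get(
        f"{base}/api/tags",
        headers={"Authorization": f"Bearer {key}"},
        timeout=5,
    )
    response.raise_for_status()
    print("Ollama Cloud configuration looks valid.")

check_ollama_cloud_config()
```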

## Validation

Test the connection with curl:

```bash
curl https://ollama.com/v1/chat/completions \
-H "Authorization: Bearer $OLLAMA_API_KEY" \
-H "Content-Type: application/json" \
-d '{"model": "gpt-oss:120b", "messages": [{"role": "user", "content": "test"}]}'
```
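
The same check in Python, if you prefer it over curl (again assuming `requests`):

```python
import os
import requests

response = requests.post(
    "https://ollama.com/v1/chat/completions",
    headers={"Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}"},
    json={
        "model": "gpt-oss:120b",
        "messages": [{"role": "user", "content": "test"}],
    },
    timeout=30,
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```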

## References

- [Ollama Cloud Docs](https://ollama.com/docs/cloud)
- [Model Library](https://ollama.com/library)
- [Get API Key](https://ollama.com/settings/keys)
12 changes: 11 additions & 1 deletion src/cai/repl/commands/completer.py
@@ -184,8 +184,18 @@ def fetch_all_models(self): # pylint: disable=too-many-branches,too-many-statem
try:
# Get Ollama models with a short timeout to prevent hanging
api_base = get_ollama_api_base()

# Add authentication headers for Ollama Cloud; the key may come from
# OLLAMA_API_KEY or, when routed through OPENAI_BASE_URL, from OPENAI_API_KEY
headers = {}
if "ollama.com" in api_base:
api_key = os.getenv("OLLAMA_API_KEY") or os.getenv("OPENAI_API_KEY")
if api_key:
headers["Authorization"] = f"Bearer {api_key}"

response = requests.get(
f"{api_base.replace('/v1', '')}/api/tags", timeout=0.5)
f"{api_base.replace('/v1', '')}/api/tags",
headers=headers,
timeout=0.5)

if response.status_code == 200:
data = response.json()
Expand Down
90 changes: 85 additions & 5 deletions src/cai/repl/commands/model.py
@@ -12,7 +12,7 @@
from rich.console import Console # pylint: disable=import-error
from rich.table import Table # pylint: disable=import-error
from rich.panel import Panel # pylint: disable=import-error
from cai.util import get_ollama_api_base, get_ollama_auth_headers, COST_TRACKER
from cai.repl.commands.base import Command, register_command

console = Console()
@@ -99,6 +99,32 @@ def get_predefined_model_categories() -> Dict[str, List[Dict[str, str]]]:
"name": "deepseek-r1",
"description": "DeepSeek's specialized reasoning model"
}
],
"Ollama Cloud": [
{
"name": "ollama_cloud/gpt-oss:120b",
"description": (
"Ollama Cloud - Large 120B parameter model (no GPU required)"
)
},
{
"name": "ollama_cloud/llama3.3:70b",
"description": (
"Ollama Cloud - Llama 3.3 70B model (no GPU required)"
)
},
{
"name": "ollama_cloud/qwen2.5:72b",
"description": (
"Ollama Cloud - Qwen 2.5 72B model (no GPU required)"
)
},
{
"name": "ollama_cloud/deepseek-v3:671b",
"description": (
"Ollama Cloud - DeepSeek V3 671B model (no GPU required)"
)
}
]
}

@@ -117,7 +143,8 @@ def get_all_predefined_models() -> List[Dict[str, Any]]:
"Alias": "OpenAI", # Alias models use OpenAI as base
"Anthropic Claude": "Anthropic",
"OpenAI": "OpenAI",
"DeepSeek": "DeepSeek"
"DeepSeek": "DeepSeek",
"Ollama Cloud": "Ollama Cloud"
}

for category, models in model_categories.items():
@@ -175,7 +202,11 @@ def load_all_available_models() -> tuple[List[str], List[Dict[str, Any]]]:
try:
response = requests.get(LITELLM_URL, timeout=5)
if response.status_code == 200:
# Filter out obsolete Ollama Cloud models (replaced by ollama_cloud/ prefix)
litellm_names = [
model_name for model_name in sorted(response.json().keys())
if not (model_name.startswith("ollama/") and "-cloud" in model_name)
]
except Exception: # pylint: disable=broad-except
pass

@@ -184,7 +215,17 @@ def load_all_available_models() -> tuple[List[str], List[Dict[str, Any]]]:
ollama_names = []
try:
api_base = get_ollama_api_base()
response = requests.get(f"{api_base.replace('/v1', '')}/api/tags", timeout=1)
ollama_base = api_base.replace('/v1', '')

# Add authentication headers for Ollama Cloud if needed
headers = {}
is_cloud = "ollama.com" in api_base
timeout = 5 if is_cloud else 1 # Cloud needs more time

if is_cloud:
headers = get_ollama_auth_headers()

response = requests.get(f"{ollama_base}/api/tags", headers=headers, timeout=timeout)
if response.status_code == 200:
data = response.json()
ollama_data = data.get('models', data.get('items', []))
@@ -499,7 +540,46 @@ def handle(self, args: Optional[List[str]] = None) -> bool: # pylint: disable=t
total_models = 0
displayed_models = 0

# First, add predefined models (Alias, Claude, OpenAI, DeepSeek, Ollama Cloud)
predefined_models = get_all_predefined_models()
for model in predefined_models:
model_name = model["name"]

# Skip if search term provided and not in model name
if search_term and search_term not in model_name.lower():
continue

displayed_models += 1
total_models += 1

# Find index from global cache
try:
model_index = _GLOBAL_MODEL_CACHE.index(model_name) + 1
except ValueError:
continue

# Format pricing info
input_cost_str = (
f"${model['input_cost']:.2f}"
if model['input_cost'] is not None else "Unknown"
)
output_cost_str = (
f"${model['output_cost']:.2f}"
if model['output_cost'] is not None else "Unknown"
)

# Add row to table
model_table.add_row(
str(model_index),
model_name,
model["provider"],
"N/A", # max_tokens
input_cost_str,
output_cost_str,
model.get("description", "")
)

# Process and display LiteLLM models (use global cache for numbering)
for model_name, model_info in sorted(model_data.items()):
# Find the model index from global cache
try:
15 changes: 11 additions & 4 deletions src/cai/repl/ui/banner.py
@@ -76,12 +76,19 @@ def get_supported_models_count():

# Try to get Ollama models count
try:
from cai.util import get_ollama_api_base
ollama_api_base = get_ollama_api_base()

# Add authentication headers for Ollama Cloud; the key may come from
# OLLAMA_API_KEY or, when routed through OPENAI_BASE_URL, from OPENAI_API_KEY
headers = {}
if "ollama.com" in ollama_api_base:
api_key = os.getenv("OLLAMA_API_KEY") or os.getenv("OPENAI_API_KEY")
if api_key:
headers["Authorization"] = f"Bearer {api_key}"

ollama_response = requests.get(
f"{ollama_api_base.replace('/v1', '')}/api/tags",
headers=headers,
timeout=1
)

17 changes: 13 additions & 4 deletions src/cai/repl/ui/toolbar.py
@@ -96,11 +96,20 @@ def update_toolbar_in_background():
ollama_status = "unavailable"
try:
# Get Ollama models with a short timeout to prevent hanging
from cai.util import get_ollama_api_base
api_base = get_ollama_api_base()

# Add authentication headers for Ollama Cloud; the key may come from
# OLLAMA_API_KEY or, when routed through OPENAI_BASE_URL, from OPENAI_API_KEY
headers = {}
if "ollama.com" in api_base:
api_key = os.getenv("OLLAMA_API_KEY") or os.getenv("OPENAI_API_KEY")
if api_key:
headers["Authorization"] = f"Bearer {api_key}"

response = requests.get(
f"{api_base.replace('/v1', '')}/api/tags", timeout=0.5)
f"{api_base.replace('/v1', '')}/api/tags",
headers=headers,
timeout=0.5)

if response.status_code == 200:
data = response.json()
63 changes: 62 additions & 1 deletion src/cai/sdk/agents/models/openai_chatcompletions.py
@@ -2708,7 +2708,23 @@ async def _fetch_response(
provider = model_str.split("/")[0]

# Apply provider-specific configurations
if provider == "deepseek":
if provider == "ollama_cloud":
# Ollama Cloud configuration
ollama_api_key = os.getenv("OLLAMA_API_KEY")
ollama_api_base = os.getenv("OLLAMA_API_BASE", "https://ollama.com")

if ollama_api_key:
kwargs["api_key"] = ollama_api_key
if ollama_api_base:
kwargs["api_base"] = ollama_api_base

# Drop params not supported by Ollama
litellm.drop_params = True
kwargs.pop("parallel_tool_calls", None)
kwargs.pop("store", None)
if not converted_tools:
kwargs.pop("tool_choice", None)
elif provider == "deepseek":
litellm.drop_params = True
kwargs.pop("parallel_tool_calls", None)
kwargs.pop("store", None) # DeepSeek doesn't support store parameter
@@ -2846,6 +2862,51 @@ async def _fetch_response(
max_retries = 3
retry_count = 0

# Check if this is Ollama Cloud (ollama_cloud/ prefix)
# Ollama Cloud is OpenAI-compatible, so we bypass LiteLLM to avoid parsing issues
is_ollama_cloud = "ollama_cloud/" in model_str

if is_ollama_cloud:
# Use AsyncOpenAI client directly for Ollama Cloud
# Ollama Cloud is fully OpenAI-compatible at /v1/chat/completions
try:
# Configure the client with Ollama Cloud settings
ollama_api_key = os.getenv("OLLAMA_API_KEY") or os.getenv("OPENAI_API_KEY")
ollama_base_url = os.getenv("OLLAMA_API_BASE", "https://ollama.com")

# Ensure the URL has /v1 for OpenAI compatibility
if not ollama_base_url.endswith("/v1"):
ollama_base_url = f"{ollama_base_url}/v1"

# Create a temporary client configured for Ollama Cloud
ollama_client = AsyncOpenAI(
api_key=ollama_api_key,
base_url=ollama_base_url
)

# Remove the ollama_cloud/ prefix from the model name
clean_model = kwargs["model"].replace("ollama_cloud/", "")
kwargs["model"] = clean_model

# Remove LiteLLM-specific parameters
kwargs.pop("extra_headers", None)
kwargs.pop("api_key", None)
kwargs.pop("api_base", None)
kwargs.pop("custom_llm_provider", None)

# Call Ollama Cloud through the OpenAI-compatible API
# (streaming and non-streaming use the same call)
return await ollama_client.chat.completions.create(**kwargs)

except Exception as e:
# If Ollama Cloud fails, raise with helpful message
raise Exception(
f"Error connecting to Ollama Cloud: {str(e)}\n"
f"Verify OLLAMA_API_KEY and OLLAMA_API_BASE are configured correctly."
) from e

while retry_count < max_retries:
try:
if self.is_ollama: