32 commits
94992e4
feat: enable Gemini and GKE support
Jun 6, 2025
bb06d6f
chore: change env var names
Jun 6, 2025
d59ccba
chore: update references to Claude and Anthropic
Jun 6, 2025
cfbbc0a
chore: enable gke cluster credentials in kubernetes MCP server
Jun 6, 2025
ac35913
chore: update variables and add gcp compose file
Jun 6, 2025
464b102
fix: use correct host for MCP Prompt Server
Jun 6, 2025
447bb11
chore: fix licensecheck
Jun 6, 2025
c34d659
chore: cache HF model on host and update slack_channel_id ref
Jun 7, 2025
41d8982
Merge remote-tracking branch 'gawbul/spm/add-gemini-and-gke-support' …
alanbijuthomas Jun 26, 2025
f6322b2
Add Gemini key to LLM Server
alanbijuthomas Jun 26, 2025
e677825
Update Docker compose.gcp.yaml
alanbijuthomas Jun 27, 2025
cf3dd6b
feature: added single setup and run script
YazIbrahim Jun 27, 2025
2555bd7
docs: updating credentials setup
YazIbrahim Jun 27, 2025
00973c0
fix: healthcheck dependency
YazIbrahim Jun 27, 2025
964541d
Merge pull request #79 from fuzzylabs/add-gemini-and-gke-support-dev-1
YazIbrahim Jun 27, 2025
254892b
Add adapters and client to enable Gemini support
alanbijuthomas Jun 27, 2025
3067c4b
docs: added supported LLMs
YazIbrahim Jun 27, 2025
35843cb
Update adapter test with dummy tool name for Anthropic models
alanbijuthomas Jun 27, 2025
d5b6c1c
Fix pre-commit issues and improve code quality
alanbijuthomas Jun 27, 2025
ba96127
Add Gemini API key to compose.tests.yaml
alanbijuthomas Jun 28, 2025
01d1bf4
Log Gemini token usage
alanbijuthomas Jun 28, 2025
41a535d
fix: removed comment
YazIbrahim Jun 30, 2025
64b02ea
fix: added healthchecks
YazIbrahim Jun 30, 2025
1a0b665
refactor: making max tokens an environment variable
YazIbrahim Jun 30, 2025
ff973b5
feature: added caching to gemini model
YazIbrahim Jul 1, 2025
94d1cc3
docs: added link to security blog
YazIbrahim Jul 1, 2025
27d9620
Merge branch 'main' into feature/gemini-caching
YazIbrahim Jul 1, 2025
4b68aa6
chore: revert change
YazIbrahim Jul 1, 2025
a360170
refactor: reducing max output tokens
YazIbrahim Jul 1, 2025
a20f7b3
refactor: changed models in docs to ones that have been tested
YazIbrahim Jul 1, 2025
2cc82cc
chore: typing
YazIbrahim Jul 1, 2025
ea2c83c
chore: revert change
YazIbrahim Jul 1, 2025
3 changes: 2 additions & 1 deletion README.md
@@ -33,7 +33,7 @@ We've been writing blogs and sharing our learnings along the way. Check out our
The SRE Agent supports the following LLM providers:

### Anthropic
- **Models**: e.g. "claude-4-0-sonnet-latest"
- **Models**: e.g. "claude-3-7-sonnet-latest"
- **Setup**: Requires `ANTHROPIC_API_KEY`

### Google Gemini
@@ -233,3 +233,4 @@ Check out our blog posts for insights and updates:

- [Bringing Agentic AI into the Real World](https://www.fuzzylabs.ai/blog-post/bringing-agentic-ai-into-the-real-world)
- [How We're Building an Autonomous SRE with FastMCP](https://www.fuzzylabs.ai/blog-post/how-were-building-an-autonomous-sre-with-fastmcp)
- [Can we trust an agent in Prod?](https://www.fuzzylabs.ai/blog-post/can-we-trust-an-agent-in-prod)
2 changes: 1 addition & 1 deletion docs/credentials.md
@@ -21,7 +21,7 @@ The following credentials must be retrieved prior to running the agent. These cr
> **Note**: You only need to configure **one** LLM provider. Choose either Anthropic or Google Gemini and provide the corresponding API key.

- **PROVIDER**: The LLM provider name (e.g., "anthropic", "google").
- **MODEL**: The specific model name to use (e.g., "claude-3-5-sonnet-20241022", "gemini-1.5-pro").
- **MODEL**: The specific model name to use (e.g., "claude-3-7-sonnet-latest", "gemini-2.5-flash").

**Choose one of the following:**
- **ANTHROPIC_API_KEY**: An API key for Anthropic Claude models *(required if using Anthropic provider)*.
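For context, the "choose one provider" rule above amounts to a branch on `PROVIDER`. A minimal sketch of that selection, assuming a simple helper function (the env var names come from these docs; the function itself is illustrative, not the agent's actual code):

```python
import os

def resolve_api_key() -> str:
    """Pick the API key for the configured LLM provider (illustrative only)."""
    provider = os.getenv("PROVIDER", "anthropic")
    if provider == "anthropic":
        return os.environ["ANTHROPIC_API_KEY"]
    if provider == "google":
        return os.environ["GEMINI_API_KEY"]
    raise ValueError(f"Unsupported provider: {provider}")
```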
2 changes: 1 addition & 1 deletion setup_credentials.py
@@ -91,7 +91,7 @@ def get_credential_config(platform: str) -> dict[str, dict[str, Any]]:
},
"MAX_TOKENS": {
"prompt": "Controls the maximum number of tokens the LLM can generate in "
"its response e.g. 10000: ",
"its response e.g. 8000: ",
"mask_value": False,
},
"DEV_BEARER_TOKEN": {
2 changes: 1 addition & 1 deletion sre_agent/client/utils/schemas.py
@@ -68,7 +68,7 @@ class ClientConfig:
default_factory=lambda: json.loads(os.getenv("TOOLS", "[]"))
)
model: str = os.getenv("LLM_MODEL", "claude-3-7-sonnet-latest")
max_tokens: int = 1000
max_tokens: int = 8000
max_tool_retries: int = 3
query_timeout: int = int(
os.getenv("QUERY_TIMEOUT", DEFAULT_QUERY_TIMEOUT) or DEFAULT_QUERY_TIMEOUT
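The hard-coded default above pairs with the "making max tokens an environment variable" commit in this PR, which suggests the limit is meant to be overridable the same way `QUERY_TIMEOUT` is. A hedged sketch of that pattern (reading `MAX_TOKENS` here is an assumption based on the commit message and `setup_credentials.py`, not the merged code):

```python
import os
from dataclasses import dataclass

@dataclass
class ClientConfig:
    # Assumed: mirrors the QUERY_TIMEOUT env-var pattern in the same class,
    # falling back to the 8000 default introduced by this PR.
    max_tokens: int = int(os.getenv("MAX_TOKENS", "8000"))
```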
42 changes: 34 additions & 8 deletions sre_agent/llm/utils/clients.py
@@ -9,6 +9,7 @@
from anthropic.types import ToolParam
from google import genai
from google.genai import types
from google.genai.types import CachedContent
from pydantic import BaseModel
from shared.logger import logger # type: ignore
from shared.schemas import ( # type: ignore
@@ -177,26 +178,49 @@ def __init__(self, settings: LLMSettings = LLMSettings()) -> None:
"""The constructor for the Gemini client."""
super().__init__(settings)
self.client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
self._cache: CachedContent | None = None

def cache_tools(self, tools: list[Any]) -> list[Any]:
"""A method for adding a cache block to tools."""
if tools:
try:
from google.genai import types

config = types.CreateCachedContentConfig(
tools=tools,
ttl="600s",
)
self._cache = self.client.caches.create(
model=self.settings.model, config=config
)
except Exception as e:
logger.warning(f"Failed to create Gemini cache: {e}")
return tools

def generate(self, payload: TextGenerationPayload) -> Message:
"""A method for generating text using the Gemini API."""
adapter = GeminiTextGenerationPayloadAdapter(payload)

messages, tools = adapter.adapt()

if not self.settings.max_tokens:
raise ValueError("Max tokens configuration has not been set.")
cached_tools = self.cache_tools(tools)

# Use cache if available
config_kwargs = {"max_output_tokens": self.settings.max_tokens}
if self._cache:
config_kwargs["cached_content"] = self._cache.name
messages = [messages[-1]] if messages else []
else:
config_kwargs["tools"] = cached_tools

response = self.client.models.generate_content(
model=self.settings.model,
contents=messages,
config=types.GenerateContentConfig(
tools=tools,
max_output_tokens=self.settings.max_tokens,
),
config=types.GenerateContentConfig(**config_kwargs),
)

if response.usage_metadata:
# Log with cache information

logger.info(
f"Token usage - Input: {response.usage_metadata.prompt_token_count}, "
f"Output: {response.usage_metadata.candidates_token_count}, "
@@ -219,7 +243,9 @@ def generate(self, payload: TextGenerationPayload) -> Message:
usage=Usage(
input_tokens=response.usage_metadata.prompt_token_count,
output_tokens=response.usage_metadata.candidates_token_count,
cache_creation_input_tokens=None,
cache_creation_input_tokens=response.usage_metadata.cache_creation_token_count
if hasattr(response.usage_metadata, "cache_creation_token_count")
else None,
cache_read_input_tokens=response.usage_metadata.cached_content_token_count,
)
if response.usage_metadata
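Taken together, the diff implements Gemini explicit caching: the tool declarations are uploaded once as a `CachedContent` with a 10-minute TTL, and later calls pass `cached_content=cache.name` instead of re-sending the tools, trimming each request down to the newest message. A condensed sketch of that flow (the API calls mirror the diff above; the model name, tool list, and message are placeholders):

```python
from google import genai
from google.genai import types

client = genai.Client(api_key="...")           # placeholder key
my_tools: list = []                            # placeholder: the MCP tool declarations
latest_message = "Diagnose the failing pod"    # placeholder: newest user turn

# Upload the large, stable tool definitions once, cached for 10 minutes.
cache = client.caches.create(
    model="gemini-2.5-flash",
    config=types.CreateCachedContentConfig(tools=my_tools, ttl="600s"),
)

# Later calls reference the cache by name rather than re-sending the tools,
# so only the newest message travels with the request.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[latest_message],
    config=types.GenerateContentConfig(
        cached_content=cache.name,
        max_output_tokens=8000,
    ),
)
```

The `try/except` around cache creation in the diff means a cache failure degrades gracefully: the client falls back to passing `tools` directly in the generation config.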