diff --git a/README.md b/README.md
index 793d861..228808a 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ We've been writing blogs and sharing our learnings along the way. Check out our
 The SRE Agent supports the following LLM providers:
 
 ### Anthropic
-- **Models**: e.g. "claude-4-0-sonnet-latest"
+- **Models**: e.g. "claude-3-7-sonnet-latest"
 - **Setup**: Requires `ANTHROPIC_API_KEY`
 
 ### Google Gemini
@@ -233,3 +233,4 @@ Check out our blog posts for insights and updates:
 
 - [Bringing Agentic AI into the Real World](https://www.fuzzylabs.ai/blog-post/bringing-agentic-ai-into-the-real-world)
 - [How We're Building an Autonomous SRE with FastMCP](https://www.fuzzylabs.ai/blog-post/how-were-building-an-autonomous-sre-with-fastmcp)
+- [Can we trust an agent in Prod?](https://www.fuzzylabs.ai/blog-post/can-we-trust-an-agent-in-prod)
diff --git a/docs/credentials.md b/docs/credentials.md
index ed576ea..454c458 100644
--- a/docs/credentials.md
+++ b/docs/credentials.md
@@ -21,7 +21,7 @@ The following credentials must be retrieved prior to running the agent. These cr
 > **Note**: You only need to configure **one** LLM provider. Choose either Anthropic or Google Gemini and provide the corresponding API key.
 
 - **PROVIDER**: The LLM provider name (e.g., "anthropic", "google").
-- **MODEL**: The specific model name to use (e.g., "claude-3-5-sonnet-20241022", "gemini-1.5-pro").
+- **MODEL**: The specific model name to use (e.g., "claude-3-7-sonnet-latest", "gemini-2.5-flash").
 
 **Choose one of the following:**
 - **ANTHROPIC_API_KEY**: An API key for Anthropic Claude models *(required if using Anthropic provider)*.
diff --git a/setup_credentials.py b/setup_credentials.py
index 879068b..9937a84 100644
--- a/setup_credentials.py
+++ b/setup_credentials.py
@@ -91,7 +91,7 @@ def get_credential_config(platform: str) -> dict[str, dict[str, Any]]:
         },
         "MAX_TOKENS": {
             "prompt": "Controls the maximum number of tokens the LLM can generate in "
-            "its response e.g. 10000: ",
+            "its response e.g. 8000: ",
             "mask_value": False,
         },
         "DEV_BEARER_TOKEN": {
diff --git a/sre_agent/client/utils/schemas.py b/sre_agent/client/utils/schemas.py
index b4b4e48..5a6f2cf 100644
--- a/sre_agent/client/utils/schemas.py
+++ b/sre_agent/client/utils/schemas.py
@@ -68,7 +68,7 @@ class ClientConfig:
         default_factory=lambda: json.loads(os.getenv("TOOLS", "[]"))
     )
     model: str = os.getenv("LLM_MODEL", "claude-3-7-sonnet-latest")
-    max_tokens: int = 1000
+    max_tokens: int = 8000
     max_tool_retries: int = 3
     query_timeout: int = int(
         os.getenv("QUERY_TIMEOUT", DEFAULT_QUERY_TIMEOUT) or DEFAULT_QUERY_TIMEOUT
diff --git a/sre_agent/llm/utils/clients.py b/sre_agent/llm/utils/clients.py
index 05d3e9a..47f6350 100644
--- a/sre_agent/llm/utils/clients.py
+++ b/sre_agent/llm/utils/clients.py
@@ -9,6 +9,7 @@
 from anthropic.types import ToolParam
 from google import genai
 from google.genai import types
+from google.genai.types import CachedContent
 from pydantic import BaseModel
 from shared.logger import logger  # type: ignore
 from shared.schemas import (  # type: ignore
@@ -177,26 +178,49 @@ def __init__(self, settings: LLMSettings = LLMSettings()) -> None:
         """The constructor for the Gemini client."""
         super().__init__(settings)
         self.client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
+        self._cache: CachedContent | None = None
+
+    def cache_tools(self, tools: list[Any]) -> list[Any]:
+        """A method for adding a cache block to tools."""
+        if tools:
+            try:
+                from google.genai import types
+
+                config = types.CreateCachedContentConfig(
+                    tools=tools,
+                    ttl="600s",
+                )
+                self._cache = self.client.caches.create(
+                    model=self.settings.model, config=config
+                )
+            except Exception as e:
+                logger.warning(f"Failed to create Gemini cache: {e}")
+        return tools
 
     def generate(self, payload: TextGenerationPayload) -> Message:
         """A method for generating text using the Gemini API."""
         adapter = GeminiTextGenerationPayloadAdapter(payload)
 
         messages, tools = adapter.adapt()
 
-        if not self.settings.max_tokens:
-            raise ValueError("Max tokens configuration has not been set.")
+        cached_tools = self.cache_tools(tools)
+
+        # Use cache if available
+        config_kwargs = {"max_output_tokens": self.settings.max_tokens}
+        if self._cache:
+            config_kwargs["cached_content"] = self._cache.name
+            messages = [messages[-1]] if messages else []
+        else:
+            config_kwargs["tools"] = cached_tools
 
         response = self.client.models.generate_content(
             model=self.settings.model,
             contents=messages,
-            config=types.GenerateContentConfig(
-                tools=tools,
-                max_output_tokens=self.settings.max_tokens,
-            ),
+            config=types.GenerateContentConfig(**config_kwargs),
         )
 
         if response.usage_metadata:
+            # Log with cache information
             logger.info(
                 f"Token usage - Input: {response.usage_metadata.prompt_token_count}, "
                 f"Output: {response.usage_metadata.candidates_token_count}, "
@@ -219,7 +243,9 @@ def generate(self, payload: TextGenerationPayload) -> Message:
                 usage=Usage(
                     input_tokens=response.usage_metadata.prompt_token_count,
                     output_tokens=response.usage_metadata.candidates_token_count,
-                    cache_creation_input_tokens=None,
+                    cache_creation_input_tokens=response.usage_metadata.cache_creation_token_count
+                    if hasattr(response.usage_metadata, "cache_creation_token_count")
+                    else None,
                     cache_read_input_tokens=response.usage_metadata.cached_content_token_count,
                 )
                 if response.usage_metadata