10 changes: 5 additions & 5 deletions SCORERS.md
@@ -25,7 +25,7 @@ Evaluates whether the output is factually consistent with the expected answer.
- `input` (string): The input question or prompt
- `output` (string, required): The generated answer to evaluate
- `expected` (string, required): The ground truth answer
- `model` (string, optional): Model to use (default: configured via `init()` or "gpt-4o")
- `model` (string, optional): Model to use (default: configured via `init()` or "gpt-5-mini")
- `client` (Client, optional): Custom OpenAI client

**Score Range:** 0-1
@@ -209,7 +209,7 @@ Evaluates how relevant the retrieved context is to the input question.
- `input` (string, required): The question
- `output` (string, required): The generated answer
- `context` (string[] | string, required): Retrieved context passages
- `model` (string, optional): Model to use (default: "gpt-4o-mini")
- `model` (string, optional): Model to use (default: "gpt-5-nano")

**Score Range:** 0-1

@@ -600,7 +600,7 @@ Note: Interpretation varies by scorer type. Binary scorers (ExactMatch, ValidJSO

Many scorers share these common parameters:

- `model` (string): LLM model to use for evaluation (default: configured via `init()` or "gpt-4o")
- `model` (string): LLM model to use for evaluation (default: configured via `init()` or "gpt-5-mini")
- `client` (Client): Custom OpenAI-compatible client
- `use_cot` (boolean): Enable chain-of-thought reasoning for LLM scorers (default: true)
- `temperature` (number): LLM temperature setting
@@ -616,13 +616,13 @@ import OpenAI from "openai";

init({
client: new OpenAI({ apiKey: "..." }),
defaultModel: "gpt-4o",
defaultModel: "gpt-5-mini",
});
```

```python
from autoevals import init
from openai import OpenAI

init(OpenAI(api_key="..."), default_model="gpt-4o")
init(OpenAI(api_key="..."), default_model="gpt-5-mini")
```
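Once a default model is configured this way, the LLM-based scorers pick it up automatically. Below is a minimal TypeScript sketch of the end-to-end flow; it assumes `init` and `Factuality` are importable from the package root and that the snippet runs in an ES module where top-level `await` is allowed.

```typescript
import { init, Factuality } from "autoevals";
import OpenAI from "openai";

// Configure a shared client and default model once; LLM-based scorers
// then use "gpt-5-mini" unless a model is passed per call.
init({
  client: new OpenAI({ apiKey: "..." }),
  defaultModel: "gpt-5-mini",
});

const result = await Factuality({
  input: "Which country has the highest population?",
  output: "People's Republic of China",
  expected: "China",
  // model: "gpt-5-nano", // a per-call override would still take precedence
});

console.log(result.score); // a value between 0 and 1
```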
20 changes: 10 additions & 10 deletions js/llm.fixtures.ts
@@ -52,7 +52,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
id: "chatcmpl-B7XFw0OCpCbMVwLizRts3Cl72Obg0",
object: "chat.completion",
created: 1741135832,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -98,7 +98,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
id: "chatcmpl-B7YPU81s7cb2uzlwJ8w9aS5qhfhtJ",
object: "chat.completion",
created: 1741140268,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -141,7 +141,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
id: "chatcmpl-B7YQ9ILZ9DJR2AjY2s4qU15Rc6qII",
object: "chat.completion",
created: 1741140309,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -180,7 +180,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
id: "chatcmpl-B7YQa80DGu61zUWpdPtXRaJdRQz6l",
object: "chat.completion",
created: 1741140336,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -222,7 +222,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YSMVJ7qaQTJ9OtR6zPUEdHxrNbT",
object: "chat.completion",
created: 1741140446,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -265,7 +265,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YTPWIPOFpRcVOjEnU6s0kZXgPdB",
object: "chat.completion",
created: 1741140511,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -308,7 +308,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YU2qluNL0SenvL1zBiSzrka236n",
object: "chat.completion",
created: 1741140550,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -351,7 +351,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YUTk3771FhLlXQNZPaobEC0d8R6",
object: "chat.completion",
created: 1741140577,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -390,7 +390,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YUtrpit4RvQCeqfOcZme9L6pMAP",
object: "chat.completion",
created: 1741140603,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -432,7 +432,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YV8HHTm4hZU58Zp9gcjwp3MigEl",
object: "chat.completion",
created: 1741140618,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
4 changes: 2 additions & 2 deletions js/llm.test.ts
@@ -236,7 +236,7 @@ Issue Description: {{page_content}}
id: "chatcmpl-test",
object: "chat.completion",
created: 1234567890,
model: "gpt-4o",
model: "gpt-5-mini",
choices: [
{
index: 0,
@@ -294,7 +294,7 @@ Issue Description: {{page_content}}
id: "chatcmpl-test",
object: "chat.completion",
created: 1234567890,
model: "gpt-4o",
model: "gpt-5-mini",
choices: [
{
index: 0,
2 changes: 1 addition & 1 deletion js/llm.ts
@@ -69,7 +69,7 @@ export type LLMArgs = {
* The default model to use for LLM-based evaluations.
* @deprecated Use `init({ defaultModel: "..." })` to configure the default model instead.
*/
export const DEFAULT_MODEL = "gpt-4o";
export const DEFAULT_MODEL = "gpt-5-mini";

const PLAIN_RESPONSE_SCHEMA = {
properties: {
6 changes: 3 additions & 3 deletions js/oai.test.ts
@@ -261,8 +261,8 @@ describe("OAI", () => {
expect(Object.is(builtClient, otherClient)).toBe(true);
});

test("getDefaultModel returns gpt-4o by default", () => {
expect(getDefaultModel()).toBe("gpt-4o");
test("getDefaultModel returns gpt-5-mini by default", () => {
expect(getDefaultModel()).toBe("gpt-5-mini");
});

test("init sets default model", () => {
@@ -275,7 +275,7 @@ describe("OAI", () => {
expect(getDefaultModel()).toBe("claude-3-5-sonnet-20241022");

init({ defaultModel: undefined });
expect(getDefaultModel()).toBe("gpt-4o");
expect(getDefaultModel()).toBe("gpt-5-mini");
});

test("init can set both client and default model", () => {
6 changes: 3 additions & 3 deletions js/oai.ts
@@ -163,7 +163,7 @@ export interface InitOptions {
client?: OpenAI;
/**
* The default model to use for evaluations when not specified per-call.
* Defaults to "gpt-4o" if not set.
* Defaults to "gpt-5-mini" if not set.
*
* When using non-OpenAI providers via the Braintrust proxy, set this to
* the appropriate model string (e.g., "claude-3-5-sonnet-20241022").
@@ -200,10 +200,10 @@ export const init = ({ client, defaultModel }: InitOptions = {}) => {
};

/**
* Get the configured default model, or "gpt-4o" if not set.
* Get the configured default model, or "gpt-5-mini" if not set.
*/
export const getDefaultModel = (): string => {
return globalThis.__defaultModel ?? "gpt-4o";
return globalThis.__defaultModel ?? "gpt-5-mini";
};

export async function cachedChatCompletion(
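To make the new fallback concrete, here is a short sketch of the behavior exercised by the tests above; it assumes `init` and `getDefaultModel` are re-exported from the package root, as they are exported from `js/oai.ts`.

```typescript
import { init, getDefaultModel } from "autoevals";

console.log(getDefaultModel()); // "gpt-5-mini" when nothing has been configured

init({ defaultModel: "claude-3-5-sonnet-20241022" });
console.log(getDefaultModel()); // "claude-3-5-sonnet-20241022"

// Clearing the configured default falls back to the built-in "gpt-5-mini".
init({ defaultModel: undefined });
console.log(getDefaultModel()); // "gpt-5-mini"
```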
7 changes: 3 additions & 4 deletions js/ragas.test.ts
@@ -59,7 +59,6 @@ test("Ragas generation test", async () => {
output: data.output,
expected: data.expected,
context: data.context,
temperature: 0,
});

if (score === 1) {
@@ -119,7 +118,7 @@ describe("ContextRelevancy score clamping", () => {
id: "chatcmpl-test",
object: "chat.completion",
created: Date.now(),
model: "gpt-4o",
model: "gpt-5-mini",
choices: [
{
index: 0,
@@ -184,7 +183,7 @@ describe("ContextRelevancy score clamping", () => {
id: "chatcmpl-test",
object: "chat.completion",
created: Date.now(),
model: "gpt-4o",
model: "gpt-5-mini",
choices: [
{
index: 0,
@@ -264,7 +263,7 @@ describe("AnswerCorrectness custom embedding model", () => {
id: "test-id",
object: "chat.completion",
created: Date.now(),
model: "gpt-4o",
model: "gpt-5-mini",
choices: [
{
index: 0,
14 changes: 9 additions & 5 deletions js/ragas.ts
@@ -390,10 +390,12 @@ export const ContextRecall: ScorerWithPartial<string, RagasArgs> = makePartial(
return {
name: "ContextRecall",
score:
statements.statements.reduce(
(acc, { attributed }) => acc + attributed,
0,
) / statements.statements.length,
statements.statements.length > 0
? statements.statements.reduce(
(acc, { attributed }) => acc + attributed,
0,
) / statements.statements.length
: 0,
metadata: {
statements: statements.statements,
},
@@ -983,8 +985,10 @@ function parseArgs(args: ScorerArgs<string, RagasArgs>): {
"messages"
> = {
model: args.model ?? getDefaultModel(),
temperature: args.temperature ?? 0,
};
if (args.temperature !== undefined) {
chatArgs.temperature = args.temperature;
}
if (args.maxTokens) {
chatArgs.max_tokens = args.maxTokens;
}
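Two behavioral notes on the `js/ragas.ts` changes: the `ContextRecall` score now guards against an empty statement list instead of dividing by zero, and `parseArgs` no longer forces `temperature: 0`, so the model's own default sampling applies unless a caller sets a temperature explicitly. A standalone sketch of the guarded average is below (hypothetical helper name, not part of the library):

```typescript
// Average of the `attributed` flags, falling back to 0 for an empty list
// instead of dividing by zero and producing NaN.
function attributedFraction(statements: { attributed: number }[]): number {
  return statements.length > 0
    ? statements.reduce((acc, s) => acc + s.attributed, 0) / statements.length
    : 0;
}

console.log(attributedFraction([]));                                     // 0
console.log(attributedFraction([{ attributed: 1 }, { attributed: 0 }])); // 0.5
```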
4 changes: 2 additions & 2 deletions py/autoevals/llm.py
@@ -3,7 +3,7 @@
This module provides a collection of pre-built LLM scorers for common evaluation tasks.

All evaluators accept the following common arguments:
- model: Model to use (defaults to gpt-4o)
- model: Model to use (defaults to gpt-5-mini)
- temperature: Controls randomness (0-1). If not specified, uses the model's default.
- max_tokens: Maximum tokens to generate. If not specified, uses the model's default.
- client: OpenAI client (defaults to global client from init())
@@ -79,7 +79,7 @@
)

# Deprecated: Use init(default_model="...") to configure the default model instead.
DEFAULT_MODEL = "gpt-4o"
DEFAULT_MODEL = "gpt-5-mini"

PLAIN_RESPONSE_SCHEMA = {
"properties": {"choice": {"description": "The choice", "title": "Choice", "type": "string"}},
6 changes: 3 additions & 3 deletions py/autoevals/oai.py
@@ -254,7 +254,7 @@ def init(client: Client | None = None, is_async: bool = False, default_model: st
is_async: Whether to create a client with async operations. Defaults to False.
Deprecated: Use the `client` argument directly with your desired async/sync configuration.
default_model: The default model to use for evaluations when not specified per-call.
Defaults to "gpt-4o" if not set. When using non-OpenAI providers via the Braintrust
Defaults to "gpt-5-mini" if not set. When using non-OpenAI providers via the Braintrust
proxy, set this to the appropriate model string (e.g., "claude-3-5-sonnet-20241022").

Example:
@@ -284,8 +284,8 @@


def get_default_model() -> str:
"""Get the configured default model, or "gpt-4o" if not set."""
return _default_model_var.get(None) or "gpt-4o"
"""Get the configured default model, or "gpt-5-mini" if not set."""
return _default_model_var.get(None) or "gpt-5-mini"


warned_deprecated_api_key_base_url = False
10 changes: 5 additions & 5 deletions py/autoevals/ragas.py
@@ -17,7 +17,7 @@

**Common arguments**:

- `model`: Model to use for evaluation, defaults to the model configured via init(default_model=...) or "gpt-4o"
- `model`: Model to use for evaluation, defaults to the model configured via init(default_model=...) or "gpt-5-mini"
- `client`: Optional Client for API calls. If not provided, uses global client from init()

**Example - Direct usage**:
@@ -124,8 +124,8 @@ def check_required(name, **kwargs):


# Deprecated: Use init(default_model="...") to configure the default model instead.
# This was previously "gpt-4o-mini" but now defaults to the configured model.
DEFAULT_RAGAS_MODEL = "gpt-4o-mini"
# This was previously "gpt-4o-mini" but now defaults to the configured model.
DEFAULT_RAGAS_MODEL = "gpt-5-nano"


def _get_model(model: str | None) -> str:
@@ -138,7 +138,7 @@ def _get_model(model: str | None) -> str:
return model

# Check if user configured a custom default via init(default_model=...)
# If they did (even if it's "gpt-4o"), respect it for consistency
# If they did (even if it's "gpt-5-mini"), respect it for consistency
configured_default = _default_model_var.get(None)
if configured_default is not None:
return configured_default
@@ -559,7 +559,7 @@ def _postprocess(self, response):

return Score(
name=self._name(),
score=ones / total,
score=ones / total if total > 0 else 0,
metadata={
"statements": statements,
"recall": statements,
10 changes: 5 additions & 5 deletions py/autoevals/test_llm.py
@@ -176,7 +176,7 @@ def test_factuality():
}
],
"created": 1734029028,
"model": "gpt-4o-2024-08-06",
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"system_fingerprint": "fp_cc5cf1c6e3",
"usage": {
@@ -232,7 +232,7 @@ def test_factuality_client():
}
],
"created": 1734029028,
"model": "gpt-4o-2024-08-06",
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"system_fingerprint": "fp_cc5cf1c6e3",
"usage": {
@@ -297,7 +297,7 @@ def test_init_client():
}
],
"created": 1734029028,
"model": "gpt-4o-2024-08-06",
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"system_fingerprint": "fp_cc5cf1c6e3",
"usage": {
@@ -373,7 +373,7 @@ def capture_request(request):
"id": "chatcmpl-test",
"object": "chat.completion",
"created": 1234567890,
"model": "gpt-4o",
"model": "gpt-5-mini",
"choices": [
{
"index": 0,
@@ -429,7 +429,7 @@ def capture_request(request):
"id": "chatcmpl-test",
"object": "chat.completion",
"created": 1234567890,
"model": "gpt-4o",
"model": "gpt-5-mini",
"choices": [
{
"index": 0,