10 changes: 5 additions & 5 deletions SCORERS.md
@@ -25,7 +25,7 @@ Evaluates whether the output is factually consistent with the expected answer.
- `input` (string): The input question or prompt
- `output` (string, required): The generated answer to evaluate
- `expected` (string, required): The ground truth answer
- `model` (string, optional): Model to use (default: configured via `init()` or "gpt-4o")
- `model` (string, optional): Model to use (default: configured via `init()` or "gpt-5-mini")
- `client` (Client, optional): Custom OpenAI client

**Score Range:** 0-1
@@ -209,7 +209,7 @@ Evaluates how relevant the retrieved context is to the input question.
- `input` (string, required): The question
- `output` (string, required): The generated answer
- `context` (string[] | string, required): Retrieved context passages
- `model` (string, optional): Model to use (default: "gpt-4o-mini")
- `model` (string, optional): Model to use (default: "gpt-5-nano")

**Score Range:** 0-1

@@ -600,7 +600,7 @@ Note: Interpretation varies by scorer type. Binary scorers (ExactMatch, ValidJSO

Many scorers share these common parameters:

- `model` (string): LLM model to use for evaluation (default: configured via `init()` or "gpt-4o")
- `model` (string): LLM model to use for evaluation (default: configured via `init()` or "gpt-5-mini")
- `client` (Client): Custom OpenAI-compatible client
- `use_cot` (boolean): Enable chain-of-thought reasoning for LLM scorers (default: true)
- `temperature` (number): LLM temperature setting
@@ -616,13 +616,13 @@ import OpenAI from "openai";

init({
client: new OpenAI({ apiKey: "..." }),
defaultModel: "gpt-4o",
defaultModel: "gpt-5-mini",
});
```

```python
from autoevals import init
from openai import OpenAI

init(OpenAI(api_key="..."), default_model="gpt-4o")
init(OpenAI(api_key="..."), default_model="gpt-5-mini")
```
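Once a default model is configured this way, the LLM-based scorers pick it up automatically. Below is a minimal TypeScript sketch of the end-to-end flow; it assumes `init` and `Factuality` are importable from the package root and that the snippet runs in an ES module where top-level `await` is allowed.

```typescript
import { init, Factuality } from "autoevals";
import OpenAI from "openai";

// Configure a shared client and default model once; LLM-based scorers
// then use "gpt-5-mini" unless a model is passed per call.
init({
  client: new OpenAI({ apiKey: "..." }),
  defaultModel: "gpt-5-mini",
});

const result = await Factuality({
  input: "Which country has the highest population?",
  output: "People's Republic of China",
  expected: "China",
  // model: "gpt-5-nano", // a per-call override would still take precedence
});

console.log(result.score); // a value between 0 and 1
```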
20 changes: 10 additions & 10 deletions js/llm.fixtures.ts
@@ -52,7 +52,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
id: "chatcmpl-B7XFw0OCpCbMVwLizRts3Cl72Obg0",
object: "chat.completion",
created: 1741135832,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -98,7 +98,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
id: "chatcmpl-B7YPU81s7cb2uzlwJ8w9aS5qhfhtJ",
object: "chat.completion",
created: 1741140268,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -141,7 +141,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
id: "chatcmpl-B7YQ9ILZ9DJR2AjY2s4qU15Rc6qII",
object: "chat.completion",
created: 1741140309,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -180,7 +180,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
id: "chatcmpl-B7YQa80DGu61zUWpdPtXRaJdRQz6l",
object: "chat.completion",
created: 1741140336,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -222,7 +222,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YSMVJ7qaQTJ9OtR6zPUEdHxrNbT",
object: "chat.completion",
created: 1741140446,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -265,7 +265,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YTPWIPOFpRcVOjEnU6s0kZXgPdB",
object: "chat.completion",
created: 1741140511,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -308,7 +308,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YU2qluNL0SenvL1zBiSzrka236n",
object: "chat.completion",
created: 1741140550,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -351,7 +351,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YUTk3771FhLlXQNZPaobEC0d8R6",
object: "chat.completion",
created: 1741140577,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -390,7 +390,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YUtrpit4RvQCeqfOcZme9L6pMAP",
object: "chat.completion",
created: 1741140603,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
@@ -432,7 +432,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YV8HHTm4hZU58Zp9gcjwp3MigEl",
object: "chat.completion",
created: 1741140618,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
4 changes: 2 additions & 2 deletions js/llm.test.ts
@@ -236,7 +236,7 @@ Issue Description: {{page_content}}
id: "chatcmpl-test",
object: "chat.completion",
created: 1234567890,
model: "gpt-4o",
model: "gpt-5-mini",
choices: [
{
index: 0,
@@ -294,7 +294,7 @@ Issue Description: {{page_content}}
id: "chatcmpl-test",
object: "chat.completion",
created: 1234567890,
model: "gpt-4o",
model: "gpt-5-mini",
choices: [
{
index: 0,
2 changes: 1 addition & 1 deletion js/llm.ts
@@ -69,7 +69,7 @@ export type LLMArgs = {
* The default model to use for LLM-based evaluations.
* @deprecated Use `init({ defaultModel: "..." })` to configure the default model instead.
*/
export const DEFAULT_MODEL = "gpt-4o";
export const DEFAULT_MODEL = "gpt-5-mini";

const PLAIN_RESPONSE_SCHEMA = {
properties: {
6 changes: 3 additions & 3 deletions js/oai.test.ts
@@ -261,8 +261,8 @@ describe("OAI", () => {
expect(Object.is(builtClient, otherClient)).toBe(true);
});

test("getDefaultModel returns gpt-4o by default", () => {
expect(getDefaultModel()).toBe("gpt-4o");
test("getDefaultModel returns gpt-5-mini by default", () => {
expect(getDefaultModel()).toBe("gpt-5-mini");
});

test("init sets default model", () => {
@@ -275,7 +275,7 @@ describe("OAI", () => {
expect(getDefaultModel()).toBe("claude-3-5-sonnet-20241022");

init({ defaultModel: undefined });
expect(getDefaultModel()).toBe("gpt-4o");
expect(getDefaultModel()).toBe("gpt-5-mini");
});

test("init can set both client and default model", () => {
6 changes: 3 additions & 3 deletions js/oai.ts
@@ -163,7 +163,7 @@ export interface InitOptions {
client?: OpenAI;
/**
* The default model to use for evaluations when not specified per-call.
* Defaults to "gpt-4o" if not set.
* Defaults to "gpt-5-mini" if not set.
*
* When using non-OpenAI providers via the Braintrust proxy, set this to
* the appropriate model string (e.g., "claude-3-5-sonnet-20241022").
@@ -200,10 +200,10 @@ export const init = ({ client, defaultModel }: InitOptions = {}) => {
};

/**
* Get the configured default model, or "gpt-4o" if not set.
* Get the configured default model, or "gpt-5-mini" if not set.
*/
export const getDefaultModel = (): string => {
return globalThis.__defaultModel ?? "gpt-4o";
return globalThis.__defaultModel ?? "gpt-5-mini";
};

export async function cachedChatCompletion(
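To make the new fallback concrete, here is a short sketch of the behavior exercised by the tests above; it assumes `init` and `getDefaultModel` are re-exported from the package root, as they are exported from `js/oai.ts`.

```typescript
import { init, getDefaultModel } from "autoevals";

console.log(getDefaultModel()); // "gpt-5-mini" when nothing has been configured

init({ defaultModel: "claude-3-5-sonnet-20241022" });
console.log(getDefaultModel()); // "claude-3-5-sonnet-20241022"

// Clearing the configured default falls back to the built-in "gpt-5-mini".
init({ defaultModel: undefined });
console.log(getDefaultModel()); // "gpt-5-mini"
```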
7 changes: 3 additions & 4 deletions js/ragas.test.ts
@@ -59,7 +59,6 @@ test("Ragas generation test", async () => {
output: data.output,
expected: data.expected,
context: data.context,
temperature: 0,
});

if (score === 1) {
@@ -119,7 +118,7 @@ describe("ContextRelevancy score clamping", () => {
id: "chatcmpl-test",
object: "chat.completion",
created: Date.now(),
model: "gpt-4o",
model: "gpt-5-mini",
choices: [
{
index: 0,
@@ -184,7 +183,7 @@ describe("ContextRelevancy score clamping", () => {
id: "chatcmpl-test",
object: "chat.completion",
created: Date.now(),
model: "gpt-4o",
model: "gpt-5-mini",
choices: [
{
index: 0,
@@ -264,7 +263,7 @@ describe("AnswerCorrectness custom embedding model", () => {
id: "test-id",
object: "chat.completion",
created: Date.now(),
model: "gpt-4o",
model: "gpt-5-mini",
choices: [
{
index: 0,
14 changes: 9 additions & 5 deletions js/ragas.ts
@@ -390,10 +390,12 @@ export const ContextRecall: ScorerWithPartial<string, RagasArgs> = makePartial(
return {
name: "ContextRecall",
score:
statements.statements.reduce(
(acc, { attributed }) => acc + attributed,
0,
) / statements.statements.length,
statements.statements.length > 0
? statements.statements.reduce(
(acc, { attributed }) => acc + attributed,
0,
) / statements.statements.length
: 0,
metadata: {
statements: statements.statements,
},
@@ -983,8 +985,10 @@ function parseArgs(args: ScorerArgs<string, RagasArgs>): {
"messages"
> = {
model: args.model ?? getDefaultModel(),
temperature: args.temperature ?? 0,
};
if (args.temperature !== undefined) {
chatArgs.temperature = args.temperature;
}
if (args.maxTokens) {
chatArgs.max_tokens = args.maxTokens;
}
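Two behavioral notes on the `js/ragas.ts` changes: the `ContextRecall` score now guards against an empty statement list instead of dividing by zero, and `parseArgs` no longer forces `temperature: 0`, so the model's own default sampling applies unless a caller sets a temperature explicitly. A standalone sketch of the guarded average is below (hypothetical helper name, not part of the library):

```typescript
// Average of the `attributed` flags, falling back to 0 for an empty list
// instead of dividing by zero and producing NaN.
function attributedFraction(statements: { attributed: number }[]): number {
  return statements.length > 0
    ? statements.reduce((acc, s) => acc + s.attributed, 0) / statements.length
    : 0;
}

console.log(attributedFraction([]));                                     // 0
console.log(attributedFraction([{ attributed: 1 }, { attributed: 0 }])); // 0.5
```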
4 changes: 2 additions & 2 deletions py/autoevals/llm.py
@@ -3,7 +3,7 @@
This module provides a collection of pre-built LLM scorers for common evaluation tasks.

All evaluators accept the following common arguments:
- model: Model to use (defaults to gpt-4o)
- model: Model to use (defaults to gpt-5-mini)
- temperature: Controls randomness (0-1). If not specified, uses the model's default.
- max_tokens: Maximum tokens to generate. If not specified, uses the model's default.
- client: OpenAI client (defaults to global client from init())
@@ -79,7 +79,7 @@
)

# Deprecated: Use init(default_model="...") to configure the default model instead.
DEFAULT_MODEL = "gpt-4o"
DEFAULT_MODEL = "gpt-5-mini"

PLAIN_RESPONSE_SCHEMA = {
"properties": {"choice": {"description": "The choice", "title": "Choice", "type": "string"}},
6 changes: 3 additions & 3 deletions py/autoevals/oai.py
@@ -254,7 +254,7 @@ def init(client: Client | None = None, is_async: bool = False, default_model: st
is_async: Whether to create a client with async operations. Defaults to False.
Deprecated: Use the `client` argument directly with your desired async/sync configuration.
default_model: The default model to use for evaluations when not specified per-call.
Defaults to "gpt-4o" if not set. When using non-OpenAI providers via the Braintrust
Defaults to "gpt-5-mini" if not set. When using non-OpenAI providers via the Braintrust
proxy, set this to the appropriate model string (e.g., "claude-3-5-sonnet-20241022").

Example:
@@ -284,8 +284,8 @@


def get_default_model() -> str:
"""Get the configured default model, or "gpt-4o" if not set."""
return _default_model_var.get(None) or "gpt-4o"
"""Get the configured default model, or "gpt-5-mini" if not set."""
return _default_model_var.get(None) or "gpt-5-mini"


warned_deprecated_api_key_base_url = False
10 changes: 5 additions & 5 deletions py/autoevals/ragas.py
@@ -17,7 +17,7 @@

**Common arguments**:

- `model`: Model to use for evaluation, defaults to the model configured via init(default_model=...) or "gpt-4o"
- `model`: Model to use for evaluation, defaults to the model configured via init(default_model=...) or "gpt-5-mini"
- `client`: Optional Client for API calls. If not provided, uses global client from init()

**Example - Direct usage**:
@@ -124,8 +124,8 @@ def check_required(name, **kwargs):


# Deprecated: Use init(default_model="...") to configure the default model instead.
# This was previously "gpt-4o-mini" but now defaults to the configured model.
DEFAULT_RAGAS_MODEL = "gpt-4o-mini"
# This was previously "gpt-4o-mini" but now defaults to the configured model.
DEFAULT_RAGAS_MODEL = "gpt-5-nano"


def _get_model(model: str | None) -> str:
@@ -138,7 +138,7 @@ def _get_model(model: str | None) -> str:
return model

# Check if user configured a custom default via init(default_model=...)
# If they did (even if it's "gpt-4o"), respect it for consistency
# If they did (even if it's "gpt-5-mini"), respect it for consistency
configured_default = _default_model_var.get(None)
if configured_default is not None:
return configured_default
@@ -559,7 +559,7 @@ def _postprocess(self, response):

return Score(
name=self._name(),
score=ones / total,
score=ones / total if total > 0 else 0,
metadata={
"statements": statements,
"recall": statements,
10 changes: 5 additions & 5 deletions py/autoevals/test_llm.py
@@ -176,7 +176,7 @@ def test_factuality():
}
],
"created": 1734029028,
"model": "gpt-4o-2024-08-06",
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"system_fingerprint": "fp_cc5cf1c6e3",
"usage": {
@@ -232,7 +232,7 @@ def test_factuality_client():
}
],
"created": 1734029028,
"model": "gpt-4o-2024-08-06",
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"system_fingerprint": "fp_cc5cf1c6e3",
"usage": {
@@ -297,7 +297,7 @@ def test_init_client():
}
],
"created": 1734029028,
"model": "gpt-4o-2024-08-06",
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"system_fingerprint": "fp_cc5cf1c6e3",
"usage": {
@@ -373,7 +373,7 @@ def capture_request(request):
"id": "chatcmpl-test",
"object": "chat.completion",
"created": 1234567890,
"model": "gpt-4o",
"model": "gpt-5-mini",
"choices": [
{
"index": 0,
@@ -429,7 +429,7 @@ def capture_request(request):
"id": "chatcmpl-test",
"object": "chat.completion",
"created": 1234567890,
"model": "gpt-4o",
"model": "gpt-5-mini",
"choices": [
{
"index": 0,