braintrustdata · ibolmo · Feb 6, 2026 · Feb 6, 2026
diff --git a/.github/workflows/js.yaml b/.github/workflows/js.yaml
@@ -643,3 +643,76 @@ jobs:
             kill $(cat mock-server.pid) || true
             rm mock-server.pid
           fi
+
+  framework-tests:  # runs `make test` in js/framework-tests against the packed SDK tarball and a local mock API
+    needs: build  # the artifact downloaded below is named from this job's outputs
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      fail-fast: false  # let the remaining matrix combinations finish when one fails
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+        node-version: [20, 22]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: ${{ matrix.node-version }}
+
+      - uses: pnpm/action-setup@v4
+
+      - name: Download build artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ needs.build.outputs.artifact-name }}-${{ matrix.node-version }}-dist  # one artifact per Node version
+          path: js/artifacts  # the test step globs braintrust-*.tgz from this directory
+          run-id: ${{ github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Start mock API server
+        working-directory: js/test-server
+        shell: bash
+        run: |
+          node mock-braintrust-api.js &
+          echo $! > mock-server.pid
+          # Poll /version once per second, up to 30 attempts. Fail this step if the server
+          # never answers, so a dead mock server is reported here and not as test failures.
+          for i in {1..30}; do
+            if curl -s http://localhost:8001/version > /dev/null; then echo "Mock server is ready"; exit 0; fi
+            sleep 1
+          done
+          echo "Error: mock server did not become ready after 30 attempts" >&2
+          exit 1
+
+      - name: Run framework tests
+        working-directory: js/framework-tests
+        shell: bash
+        env:
+          BRAINTRUST_TAR: ${{ github.workspace }}/js/artifacts/braintrust-*.tgz
+          BRAINTRUST_API_KEY: fake-test-key-framework-tests
+          BRAINTRUST_API_URL: http://localhost:8001
+          BRAINTRUST_APP_URL: http://localhost:8001
+        run: |
+          # BRAINTRUST_TAR is set above as a glob pattern. Pick the first file it
+          # matches, stop with an error when nothing matches, and re-export the
+          # concrete path so `make test` (and the Makefiles it invokes) see a real
+          # file name instead of a pattern.
+          resolved_tar=$(ls $BRAINTRUST_TAR | head -n 1)
+          [ -n "$resolved_tar" ] || { echo "Error: No tarball found matching $BRAINTRUST_TAR"; exit 1; }
+          echo "Using tarball: $resolved_tar"
+          export BRAINTRUST_TAR="$resolved_tar"
+          make test
+
+      - name: Stop mock API server
+        if: always()
+        working-directory: js/test-server
+        shell: bash
+        run: |
+          # Nothing to stop when no pid file was written by the start step.
+          [ -f mock-server.pid ] || exit 0
+          kill $(cat mock-server.pid) || true
+          rm mock-server.pid
diff --git a/js/framework-tests/Makefile b/js/framework-tests/Makefile
@@ -0,0 +1,35 @@
+.PHONY: test list clean
+
+# Auto-discover scenarios: every direct subdirectory of scenarios/ that contains a Makefile.
+# Make built-ins replace the find/sed/sort pipeline, so no external tools run at parse time; $(sort) also de-duplicates.
+SCENARIOS := $(sort $(patsubst scenarios/%/Makefile,%,$(wildcard scenarios/*/Makefile)))
+
+test:  ## Run `test` in each scenario; keep going after a failure and exit 1 if any scenario failed
+	@echo "==> Running all framework test scenarios"
+	@status=0; \
+	for name in $(SCENARIOS); do \
+		echo ""; \
+		echo "==> Running framework test scenario: $$name"; \
+		echo ""; \
+		if ! $(MAKE) -C scenarios/$$name test; then \
+			status=1; \
+		fi; \
+	done; \
+	echo ""; \
+	if [ $$status -ne 0 ]; then \
+		echo "✗ Some framework test scenarios failed"; \
+		exit 1; \
+	fi; \
+	echo "✓ All framework test scenarios passed"
+
+# Print a heading followed by one "  - <name>" line per entry in SCENARIOS.
+# Nothing is built or executed; this only reports what `make test` would iterate over.
+list:
+	@echo "Available framework test scenarios:"
+	@for name in $(SCENARIOS); do echo "  - $$name"; done
+
+clean:  ## Run `clean` in every scenario; all are attempted, and a failure in any of them fails this target
+	@echo "==> Cleaning all framework test scenarios"
+	@status=0; for name in $(SCENARIOS); do \
+		$(MAKE) -C scenarios/$$name clean || status=1; \
+	done; exit $$status
diff --git a/js/framework-tests/README.md b/js/framework-tests/README.md
@@ -0,0 +1,142 @@
+# Braintrust Framework Tests
+
+Test suite for using the Braintrust `Eval()` API directly across different JavaScript/TypeScript frameworks and runtimes.
+
+## Quick Reference
+
+```bash
+# From sdk/js/framework-tests/:
+make test                     # Run all scenarios
+make list                     # List available scenarios
+make clean                    # Clean all scenarios
+cd scenarios/vitest && make test  # Run specific scenario
+
+# From a specific scenario:
+cd scenarios/vitest
+make test                     # Auto-creates tarball if needed
+make clean                    # Remove artifacts
+```
+
+## Purpose
+
+This test suite verifies that the Braintrust SDK's `Eval()` API works correctly when used directly (not via CLI) in various JavaScript/TypeScript frameworks and runtimes. These tests cover the customer use cases where they:
+
+- Want to run evals in their existing test framework (Jest, Vitest)
+- Need better ESM/TypeScript support than the CLI provides (vite-node, tsx)
+- Are using alternative runtimes (Deno)
+
+## Distinction from CLI Tests
+
+- **CLI Tests** (`sdk/js/cli-tests/`): Test the `braintrust eval` CLI command
+- **Framework Tests** (this directory): Test calling `Eval()` directly in code
+
+## Structure
+
+Tests are organized into scenarios under `scenarios/`:
+
+```
+scenarios/
+├── vitest/          # Eval() in Vitest test runner
+├── jest/            # Eval() in Jest test runner
+├── vite-node/       # Eval() via vite-node execution
+├── tsx/             # Eval() via tsx execution
+└── deno/            # Eval() in Deno runtime
+```
+
+Each scenario is an independent package that:
+
+- Installs the built SDK tarball
+- Runs framework-specific commands to execute evals
+- Validates that `Eval()` works correctly with a mock API server
+
+## Creating a New Scenario
+
+### Requirements
+
+- Makefile with `setup`, `test`, and `clean` targets (the top-level `make test` and `make clean` invoke `test` and `clean` in every scenario)
+- package.json or runtime config (deno.json)
+- README.md explaining the use case (15-25 lines)
+- .gitignore (ignore artifacts, track lock files)
+- Test files that call `Eval()` directly
+- `.tool-versions` file (if scenario requires non-Node runtimes)
+- POSIX shell syntax (`[ ]` not `[[ ]]`)
+
+### Example Scenario Structure
+
+```
+scenarios/my-framework/
+├── Makefile           # setup, test, and clean targets
+├── package.json       # Dependencies
+├── .tool-versions     # Optional: for non-Node runtimes
+├── tests/
+│   └── my-test.ts     # Calls Eval() directly
+├── .gitignore
+└── README.md
+```
+
+### Example Test File
+
+```typescript
+import { Eval } from "braintrust";
+
+// In test framework (Jest/Vitest):
+test("my eval", async () => {
+  await Eval("Test Name", {
+    data: () => [{ input: "test", expected: "test" }],
+    task: async (input: string) => input,
+    scores: [
+      /* ... */
+    ],
+  });
+});
+
+// Or standalone (vite-node/tsx/deno):
+Eval("Test Name", {
+  data: () => [{ input: "test", expected: "test" }],
+  task: async (input: string) => input,
+  scores: [
+    /* ... */
+  ],
+});
+```
+
+## Design Principles
+
+- **Direct Eval() calls**: All tests call `Eval()` directly, not via CLI
+- **Mock API server**: Tests send logs to a mock server on localhost:8001
+- **Framework-native execution**: Use the framework's natural command (jest, vitest, vite-node, tsx, deno)
+- **Well-known tarball paths**: Use `braintrust-latest.tgz` not version-specific paths
+- **No workarounds**: Expose real issues with framework integration
+- **Track lock files**: Commit lock files to detect dependency changes
+- **Makefiles are source of truth**: No npm scripts for test commands
+- **POSIX shell syntax**: Use `[ ]` not `[[ ]]` for portability
+
+## Environment Variables
+
+- **`BRAINTRUST_TAR`**: Path to braintrust tarball (auto-created if not set)
+
+### CI Environment Variables
+
+In CI, these are set to use the mock API server:
+
+- **`BRAINTRUST_API_KEY`**: Set to `fake-test-key-framework-tests`
+- **`BRAINTRUST_API_URL`**: Set to `http://localhost:8001`
+- **`BRAINTRUST_APP_URL`**: Set to `http://localhost:8001`
+
+A lightweight mock server runs on port 8001 during CI tests to handle all API calls, ensuring tests behave realistically while preventing production API hits.
+
+### Project Names
+
+Each scenario uses a unique project name as the first argument to `Eval()` (e.g., `test-framework-vitest`). This ensures test results are isolated and easily identifiable.
+
+## CI Integration
+
+Framework tests run in `sdk/.github/workflows/js.yaml` as a separate job alongside CLI tests.
+
+## Reference Scenarios
+
+- **vitest**: Running evals in Vitest test files (customer use case: reuse test utils)
+- **jest**: Running evals in Jest test files (most popular test framework)
+- **vite-node**: Running evals with vite-node for better ESM support (customer workaround)
+- **tsx**: Running evals with tsx for fast TypeScript execution
+- **deno**: Running evals in Deno runtime
diff --git a/js/framework-tests/scenarios/deno/.gitignore b/js/framework-tests/scenarios/deno/.gitignore
@@ -0,0 +1,4 @@
+node_modules/
+braintrust-latest.tgz
+deno.lock
+package-lock.json
diff --git a/js/framework-tests/scenarios/deno/.tool-versions b/js/framework-tests/scenarios/deno/.tool-versions
@@ -0,0 +1 @@
+deno 2.6.8
diff --git a/js/framework-tests/scenarios/deno/Makefile b/js/framework-tests/scenarios/deno/Makefile
@@ -0,0 +1,53 @@
+.PHONY: setup test clean
+
+# Default tarball: build and pack the SDK from js/ (three levels up) on first use. `?=` keeps this lazy and skips it
+# when BRAINTRUST_TAR is already set; $(eval ...) caches the path so the build runs once, not once per reference.
+BRAINTRUST_TAR ?= $(eval BRAINTRUST_TAR := $(shell \
+	cd ../../.. && pnpm exec turbo build --filter=braintrust >/dev/null 2>&1 && \
+	mkdir -p artifacts && VERSION=$$(node -p "require('./package.json').version") && \
+	pnpm pack --pack-destination artifacts >/dev/null 2>&1 && \
+	echo "$$PWD/artifacts/braintrust-$$VERSION.tgz"))$(BRAINTRUST_TAR)
+
+# Stage the SDK tarball as braintrust-latest.tgz (the file path package.json depends on),
+# aborting with an error when BRAINTRUST_TAR does not point at an existing file.
+# Then run `mise install`, `npm install`, and `deno install` (through mise), in that order.
+setup:
+	@echo "==> Setting up eval-deno scenario"
+	@[ -f "$(BRAINTRUST_TAR)" ] || { echo "Error: Tarball not found at $(BRAINTRUST_TAR)"; exit 1; }
+	@cp "$(BRAINTRUST_TAR)" braintrust-latest.tgz
+	@echo "==> Installing Deno via mise"
+	@mise install
+	@echo "==> Installing dependencies with npm (for Deno node_modules)"
+	@npm install
+	@echo "==> Installing Deno dependencies"
+	@mise exec -- deno install
+
+# Run every *.eval.ts file found under tests/ (recursively, in sorted order) with Deno via mise.
+# All files are attempted even after a failure; the exit status is 1 if any run failed, else 0.
+test: setup
+	@echo "==> Running eval-deno CLI tests"
+	@echo ""
+	@status=0; \
+	index=0; \
+	files=$$(find tests -name '*.eval.ts' | sort); \
+	total=$$(echo "$$files" | wc -l | tr -d ' '); \
+	for file in $$files; do \
+		index=$$((index + 1)); \
+		echo "  [$$index/$$total] Testing $$(basename "$$file" .eval.ts)..."; \
+		if ! mise exec -- deno run --allow-env --allow-read --allow-net "$$file"; then \
+			echo "    ✗ Failed"; \
+			status=1; \
+		else \
+			echo "    ✓ Passed"; \
+		fi; \
+		echo ""; \
+	done; \
+	case $$status in \
+		0) echo "✓ All eval-deno tests passed" ;; \
+		*) echo "✗ Some eval-deno tests failed" ;; \
+	esac; \
+	exit $$status
+
+clean:  ## Delete node_modules, the copied braintrust-latest.tgz, and the deno/npm lock files from this directory
+	@echo "==> Cleaning eval-deno scenario"
+	@rm -rf node_modules braintrust-latest.tgz deno.lock package-lock.json
diff --git a/js/framework-tests/scenarios/deno/README.md b/js/framework-tests/scenarios/deno/README.md
@@ -0,0 +1,46 @@
+# Deno Framework Scenario
+
+Tests that `Eval()` works with Deno runtime by running evals directly.
+
+## What This Tests
+
+- Deno's security model (permissions: --allow-env, --allow-read, --allow-net)
+- Import mapping via deno.json
+- Running evals with Deno's runtime
+- ESM-only environment
+
+## Use Case
+
+Users writing Deno applications want to use Braintrust evals without switching to Node.js. This tests that the SDK works correctly in Deno's runtime environment.
+
+## Design Decisions
+
+**Why npm: imports with nodeModulesDir?** Following Deno best practices, we use `npm:` imports with `nodeModulesDir: "auto"`. We install the tarball using npm first to populate node_modules, then Deno imports from there using its npm compatibility layer.
+
+**Why not use CLI?** The `braintrust eval` CLI is a Node.js binary and can't be executed directly by Deno. Users in Deno environments would call `Eval()` directly in their code.
+
+**Why these permissions?** Evals need:
+
+- `--allow-env`: Read environment variables
+- `--allow-read`: Read eval files
+- `--allow-net`: Send data to Braintrust API
+
+## Requirements
+
+- `mise` must be installed for runtime management
+
+Deno will be automatically installed via `mise` using the version specified in `.tool-versions`.
+
+## Expected Behavior
+
+Should run evals using Deno's runtime with proper permissions.
+
+## Running
+
+```bash
+# From this directory:
+make test
+
+# From framework-tests root:
+make test
+```
diff --git a/js/framework-tests/scenarios/deno/deno.json b/js/framework-tests/scenarios/deno/deno.json
@@ -0,0 +1,9 @@
+{
+  "imports": {
+    "braintrust": "npm:braintrust"
+  },
+  "nodeModulesDir": "auto",
+  "workspace": {
+    "members": []
+  }
+}
diff --git a/js/framework-tests/scenarios/deno/package.json b/js/framework-tests/scenarios/deno/package.json
@@ -0,0 +1,8 @@
+{
+  "name": "braintrust-cli-test-eval-deno",
+  "type": "module",
+  "private": true,
+  "dependencies": {
+    "braintrust": "file:./braintrust-latest.tgz"
+  }
+}