137 changes: 137 additions & 0 deletions .github/workflows/braintrust-container.yaml
@@ -0,0 +1,137 @@
name: Braintrust Container

on:
  push:
    branches:
      - master
    paths:
      - 'scripts/scheduled/braintrust_*.py'
      - 'requirements-braintrust.txt'
      - 'build/braintrust/**'
      - '.github/workflows/braintrust-container.yaml'
    tags:
      - 'v*'
  pull_request:
    paths:
      - 'scripts/scheduled/braintrust_*.py'
      - 'requirements-braintrust.txt'
      - 'build/braintrust/**'
      - '.github/workflows/braintrust-container.yaml'
  workflow_dispatch:

concurrency:
  group: braintrust-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build-dev:
    if: ${{ github.event_name == 'pull_request' || (github.event_name == 'push' && !startsWith(github.ref, 'refs/tags/')) || github.event_name == 'workflow_dispatch' }}
    name: "Braintrust Image Build (Dev)"
    permissions:
      contents: 'read'
      id-token: 'write'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - id: auth
        name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          token_format: 'access_token'
          workload_identity_provider: 'projects/${{ secrets.DEV_GKE_PROJECT_ID}}/locations/global/workloadIdentityPools/github/providers/github'
          service_account: '${{ secrets.DEV_GKE_SA }}'
      - name: Login to GAR
        uses: docker/login-action@v3
        with:
          registry: us-east1-docker.pkg.dev
          username: oauth2accesstoken
          password: '${{ steps.auth.outputs.access_token }}'
      - name: Get branch name
        id: branch-raw
        uses: tj-actions/branch-names@v5.1
      - name: Format branch name
        id: branch-name
        run: >-
          echo "current_branch="$(echo ${{ steps.branch-raw.outputs.current_branch }}
          | awk '{print tolower($0)}'
          | sed 's|.*/\([^/]*\)/.*|\1|; t; s|.*|\0|'
          | sed 's/[^a-z0-9\.\-]//g')
          >> $GITHUB_OUTPUT
      - name: Get current date
        id: date
        run: echo "date=$(date +'%Y%m%d%H%M')" >> $GITHUB_OUTPUT
      - name: Generate image metadata
        id: meta
        uses: docker/metadata-action@v3
        with:
          images: |
            us-east1-docker.pkg.dev/${{ secrets.DEV_PROJECT }}/containers/sefaria-braintrust-${{ steps.branch-name.outputs.current_branch }}
          tags: |
            type=ref,event=branch
            type=sha,enable=true,priority=100,prefix=sha-,suffix=-${{ steps.date.outputs.date }},format=short
            type=sha
          flavor: |
            latest=true
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          file: ./build/braintrust/Dockerfile
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

  build-prod:
    if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') }}
    name: "Braintrust Image Build (Prod)"
    permissions:
      contents: 'read'
      id-token: 'write'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - id: auth
        name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          token_format: 'access_token'
          workload_identity_provider: 'projects/${{ secrets.PROD_GKE_PROJECT_ID}}/locations/global/workloadIdentityPools/github/providers/github'
          service_account: '${{ secrets.PROD_GKE_SA }}'
      - name: Login to GAR
        uses: docker/login-action@v3
        with:
          registry: us-east1-docker.pkg.dev
          username: oauth2accesstoken
          password: '${{ steps.auth.outputs.access_token }}'
      - name: Get current date
        id: date
        run: echo "date=$(date +'%Y%m%d%H%M')" >> $GITHUB_OUTPUT
      - name: Generate image metadata
        id: meta
        uses: docker/metadata-action@v3
        with:
          images: |
            us-east1-docker.pkg.dev/${{ secrets.PROD_GKE_PROJECT }}/containers/${{ secrets.IMAGE_NAME }}-braintrust
          tags: |
            type=ref,event=tag
            type=sha,enable=true,priority=100,prefix=sha-,suffix=-${{ steps.date.outputs.date }},format=short
            type=sha
            type=semver,pattern={{raw}}
          flavor: |
            latest=true
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          file: ./build/braintrust/Dockerfile
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
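
Note: the "Format branch name" step is easier to read outside the workflow. A minimal sketch of the same pipeline run locally, against a made-up branch name (feature/ABC-123/new-thing is illustrative, not from the repo):

# Same awk/sed chain as the "Format branch name" step, applied to a sample branch.
branch="feature/ABC-123/new-thing"   # hypothetical input
echo "$branch" \
  | awk '{print tolower($0)}' \
  | sed 's|.*/\([^/]*\)/.*|\1|; t; s|.*|\0|' \
  | sed 's/[^a-z0-9\.\-]//g'
# Prints "abc-123": names with two or more slashes keep the segment before the last one;
# simple names like "master" pass through, lowercased and limited to [a-z0-9.-].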
20 changes: 20 additions & 0 deletions build/braintrust/Dockerfile
@@ -0,0 +1,20 @@
FROM python:3.11-slim

LABEL org.opencontainers.image.source="https://github.com/Sefaria/Sefaria-Project"
LABEL org.opencontainers.image.description="Braintrust automation scripts for log backup and dataset tagging"

COPY requirements-braintrust.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt && rm /tmp/requirements.txt

RUN useradd --create-home --shell /bin/bash --uid 1000 --gid 0 braintrust

WORKDIR /app

RUN mkdir -p /app/scripts /app/shared && chown -R braintrust:braintrust /app

COPY scripts/scheduled/braintrust_backup_logs.py /app/scripts/braintrust_backup_logs.py
COPY scripts/scheduled/braintrust_tag_and_push.py /app/scripts/braintrust_tag_and_push.py

USER braintrust

ENTRYPOINT ["python"]
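
Note: since the ENTRYPOINT is python and the scripts are baked into /app/scripts, the Helm cron jobs only pass a script path as args. A rough local invocation under the same assumptions (the image tag, credentials, and the output mount are placeholders):

# Hypothetical local run of the backup script; tag and keys are placeholders.
docker run --rm \
  -e BRAINTRUST_API_KEY=... \
  -e BRAINTRUST_PROJECT_ID=... \
  -v "$(pwd)/out:/tmp" \
  sefaria-braintrust:dev /app/scripts/braintrust_backup_logs.py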
3 changes: 3 additions & 0 deletions build/ci/production-helm-deploy.sh
@@ -6,6 +6,7 @@ export WEB_IMAGE="us-east1-docker.pkg.dev/$PROJECT_ID/containers/$IMAGE_NAME-web"
export NODE_IMAGE="us-east1-docker.pkg.dev/$PROJECT_ID/containers/$IMAGE_NAME-node"
export ASSET_IMAGE="us-east1-docker.pkg.dev/$PROJECT_ID/containers/$IMAGE_NAME-asset"
export LINKER_IMAGE="us-east1-docker.pkg.dev/$PROJECT_ID/containers/$IMAGE_NAME-linker"
export BRAINTRUST_IMAGE="us-east1-docker.pkg.dev/$PROJECT_ID/containers/$IMAGE_NAME-braintrust"
export TAG="$GIT_COMMIT"

yq e -i '.web.containerImage.imageRegistry = strenv(WEB_IMAGE)' $1
@@ -18,6 +19,8 @@ yq e -i '.linker.containerImage.tag = strenv(TAG)' $1
yq e -i '.nodejs.containerImage.tag = strenv(TAG)' $1
yq e -i '.nginx.containerImage.tag = strenv(TAG)' $1
yq e -i '.monitor.containerImage.tag = strenv(TAG)' $1
yq e -i '.cronJobs.braintrust.image.repository = strenv(BRAINTRUST_IMAGE)' $1
yq e -i '.cronJobs.braintrust.image.tag = strenv(TAG)' $1

helm repo add sefaria-project https://sefaria.github.io/Sefaria-Project
helm upgrade -i production sefaria-project/sefaria --version $CHART_VERSION --namespace $NAMESPACE -f $1 --debug --timeout=30m0s
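
Note: for reference, the two new yq edits leave the braintrust block of the values file with roughly this shape (project ID, image name, and commit SHA shown as placeholders):

cronJobs:
  braintrust:
    image:
      repository: us-east1-docker.pkg.dev/<PROJECT_ID>/containers/<IMAGE_NAME>-braintrust
      tag: <GIT_COMMIT>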
19 changes: 19 additions & 0 deletions build/ci/production-values.yaml
@@ -193,13 +193,32 @@ cronJobs:
    enabled: true
  weeklyEmailNotifications:
    enabled: true
  braintrust:
    enabled: true
    image:
      repository:
      tag:
Comment on lines +196 to +200
Copilot AI (Feb 8, 2026): PR description says the Braintrust cronjobs are disabled by default, but production-values.yaml enables them (and both sub-jobs) for production. Either keep them disabled here until explicitly enabled, or update the PR description to reflect that production will run them after deploy.
    backupLogs:
      enabled: true
      schedule: "0 1 * * 0"
      serviceAccount: braintrust-backup-logs
      bucket: braintrust-logs
      prefix: "logs/"
    tagAndPush:
      enabled: true
      schedule: "0 2 * * *"
      serviceAccount: braintrust-tag-push
secrets:
  localSettings:
    ref: local-settings-secrets-production
  backupManager:
    ref: backup-manager-secret-production
  slackWebhook:
    ref: slack-webhook-production
  braintrust:
    ref: braintrust-secret-production
  anthropic:
    ref: anthropic-api-key-production
instrumentation:
  enabled: false
  otelEndpoint: "http://otel-collector-collector.monitoring:4317"
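
Note: the cron job templates below read api-key and project-id from the Braintrust secret and api-key from the Anthropic secret. If those secrets were ever created by hand rather than by whatever normally manages production secrets, a sketch (namespace and values are placeholders):

kubectl create secret generic braintrust-secret-production \
  --from-literal=api-key=<braintrust-api-key> \
  --from-literal=project-id=<braintrust-project-id> \
  -n <namespace>
kubectl create secret generic anthropic-api-key-production \
  --from-literal=api-key=<anthropic-api-key> \
  -n <namespace>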
92 changes: 92 additions & 0 deletions helm-chart/sefaria/templates/cronjob/braintrust-backup-logs.yaml
@@ -0,0 +1,92 @@
{{- if and .Values.cronJobs.braintrust.enabled .Values.cronJobs.braintrust.backupLogs.enabled }}
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ .Values.deployEnv }}-braintrust-backup-logs
  labels:
    {{- include "sefaria.labels" . | nindent 4 }}
spec:
  schedule: "{{ .Values.cronJobs.braintrust.backupLogs.schedule }}"
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      backoffLimit: 1
Comment on lines 9 to 14
Copilot AI (Feb 8, 2026): No concurrencyPolicy is set, so a slow export/upload could overlap with the next schedule and upload duplicate backups. Consider setting concurrencyPolicy: Forbid for predictable weekly backups.
      template:
        spec:
          serviceAccount: {{ .Values.cronJobs.braintrust.backupLogs.serviceAccount }}
          initContainers:
            # Init container: Query Braintrust logs and create CSV
            - name: braintrust-log-exporter
              image: "{{ .Values.cronJobs.braintrust.image.repository }}:{{ .Values.cronJobs.braintrust.image.tag }}"
              env:
                - name: BRAINTRUST_API_KEY
                  valueFrom:
                    secretKeyRef:
                      name: {{ .Values.secrets.braintrust.ref }}
                      key: api-key
                - name: BRAINTRUST_PROJECT_ID
                  valueFrom:
                    secretKeyRef:
                      name: {{ .Values.secrets.braintrust.ref }}
                      key: project-id
              volumeMounts:
                - mountPath: /tmp
                  name: shared-volume
              command: ["python"]
              args: ["/app/scripts/braintrust_backup_logs.py"]
              resources:
                requests:
                  memory: "256Mi"
                  cpu: "250m"
                limits:
                  memory: "500Mi"
                  cpu: "1000m"
          containers:
            # Main container: Upload CSV to GCS bucket
            - name: braintrust-log-uploader
              image: google/cloud-sdk
              volumeMounts:
                - mountPath: /tmp
                  name: shared-volume
              env:
                - name: BUCKET
                  value: {{ .Values.cronJobs.braintrust.backupLogs.bucket }}
                - name: PREFIX
                  value: {{ .Values.cronJobs.braintrust.backupLogs.prefix }}
              command: ["bash"]
              args:
                - "-c"
                - |
                  set -e

                  # Find the most recent CSV file
                  CSV_FILE=$(ls -t /tmp/logs_backup_*.csv 2>/dev/null | head -1)

                  if [ -z "$CSV_FILE" ]; then
                    echo "No CSV file found in /tmp"
                    exit 0
                  fi

                  FILENAME=$(basename "$CSV_FILE")
                  DESTINATION="gs://${BUCKET}/${PREFIX}${FILENAME}"

                  echo "Uploading $CSV_FILE to $DESTINATION"
                  gsutil cp "$CSV_FILE" "$DESTINATION"
                  echo "Upload complete"

                  # Cleanup
                  rm -f "$CSV_FILE"
              resources:
                requests:
                  memory: "256Mi"
                  cpu: "100m"
                limits:
                  memory: "500Mi"
Copilot AI (Feb 4, 2026): The main container lacks CPU limits. While the limits are set to 500Mi memory, there is no CPU value defined alongside it, which could lead to suboptimal pod scheduling. According to Kubernetes best practices, both memory and CPU should be defined. Add a CPU value:

Suggested change:
-                  memory: "500Mi"
+                  memory: "500Mi"
+                  cpu: "100m"  # or another appropriate value
          restartPolicy: OnFailure
          volumes:
            - name: shared-volume
              emptyDir: {}
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 2
{{- end }}
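
Note: a quick way to exercise this job without waiting for the weekly schedule, assuming deployEnv is production and substituting the real namespace:

# One-off run cloned from the CronJob spec; names here are illustrative.
kubectl create job --from=cronjob/production-braintrust-backup-logs braintrust-backup-manual -n <namespace>
kubectl logs job/braintrust-backup-manual -c braintrust-log-exporter -n <namespace> -f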
64 changes: 64 additions & 0 deletions helm-chart/sefaria/templates/cronjob/braintrust-tag-and-push.yaml
@@ -0,0 +1,64 @@
{{- if and .Values.cronJobs.braintrust.enabled .Values.cronJobs.braintrust.tagAndPush.enabled }}
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ .Values.deployEnv }}-braintrust-tag-and-push
  labels:
    {{- include "sefaria.labels" . | nindent 4 }}
spec:
  schedule: "{{ .Values.cronJobs.braintrust.tagAndPush.schedule }}"
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      backoffLimit: 1
Comment on lines 9 to 14
Copilot AI (Feb 8, 2026): No concurrencyPolicy is set, so if a run takes longer than the schedule interval, multiple runs can overlap and double-tag/double-push logs. Consider setting concurrencyPolicy: Forbid (and optionally startingDeadlineSeconds) for safer operation.
      template:
        spec:
          serviceAccount: {{ .Values.cronJobs.braintrust.tagAndPush.serviceAccount }}
          securityContext:
            fsGroup: 1000
          containers:
            - name: braintrust-tag-and-push
              image: "{{ .Values.cronJobs.braintrust.image.repository }}:{{ .Values.cronJobs.braintrust.image.tag }}"
              env:
                - name: BRAINTRUST_API_KEY
                  valueFrom:
Comment on lines +21 to +25
Copilot AI (Feb 8, 2026): PR description mentions deploying scripts via ConfigMaps/init containers, but the chart changes shown execute scripts baked into the image and there are no new braintrust script ConfigMaps under helm-chart/sefaria/templates/configmap. Either add the missing templates or update the PR description/docs to match the implemented approach.
                    secretKeyRef:
                      name: {{ .Values.secrets.braintrust.ref }}
                      key: api-key
                - name: BRAINTRUST_PROJECT_ID
                  valueFrom:
                    secretKeyRef:
                      name: {{ .Values.secrets.braintrust.ref }}
                      key: project-id
                - name: ANTHROPIC_API_KEY
                  valueFrom:
                    secretKeyRef:
                      name: {{ .Values.secrets.anthropic.ref }}
                      key: api-key
                - name: BRAINTRUST_SHARED_STORAGE
                  value: "/shared/braintrust"
              volumeMounts:
                - mountPath: /shared/braintrust
                  name: shared-storage
              command: ["python"]
              args: ["/app/scripts/braintrust_tag_and_push.py", "all"]
Comment on lines +41 to +45
Copilot AI (Feb 8, 2026): The braintrust image runs as a non-root user (see build/braintrust/Dockerfile). Ensure the shared-storage volume mount is writable in all modes (especially PVCs) by setting an fsGroup or adding an initContainer to adjust ownership/permissions.
              resources:
                limits:
                  memory: "3Gi"
                  cpu: "2000m"
                requests:
                  memory: "1Gi"
                  cpu: "500m"
          restartPolicy: OnFailure
          volumes:
            - name: shared-storage
              {{- if .Values.cronJobs.braintrust.tagAndPush.usePvc }}
              persistentVolumeClaim:
                claimName: {{ .Values.cronJobs.braintrust.tagAndPush.pvcName }}
              {{- else }}
              emptyDir: {}
              {{- end }}
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 2
{{- end }}
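
Note: the volume block also honors usePvc and pvcName, which production-values.yaml leaves unset (so the job falls back to emptyDir). Switching the shared storage to a persistent volume would look roughly like this in values, with the claim name as a placeholder and the PVC itself created separately:

cronJobs:
  braintrust:
    tagAndPush:
      usePvc: true
      pvcName: braintrust-shared-storage  # placeholder; the PVC must already exist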