From d7df3a3eee0e7bf2a2369d02f8645fc78d7ec1d2 Mon Sep 17 00:00:00 2001 From: Ahmad Alhour Date: Fri, 12 Dec 2025 10:50:20 +0100 Subject: [PATCH 1/5] feat(databricks): add sample application for Databricks integration Add a new sample application that deploys the Databricks Prometheus Exporter in a Multipass VM with Grafana Alloy for metrics collection. The sample app: - Runs the exporter as a Docker container (ghcr.io/grafana/databricks-prometheus-exporter) - Configures Alloy to scrape metrics (5m interval, 4m timeout) - Forwards metrics and container logs to Grafana Cloud - Uses OAuth2 Service Principal authentication Includes .CI_BYPASS as Databricks is a cloud-hosted service that cannot be spun up in CI. Metrics collected: 18 exporter metrics across billing, jobs, pipelines, and SQL queries domains. --- sample-apps/databricks/.CI_BYPASS | 3 + sample-apps/databricks/Makefile | 54 +++++ sample-apps/databricks/README.md | 192 ++++++++++++++++ sample-apps/databricks/cloud-init.yaml | 209 +++++++++++++++++ .../jinja/templates/cloud-init-template.yaml | 213 ++++++++++++++++++ .../jinja/variables/cloud-init.yaml | 16 ++ .../tests/configs/databricks.config | 2 + .../databricks/tests/metrics/databricks | 20 ++ 8 files changed, 709 insertions(+) create mode 100644 sample-apps/databricks/.CI_BYPASS create mode 100644 sample-apps/databricks/Makefile create mode 100644 sample-apps/databricks/README.md create mode 100644 sample-apps/databricks/cloud-init.yaml create mode 100644 sample-apps/databricks/jinja/templates/cloud-init-template.yaml create mode 100644 sample-apps/databricks/jinja/variables/cloud-init.yaml create mode 100644 sample-apps/databricks/tests/configs/databricks.config create mode 100644 sample-apps/databricks/tests/metrics/databricks diff --git a/sample-apps/databricks/.CI_BYPASS b/sample-apps/databricks/.CI_BYPASS new file mode 100644 index 0000000..a155df8 --- /dev/null +++ b/sample-apps/databricks/.CI_BYPASS @@ -0,0 +1,3 @@ +# Databricks is a cloud-hosted SaaS service that cannot be spun up locally. +# This sample-app is excluded from CI runs. + diff --git a/sample-apps/databricks/Makefile b/sample-apps/databricks/Makefile new file mode 100644 index 0000000..c2a9555 --- /dev/null +++ b/sample-apps/databricks/Makefile @@ -0,0 +1,54 @@ +VM_NAME ?= databricks-sample-app +CONFIG_FILE_DIR := jinja/variables +CONFIG_FILE := $(CONFIG_FILE_DIR)/cloud-init.yaml +LOKI_INSTANCE ?= your-loki-instance:3100 +PROMETHEUS_INSTANCE ?= your-prometheus-instance:9090 + +# Import last to ensure expected parameters are set and available to import from this file +include ../Makefile + +.PHONY: run run-ci stop render-config launch-vm clean defaultconfig + +run: launch-vm + @echo "VM $(VM_NAME) launched and configured." + @echo "" + @echo "IMPORTANT: You must configure jinja/variables/cloud-init.yaml with your Databricks credentials before launching." + @echo "See README.md for required configuration values." 
+ +run-ci: clean defaultconfig launch-vm + @echo "Running in CI mode" + +stop: + @multipass stop $(VM_NAME) + @multipass delete $(VM_NAME) + @multipass purge + +render-config: + @docker run --rm -v $(shell pwd)/jinja/templates:/templates -v $(shell pwd)/jinja/variables:/variables dinutac/jinja2docker:latest /templates/cloud-init-template.yaml /variables/cloud-init.yaml --format=yaml > cloud-init.yaml + +launch-vm: render-config + @multipass launch -n $(VM_NAME) --disk 10G --cloud-init cloud-init.yaml + +clean: + @rm -f cloud-init.yaml + @rm -rf $(CONFIG_FILE_DIR) + +defaultconfig: + @mkdir -p $(CONFIG_FILE_DIR) + @echo "# Databricks Exporter Configuration" > $(CONFIG_FILE) + @echo "# See README.md for instructions on obtaining these values" >> $(CONFIG_FILE) + @echo "" >> $(CONFIG_FILE) + @echo "# Grafana Cloud endpoints" >> $(CONFIG_FILE) + @echo "loki_url: http://$(LOKI_INSTANCE)/loki/api/v1/push" >> $(CONFIG_FILE) + @echo "loki_user: your_loki_username" >> $(CONFIG_FILE) + @echo "loki_pass: your_loki_password" >> $(CONFIG_FILE) + @echo "prom_url: http://$(PROMETHEUS_INSTANCE)/api/v1/push" >> $(CONFIG_FILE) + @echo "prom_user: your_prometheus_username" >> $(CONFIG_FILE) + @echo "prom_pass: your_prometheus_password" >> $(CONFIG_FILE) + @echo "" >> $(CONFIG_FILE) + @echo "# Databricks OAuth2 Service Principal credentials" >> $(CONFIG_FILE) + @echo "databricks_server_hostname: your-workspace.cloud.databricks.com" >> $(CONFIG_FILE) + @echo "databricks_warehouse_http_path: /sql/1.0/warehouses/your-warehouse-id" >> $(CONFIG_FILE) + @echo "databricks_client_id: your-service-principal-client-id" >> $(CONFIG_FILE) + @echo "databricks_client_secret: your-service-principal-client-secret" >> $(CONFIG_FILE) + diff --git a/sample-apps/databricks/README.md b/sample-apps/databricks/README.md new file mode 100644 index 0000000..db6f057 --- /dev/null +++ b/sample-apps/databricks/README.md @@ -0,0 +1,192 @@ +# Databricks Sample Application + +This sample application sets up monitoring for Databricks using Grafana Alloy. The application runs the Databricks Prometheus Exporter in a containerized environment and includes automated setup using cloud-init. + +## Overview + +The sample application: +- Deploys the [Databricks Prometheus Exporter](https://github.com/grafana/databricks-prometheus-exporter) as a Docker container +- Configures Grafana Alloy to scrape metrics from the exporter +- Forwards metrics to Grafana Cloud (or any Prometheus-compatible remote write endpoint) +- Automates the entire setup process using cloud-init + +## Prerequisites + +- [Multipass](https://multipass.run/) for VM management +- A Databricks workspace with Unity Catalog enabled +- A Databricks Service Principal with OAuth2 credentials +- A running SQL Warehouse (or one configured to auto-start) + +## Databricks Configuration + +Before running the sample app, you need to set up authentication in your Databricks workspace. + +### 1. Create a Service Principal + +1. Log into your Databricks workspace +2. Go to **Settings** → **Admin Console** → **Service Principals** +3. Click **Add Service Principal** +4. Note the **Application ID** (this is your Client ID) +5. Click **Generate Secret** under OAuth Secrets +6. Copy and securely store the **Client Secret** + +### 2. 
Grant Required Permissions
+
+Run these SQL commands as a Databricks admin (replace `<application-id>` with your Application ID):
+
+```sql
+GRANT MANAGE ON CATALOG system TO `<application-id>`;
+GRANT USE CATALOG ON CATALOG system TO `<application-id>`;
+GRANT USE SCHEMA ON SCHEMA system.billing TO `<application-id>`;
+GRANT SELECT ON SCHEMA system.billing TO `<application-id>`;
+GRANT USE SCHEMA ON SCHEMA system.query TO `<application-id>`;
+GRANT SELECT ON SCHEMA system.query TO `<application-id>`;
+GRANT USE SCHEMA ON SCHEMA system.lakeflow TO `<application-id>`;
+GRANT SELECT ON SCHEMA system.lakeflow TO `<application-id>`;
+GRANT SELECT ON TABLE system.lakeflow.pipeline_update_timeline TO `<application-id>`;
+```
+
+### 3. Get Configuration Values
+
+- **Server Hostname**: Found in your Databricks workspace URL (e.g., `dbc-abc123-def456.cloud.databricks.com`)
+- **Warehouse HTTP Path**: Go to **SQL Warehouses** → Select your warehouse → **Connection Details** → Copy **HTTP Path**
+- **Client ID**: The **Application ID** from step 1
+- **Client Secret**: The secret generated in step 1
+
+## Configuration
+
+1. Create a configuration file:
+```bash
+make defaultconfig
+```
+
+2. Edit `jinja/variables/cloud-init.yaml` with your credentials:
+```yaml
+# Grafana Cloud endpoints
+loki_url: https://logs-prod-us-central1.grafana.net/loki/api/v1/push
+loki_user: your_loki_username
+loki_pass: your_loki_password
+prom_url: https://prometheus-prod-us-central1.grafana.net/api/prom/push
+prom_user: your_prometheus_username
+prom_pass: your_prometheus_password
+
+# Databricks OAuth2 Service Principal credentials
+databricks_server_hostname: dbc-abc123-def456.cloud.databricks.com
+databricks_warehouse_http_path: /sql/1.0/warehouses/abc123def456
+databricks_client_id: your-application-id
+databricks_client_secret: your-client-secret
+```
+
+## Usage
+
+### Running the Application
+
+1. Launch the VM with the sample application:
+```bash
+make run
+```
+
+This command will:
+- Create a new VM named `databricks-sample-app`
+- Configure the VM using cloud-init
+- Deploy the Databricks exporter in a Docker container
+- Set up Grafana Alloy for monitoring and forwarding metrics
+
+### Stopping the Application
+
+To stop and clean up the VM:
+```bash
+make stop
+```
+
+## Metrics Collected
+
+The exporter collects 18 metrics across four categories:
+
+### Billing Metrics
+- `databricks_billing_dbus_total` - Daily DBU consumption per workspace and SKU
+- `databricks_billing_cost_estimate_usd` - Estimated cost in USD
+- `databricks_price_change_events` - Count of price changes per SKU
+
+### Job Metrics
+- `databricks_job_runs_total` - Total job runs
+- `databricks_job_run_status` - Job run counts by result state
+- `databricks_job_run_duration_seconds` - Job duration quantiles (p50, p95, p99)
+- `databricks_task_retries_total` - Task retry counts
+- `databricks_job_sla_miss_total` - Jobs exceeding SLA threshold
+
+### Pipeline Metrics
+- `databricks_pipeline_runs_total` - Total pipeline runs
+- `databricks_pipeline_run_status` - Pipeline runs by result state
+- `databricks_pipeline_run_duration_seconds` - Pipeline duration quantiles
+- `databricks_pipeline_retry_events_total` - Pipeline retry counts
+- `databricks_pipeline_freshness_lag_seconds` - Data freshness lag
+
+### SQL Query Metrics
+- `databricks_queries_total` - Total SQL queries executed
+- `databricks_query_errors_total` - Failed query count
+- `databricks_query_duration_seconds` - Query duration quantiles
+- `databricks_queries_running` - Estimated concurrent queries
+
+### System Metrics
+- `databricks_up` - Exporter health (1 = healthy, 0 = unhealthy)
+
+## Development
+
+### Project Structure
+```
+databricks/ +├── .CI_BYPASS # Excludes from CI (cloud service) +├── Makefile # Build and deployment commands +├── README.md # This file +├── jinja/ +│ └── templates/ # Cloud-init templates +└── tests/ + ├── configs/ # Test configuration + └── metrics/ # Expected metrics list +``` + +### Available Make Commands + +- `make run` - Launch the VM with the sample application +- `make stop` - Stop and delete the VM +- `make render-config` - Generate cloud-init configuration +- `make clean` - Clean up generated files +- `make defaultconfig` - Create default configuration template + +## Troubleshooting + +### Check VM Status +```bash +multipass info databricks-sample-app +``` + +### View Cloud-Init Logs +```bash +multipass exec databricks-sample-app -- sudo cat /var/log/cloud-init-output.log +``` + +### Check Alloy Status +```bash +multipass exec databricks-sample-app -- systemctl status alloy +``` + +### Check Exporter Container +```bash +multipass exec databricks-sample-app -- docker ps +multipass exec databricks-sample-app -- docker logs databricks-exporter +``` + +### Verify Metrics +```bash +multipass exec databricks-sample-app -- curl -s localhost:9976/metrics | head -50 +``` + +### Common Issues + +1. **Authentication Errors (401)**: Verify Client ID and Client Secret are correct +2. **No Metrics**: Check that the SQL Warehouse is running and the Service Principal has required permissions +3. **Connection Errors**: Verify the Server Hostname doesn't include `https://` prefix + +For more detailed troubleshooting, see the [Databricks Exporter README](https://github.com/grafana/databricks-prometheus-exporter#troubleshooting). + diff --git a/sample-apps/databricks/cloud-init.yaml b/sample-apps/databricks/cloud-init.yaml new file mode 100644 index 0000000..8a290f8 --- /dev/null +++ b/sample-apps/databricks/cloud-init.yaml @@ -0,0 +1,209 @@ +#cloud-config +# Cloud-init configuration for setting up Alloy and Databricks exporter sample-app + +package_update: true +package_upgrade: false + +packages: + - git + - gpg + - curl + - wget + +write_files: + # Alloy profile + - owner: root:root + path: /etc/default/alloy + content: | + ## Path: + ## Description: Grafana Alloy settings + ## Type: string + ## Default: "" + ## ServiceRestart: alloy + # + # Command line options for Alloy. + # + # The configuration file holding the Alloy config. + CONFIG_FILE="/etc/alloy/config.alloy" + # User-defined arguments to pass to the run command. + CUSTOM_ARGS="--stability.level=experimental" + # Restart on system upgrade. Defaults to true. 
+ RESTART_ON_UPGRADE=true + + # Alloy configuration + - owner: root:root + path: /etc/alloy/config.alloy + content: | + // Alloy self-monitoring + prometheus.exporter.self "alloy_check" { } + + discovery.relabel "alloy_check" { + targets = prometheus.exporter.self.alloy_check.targets + rule { + target_label = "instance" + replacement = constants.hostname + } + rule { + target_label = "alloy_hostname" + replacement = constants.hostname + } + rule { + target_label = "job" + replacement = "integrations/alloy-check" + } + } + + prometheus.scrape "alloy_check" { + targets = discovery.relabel.alloy_check.output + forward_to = [prometheus.relabel.alloy_check.receiver] + scrape_interval = "60s" + } + + prometheus.relabel "alloy_check" { + forward_to = [prometheus.remote_write.metrics_service.receiver] + rule { + source_labels = ["__name__"] + regex = "(prometheus_target_sync_length_seconds_sum|prometheus_target_scrapes_.*|prometheus_target_interval.*|prometheus_sd_discovered_targets|alloy_build.*|prometheus_remote_write_wal_samples_appended_total|process_start_time_seconds)" + action = "keep" + } + } + + // Databricks exporter scraping + // The exporter runs as a Docker container on localhost:9976 + prometheus.scrape "integrations_databricks" { + targets = [{ + __address__ = "localhost:9976", + }] + forward_to = [prometheus.relabel.integrations_databricks.receiver] + scrape_interval = "5m" + scrape_timeout = "4m" + job_name = "integrations/databricks" + } + + prometheus.relabel "integrations_databricks" { + forward_to = [prometheus.remote_write.metrics_service.receiver] + rule { + target_label = "instance" + replacement = constants.hostname + } + rule { + target_label = "job" + replacement = "integrations/databricks" + } + } + + prometheus.remote_write "metrics_service" { + endpoint { + url = "http://your-prometheus-instance:9090/api/v1/push" + basic_auth { + username = "your_prometheus_username" + password = "your_prometheus_password" + } + } + } + + // Docker log discovery for the databricks-exporter container + discovery.docker "databricks_exporter" { + host = "unix:///var/run/docker.sock" + refresh_interval = "5s" + filter { + name = "name" + values = ["databricks-exporter"] + } + } + + discovery.relabel "databricks_exporter" { + targets = discovery.docker.databricks_exporter.targets + rule { + source_labels = ["__meta_docker_container_name"] + target_label = "name" + replacement = "databricks-exporter" + } + rule { + source_labels = ["__meta_docker_container_name"] + target_label = "job" + replacement = "integrations/databricks" + } + rule { + source_labels = ["__meta_docker_container_name"] + target_label = "instance" + replacement = constants.hostname + } + } + + loki.source.docker "databricks_exporter" { + host = "unix:///var/run/docker.sock" + targets = discovery.docker.databricks_exporter.targets + forward_to = [loki.write.grafana_cloud_loki.receiver] + relabel_rules = discovery.relabel.databricks_exporter.rules + } + + loki.write "grafana_cloud_loki" { + endpoint { + url = "http://your-loki-instance:3100/loki/api/v1/push" + basic_auth { + username = "your_loki_username" + password = "your_loki_password" + } + } + } + +runcmd: + - mkdir -p /home/ubuntu + - mkdir -p /etc/apt/keyrings/ + # Create required directory for alloy + - mkdir -p /var/lib/alloy + - chown -R root:root /var/lib/alloy + + # Install Grafana repo + - curl -fsSL https://apt.grafana.com/gpg.key | gpg --dearmor -o /etc/apt/keyrings/grafana.gpg + - echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com 
stable main" > /etc/apt/sources.list.d/grafana.list + + # Install Docker repo + - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg + - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null + + - apt-get update + - DEBIAN_FRONTEND=noninteractive apt-get install -y alloy docker-ce docker-ce-cli containerd.io + + # Add ubuntu user to docker group + - groupadd -f docker + - usermod -aG docker ubuntu + - chmod 666 /var/run/docker.sock || true + - systemctl restart docker + - systemctl enable docker + - systemctl start docker + + # Run the Databricks exporter container + # NOTE: The exporter queries System Tables which can take 2-4 minutes + # Scrape interval should be 5m with a 4m timeout to avoid overlapping scrapes + - | + docker run -d \ + --name databricks-exporter \ + --restart unless-stopped \ + -p 9976:9976 \ + -e DATABRICKS_EXPORTER_SERVER_HOSTNAME="your-workspace.cloud.databricks.com" \ + -e DATABRICKS_EXPORTER_WAREHOUSE_HTTP_PATH="/sql/1.0/warehouses/your-warehouse-id" \ + -e DATABRICKS_EXPORTER_CLIENT_ID="your-service-principal-client-id" \ + -e DATABRICKS_EXPORTER_CLIENT_SECRET="your-service-principal-client-secret" \ + ghcr.io/grafana/databricks-prometheus-exporter:latest + + # Wait for the exporter to start + - | + for i in {1..30}; do + if curl -s http://localhost:9976/metrics > /dev/null 2>&1; then + echo "Databricks exporter is ready" + break + fi + echo "Waiting for Databricks exporter to start..." + sleep 2 + done + + # Configure Alloy to run as root (needed for Docker socket access) + - sed -i '/^\[Service\]/,/^\[/ { /^[ \t]*User=/d; /^[ \t]*Group=/d }' /lib/systemd/system/alloy.service || echo "Could not modify /lib/systemd/system/alloy.service" + - sed -i '/^\[Service\]/,/^\[/ { /^[ \t]*User=/d; /^[ \t]*Group=/d }' /etc/systemd/system/alloy.service || echo "Could not modify /etc/systemd/system/alloy.service" + + - systemctl daemon-reload + - systemctl enable alloy + - systemctl restart alloy + diff --git a/sample-apps/databricks/jinja/templates/cloud-init-template.yaml b/sample-apps/databricks/jinja/templates/cloud-init-template.yaml new file mode 100644 index 0000000..2ff1c1a --- /dev/null +++ b/sample-apps/databricks/jinja/templates/cloud-init-template.yaml @@ -0,0 +1,213 @@ +#cloud-config +# Cloud-init configuration for setting up Alloy and Databricks exporter sample-app + +package_update: true +package_upgrade: false + +packages: + - git + - gpg + - curl + - wget + +write_files: + # Alloy profile + - owner: root:root + path: /etc/default/alloy + content: | + ## Path: + ## Description: Grafana Alloy settings + ## Type: string + ## Default: "" + ## ServiceRestart: alloy + # + # Command line options for Alloy. + # + # The configuration file holding the Alloy config. + CONFIG_FILE="/etc/alloy/config.alloy" + # User-defined arguments to pass to the run command. + CUSTOM_ARGS="--stability.level=experimental" + # Restart on system upgrade. Defaults to true. 
+ RESTART_ON_UPGRADE=true + + # Alloy configuration + - owner: root:root + path: /etc/alloy/config.alloy + content: | + // Alloy self-monitoring + prometheus.exporter.self "alloy_check" { } + + discovery.relabel "alloy_check" { + targets = prometheus.exporter.self.alloy_check.targets + rule { + target_label = "instance" + replacement = constants.hostname + } + rule { + target_label = "alloy_hostname" + replacement = constants.hostname + } + rule { + target_label = "job" + replacement = "integrations/alloy-check" + } + } + + prometheus.scrape "alloy_check" { + targets = discovery.relabel.alloy_check.output + forward_to = [prometheus.relabel.alloy_check.receiver] + scrape_interval = "60s" + } + + prometheus.relabel "alloy_check" { + forward_to = [prometheus.remote_write.metrics_service.receiver] + rule { + source_labels = ["__name__"] + regex = "(prometheus_target_sync_length_seconds_sum|prometheus_target_scrapes_.*|prometheus_target_interval.*|prometheus_sd_discovered_targets|alloy_build.*|prometheus_remote_write_wal_samples_appended_total|process_start_time_seconds)" + action = "keep" + } + } + + // Databricks exporter scraping + // The exporter runs as a Docker container on localhost:9976 + prometheus.scrape "integrations_databricks" { + targets = [{ + __address__ = "localhost:9976", + }] + forward_to = [prometheus.relabel.integrations_databricks.receiver] + scrape_interval = "5m" + scrape_timeout = "4m" + job_name = "integrations/databricks" + } + + prometheus.relabel "integrations_databricks" { + forward_to = [prometheus.remote_write.metrics_service.receiver] + rule { + target_label = "instance" + replacement = constants.hostname + } + rule { + target_label = "job" + replacement = "integrations/databricks" + } + } + + prometheus.remote_write "metrics_service" { + endpoint { + url = "{{ prom_url }}" + {% if prom_user and prom_pass -%} + basic_auth { + username = "{{ prom_user }}" + password = "{{ prom_pass }}" + } + {%- endif %} + } + } + + // Docker log discovery for the databricks-exporter container + discovery.docker "databricks_exporter" { + host = "unix:///var/run/docker.sock" + refresh_interval = "5s" + filter { + name = "name" + values = ["databricks-exporter"] + } + } + + discovery.relabel "databricks_exporter" { + targets = discovery.docker.databricks_exporter.targets + rule { + source_labels = ["__meta_docker_container_name"] + target_label = "name" + replacement = "databricks-exporter" + } + rule { + source_labels = ["__meta_docker_container_name"] + target_label = "job" + replacement = "integrations/databricks" + } + rule { + source_labels = ["__meta_docker_container_name"] + target_label = "instance" + replacement = constants.hostname + } + } + + loki.source.docker "databricks_exporter" { + host = "unix:///var/run/docker.sock" + targets = discovery.docker.databricks_exporter.targets + forward_to = [loki.write.grafana_cloud_loki.receiver] + relabel_rules = discovery.relabel.databricks_exporter.rules + } + + loki.write "grafana_cloud_loki" { + endpoint { + url = "{{ loki_url }}" + {% if loki_user and loki_pass -%} + basic_auth { + username = "{{ loki_user }}" + password = "{{ loki_pass }}" + } + {%- endif %} + } + } + +runcmd: + - mkdir -p /home/ubuntu + - mkdir -p /etc/apt/keyrings/ + # Create required directory for alloy + - mkdir -p /var/lib/alloy + - chown -R root:root /var/lib/alloy + + # Install Grafana repo + - curl -fsSL https://apt.grafana.com/gpg.key | gpg --dearmor -o /etc/apt/keyrings/grafana.gpg + - echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] 
https://apt.grafana.com stable main" > /etc/apt/sources.list.d/grafana.list + + # Install Docker repo + - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg + - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null + + - apt-get update + - DEBIAN_FRONTEND=noninteractive apt-get install -y alloy docker-ce docker-ce-cli containerd.io + + # Add ubuntu user to docker group + - groupadd -f docker + - usermod -aG docker ubuntu + - chmod 666 /var/run/docker.sock || true + - systemctl restart docker + - systemctl enable docker + - systemctl start docker + + # Run the Databricks exporter container + # NOTE: The exporter queries System Tables which can take 2-4 minutes + # Scrape interval should be 5m with a 4m timeout to avoid overlapping scrapes + - | + docker run -d \ + --name databricks-exporter \ + --restart unless-stopped \ + -p 9976:9976 \ + -e DATABRICKS_EXPORTER_SERVER_HOSTNAME="{{ databricks_server_hostname }}" \ + -e DATABRICKS_EXPORTER_WAREHOUSE_HTTP_PATH="{{ databricks_warehouse_http_path }}" \ + -e DATABRICKS_EXPORTER_CLIENT_ID="{{ databricks_client_id }}" \ + -e DATABRICKS_EXPORTER_CLIENT_SECRET="{{ databricks_client_secret }}" \ + ghcr.io/grafana/databricks-prometheus-exporter:latest + + # Wait for the exporter to start + - | + for i in {1..30}; do + if curl -s http://localhost:9976/metrics > /dev/null 2>&1; then + echo "Databricks exporter is ready" + break + fi + echo "Waiting for Databricks exporter to start..." + sleep 2 + done + + # Configure Alloy to run as root (needed for Docker socket access) + - sed -i '/^\[Service\]/,/^\[/ { /^[ \t]*User=/d; /^[ \t]*Group=/d }' /lib/systemd/system/alloy.service || echo "Could not modify /lib/systemd/system/alloy.service" + - sed -i '/^\[Service\]/,/^\[/ { /^[ \t]*User=/d; /^[ \t]*Group=/d }' /etc/systemd/system/alloy.service || echo "Could not modify /etc/systemd/system/alloy.service" + + - systemctl daemon-reload + - systemctl enable alloy + - systemctl restart alloy + diff --git a/sample-apps/databricks/jinja/variables/cloud-init.yaml b/sample-apps/databricks/jinja/variables/cloud-init.yaml new file mode 100644 index 0000000..4fd4e92 --- /dev/null +++ b/sample-apps/databricks/jinja/variables/cloud-init.yaml @@ -0,0 +1,16 @@ +# Databricks Exporter Configuration +# See README.md for instructions on obtaining these values + +# Grafana Cloud endpoints +loki_url: http://your-loki-instance:3100/loki/api/v1/push +loki_user: your_loki_username +loki_pass: your_loki_password +prom_url: http://your-prometheus-instance:9090/api/v1/push +prom_user: your_prometheus_username +prom_pass: your_prometheus_password + +# Databricks OAuth2 Service Principal credentials +databricks_server_hostname: your-workspace.cloud.databricks.com +databricks_warehouse_http_path: /sql/1.0/warehouses/your-warehouse-id +databricks_client_id: your-service-principal-client-id +databricks_client_secret: your-service-principal-client-secret diff --git a/sample-apps/databricks/tests/configs/databricks.config b/sample-apps/databricks/tests/configs/databricks.config new file mode 100644 index 0000000..7bf1ab0 --- /dev/null +++ b/sample-apps/databricks/tests/configs/databricks.config @@ -0,0 +1,2 @@ +JOB_LABEL=integrations/databricks + diff --git a/sample-apps/databricks/tests/metrics/databricks b/sample-apps/databricks/tests/metrics/databricks 
new file mode 100644 index 0000000..a1e3891 --- /dev/null +++ b/sample-apps/databricks/tests/metrics/databricks @@ -0,0 +1,20 @@ +databricks_billing_cost_estimate_usd +databricks_billing_dbus_total +databricks_job_run_duration_seconds +databricks_job_run_status +databricks_job_runs_total +databricks_job_sla_miss_total +databricks_pipeline_freshness_lag_seconds +databricks_pipeline_retry_events_total +databricks_pipeline_run_duration_seconds +databricks_pipeline_run_status +databricks_pipeline_runs_total +databricks_price_change_events +databricks_queries_running +databricks_queries_total +databricks_query_duration_seconds +databricks_query_errors_total +databricks_task_retries_total +databricks_up +up + From a2a4dac5d1fc3c4696b4d15884f444ee82592027 Mon Sep 17 00:00:00 2001 From: Ahmad Alhour Date: Wed, 14 Jan 2026 16:16:17 +0100 Subject: [PATCH 2/5] feat(runbook): refactor the sample-app to a runbook doc --- sample-apps/databricks/.CI_BYPASS | 4 +- sample-apps/databricks/Makefile | 86 ++--- sample-apps/databricks/README.md | 315 +++++++++++------- sample-apps/databricks/cloud-init.yaml | 209 ------------ .../databricks/configs/alloy-advanced.alloy | 74 ++++ .../databricks/configs/alloy-simple.alloy | 43 +++ .../jinja/templates/cloud-init-template.yaml | 213 ------------ .../jinja/variables/cloud-init.yaml | 16 - 8 files changed, 351 insertions(+), 609 deletions(-) delete mode 100644 sample-apps/databricks/cloud-init.yaml create mode 100644 sample-apps/databricks/configs/alloy-advanced.alloy create mode 100644 sample-apps/databricks/configs/alloy-simple.alloy delete mode 100644 sample-apps/databricks/jinja/templates/cloud-init-template.yaml delete mode 100644 sample-apps/databricks/jinja/variables/cloud-init.yaml diff --git a/sample-apps/databricks/.CI_BYPASS b/sample-apps/databricks/.CI_BYPASS index a155df8..8b3415c 100644 --- a/sample-apps/databricks/.CI_BYPASS +++ b/sample-apps/databricks/.CI_BYPASS @@ -1,3 +1,3 @@ # Databricks is a cloud-hosted SaaS service that cannot be spun up locally. -# This sample-app is excluded from CI runs. - +# This runbook provides manual setup instructions for existing Databricks workspaces. +# Configuration validation tests can still run in CI. diff --git a/sample-apps/databricks/Makefile b/sample-apps/databricks/Makefile index c2a9555..8daf301 100644 --- a/sample-apps/databricks/Makefile +++ b/sample-apps/databricks/Makefile @@ -1,54 +1,40 @@ -VM_NAME ?= databricks-sample-app -CONFIG_FILE_DIR := jinja/variables -CONFIG_FILE := $(CONFIG_FILE_DIR)/cloud-init.yaml -LOKI_INSTANCE ?= your-loki-instance:3100 -PROMETHEUS_INSTANCE ?= your-prometheus-instance:9090 +.PHONY: help validate-config test clean -# Import last to ensure expected parameters are set and available to import from this file -include ../Makefile - -.PHONY: run run-ci stop render-config launch-vm clean defaultconfig - -run: launch-vm - @echo "VM $(VM_NAME) launched and configured." +help: + @echo "Databricks Monitoring Runbook" @echo "" - @echo "IMPORTANT: You must configure jinja/variables/cloud-init.yaml with your Databricks credentials before launching." - @echo "See README.md for required configuration values." 
- -run-ci: clean defaultconfig launch-vm - @echo "Running in CI mode" - -stop: - @multipass stop $(VM_NAME) - @multipass delete $(VM_NAME) - @multipass purge - -render-config: - @docker run --rm -v $(shell pwd)/jinja/templates:/templates -v $(shell pwd)/jinja/variables:/variables dinutac/jinja2docker:latest /templates/cloud-init-template.yaml /variables/cloud-init.yaml --format=yaml > cloud-init.yaml - -launch-vm: render-config - @multipass launch -n $(VM_NAME) --disk 10G --cloud-init cloud-init.yaml + @echo "Available commands:" + @echo " make validate-config - Validate Alloy configuration syntax" + @echo " make test - Run metric validation tests" + @echo " make clean - Clean up temporary files" + @echo "" + @echo "This is a runbook, not an automated sample app." + @echo "Follow the instructions in README.md for manual setup." + +validate-config: + @echo "Validating Alloy configurations..." + @if command -v alloy > /dev/null 2>&1; then \ + alloy fmt configs/alloy-simple.alloy; \ + alloy fmt configs/alloy-advanced.alloy; \ + echo "✓ Configuration syntax is valid"; \ + else \ + echo "⚠ Alloy binary not found. Install from:"; \ + echo " https://grafana.com/docs/alloy/latest/get-started/install/"; \ + fi + +test: + @echo "Running metric validation tests..." + @echo "Note: This validates expected metrics, not live data." + @if [ -f tests/metrics/databricks ]; then \ + echo "Expected metrics ($(shell wc -l < tests/metrics/databricks | tr -d ' '))"; \ + cat tests/metrics/databricks; \ + else \ + echo "✗ Metric validation file not found"; \ + exit 1; \ + fi clean: - @rm -f cloud-init.yaml - @rm -rf $(CONFIG_FILE_DIR) - -defaultconfig: - @mkdir -p $(CONFIG_FILE_DIR) - @echo "# Databricks Exporter Configuration" > $(CONFIG_FILE) - @echo "# See README.md for instructions on obtaining these values" >> $(CONFIG_FILE) - @echo "" >> $(CONFIG_FILE) - @echo "# Grafana Cloud endpoints" >> $(CONFIG_FILE) - @echo "loki_url: http://$(LOKI_INSTANCE)/loki/api/v1/push" >> $(CONFIG_FILE) - @echo "loki_user: your_loki_username" >> $(CONFIG_FILE) - @echo "loki_pass: your_loki_password" >> $(CONFIG_FILE) - @echo "prom_url: http://$(PROMETHEUS_INSTANCE)/api/v1/push" >> $(CONFIG_FILE) - @echo "prom_user: your_prometheus_username" >> $(CONFIG_FILE) - @echo "prom_pass: your_prometheus_password" >> $(CONFIG_FILE) - @echo "" >> $(CONFIG_FILE) - @echo "# Databricks OAuth2 Service Principal credentials" >> $(CONFIG_FILE) - @echo "databricks_server_hostname: your-workspace.cloud.databricks.com" >> $(CONFIG_FILE) - @echo "databricks_warehouse_http_path: /sql/1.0/warehouses/your-warehouse-id" >> $(CONFIG_FILE) - @echo "databricks_client_id: your-service-principal-client-id" >> $(CONFIG_FILE) - @echo "databricks_client_secret: your-service-principal-client-secret" >> $(CONFIG_FILE) - + @echo "Cleaning up temporary files..." + @rm -f configs/*.tmp + @rm -f *.log + @echo "✓ Clean complete" diff --git a/sample-apps/databricks/README.md b/sample-apps/databricks/README.md index db6f057..2dabd4b 100644 --- a/sample-apps/databricks/README.md +++ b/sample-apps/databricks/README.md @@ -1,102 +1,167 @@ -# Databricks Sample Application +# Databricks Monitoring Runbook -This sample application sets up monitoring for Databricks using Grafana Alloy. The application runs the Databricks Prometheus Exporter in a containerized environment and includes automated setup using cloud-init. 
- -## Overview - -The sample application: -- Deploys the [Databricks Prometheus Exporter](https://github.com/grafana/databricks-prometheus-exporter) as a Docker container -- Configures Grafana Alloy to scrape metrics from the exporter -- Forwards metrics to Grafana Cloud (or any Prometheus-compatible remote write endpoint) -- Automates the entire setup process using cloud-init +This runbook guides you through setting up monitoring for Databricks using Grafana Alloy. Unlike automated sample apps, this requires manual configuration in your existing Databricks workspace. ## Prerequisites -- [Multipass](https://multipass.run/) for VM management +Before you begin, ensure you have the following: + - A Databricks workspace with Unity Catalog enabled -- A Databricks Service Principal with OAuth2 credentials -- A running SQL Warehouse (or one configured to auto-start) +- Administrative access to create Service Principals +- A SQL Warehouse (serverless is recommended for cost efficiency) +- Grafana Alloy installed on a host that can reach Databricks APIs +- Grafana Cloud credentials (or any Prometheus-compatible endpoint) + +## Quick Start + +To get started with this runbook, follow these steps: + +1. **Clone the repository**: + ```sh + git clone https://github.com/grafana/integration-sample-apps.git + cd sample-apps/databricks + ``` +1. **Configure Databricks** (follow Databricks Configuration section below) +1. **Configure Alloy**: + - Copy `configs/alloy-simple.alloy` to your Alloy config directory + - Update with your Databricks credentials and workspace details + - Restart Alloy service +1. **Verify metrics**: + - Query `databricks_up` in your Prometheus instance + - Check Alloy logs for successful scrapes ## Databricks Configuration -Before running the sample app, you need to set up authentication in your Databricks workspace. +### Step 1: Get your workspace hostname + +1. Copy your workspace URL subdomain, for example, `dbc-abc123-def456.cloud.databricks.com`. + +### Step 2: Create or configure SQL Warehouse + +1. Go to **SQL Warehouses** in the sidebar. +1. Either select an existing warehouse or click **Create SQL warehouse**: + - **Size**: 2X-Small (minimum size to reduce costs) + - **Auto stop**: After 10 minutes of inactivity + - **Scaling**: Min 1, Max 1 cluster +1. Click **Create**, then go to the **Connection Details** tab. +1. Copy the **HTTP path**, for example, `/sql/1.0/warehouses/abc123def456`. + +### Step 3: Create a Service Principal -### 1. Create a Service Principal +1. Click your workspace name (top-right) and select **Manage Account**. +1. Go to **User Management** > **Service Principals** tab > **Add service principal**. +1. Enter a name, for example, `grafana-cloud-integration`. +1. Go to **Credentials & secrets** tab > **OAuth secrets** > **Generate secret**. +1. Select the maximum lifetime (730 days) and click **Generate**. +1. Copy the **Client ID** and **Client Secret**. You will need both for the Alloy configuration. -1. Log into your Databricks workspace -2. Go to **Settings** → **Admin Console** → **Service Principals** -3. Click **Add Service Principal** -4. Note the **Application ID** (this is your Client ID) -5. Click **Generate Secret** under OAuth Secrets -6. Copy and securely store the **Client Secret** +### Step 4: Assign the Service Principal to your workspace -### 2. Grant Required Permissions +1. Go to **Workspaces** in the sidebar and select your workspace. +1. Go to the **Permissions** tab and click **Add permissions**. +1. 
Search for the Service Principal and assign it the **Admin** permission.
 
-Run these SQL commands as a Databricks admin (replace `<application-id>` with your Application ID):
+### Step 5: Grant SQL permissions to the Service Principal
+
+As a metastore admin or user with MANAGE privilege, run the following SQL statements in a query editor:
 
 ```sql
-GRANT MANAGE ON CATALOG system TO `<application-id>`;
-GRANT USE CATALOG ON CATALOG system TO `<application-id>`;
-GRANT USE SCHEMA ON SCHEMA system.billing TO `<application-id>`;
-GRANT SELECT ON SCHEMA system.billing TO `<application-id>`;
-GRANT USE SCHEMA ON SCHEMA system.query TO `<application-id>`;
-GRANT SELECT ON SCHEMA system.query TO `<application-id>`;
-GRANT USE SCHEMA ON SCHEMA system.lakeflow TO `<application-id>`;
-GRANT SELECT ON SCHEMA system.lakeflow TO `<application-id>`;
-GRANT SELECT ON TABLE system.lakeflow.pipeline_update_timeline TO `<application-id>`;
+GRANT USE CATALOG ON CATALOG system TO `<client-id>`;
+GRANT USE SCHEMA ON SCHEMA system.billing TO `<client-id>`;
+GRANT SELECT ON SCHEMA system.billing TO `<client-id>`;
+GRANT USE SCHEMA ON SCHEMA system.query TO `<client-id>`;
+GRANT SELECT ON SCHEMA system.query TO `<client-id>`;
+GRANT USE SCHEMA ON SCHEMA system.lakeflow TO `<client-id>`;
+GRANT SELECT ON SCHEMA system.lakeflow TO `<client-id>`;
 ```
 
-### 3. Get Configuration Values
+Replace `<client-id>` with your Service Principal's Client ID.
+
+Refer to the [Databricks documentation](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html) for detailed OAuth2 M2M setup instructions.
+
+## Alloy Configuration
+
+### Simple Configuration
+
+See [`configs/alloy-simple.alloy`](configs/alloy-simple.alloy) for a basic setup that collects all default metrics with recommended settings.
+
+### Advanced Configuration
 
-- **Server Hostname**: Found in your Databricks workspace URL (e.g., `dbc-abc123-def456.cloud.databricks.com`)
-- **Warehouse HTTP Path**: Go to **SQL Warehouses** → Select your warehouse → **Connection Details** → Copy **HTTP Path**
-- **Client ID**: The **Application ID** from step 1
-- **Client Secret**: The secret generated in step 1
+See [`configs/alloy-advanced.alloy`](configs/alloy-advanced.alloy) for a configuration with all optional parameters, tuning options, and metric filtering examples.
 
-## Configuration
+### Environment Variables
+
+Store sensitive credentials as environment variables:
 
-1. Create a configuration file:
 ```bash
-make defaultconfig
+export DATABRICKS_CLIENT_ID="your-application-id"
+export DATABRICKS_CLIENT_SECRET="your-client-secret"
+export PROMETHEUS_URL="https://prometheus-prod-us-central1.grafana.net/api/prom/push"
+export PROMETHEUS_USER="your-prometheus-username"
+export PROMETHEUS_PASS="your-prometheus-password"
 ```
 
-2. 
Edit `jinja/variables/cloud-init.yaml` with your credentials: -```yaml -# Grafana Cloud endpoints -loki_url: https://logs-prod-us-central1.grafana.net/loki/api/v1/push -loki_user: your_loki_username -loki_pass: your_loki_password -prom_url: https://prometheus-prod-us-central1.grafana.net/api/prom/push -prom_user: your_prometheus_username -prom_pass: your_prometheus_password - -# Databricks OAuth2 Service Principal credentials -databricks_server_hostname: dbc-abc123-def456.cloud.databricks.com -databricks_warehouse_http_path: /sql/1.0/warehouses/abc123def456 -databricks_client_id: your-application-id -databricks_client_secret: your-client-secret -``` +### Configuration Options + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `server_hostname` | Required | Databricks workspace hostname (e.g., `dbc-abc123.cloud.databricks.com`) | +| `warehouse_http_path` | Required | SQL Warehouse HTTP path (e.g., `/sql/1.0/warehouses/xyz`) | +| `client_id` | Required | OAuth2 Application ID of your Service Principal | +| `client_secret` | Required | OAuth2 Client Secret | +| `query_timeout` | `5m` | Timeout for individual SQL queries | +| `billing_lookback` | `24h` | How far back to query billing data | +| `jobs_lookback` | `3h` | How far back to query job runs | +| `pipelines_lookback` | `3h` | How far back to query pipeline runs | +| `queries_lookback` | `2h` | How far back to query SQL warehouse queries | +| `sla_threshold_seconds` | `3600` | Duration threshold for job SLA miss detection | +| `collect_task_retries` | `false` | Collect task-level retry metrics (⚠️ high cardinality) | + +### Tuning Recommendations -## Usage +- **`scrape_interval`**: Use 10-30 minutes. The exporter queries Databricks System Tables which can be slow and costly. Increase the interval to reduce SQL Warehouse usage. +- **`scrape_timeout`**: Must be less than `scrape_interval`. Typical scrapes take 90-120 seconds depending on data volume. +- **Lookback windows**: Should be at least 2x the scrape interval to ensure data continuity between scrapes. The defaults (`3h` for jobs and pipelines, `2h` for queries) work well with 10-30 minute scrape intervals. -### Running the Application +## Validating Metrics + +### Check Alloy Status -1. Launch the VM with the sample application: ```bash -make run +# Check Alloy service status +systemctl status alloy + +# View Alloy logs +journalctl -u alloy -f + +# Check metrics endpoint +curl http://localhost:12345/metrics | grep databricks ``` -This command will: -- Create a new VM named `databricks-sample-app` -- Configure the VM using cloud-init -- Deploy the Databricks exporter in a Docker container -- Set up Grafana Alloy for monitoring and forwarding metrics +### Verify in Prometheus -### Stopping the Application +Query for the health metric: -To stop and clean up the VM: -```bash -make stop +```promql +databricks_up{job="databricks"} +``` + +Should return `1` if the exporter is healthy. 
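+
+If it returns `0` or no data at all, you can alert on it. The expression below is a minimal example, reusing the `job="databricks"` label from the query above:
+
+```promql
+# Fires when the exporter reports unhealthy or the series disappears entirely
+(databricks_up{job="databricks"} == 0) or absent(databricks_up{job="databricks"})
+```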
+ +### Check Key Metrics + +```promql +# Billing metrics +databricks_billing_dbus_total + +# Job metrics +databricks_job_runs_total + +# Query metrics +databricks_queries_total + +# Exporter up/down +databricks_up ``` ## Metrics Collected @@ -106,18 +171,18 @@ The exporter collects 18 metrics across four categories: ### Billing Metrics - `databricks_billing_dbus_total` - Daily DBU consumption per workspace and SKU - `databricks_billing_cost_estimate_usd` - Estimated cost in USD -- `databricks_price_change_events` - Count of price changes per SKU +- `databricks_price_change_events_total` - Count of price changes per SKU ### Job Metrics - `databricks_job_runs_total` - Total job runs -- `databricks_job_run_status` - Job run counts by result state +- `databricks_job_run_status_total` - Job run counts by result state - `databricks_job_run_duration_seconds` - Job duration quantiles (p50, p95, p99) -- `databricks_task_retries_total` - Task retry counts +- `databricks_task_retries_total` - Task retry counts (optional, high cardinality) - `databricks_job_sla_miss_total` - Jobs exceeding SLA threshold ### Pipeline Metrics - `databricks_pipeline_runs_total` - Total pipeline runs -- `databricks_pipeline_run_status` - Pipeline runs by result state +- `databricks_pipeline_run_status_total` - Pipeline runs by result state - `databricks_pipeline_run_duration_seconds` - Pipeline duration quantiles - `databricks_pipeline_retry_events_total` - Pipeline retry counts - `databricks_pipeline_freshness_lag_seconds` - Data freshness lag @@ -131,62 +196,74 @@ The exporter collects 18 metrics across four categories: ### System Metrics - `databricks_up` - Exporter health (1 = healthy, 0 = unhealthy) -## Development +## Troubleshooting -### Project Structure -``` -databricks/ -├── .CI_BYPASS # Excludes from CI (cloud service) -├── Makefile # Build and deployment commands -├── README.md # This file -├── jinja/ -│ └── templates/ # Cloud-init templates -└── tests/ - ├── configs/ # Test configuration - └── metrics/ # Expected metrics list -``` +### Common Issues -### Available Make Commands +#### Authentication Errors (401) +**Symptom**: Alloy logs show `401 Unauthorized` -- `make run` - Launch the VM with the sample application -- `make stop` - Stop and delete the VM -- `make render-config` - Generate cloud-init configuration -- `make clean` - Clean up generated files -- `make defaultconfig` - Create default configuration template +**Solution**: +- Verify Client ID and Client Secret are correct +- Ensure the Service Principal exists and hasn't expired (check OAuth secret lifetime) +- Verify the Service Principal has workspace Admin permission -## Troubleshooting +#### No Metrics Appearing +**Symptom**: `databricks_up` returns no data or returns `0` -### Check VM Status -```bash -multipass info databricks-sample-app -``` +**Solution**: +- Check that the SQL Warehouse is running (or configured to auto-start) +- Verify the Service Principal has all required SQL permissions (re-run GRANT statements) +- Check Alloy logs for SQL query errors +- Verify network connectivity to `.cloud.databricks.com` -### View Cloud-Init Logs -```bash -multipass exec databricks-sample-app -- sudo cat /var/log/cloud-init-output.log -``` +#### SQL Permission Errors +**Symptom**: Alloy logs show `PERMISSION_DENIED` or `TABLE_OR_VIEW_NOT_FOUND` -### Check Alloy Status -```bash -multipass exec databricks-sample-app -- systemctl status alloy -``` +**Solution**: +- Re-run the GRANT SQL statements as a metastore admin +- Verify Unity Catalog is enabled 
in your workspace +- Check that System Tables are enabled (they should be by default with Unity Catalog) -### Check Exporter Container -```bash -multipass exec databricks-sample-app -- docker ps -multipass exec databricks-sample-app -- docker logs databricks-exporter -``` +#### Connection Timeouts +**Symptom**: Queries take longer than `scrape_timeout` -### Verify Metrics -```bash -multipass exec databricks-sample-app -- curl -s localhost:9976/metrics | head -50 -``` +**Solution**: +- Increase `scrape_timeout` (but keep it less than `scrape_interval`) +- Reduce lookback windows to query less data +- Use a larger SQL Warehouse size if queries are consistently slow +- Consider increasing `scrape_interval` to 20-30 minutes -### Common Issues +#### High Cardinality Warning +**Symptom**: Too many time series, high storage costs + +**Solution**: +- Disable `collect_task_retries` if enabled (this adds `task_key` label) +- Review metric cardinality with `databricks_*` queries in Prometheus +- Consider metric relabeling to drop high-cardinality labels (see `alloy-advanced.alloy` for examples) + +## Make Commands + +This runbook provides validation commands: + +- `make validate-config` - Validate Alloy configuration syntax +- `make test` - Run metric validation tests +- `make clean` - Clean up temporary files +- `make help` - Show available commands + +## Additional Resources + +- [Databricks OAuth2 M2M Documentation](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html) +- [Databricks System Tables Documentation](https://docs.databricks.com/en/admin/system-tables/index.html) +- [Grafana Alloy Documentation](https://grafana.com/docs/alloy/latest/) +- [Databricks Exporter GitHub](https://github.com/grafana/databricks-prometheus-exporter) +- [Integration Documentation](https://grafana.com/docs/grafana-cloud/monitor-infrastructure/integrations/integration-reference/integration-databricks/) -1. **Authentication Errors (401)**: Verify Client ID and Client Secret are correct -2. **No Metrics**: Check that the SQL Warehouse is running and the Service Principal has required permissions -3. **Connection Errors**: Verify the Server Hostname doesn't include `https://` prefix +## Platform Support -For more detailed troubleshooting, see the [Databricks Exporter README](https://github.com/grafana/databricks-prometheus-exporter#troubleshooting). +This runbook is platform-agnostic. Grafana Alloy can be installed on: +- Linux (systemd service) +- Docker (container) +- Kubernetes (Helm chart or operator) +Refer to the [Alloy installation documentation](https://grafana.com/docs/alloy/latest/get-started/install/) for your platform. diff --git a/sample-apps/databricks/cloud-init.yaml b/sample-apps/databricks/cloud-init.yaml deleted file mode 100644 index 8a290f8..0000000 --- a/sample-apps/databricks/cloud-init.yaml +++ /dev/null @@ -1,209 +0,0 @@ -#cloud-config -# Cloud-init configuration for setting up Alloy and Databricks exporter sample-app - -package_update: true -package_upgrade: false - -packages: - - git - - gpg - - curl - - wget - -write_files: - # Alloy profile - - owner: root:root - path: /etc/default/alloy - content: | - ## Path: - ## Description: Grafana Alloy settings - ## Type: string - ## Default: "" - ## ServiceRestart: alloy - # - # Command line options for Alloy. - # - # The configuration file holding the Alloy config. - CONFIG_FILE="/etc/alloy/config.alloy" - # User-defined arguments to pass to the run command. - CUSTOM_ARGS="--stability.level=experimental" - # Restart on system upgrade. 
Defaults to true. - RESTART_ON_UPGRADE=true - - # Alloy configuration - - owner: root:root - path: /etc/alloy/config.alloy - content: | - // Alloy self-monitoring - prometheus.exporter.self "alloy_check" { } - - discovery.relabel "alloy_check" { - targets = prometheus.exporter.self.alloy_check.targets - rule { - target_label = "instance" - replacement = constants.hostname - } - rule { - target_label = "alloy_hostname" - replacement = constants.hostname - } - rule { - target_label = "job" - replacement = "integrations/alloy-check" - } - } - - prometheus.scrape "alloy_check" { - targets = discovery.relabel.alloy_check.output - forward_to = [prometheus.relabel.alloy_check.receiver] - scrape_interval = "60s" - } - - prometheus.relabel "alloy_check" { - forward_to = [prometheus.remote_write.metrics_service.receiver] - rule { - source_labels = ["__name__"] - regex = "(prometheus_target_sync_length_seconds_sum|prometheus_target_scrapes_.*|prometheus_target_interval.*|prometheus_sd_discovered_targets|alloy_build.*|prometheus_remote_write_wal_samples_appended_total|process_start_time_seconds)" - action = "keep" - } - } - - // Databricks exporter scraping - // The exporter runs as a Docker container on localhost:9976 - prometheus.scrape "integrations_databricks" { - targets = [{ - __address__ = "localhost:9976", - }] - forward_to = [prometheus.relabel.integrations_databricks.receiver] - scrape_interval = "5m" - scrape_timeout = "4m" - job_name = "integrations/databricks" - } - - prometheus.relabel "integrations_databricks" { - forward_to = [prometheus.remote_write.metrics_service.receiver] - rule { - target_label = "instance" - replacement = constants.hostname - } - rule { - target_label = "job" - replacement = "integrations/databricks" - } - } - - prometheus.remote_write "metrics_service" { - endpoint { - url = "http://your-prometheus-instance:9090/api/v1/push" - basic_auth { - username = "your_prometheus_username" - password = "your_prometheus_password" - } - } - } - - // Docker log discovery for the databricks-exporter container - discovery.docker "databricks_exporter" { - host = "unix:///var/run/docker.sock" - refresh_interval = "5s" - filter { - name = "name" - values = ["databricks-exporter"] - } - } - - discovery.relabel "databricks_exporter" { - targets = discovery.docker.databricks_exporter.targets - rule { - source_labels = ["__meta_docker_container_name"] - target_label = "name" - replacement = "databricks-exporter" - } - rule { - source_labels = ["__meta_docker_container_name"] - target_label = "job" - replacement = "integrations/databricks" - } - rule { - source_labels = ["__meta_docker_container_name"] - target_label = "instance" - replacement = constants.hostname - } - } - - loki.source.docker "databricks_exporter" { - host = "unix:///var/run/docker.sock" - targets = discovery.docker.databricks_exporter.targets - forward_to = [loki.write.grafana_cloud_loki.receiver] - relabel_rules = discovery.relabel.databricks_exporter.rules - } - - loki.write "grafana_cloud_loki" { - endpoint { - url = "http://your-loki-instance:3100/loki/api/v1/push" - basic_auth { - username = "your_loki_username" - password = "your_loki_password" - } - } - } - -runcmd: - - mkdir -p /home/ubuntu - - mkdir -p /etc/apt/keyrings/ - # Create required directory for alloy - - mkdir -p /var/lib/alloy - - chown -R root:root /var/lib/alloy - - # Install Grafana repo - - curl -fsSL https://apt.grafana.com/gpg.key | gpg --dearmor -o /etc/apt/keyrings/grafana.gpg - - echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] 
https://apt.grafana.com stable main" > /etc/apt/sources.list.d/grafana.list - - # Install Docker repo - - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null - - - apt-get update - - DEBIAN_FRONTEND=noninteractive apt-get install -y alloy docker-ce docker-ce-cli containerd.io - - # Add ubuntu user to docker group - - groupadd -f docker - - usermod -aG docker ubuntu - - chmod 666 /var/run/docker.sock || true - - systemctl restart docker - - systemctl enable docker - - systemctl start docker - - # Run the Databricks exporter container - # NOTE: The exporter queries System Tables which can take 2-4 minutes - # Scrape interval should be 5m with a 4m timeout to avoid overlapping scrapes - - | - docker run -d \ - --name databricks-exporter \ - --restart unless-stopped \ - -p 9976:9976 \ - -e DATABRICKS_EXPORTER_SERVER_HOSTNAME="your-workspace.cloud.databricks.com" \ - -e DATABRICKS_EXPORTER_WAREHOUSE_HTTP_PATH="/sql/1.0/warehouses/your-warehouse-id" \ - -e DATABRICKS_EXPORTER_CLIENT_ID="your-service-principal-client-id" \ - -e DATABRICKS_EXPORTER_CLIENT_SECRET="your-service-principal-client-secret" \ - ghcr.io/grafana/databricks-prometheus-exporter:latest - - # Wait for the exporter to start - - | - for i in {1..30}; do - if curl -s http://localhost:9976/metrics > /dev/null 2>&1; then - echo "Databricks exporter is ready" - break - fi - echo "Waiting for Databricks exporter to start..." - sleep 2 - done - - # Configure Alloy to run as root (needed for Docker socket access) - - sed -i '/^\[Service\]/,/^\[/ { /^[ \t]*User=/d; /^[ \t]*Group=/d }' /lib/systemd/system/alloy.service || echo "Could not modify /lib/systemd/system/alloy.service" - - sed -i '/^\[Service\]/,/^\[/ { /^[ \t]*User=/d; /^[ \t]*Group=/d }' /etc/systemd/system/alloy.service || echo "Could not modify /etc/systemd/system/alloy.service" - - - systemctl daemon-reload - - systemctl enable alloy - - systemctl restart alloy - diff --git a/sample-apps/databricks/configs/alloy-advanced.alloy b/sample-apps/databricks/configs/alloy-advanced.alloy new file mode 100644 index 0000000..f717a51 --- /dev/null +++ b/sample-apps/databricks/configs/alloy-advanced.alloy @@ -0,0 +1,74 @@ +// Advanced Databricks monitoring configuration for Grafana Alloy +// +// This configuration includes all optional parameters and tuning options. +// Use this as a reference for customizing your setup. 
+// +// Prerequisites: +// - Databricks workspace with Unity Catalog enabled +// - Service Principal with OAuth2 credentials +// - SQL Warehouse configured and accessible +// +// Set environment variables before starting Alloy: +// export DATABRICKS_CLIENT_ID="your-application-id" +// export DATABRICKS_CLIENT_SECRET="your-client-secret" +// export PROMETHEUS_URL="https://prometheus-prod-us-central1.grafana.net/api/prom/push" +// export PROMETHEUS_USER="your-prometheus-username" +// export PROMETHEUS_PASS="your-prometheus-password" + +prometheus.exporter.databricks "example" { + // Required parameters + server_hostname = "dbc-abc123-def456.cloud.databricks.com" // Replace with your workspace hostname + warehouse_http_path = "/sql/1.0/warehouses/abc123def456" // Replace with your SQL Warehouse HTTP path + client_id = env("DATABRICKS_CLIENT_ID") + client_secret = env("DATABRICKS_CLIENT_SECRET") + + // Optional tuning parameters + query_timeout = "5m" // Timeout for individual SQL queries + billing_lookback = "24h" // How far back to query billing data (Databricks billing has 24-48h lag) + jobs_lookback = "3h" // How far back to query job runs + pipelines_lookback = "3h" // How far back to query pipeline runs + queries_lookback = "2h" // How far back to query SQL warehouse queries + sla_threshold_seconds = 3600 // Duration threshold (seconds) for job SLA miss detection + collect_task_retries = false // ⚠️ HIGH CARDINALITY: Collect task-level retry metrics (adds task_key label) +} + +prometheus.scrape "databricks" { + targets = prometheus.exporter.databricks.example.targets + forward_to = [prometheus.remote_write.grafana_cloud.receiver] + scrape_interval = "10m" // Recommended: 10-30 minutes (queries can be slow and costly) + scrape_timeout = "9m" // Must be < scrape_interval; typical scrapes take 90-120s + + // Optional: Enable clustering for high availability + clustering { + enabled = true + } +} + +prometheus.remote_write "grafana_cloud" { + endpoint { + url = env("PROMETHEUS_URL") + + basic_auth { + username = env("PROMETHEUS_USER") + password = env("PROMETHEUS_PASS") + } + } +} + +// Optional: Add metric relabeling to reduce cardinality or filter metrics +prometheus.relabel "databricks_metrics" { + forward_to = [prometheus.remote_write.grafana_cloud.receiver] + + // Example: Drop high-cardinality labels if needed + // rule { + // source_labels = ["task_key"] + // action = "labeldrop" + // } + + // Example: Keep only specific metrics + // rule { + // source_labels = ["__name__"] + // regex = "databricks_(up|billing_.*|job_run_status_total)" + // action = "keep" + // } +} diff --git a/sample-apps/databricks/configs/alloy-simple.alloy b/sample-apps/databricks/configs/alloy-simple.alloy new file mode 100644 index 0000000..2bf0b0e --- /dev/null +++ b/sample-apps/databricks/configs/alloy-simple.alloy @@ -0,0 +1,43 @@ +// Simple Databricks monitoring configuration for Grafana Alloy +// +// This configuration: +// - Scrapes metrics from Databricks System Tables using the built-in exporter +// - Forwards metrics to Grafana Cloud (or any Prometheus-compatible endpoint) +// - Uses environment variables for sensitive credentials +// +// Prerequisites: +// - Databricks workspace with Unity Catalog enabled +// - Service Principal with OAuth2 credentials +// - SQL Warehouse configured and accessible +// +// Set environment variables before starting Alloy: +// export DATABRICKS_CLIENT_ID="your-application-id" +// export DATABRICKS_CLIENT_SECRET="your-client-secret" +// export 
PROMETHEUS_URL="https://prometheus-prod-us-central1.grafana.net/api/prom/push" +// export PROMETHEUS_USER="your-prometheus-username" +// export PROMETHEUS_PASS="your-prometheus-password" + +prometheus.exporter.databricks "example" { + server_hostname = "dbc-abc123-def456.cloud.databricks.com" // Replace with your workspace hostname + warehouse_http_path = "/sql/1.0/warehouses/abc123def456" // Replace with your SQL Warehouse HTTP path + client_id = env("DATABRICKS_CLIENT_ID") + client_secret = env("DATABRICKS_CLIENT_SECRET") +} + +prometheus.scrape "databricks" { + targets = prometheus.exporter.databricks.example.targets + forward_to = [prometheus.remote_write.grafana_cloud.receiver] + scrape_interval = "10m" + scrape_timeout = "9m" +} + +prometheus.remote_write "grafana_cloud" { + endpoint { + url = env("PROMETHEUS_URL") + + basic_auth { + username = env("PROMETHEUS_USER") + password = env("PROMETHEUS_PASS") + } + } +} diff --git a/sample-apps/databricks/jinja/templates/cloud-init-template.yaml b/sample-apps/databricks/jinja/templates/cloud-init-template.yaml deleted file mode 100644 index 2ff1c1a..0000000 --- a/sample-apps/databricks/jinja/templates/cloud-init-template.yaml +++ /dev/null @@ -1,213 +0,0 @@ -#cloud-config -# Cloud-init configuration for setting up Alloy and Databricks exporter sample-app - -package_update: true -package_upgrade: false - -packages: - - git - - gpg - - curl - - wget - -write_files: - # Alloy profile - - owner: root:root - path: /etc/default/alloy - content: | - ## Path: - ## Description: Grafana Alloy settings - ## Type: string - ## Default: "" - ## ServiceRestart: alloy - # - # Command line options for Alloy. - # - # The configuration file holding the Alloy config. - CONFIG_FILE="/etc/alloy/config.alloy" - # User-defined arguments to pass to the run command. - CUSTOM_ARGS="--stability.level=experimental" - # Restart on system upgrade. Defaults to true. 
- RESTART_ON_UPGRADE=true - - # Alloy configuration - - owner: root:root - path: /etc/alloy/config.alloy - content: | - // Alloy self-monitoring - prometheus.exporter.self "alloy_check" { } - - discovery.relabel "alloy_check" { - targets = prometheus.exporter.self.alloy_check.targets - rule { - target_label = "instance" - replacement = constants.hostname - } - rule { - target_label = "alloy_hostname" - replacement = constants.hostname - } - rule { - target_label = "job" - replacement = "integrations/alloy-check" - } - } - - prometheus.scrape "alloy_check" { - targets = discovery.relabel.alloy_check.output - forward_to = [prometheus.relabel.alloy_check.receiver] - scrape_interval = "60s" - } - - prometheus.relabel "alloy_check" { - forward_to = [prometheus.remote_write.metrics_service.receiver] - rule { - source_labels = ["__name__"] - regex = "(prometheus_target_sync_length_seconds_sum|prometheus_target_scrapes_.*|prometheus_target_interval.*|prometheus_sd_discovered_targets|alloy_build.*|prometheus_remote_write_wal_samples_appended_total|process_start_time_seconds)" - action = "keep" - } - } - - // Databricks exporter scraping - // The exporter runs as a Docker container on localhost:9976 - prometheus.scrape "integrations_databricks" { - targets = [{ - __address__ = "localhost:9976", - }] - forward_to = [prometheus.relabel.integrations_databricks.receiver] - scrape_interval = "5m" - scrape_timeout = "4m" - job_name = "integrations/databricks" - } - - prometheus.relabel "integrations_databricks" { - forward_to = [prometheus.remote_write.metrics_service.receiver] - rule { - target_label = "instance" - replacement = constants.hostname - } - rule { - target_label = "job" - replacement = "integrations/databricks" - } - } - - prometheus.remote_write "metrics_service" { - endpoint { - url = "{{ prom_url }}" - {% if prom_user and prom_pass -%} - basic_auth { - username = "{{ prom_user }}" - password = "{{ prom_pass }}" - } - {%- endif %} - } - } - - // Docker log discovery for the databricks-exporter container - discovery.docker "databricks_exporter" { - host = "unix:///var/run/docker.sock" - refresh_interval = "5s" - filter { - name = "name" - values = ["databricks-exporter"] - } - } - - discovery.relabel "databricks_exporter" { - targets = discovery.docker.databricks_exporter.targets - rule { - source_labels = ["__meta_docker_container_name"] - target_label = "name" - replacement = "databricks-exporter" - } - rule { - source_labels = ["__meta_docker_container_name"] - target_label = "job" - replacement = "integrations/databricks" - } - rule { - source_labels = ["__meta_docker_container_name"] - target_label = "instance" - replacement = constants.hostname - } - } - - loki.source.docker "databricks_exporter" { - host = "unix:///var/run/docker.sock" - targets = discovery.docker.databricks_exporter.targets - forward_to = [loki.write.grafana_cloud_loki.receiver] - relabel_rules = discovery.relabel.databricks_exporter.rules - } - - loki.write "grafana_cloud_loki" { - endpoint { - url = "{{ loki_url }}" - {% if loki_user and loki_pass -%} - basic_auth { - username = "{{ loki_user }}" - password = "{{ loki_pass }}" - } - {%- endif %} - } - } - -runcmd: - - mkdir -p /home/ubuntu - - mkdir -p /etc/apt/keyrings/ - # Create required directory for alloy - - mkdir -p /var/lib/alloy - - chown -R root:root /var/lib/alloy - - # Install Grafana repo - - curl -fsSL https://apt.grafana.com/gpg.key | gpg --dearmor -o /etc/apt/keyrings/grafana.gpg - - echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] 
https://apt.grafana.com stable main" > /etc/apt/sources.list.d/grafana.list - - # Install Docker repo - - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null - - - apt-get update - - DEBIAN_FRONTEND=noninteractive apt-get install -y alloy docker-ce docker-ce-cli containerd.io - - # Add ubuntu user to docker group - - groupadd -f docker - - usermod -aG docker ubuntu - - chmod 666 /var/run/docker.sock || true - - systemctl restart docker - - systemctl enable docker - - systemctl start docker - - # Run the Databricks exporter container - # NOTE: The exporter queries System Tables which can take 2-4 minutes - # Scrape interval should be 5m with a 4m timeout to avoid overlapping scrapes - - | - docker run -d \ - --name databricks-exporter \ - --restart unless-stopped \ - -p 9976:9976 \ - -e DATABRICKS_EXPORTER_SERVER_HOSTNAME="{{ databricks_server_hostname }}" \ - -e DATABRICKS_EXPORTER_WAREHOUSE_HTTP_PATH="{{ databricks_warehouse_http_path }}" \ - -e DATABRICKS_EXPORTER_CLIENT_ID="{{ databricks_client_id }}" \ - -e DATABRICKS_EXPORTER_CLIENT_SECRET="{{ databricks_client_secret }}" \ - ghcr.io/grafana/databricks-prometheus-exporter:latest - - # Wait for the exporter to start - - | - for i in {1..30}; do - if curl -s http://localhost:9976/metrics > /dev/null 2>&1; then - echo "Databricks exporter is ready" - break - fi - echo "Waiting for Databricks exporter to start..." - sleep 2 - done - - # Configure Alloy to run as root (needed for Docker socket access) - - sed -i '/^\[Service\]/,/^\[/ { /^[ \t]*User=/d; /^[ \t]*Group=/d }' /lib/systemd/system/alloy.service || echo "Could not modify /lib/systemd/system/alloy.service" - - sed -i '/^\[Service\]/,/^\[/ { /^[ \t]*User=/d; /^[ \t]*Group=/d }' /etc/systemd/system/alloy.service || echo "Could not modify /etc/systemd/system/alloy.service" - - - systemctl daemon-reload - - systemctl enable alloy - - systemctl restart alloy - diff --git a/sample-apps/databricks/jinja/variables/cloud-init.yaml b/sample-apps/databricks/jinja/variables/cloud-init.yaml deleted file mode 100644 index 4fd4e92..0000000 --- a/sample-apps/databricks/jinja/variables/cloud-init.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Databricks Exporter Configuration -# See README.md for instructions on obtaining these values - -# Grafana Cloud endpoints -loki_url: http://your-loki-instance:3100/loki/api/v1/push -loki_user: your_loki_username -loki_pass: your_loki_password -prom_url: http://your-prometheus-instance:9090/api/v1/push -prom_user: your_prometheus_username -prom_pass: your_prometheus_password - -# Databricks OAuth2 Service Principal credentials -databricks_server_hostname: your-workspace.cloud.databricks.com -databricks_warehouse_http_path: /sql/1.0/warehouses/your-warehouse-id -databricks_client_id: your-service-principal-client-id -databricks_client_secret: your-service-principal-client-secret From 1d0a15cf18603cba6c21c96cc7adf40982d13a55 Mon Sep 17 00:00:00 2001 From: Ahmad Alhour Date: Wed, 14 Jan 2026 17:00:21 +0100 Subject: [PATCH 3/5] Remove the Makefile and update README.md --- sample-apps/databricks/Makefile | 40 -------------------------------- sample-apps/databricks/README.md | 9 ------- 2 files changed, 49 deletions(-) delete mode 100644 sample-apps/databricks/Makefile diff --git 
a/sample-apps/databricks/Makefile b/sample-apps/databricks/Makefile deleted file mode 100644 index 8daf301..0000000 --- a/sample-apps/databricks/Makefile +++ /dev/null @@ -1,40 +0,0 @@ -.PHONY: help validate-config test clean - -help: - @echo "Databricks Monitoring Runbook" - @echo "" - @echo "Available commands:" - @echo " make validate-config - Validate Alloy configuration syntax" - @echo " make test - Run metric validation tests" - @echo " make clean - Clean up temporary files" - @echo "" - @echo "This is a runbook, not an automated sample app." - @echo "Follow the instructions in README.md for manual setup." - -validate-config: - @echo "Validating Alloy configurations..." - @if command -v alloy > /dev/null 2>&1; then \ - alloy fmt configs/alloy-simple.alloy; \ - alloy fmt configs/alloy-advanced.alloy; \ - echo "✓ Configuration syntax is valid"; \ - else \ - echo "⚠ Alloy binary not found. Install from:"; \ - echo " https://grafana.com/docs/alloy/latest/get-started/install/"; \ - fi - -test: - @echo "Running metric validation tests..." - @echo "Note: This validates expected metrics, not live data." - @if [ -f tests/metrics/databricks ]; then \ - echo "Expected metrics ($(shell wc -l < tests/metrics/databricks | tr -d ' '))"; \ - cat tests/metrics/databricks; \ - else \ - echo "✗ Metric validation file not found"; \ - exit 1; \ - fi - -clean: - @echo "Cleaning up temporary files..." - @rm -f configs/*.tmp - @rm -f *.log - @echo "✓ Clean complete" diff --git a/sample-apps/databricks/README.md b/sample-apps/databricks/README.md index 2dabd4b..6ed8d88 100644 --- a/sample-apps/databricks/README.md +++ b/sample-apps/databricks/README.md @@ -242,15 +242,6 @@ The exporter collects 18 metrics across four categories: - Review metric cardinality with `databricks_*` queries in Prometheus - Consider metric relabeling to drop high-cardinality labels (see `alloy-advanced.alloy` for examples) -## Make Commands - -This runbook provides validation commands: - -- `make validate-config` - Validate Alloy configuration syntax -- `make test` - Run metric validation tests -- `make clean` - Clean up temporary files -- `make help` - Show available commands - ## Additional Resources - [Databricks OAuth2 M2M Documentation](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html) From 73662ffa119bb79c4c8308d2e6c11415592ba00f Mon Sep 17 00:00:00 2001 From: Ahmad Alhour Date: Wed, 14 Jan 2026 17:00:47 +0100 Subject: [PATCH 4/5] Remove test files. 
--- .../tests/configs/databricks.config | 2 -- .../databricks/tests/metrics/databricks | 20 ------------------- 2 files changed, 22 deletions(-) delete mode 100644 sample-apps/databricks/tests/configs/databricks.config delete mode 100644 sample-apps/databricks/tests/metrics/databricks diff --git a/sample-apps/databricks/tests/configs/databricks.config b/sample-apps/databricks/tests/configs/databricks.config deleted file mode 100644 index 7bf1ab0..0000000 --- a/sample-apps/databricks/tests/configs/databricks.config +++ /dev/null @@ -1,2 +0,0 @@ -JOB_LABEL=integrations/databricks - diff --git a/sample-apps/databricks/tests/metrics/databricks b/sample-apps/databricks/tests/metrics/databricks deleted file mode 100644 index a1e3891..0000000 --- a/sample-apps/databricks/tests/metrics/databricks +++ /dev/null @@ -1,20 +0,0 @@ -databricks_billing_cost_estimate_usd -databricks_billing_dbus_total -databricks_job_run_duration_seconds -databricks_job_run_status -databricks_job_runs_total -databricks_job_sla_miss_total -databricks_pipeline_freshness_lag_seconds -databricks_pipeline_retry_events_total -databricks_pipeline_run_duration_seconds -databricks_pipeline_run_status -databricks_pipeline_runs_total -databricks_price_change_events -databricks_queries_running -databricks_queries_total -databricks_query_duration_seconds -databricks_query_errors_total -databricks_task_retries_total -databricks_up -up - From 07ef52394e8a4fc559d5fb655876a331495912e6 Mon Sep 17 00:00:00 2001 From: Ahmad Alhour Date: Wed, 14 Jan 2026 17:05:17 +0100 Subject: [PATCH 5/5] Wrap contents with format --- .../databricks/configs/alloy-advanced.alloy | 75 +++++++++++-------- .../databricks/configs/alloy-simple.alloy | 19 ++--- 2 files changed, 52 insertions(+), 42 deletions(-) diff --git a/sample-apps/databricks/configs/alloy-advanced.alloy b/sample-apps/databricks/configs/alloy-advanced.alloy index f717a51..497c37f 100644 --- a/sample-apps/databricks/configs/alloy-advanced.alloy +++ b/sample-apps/databricks/configs/alloy-advanced.alloy @@ -4,16 +4,21 @@ // Use this as a reference for customizing your setup. 
// // Prerequisites: -// - Databricks workspace with Unity Catalog enabled -// - Service Principal with OAuth2 credentials -// - SQL Warehouse configured and accessible +// - Databricks workspace with Unity Catalog and System Tables enabled +// - Service Principal with OAuth2 M2M authentication configured +// - SQL Warehouse for querying System Tables (serverless recommended for cost efficiency) +// +// Tuning recommendations: +// - Lookback windows should be at least 2x the scrape_interval to ensure data continuity +// - With a 10-minute scrape interval, use at least 20 minutes of lookback +// - Increase scrape_interval to 20-30 minutes to reduce SQL Warehouse costs // // Set environment variables before starting Alloy: -// export DATABRICKS_CLIENT_ID="your-application-id" -// export DATABRICKS_CLIENT_SECRET="your-client-secret" +// export DATABRICKS_CLIENT_ID="<your-application-id>" +// export DATABRICKS_CLIENT_SECRET="<your-client-secret>" // export PROMETHEUS_URL="https://prometheus-prod-us-central1.grafana.net/api/prom/push" -// export PROMETHEUS_USER="your-prometheus-username" -// export PROMETHEUS_PASS="your-prometheus-password" +// export PROMETHEUS_USER="<your-prometheus-username>" +// export PROMETHEUS_PASS="<your-prometheus-password>" prometheus.exporter.databricks "example" { // Required parameters @@ -22,21 +27,22 @@ prometheus.exporter.databricks "example" { client_id = env("DATABRICKS_CLIENT_ID") client_secret = env("DATABRICKS_CLIENT_SECRET") - // Optional tuning parameters - query_timeout = "5m" // Timeout for individual SQL queries - billing_lookback = "24h" // How far back to query billing data (Databricks billing has 24-48h lag) - jobs_lookback = "3h" // How far back to query job runs - pipelines_lookback = "3h" // How far back to query pipeline runs - queries_lookback = "2h" // How far back to query SQL warehouse queries - sla_threshold_seconds = 3600 // Duration threshold (seconds) for job SLA miss detection - collect_task_retries = false // ⚠️ HIGH CARDINALITY: Collect task-level retry metrics (adds task_key label) + // Optional tuning parameters (all have defaults) + query_timeout = "5m" // Timeout for individual SQL queries (default: 5m) + billing_lookback = "24h" // How far back to query billing data (default: 24h, Databricks billing has 24-48h lag) + jobs_lookback = "3h" // How far back to query job runs (default: 3h) + pipelines_lookback = "3h" // How far back to query pipeline runs (default: 3h) + queries_lookback = "2h" // How far back to query SQL warehouse queries (default: 2h) + sla_threshold_seconds = 3600 // Duration threshold in seconds for job SLA miss detection (default: 3600) + collect_task_retries = false // Collect task retry metrics (default: false) ⚠️ HIGH CARDINALITY: adds task_key label } +// Configure a prometheus.scrape component to collect databricks metrics.
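+//
+// Illustrative sizing note (an assumption following the 2x lookback guidance
+// above, not an exporter requirement): if scrape_interval below were raised to
+// "30m", then jobs_lookback, pipelines_lookback and queries_lookback in the
+// exporter block should be at least "1h"; billing_lookback can stay at "24h"
+// because billing data already lags by 24-48 hours.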
prometheus.scrape "databricks" { targets = prometheus.exporter.databricks.example.targets forward_to = [prometheus.remote_write.grafana_cloud.receiver] - scrape_interval = "10m" // Recommended: 10-30 minutes (queries can be slow and costly) - scrape_timeout = "9m" // Must be < scrape_interval; typical scrapes take 90-120s + scrape_interval = "10m" // Recommended: 10-30 minutes (System Table queries can be slow and costly) + scrape_timeout = "9m" // Must be < scrape_interval (typical scrapes take 90-120s) // Optional: Enable clustering for high availability clustering { @@ -56,19 +62,22 @@ prometheus.remote_write "grafana_cloud" { } // Optional: Add metric relabeling to reduce cardinality or filter metrics -prometheus.relabel "databricks_metrics" { - forward_to = [prometheus.remote_write.grafana_cloud.receiver] - - // Example: Drop high-cardinality labels if needed - // rule { - // source_labels = ["task_key"] - // action = "labeldrop" - // } - - // Example: Keep only specific metrics - // rule { - // source_labels = ["__name__"] - // regex = "databricks_(up|billing_.*|job_run_status_total)" - // action = "keep" - // } -} +// To use this, change the prometheus.scrape forward_to to: +// forward_to = [prometheus.relabel.databricks_metrics.receiver] +// +// prometheus.relabel "databricks_metrics" { +// forward_to = [prometheus.remote_write.grafana_cloud.receiver] +// +// // Example: Drop high-cardinality labels if needed +// rule { +// source_labels = ["task_key"] +// action = "labeldrop" +// } +// +// // Example: Keep only specific metrics +// rule { +// source_labels = ["__name__"] +// regex = "databricks_(up|billing_.*|job_run_status_total)" +// action = "keep" +// } +// } diff --git a/sample-apps/databricks/configs/alloy-simple.alloy b/sample-apps/databricks/configs/alloy-simple.alloy index 2bf0b0e..3ee85eb 100644 --- a/sample-apps/databricks/configs/alloy-simple.alloy +++ b/sample-apps/databricks/configs/alloy-simple.alloy @@ -6,16 +6,16 @@ // - Uses environment variables for sensitive credentials // // Prerequisites: -// - Databricks workspace with Unity Catalog enabled -// - Service Principal with OAuth2 credentials -// - SQL Warehouse configured and accessible +// - Databricks workspace with Unity Catalog and System Tables enabled +// - Service Principal with OAuth2 M2M authentication configured +// - SQL Warehouse for querying System Tables (serverless recommended for cost efficiency) // // Set environment variables before starting Alloy: -// export DATABRICKS_CLIENT_ID="your-application-id" -// export DATABRICKS_CLIENT_SECRET="your-client-secret" +// export DATABRICKS_CLIENT_ID="" +// export DATABRICKS_CLIENT_SECRET="" // export PROMETHEUS_URL="https://prometheus-prod-us-central1.grafana.net/api/prom/push" -// export PROMETHEUS_USER="your-prometheus-username" -// export PROMETHEUS_PASS="your-prometheus-password" +// export PROMETHEUS_USER="" +// export PROMETHEUS_PASS="" prometheus.exporter.databricks "example" { server_hostname = "dbc-abc123-def456.cloud.databricks.com" // Replace with your workspace hostname @@ -24,11 +24,12 @@ prometheus.exporter.databricks "example" { client_secret = env("DATABRICKS_CLIENT_SECRET") } +// Configure a prometheus.scrape component to collect databricks metrics. 
prometheus.scrape "databricks" { targets = prometheus.exporter.databricks.example.targets forward_to = [prometheus.remote_write.grafana_cloud.receiver] - scrape_interval = "10m" - scrape_timeout = "9m" + scrape_interval = "10m" // Recommended: 10-30 minutes + scrape_timeout = "9m" // Must be < scrape_interval } prometheus.remote_write "grafana_cloud" {