diff --git a/examples/lading-logrotate.yaml b/examples/lading-logrotate.yaml
new file mode 100644
index 000000000..42a991200
--- /dev/null
+++ b/examples/lading-logrotate.yaml
@@ -0,0 +1,44 @@
+# file_to_blackhole_0ms_latency example using the logrotate generator
+#
+# This example demonstrates the logrotate generator (NOT logrotate_fs) for
+# environments where FUSE is unavailable (e.g., Kubernetes without privileged
+# containers).
+#
+# Key differences from logrotate_fs:
+# - Uses real filesystem writes to the `root` path (not a FUSE mount)
+# - Uses `throttle` config instead of `load_profile`
+# - Works in standard containers without FUSE/privileged mode
+#
+# The "0ms latency" comes from `timeout_millis: 0` in the throttle config,
+# meaning writes return immediately without simulating I/O delay.
+#
+# Throttle options:
+# - all_out: No rate limiting, produce as fast as possible
+# - stable: Fixed rate with optional timeout
+#   - bytes_per_second: Rate limit (e.g., "1 MiB", "10 MB")
+#   - timeout_millis: 0 = immediate writes, >0 = simulate I/O latency
+# - linear: Ramp up from initial to maximum rate over time
+#   - initial_bytes_per_second: Starting rate
+#   - maximum_bytes_per_second: Target rate
+#   - rate_of_change: How fast to increase (bytes/sec per second)
+
+generator:
+  - file_gen:
+      logrotate:
+        seed: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53,
+               59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131]
+        root: /tmp/logs
+        concurrent_logs: 8
+        maximum_bytes_per_log: 100MiB
+        total_rotations: 4
+        max_depth: 0
+        variant: "ascii"
+        maximum_prebuild_cache_size_bytes: 1GiB
+        throttle:
+          stable:
+            bytes_per_second: "1.3 MiB"
+            timeout_millis: 0
+
+blackhole:
+  - tcp:
+      binding_addr: "0.0.0.0:8080"
diff --git a/k8s/file_to_blackhole_0ms_latency/README.md b/k8s/file_to_blackhole_0ms_latency/README.md
new file mode 100644
index 000000000..a2fe505ad
--- /dev/null
+++ b/k8s/file_to_blackhole_0ms_latency/README.md
@@ -0,0 +1,63 @@
+# file_to_blackhole_0ms_latency Experiment
+
+Tests Datadog Agent log tailing under load from lading's `logrotate` file generator.
+
+## Overview
+
+This experiment:
+
+1. Deploys lading to generate rotating log files at `/var/log/lading/`
+2. Deploys the Datadog Agent configured to tail those log files
+3. Routes agent output to a blackhole (lading-intake)
+4. Monitors for OOMKills or restarts
+
+Uses the `logrotate` generator (not `logrotate_fs`), which writes to the real
+filesystem and doesn't require FUSE or privileged containers.
+
+## Prerequisites
+
+- kind: `brew install kind`
+- kubectl: `brew install kubectl`
+- helm: `brew install helm`
+- jq: `brew install jq`
+- Docker running
+
+## Usage
+
+```bash
+./k8s/file_to_blackhole_0ms_latency/experiment.sh \
+  --agent-memory 512 \
+  --tags "purpose:test,experiment:file_to_blackhole_0ms_latency"
+```
+
+### Options
+
+| Flag | Required | Description |
+|------|----------|-------------|
+| `--agent-memory` | Yes | Agent container memory limit in MB |
+| `--duration` | No | Test duration in seconds (default: 300) |
+| `--tags` | Yes | `DD_TAGS` value for the agent |
+
+## Load Configuration
+
+Default load: 8 concurrent log files at 1.3 MiB/s total throughput.
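+
+As a back-of-envelope check (assuming lading spreads the byte budget evenly
+across logs, which may not hold exactly): 1.3 MiB/s over 8 logs is roughly
+166 KiB/s per log, so each 100 MiB log fills and rotates about every 10
+minutes.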
+
+To modify, edit `manifests/lading.yaml`:
+
+```yaml
+throttle:
+  stable:
+    bytes_per_second: "1.3 MiB"  # Adjust rate
+    timeout_millis: 0            # 0 = immediate writes
+```
+
+## Results
+
+- **SUCCESS**: Agent survived the test duration without restarts
+- **FAILURE (OOMKilled)**: Agent needs more memory
+- **FAILURE (other)**: Configuration or stability issue
+
+## Cleanup
+
+```bash
+kind delete cluster --name lading-test
+```
diff --git a/k8s/file_to_blackhole_0ms_latency/experiment.sh b/k8s/file_to_blackhole_0ms_latency/experiment.sh
new file mode 100755
index 000000000..df2ce2477
--- /dev/null
+++ b/k8s/file_to_blackhole_0ms_latency/experiment.sh
@@ -0,0 +1,243 @@
+#!/bin/bash
+set -e
+
+# Parse arguments
+AGENT_MEMORY_MB=""
+DURATION=300
+DD_TAGS_VALUE=""
+
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --agent-memory)
+      AGENT_MEMORY_MB="$2"
+      shift 2
+      ;;
+    --duration)
+      DURATION="$2"
+      shift 2
+      ;;
+    --tags)
+      DD_TAGS_VALUE="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1"
+      echo "Usage: $0 --agent-memory <mb> [--duration <seconds>] --tags <tags>"
+      exit 1
+      ;;
+  esac
+done
+
+if [ -z "$AGENT_MEMORY_MB" ]; then
+  echo "ERROR: --agent-memory is required"
+  echo "Usage: $0 --agent-memory <mb> [--duration <seconds>] --tags <tags>"
+  exit 1
+fi
+
+if [ -z "$DD_TAGS_VALUE" ]; then
+  echo "ERROR: --tags is required"
+  echo "Usage: $0 --agent-memory <mb> [--duration <seconds>] --tags <tags>"
+  exit 1
+fi
+
+echo "========================================"
+echo "Datadog Agent Log Tailing Test"
+echo "========================================"
+echo "Agent memory limit: ${AGENT_MEMORY_MB} MB"
+echo "Test duration: ${DURATION} seconds"
+echo "Tags: ${DD_TAGS_VALUE}"
+echo "Started at: $(date)"
+echo
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "[1/6] Checking prerequisites..."
+command -v kind >/dev/null 2>&1 || { echo "ERROR: kind not found"; exit 1; }
+command -v kubectl >/dev/null 2>&1 || { echo "ERROR: kubectl not found"; exit 1; }
+command -v helm >/dev/null 2>&1 || { echo "ERROR: helm not found"; exit 1; }
+command -v jq >/dev/null 2>&1 || { echo "ERROR: jq not found"; exit 1; }
+echo "  ✓ Prerequisites available"
+echo
+
+echo "[2/6] Creating fresh cluster..."
+if kind get clusters 2>/dev/null | grep -q "^lading-test$"; then
+  echo "  Deleting existing cluster..."
+  kind delete cluster --name lading-test
+fi
+kind create cluster --name lading-test
+echo "  ✓ Cluster ready"
+echo
+
+echo "[3/6] Installing Prometheus..."
+kubectl create namespace monitoring 2>/dev/null || true
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts >/dev/null 2>&1 || true
+helm repo update >/dev/null 2>&1
+helm install prometheus prometheus-community/prometheus \
+  --namespace monitoring \
+  --set server.service.type=ClusterIP \
+  --set alertmanager.enabled=false \
+  --set prometheus-pushgateway.enabled=false \
+  --set kube-state-metrics.enabled=true >/dev/null 2>&1
+echo "  ✓ Prometheus installed"
+echo
+
+echo "[4/6] Installing Datadog Operator..."
+helm repo add datadog https://helm.datadoghq.com >/dev/null 2>&1 || true
+helm repo update >/dev/null 2>&1
+helm install datadog-operator datadog/datadog-operator --version 2.15.2 >/dev/null 2>&1
+echo "  Waiting for operator..."
+kubectl wait --for=condition=available --timeout=120s deployment/datadog-operator 2>/dev/null || sleep 30
+echo "  ✓ Operator ready"
+echo
+
+echo "[5/6] Applying manifests..."
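+# Apply order matters: the secret and egress policy must exist before the
+# agent starts, and lading-intake must be up so the agent has somewhere to
+# send its output. The log generator comes next, and the agent itself is
+# deployed last, once log files exist (see below).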
+kubectl apply -f "$SCRIPT_DIR/manifests/datadog-secret.yaml"
+kubectl apply -f "$SCRIPT_DIR/manifests/deny-egress.yaml"
+kubectl apply -f "$SCRIPT_DIR/manifests/lading-intake.yaml"
+
+# Deploy lading first so logs exist before agent starts
+kubectl apply -f "$SCRIPT_DIR/manifests/lading.yaml"
+echo "  ✓ Lading file generator deployed"
+
+# Wait for lading to create log files
+echo "  Waiting for log files..."
+TIMEOUT=120
+ELAPSED=0
+while [ $ELAPSED -lt $TIMEOUT ]; do
+  LADING_POD=$(kubectl get pods -l app=lading -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+  if [ -n "$LADING_POD" ]; then
+    POD_READY=$(kubectl get pod "$LADING_POD" -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
+    if [ "$POD_READY" = "true" ]; then
+      LOG_COUNT=$(kubectl exec "$LADING_POD" -- sh -c 'ls /var/log/lading/*.log 2>/dev/null | wc -l' || echo 0)
+      if [ "$LOG_COUNT" -gt 0 ]; then
+        echo "  ✓ Log files created ($LOG_COUNT files)"
+        break
+      fi
+    fi
+  fi
+  sleep 2
+  ELAPSED=$((ELAPSED + 2))
+done
+
+if [ $ELAPSED -ge $TIMEOUT ]; then
+  echo "  ✗ Timeout waiting for log files"
+  kubectl logs -l app=lading --tail=20
+  exit 1
+fi
+
+# Now deploy agent
+AGENT_MANIFEST=$(cat "$SCRIPT_DIR/manifests/datadog-agent.yaml" | \
+  sed "s/{{ AGENT_MEMORY_MB }}/${AGENT_MEMORY_MB}/g" | \
+  sed "s|{{ DD_TAGS }}|${DD_TAGS_VALUE}|g")
+
+if echo "$AGENT_MANIFEST" | grep -q "{{ AGENT_MEMORY_MB }}"; then
+  echo "  ✗ ERROR: Template substitution failed for memory placeholder"
+  exit 1
+fi
+if echo "$AGENT_MANIFEST" | grep -q "{{ DD_TAGS }}"; then
+  echo "  ✗ ERROR: Template substitution failed for DD_TAGS"
+  exit 1
+fi
+
+echo "$AGENT_MANIFEST" | kubectl apply -f -
+echo "  ✓ Agent deployed (egress blocked)"
+
+echo "  Waiting for agent..."
+TIMEOUT=120
+ELAPSED=0
+while [ $ELAPSED -lt $TIMEOUT ]; do
+  AGENT_POD=$(kubectl get pods -l app.kubernetes.io/name=datadog-agent-deployment -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+  if [ -n "$AGENT_POD" ]; then
+    READY=$(kubectl get pod "$AGENT_POD" -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
+    if [ "$READY" = "true" ]; then
+      echo "  ✓ Agent ready"
+      break
+    fi
+  fi
+  sleep 2
+  ELAPSED=$((ELAPSED + 2))
+done
+
+if [ $ELAPSED -ge $TIMEOUT ]; then
+  echo "  ✗ Timeout waiting for agent"
+  kubectl describe pods -l app.kubernetes.io/name=datadog-agent-deployment
+  exit 1
+fi
+
+# Check for any failed pods
+FAILED_PODS=$(kubectl get pods -o json | jq -r '.items[] | select(.status.phase == "Failed" or .status.phase == "Unknown") | .metadata.name')
+if [ -n "$FAILED_PODS" ]; then
+  echo "  ✗ Found failed pods:"
+  kubectl get pods
+  exit 1
+fi
+echo "  ✓ All systems healthy"
+echo
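+
+# The monitor below sums restartCount across all agent containers; any
+# increase fails the run. For a restarted container, the kubelet records the
+# cause in lastState.terminated.reason ("OOMKilled" when the kernel OOM killer
+# fired), which is what distinguishes a memory failure from other crashes.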
+# Monitor for restarts
+echo "[6/6] Monitoring for restarts (${DURATION}s)..."
+echo "  Started at: $(date)"
+ELAPSED=0
+LAST_REPORT=0
+
+while [ $ELAPSED -lt $DURATION ]; do
+  # Fold the assignment into the `if` so a transient kubectl failure is
+  # retried; under `set -e` a bare assignment-then-`$?` check would abort
+  # the script before the check ever ran.
+  if ! RESTART_DATA=$(kubectl get pods -l app.kubernetes.io/name=datadog-agent-deployment -o json 2>/dev/null); then
+    sleep 5
+    ELAPSED=$((ELAPSED + 5))
+    continue
+  fi
+
+  RESTART_COUNT=$(echo "$RESTART_DATA" | jq '[.items[].status.containerStatuses[]?.restartCount // 0] | add' 2>/dev/null || echo 0)
+  if [ -z "$RESTART_COUNT" ] || [ "$RESTART_COUNT" = "null" ]; then
+    RESTART_COUNT=0
+  fi
+
+  if [ $((ELAPSED - LAST_REPORT)) -ge 30 ]; then
+    REMAINING=$((DURATION - ELAPSED))
+    echo "  ${ELAPSED}s elapsed, ${REMAINING}s remaining (restarts: ${RESTART_COUNT})"
+    LAST_REPORT=$ELAPSED
+  fi
+
+  if [ "$RESTART_COUNT" -gt 0 ]; then
+    CONTAINER_NAME=$(echo "$RESTART_DATA" | jq -r '.items[].status.containerStatuses[]? | select(.restartCount > 0) | .name' 2>/dev/null | head -1)
+    REASON=$(echo "$RESTART_DATA" | jq -r '.items[].status.containerStatuses[]? | select(.restartCount > 0) | .lastState.terminated.reason // "Unknown"' 2>/dev/null | head -1)
+
+    echo
+    echo "========================================"
+    echo "RESULT: FAILURE"
+    echo "========================================"
+    echo "Container restarted: ${CONTAINER_NAME}"
+    echo "Restart count: ${RESTART_COUNT}"
+    echo "Reason: ${REASON}"
+    echo "Time to failure: ${ELAPSED}s"
+    echo
+
+    if [ "$REASON" = "OOMKilled" ]; then
+      echo "💡 Container needs MORE memory"
+    else
+      echo "⚠️ Non-OOM restart:"
+      kubectl logs -l app.kubernetes.io/name=datadog-agent-deployment -c "${CONTAINER_NAME}" --previous --tail=20
+    fi
+    echo "========================================"
+    exit 1
+  fi
+
+  sleep 5
+  ELAPSED=$((ELAPSED + 5))
+done
+
+echo "  Completed at: $(date)"
+echo
+
+echo "========================================"
+echo "RESULT: SUCCESS"
+echo "========================================"
+echo "No restarts detected"
+echo "Test duration: ${DURATION} seconds"
+echo "Tags: ${DD_TAGS_VALUE}"
+echo
+
+echo "💡 Agent stable - cluster is still running for examination"
+echo "   View lading metrics: kubectl port-forward svc/lading 9000:9000"
+echo "   View agent logs: kubectl logs -l app.kubernetes.io/name=datadog-agent-deployment -f"
diff --git a/k8s/file_to_blackhole_0ms_latency/manifests/datadog-agent.yaml b/k8s/file_to_blackhole_0ms_latency/manifests/datadog-agent.yaml
new file mode 100644
index 000000000..a55a38bc9
--- /dev/null
+++ b/k8s/file_to_blackhole_0ms_latency/manifests/datadog-agent.yaml
@@ -0,0 +1,79 @@
+apiVersion: datadoghq.com/v2alpha1
+kind: DatadogAgent
+metadata:
+  name: datadog
+  namespace: default
+spec:
+  global:
+    clusterName: lading-test
+    site: datadoghq.com
+    credentials:
+      apiSecret:
+        secretName: datadog-secret
+        keyName: api-key
+    endpoint:
+      url: http://lading-intake:8080
+
+  features:
+    logCollection:
+      enabled: true
+
+    prometheusScrape:
+      enabled: true
+      enableServiceEndpoints: true
+
+  override:
+    clusterAgent:
+      containers:
+        cluster-agent:
+          livenessProbe:
+            initialDelaySeconds: 60
+            periodSeconds: 30
+            timeoutSeconds: 10
+            failureThreshold: 10
+          readinessProbe:
+            initialDelaySeconds: 60
+            periodSeconds: 30
+            timeoutSeconds: 10
+            failureThreshold: 10
+    nodeAgent:
+      image:
+        name: gcr.io/datadoghq/agent:7.72.1
+      extraConfd:
+        configMap:
+          name: agent-logs-config
+      containers:
+        agent:
+          env:
+            - name: DD_HOSTNAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+            - name: DD_TAGS
+              value: "{{ DD_TAGS }}"
+          resources:
+            limits:
+              memory: "{{ AGENT_MEMORY_MB }}Mi"
+          volumeMounts:
+            - name: lading-logs
+              mountPath: /var/log/lading
+              readOnly: true
+      volumes:
+        - name: lading-logs
+          hostPath:
+            path: /var/log/lading
+            type: DirectoryOrCreate
+---
+# Log collection configuration for the agent
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: agent-logs-config
+  namespace: default
+data:
+  lading.yaml: |
+    logs:
+      - type: file
+        path: /var/log/lading/*.log
+        service: lading-logrotate
+        source: lading
diff --git a/k8s/file_to_blackhole_0ms_latency/manifests/datadog-secret.yaml b/k8s/file_to_blackhole_0ms_latency/manifests/datadog-secret.yaml
new file mode 100644
index 000000000..0a12fb4ea
--- /dev/null
+++ b/k8s/file_to_blackhole_0ms_latency/manifests/datadog-secret.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: datadog-secret
+  namespace: default
+type: Opaque
+stringData:
+  api-key: "fake-api-key-for-testing"
diff --git a/k8s/file_to_blackhole_0ms_latency/manifests/deny-egress.yaml b/k8s/file_to_blackhole_0ms_latency/manifests/deny-egress.yaml
new file mode 100644
index 000000000..b21e1293a
--- /dev/null
+++ b/k8s/file_to_blackhole_0ms_latency/manifests/deny-egress.yaml
@@ -0,0 +1,22 @@
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: deny-internet-egress
+  namespace: default
+spec:
+  podSelector: {}
+  policyTypes:
+    - Egress
+  egress:
+    # Allow DNS
+    - to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: kube-system
+      ports:
+        - protocol: UDP
+          port: 53
+    # Allow intra-cluster communication
+    - to:
+        - podSelector: {}
+    # Block everything else (internet egress blocked)
diff --git a/k8s/file_to_blackhole_0ms_latency/manifests/lading-intake.yaml b/k8s/file_to_blackhole_0ms_latency/manifests/lading-intake.yaml
new file mode 100644
index 000000000..10a086393
--- /dev/null
+++ b/k8s/file_to_blackhole_0ms_latency/manifests/lading-intake.yaml
@@ -0,0 +1,67 @@
+# Lading intake (blackhole) - mimics Datadog API to receive agent output
+#
+# This deployment acts as a fake Datadog backend for self-contained testing:
+# - Accepts agent API v2 submissions at :8080
+# - Discards all received data (blackhole mode)
+# - Allows testing without external Datadog connectivity
+# - Used with network isolation (deny-egress.yaml) to ensure the agent only talks to this intake
+# - Infinite runtime: runs until manually stopped
+#
+# The agent is configured to send to http://lading-intake:8080 instead of Datadog.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: lading-intake-config
+  namespace: default
+data:
+  lading.yaml: |
+    blackhole:
+      - datadog:
+          v2:
+            binding_addr: "0.0.0.0:8080"
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: lading-intake
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: lading-intake
+  template:
+    metadata:
+      labels:
+        app: lading-intake
+    spec:
+      containers:
+        - name: lading
+          image: ghcr.io/datadog/lading:0.29.2
+          args:
+            - "--config-path"
+            - "/etc/lading/lading.yaml"
+            - "--no-target"
+            - "--prometheus-addr"
+            - "0.0.0.0:9000"
+            - "--experiment-duration-infinite"
+          volumeMounts:
+            - name: config
+              mountPath: /etc/lading
+      volumes:
+        - name: config
+          configMap:
+            name: lading-intake-config
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: lading-intake
+  namespace: default
+spec:
+  selector:
+    app: lading-intake
+  ports:
+    - port: 8080
+      targetPort: 8080
diff --git a/k8s/file_to_blackhole_0ms_latency/manifests/lading.yaml b/k8s/file_to_blackhole_0ms_latency/manifests/lading.yaml
new file mode 100644
index 000000000..f43a26542
--- /dev/null
+++ b/k8s/file_to_blackhole_0ms_latency/manifests/lading.yaml
@@ -0,0 +1,94 @@
+# Lading file generator for file_to_blackhole experiment
+#
+# Generates rotating log files at /var/log/lading that the Datadog Agent tails.
+# Uses the logrotate generator (NOT logrotate_fs) - no FUSE required.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: lading-config
+  namespace: default
+data:
+  lading.yaml: |
+    generator:
+      - file_gen:
+          logrotate:
+            seed: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53,
+                   59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131]
+            root: /var/log/lading
+            concurrent_logs: 8
+            maximum_bytes_per_log: 100MiB
+            total_rotations: 4
+            max_depth: 1
+            variant: "ascii"
+            maximum_prebuild_cache_size_bytes: 1GiB
+            throttle:
+              stable:
+                bytes_per_second: "1.3 MiB"
+                timeout_millis: 0
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: lading
+  namespace: default
+  labels:
+    app: lading
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: lading
+  template:
+    metadata:
+      labels:
+        app: lading
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "9000"
+    spec:
+      containers:
+        - name: lading
+          image: ghcr.io/datadog/lading:0.29.2
+          command: ["lading"]
+          args:
+            - "--config-path"
+            - "/etc/lading/lading.yaml"
+            - "--no-target"
+            - "--prometheus-addr"
+            - "0.0.0.0:9000"
+            - "--experiment-duration-infinite"
+          resources:
+            limits:
+              memory: "4Gi"
+              cpu: "2"
+            requests:
+              memory: "4Gi"
+              cpu: "2"
+          volumeMounts:
+            - name: config
+              mountPath: /etc/lading
+              readOnly: true
+            - name: logs
+              mountPath: /var/log/lading
+      volumes:
+        - name: config
+          configMap:
+            name: lading-config
+        - name: logs
+          hostPath:
+            path: /var/log/lading
+            type: DirectoryOrCreate
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: lading
+  namespace: default
+spec:
+  selector:
+    app: lading
+  ports:
+    - name: prometheus
+      port: 9000
+      targetPort: 9000
diff --git a/k8s/uds_dogstatsd_to_api/README.md b/k8s/uds_dogstatsd_to_api/README.md
new file mode 100644
index 000000000..eab015805
--- /dev/null
+++ b/k8s/uds_dogstatsd_to_api/README.md
@@ -0,0 +1,93 @@
+# Lading in k8s Demonstration
+
+Testing setup to demonstrate memory limits for the Datadog Agent under lading load.
+
+The experiment is wired up through `experiment.sh`. That script takes a memory
+parameter for each configured Agent pod container, setting each as a limit in
+`manifests/datadog-agent.yaml`. The experiment runs for a given duration --
+300 seconds at a minimum is suggested -- and does two things:
+
+* watches for container restarts during the experiment, signaling failure if
+  one is detected, or
+* runs to the full experiment duration and queries Prometheus to calculate the
+  peak memory consumed by each Agent container, relative to its configured
+  limit.
+
+Experiments are **isolated from the internet** to avoid sending metrics and
+other telemetry to the actual Datadog intake. See `manifests/deny-egress.yaml`
+for details.
+
+## Prerequisites
+
+- kind: `brew install kind`
+- kubectl: `brew install kubectl`
+- helm: `brew install helm`
+- jq: `brew install jq`
+- python3: System Python 3
+- Docker running
+
+## Usage
+
+### Test a specific memory limit
+
+```bash
+# Test 2000 MB total for 5 minutes with explicit per-container limits
+./k8s/uds_dogstatsd_to_api/experiment.sh --total-limit 2000 --agent-memory 1200 --trace-memory 400 --sysprobe-memory 300 --process-memory 100 --tags "purpose:test,limit:2000mb"
+```
+
+All memory flags are mandatory and must sum to `--total-limit`, which acts as
+a cross-check on the intended budget.
+
+### To find a minimum memory limit
+
+Run the script multiple times with different limits, for example:
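+
+```bash
+# Illustrative manual bisection (values are examples, not recommendations):
+./k8s/uds_dogstatsd_to_api/experiment.sh --total-limit 2000 --agent-memory 1200 --trace-memory 400 --sysprobe-memory 300 --process-memory 100 --tags "purpose:test,limit:2000mb"
+# If that survives, shrink the agent budget and retry:
+./k8s/uds_dogstatsd_to_api/experiment.sh --total-limit 1400 --agent-memory 600 --trace-memory 400 --sysprobe-memory 300 --process-memory 100 --tags "purpose:test,limit:1400mb"
+```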
+
+Results are:
+
+- **OOMKilled** (FAILURE): Agent needs more memory, script exits
+- **Stable** (SUCCESS): Agent survived the test duration, cluster kept running
+  for examination
+
+## Manifests
+
+All manifests are in the `manifests/` directory. The script uses template
+substitution for:
+
+- **manifests/datadog-agent.yaml**: DatadogAgent CRD for the Datadog Operator
+  - Uses `{{ AGENT_MEMORY_MB }}`, `{{ TRACE_MEMORY_MB }}`,
+    `{{ SYSPROBE_MEMORY_MB }}`, `{{ PROCESS_MEMORY_MB }}`, and `{{ DD_TAGS }}`
+    placeholders
+  - Configured for DogStatsD via Unix domain socket at `/var/run/datadog/dsd.socket`
+  - Shares `/var/run/datadog` via hostPath with the lading pod
+
+- **manifests/lading.yaml**: Lading load generator (lading 0.29.2)
+  - ConfigMap with the exact config from the `uds_dogstatsd_to_api` test
+  - Sends 100 MiB/s of DogStatsD metrics
+  - High cardinality: 1k-10k contexts, many tags
+  - Service with Prometheus scrape annotations for lading metrics
+
+- **manifests/lading-intake.yaml**: Lading intake (blackhole) mimicking the
+  Datadog API (lading 0.29.2)
+  - Receives and discards agent output for self-contained testing
+
+- **manifests/datadog-secret.yaml**: Placeholder secret (fake API key, not validated)
+- **manifests/deny-egress.yaml**: NetworkPolicy blocking internet egress (security isolation)
+
+## Test configuration
+
+Taken from
+[`datadog-agent/test/regression/cases/uds_dogstatsd_to_api`](https://github.com/DataDog/datadog-agent/blob/main/test/regression/cases/uds_dogstatsd_to_api/lading/lading.yaml). This
+experiment is **high stress** for metrics intake, and high memory use from the
+`agent` container is expected.
+
+Adjust lading load generation in the ConfigMap called `lading-config`. Adjust
+Agent configuration in `manifests/datadog-agent.yaml`.
+
+## Cleanup
+
+The cluster is left online after the script exits. A re-run of `experiment.sh`
+will destroy and recreate the cluster.
+Manually clean up the cluster like so:
+
+```bash
+kind delete cluster --name lading-test
+```
+
+## Notes
+
+- **Agent version**: 7.72.1
+- **Lading version**: 0.29.2
+- **Agent features enabled**: APM (trace-agent), Log Collection, NPM/system-probe, DogStatsD, Prometheus scrape
diff --git a/k8s/uds_dogstatsd_to_api/analyze_memory.py b/k8s/uds_dogstatsd_to_api/analyze_memory.py
new file mode 100755
index 000000000..492fca0c5
--- /dev/null
+++ b/k8s/uds_dogstatsd_to_api/analyze_memory.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import sys
+import json
+import urllib.request
+import urllib.parse
+
+def query_container(prom_url, pod, container, duration):
+    # Peak working-set memory for one container over the whole test window.
+    query = f'max_over_time(container_memory_working_set_bytes{{namespace="default",pod="{pod}",container="{container}"}}[{duration}s])'
+    params = {'query': query}
+    url = f"{prom_url}?{urllib.parse.urlencode(params)}"
+
+    try:
+        with urllib.request.urlopen(url, timeout=10) as response:
+            data = json.loads(response.read().decode())
+
+        if data['status'] == 'success' and data['data']['result']:
+            value_bytes = float(data['data']['result'][0]['value'][1])
+            return data, value_bytes
+        return data, None
+    except Exception as e:
+        print(f"Error querying {container}: {e}", file=sys.stderr)
+        return None, None
+
+def main():
+    if len(sys.argv) != 8:
+        print("Usage: analyze_memory.py <prom_url> <pod> <duration_s> <agent_limit_mb> <trace_limit_mb> <sysprobe_limit_mb> <process_limit_mb>", file=sys.stderr)
+        sys.exit(1)
+
+    prom_url = sys.argv[1]
+    pod = sys.argv[2]
+    duration = sys.argv[3]
+    agent_limit = int(sys.argv[4])
+    trace_limit = int(sys.argv[5])
+    sysprobe_limit = int(sys.argv[6])
+    process_limit = int(sys.argv[7])
+    total_limit = agent_limit + trace_limit + sysprobe_limit + process_limit
+
+    containers = {
+        'agent': agent_limit,
+        'trace-agent': trace_limit,
+        'system-probe': sysprobe_limit,
+        'process-agent': process_limit
+    }
+
+    results = {}
+
+    for container, limit_mb in containers.items():
+        data, value_bytes = query_container(prom_url, pod, container, duration)
+
+        if value_bytes is not None:
+            value_mb = value_bytes / 1024 / 1024
+            percent = (value_mb / limit_mb) * 100
+            results[container] = (value_mb, limit_mb, percent)
+            print(f"  {container}: {value_mb:.2f} MB / {limit_mb} MB ({percent:.1f}%)")
+        else:
+            print(f"  {container}: Could not retrieve metrics")
+            results[container] = (0, limit_mb, 0)
+
+    # Calculate total
+    total_mb = sum(r[0] for r in results.values())
+    total_percent = (total_mb / total_limit) * 100
+    print(f"  TOTAL: {total_mb:.2f} MB / {total_limit} MB ({total_percent:.1f}%)")
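+
+# Example invocation, mirroring how experiment.sh calls this script
+# (pod name is illustrative):
+#   ./analyze_memory.py http://localhost:9090/api/v1/query datadog-agent-abcde 300 1200 400 300 100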
+
+if __name__ == '__main__':
+    main()
diff --git a/k8s/uds_dogstatsd_to_api/experiment.sh b/k8s/uds_dogstatsd_to_api/experiment.sh
new file mode 100755
index 000000000..537d7052e
--- /dev/null
+++ b/k8s/uds_dogstatsd_to_api/experiment.sh
@@ -0,0 +1,321 @@
+#!/bin/bash
+set -e
+
+# Parse arguments
+TOTAL_LIMIT=""
+AGENT_MEMORY_MB=""
+TRACE_MEMORY_MB=""
+SYSPROBE_MEMORY_MB=""
+PROCESS_MEMORY_MB=""
+DURATION=300
+DD_TAGS_VALUE=""
+
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --total-limit)
+      TOTAL_LIMIT="$2"
+      shift 2
+      ;;
+    --agent-memory)
+      AGENT_MEMORY_MB="$2"
+      shift 2
+      ;;
+    --trace-memory)
+      TRACE_MEMORY_MB="$2"
+      shift 2
+      ;;
+    --sysprobe-memory)
+      SYSPROBE_MEMORY_MB="$2"
+      shift 2
+      ;;
+    --process-memory)
+      PROCESS_MEMORY_MB="$2"
+      shift 2
+      ;;
+    --duration)
+      DURATION="$2"
+      shift 2
+      ;;
+    --tags)
+      DD_TAGS_VALUE="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1"
+      echo "Usage: $0 --total-limit <total_mb> --agent-memory <mb> --trace-memory <mb> --sysprobe-memory <mb> --process-memory <mb> [--duration <seconds>] --tags <tags>"
+      exit 1
+      ;;
+  esac
+done
+
+if [ -z "$TOTAL_LIMIT" ]; then
+  echo "ERROR: --total-limit is required"
+  echo "Usage: $0 --total-limit <total_mb> --agent-memory <mb> --trace-memory <mb> --sysprobe-memory <mb> --process-memory <mb> --tags <tags>"
+  exit 1
+fi
+
+if [ -z "$AGENT_MEMORY_MB" ]; then
+  echo "ERROR: --agent-memory is required"
+  echo "Usage: $0 --total-limit <total_mb> --agent-memory <mb> --trace-memory <mb> --sysprobe-memory <mb> --process-memory <mb> --tags <tags>"
+  exit 1
+fi
+
+if [ -z "$TRACE_MEMORY_MB" ]; then
+  echo "ERROR: --trace-memory is required"
+  echo "Usage: $0 --total-limit <total_mb> --agent-memory <mb> --trace-memory <mb> --sysprobe-memory <mb> --process-memory <mb> --tags <tags>"
+  exit 1
+fi
+
+if [ -z "$SYSPROBE_MEMORY_MB" ]; then
+  echo "ERROR: --sysprobe-memory is required"
+  echo "Usage: $0 --total-limit <total_mb> --agent-memory <mb> --trace-memory <mb> --sysprobe-memory <mb> --process-memory <mb> --tags <tags>"
+  exit 1
+fi
+
+if [ -z "$PROCESS_MEMORY_MB" ]; then
+  echo "ERROR: --process-memory is required"
+  echo "Usage: $0 --total-limit <total_mb> --agent-memory <mb> --trace-memory <mb> --sysprobe-memory <mb> --process-memory <mb> --tags <tags>"
+  exit 1
+fi
+
+if [ -z "$DD_TAGS_VALUE" ]; then
+  echo "ERROR: --tags is required"
+  echo "Usage: $0 --total-limit <total_mb> --agent-memory <mb> --trace-memory <mb> --sysprobe-memory <mb> --process-memory <mb> --tags <tags>"
+  exit 1
+fi
+
+# Verify individual limits sum to total.
+CALCULATED_TOTAL=$((AGENT_MEMORY_MB + TRACE_MEMORY_MB + SYSPROBE_MEMORY_MB + PROCESS_MEMORY_MB))
+if [ "$CALCULATED_TOTAL" -ne "$TOTAL_LIMIT" ]; then
+  echo "ERROR: Individual memory limits do not sum to total limit"
+  echo "Total limit: ${TOTAL_LIMIT} MB"
+  echo "Sum of individual limits: ${CALCULATED_TOTAL} MB (agent=${AGENT_MEMORY_MB} + trace=${TRACE_MEMORY_MB} + sysprobe=${SYSPROBE_MEMORY_MB} + process=${PROCESS_MEMORY_MB})"
+  exit 1
+fi
+
+TOTAL_MEMORY_MB=$TOTAL_LIMIT
+
+echo "========================================"
+echo "Datadog Agent Memory Limit Test"
+echo "========================================"
+echo "Memory limits per container:"
+echo "  agent: ${AGENT_MEMORY_MB} MB"
+echo "  trace-agent: ${TRACE_MEMORY_MB} MB"
+echo "  system-probe: ${SYSPROBE_MEMORY_MB} MB"
+echo "  process-agent: ${PROCESS_MEMORY_MB} MB"
+echo "  TOTAL: ${TOTAL_MEMORY_MB} MB"
+echo "Test duration: ${DURATION} seconds"
+echo "Tags: ${DD_TAGS_VALUE}"
+echo "Started at: $(date)"
+echo
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "[1/6] Checking prerequisites..."
+command -v kind >/dev/null 2>&1 || { echo "ERROR: kind not found"; exit 1; }
+command -v kubectl >/dev/null 2>&1 || { echo "ERROR: kubectl not found"; exit 1; }
+command -v helm >/dev/null 2>&1 || { echo "ERROR: helm not found"; exit 1; }
+command -v jq >/dev/null 2>&1 || { echo "ERROR: jq not found"; exit 1; }
+command -v bc >/dev/null 2>&1 || { echo "ERROR: bc not found"; exit 1; }
+echo "  ✓ Prerequisites available"
+echo
+
+echo "[2/6] Creating fresh cluster..."
+if kind get clusters 2>/dev/null | grep -q "^lading-test$"; then
+  echo "  Deleting existing cluster..."
+  kind delete cluster --name lading-test
+fi
+kind create cluster --name lading-test
+echo "  ✓ Cluster ready"
+echo
+
+echo "[3/6] Installing Prometheus..."
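+# Prometheus is only used for post-test analysis: analyze_memory.py reads the
+# cAdvisor working-set series exposed by the kubelet, which the chart's
+# default scrape configs collect.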
+kubectl create namespace monitoring 2>/dev/null || true
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts >/dev/null 2>&1 || true
+helm repo update >/dev/null 2>&1
+helm install prometheus prometheus-community/prometheus \
+  --namespace monitoring \
+  --set server.service.type=ClusterIP \
+  --set alertmanager.enabled=false \
+  --set prometheus-pushgateway.enabled=false \
+  --set kube-state-metrics.enabled=true >/dev/null 2>&1
+echo "  ✓ Prometheus installed"
+echo
+
+echo "[4/6] Installing Datadog Operator..."
+helm repo add datadog https://helm.datadoghq.com >/dev/null 2>&1 || true
+helm repo update >/dev/null 2>&1
+helm install datadog-operator datadog/datadog-operator --version 2.15.2 >/dev/null 2>&1
+echo "  Waiting for operator..."
+kubectl wait --for=condition=available --timeout=120s deployment/datadog-operator 2>/dev/null || sleep 30
+echo "  ✓ Operator ready"
+echo
+
+echo "[5/6] Applying manifests with ${TOTAL_MEMORY_MB} MB limit..."
+kubectl apply -f "$SCRIPT_DIR/manifests/datadog-secret.yaml"
+kubectl apply -f "$SCRIPT_DIR/manifests/deny-egress.yaml"
+kubectl apply -f "$SCRIPT_DIR/manifests/lading-intake.yaml"
+
+AGENT_MANIFEST=$(cat "$SCRIPT_DIR/manifests/datadog-agent.yaml" | \
+  sed "s/{{ AGENT_MEMORY_MB }}/${AGENT_MEMORY_MB}/g" | \
+  sed "s/{{ TRACE_MEMORY_MB }}/${TRACE_MEMORY_MB}/g" | \
+  sed "s/{{ SYSPROBE_MEMORY_MB }}/${SYSPROBE_MEMORY_MB}/g" | \
+  sed "s/{{ PROCESS_MEMORY_MB }}/${PROCESS_MEMORY_MB}/g" | \
+  sed "s|{{ DD_TAGS }}|${DD_TAGS_VALUE}|g")
+
+if echo "$AGENT_MANIFEST" | grep -q "{{ .*_MEMORY_MB }}"; then
+  echo "  ✗ ERROR: Template substitution failed for memory placeholders"
+  exit 1
+fi
+if echo "$AGENT_MANIFEST" | grep -q "{{ DD_TAGS }}"; then
+  echo "  ✗ ERROR: Template substitution failed for DD_TAGS"
+  exit 1
+fi
+
+echo "$AGENT_MANIFEST" | kubectl apply -f -
+echo "  ✓ Agent deployed (egress blocked)"
+
+# We wait for agent pods to be ready and the socket to be created before
+# starting the lading load generator instance.
+echo "  Waiting for agent and DogStatsD socket..."
+TIMEOUT=120
+ELAPSED=0
+while [ $ELAPSED -lt $TIMEOUT ]; do
+  AGENT_POD=$(kubectl get pods -l app.kubernetes.io/name=datadog-agent-deployment -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+  if [ -n "$AGENT_POD" ]; then
+    SOCKET_EXISTS=$(kubectl exec "$AGENT_POD" -c agent -- test -S /var/run/datadog/dsd.socket 2>/dev/null && echo "yes" || echo "no")
+    if [ "$SOCKET_EXISTS" = "yes" ]; then
+      echo "  ✓ DogStatsD socket ready"
+      break
+    fi
+  fi
+  sleep 2
+  ELAPSED=$((ELAPSED + 2))
+done
+
+if [ $ELAPSED -ge $TIMEOUT ]; then
+  echo "  ✗ Timeout waiting for DogStatsD socket"
+  exit 1
+fi
+
+# Now deploy lading load generator instance.
+kubectl apply -f "$SCRIPT_DIR/manifests/lading.yaml"
+echo "  ✓ Manifests applied"
+echo
+
+echo "  Waiting for lading health..."
+TIMEOUT=60
+ELAPSED=0
+while [ $ELAPSED -lt $TIMEOUT ]; do
+  LADING_POD=$(kubectl get pods -l app=lading -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+  if [ -n "$LADING_POD" ]; then
+    # Treat a successful fetch of the metrics endpoint as the health signal.
+    if kubectl exec "$LADING_POD" -- wget -q -O /dev/null --timeout=2 http://localhost:9000/metrics 2>/dev/null; then
+      echo "  ✓ Lading Prometheus endpoint healthy"
+      break
+    fi
+  fi
+  sleep 2
+  ELAPSED=$((ELAPSED + 2))
+done
+
+if [ $ELAPSED -ge $TIMEOUT ]; then
+  echo "  ✗ Timeout waiting for lading health"
+  kubectl logs -l app=lading --tail=20
+  exit 1
+fi
+
+# Check for any failed pods. None are expected; a failure here more likely
+# signals a misconfiguration than an invalid memory limit at this point.
+# Note: CrashLoopBackOff is not a pod phase, so it is detected via the
+# container's waiting reason instead.
+FAILED_PODS=$(kubectl get pods -o json | jq -r '.items[] | select((.status.phase == "Failed" or .status.phase == "Unknown") or ([.status.containerStatuses[]?.state.waiting.reason] | any(. == "CrashLoopBackOff"))) | .metadata.name')
+if [ -n "$FAILED_PODS" ]; then
+  echo "  ✗ Found failed pods:"
+  kubectl get pods
+  exit 1
+fi
+echo "  ✓ All systems healthy"
+echo
+
+# Monitor for restarts
+echo "[6/6] Monitoring for restarts (${DURATION}s)..."
+echo "  Started at: $(date)"
+MONITOR_START_TIME=$(date +%s)
+ELAPSED=0
+LAST_REPORT=0
+
+while [ $ELAPSED -lt $DURATION ]; do
+  # Fold the assignment into the `if` so a transient kubectl failure is
+  # retried rather than tripping `set -e`.
+  if ! RESTART_DATA=$(kubectl get pods -l app.kubernetes.io/name=datadog-agent-deployment -o json 2>/dev/null); then
+    sleep 5
+    ELAPSED=$((ELAPSED + 5))
+    continue
+  fi
+
+  RESTART_COUNT=$(echo "$RESTART_DATA" | jq '[.items[].status.containerStatuses[]?.restartCount // 0] | add' 2>/dev/null || echo 0)
+  if [ -z "$RESTART_COUNT" ] || [ "$RESTART_COUNT" = "null" ]; then
+    RESTART_COUNT=0
+  fi
+
+  if [ $((ELAPSED - LAST_REPORT)) -ge 30 ]; then
+    REMAINING=$((DURATION - ELAPSED))
+    echo "  ${ELAPSED}s elapsed, ${REMAINING}s remaining (restarts: ${RESTART_COUNT})"
+    LAST_REPORT=$ELAPSED
+  fi
+
+  if [ "$RESTART_COUNT" -gt 0 ]; then
+    CONTAINER_NAME=$(echo "$RESTART_DATA" | jq -r '.items[].status.containerStatuses[]? | select(.restartCount > 0) | .name' 2>/dev/null | head -1)
+    REASON=$(echo "$RESTART_DATA" | jq -r '.items[].status.containerStatuses[]? | select(.restartCount > 0) | .lastState.terminated.reason // "Unknown"' 2>/dev/null | head -1)
+
+    echo
+    echo "========================================"
+    echo "RESULT: FAILURE"
+    echo "========================================"
+    echo "Container restarted: ${CONTAINER_NAME}"
+    echo "Restart count: ${RESTART_COUNT}"
+    echo "Reason: ${REASON}"
+    echo "Time to failure: ${ELAPSED}s"
+    echo
+
+    if [ "$REASON" = "OOMKilled" ]; then
+      echo "💡 Container needs MORE memory"
+    else
+      echo "⚠️ Non-OOM restart:"
+      kubectl logs -l app.kubernetes.io/name=datadog-agent-deployment -c "${CONTAINER_NAME}" --previous --tail=20
+    fi
+    echo "========================================"
+    exit 1
+  fi
+
+  sleep 5
+  ELAPSED=$((ELAPSED + 5))
+done
+
+echo "  Completed at: $(date)"
+echo
+
+echo "========================================"
+echo "RESULT: SUCCESS"
+echo "========================================"
+echo "No restarts detected"
+echo "Test duration: ${DURATION} seconds"
+echo "Tags: ${DD_TAGS_VALUE}"
+echo
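+
+# The analysis below issues one instant query per container of the form
+# (pod name illustrative):
+#   max_over_time(container_memory_working_set_bytes{namespace="default",pod="datadog-agent-xxxxx",container="agent"}[300s])
+# i.e. the peak working-set over the full test window.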
+# Query Prometheus for per-container memory usage
+echo "Container memory usage:"
+AGENT_POD=$(kubectl get pods -l app.kubernetes.io/name=datadog-agent-deployment -o jsonpath='{.items[0].metadata.name}')
+
+# Port-forward Prometheus to localhost
+kubectl port-forward -n monitoring svc/prometheus-server 9090:80 >/dev/null 2>&1 &
+PROM_PID=$!
+sleep 3
+
+# Run Python analysis script
+python3 "$SCRIPT_DIR/analyze_memory.py" "http://localhost:9090/api/v1/query" "${AGENT_POD}" "${DURATION}" "${AGENT_MEMORY_MB}" "${TRACE_MEMORY_MB}" "${SYSPROBE_MEMORY_MB}" "${PROCESS_MEMORY_MB}"
+
+# Kill port-forward quietly; `|| true` keeps `set -e` from aborting on the
+# non-zero exit status of a killed process.
+kill $PROM_PID >/dev/null 2>&1 || true
+wait $PROM_PID 2>/dev/null || true
+echo
+
+echo "💡 Agent stable - cluster is still running for examination"
diff --git a/k8s/uds_dogstatsd_to_api/manifests/datadog-agent.yaml b/k8s/uds_dogstatsd_to_api/manifests/datadog-agent.yaml
new file mode 100644
index 000000000..c78f494e6
--- /dev/null
+++ b/k8s/uds_dogstatsd_to_api/manifests/datadog-agent.yaml
@@ -0,0 +1,84 @@
+apiVersion: datadoghq.com/v2alpha1
+kind: DatadogAgent
+metadata:
+  name: datadog
+  namespace: default
+spec:
+  global:
+    clusterName: lading-test
+    site: datadoghq.com
+    credentials:
+      apiSecret:
+        secretName: datadog-secret
+        keyName: api-key
+    endpoint:
+      url: http://lading-intake:8080
+
+  features:
+    apm:
+      enabled: true
+
+    logCollection:
+      enabled: true
+
+    dogstatsd:
+      unixDomainSocketConfig:
+        enabled: true
+        path: /var/run/datadog/dsd.socket
+
+    npm:
+      enabled: true
+
+    prometheusScrape:
+      enabled: true
+      enableServiceEndpoints: true
+
+  override:
+    clusterAgent:
+      containers:
+        cluster-agent:
+          livenessProbe:
+            initialDelaySeconds: 60
+            periodSeconds: 30
+            timeoutSeconds: 10
+            failureThreshold: 10
+          readinessProbe:
+            initialDelaySeconds: 60
+            periodSeconds: 30
+            timeoutSeconds: 10
+            failureThreshold: 10
+    nodeAgent:
+      image:
+        name: gcr.io/datadoghq/agent:7.72.1
+      containers:
+        agent:
+          env:
+            - name: DD_HOSTNAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+            - name: DD_TAGS
+              value: "{{ DD_TAGS }}"
+          resources:
+            limits:
+              memory: "{{ AGENT_MEMORY_MB }}Mi"
+          volumeMounts:
+            - name: dsdsocket
+              mountPath: /var/run/datadog
+        trace-agent:
+          resources:
+            limits:
+              memory: "{{ TRACE_MEMORY_MB }}Mi"
+        system-probe:
+          resources:
+            limits:
+              memory: "{{ SYSPROBE_MEMORY_MB }}Mi"
+        process-agent:
+          resources:
+            limits:
+              memory: "{{ PROCESS_MEMORY_MB }}Mi"
+      volumes:
+        # Shared with the lading pod via hostPath so both sides see the same
+        # DogStatsD socket.
+        - name: dsdsocket
+          hostPath:
+            path: /var/run/datadog
+            type: DirectoryOrCreate
diff --git a/k8s/uds_dogstatsd_to_api/manifests/datadog-secret.yaml b/k8s/uds_dogstatsd_to_api/manifests/datadog-secret.yaml
new file mode 100644
index 000000000..0a12fb4ea
--- /dev/null
+++ b/k8s/uds_dogstatsd_to_api/manifests/datadog-secret.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: datadog-secret
+  namespace: default
+type: Opaque
+stringData:
+  api-key: "fake-api-key-for-testing"
diff --git a/k8s/uds_dogstatsd_to_api/manifests/deny-egress.yaml b/k8s/uds_dogstatsd_to_api/manifests/deny-egress.yaml
new file mode 100644
index 000000000..b21e1293a
--- /dev/null
+++ b/k8s/uds_dogstatsd_to_api/manifests/deny-egress.yaml
@@ -0,0 +1,22 @@
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: deny-internet-egress
+  namespace: default
+spec:
+  podSelector: {}
+  policyTypes:
+    - Egress
+  egress:
+    # Allow DNS
+    - to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: kube-system
+      ports:
+        - protocol: UDP
+          port: 53
+    # Allow intra-cluster communication
+    - to:
+        - podSelector: {}
+    # Block everything else (internet egress blocked)
diff --git a/k8s/uds_dogstatsd_to_api/manifests/lading-intake.yaml b/k8s/uds_dogstatsd_to_api/manifests/lading-intake.yaml
new file mode 100644
index 000000000..10a086393
--- /dev/null
+++ b/k8s/uds_dogstatsd_to_api/manifests/lading-intake.yaml
@@ -0,0 +1,67 @@
+# Lading intake (blackhole) - mimics Datadog API to receive agent output
+#
+# This deployment acts as a fake Datadog backend for self-contained testing:
+# - Accepts agent API v2 submissions at :8080
+# - Discards all received data (blackhole mode)
+# - Allows testing without external Datadog connectivity
+# - Used with network isolation (deny-egress.yaml) to ensure the agent only talks to this intake
+# - Infinite runtime: runs until manually stopped
+#
+# The agent is configured to send to http://lading-intake:8080 instead of Datadog.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: lading-intake-config
+  namespace: default
+data:
+  lading.yaml: |
+    blackhole:
+      - datadog:
+          v2:
+            binding_addr: "0.0.0.0:8080"
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: lading-intake
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: lading-intake
+  template:
+    metadata:
+      labels:
+        app: lading-intake
+    spec:
+      containers:
+        - name: lading
+          image: ghcr.io/datadog/lading:0.29.2
+          args:
+            - "--config-path"
+            - "/etc/lading/lading.yaml"
+            - "--no-target"
+            - "--prometheus-addr"
+            - "0.0.0.0:9000"
+            - "--experiment-duration-infinite"
+          volumeMounts:
+            - name: config
+              mountPath: /etc/lading
+      volumes:
+        - name: config
+          configMap:
+            name: lading-intake-config
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: lading-intake
+  namespace: default
+spec:
+  selector:
+    app: lading-intake
+  ports:
+    - port: 8080
+      targetPort: 8080
diff --git a/k8s/uds_dogstatsd_to_api/manifests/lading.yaml b/k8s/uds_dogstatsd_to_api/manifests/lading.yaml
new file mode 100644
index 000000000..b38ee1372
--- /dev/null
+++ b/k8s/uds_dogstatsd_to_api/manifests/lading.yaml
@@ -0,0 +1,137 @@
+# Lading load generator - sends 100 MiB/s of DogStatsD metrics to the Datadog agent
+#
+# This deployment generates synthetic load matching the uds_dogstatsd_to_api regression test:
+# - High cardinality: 1,000-10,000 unique metric contexts
+# - Heavy tagging: 2-50 tags per metric, 3-150 chars each
+# - Unix domain socket: connects to agent at /var/run/datadog/dsd.socket
+# - Deterministic: uses fixed seed for reproducible load patterns
+# - Infinite runtime: runs until manually stopped
+#
+# The Service exposes Prometheus metrics at :9000/metrics for monitoring lading itself.
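+#
+# Rough scale (assuming an average DogStatsD message of a few hundred bytes):
+# 100 MiB/s works out to on the order of a few hundred thousand messages per
+# second against the socket.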
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: lading-config
+  namespace: default
+data:
+  lading.yaml: |
+    # From datadog-agent test/regression/cases/uds_dogstatsd_to_api
+    generator:
+      - unix_datagram:
+          seed: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53,
+                 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131]
+          path: "/var/run/datadog/dsd.socket"
+          variant:
+            dogstatsd:
+              contexts:
+                inclusive:
+                  min: 1000
+                  max: 10000
+              name_length:
+                inclusive:
+                  min: 1
+                  max: 200
+              tag_length:
+                inclusive:
+                  min: 3
+                  max: 150
+              tags_per_msg:
+                inclusive:
+                  min: 2
+                  max: 50
+              multivalue_count:
+                inclusive:
+                  min: 2
+                  max: 32
+              multivalue_pack_probability: 0.08
+              kind_weights:
+                metric: 90
+                event: 5
+                service_check: 5
+              metric_weights:
+                count: 100
+                gauge: 10
+                timer: 0
+                distribution: 0
+                set: 0
+                histogram: 0
+          bytes_per_second: "100 MiB"
+          maximum_prebuild_cache_size_bytes: "500 MiB"
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: lading
+  namespace: default
+  labels:
+    app: lading
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: lading
+  template:
+    metadata:
+      labels:
+        app: lading
+    spec:
+      containers:
+        - name: lading
+          image: ghcr.io/datadog/lading:0.29.2
+          command: ["lading"]
+          args:
+            - "--config-path"
+            - "/etc/lading/lading.yaml"
+            - "--no-target"
+            - "--prometheus-addr"
+            - "0.0.0.0:9000"
+            - "--experiment-duration-infinite"
+          resources:
+            limits:
+              memory: "4Gi"
+              cpu: "2"
+            requests:
+              memory: "4Gi"
+              cpu: "2"
+          volumeMounts:
+            - name: config
+              mountPath: /etc/lading
+              readOnly: true
+            - name: dsdsocket
+              mountPath: /var/run/datadog
+      volumes:
+        - name: config
+          configMap:
+            name: lading-config
+        - name: dsdsocket
+          hostPath:
+            path: /var/run/datadog
+            type: DirectoryOrCreate
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: lading
+  namespace: default
+  annotations:
+    ad.datadoghq.com/service.checks: |
+      {
+        "openmetrics": {
+          "init_config": {},
+          "instances": [
+            {
+              "openmetrics_endpoint": "http://%%host%%:9000/metrics",
+              "namespace": "lading",
+              "metrics": [".*"]
+            }
+          ]
+        }
+      }
+spec:
+  selector:
+    app: lading
+  ports:
+    - name: prometheus
+      port: 9000
+      targetPort: 9000
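+# To inspect lading's own metrics from outside the cluster, e.g.:
+#   kubectl port-forward svc/lading 9000:9000
+#   curl http://localhost:9000/metrics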