Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions examples/lading-logrotate.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# file_to_blackhole_0ms_latency example using logrotate generator
#
# This example demonstrates the logrotate generator (NOT logrotate_fs) for
# environments where FUSE is unavailable (e.g., Kubernetes without privileged
# containers).
#
# Key differences from logrotate_fs:
# - Uses real filesystem writes to `root` path (not a FUSE mount)
# - Uses `throttle` config instead of `load_profile`
# - Works in standard containers without FUSE/privileged mode
#
# The "0ms latency" comes from `timeout_millis: 0` in the throttle config,
# meaning writes return immediately without simulating I/O delay.
#
# Throttle options:
#   - all_out: No rate limiting, produce as fast as possible
#   - stable: Fixed rate with optional timeout
#       - bytes_per_second: Rate limit (e.g., "1 MiB", "10 MB")
#       - timeout_millis: 0 = immediate writes, >0 = simulate I/O latency
#   - linear: Ramp up from initial to maximum rate over time
#       - initial_bytes_per_second: Starting rate
#       - maximum_bytes_per_second: Target rate
#       - rate_of_change: How fast to increase (bytes/sec per second)

# One generator entry: the `logrotate` variant of `file_gen`, which writes
# rotating log files under `root` on the real filesystem.
generator:
  - file_gen:
      logrotate:
        # Fixed 32-element seed so repeated runs use the same seed value.
        seed: [
          2, 3, 5, 7, 11, 13, 17, 19,
          23, 29, 31, 37, 41, 43, 47, 53,
          59, 61, 67, 71, 73, 79, 83, 89,
          97, 101, 103, 107, 109, 113, 127, 131,
        ]
        root: /tmp/logs
        concurrent_logs: 8
        maximum_bytes_per_log: 100MiB
        total_rotations: 4
        max_depth: 0
        variant: "ascii"
        maximum_prebuild_cache_size_bytes: 1GiB
        # `stable` throttle: fixed byte rate; timeout_millis 0 is the
        # "0ms latency" setting described in the header of this file.
        throttle:
          stable:
            bytes_per_second: "1.3 MiB"
            timeout_millis: 0

# TCP blackhole bound on all interfaces, port 8080.
blackhole:
  - tcp:
      binding_addr: "0.0.0.0:8080"
63 changes: 63 additions & 0 deletions k8s/file_to_blackhole_0ms_latency/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# file_to_blackhole_0ms_latency Experiment

Tests Datadog Agent log tailing under load from lading's `logrotate` file generator.

## Overview

This experiment:
1. Deploys lading to generate rotating log files at `/var/log/lading/`
2. Deploys Datadog Agent configured to tail those log files
3. Routes agent output to a blackhole (lading-intake)
4. Monitors for OOMKills or restarts

Uses the `logrotate` generator (not `logrotate_fs`), which writes to the real
filesystem and does not require FUSE or privileged containers.

## Prerequisites

- kind: `brew install kind`
- kubectl: `brew install kubectl`
- helm: `brew install helm`
- jq: `brew install jq`
- Docker running

## Usage

```bash
./k8s/file_to_blackhole_0ms_latency/experiment.sh \
--agent-memory 512 \
--tags "purpose:test,experiment:file_to_blackhole_0ms_latency"
```

### Options

| Flag | Required | Description |
|------|----------|-------------|
| `--agent-memory` | Yes | Agent container memory limit in MB |
| `--duration` | No | Test duration in seconds (default: 300) |
| `--tags` | Yes | DD_TAGS value for the agent |

## Load Configuration

Default load: 8 concurrent log files at 1.3 MiB/s total throughput.

To modify, edit `manifests/lading.yaml`:

```yaml
throttle:
  stable:
    bytes_per_second: "1.3 MiB"  # Adjust rate
    timeout_millis: 0            # 0 = immediate writes
```

## Results

- **SUCCESS**: Agent survived test duration without restarts
- **FAILURE (OOMKilled)**: Agent needs more memory
- **FAILURE (other)**: Configuration or stability issue

## Cleanup

```bash
kind delete cluster --name lading-test
```
243 changes: 243 additions & 0 deletions k8s/file_to_blackhole_0ms_latency/experiment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
#!/bin/bash
#
# Command-line handling for the Datadog Agent log tailing test.
#
# Usage:
#   experiment.sh --agent-memory <MB> [--duration <seconds>] --tags <DD_TAGS>
#
# Sets:
#   AGENT_MEMORY_MB  agent memory limit, whole number (required)
#   DURATION         monitoring time in seconds, whole number (default 300)
#   DD_TAGS_VALUE    tag string handed to the agent manifest (required)
#
# Exits with status 1 and a usage line on any invalid invocation.
set -e

# Parse arguments
AGENT_MEMORY_MB=""
DURATION=300
DD_TAGS_VALUE=""

# Prints the one-line usage summary.
print_usage() {
  echo "Usage: $0 --agent-memory <MB> [--duration <seconds>] --tags <DD_TAGS>"
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --agent-memory | --duration | --tags)
      # Every flag takes a value. Check before `shift 2`: when the value is
      # missing, `shift 2` fails and `set -e` would end the script without
      # printing anything.
      if [[ $# -lt 2 ]]; then
        echo "ERROR: $1 requires a value"
        print_usage
        exit 1
      fi
      case "$1" in
        --agent-memory) AGENT_MEMORY_MB="$2" ;;
        --duration) DURATION="$2" ;;
        --tags) DD_TAGS_VALUE="$2" ;;
      esac
      shift 2
      ;;
    *)
      echo "Unknown option: $1"
      print_usage
      exit 1
      ;;
  esac
done

if [ -z "$AGENT_MEMORY_MB" ]; then
  echo "ERROR: --agent-memory is required"
  print_usage
  exit 1
fi

# Reject non-numeric values now rather than after the cluster has been built.
if ! [[ "$AGENT_MEMORY_MB" =~ ^[0-9]+$ ]]; then
  echo "ERROR: --agent-memory must be a whole number of MB (got: ${AGENT_MEMORY_MB})"
  print_usage
  exit 1
fi

if [ -z "$DD_TAGS_VALUE" ]; then
  echo "ERROR: --tags is required"
  print_usage
  exit 1
fi

if ! [[ "$DURATION" =~ ^[0-9]+$ ]]; then
  echo "ERROR: --duration must be a whole number of seconds (got: ${DURATION})"
  print_usage
  exit 1
fi

# Force base 10 so a value with a leading zero (e.g. "060") is not read as
# octal by the arithmetic expansions used later in the script.
AGENT_MEMORY_MB=$((10#$AGENT_MEMORY_MB))
DURATION=$((10#$DURATION))

# Print the run header with the effective settings.
banner_rule="========================================"
printf '%s\n' \
  "$banner_rule" \
  "Datadog Agent Log Tailing Test" \
  "$banner_rule" \
  "Agent memory limit: ${AGENT_MEMORY_MB} MB" \
  "Test duration: ${DURATION} seconds" \
  "Tags: ${DD_TAGS_VALUE}" \
  "Started at: $(date)" \
  ""

# Directory containing this script; manifests are resolved relative to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "[1/6] Checking prerequisites..."
# Stop at the first required tool that is not on PATH.
for required_tool in kind kubectl helm jq; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "ERROR: ${required_tool} not found"
    exit 1
  fi
done
echo " ✓ Prerequisites available"
echo

echo "[2/6] Creating fresh cluster..."
# Always start from an empty cluster: remove a leftover one with the same
# name before creating it again.
kind_cluster="lading-test"
if kind get clusters 2>/dev/null | grep -qxF "$kind_cluster"; then
  echo " Deleting existing cluster..."
  kind delete cluster --name "$kind_cluster"
fi
kind create cluster --name "$kind_cluster"
echo " ✓ Cluster ready"
echo

# Runs the given command with its output hidden. If the command fails, the
# captured output is printed so that the abort caused by `set -e` is not
# silent.
# Arguments: the command and its arguments
# Returns:   the command's exit status
run_quiet() {
  local output status=0
  output=$("$@" 2>&1) || status=$?
  if [ "$status" -ne 0 ]; then
    echo " ✗ Command failed (exit ${status}): $*"
    printf '%s\n' "$output"
  fi
  return "$status"
}

echo "[3/6] Installing Prometheus..."
# Namespace creation and repo registration are best-effort.
kubectl create namespace monitoring 2>/dev/null || true
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts >/dev/null 2>&1 || true
run_quiet helm repo update
run_quiet helm install prometheus prometheus-community/prometheus \
  --namespace monitoring \
  --set server.service.type=ClusterIP \
  --set alertmanager.enabled=false \
  --set prometheus-pushgateway.enabled=false \
  --set kube-state-metrics.enabled=true
echo " ✓ Prometheus installed"
echo

echo "[4/6] Installing Datadog Operator..."
helm repo add datadog https://helm.datadoghq.com >/dev/null 2>&1 || true
run_quiet helm repo update
run_quiet helm install datadog-operator datadog/datadog-operator --version 2.15.2
echo " Waiting for operator..."
# Still best-effort: if the wait fails, fall back to a fixed pause as before,
# but say so instead of claiming the operator is ready.
if kubectl wait --for=condition=available --timeout=120s deployment/datadog-operator 2>/dev/null; then
  echo " ✓ Operator ready"
else
  echo " ⚠ Operator not reported available; continuing after a 30s grace period"
  sleep 30
fi
echo

echo "[5/6] Applying manifests..."
# Supporting manifests are applied one by one in a fixed order.
for manifest_name in datadog-secret deny-egress lading-intake; do
  kubectl apply -f "$SCRIPT_DIR/manifests/${manifest_name}.yaml"
done

# The generator goes in before the agent so that log files already exist
# when the agent starts.
kubectl apply -f "$SCRIPT_DIR/manifests/lading.yaml"
echo " ✓ Lading file generator deployed"

# Poll every 2 seconds, for at most 120 seconds of accumulated sleep, until
# the lading pod is ready and at least one *.log file is present.
echo " Waiting for log files..."
log_wait_limit=120
log_wait_elapsed=0
until ((log_wait_elapsed >= log_wait_limit)); do
  lading_pod=$(kubectl get pods -l app=lading -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)

  lading_ready="false"
  if [[ -n "$lading_pod" ]]; then
    lading_ready=$(kubectl get pod "$lading_pod" -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
  fi

  if [[ "$lading_ready" == "true" ]]; then
    log_file_count=$(kubectl exec "$lading_pod" -- sh -c 'ls /var/log/lading/*.log 2>/dev/null | wc -l' || echo 0)
    if [ "$log_file_count" -gt 0 ]; then
      echo " ✓ Log files created ($log_file_count files)"
      break
    fi
  fi

  sleep 2
  log_wait_elapsed=$((log_wait_elapsed + 2))
done

# The loop only runs to its limit when no log file was ever seen.
if ((log_wait_elapsed >= log_wait_limit)); then
  echo " ✗ Timeout waiting for log files"
  kubectl logs -l app=lading --tail=20
  exit 1
fi

# Now deploy agent
#
# The agent manifest is a template with two placeholders,
# "{{ AGENT_MEMORY_MB }}" and "{{ DD_TAGS }}", which are filled in from the
# command-line values before the result is piped to `kubectl apply`.

# Escapes a value so sed inserts it verbatim as the replacement text of an
# "s|...|...|" command: backslash, "&" (sed's "matched text" token) and the
# "|" delimiter each get a leading backslash.
# Arguments: $1 - raw value
# Outputs:   escaped value on stdout
escape_sed_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

# Prints the template with both placeholders substituted.
# Arguments: $1 - template file, $2 - memory value, $3 - tags value
# Outputs:   rendered manifest on stdout
# Returns:   sed's status (non-zero when the template cannot be read)
render_agent_manifest() {
  local template_file=$1
  local memory_mb tags
  memory_mb=$(escape_sed_replacement "$2")
  tags=$(escape_sed_replacement "$3")
  sed \
    -e "s|{{ AGENT_MEMORY_MB }}|${memory_mb}|g" \
    -e "s|{{ DD_TAGS }}|${tags}|g" \
    "$template_file"
}

AGENT_MANIFEST=$(render_agent_manifest \
  "$SCRIPT_DIR/manifests/datadog-agent.yaml" \
  "$AGENT_MEMORY_MB" \
  "$DD_TAGS_VALUE")

# Safety net: refuse to apply a manifest that still contains a placeholder.
if printf '%s\n' "$AGENT_MANIFEST" | grep -qF "{{ AGENT_MEMORY_MB }}"; then
  echo " ✗ ERROR: Template substitution failed for memory placeholder"
  exit 1
fi
if printf '%s\n' "$AGENT_MANIFEST" | grep -qF "{{ DD_TAGS }}"; then
  echo " ✗ ERROR: Template substitution failed for DD_TAGS"
  exit 1
fi

printf '%s\n' "$AGENT_MANIFEST" | kubectl apply -f -
echo " ✓ Agent deployed (egress blocked)"

# Poll every 2 seconds, for at most 120 seconds of accumulated sleep, until
# the first container of the first agent pod reports ready.
echo " Waiting for agent..."
agent_wait_limit=120
agent_wait_elapsed=0
until ((agent_wait_elapsed >= agent_wait_limit)); do
  agent_pod=$(kubectl get pods -l app.kubernetes.io/name=datadog-agent-deployment -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)

  agent_ready="false"
  if [[ -n "$agent_pod" ]]; then
    agent_ready=$(kubectl get pod "$agent_pod" -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
  fi

  if [[ "$agent_ready" == "true" ]]; then
    echo " ✓ Agent ready"
    break
  fi

  sleep 2
  agent_wait_elapsed=$((agent_wait_elapsed + 2))
done

# The loop only runs to its limit when the agent never became ready.
if ((agent_wait_elapsed >= agent_wait_limit)); then
  echo " ✗ Timeout waiting for agent"
  kubectl describe pods -l app.kubernetes.io/name=datadog-agent-deployment
  exit 1
fi

# Abort if any pod in the current namespace is in phase Failed or Unknown.
unhealthy_pods=$(kubectl get pods -o json | jq -r '.items[] | select(.status.phase == "Failed" or .status.phase == "Unknown") | .metadata.name')
if [[ -n "$unhealthy_pods" ]]; then
  echo " ✗ Found failed pods:"
  kubectl get pods
  exit 1
fi
echo " ✓ All systems healthy"
echo
echo

# Monitor for restarts
#
# Polls the agent pods every 5 seconds until DURATION seconds of sleep have
# accumulated. The first observed container restart ends the run with
# RESULT: FAILURE and exit status 1; a progress line is printed roughly
# every 30 seconds.
echo "[6/6] Monitoring for restarts (${DURATION}s)..."
echo " Started at: $(date)"
ELAPSED=0
LAST_REPORT=0

while [ "$ELAPSED" -lt "$DURATION" ]; do
  # The assignment sits inside the `if` condition on purpose: under `set -e`
  # a bare failing `VAR=$(cmd)` terminates the script before `$?` can be
  # inspected, so a transient kubectl error would kill the whole run.
  if ! RESTART_DATA=$(kubectl get pods -l app.kubernetes.io/name=datadog-agent-deployment -o json 2>/dev/null); then
    sleep 5
    ELAPSED=$((ELAPSED + 5))
    continue
  fi

  # Sum of restartCount over every container of every matching pod.
  RESTART_COUNT=$(printf '%s\n' "$RESTART_DATA" | jq '[.items[].status.containerStatuses[]?.restartCount // 0] | add' 2>/dev/null || echo 0)
  # Empty output, "null" (no containers yet) or anything else that is not a
  # plain number counts as zero restarts.
  if ! [[ "$RESTART_COUNT" =~ ^[0-9]+$ ]]; then
    RESTART_COUNT=0
  fi

  if [ $((ELAPSED - LAST_REPORT)) -ge 30 ]; then
    REMAINING=$((DURATION - ELAPSED))
    echo " ${ELAPSED}s elapsed, ${REMAINING}s remaining (restarts: ${RESTART_COUNT})"
    LAST_REPORT=$ELAPSED
  fi

  if [ "$RESTART_COUNT" -gt 0 ]; then
    # Report the first restarted container and why it last terminated.
    CONTAINER_NAME=$(printf '%s\n' "$RESTART_DATA" | jq -r '.items[].status.containerStatuses[]? | select(.restartCount > 0) | .name' 2>/dev/null | head -1)
    REASON=$(printf '%s\n' "$RESTART_DATA" | jq -r '.items[].status.containerStatuses[]? | select(.restartCount > 0) | .lastState.terminated.reason // "Unknown"' 2>/dev/null | head -1)

    echo
    echo "========================================"
    echo "RESULT: FAILURE"
    echo "========================================"
    echo "Container restarted: ${CONTAINER_NAME}"
    echo "Restart count: ${RESTART_COUNT}"
    echo "Reason: ${REASON}"
    echo "Time to failure: ${ELAPSED}s"
    echo

    if [ "$REASON" = "OOMKilled" ]; then
      echo "💡 Container needs MORE memory"
    else
      echo "⚠️ Non-OOM restart:"
      # Log retrieval is informational only; a failure here must not cut the
      # report short.
      kubectl logs -l app.kubernetes.io/name=datadog-agent-deployment -c "${CONTAINER_NAME}" --previous --tail=20 || true
    fi
    echo "========================================"
    exit 1
  fi

  sleep 5
  ELAPSED=$((ELAPSED + 5))
done

# Reached only when the monitoring loop finished without seeing a restart.
summary_rule="========================================"
printf '%s\n' \
  " Completed at: $(date)" \
  "" \
  "$summary_rule" \
  "RESULT: SUCCESS" \
  "$summary_rule" \
  "No restarts detected" \
  "Test duration: ${DURATION} seconds" \
  "Tags: ${DD_TAGS_VALUE}" \
  ""

# The cluster is left running; print pointers for inspecting it.
printf '%s\n' \
  "💡 Agent stable - cluster is still running for examination" \
  " View lading metrics: kubectl port-forward svc/lading 9000:9000" \
  " View agent logs: kubectl logs -l app.kubernetes.io/name=datadog-agent-deployment -f"
Loading
Loading