diff --git a/packages/web/docs/src/components/otel-metrics/label-card.tsx b/packages/web/docs/src/components/otel-metrics/label-card.tsx new file mode 100644 index 00000000000..182630dc29c --- /dev/null +++ b/packages/web/docs/src/components/otel-metrics/label-card.tsx @@ -0,0 +1,58 @@ +import { Info, Lightbulb, Tag } from 'lucide-react'; + +interface LabelCardProps { + name: string; + meaning: string; + typicalValues: string[]; + notes?: string; +} + +export function LabelCard({ name, meaning, typicalValues, notes }: LabelCardProps) { + return ( +
+
+
+ +
+
+ + {name} + +

+ {meaning} +

+
+
+ +
+
+
+ + + Typical Values + +
+
+ {typicalValues.map(value => ( + + {value} + + ))} +
+
+ + {notes && ( +
+
+ +

{notes}

+
+
+ )} +
+
+ ); +} diff --git a/packages/web/docs/src/components/otel-metrics/metric-card.tsx b/packages/web/docs/src/components/otel-metrics/metric-card.tsx new file mode 100644 index 00000000000..5f7d1df83d0 --- /dev/null +++ b/packages/web/docs/src/components/otel-metrics/metric-card.tsx @@ -0,0 +1,163 @@ +import { useEffect, useRef, useState } from 'react'; +import { Activity, BarChart3, Gauge, TrendingUp } from 'lucide-react'; + +interface MetricCardProps { + name: string; + type: 'Counter' | 'Histogram' | 'UpDownCounter' | 'Gauge'; + unit?: string; + description?: string; + labels?: string[]; +} + +const typeConfig = { + Counter: { + icon: TrendingUp, + color: + 'bg-emerald-50 text-emerald-700 border-emerald-200 dark:bg-emerald-900/30 dark:text-emerald-300 dark:border-emerald-700/50', + badge: 'bg-emerald-100 text-emerald-800', + }, + Histogram: { + icon: BarChart3, + color: + 'bg-blue-50 text-blue-700 border-blue-200 dark:bg-blue-900/30 dark:text-blue-300 dark:border-blue-700/50', + badge: 'bg-blue-100 text-blue-800', + }, + UpDownCounter: { + icon: Activity, + color: + 'bg-amber-50 text-amber-700 border-amber-200 dark:bg-amber-900/30 dark:text-amber-300 dark:border-amber-700/50', + badge: 'bg-amber-100 text-amber-800', + }, + Gauge: { + icon: Gauge, + color: + 'bg-slate-50 text-slate-700 border-slate-200 dark:bg-slate-800/60 dark:text-slate-100 dark:border-slate-700', + badge: 'bg-slate-100 text-slate-800', + }, +}; + +export function MetricCard({ name, type, unit, description, labels }: MetricCardProps) { + const config = typeConfig[type]; + const Icon = config.icon; + const [isCopied, setIsCopied] = useState(false); + const copiedTimeoutRef = useRef | null>(null); + const metricId = `metric-${name + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/(^-|-$)/g, '')}`; + + useEffect(() => { + return () => { + if (copiedTimeoutRef.current) { + clearTimeout(copiedTimeoutRef.current); + } + }; + }, []); + + function showCopiedState() { + setIsCopied(true); + + if 
(copiedTimeoutRef.current) { + clearTimeout(copiedTimeoutRef.current); + } + + copiedTimeoutRef.current = setTimeout(() => { + setIsCopied(false); + }, 1200); + } + + async function copyMetricLink() { + if (typeof window === 'undefined') { + return; + } + + const metricUrl = `${window.location.origin}${window.location.pathname}${window.location.search}#${metricId}`; + + try { + await navigator.clipboard.writeText(metricUrl); + showCopiedState(); + } catch { + window.location.hash = metricId; + } + } + + return ( +
+
+
+
+
+ + {name} + + + + {isCopied ? `Copied link to ${name}` : ''} + +
+
+
+ {unit && ( +
+ Unit: + {unit} +
+ )} +
+ + {type} +
+
+
+ + {description && ( +

+ {description} +

+ )} + + {labels && labels.length > 0 && ( +
+
+ + Labels + +
+
+ {labels.map(label => ( + + {label} + + ))} +
+
+ )} +
+
+ ); +} diff --git a/packages/web/docs/src/components/otel-metrics/metrics-section.tsx b/packages/web/docs/src/components/otel-metrics/metrics-section.tsx new file mode 100644 index 00000000000..72c4dd68b45 --- /dev/null +++ b/packages/web/docs/src/components/otel-metrics/metrics-section.tsx @@ -0,0 +1,80 @@ +'use client'; + +import { useId, useState } from 'react'; +import { ChevronDown } from 'lucide-react'; +import { LabelCard } from './label-card'; +import { MetricCard } from './metric-card'; + +interface Metric { + name: string; + type: 'Counter' | 'Histogram' | 'UpDownCounter' | 'Gauge'; + unit?: string; + description?: string; + labels?: string[]; +} + +interface Label { + name: string; + meaning: string; + typicalValues: string[]; + notes?: string; +} + +interface MetricsSectionProps { + title?: string; + description?: string; + metrics?: Metric[]; + labels?: Label[]; +} +export function MetricsSection({ metrics, labels }: MetricsSectionProps) { + const [isLabelsOpen, setIsLabelsOpen] = useState(false); + const labelsRegionId = useId(); + + return ( +
+ {metrics && metrics.length > 0 && ( +
+

+ Metrics +

+
+ {metrics.map(metric => ( + + ))} +
+
+ )} + + {labels && labels.length > 0 && ( +
+ +
+
+
+ {labels.map(label => ( +
+ +
+ ))} +
+
+
+
+ )} +
+ ); +} diff --git a/packages/web/docs/src/content/router/configuration/telemetry.mdx b/packages/web/docs/src/content/router/configuration/telemetry.mdx index b5f979f1087..9b42ca64555 100644 --- a/packages/web/docs/src/content/router/configuration/telemetry.mdx +++ b/packages/web/docs/src/content/router/configuration/telemetry.mdx @@ -5,7 +5,7 @@ title: 'telemetry' # telemetry The `telemetry` configuration controls client identification, Hive reporting, and OpenTelemetry -tracing behavior in Hive Router. +tracing and metrics behavior in Hive Router. ## client_identification @@ -232,6 +232,265 @@ telemetry: x-api-key: key ``` + + + + + + +## metrics + +Top-level OpenTelemetry metrics configuration. + +
+ Show metrics configuration + +Metrics are enabled when at least one exporter is configured and enabled. + +| Field | Type | Default | Notes | +| ----------------- | -------- | ------- | ------------------------------------------------------------------------------------- | +| `exporters` | `array` | `[]` | List of exporters used to send metrics. | +| `instrumentation` | `object` | `{}` | Instrument behavior for metrics (histogram aggregation and per-instrument overrides). | + +
+
+ `exporters` + +Each item configures one metrics exporter. + +Each item in this array defines one exporter instance, so you can configure multiple metrics +destinations if needed. + +This reference documents OTLP and Prometheus exporter configuration. + +| Field | Type | Default | Notes | +| --------- | --------- | ------- | ------------------------------------------------------ | +| `kind` | `string` | - | Exporter kind. Supported values: `otlp`, `prometheus`. | +| `enabled` | `boolean` | `true` | Enables or disables this exporter. | + +
+ `otlp` + +| Field | Type | Default | Notes | +| ------------------------------------------------------------------------------------------------------------------- | -------------------- | ------------ | ----------------------------------------------------------------- | +| `kind` | `string` | - | Must be `otlp`. | +| `enabled` | `boolean` | `true` | Enables or disables this exporter. | +| `endpoint` | `StringOrExpression` | - | OTLP endpoint. Must be set explicitly. | +| `protocol` | `string` | - | OTLP transport protocol. Supported values: `http`, `grpc`. | +| `interval` | `string` | `60s` | Interval between OTLP export attempts. | +| [`temporality`](https://opentelemetry.io/docs/specs/otel/metrics/supplementary-guidelines/#aggregation-temporality) | `string` | `cumulative` | Aggregation temporality. Supported values: `cumulative`, `delta`. | +| `max_export_timeout` | `string` | `5s` | Maximum time for one OTLP export attempt. | +| `http` | `object` | - | HTTP-specific OTLP settings (for `protocol: http`). | +| `grpc` | `object` | - | gRPC-specific OTLP settings (for `protocol: grpc`). | + +OTLP over HTTP: + +| Field | Type | Value / Default | Notes | +| -------------- | -------- | --------------- | ------------------------------------------------------------- | +| `protocol` | `string` | `http` | OTLP transport protocol. | +| `http.headers` | `object` | `{}` | Map of header names to values (`string` or `{ expression }`). 
| + +```yaml filename="router.config.yaml" +telemetry: + metrics: + exporters: + - kind: otlp + enabled: true + protocol: http + endpoint: https://otel-collector.example.com/v1/metrics + interval: 60s + temporality: cumulative + max_export_timeout: 5s + http: + headers: + x-otlp-header: value +``` + +OTLP over gRPC: + +| Field | Type | Value / Default | Notes | +| ---------------------- | -------- | --------------- | ---------------------------------------------------------------------------- | +| `protocol` | `string` | `grpc` | OTLP transport protocol. | +| `grpc.metadata` | `object` | `{}` | Map of metadata keys to values (`string` or `{ expression }`). | +| `grpc.tls.domain_name` | `string` | - | Domain name used to verify the server certificate. | +| `grpc.tls.key` | `string` | - | Path to the client private key file. | +| `grpc.tls.cert` | `string` | - | Path to the client certificate file (PEM). | +| `grpc.tls.ca` | `string` | - | Path to the CA certificate file (PEM) used to verify the server certificate. | + +```yaml filename="router.config.yaml" +telemetry: + metrics: + exporters: + - kind: otlp + enabled: true + protocol: grpc + endpoint: https://otel-collector.example.com:4317 + interval: 60s + temporality: cumulative + max_export_timeout: 5s + grpc: + metadata: + x-api-key: key +``` + +
+ +
+ `prometheus` + +| Field | Type | Default | Notes | +| --------- | --------- | ---------- | ------------------------------------------- | +| `kind` | `string` | - | Must be `prometheus`. | +| `enabled` | `boolean` | `true` | Enables/disables Prometheus metrics export. | +| `port` | `integer` | - | Optional port for metrics endpoint. | +| `path` | `string` | `/metrics` | HTTP path exposed for scraping. | + +```yaml filename="router.config.yaml" +telemetry: + metrics: + exporters: + - kind: prometheus + enabled: true + port: 9090 + path: /metrics +``` + +
+ +
+
+ +
+
+ `instrumentation` + +Controls histogram aggregation and per-instrument overrides. + +| Field | Type | Default | Notes | +| ------------------ | -------- | ----------------------- | ----------------------------------------------------------- | +| `common.histogram` | `object` | exponential aggregation | Histogram aggregation strategy for instrumented histograms. | +| `instruments` | `object` | `{}` | Map of metric name to `false`, `true`, or object override. | + +
+ `common.histogram` + +Set aggregation mode with `aggregation`. + +`explicit` aggregation (default): + +| Field | Type | Default | Notes | +| ------------- | -------- | ------- | ----------------------------------------------------- | +| `aggregation` | `string` | - | Must be `explicit`. | +| `seconds` | `object` | - | Explicit histogram config for metrics with unit `s`. | +| `bytes` | `object` | - | Explicit histogram config for metrics with unit `By`. | + +`seconds` and `bytes` fields: + +| Field | Type | Default | Notes | +| ---------------- | ---------------------- | ------- | --------------------------------------------------------------- | +| `buckets` | `number[] \| string[]` | varies | Explicit bucket upper bounds. Must be non-empty and increasing. | +| `record_min_max` | `boolean` | `false` | Record min/max values for this unit bucket set. | + +Default explicit buckets: + +- `seconds.buckets`: `[0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5, 10]` +- `bytes.buckets`: + `[128, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 3145728, 4194304, 5242880]` + +Bucket format rules: + +- `buckets` can be either all numbers or all strings. +- mixed arrays are not allowed. +- for `seconds.buckets`, string values are parsed as durations (for example `"5ms"`, `"1s"`). +- for `bytes.buckets`, string values are parsed as human-readable sizes (for example `"1KB"`, + `"5MB"`). + +[`exponential`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#exponentialhistogram) +aggregation: + +| Field | Type | Default | Notes | +| ------------------------------------------------------------------------------------------------------------------- | --------- | ------- | ----------------------------------------- | +| `aggregation` | `string` | - | Must be `exponential`. 
| +| [`max_size`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#base2-exponential-bucket-histogram-aggregation) | `integer` | - | Max bucket count. Required. | +| [`max_scale`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#base2-exponential-bucket-histogram-aggregation) | `integer` | - | Max scale for bucket precision. Required. | +| `record_min_max` | `boolean` | `false` | Record min/max values. | + +
+ +
+ `instruments` + +`instruments` is a map keyed by metric name. Value can be: + +- `false` to disable a metric +- `true` to keep defaults +- object to keep metric enabled and override attributes + +Object form supports: + +| Field | Type | Notes | +| ------------ | -------- | ------------------------------------------------------------------------------ | +| `attributes` | `object` | Map of attribute name to `boolean` (`false` drops attribute, `true` keeps it). | + +```yaml filename="router.config.yaml" +telemetry: + metrics: + instrumentation: + common: + histogram: + aggregation: explicit + seconds: + buckets: + [ + '5ms', + '10ms', + '25ms', + '50ms', + '75ms', + '100ms', + '250ms', + '500ms', + '750ms', + '1s', + '2.5s', + '5s', + '7.5s', + '10s' + ] + record_min_max: false + bytes: + buckets: + [ + '128B', + '512B', + '1KB', + '2KB', + '4KB', + '8KB', + '16KB', + '32KB', + '64KB', + '128KB', + '256KB', + '512KB', + '1MB', + '2MB', + '3MB', + '4MB', + '5MB' + ] + record_min_max: false + instruments: + http.server.request.duration: true + http.client.request.duration: + attributes: + subgraph.name: true + http.response.status_code: true + server.address: false +``` + +
+
diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx new file mode 100644 index 00000000000..27877394cb5 --- /dev/null +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -0,0 +1,825 @@ +--- +title: 'OpenTelemetry Metrics' +--- + +import { Callout } from '#components/callout' +import { MetricsSection } from '#components/otel-metrics/metrics-section' +import { Tabs } from '@theguild/components' + +# OpenTelemetry Metrics + +Hive Router exposes OpenTelemetry metrics for gateway traffic, subgraph traffic, cache behavior, +supergraph lifecycle, and GraphQL errors. + +This guide explains where to export metrics, how to configure OTLP and Prometheus, how to customize +instruments, and what each metric/label means in practice. + +## Choose your metrics destination + +Hive Router exposes metrics through two widely used integration patterns: + +- OTLP-based observability backends +- Prometheus scrape endpoints + +Most teams already running an OpenTelemetry pipeline tend to integrate via OTLP, while teams built +around Prometheus and Grafana typically stick with Prometheus scraping. + +### Send metrics to OTLP-compatible backends + +Hive Router can export metrics using OTLP to standard OpenTelemetry pipelines, including the +OpenTelemetry Collector and vendor backends that support OTLP ingestion over HTTP or gRPC. + +After enabling the exporter, generate some traffic through the router and confirm that new metric +series appear in your backend (for example HTTP server/client latency, cache metrics, and supergraph +execution metrics). 
+ +If metrics do not appear, verify: + +- Endpoint reachability (network, DNS, TLS) +- Authentication credentials or headers +- Exporter protocol matches the backend (OTLP/HTTP vs OTLP/gRPC) + + + + + +```yaml filename="router.config.yaml" +telemetry: + metrics: + exporters: + - kind: otlp + enabled: true + protocol: http + endpoint: https://otel-collector.example.com/v1/metrics + interval: 30s + max_export_timeout: 5s + http: + headers: + authorization: + expression: | + "Bearer " + env("OTLP_TOKEN") +``` + + + + + +```yaml filename="router.config.yaml" +telemetry: + metrics: + exporters: + - kind: otlp + enabled: true + protocol: grpc + endpoint: https://otel-collector.example.com:4317 + interval: 30s + max_export_timeout: 5s + grpc: + metadata: + x-api-key: + expression: env("OTEL_API_KEY") + tls: + domain_name: otel-collector.example.com + ca: /etc/certs/ca.pem + cert: /etc/certs/client.pem + key: /etc/certs/client.key +``` + + + + + +### Expose metrics for Prometheus scraping + +If your observability stack is Prometheus-first, Hive Router can expose an HTTP endpoint that +Prometheus scrapes at its configured interval. + +The `port` and `path` settings define the address where the Router exposes metrics. Prometheus must +be able to reach that address from its runtime environment (local network, Kubernetes service, or VM +network path). + + + If `port` is not set, or is the same as the main HTTP server port, the Router exposes metrics + through the same HTTP server that serves the GraphQL API. If the port is different, the Router + starts a separate HTTP server dedicated solely to the Prometheus metrics endpoint. + + +In production, make sure this endpoint is reachable only by trusted scrapers (for example via +network policy, firewall rules, or private ingress). Once configured, confirm the target appears as +healthy in Prometheus and then verify expected series are present (for example +`http.server.request.duration`, `http.client.request.duration`). 
+ +```yaml filename="router.config.yaml" +telemetry: + metrics: + exporters: + - kind: prometheus + enabled: true + port: 9090 + path: /metrics +``` + +## Production baseline + +For production workloads, start with a single primary exporter, define a clear service identity, and +keep default instrumentation settings. + +```yaml filename="router.config.yaml" +telemetry: + resource: + attributes: + service.name: hive-router + service.namespace: your-platform + deployment.environment: + expression: env("ENVIRONMENT") + metrics: + exporters: + - kind: otlp + enabled: true + protocol: grpc + endpoint: https://otel-collector.example.com:4317 + interval: 30s + max_export_timeout: 5s +``` + +This is a safe baseline and works well before introducing instrumentation-level customization. +Additional exporters can be added later, but starting with one simplifies validation and +troubleshooting. + +### Cardinality considerations + +For production workloads, consider disabling `graphql.operation.name` label or even +`graphql.operation.type` on high-volume metrics. + + + `graphql.operation.name` can create very high-cardinality metrics. + +Operation names come from client requests. Without persisted operations, clients can send many +distinct operation names (or random names), which can rapidly increase cardinality and cost in +Prometheus and OTLP backends. + + + +```yaml filename="router.config.yaml" +telemetry: + metrics: + instrumentation: + instruments: + http.server.request.duration: + attributes: + graphql.operation.name: false + http.server.request.body.size: + attributes: + graphql.operation.name: false + http.server.response.body.size: + attributes: + graphql.operation.name: false +``` + +## Customize instrumentation + +You can override behavior per metric under `telemetry.metrics.instrumentation.instruments`. 
+ + Disable non-essential labels to control cost, since each additional label value increases active + time-series cardinality - a primary billing and performance driver in platforms like Grafana and + Datadog. + + +For production guidance on label cardinality (especially `graphql.operation.name`), see +[Cardinality considerations](#cardinality-considerations). + +- `false` disables a metric. +- `true` keeps default behavior. +- object form enables metric + optional attribute overrides. + +```yaml filename="router.config.yaml" +telemetry: + metrics: + instrumentation: + instruments: + # Disable HTTP server request duration metric + http.server.request.duration: false + http.client.request.duration: + attributes: + # Disable the label + subgraph.name: false + # Enable the label (labels are enabled by default) + http.response.status_code: true +``` + +Attribute override behavior: + +- `false` - drop label from that metric +- `true` - keep label (all labels are enabled by default) + +Histogram aggregation can also be customized under +`telemetry.metrics.instrumentation.common.histogram`. + +- [`explicit`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#histogram) (default) + uses unit-specific bucket sets. Lets you configure unit-specific buckets: + - `seconds` for histogram unit `s` + - `bytes` for histogram unit `By` +- [`exponential`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#exponentialhistogram) + uses one shared exponential strategy for all histogram metrics. +- `record_min_max` controls whether min and max are reported for histogram points. + +Bucket format rules: + +- [`buckets`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#explicit-bucket-histogram-aggregation) + can be either all numbers or all strings. +- mixed arrays are not allowed. +- `seconds.buckets` string values are parsed as durations (for example `"5ms"`, `"1s"`). 
+- `bytes.buckets` string values are parsed as human-readable sizes (for example `"1KB"`, `"5MB"`). + +In `explicit` mode, histogram units other than `s` and `By` fail startup. + +```yaml filename="router.config.yaml" +telemetry: + metrics: + instrumentation: + common: + histogram: + aggregation: explicit + seconds: + buckets: + [ + '5ms', + '10ms', + '25ms', + '50ms', + '75ms', + '100ms', + '250ms', + '500ms', + '750ms', + '1s', + '2.5s', + '5s', + '7.5s', + '10s' + ] + record_min_max: false + bytes: + buckets: + [ + '128B', + '512B', + '1KB', + '2KB', + '4KB', + '8KB', + '16KB', + '32KB', + '64KB', + '128KB', + '256KB', + '512KB', + '1MB', + '2MB', + '3MB', + '4MB', + '5MB' + ] + record_min_max: false +``` + +## Metrics reference + +### GraphQL + +GraphQL metrics capture errors surfaced by the router across all stages of a GraphQL request +lifecycle. + + + +### Supergraph + +Supergraph metrics cover polling and processing lifecycle of schema updates. + + + +### HTTP server + +HTTP server metrics capture inbound client traffic processed by the router. + += 400'], + notes: 'Only set for failed requests' + }, + { + name: 'graphql.operation.name', + meaning: 'GraphQL operation name associated with the HTTP request', + typicalValues: ['UsersQuery', 'IntrospectionQuery', 'UNKNOWN'], + notes: + 'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. High-cardinality risk: value is client-controlled and can explode without persisted operations.' + }, + { + name: 'graphql.operation.type', + meaning: 'GraphQL operation type', + typicalValues: ['query', 'mutation', 'subscription'], + notes: + 'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. 
Omitted when unknown' + }, + { + name: 'graphql.response.status', + meaning: 'GraphQL response status for the request', + typicalValues: ['ok', 'error'], + notes: + 'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. "error" indicates the GraphQL response contains at least one error' + } + ]} +/> + +### HTTP client + +HTTP client metrics capture outbound requests the router makes to subgraphs. + += 400 or execution error code string' + }, + { + name: 'graphql.response.status', + meaning: 'GraphQL response status for the subgraph request', + typicalValues: ['ok', 'error'], + notes: + 'Set to "ok" when the parsed subgraph response has no GraphQL errors. Set to "error" when the subgraph response includes GraphQL errors or when transport/deserialization fails.' + } + ]} +/> + +### Cache + +Cache metrics track lookup behavior and cache size across router caches used during request +preparation and planning stages. + +#### Parsing cache + +Parsing cache metrics measure query parse cache hit/miss behavior and cache size. + + + +#### Validation cache + +Validation cache metrics measure query validation cache hit/miss behavior and cache size. + + + +#### Normalization cache + +Normalization cache metrics measure query normalization cache hit/miss behavior and cache size. + + + +#### Planning cache + +Planning cache metrics measure query planning cache hit/miss behavior and cache size. + + + +#### Labels + +These labels are shared by cache lookup counters and duration histograms. + +
+ +
+ +## What to monitor in production + +The examples below show which signals to monitor in production and how to break them down so you can +quickly isolate API, subgraph, cache, and GraphQL issues. + +### Monitor end-to-end latency of your GraphQL API + +Use [`http.server.request.duration`](#metric-http-server-request-duration) as your primary latency +signal. + +In production, break this metric down by `http.route`, `http.request.method`, +`http.response.status_code`, and/or `graphql.response.status`, then track p95 and p99 latency per +route and method. Keep successful and failed responses separated so error-path latency does not get +hidden by healthy traffic. + +### Monitor health of your subgraphs + +Use [`http.client.request.duration`](#metric-http-client-request-duration) and +[`http.client.active_requests`](#metric-http-client-active-requests) to monitor dependency health +across your federated graph. + +Break these metrics down by `subgraph.name`, `http.response.status_code`, and `error.type` to +identify which subgraph is driving tail latency or error spikes. + +### Monitor cache effectiveness and planning pressure + +Use the cache metrics to evaluate cache hit ratio, miss cost, and pressure over time. + +For request and duration metrics, split by `result` (`hit` and `miss`) so you can track hit ratio +and miss latency per cache kind. + +### Monitor GraphQL errors over time + +Use [`hive.router.graphql.errors_total`](#metric-hive-router-graphql-errors-total) and break it down +by `code` to track both volume and error distribution. + +In production, monitor how error-code distribution changes over time, not only total count, so you +can separate validation issues from execution failures. + +## Configuration reference + +For full options and defaults, see +[telemetry configuration reference](/docs/router/configuration/telemetry).