From 21c821ceb50dfa5d675c6513092bdf9f2320ac9e Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Fri, 13 Feb 2026 11:20:44 +0100 Subject: [PATCH 01/14] metrics: wip --- .../components/otel-metrics/label-card.tsx | 60 ++ .../components/otel-metrics/metric-card.tsx | 95 +++ .../otel-metrics/metrics-section.tsx | 55 ++ .../content/router/observability/metrics.mdx | 637 ++++++++++++++++++ 4 files changed, 847 insertions(+) create mode 100644 packages/web/docs/src/components/otel-metrics/label-card.tsx create mode 100644 packages/web/docs/src/components/otel-metrics/metric-card.tsx create mode 100644 packages/web/docs/src/components/otel-metrics/metrics-section.tsx create mode 100644 packages/web/docs/src/content/router/observability/metrics.mdx diff --git a/packages/web/docs/src/components/otel-metrics/label-card.tsx b/packages/web/docs/src/components/otel-metrics/label-card.tsx new file mode 100644 index 00000000000..357a0ed020d --- /dev/null +++ b/packages/web/docs/src/components/otel-metrics/label-card.tsx @@ -0,0 +1,60 @@ +import { Info, Lightbulb, Tag } from 'lucide-react'; + +interface LabelCardProps { + name: string; + meaning: string; + typicalValues: string[]; + notes?: string; +} + +export function LabelCard({ name, meaning, typicalValues, notes }: LabelCardProps) { + return ( +
+
+
+
+ +
+
+ + {name} + +

+ {meaning} +

+
+
+ +
+
+
+ + + Typical Values + +
+
+ {typicalValues.map(value => ( + + {value} + + ))} +
+
+ + {notes && ( +
+
+ +

{notes}

+
+
+ )} +
+
+
+ ); +} diff --git a/packages/web/docs/src/components/otel-metrics/metric-card.tsx b/packages/web/docs/src/components/otel-metrics/metric-card.tsx new file mode 100644 index 00000000000..b31d81488f6 --- /dev/null +++ b/packages/web/docs/src/components/otel-metrics/metric-card.tsx @@ -0,0 +1,95 @@ +import { Activity, BarChart3, Gauge, TrendingUp } from 'lucide-react'; + +interface MetricCardProps { + name: string; + type: 'Counter' | 'Histogram' | 'UpDownCounter' | 'Gauge'; + unit?: string; + description?: string; + labels?: string[]; +} + +const typeConfig = { + Counter: { + icon: TrendingUp, + color: + 'bg-emerald-50 text-emerald-700 border-emerald-200 dark:bg-emerald-900/30 dark:text-emerald-300 dark:border-emerald-700/50', + badge: 'bg-emerald-100 text-emerald-800', + }, + Histogram: { + icon: BarChart3, + color: + 'bg-blue-50 text-blue-700 border-blue-200 dark:bg-blue-900/30 dark:text-blue-300 dark:border-blue-700/50', + badge: 'bg-blue-100 text-blue-800', + }, + UpDownCounter: { + icon: Activity, + color: + 'bg-amber-50 text-amber-700 border-amber-200 dark:bg-amber-900/30 dark:text-amber-300 dark:border-amber-700/50', + badge: 'bg-amber-100 text-amber-800', + }, + Gauge: { + icon: Gauge, + color: + 'bg-slate-50 text-slate-700 border-slate-200 dark:bg-slate-800/60 dark:text-slate-100 dark:border-slate-700', + badge: 'bg-slate-100 text-slate-800', + }, +}; + +export function MetricCard({ name, type, unit, description, labels }: MetricCardProps) { + const config = typeConfig[type]; + const Icon = config.icon; + + return ( +
+
+
+
+ + {name} + +
+
+ {unit && ( +
+ Unit: + {unit} +
+ )} +
+ + {type} +
+
+
+ + {description && ( +

+ {description} +

+ )} + + {labels && labels.length > 0 && ( +
+
+ + Labels + +
+
+ {labels.map(label => ( + + {label} + + ))} +
+
+ )} +
+
+ ); +} diff --git a/packages/web/docs/src/components/otel-metrics/metrics-section.tsx b/packages/web/docs/src/components/otel-metrics/metrics-section.tsx new file mode 100644 index 00000000000..f0866350cc9 --- /dev/null +++ b/packages/web/docs/src/components/otel-metrics/metrics-section.tsx @@ -0,0 +1,55 @@ +import { LabelCard } from './label-card'; +import { MetricCard } from './metric-card'; + +interface Metric { + name: string; + type: 'Counter' | 'Histogram' | 'UpDownCounter' | 'Gauge'; + unit?: string; + description?: string; + labels?: string[]; +} + +interface Label { + name: string; + meaning: string; + typicalValues: string[]; + notes?: string; +} + +interface MetricsSectionProps { + title?: string; + description?: string; + metrics?: Metric[]; + labels?: Label[]; +} +export function MetricsSection({ metrics, labels }: MetricsSectionProps) { + return ( +
+ {metrics && metrics.length > 0 && ( +
+

+ Metrics +

+
+ {metrics.map(metric => ( + + ))} +
+
+ )} + + {labels && labels.length > 0 && ( +
+

+ Labels Reference +

+
+ {labels.map(label => ( + + ))} +
+
+ )} +
+ ); +} diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx new file mode 100644 index 00000000000..0533d1c3e8b --- /dev/null +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -0,0 +1,637 @@ +--- +title: 'OpenTelemetry Metrics' +--- + +import { Callout } from '#components/callout' +import { MetricsSection } from '#components/otel-metrics/metrics-section' +import { Tabs } from '@theguild/components' + +# OpenTelemetry Metrics + +Hive Router exposes OpenTelemetry metrics for gateway traffic, subgraph traffic, cache behavior, +supergraph lifecycle, and GraphQL errors. + +This guide explains where to export metrics, how to configure OTLP and Prometheus, how to customize +instruments, and what each metric/label means in practice. + +## Choose your metrics destination + +Hive Router supports two common metrics paths: + +- OTLP-compatible backends +- Prometheus scraping + +In practice, teams with existing OpenTelemetry pipelines usually choose OTLP. Teams with existing +Prometheus/Grafana stacks usually choose Prometheus. + +### Send metrics to OTLP-compatible backends + +Hive Router can export metrics directly to any OTLP-compatible destination, including OpenTelemetry +Collector and vendor backends that support OTLP ingestion, either through HTTP or gRPC. + +After enabling the exporter, send traffic through the router and verify that new metric series +appear in your backend (for example HTTP server/client duration, cache metrics, and supergraph +metrics). If metrics are missing, start by validating endpoint reachability and auth credentials, +then check exporter protocol alignment (HTTP vs gRPC). 
+ + + + + +```yaml filename="router.config.yaml" +telemetry: + metrics: + exporters: + - kind: otlp + enabled: true + protocol: http + endpoint: https://otel-collector.example.com/v1/metrics + interval: 30s + max_export_timeout: 5s + http: + headers: + authorization: + expression: | + "Bearer " + env("OTLP_TOKEN") +``` + + + + + +```yaml filename="router.config.yaml" +telemetry: + metrics: + exporters: + - kind: otlp + enabled: true + protocol: grpc + endpoint: https://otel-collector.example.com:4317 + interval: 30s + max_export_timeout: 5s + grpc: + metadata: + x-api-key: + expression: env("OTEL_API_KEY") + tls: + domain_name: otel-collector.example.com + ca: /etc/certs/ca.pem + cert: /etc/certs/client.pem + key: /etc/certs/client.key +``` + + + + + +### Expose metrics for Prometheus scraping + +If your observability stack is Prometheus-first, Hive Router can expose a scrape endpoint that +Prometheus polls on a schedule. + +The `port` and `path` settings define where Router serves metrics. Prometheus must be able to reach +that address from its runtime environment (local network, Kubernetes service, or VM network path). + + + If `port` is not set, or is the same as the main HTTP server port, the Router exposes metrics + through the same HTTP server that serves the GraphQL API. If the port is different, the Router + starts a separate HTTP server dedicated solely to the Prometheus metrics endpoint. + + +In production, make sure this endpoint is reachable only by trusted scrapers and that any ingress or +firewall policy allows Prometheus access. Once configured, confirm the target appears as healthy +(`UP`) in Prometheus and then verify expected series are present (for example +`http.server.request.duration`, `http.client.request.duration`, and `hive.router.*` cache/supergraph +metrics). 
+ +```yaml filename="router.config.yaml" +telemetry: + metrics: + exporters: + - kind: prometheus + enabled: true + port: 9090 + path: /metrics +``` + +## Production baseline + +For production workloads, start with one primary exporter, a clear service identity, and default +instrument settings. + +```yaml filename="router.config.yaml" +telemetry: + resource: + attributes: + service.name: hive-router + service.namespace: your-platform + deployment.environment: + expression: env("ENVIRONMENT") + metrics: + exporters: + - kind: otlp + enabled: true + protocol: grpc + endpoint: https://otel-collector.example.com:4317 + interval: 30s + max_export_timeout: 5s +``` + +This is a safe baseline and works well before introducing instrumentation-level customization. By +default, all metrics and labels are exposed. + +## Instrument customization + +You can override behavior per metric under `telemetry.metrics.instrumentation.instruments`. + +- `false` disables a metric. +- `true` keeps default behavior. +- object form enables metric + optional attribute overrides. + +```yaml filename="router.config.yaml" +telemetry: + metrics: + instrumentation: + instruments: + # Disable HTTP server request duration metric + http.server.request.duration: false + http.client.request.duration: + attributes: + # Disable the label + subgraph.name: false + # Enable the label (labels are enabled by default) + http.response.status_code: true +``` + +Attribute override behavior: + +- `false` - drop label from that metric +- `true` - keep label (all labels are enabled by default) + +## Metrics reference + +### GraphQL + +GraphQL metrics focus on validation, parsing and execution-time errors surfaced by the router. + + + +### Supergraph + +Supergraph metrics cover polling and processing lifecycle of schema updates. + + + +### HTTP server + +HTTP server metrics describe inbound client traffic handled by the router. 
+ += 400'], + notes: 'Only set for failed requests' + } + ]} +/> + +### HTTP client + +HTTP client metrics describe outbound requests to the subgraphs, made by the router. + += 400 or execution error code string' + } + ]} +/> + +### Cache + +Cache metrics track lookup behavior and cache size across router caches. + +#### Parsing cache + +Parsing cache metrics measure query parse cache hit/miss behavior and cache size. + + + +#### Validation cache + +Validation cache metrics measure query validation cache hit/miss behavior and cache size. + + + +#### Normalization cache + +Normalization cache metrics measure query normalization cache hit/miss behavior and cache size. + + + +#### Planning cache + +Planning cache metrics measure query planning cache hit/miss behavior and cache size. + + + +#### Labels + +These labels are shared by cache lookup counters and duration histograms. + + + +## Troubleshooting + +When metrics are missing or incomplete, check in layers: + +- exporter setup +- instrument overrides +- label overrides +- transport reachability + +If no metrics appear at all, verify exporter enablement, endpoint reachability, and credentials. + +If a configured instrument key is unknown, startup fails with a clear error and valid metric names. + +If a configured attribute key is unknown, Router logs a warning and ignores it. 
+ +## Configuration reference + +For full options and defaults, see: + +- [telemetry configuration reference](/docs/router/configuration/telemetry) From 02b1b0be4d115a2ff68285b930f51843facc5ba3 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Fri, 13 Feb 2026 11:52:04 +0100 Subject: [PATCH 02/14] asd --- .../components/otel-metrics/label-card.tsx | 76 +++++++++---------- .../components/otel-metrics/metric-card.tsx | 76 ++++++++++++++++++- .../otel-metrics/metrics-section.tsx | 41 ++++++++-- .../content/router/observability/metrics.mdx | 22 +++--- 4 files changed, 154 insertions(+), 61 deletions(-) diff --git a/packages/web/docs/src/components/otel-metrics/label-card.tsx b/packages/web/docs/src/components/otel-metrics/label-card.tsx index 357a0ed020d..182630dc29c 100644 --- a/packages/web/docs/src/components/otel-metrics/label-card.tsx +++ b/packages/web/docs/src/components/otel-metrics/label-card.tsx @@ -9,51 +9,49 @@ interface LabelCardProps { export function LabelCard({ name, meaning, typicalValues, notes }: LabelCardProps) { return ( -
-
-
-
- +
+
+
+ +
+
+ + {name} + +

+ {meaning} +

+
+
+ +
+
+
+ + + Typical Values +
-
- - {name} - -

- {meaning} -

+
+ {typicalValues.map(value => ( + + {value} + + ))}
-
-
-
- - - Typical Values - -
-
- {typicalValues.map(value => ( - - {value} - - ))} + {notes && ( +
+
+ +

{notes}

- - {notes && ( -
-
- -

{notes}

-
-
- )} -
+ )}
); diff --git a/packages/web/docs/src/components/otel-metrics/metric-card.tsx b/packages/web/docs/src/components/otel-metrics/metric-card.tsx index b31d81488f6..116937dbb65 100644 --- a/packages/web/docs/src/components/otel-metrics/metric-card.tsx +++ b/packages/web/docs/src/components/otel-metrics/metric-card.tsx @@ -1,3 +1,4 @@ +import { useEffect, useRef, useState } from 'react'; import { Activity, BarChart3, Gauge, TrendingUp } from 'lucide-react'; interface MetricCardProps { @@ -38,15 +39,82 @@ const typeConfig = { export function MetricCard({ name, type, unit, description, labels }: MetricCardProps) { const config = typeConfig[type]; const Icon = config.icon; + const [isCopied, setIsCopied] = useState(false); + const copiedTimeoutRef = useRef | null>(null); + const metricId = `metric-${name + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/(^-|-$)/g, '')}`; + + useEffect(() => { + return () => { + if (copiedTimeoutRef.current) { + clearTimeout(copiedTimeoutRef.current); + } + }; + }, []); + + function showCopiedState() { + setIsCopied(true); + + if (copiedTimeoutRef.current) { + clearTimeout(copiedTimeoutRef.current); + } + + copiedTimeoutRef.current = setTimeout(() => { + setIsCopied(false); + }, 1200); + } + + async function copyMetricLink() { + if (typeof window === 'undefined') { + return; + } + + const metricUrl = `${window.location.origin}${window.location.pathname}${window.location.search}#${metricId}`; + + try { + await navigator.clipboard.writeText(metricUrl); + showCopiedState(); + } catch { + window.location.hash = metricId; + } + } return ( -
+
- - {name} - +
+ + {name} + + + + {isCopied ? `Copied link to ${name}` : ''} + +
{unit && ( diff --git a/packages/web/docs/src/components/otel-metrics/metrics-section.tsx b/packages/web/docs/src/components/otel-metrics/metrics-section.tsx index f0866350cc9..72c4dd68b45 100644 --- a/packages/web/docs/src/components/otel-metrics/metrics-section.tsx +++ b/packages/web/docs/src/components/otel-metrics/metrics-section.tsx @@ -1,3 +1,7 @@ +'use client'; + +import { useId, useState } from 'react'; +import { ChevronDown } from 'lucide-react'; import { LabelCard } from './label-card'; import { MetricCard } from './metric-card'; @@ -23,6 +27,9 @@ interface MetricsSectionProps { labels?: Label[]; } export function MetricsSection({ metrics, labels }: MetricsSectionProps) { + const [isLabelsOpen, setIsLabelsOpen] = useState(false); + const labelsRegionId = useId(); + return (
{metrics && metrics.length > 0 && ( @@ -39,14 +46,32 @@ export function MetricsSection({ metrics, labels }: MetricsSectionProps) { )} {labels && labels.length > 0 && ( -
-

- Labels Reference -

-
- {labels.map(label => ( - - ))} +
+ +
+
+
+ {labels.map(label => ( +
+ +
+ ))} +
+
)} diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx index 0533d1c3e8b..b228701444c 100644 --- a/packages/web/docs/src/content/router/observability/metrics.mdx +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -604,16 +604,18 @@ Planning cache metrics measure query planning cache hit/miss behavior and cache These labels are shared by cache lookup counters and duration histograms. - +
+ +
## Troubleshooting From 40e8fe32b9d1d7dcadd6eaf88d832a1c0777de28 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Fri, 13 Feb 2026 11:57:03 +0100 Subject: [PATCH 03/14] sad --- .../content/router/observability/metrics.mdx | 51 ++++++++++++++++--- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx index b228701444c..831b4fb41df 100644 --- a/packages/web/docs/src/content/router/observability/metrics.mdx +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -189,9 +189,14 @@ GraphQL metrics focus on validation, parsing and execution-time errors surfaced { name: 'code', meaning: 'GraphQL error code', - typicalValues: ['GRAPHQL_PARSE_FAILED', 'GRAPHQL_VALIDATION_FAILED', 'PLAN_EXECUTION_FAILED', 'UNKNOWN', '...'], - notes: - `Uses "extensions.code" values and router's error codes. "UNKNOWN" is used when no code is available.` + typicalValues: [ + 'GRAPHQL_PARSE_FAILED', + 'GRAPHQL_VALIDATION_FAILED', + 'PLAN_EXECUTION_FAILED', + 'UNKNOWN', + '...' + ], + notes: `Uses "extensions.code" values and router's error codes. "UNKNOWN" is used when no code is available.` } ]} /> @@ -605,18 +610,50 @@ Planning cache metrics measure query planning cache hit/miss behavior and cache These labels are shared by cache lookup counters and duration histograms.
- + />
+## Production observability playbook + +### Monitor latency of your GraphQL API + +Use `http.server.request.duration` as your primary latency signal. In production, break this metric +down by `http.route`, `http.request.method`, and `http.response.status_code`, then track p95 and p99 +per route and method. Keep successful and failed responses separated so error-path latency does not +get hidden by healthy traffic. + +### Monitor health of your subgraphs + +Use `http.client.request.duration` and `http.client.active_requests` to monitor dependency health +across your federated graph. Break these metrics down by `subgraph.name`, +`http.response.status_code`, and `error.type` to identify which subgraph is driving tail latency or +error spikes. When you need infrastructure-level debugging, add `server.address` and `server.port` +to distinguish endpoint-level issues from service-level behavior. + +### Monitor cache effectiveness and planning pressure + +Use the cache metric families `hive.router.parse_cache.*`, `hive.router.validate_cache.*`, +`hive.router.normalize_cache.*`, and `hive.router.plan_cache.*` to evaluate cache behavior over +time. For request and duration metrics, split by `result` (`hit` and `miss`) so you can track hit +ratio and miss latency per cache tier. Combine those views with cache size trends to catch sustained +cache-efficiency regressions, especially in planning cache paths. + +### Monitor GraphQL errors over time + +Use `hive.router.graphql.errors_total` and break it down by `code` to track both volume and error +shape. In production, monitor how error-code distribution changes over time, not only total count, +so you can separate validation issues from execution failures. Alert on sharp increases in +`GRAPHQL_VALIDATION_FAILED`, `PLAN_EXECUTION_FAILED`, or `UNKNOWN`. 
+ ## Troubleshooting When metrics are missing or incomplete, check in layers: From fbadb6de2a97e1af41a78585b92f7078c11b034e Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Fri, 13 Feb 2026 12:07:57 +0100 Subject: [PATCH 04/14] asd --- .../content/router/observability/metrics.mdx | 64 ++++++++----------- 1 file changed, 27 insertions(+), 37 deletions(-) diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx index 831b4fb41df..bd60da693a5 100644 --- a/packages/web/docs/src/content/router/observability/metrics.mdx +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -140,7 +140,7 @@ telemetry: This is a safe baseline and works well before introducing instrumentation-level customization. By default, all metrics and labels are exposed. -## Instrument customization +## Customize instrumentation You can override behavior per metric under `telemetry.metrics.instrumentation.instruments`. @@ -622,55 +622,45 @@ These labels are shared by cache lookup counters and duration histograms. />
-## Production observability playbook +## What to monitor in production -### Monitor latency of your GraphQL API - -Use `http.server.request.duration` as your primary latency signal. In production, break this metric -down by `http.route`, `http.request.method`, and `http.response.status_code`, then track p95 and p99 -per route and method. Keep successful and failed responses separated so error-path latency does not -get hidden by healthy traffic. +The examples below show what to monitor in production and how to break metrics down so you can +quickly detect and isolate API, subgraph, cache, and GraphQL issues. -### Monitor health of your subgraphs +### Monitor latency of your GraphQL API -Use `http.client.request.duration` and `http.client.active_requests` to monitor dependency health -across your federated graph. Break these metrics down by `subgraph.name`, -`http.response.status_code`, and `error.type` to identify which subgraph is driving tail latency or -error spikes. When you need infrastructure-level debugging, add `server.address` and `server.port` -to distinguish endpoint-level issues from service-level behavior. +Use [`http.server.request.duration`](#metric-http-server-request-duration) as your primary latency +signal. -### Monitor cache effectiveness and planning pressure +In production, break this metric down by `http.route`, `http.request.method`, and +`http.response.status_code`, then track p95 and p99 per route and method. Keep successful and failed +responses separated so error-path latency does not get hidden by healthy traffic. -Use the cache metric families `hive.router.parse_cache.*`, `hive.router.validate_cache.*`, -`hive.router.normalize_cache.*`, and `hive.router.plan_cache.*` to evaluate cache behavior over -time. For request and duration metrics, split by `result` (`hit` and `miss`) so you can track hit -ratio and miss latency per cache tier. 
Combine those views with cache size trends to catch sustained -cache-efficiency regressions, especially in planning cache paths. +### Monitor health of your subgraphs -### Monitor GraphQL errors over time +Use [`http.client.request.duration`](#metric-http-client-request-duration) and +[`http.client.active_requests`](#metric-http-client-active-requests) to monitor dependency health +across your federated graph. -Use `hive.router.graphql.errors_total` and break it down by `code` to track both volume and error -shape. In production, monitor how error-code distribution changes over time, not only total count, -so you can separate validation issues from execution failures. Alert on sharp increases in -`GRAPHQL_VALIDATION_FAILED`, `PLAN_EXECUTION_FAILED`, or `UNKNOWN`. +Break these metrics down by `subgraph.name`, `http.response.status_code`, and `error.type` to +identify which subgraph is driving tail latency or error spikes. -## Troubleshooting +### Monitor cache effectiveness and planning pressure -When metrics are missing or incomplete, check in layers: +Use the cache metrics to evaluate cache behavior over time. -- exporter setup -- instrument overrides -- label overrides -- transport reachability +For request and duration metrics, split by `result` (`hit` and `miss`) so you can track hit ratio +and miss latency per cache tier. -If no metrics appear at all, verify exporter enablement, endpoint reachability, and credentials. +### Monitor GraphQL errors over time -If a configured instrument key is unknown, startup fails with a clear error and valid metric names. +Use [`hive.router.graphql.errors_total`](#metric-hive-router-graphql-errors-total) and break it down +by `code` to track both volume and error shape. -If a configured attribute key is unknown, Router logs a warning and ignores it. +In production, monitor how error-code distribution changes over time, not only total count, so you +can separate validation issues from execution failures. 
## Configuration reference -For full options and defaults, see: - -- [telemetry configuration reference](/docs/router/configuration/telemetry) +For full options and defaults, see the +[telemetry configuration reference](/docs/router/configuration/telemetry). From f66246fbd81fb24bcbf6ce77e0551d50e8bb5a8a Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Fri, 13 Feb 2026 12:47:30 +0100 Subject: [PATCH 05/14] asd --- .../router/configuration/telemetry.mdx | 200 +++++++++++++++++- 1 file changed, 199 insertions(+), 1 deletion(-) diff --git a/packages/web/docs/src/content/router/configuration/telemetry.mdx b/packages/web/docs/src/content/router/configuration/telemetry.mdx index b5f979f1087..0e8f8242d7d 100644 --- a/packages/web/docs/src/content/router/configuration/telemetry.mdx +++ b/packages/web/docs/src/content/router/configuration/telemetry.mdx @@ -5,7 +5,7 @@ title: 'telemetry' # telemetry The `telemetry` configuration controls client identification, Hive reporting, and OpenTelemetry -tracing behavior in Hive Router. +tracing and metrics behavior in Hive Router. ## client_identification @@ -232,6 +232,204 @@ telemetry: x-api-key: key ``` + + +
+ + + +## metrics + +Top-level OpenTelemetry metrics configuration. + +
+ Show metrics configuration + +Metrics are enabled when at least one exporter is configured and enabled. + +| Field | Type | Default | Notes | +| ----------------- | -------- | ------- | ------------------------------------------------------------------------------------- | +| `exporters` | `array` | `[]` | List of exporters used to send metrics. Supported kinds: `otlp`, `prometheus`. | +| `instrumentation` | `object` | `{}` | Instrument behavior for metrics (histogram aggregation and per-instrument overrides). | + +
+
+ `exporters` + +`exporters` lists the destinations that metrics are sent to. + +Each item in this array defines one exporter instance, so you can configure multiple metrics +destinations if needed. + +This reference documents OTLP and Prometheus exporter configuration. + +| Field | Type | Default | Notes | +| --------- | --------- | ------- | ------------------------------------------------------ | +| `kind` | `string` | - | Exporter kind. Supported values: `otlp`, `prometheus`. | +| `enabled` | `boolean` | `true` | Enables or disables this exporter. | + +
+ `otlp` + +| Field | Type | Default | Notes | +| -------------------- | -------------------- | ------------ | ----------------------------------------------------------------- | +| `kind` | `string` | - | Must be `otlp`. | +| `enabled` | `boolean` | `true` | Enables or disables this exporter. | +| `endpoint` | `StringOrExpression` | - | OTLP endpoint. Must be set explicitly. | +| `protocol` | `string` | - | OTLP transport protocol. Supported values: `http`, `grpc`. | +| `interval` | `string` | `60s` | Interval between OTLP export attempts. | +| `temporality` | `string` | `cumulative` | Aggregation temporality. Supported values: `cumulative`, `delta`. | +| `max_export_timeout` | `string` | `5s` | Maximum time for one OTLP export attempt. | +| `http` | `object` | - | HTTP-specific OTLP settings (for `protocol: http`). | +| `grpc` | `object` | - | gRPC-specific OTLP settings (for `protocol: grpc`). | + +OTLP over HTTP: + +| Field | Type | Value / Default | Notes | +| -------------- | -------- | --------------- | ------------------------------------------------------------- | +| `protocol` | `string` | `http` | OTLP transport protocol. | +| `http.headers` | `object` | `{}` | Map of header names to values (`string` or `{ expression }`). | + +```yaml filename="router.config.yaml" +telemetry: + metrics: + exporters: + - kind: otlp + enabled: true + protocol: http + endpoint: https://otel-collector.example.com/v1/metrics + interval: 60s + temporality: cumulative + max_export_timeout: 5s + http: + headers: + x-otlp-header: value +``` + +OTLP over gRPC: + +| Field | Type | Value / Default | Notes | +| ---------------------- | -------- | --------------- | ---------------------------------------------------------------------------- | +| `protocol` | `string` | `grpc` | OTLP transport protocol. | +| `grpc.metadata` | `object` | `{}` | Map of metadata keys to values (`string` or `{ expression }`). 
| +| `grpc.tls.domain_name` | `string` | - | Domain name used to verify the server certificate. | +| `grpc.tls.key` | `string` | - | Path to the client private key file. | +| `grpc.tls.cert` | `string` | - | Path to the client certificate file (PEM). | +| `grpc.tls.ca` | `string` | - | Path to the CA certificate file (PEM) used to verify the server certificate. | + +```yaml filename="router.config.yaml" +telemetry: + metrics: + exporters: + - kind: otlp + enabled: true + protocol: grpc + endpoint: https://otel-collector.example.com:4317 + interval: 60s + temporality: cumulative + max_export_timeout: 5s + grpc: + metadata: + x-api-key: key +``` + +
+ +
+ `prometheus` + +| Field | Type | Default | Notes | +| --------- | --------- | ---------- | ------------------------------------------- | +| `kind` | `string` | - | Must be `prometheus`. | +| `enabled` | `boolean` | `true` | Enables/disables Prometheus metrics export. | +| `port` | `integer` | - | Optional port for metrics endpoint. | +| `path` | `string` | `/metrics` | HTTP path exposed for scraping. | + +```yaml filename="router.config.yaml" +telemetry: + metrics: + exporters: + - kind: prometheus + enabled: true + port: 9090 + path: /metrics +``` + +
+ +
+
+ +
+
+ `instrumentation` + +Controls histogram aggregation and per-instrument overrides. + +| Field | Type | Default | Notes | +| ------------------ | -------- | ----------------------- | ----------------------------------------------------------- | +| `common.histogram` | `object` | exponential aggregation | Histogram aggregation strategy for instrumented histograms. | +| `instruments` | `object` | `{}` | Map of metric name to `false`, `true`, or object override. | + +
+ `common.histogram` + +Set aggregation mode with `aggregation`. + +`explicit` aggregation: + +| Field | Type | Default | Notes | +| ---------------- | ---------- | ------- | --------------------------- | +| `aggregation` | `string` | - | Must be `explicit`. | +| `boundaries` | `number[]` | - | Explicit bucket boundaries. | +| `record_min_max` | `boolean` | `false` | Record min/max values. | + +`exponential` aggregation (default): + +| Field | Type | Default | Notes | +| ---------------- | --------- | ------- | ------------------------------- | +| `aggregation` | `string` | - | Must be `exponential`. | +| `max_size` | `integer` | `160` | Max bucket count. | +| `max_scale` | `integer` | `20` | Max scale for bucket precision. | +| `record_min_max` | `boolean` | `false` | Record min/max values. | + +
+ +
+ `instruments` + +`instruments` is a map keyed by metric name. Value can be: + +- `false` to disable a metric +- `true` to keep defaults +- object to keep metric enabled and override attributes + +Object form supports: + +| Field | Type | Notes | +| ------------ | -------- | ------------------------------------------------------------------------------ | +| `attributes` | `object` | Map of attribute name to `boolean` (`false` drops attribute, `true` keeps it). | + +```yaml filename="router.config.yaml" +telemetry: + metrics: + instrumentation: + common: + histogram: + aggregation: exponential + max_size: 160 + max_scale: 20 + instruments: + http.server.request.duration: true + http.client.request.duration: + attributes: + subgraph.name: true + http.response.status_code: true + server.address: false +``` + +
+
From 1c6f7d9e6a86de6113dc64eeaed39edca47a9c2f Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Fri, 13 Feb 2026 13:50:36 +0100 Subject: [PATCH 06/14] asd --- .../router/configuration/telemetry.mdx | 2 +- .../content/router/observability/metrics.mdx | 80 ++++++++++--------- 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/packages/web/docs/src/content/router/configuration/telemetry.mdx b/packages/web/docs/src/content/router/configuration/telemetry.mdx index 0e8f8242d7d..32c5e1850fa 100644 --- a/packages/web/docs/src/content/router/configuration/telemetry.mdx +++ b/packages/web/docs/src/content/router/configuration/telemetry.mdx @@ -249,7 +249,7 @@ Metrics are enabled when at least one exporter is configured and enabled. | Field | Type | Default | Notes | | ----------------- | -------- | ------- | ------------------------------------------------------------------------------------- | -| `exporters` | `array` | `[]` | List of exporters used to send metrics. Supported kinds: `otlp`, `prometheus`. | +| `exporters` | `array` | `[]` | List of exporters used to send metrics. | | `instrumentation` | `object` | `{}` | Instrument behavior for metrics (histogram aggregation and per-instrument overrides). |
diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx index bd60da693a5..c9a9534d724 100644 --- a/packages/web/docs/src/content/router/observability/metrics.mdx +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -16,23 +16,28 @@ instruments, and what each metric/label means in practice. ## Choose your metrics destination -Hive Router supports two common metrics paths: +Hive Router exposes metrics through two widely used integration patterns: -- OTLP-compatible backends -- Prometheus scraping +- OTLP-based observability backends +- Prometheus scrape endpoints -In practice, teams with existing OpenTelemetry pipelines usually choose OTLP. Teams with existing -Prometheus/Grafana stacks usually choose Prometheus. +Most teams already running an OpenTelemetry pipeline tend to integrate via OTLP, while teams built +around Prometheus and Grafana typically stick with Prometheus scraping. ### Send metrics to OTLP-compatible backends -Hive Router can export metrics directly to any OTLP-compatible destination, including OpenTelemetry -Collector and vendor backends that support OTLP ingestion, either through HTTP or gRPC. +Hive Router can export metrics using OTLP to standard OpenTelemetry pipelines, including the +OpenTelemetry Collector and vendor backends that support OTLP ingestion over HTTP or gRPC. -After enabling the exporter, send traffic through the router and verify that new metric series -appear in your backend (for example HTTP server/client duration, cache metrics, and supergraph -metrics). If metrics are missing, start by validating endpoint reachability and auth credentials, -then check exporter protocol alignment (HTTP vs gRPC). +After enabling the exporter, generate some traffic through the router and confirm that new metric +series appear in your backend (for example HTTP server/client latency, cache metrics, and supergraph +execution metrics). 
+ +If metrics do not appear, verify: + +- Endpoint reachability (network, DNS, TLS) +- Authentication credentials or headers +- Exporter protocol matches the backend (OTLP/HTTP vs OTLP/gRPC) @@ -86,11 +91,12 @@ telemetry: ### Expose metrics for Prometheus scraping -If your observability stack is Prometheus-first, Hive Router can expose a scrape endpoint that -Prometheus polls on a schedule. +If your observability stack is Prometheus-first, Hive Router can expose an HTTP endpoint that +Prometheus scrapes at its configured interval. -The `port` and `path` settings define where Router serves metrics. Prometheus must be able to reach -that address from its runtime environment (local network, Kubernetes service, or VM network path). +The `port` and `path` settings define the address where the Router exposes metrics. Prometheus must +be able to reach that address from its runtime environment (local network, Kubernetes service, or VM +network path). If `port` is not set, or is the same as the main HTTP server port, the Router exposes metrics @@ -98,11 +104,10 @@ that address from its runtime environment (local network, Kubernetes service, or starts a separate HTTP server dedicated solely to the Prometheus metrics endpoint. -In production, make sure this endpoint is reachable only by trusted scrapers and that any ingress or -firewall policy allows Prometheus access. Once configured, confirm the target appears as healthy -(`UP`) in Prometheus and then verify expected series are present (for example -`http.server.request.duration`, `http.client.request.duration`, and `hive.router.*` cache/supergraph -metrics). +In production, make sure this endpoint is reachable only by trusted scrapers (for example via +network policy, firewall rules, or private ingress). Once configured, confirm the target appears as +healthy in Prometheus and then verify expected series are present (for example +`http.server.request.duration`, `http.client.request.duration`). 
```yaml filename="router.config.yaml" telemetry: @@ -116,8 +121,8 @@ telemetry: ## Production baseline -For production workloads, start with one primary exporter, a clear service identity, and default -instrument settings. +For production workloads, start with a single primary exporter, define a clear service identity, and +keep default instrumentation settings. ```yaml filename="router.config.yaml" telemetry: @@ -137,8 +142,9 @@ telemetry: max_export_timeout: 5s ``` -This is a safe baseline and works well before introducing instrumentation-level customization. By -default, all metrics and labels are exposed. +This is a safe baseline and works well before introducing instrumentation-level customization. +Additional exporters can be added later, but starting with one simplifies validation and +troubleshooting. ## Customize instrumentation @@ -172,7 +178,8 @@ Attribute override behavior: ### GraphQL -GraphQL metrics focus on validation, parsing and execution-time errors surfaced by the router. +GraphQL metrics capture errors surfaced by the router across all stages of a GraphQL request +lifecycle.
@@ -416,9 +435,13 @@ telemetry: instrumentation: common: histogram: - aggregation: exponential - max_size: 160 - max_scale: 20 + aggregation: explicit + seconds: + buckets: ["5ms", "10ms", "25ms", "50ms", "75ms", "100ms", "250ms", "500ms", "750ms", "1s", "2.5s", "5s", "7.5s", "10s"] + record_min_max: false + bytes: + buckets: ["128B", "512B", "1KB", "2KB", "4KB", "8KB", "16KB", "32KB", "64KB", "128KB", "256KB", "512KB", "1MB", "2MB", "3MB", "4MB", "5MB"] + record_min_max: false instruments: http.server.request.duration: true http.client.request.duration: diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx index c9a9534d724..68d5a89e78f 100644 --- a/packages/web/docs/src/content/router/observability/metrics.mdx +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -174,6 +174,38 @@ Attribute override behavior: - `false` - drop label from that metric - `true` - keep label (all labels are enabled by default) +Histogram aggregation can also be customized under `telemetry.metrics.instrumentation.common.histogram`. + +- `explicit` (default) uses unit-specific bucket sets. +- `exponential` uses one shared exponential strategy for all histogram metrics. +- `explicit` lets you configure unit-specific buckets: + - `seconds` for histogram unit `s` + - `bytes` for histogram unit `By` + +Bucket format rules: + +- `buckets` can be either all numbers or all strings. +- mixed arrays are not allowed. +- `seconds.buckets` string values are parsed as durations (for example `"5ms"`, `"1s"`). +- `bytes.buckets` string values are parsed as human-readable sizes (for example `"1KB"`, `"5MB"`). + +In `explicit` mode, a histogram metric with a unit other than `s` or `By` causes the router to fail at startup. 
+ +```yaml filename="router.config.yaml" +telemetry: + metrics: + instrumentation: + common: + histogram: + aggregation: explicit + seconds: + buckets: ["5ms", "10ms", "25ms", "50ms", "75ms", "100ms", "250ms", "500ms", "750ms", "1s", "2.5s", "5s", "7.5s", "10s"] + record_min_max: false + bytes: + buckets: ["128B", "512B", "1KB", "2KB", "4KB", "8KB", "16KB", "32KB", "64KB", "128KB", "256KB", "512KB", "1MB", "2MB", "3MB", "4MB", "5MB"] + record_min_max: false +``` + ## Metrics reference ### GraphQL From 5f1fe954786e3a2cfa8d0ce4e2f7a1c37a0c8f93 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Fri, 13 Feb 2026 15:52:53 +0100 Subject: [PATCH 09/14] asd --- .../router/configuration/telemetry.mdx | 100 ++++++++++++------ .../content/router/observability/metrics.mdx | 53 ++++++++-- 2 files changed, 115 insertions(+), 38 deletions(-) diff --git a/packages/web/docs/src/content/router/configuration/telemetry.mdx b/packages/web/docs/src/content/router/configuration/telemetry.mdx index 29aa3895052..9b42ca64555 100644 --- a/packages/web/docs/src/content/router/configuration/telemetry.mdx +++ b/packages/web/docs/src/content/router/configuration/telemetry.mdx @@ -249,7 +249,7 @@ Metrics are enabled when at least one exporter is configured and enabled. | Field | Type | Default | Notes | | ----------------- | -------- | ------- | ------------------------------------------------------------------------------------- | -| `exporters` | `array` | `[]` | List of exporters used to send metrics. | +| `exporters` | `array` | `[]` | List of exporters used to send metrics. | | `instrumentation` | `object` | `{}` | Instrument behavior for metrics (histogram aggregation and per-instrument overrides). |
@@ -271,17 +271,17 @@ This reference documents OTLP and Prometheus exporter configuration.
`otlp` -| Field | Type | Default | Notes | -| -------------------- | -------------------- | ------------ | ----------------------------------------------------------------- | -| `kind` | `string` | - | Must be `otlp`. | -| `enabled` | `boolean` | `true` | Enables or disables this exporter. | -| `endpoint` | `StringOrExpression` | - | OTLP endpoint. Must be set explicitly. | -| `protocol` | `string` | - | OTLP transport protocol. Supported values: `http`, `grpc`. | -| `interval` | `string` | `60s` | Interval between OTLP export attempts. | -| `temporality` | `string` | `cumulative` | Aggregation temporality. Supported values: `cumulative`, `delta`. | -| `max_export_timeout` | `string` | `5s` | Maximum time for one OTLP export attempt. | -| `http` | `object` | - | HTTP-specific OTLP settings (for `protocol: http`). | -| `grpc` | `object` | - | gRPC-specific OTLP settings (for `protocol: grpc`). | +| Field | Type | Default | Notes | +| ------------------------------------------------------------------------------------------------------------------- | -------------------- | ------------ | ----------------------------------------------------------------- | +| `kind` | `string` | - | Must be `otlp`. | +| `enabled` | `boolean` | `true` | Enables or disables this exporter. | +| `endpoint` | `StringOrExpression` | - | OTLP endpoint. Must be set explicitly. | +| `protocol` | `string` | - | OTLP transport protocol. Supported values: `http`, `grpc`. | +| `interval` | `string` | `60s` | Interval between OTLP export attempts. | +| [`temporality`](https://opentelemetry.io/docs/specs/otel/metrics/supplementary-guidelines/#aggregation-temporality) | `string` | `cumulative` | Aggregation temporality. Supported values: `cumulative`, `delta`. | +| `max_export_timeout` | `string` | `5s` | Maximum time for one OTLP export attempt. | +| `http` | `object` | - | HTTP-specific OTLP settings (for `protocol: http`). 
| +| `grpc` | `object` | - | gRPC-specific OTLP settings (for `protocol: grpc`). | OTLP over HTTP: @@ -378,39 +378,42 @@ Set aggregation mode with `aggregation`. `explicit` aggregation (default): -| Field | Type | Default | Notes | -| ----------------- | -------- | ------- | ------------------------------------------------------------------- | -| `aggregation` | `string` | - | Must be `explicit`. | -| `seconds` | `object` | - | Explicit histogram config for metrics with unit `s`. | -| `bytes` | `object` | - | Explicit histogram config for metrics with unit `By`. | +| Field | Type | Default | Notes | +| ------------- | -------- | ------- | ----------------------------------------------------- | +| `aggregation` | `string` | - | Must be `explicit`. | +| `seconds` | `object` | - | Explicit histogram config for metrics with unit `s`. | +| `bytes` | `object` | - | Explicit histogram config for metrics with unit `By`. | `seconds` and `bytes` fields: -| Field | Type | Default | Notes | -| ---------------- | ---------- | ------- | ------------------------------------------------------------ | +| Field | Type | Default | Notes | +| ---------------- | ---------------------- | ------- | --------------------------------------------------------------- | | `buckets` | `number[] \| string[]` | varies | Explicit bucket upper bounds. Must be non-empty and increasing. | -| `record_min_max` | `boolean` | `false` | Record min/max values for this unit bucket set. | +| `record_min_max` | `boolean` | `false` | Record min/max values for this unit bucket set. 
| Default explicit buckets: - `seconds.buckets`: `[0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5, 10]` -- `bytes.buckets`: `[128, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 3145728, 4194304, 5242880]` +- `bytes.buckets`: + `[128, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 3145728, 4194304, 5242880]` Bucket format rules: - `buckets` can be either all numbers or all strings. - mixed arrays are not allowed. - for `seconds.buckets`, string values are parsed as durations (for example `"5ms"`, `"1s"`). -- for `bytes.buckets`, string values are parsed as human-readable sizes (for example `"1KB"`, `"5MB"`). +- for `bytes.buckets`, string values are parsed as human-readable sizes (for example `"1KB"`, + `"5MB"`). -`exponential` aggregation: +[`exponential`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#exponentialhistogram) +aggregation: -| Field | Type | Default | Notes | -| ---------------- | --------- | ------- | ------------------------------- | -| `aggregation` | `string` | - | Must be `exponential`. | -| `max_size` | `integer` | - | Max bucket count. Required. | -| `max_scale` | `integer` | - | Max scale for bucket precision. Required. | -| `record_min_max` | `boolean` | `false` | Record min/max values. | +| Field | Type | Default | Notes | +| ------------------------------------------------------------------------------------------------------------------- | --------- | ------- | ----------------------------------------- | +| `aggregation` | `string` | - | Must be `exponential`. | +| [`max_size`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#base2-exponential-bucket-histogram-aggregation) | `integer` | - | Max bucket count. Required. | +| [`max_scale`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#base2-exponential-bucket-histogram-aggregation) | `integer` | - | Max scale for bucket precision. Required. 
| +| `record_min_max` | `boolean` | `false` | Record min/max values. |
@@ -437,10 +440,45 @@ telemetry: histogram: aggregation: explicit seconds: - buckets: ["5ms", "10ms", "25ms", "50ms", "75ms", "100ms", "250ms", "500ms", "750ms", "1s", "2.5s", "5s", "7.5s", "10s"] + buckets: + [ + '5ms', + '10ms', + '25ms', + '50ms', + '75ms', + '100ms', + '250ms', + '500ms', + '750ms', + '1s', + '2.5s', + '5s', + '7.5s', + '10s' + ] record_min_max: false bytes: - buckets: ["128B", "512B", "1KB", "2KB", "4KB", "8KB", "16KB", "32KB", "64KB", "128KB", "256KB", "512KB", "1MB", "2MB", "3MB", "4MB", "5MB"] + buckets: + [ + '128B', + '512B', + '1KB', + '2KB', + '4KB', + '8KB', + '16KB', + '32KB', + '64KB', + '128KB', + '256KB', + '512KB', + '1MB', + '2MB', + '3MB', + '4MB', + '5MB' + ] record_min_max: false instruments: http.server.request.duration: true diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx index 68d5a89e78f..68b810e5aaa 100644 --- a/packages/web/docs/src/content/router/observability/metrics.mdx +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -174,17 +174,21 @@ Attribute override behavior: - `false` - drop label from that metric - `true` - keep label (all labels are enabled by default) -Histogram aggregation can also be customized under `telemetry.metrics.instrumentation.common.histogram`. +Histogram aggregation can also be customized under +`telemetry.metrics.instrumentation.common.histogram`. -- `explicit` (default) uses unit-specific bucket sets. -- `exponential` uses one shared exponential strategy for all histogram metrics. -- `explicit` lets you configure unit-specific buckets: +- [`explicit`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#histogram) (default) + uses unit-specific bucket sets. 
Lets you configure unit-specific buckets: - `seconds` for histogram unit `s` - `bytes` for histogram unit `By` +- [`exponential`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#exponentialhistogram) + uses one shared exponential strategy for all histogram metrics. +- `record_min_max` controls whether min and max are reported for histogram points. Bucket format rules: -- `buckets` can be either all numbers or all strings. +- [`buckets`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#explicit-bucket-histogram-aggregation) + can be either all numbers or all strings. - mixed arrays are not allowed. - `seconds.buckets` string values are parsed as durations (for example `"5ms"`, `"1s"`). - `bytes.buckets` string values are parsed as human-readable sizes (for example `"1KB"`, `"5MB"`). @@ -199,10 +203,45 @@ telemetry: histogram: aggregation: explicit seconds: - buckets: ["5ms", "10ms", "25ms", "50ms", "75ms", "100ms", "250ms", "500ms", "750ms", "1s", "2.5s", "5s", "7.5s", "10s"] + buckets: + [ + '5ms', + '10ms', + '25ms', + '50ms', + '75ms', + '100ms', + '250ms', + '500ms', + '750ms', + '1s', + '2.5s', + '5s', + '7.5s', + '10s' + ] record_min_max: false bytes: - buckets: ["128B", "512B", "1KB", "2KB", "4KB", "8KB", "16KB", "32KB", "64KB", "128KB", "256KB", "512KB", "1MB", "2MB", "3MB", "4MB", "5MB"] + buckets: + [ + '128B', + '512B', + '1KB', + '2KB', + '4KB', + '8KB', + '16KB', + '32KB', + '64KB', + '128KB', + '256KB', + '512KB', + '1MB', + '2MB', + '3MB', + '4MB', + '5MB' + ] record_min_max: false ``` From 22520f2c16bd2c59b012c2fa2a4e61c396deb0f7 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Fri, 13 Feb 2026 21:40:26 +0100 Subject: [PATCH 10/14] Update metrics.mdx --- .../content/router/observability/metrics.mdx | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx index 
68b810e5aaa..c7e336f4f7a 100644 --- a/packages/web/docs/src/content/router/observability/metrics.mdx +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -340,7 +340,9 @@ HTTP server metrics capture inbound client traffic processed by the router. 'network.protocol.name', 'network.protocol.version', 'url.scheme', - 'error.type' + 'error.type', + 'graphql.operation.name', + 'graphql.operation.type' ] }, { @@ -355,7 +357,9 @@ HTTP server metrics capture inbound client traffic processed by the router. 'network.protocol.name', 'network.protocol.version', 'url.scheme', - 'error.type' + 'error.type', + 'graphql.operation.name', + 'graphql.operation.type' ] }, { @@ -370,7 +374,9 @@ HTTP server metrics capture inbound client traffic processed by the router. 'network.protocol.name', 'network.protocol.version', 'url.scheme', - 'error.type' + 'error.type', + 'graphql.operation.name', + 'graphql.operation.type' ] }, { @@ -430,6 +436,20 @@ HTTP server metrics capture inbound client traffic processed by the router. meaning: 'Error classification for failed requests', typicalValues: ['status code >= 400'], notes: 'Only set for failed requests' + }, + { + name: 'graphql.operation.name', + meaning: 'GraphQL operation name associated with the HTTP request', + typicalValues: ['UsersQuery', 'IntrospectionQuery', 'UNKNOWN'], + notes: + 'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size' + }, + { + name: 'graphql.operation.type', + meaning: 'GraphQL operation type', + typicalValues: ['query', 'mutation', 'subscription'], + notes: + 'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. 
Omitted when unknown' } ]} /> From d53624ff9dee4fab73a99d9da64c377f57083f61 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Mon, 16 Feb 2026 12:43:41 +0100 Subject: [PATCH 11/14] Update metrics.mdx --- .../content/router/observability/metrics.mdx | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx index c7e336f4f7a..408431f77a4 100644 --- a/packages/web/docs/src/content/router/observability/metrics.mdx +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -342,7 +342,8 @@ HTTP server metrics capture inbound client traffic processed by the router. 'url.scheme', 'error.type', 'graphql.operation.name', - 'graphql.operation.type' + 'graphql.operation.type', + 'graphql.response.status' ] }, { @@ -359,7 +360,8 @@ HTTP server metrics capture inbound client traffic processed by the router. 'url.scheme', 'error.type', 'graphql.operation.name', - 'graphql.operation.type' + 'graphql.operation.type', + 'graphql.response.status' ] }, { @@ -376,7 +378,8 @@ HTTP server metrics capture inbound client traffic processed by the router. 'url.scheme', 'error.type', 'graphql.operation.name', - 'graphql.operation.type' + 'graphql.operation.type', + 'graphql.response.status' ] }, { @@ -450,6 +453,13 @@ HTTP server metrics capture inbound client traffic processed by the router. typicalValues: ['query', 'mutation', 'subscription'], notes: 'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. Omitted when unknown' + }, + { + name: 'graphql.response.status', + meaning: 'GraphQL response status for the request', + typicalValues: ['ok', 'error'], + notes: + 'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. 
"error" indicates the GraphQL response contains at least one error' } ]} /> @@ -731,9 +741,10 @@ quickly isolate API, subgraph, cache, and GraphQL issues. Use [`http.server.request.duration`](#metric-http-server-request-duration) as your primary latency signal. -In production, break this metric down by `http.route`, `http.request.method`, and -`http.response.status_code`, then track p95 and p99 latency per route and method. Keep successful -and failed responses separated so error-path latency does not get hidden by healthy traffic. +In production, break this metric down by `http.route`, `http.request.method`, +`http.response.status_code`, and/or `graphql.response.status`, then track p95 and p99 latency per +route and method. Keep successful and failed responses separated so error-path latency does not get +hidden by healthy traffic. ### Monitor health of your subgraphs From 26bfd56864b278b24e3ff211dd89bc34886d45a7 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Mon, 16 Feb 2026 14:01:41 +0100 Subject: [PATCH 12/14] Update metrics.mdx --- .../src/content/router/observability/metrics.mdx | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx index 408431f77a4..c0443ba5dcd 100644 --- a/packages/web/docs/src/content/router/observability/metrics.mdx +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -484,7 +484,8 @@ HTTP client metrics capture outbound requests the router makes to subgraphs. 'url.scheme', 'subgraph.name', 'http.response.status_code', - 'error.type' + 'error.type', + 'graphql.response.status' ] }, { @@ -501,7 +502,8 @@ HTTP client metrics capture outbound requests the router makes to subgraphs. 
'url.scheme', 'subgraph.name', 'http.response.status_code', - 'error.type' + 'error.type', + 'graphql.response.status' ] }, { @@ -518,7 +520,8 @@ HTTP client metrics capture outbound requests the router makes to subgraphs. 'url.scheme', 'subgraph.name', 'http.response.status_code', - 'error.type' + 'error.type', + 'graphql.response.status' ] }, { @@ -597,6 +600,13 @@ HTTP client metrics capture outbound requests the router makes to subgraphs. meaning: 'Error classification', typicalValues: ['400', 'SUBGRAPH_REQUEST_FAILURE', '...'], notes: 'Numeric status code >= 400 or execution error code string' + }, + { + name: 'graphql.response.status', + meaning: 'GraphQL response status for the subgraph request', + typicalValues: ['ok', 'error'], + notes: + 'Set to "ok" when the parsed subgraph response has no GraphQL errors. Set to "error" when the subgraph response includes GraphQL errors or when transport/deserialization fails.' } ]} /> From d31bf7865d91c02a7a5ab1ed78b4e2c6e5f18be8 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Mon, 16 Feb 2026 16:05:55 +0100 Subject: [PATCH 13/14] cardinality considirations --- .../content/router/observability/metrics.mdx | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx index c0443ba5dcd..d31ed9c9d37 100644 --- a/packages/web/docs/src/content/router/observability/metrics.mdx +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -146,10 +146,45 @@ This is a safe baseline and works well before introducing instrumentation-level Additional exporters can be added later, but starting with one simplifies validation and troubleshooting. +### Cardinality considerations + +For production workloads, consider disabling `graphql.operation.name` label or even `graphql.operation.type` on high-volume metrics. 
+ + + `graphql.operation.name` can create very high-cardinality metrics. + + Operation names come from client requests. Without persisted operations, clients can send many + distinct operation names (or random names), which can rapidly increase cardinality and cost in + Prometheus and OTLP backends. + + +```yaml filename="router.config.yaml" +telemetry: + metrics: + instrumentation: + instruments: + http.server.request.duration: + attributes: + graphql.operation.name: false + http.server.request.body.size: + attributes: + graphql.operation.name: false + http.server.response.body.size: + attributes: + graphql.operation.name: false +``` + ## Customize instrumentation You can override behavior per metric under `telemetry.metrics.instrumentation.instruments`. + + Disable non-essential labels to control cost, since each additional label value increases active time-series cardinality - a primary billing and performance driver in platforms like Grafana and Datadog. + + +For production guidance on label cardinality (especially `graphql.operation.name`), see +[Cardinality considerations](#cardinality-considerations). + - `false` disables a metric. - `true` keeps default behavior. - object form enables metric + optional attribute overrides. @@ -445,7 +480,7 @@ HTTP server metrics capture inbound client traffic processed by the router. meaning: 'GraphQL operation name associated with the HTTP request', typicalValues: ['UsersQuery', 'IntrospectionQuery', 'UNKNOWN'], notes: - 'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size' + 'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. High-cardinality risk: value is client-controlled and can explode without persisted operations.' 
}, { name: 'graphql.operation.type', From a7dc6de7cd0da593a8b7038d6ad153b0e21220a6 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Mon, 16 Feb 2026 16:09:13 +0100 Subject: [PATCH 14/14] Update metrics.mdx --- .../src/content/router/observability/metrics.mdx | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/packages/web/docs/src/content/router/observability/metrics.mdx b/packages/web/docs/src/content/router/observability/metrics.mdx index d31ed9c9d37..27877394cb5 100644 --- a/packages/web/docs/src/content/router/observability/metrics.mdx +++ b/packages/web/docs/src/content/router/observability/metrics.mdx @@ -148,14 +148,16 @@ troubleshooting. ### Cardinality considerations -For production workloads, consider disabling `graphql.operation.name` label or even `graphql.operation.type` on high-volume metrics. +For production workloads, consider disabling the `graphql.operation.name` label, or even +`graphql.operation.type`, on high-volume metrics. `graphql.operation.name` can create very high-cardinality metrics. - Operation names come from client requests. Without persisted operations, clients can send many - distinct operation names (or random names), which can rapidly increase cardinality and cost in - Prometheus and OTLP backends. +Operation names come from client requests. Without persisted operations, clients can send many +distinct operation names (or random names), which can rapidly increase cardinality and cost in +Prometheus and OTLP backends. + ```yaml filename="router.config.yaml" @@ -179,7 +181,9 @@ telemetry: You can override behavior per metric under `telemetry.metrics.instrumentation.instruments`. - Disable non-essential labels to control cost, since each additional label value increases active time-series cardinality - a primary billing and performance driver in platforms like Grafana and Datadog. 
+ Disable non-essential labels to control cost, since each additional label value increases active + time-series cardinality - a primary billing and performance driver in platforms like Grafana and + Datadog. For production guidance on label cardinality (especially `graphql.operation.name`), see